
Other Features

Apart from training and using deep networks for tabular data, PyTorch Tabular also has some cool features that can help your classical machine learning / scikit-learn pipelines.

Categorical Embeddings

The CategoryEmbedding model can also be used as a way to encode categorical columns. Instead of one-hot encoding or a variant of target mean encoding, you can encode the categorical features with the learned embeddings. All of this is done through a scikit-learn-style transformer.

Usage Example

# passing the trained model as an argument
transformer = CategoricalEmbeddingTransformer(tabular_model)
# passing the train dataframe to extract the embeddings and replace the categorical features defined in the trained tabular_model
train_transformed = transformer.fit_transform(train)
# using the extracted embeddings on a new dataframe
val_transformed = transformer.transform(val)
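
Because the transformed frames are purely numeric, they can be fed straight into any scikit-learn estimator. The sketch below is a minimal illustration, not part of the library API: it assumes `train` and `val` are pandas DataFrames and that a label column named `target` exists in `train`; swap in your own estimator and column names.

# Minimal sketch: plug the embedded features into a scikit-learn model.
# `target` is a hypothetical label column; replace it with your own.
from sklearn.ensemble import RandomForestClassifier

X_train = train_transformed.drop(columns=["target"])
y_train = train["target"]
X_val = val_transformed.drop(columns=["target"], errors="ignore")

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
val_preds = clf.predict(X_val)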

pytorch_tabular.categorical_encoders.CategoricalEmbeddingTransformer

Bases: BaseEstimator, TransformerMixin

Source code in src/pytorch_tabular/categorical_encoders.py
class CategoricalEmbeddingTransformer(BaseEstimator, TransformerMixin):
    NAN_CATEGORY = 0

    def __init__(self, tabular_model):
        """初始化Transformer并提取神经嵌入.

Parameters:
    tabular_model (TabularModel): 训练好的TabularModel对象
"""
        self._categorical_encoder = tabular_model.datamodule.categorical_encoder
        self.cols = tabular_model.model.hparams.categorical_cols
        # dict {str: np.ndarray} column name --> mapping from category (index of df) to value (column of df)
        self._mapping = {}

        self._extract_embedding(tabular_model.model)

    def _extract_embedding(self, model):
        try:
            embedding_layer = model.extract_embedding()
        except ValueError as e:
            logger.error(
                f"Extracting embedding layer from model received this error: {e}."
                f" Some models do not support this feature."
            )
            embedding_layer = None
        if embedding_layer is not None:
            for i, col in enumerate(self.cols):
                self._mapping[col] = {}
                embedding = embedding_layer[i]
                self._mapping[col][self.NAN_CATEGORY] = embedding.weight[0, :].detach().cpu().numpy().ravel()
                for key in self._categorical_encoder._mapping[col].index:
                    self._mapping[col][key] = (
                        embedding.weight[self._categorical_encoder._mapping[col].loc[key], :]
                        .detach()
                        .cpu()
                        .numpy()
                        .ravel()
                    )
        else:
            raise ValueError("Passed model doesn't support this feature.")

    def fit(self, X, y=None):
        """只是为了兼容.

不做任何事情
"""
        return self

    def transform(self, X: DataFrame, y=None) -> DataFrame:
        """将指定的分类列转换为模型训练得到的神经嵌入.

Parameters:
    X (DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

引发:
    ValueError: [描述]

Returns:
    DataFrame: 编码后的DataFrame
"""
        if not self._mapping:
            raise ValueError(
                "Passed model should either have an attribute `embeddng_layers`"
                " or a method `extract_embedding` defined for `transform`."
            )
        assert all(c in X.columns for c in self.cols)

        X_encoded = X.copy(deep=True)
        for col, mapping in track(
            self._mapping.items(),
            description="Encoding the data...",
            total=len(self._mapping.values()),
        ):
            for dim in range(mapping[self.NAN_CATEGORY].shape[0]):
                X_encoded.loc[:, f"{col}_embed_dim_{dim}"] = (
                    X_encoded[col].fillna(self.NAN_CATEGORY).map({k: v[dim] for k, v in mapping.items()})
                )
                # Filling unseen categories also with NAN Embedding
                X_encoded[f"{col}_embed_dim_{dim}"].fillna(mapping[self.NAN_CATEGORY][dim], inplace=True)
        X_encoded.drop(columns=self.cols, inplace=True)
        return X_encoded

    def fit_transform(self, X: DataFrame, y=None) -> DataFrame:
        """    根据学习到的嵌入对给定的X列进行编码.

Parameters:
    X (DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

Returns:
    DataFrame: 编码后的DataFrame
"""
        self.fit(X, y)
        return self.transform(X)

    def save_as_object_file(self, path):
        if not self._mapping:
            raise ValueError("`fit` method must be called before `save_as_object_file`.")
        pickle.dump(self.__dict__, open(path, "wb"))

    def load_from_object_file(self, path):
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)
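
The save_as_object_file and load_from_object_file methods shown above let you persist the transformer and reload its state later. A minimal sketch, assuming a trained tabular_model is in scope; the file name and the `test` DataFrame are placeholders:

# Minimal sketch: persist and restore the transformer (file name is arbitrary).
transformer = CategoricalEmbeddingTransformer(tabular_model)
transformer.save_as_object_file("cat_embedding_transformer.pkl")

# later, e.g. in a scoring job
restored = CategoricalEmbeddingTransformer(tabular_model)
restored.load_from_object_file("cat_embedding_transformer.pkl")
test_transformed = restored.transform(test)  # `test` is a placeholder DataFrame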

__init__(tabular_model)

Initializes the Transformer and extracts the neural embeddings.

Parameters:

    tabular_model (TabularModel): The trained TabularModel object. (required)
Source code in src/pytorch_tabular/categorical_encoders.py
    def __init__(self, tabular_model):
        """初始化Transformer并提取神经嵌入.

Parameters:
    tabular_model (TabularModel): 训练好的TabularModel对象
"""
        self._categorical_encoder = tabular_model.datamodule.categorical_encoder
        self.cols = tabular_model.model.hparams.categorical_cols
        # dict {str: np.ndarray} column name --> mapping from category (index of df) to value (column of df)
        self._mapping = {}

        self._extract_embedding(tabular_model.model)

fit(X, y=None)

Just for compatibility.

Does not do anything.

Source code in src/pytorch_tabular/categorical_encoders.py
    def fit(self, X, y=None):
        """只是为了兼容.

不做任何事情
"""
        return self

fit_transform(X, y=None)

Encodes the given columns of X based on the learned embeddings.

Parameters:

    X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain the columns to encode. (required)
    y ([type], optional): Only for compatibility. Not used. Defaults to None.

Returns:

    DataFrame: The encoded DataFrame.

Source code in src/pytorch_tabular/categorical_encoders.py
    def fit_transform(self, X: DataFrame, y=None) -> DataFrame:
        """    根据学习到的嵌入对给定的X列进行编码.

Parameters:
    X (DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

Returns:
    DataFrame: 编码后的DataFrame
"""
        self.fit(X, y)
        return self.transform(X)

transform(X, y=None)

Transforms the specified categorical columns into the neural embeddings learned by the model.

Parameters:

    X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain the columns to encode. (required)
    y ([type], optional): Only for compatibility. Not used. Defaults to None.

Raises:

    ValueError: If the passed model does not expose an embedding layer or an `extract_embedding` method.

Returns:

    DataFrame: The encoded DataFrame.

Source code in src/pytorch_tabular/categorical_encoders.py
    def transform(self, X: DataFrame, y=None) -> DataFrame:
        """将指定的分类列转换为模型训练得到的神经嵌入.

Parameters:
    X (DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

引发:
    ValueError: [描述]

Returns:
    DataFrame: 编码后的DataFrame
"""
        if not self._mapping:
            raise ValueError(
                "Passed model should either have an attribute `embeddng_layers`"
                " or a method `extract_embedding` defined for `transform`."
            )
        assert all(c in X.columns for c in self.cols)

        X_encoded = X.copy(deep=True)
        for col, mapping in track(
            self._mapping.items(),
            description="Encoding the data...",
            total=len(self._mapping.values()),
        ):
            for dim in range(mapping[self.NAN_CATEGORY].shape[0]):
                X_encoded.loc[:, f"{col}_embed_dim_{dim}"] = (
                    X_encoded[col].fillna(self.NAN_CATEGORY).map({k: v[dim] for k, v in mapping.items()})
                )
                # Filling unseen categories also with NAN Embedding
                X_encoded[f"{col}_embed_dim_{dim}"].fillna(mapping[self.NAN_CATEGORY][dim], inplace=True)
        X_encoded.drop(columns=self.cols, inplace=True)
        return X_encoded
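
Each encoded column is expanded into columns named {col}_embed_dim_{dim}, and rows with NaN or previously unseen categories fall back to the embedding stored under NAN_CATEGORY. A minimal sketch of inspecting the output; the column name "occupation" is purely illustrative and not part of the library API:

# Minimal sketch: "occupation" is a hypothetical categorical column in `val`.
encoded = transformer.transform(val)
occupation_cols = [c for c in encoded.columns if c.startswith("occupation_embed_dim_")]
print(occupation_cols)        # e.g. ['occupation_embed_dim_0', 'occupation_embed_dim_1', ...]
print(encoded[occupation_cols].head())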

Feature Extractor

What if you want to use the features learned by the neural network in your machine learning models? PyTorch Tabular makes that easy as well. Again, a scikit-learn-style transformer does the job for you.

Usage Example

# passing the trained model as an argument
dt = DeepFeatureExtractor(tabular_model)
# passing the train dataframe to extract the features from the last layer
# here `fit` is there only for compatibility and does not do anything
enc_df = dt.fit_transform(train)
# using the extracted features on a new dataframe
val_transformed = dt.transform(val)
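
The extracted backbone features are plain numeric columns, so they can serve as inputs to any downstream model. A minimal sketch, assuming the label lives in a hypothetical `target` column of the original `train` DataFrame (the extractor itself never sees the target):

# Minimal sketch: train a gradient boosting model on the extracted features.
# `target` is a hypothetical label column, not part of the library API.
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(enc_df, train["target"])
val_preds = gbm.predict(dt.transform(val))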

pytorch_tabular.feature_extractor.DeepFeatureExtractor

Bases: BaseEstimator, TransformerMixin

Source code in src/pytorch_tabular/feature_extractor.py
class DeepFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, tabular_model, extract_keys=["backbone_features"], drop_original=True):
        """初始化Transformer并提取神经特征.

Parameters:
    tabular_model (TabularModel): 训练好的TabularModel对象
    extract_keys (list, 可选): 要提取的特征的键.默认为["backbone_features"].
    drop_original (bool, 可选): 是否删除原始列.默认为True.
"""
        assert not (
            isinstance(tabular_model.model, NODEModel)
            or isinstance(tabular_model.model, TabNetModel)
            or isinstance(tabular_model.model, MDNModel)
        ), "FeatureExtractor doesn't work for Mixture Density Networks, NODE Model, & Tabnet Model"
        self.tabular_model = tabular_model
        self.extract_keys = extract_keys
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """只是为了兼容.

不做任何事情
"""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """将指定的分类列转换为模型训练得到的神经特征.

Parameters:
    X (pd.DataFrame): 特征数据框,形状为 (n_samples, n_features).必须包含需要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为 None.

Raises:
    ValueError: [描述]

Returns:
    pd.DataFrame: 编码后的数据框
"""

        X_encoded = X.copy(deep=True)
        orig_features = X_encoded.columns
        self.tabular_model.model.eval()
        inference_dataloader = self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
        logits_predictions = defaultdict(list)
        for batch in track(inference_dataloader, description="Generating Features..."):
            for k, v in batch.items():
                if isinstance(v, list) and (len(v) == 0):
                    # Skipping empty list
                    continue
                batch[k] = v.to(self.tabular_model.model.device)
            if self.tabular_model.config.task == "ssl":
                ret_value = {"backbone_features": self.tabular_model.model.predict(batch, ret_model_output=True)}
            else:
                _, ret_value = self.tabular_model.model.predict(batch, ret_model_output=True)
            for k in self.extract_keys:
                if k in ret_value.keys():
                    logits_predictions[k].append(ret_value[k].detach().cpu())

        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                if v.shape[-1] > 1:
                    X_encoded[f"{k}_{i}"] = v[:, i]
                else:
                    X_encoded[f"{k}"] = v[:, i]

        if self.drop_original:
            X_encoded.drop(columns=orig_features, inplace=True)
        return X_encoded

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """    基于学习到的特征对给定的X列进行编码.

Parameters:
    X (pd.DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

Returns:
    pd.DataFrame: 编码后的DataFrame
"""
        self.fit(X, y)
        return self.transform(X)

    def save_as_object_file(self, path):
        """保存特征提取器为pickle文件.

Parameters:
    path (str): 保存文件的路径
"""
        if not self._mapping:
            raise ValueError("`fit` method must be called before `save_as_object_file`.")
        pickle.dump(self.__dict__, open(path, "wb"))

    def load_from_object_file(self, path):
        """加载从pickle文件中的特征提取器.

Parameters:
    path (str): 文件的加载路径
"""
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)

__init__(tabular_model, extract_keys=['backbone_features'], drop_original=True)

Initializes the Transformer and extracts the neural features.

Parameters:

    tabular_model (TabularModel): The trained TabularModel object. (required)
    extract_keys (list, optional): The keys of the features to extract. Defaults to ['backbone_features'].
    drop_original (bool, optional): Whether to drop the original columns. Defaults to True.
Source code in src/pytorch_tabular/feature_extractor.py
    def __init__(self, tabular_model, extract_keys=["backbone_features"], drop_original=True):
        """初始化Transformer并提取神经特征.

Parameters:
    tabular_model (TabularModel): 训练好的TabularModel对象
    extract_keys (list, 可选): 要提取的特征的键.默认为["backbone_features"].
    drop_original (bool, 可选): 是否删除原始列.默认为True.
"""
        assert not (
            isinstance(tabular_model.model, NODEModel)
            or isinstance(tabular_model.model, TabNetModel)
            or isinstance(tabular_model.model, MDNModel)
        ), "FeatureExtractor doesn't work for Mixture Density Networks, NODE Model, & Tabnet Model"
        self.tabular_model = tabular_model
        self.extract_keys = extract_keys
        self.drop_original = drop_original
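
If you want to keep the original columns next to the extracted features, or pull additional keys from the model output, both are controlled through the constructor arguments documented above. A minimal sketch using the library default key "backbone_features":

# Minimal sketch: keep the original columns alongside the extracted features.
dt = DeepFeatureExtractor(
    tabular_model,
    extract_keys=["backbone_features"],  # library default
    drop_original=False,                 # keep the raw input columns
)
features_df = dt.fit_transform(train)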

fit(X, y=None)

Just for compatibility.

Does not do anything.

Source code in src/pytorch_tabular/feature_extractor.py
    def fit(self, X, y=None):
        """只是为了兼容.

不做任何事情
"""
        return self

fit_transform(X, y=None)

Encodes the given columns of X based on the learned features.

Parameters:

    X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain the columns to encode. (required)
    y ([type], optional): Only for compatibility. Not used. Defaults to None.

Returns:

    pd.DataFrame: The encoded DataFrame.

Source code in src/pytorch_tabular/feature_extractor.py
    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """    基于学习到的特征对给定的X列进行编码.

Parameters:
    X (pd.DataFrame): 特征的DataFrame,形状为(n_samples, n_features).必须包含要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为None.

Returns:
    pd.DataFrame: 编码后的DataFrame
"""
        self.fit(X, y)
        return self.transform(X)

load_from_object_file(path)

Loads the feature extractor from a pickle file.

Parameters:

    path (str): The path of the file to load. (required)
Source code in src/pytorch_tabular/feature_extractor.py
    def load_from_object_file(self, path):
        """加载从pickle文件中的特征提取器.

Parameters:
    path (str): 文件的加载路径
"""
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)

save_as_object_file(path)

Saves the feature extractor as a pickle file.

Parameters:

    path (str): The path to save the file. (required)
Source code in src/pytorch_tabular/feature_extractor.py
    def save_as_object_file(self, path):
        """保存特征提取器为pickle文件.

Parameters:
    path (str): 保存文件的路径
"""
        if not self._mapping:
            raise ValueError("`fit` method must be called before `save_as_object_file`.")
        pickle.dump(self.__dict__, open(path, "wb"))

transform(X, y=None)

Transforms the specified categorical columns into the neural features learned by the model.

Parameters:

    X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain the columns to encode. (required)
    y ([type], optional): Only for compatibility. Not used. Defaults to None.

Raises:

    ValueError: [description]

Returns:

    pd.DataFrame: The encoded DataFrame.

Source code in src/pytorch_tabular/feature_extractor.py
    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """将指定的分类列转换为模型训练得到的神经特征.

Parameters:
    X (pd.DataFrame): 特征数据框,形状为 (n_samples, n_features).必须包含需要编码的列.
    y ([type], 可选): 仅用于兼容性.未使用.默认为 None.

Raises:
    ValueError: [描述]

Returns:
    pd.DataFrame: 编码后的数据框
"""

        X_encoded = X.copy(deep=True)
        orig_features = X_encoded.columns
        self.tabular_model.model.eval()
        inference_dataloader = self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
        logits_predictions = defaultdict(list)
        for batch in track(inference_dataloader, description="Generating Features..."):
            for k, v in batch.items():
                if isinstance(v, list) and (len(v) == 0):
                    # Skipping empty list
                    continue
                batch[k] = v.to(self.tabular_model.model.device)
            if self.tabular_model.config.task == "ssl":
                ret_value = {"backbone_features": self.tabular_model.model.predict(batch, ret_model_output=True)}
            else:
                _, ret_value = self.tabular_model.model.predict(batch, ret_model_output=True)
            for k in self.extract_keys:
                if k in ret_value.keys():
                    logits_predictions[k].append(ret_value[k].detach().cpu())

        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                if v.shape[-1] > 1:
                    X_encoded[f"{k}_{i}"] = v[:, i]
                else:
                    X_encoded[f"{k}"] = v[:, i]

        if self.drop_original:
            X_encoded.drop(columns=orig_features, inplace=True)
        return X_encoded
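
As the loop above shows, each extracted key k is written out either as a single column named k or, when the feature has more than one dimension, as columns k_0 through k_{d-1}. A minimal sketch of inspecting the resulting column names; the exact count depends on the backbone's output size:

# Minimal sketch: inspect the generated feature columns.
features_df = dt.transform(val)
backbone_cols = [c for c in features_df.columns if c.startswith("backbone_features")]
print(len(backbone_cols), backbone_cols[:5])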