import category_encoders as ce
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from pytorch_tabular.utils import print_metrics, load_covertype_dataset

# %load_ext autoreload
# %autoreload 2

def load_classification_data():
    data, cat_col_names, num_col_names, target = load_covertype_dataset()
    test_idx = data.sample(int(0.2 * len(data)), random_state=42).index
    test = data[data.index.isin(test_idx)]
    train = data[~data.index.isin(test_idx)]
    return (train, test, cat_col_names, num_col_names, target)

Load the Forest Cover dataset

train, test, cat_col_names, num_col_names, target_col = load_classification_data()
train, val = train_test_split(train, random_state=42)
encoder = ce.OneHotEncoder(cols=cat_col_names)
train_transform = encoder.fit_transform(train)
val_transform = encoder.transform(val)
test_transform = encoder.transform(test)

Baseline

Let's use the default LightGBM model as a baseline.

results = []
metrics = [
    (accuracy_score, "Accuracy", {}),
    (f1_score, "F1", {"average": "weighted"}),
]
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)
clf.fit(
    train_transform.drop(columns=target_col),
    train_transform[target_col].values.ravel(),
)
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_metrics = print_metrics(
    metrics, val_transform[target_col], val_pred, "Validation", return_dict=True
)
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_metrics = print_metrics(
    metrics, test_transform[target_col], test_pred, "Holdout", return_dict=True
)
Validation Accuracy: 0.8561396865829626 | Validation F1: 0.8555362266480759
Holdout Accuracy: 0.8555876835166348 | Holdout F1: 0.8548755340164053

results.append(
    {
        "Mode": "OneHot Encoding",
        "Validation Acc": val_metrics["Accuracy"],
        "Validation F1": val_metrics["F1"],
        "Holdout Acc": holdout_metrics["Accuracy"],
        "Holdout F1": holdout_metrics["F1"],
    }
)

Category Embedding Model

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig
data_config = DataConfig(
    target=[target_col],  # target should always be a list. Multi-targets are only supported for regression. Multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=50,
    accelerator="auto",  # can be 'cpu','gpu', 'tpu', or 'ipu'
    devices=-1,  # -1 means use all available devices
)
optimizer_config = OptimizerConfig()

head_config = LinearHeadConfig(
    layers="",  # 头部没有额外的层,仅有一个映射层输出到`output_dim`。
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="512-256-16",  # 每层节点数
    activation="LeakyReLU",  # 各层之间的激活
    dropout=0.1,
    initialization="kaiming",
    head="LinearHead",  # 线性磁头
    head_config=head_config,  # 线性磁头配置
    learning_rate=1e-3,
    metrics=["accuracy", "f1_score"],
    metrics_params=[{}, {"average": "micro"}],
    metrics_prob_input=[False, True],
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)
tabular_model.fit(train=train)
Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]
Trainer was signaled to stop but the required `min_epochs=1` or `min_steps=None` has not been met. Training will continue...
`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.001584893192461114
Restoring states from the checkpoint path at /home/manujosephv/pytorch_tabular/docs/tutorials/.lr_find_a39eae5d-c288-4a00-ae11-afde402b0d6a.ckpt
Restored all states from the checkpoint at /home/manujosephv/pytorch_tabular/docs/tutorials/.lr_find_a39eae5d-c288-4a00-ae11-afde402b0d6a.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

┏━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃   ┃ Name             ┃ Type                      ┃ Params ┃
┡━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ _backbone        │ CategoryEmbeddingBackbone │  153 K │
│ 1 │ _embedding_layer │ Embedding1dLayer          │    896 │
│ 2 │ head             │ LinearHead                │    119 │
│ 3 │ loss             │ CrossEntropyLoss          │      0 │
└───┴──────────────────┴───────────────────────────┴────────┘
Trainable params: 154 K                                                                                            
Non-trainable params: 0                                                                                            
Total params: 154 K                                                                                                
Total estimated model params size (MB): 0                                                                          
`Trainer.fit` stopped: `max_epochs=50` reached.



<pytorch_lightning.trainer.trainer.Trainer at 0x7f251411f590>
result = tabular_model.evaluate(test)
print(result)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│       test_accuracy           0.9165160655975342     │
│       test_f1_score           0.9165160655975342     │
│         test_loss             0.19816306233406067    │
└───────────────────────────┴───────────────────────────┘


[{'test_loss': 0.19816306233406067, 'test_accuracy': 0.9165160655975342, 'test_f1_score': 0.9165160655975342}]

pred_df = tabular_model.predict(test)
print_metrics(metrics, test[target_col], pred_df["prediction"], tag="Holdout")
Holdout Accuracy: 0.9165160668491076 | Holdout F1: 0.9163006785311444

Extracting the Learned Embeddings

For the models that support it (CategoryEmbeddingModel and CategoryEmbeddingNODE), we can extract the learned embeddings as a scikit-learn-style transformer, which you can use as a drop-in replacement in your scikit-learn pipelines and workflows.

transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transform = transformer.fit_transform(train)
clf = lgb.LGBMClassifier(random_state=42, verbose=-1)
clf.fit(train_transform.drop(columns=target_col), train_transform[target_col])
/home/manujosephv/pytorch_tabular/src/pytorch_tabular/categorical_encoders.py:188: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  embedding.weight[self._categorical_encoder._mapping[col].loc[key], :]



LGBMClassifier(random_state=42, verbose=-1)

val_transform = transformer.transform(val)
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_metrics = print_metrics(
    metrics, val_transform[target_col], val_pred, "Validation", return_dict=True
)
test_transform = transformer.transform(test)
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_metrics = print_metrics(
    metrics, test_transform[target_col], test_pred, "Holdout", return_dict=True
)
Validation Accuracy: 0.8600552481433353 | Validation F1: 0.8595332390375023
Holdout Accuracy: 0.8595721244040551 | Holdout F1: 0.8590314305879913
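
Since the transformer follows the scikit-learn API, it should also slot straight into a Pipeline. Below is a minimal sketch under that assumption (`pipe` and `holdout_pred` are illustrative names, and we assume the transformer's `fit` accepts the usual `(X, y)` signature):

from sklearn.pipeline import Pipeline

# Chain the embedding transformer with a LightGBM classifier;
# `tabular_model` is the trained TabularModel from above.
pipe = Pipeline(
    [
        ("embed", CategoricalEmbeddingTransformer(tabular_model)),
        ("clf", lgb.LGBMClassifier(random_state=42, verbose=-1)),
    ]
)
pipe.fit(train.drop(columns=target_col), train[target_col])
holdout_pred = pipe.predict(test.drop(columns=target_col))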

results.append(
    {
        "Mode": "NeuralEmbedding",
        "Validation Acc": val_metrics["Accuracy"],
        "Validation F1": val_metrics["F1"],
        "Holdout Acc": holdout_metrics["Accuracy"],
        "Holdout F1": holdout_metrics["F1"],
    }
)
res_df = pd.DataFrame(results).T
res_df.columns = res_df.iloc[0]
res_df = res_df.iloc[1:].astype(float)
res_df
Mode             OneHot Encoding  NeuralEmbedding
Validation Acc          0.856140         0.860055
Validation F1           0.855536         0.859533
Holdout Acc             0.855588         0.859572
Holdout F1              0.854876         0.859031

On both the validation and holdout sets, the learned embeddings lift LightGBM's accuracy and F1 by roughly 0.4 percentage points over one-hot encoding.