import category_encoders as ce
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from pytorch_tabular.utils import print_metrics, load_covertype_dataset
# %load_ext autoreload
# %autoreload 2
def load_classification_data(test_fraction=0.2, random_state=42):
    """Load the Covertype data and split it into train and hold-out frames.

    Args:
        test_fraction: Fraction of the rows sampled into the hold-out frame.
            The row count is truncated with ``int(test_fraction * len(data))``.
        random_state: Seed forwarded to ``DataFrame.sample`` so the split is
            reproducible.

    Returns:
        A tuple ``(train, test, cat_col_names, num_col_names, target)``. The
        last three items are passed through unchanged from
        ``load_covertype_dataset``.
    """
    data, cat_col_names, num_col_names, target = load_covertype_dataset()
    test_idx = data.sample(
        int(test_fraction * len(data)), random_state=random_state
    ).index
    # Build the membership mask once and reuse it for both halves of the split.
    in_test = data.index.isin(test_idx)
    test = data[in_test]
    train = data[~in_test]
    return (train, test, cat_col_names, num_col_names, target)
加载森林覆盖数据
基准
让我们使用默认的 LightGBM 模型作为基准。
# Baseline: a LightGBM classifier with only seed, parallelism and verbosity set.
# NOTE(review): `train_transform`, `val_transform`, `test_transform`,
# `target_col` and `metrics` are defined outside this section.
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)
clf.fit(
    X=train_transform.drop(columns=target_col),
    y=train_transform[target_col].values.ravel(),
)

# Score the validation split.
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_metrics = print_metrics(
    metrics,
    val_transform[target_col],
    val_pred,
    "Validation",
    return_dict=True,
)

# Score the hold-out split.
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_metrics = print_metrics(
    metrics,
    test_transform[target_col],
    test_pred,
    "Holdout",
    return_dict=True,
)
类别嵌入模型
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig
# Describe the columns and the preprocessing applied to continuous features.
data_config = DataConfig(
    # The target should always be a list. Multi-target is only supported for
    # regression; multi-task classification is not implemented.
    target=[target_col],
    categorical_cols=cat_col_names,
    continuous_cols=num_col_names,
    normalize_continuous_features=True,
    continuous_feature_transform="quantile_normal",
)
# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# Training-loop settings.
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=50,
    auto_lr_find=True,  # Run the LR finder to derive a learning rate automatically
    accelerator="auto",  # can be 'cpu','gpu', 'tpu', or 'ipu'
    devices=-1,  # -1 means use all available devices
)
# Build the head configuration and convert it to a plain dict so it can be
# passed to the model config (OmegaConf does not accept objects).
head_config = vars(
    LinearHeadConfig(
        layers="",  # No extra layers in the head, only a mapping layer to `output_dim`
        dropout=0.1,
        initialization="kaiming",
    )
)
# Category-embedding model for the classification task.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    # Backbone
    layers="512-256-16",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between layers
    dropout=0.1,
    initialization="kaiming",
    # Head
    head="LinearHead",  # Linear head
    head_config=head_config,  # Linear head config
    # Optimisation and reporting
    learning_rate=1e-3,
    # The three lists below are index-aligned: one entry per metric.
    metrics=["accuracy", "f1_score"],
    metrics_params=[{}, {"average": "micro"}],
    metrics_prob_input=[False, True],
)
# Assemble the model from the four configuration objects.
tabular_model = TabularModel(
    verbose=False,
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
提取学习到的嵌入
对于支持的模型(CategoryEmbeddingModel和CategoryEmbeddingNODE),我们可以将学习到的嵌入提取为一个scikit-learn风格的变换器。您可以在您的scikit-learn管道和工作流程中使用它作为直接替代。
# Encode each split with `transformer`, then score it with `clf`.
# NOTE(review): `transformer`, `clf`, `val`, `test`, `target_col` and `metrics`
# are defined outside this section; presumably `transformer` is the fitted
# CategoricalEmbeddingTransformer. Confirm against the full notebook.

# Validation split.
val_transform = transformer.transform(val)
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_metrics = print_metrics(
    metrics,
    val_transform[target_col],
    val_pred,
    "Validation",
    return_dict=True,
)

# Hold-out split.
test_transform = transformer.transform(test)
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_metrics = print_metrics(
    metrics,
    test_transform[target_col],
    test_pred,
    "Holdout",
    return_dict=True,
)