Bagging predictions with PyTorch Tabular
import warnings

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from pytorch_tabular.utils import load_covertype_dataset
# Load the Forest Covertype dataset together with the categorical /
# numerical feature column-name lists and the target column name.
data, cat_col_names, num_col_names, target_col = load_covertype_dataset()

导入库

from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
# Hold out a test split (sklearn default test_size, 25%) with a fixed
# seed for reproducibility.
train, test = train_test_split(data, random_state=42)

装袋预测

这是指我们在数据的不同折叠上训练不同的模型,然后将它们最终组合在一起以得到最终预测。这是一种非常强大的技术,可以用来提高模型的准确性,也是Kaggle中非常常见的做法。

# Tell pytorch_tabular which columns are the target, the continuous
# features, and the categorical features.
data_config = DataConfig(
    target=[
        target_col
    ],  # target should always be a list. Multi-targets are only supported
    # for regression; multi-task classification is not implemented yet.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # monitor validation loss for early stopping
    early_stopping_mode="min",  # "min" because lower valid_loss is better
    early_stopping_patience=5,  # epochs without improvement before stopping
    checkpoints="valid_loss",  # save the best checkpoint by validation loss
    load_best=True,  # after training, load the best checkpoint
    progress_bar="none",  # turn off the progress bar
    trainer_kwargs=dict(enable_model_summary=False),  # turn off model summary
)
# Use the library's default optimizer settings.
optimizer_config = OptimizerConfig()

# Head configuration: `layers=""` adds no extra hidden layers, i.e. a single
# mapping straight to the output dimension, with light dropout and Kaiming
# initialization. Converted to a plain dict because the model config
# (OmegaConf-backed) does not accept the config object itself.
head_config = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization="kaiming",
).__dict__

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # number of nodes in each layer
    activation="LeakyReLU",  # activation between each layer
    learning_rate=1e-3,
    head="LinearHead",  # linear head on top of the backbone
    head_config=head_config,  # config for the linear head
)

# Assemble the four configs into a TabularModel, train it on the training
# split, and predict on the held-out test split.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
    verbose=False,
)
tabular_model.fit(train=train)
pred_df = tabular_model.predict(test)
Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

我们可以使用 TabularModel 中的高级方法 bagging_predict

参数如下: - cv 可以是一个整数或一个 KFold 对象。如果是整数,将被视为 KFold 中的折数。对于分类问题,将为 StratifiedKFold。如果是 KFold 对象,将按原样使用。 - train 是训练数据集。 - test 是测试数据集。 - aggregate 是我们集成预测的方式。可以是 `mean`、`median`、`min`、`max` 或 `hard_voting`。`hard_voting` 仅对分类有效。我们还可以传入一个自定义函数,该函数接受一个 3D 数组(样本数,折数,目标数)并返回一个 2D 数组的最终概率(样本数,目标数)。 - weights 用于聚合每个折的预测。如果为 None,将使用相等权重。这仅在 aggregate 为 "mean" 时使用。 - return_raw_predictions 如果为 True,将返回每个折的原始预测。默认值为 False。

有关所有参数的完整列表,请参阅文档字符串。

# Cross-validated bagging: train one model per fold and aggregate the
# per-fold predictions ("mean" averages the fold probabilities).
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

# NOTE(review): the original cell built `kf = KFold(n_splits=5, ...)` but never
# used it — `bagging_predict` receives `cv=3`, an integer, which (per the docs
# quoted above) is turned into a StratifiedKFold internally for classification.
# The dead `kf` assignment is removed to avoid misleading readers.

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # suppress the repeated dataloader warnings seen above
    bagged_pred_df = tabular_model.bagging_predict(
        cv=3, train=train, test=test, aggregate="mean"
    )
2023-12-31 18:04:38,344 - {pytorch_tabular.tabular_model:2276} - INFO - Running Fold 1/3                           
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2023-12-31 18:08:17,581 - {pytorch_tabular.tabular_model:2307} - INFO - Fold 1/3 prediction done                   
2023-12-31 18:08:17,590 - {pytorch_tabular.tabular_model:2276} - INFO - Running Fold 2/3                           
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2023-12-31 18:13:21,030 - {pytorch_tabular.tabular_model:2307} - INFO - Fold 2/3 prediction done                   
2023-12-31 18:13:21,039 - {pytorch_tabular.tabular_model:2276} - INFO - Running Fold 3/3                           
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2023-12-31 18:16:38,902 - {pytorch_tabular.tabular_model:2307} - INFO - Fold 3/3 prediction done                   
# Compare the single model's accuracy against the bagged ensemble's.
y_true = test[target_col].values
orig_acc = accuracy_score(y_true, pred_df["prediction"].values)
bagged_acc = accuracy_score(y_true, bagged_pred_df["prediction"].values)
print(f"Original Accuracy: {orig_acc} | Bagged Accuracy: {bagged_acc}")
Original Accuracy: 0.9506860443502028 | Bagged Accuracy: 0.9533778992516506