import warnings

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from pytorch_tabular.utils import make_mixed_dataset

# Build a synthetic mixed-type dataset for a classification task. The call
# returns three values, unpacked here as the data plus the categorical and
# numerical column names that are handed to DataConfig further down.
data, cat_col_names, num_col_names = make_mixed_dataset(
    task="classification", n_samples=10000, n_features=20, n_categories=4
)

导入库

from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

# Split the data into train and test parts; the fixed random_state keeps the
# split reproducible. Only `train` is used by the cross-validation code visible
# in this file.
train, test = train_test_split(data, random_state=42)

交叉验证

# Column roles for the data module.
data_config = DataConfig(
    # The target must always be a list. Multi-target is supported only for
    # regression; multi-task classification is not implemented yet.
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# Training-loop settings.
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    # Early stopping monitors the validation loss; "min" because a lower
    # validation loss is better.
    early_stopping="valid_loss",
    early_stopping_mode="min",
    # Number of epochs without improvement to wait before stopping.
    early_stopping_patience=5,
    # Save the best checkpoint according to validation loss and load it back
    # once training has finished.
    checkpoints="valid_loss",
    load_best=True,
    # Keep the output quiet: no progress bar and no model summary.
    progress_bar="none",
    trainer_kwargs={"enable_model_summary": False},
)

# Optimizer settings are left at their defaults.
optimizer_config = OptimizerConfig()

# The head config is converted to a plain dict before it is passed to the model
# config (OmegaConf does not accept objects).
head_config = vars(
    LinearHeadConfig(
        # No extra layers in the head: just a mapping layer to the output
        # dimension.
        layers="",
        dropout=0.1,
        initialization="kaiming",
    )
)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    # Number of nodes in each layer.
    layers="1024-512-512",
    # Activation used between the layers.
    activation="LeakyReLU",
    learning_rate=1e-3,
    # Use a linear head, configured by the dict built above.
    head="LinearHead",
    head_config=head_config,
)

# Model wrapper used by the high-level cross_validate call.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)

使用高级 API

我们可以使用 `TabularModel` 提供的高级方法 `cross_validate`。

参数如下：

- `cv`：可以是一个整数或 `KFold` 对象。如果是整数，它将被视为 `KFold` 的折数；对于分类问题，将使用 `StratifiedKFold`。如果是 `KFold` 对象，则按原样使用。
- `metric`：用于评估的指标。它可以是一个字符串（指标名称）或一个可调用对象。如果是可调用对象，它应该接受两个参数，依次为目标值和预测结果（例如下文的 `_accuracy(y_true, y_pred)`）。预测结果应为 `model.predict` 输出的数据框，目标值可以是一个 Series 或数组。
- `train`：训练数据集。
- `return_oof`：布尔值。如果设置为 True，它将返回训练数据集的折外（out-of-fold）预测。这对于模型堆叠很有用。
- `reset_datamodule`：布尔值。如果设置为 True，它将在每个折叠后重置数据模块，这才是进行交叉验证的正确方式。如果设置为 False，它将不会重置数据模块，速度会更快，但会有少量数据泄露。这在处理大型数据集、需要节省时间时很有用。

# Imports for the cross-validation examples (sklearn splitter and metrics).
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

# 5-fold shuffled splitter with a fixed seed.
# NOTE(review): this splitter is not passed to cross_validate below, which is
# called with cv=2; an identical splitter is re-created before the manual
# cross-validation loop later in the file.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _accuracy(y_true, y_pred):
    """Return the accuracy of the predicted labels against ``y_true``.

    ``y_pred`` is indexed with ``"prediction"`` and the ``.values`` of that
    column are scored, so it is expected to be a dataframe-like prediction
    output (presumably what the model's ``predict`` returns -- confirm).
    """
    predicted_labels = y_pred["prediction"].values
    return accuracy_score(y_true, predicted_labels)


# Silence every warning raised while cross-validating; the filter is restored
# when the context manager exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # 2-fold cross-validation on the training split, scored with the custom
    # accuracy metric. return_oof=True also returns the out-of-fold
    # predictions; reset_datamodule=False reuses the data module across folds
    # (faster, at the cost of the small leakage described in the text above).
    cv_scores, oof_predictions = tabular_model.cross_validate(
        cv=2, train=train, metric=_accuracy, return_oof=True, reset_datamodule=False
    )

2023-12-31 13:08:18,468 - {pytorch_tabular.tabular_model:1925} - INFO - Running Fold 1/2

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2023-12-31 13:08:22,376 - {pytorch_tabular.tabular_model:1952} - INFO - Fold 1/2 score: 0.908

2023-12-31 13:08:22,383 - {pytorch_tabular.tabular_model:1925} - INFO - Running Fold 2/2

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2023-12-31 13:08:24,704 - {pytorch_tabular.tabular_model:1952} - INFO - Fold 2/2 score: 0.9517333333333333

# Summarize the per-fold scores. np.std is called with its defaults, i.e. the
# population standard deviation (ddof=0).
print(f"KFold Mean: {np.mean(cv_scores)} | KFold SD: {np.std(cv_scores)}")

KFold Mean: 0.9298666666666666 | KFold SD: 0.021866666666666645

使用低级 API

有时，我们需要的不只是简单、基础的交叉验证。例如，我们可能想用多个指标进行交叉验证，或者使用依赖于目标值和预测值以外信息的自定义指标进行交叉验证。在这种情况下，我们可以使用低级 API。

from rich import print

def _accuracy(y_true, y_pred):
    """Score the ``"prediction"`` column of ``y_pred`` against ``y_true``.

    Returns whatever ``accuracy_score`` returns for the true labels and the
    ``.values`` of the prediction column.
    """
    labels_hat = y_pred["prediction"].values
    return accuracy_score(y_true, labels_hat)


def _roc_auc_score(y_true, y_pred):
    """Return the ROC AUC of ``y_true`` scored by ``class_1_probability``.

    The ``"class_1_probability"`` column of ``y_pred`` is used as the score,
    so this assumes a binary target whose positive class is labelled 1
    (TODO confirm against the dataset).
    """
    positive_scores = y_pred["class_1_probability"].values
    return roc_auc_score(y_true, positive_scores)


# 5-fold shuffled splitter with a fixed seed, driven manually in the loop below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialise the tabular model once; it is reused for every fold.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)

# Per-fold metric values. `preds` is created but never filled in this loop.
acc_metrics, roc_metrics, preds = [], [], []
# Both are created inside the first fold and reused afterwards.
datamodule = model = None

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
        train_fold, val_fold = train.iloc[train_idx], train.iloc[val_idx]
        if datamodule is not None:
            # Later folds: copy the data module so it keeps the same
            # transformers but carries this fold's train/validation data.
            datamodule = datamodule.copy(train=train_fold, validation=val_fold)
        else:
            # First fold: initialise the data module and the model. All
            # transformers are fitted on the data from this fold.
            datamodule = tabular_model.prepare_dataloader(
                train=train_fold, validation=val_fold, seed=42
            )
            model = tabular_model.prepare_model(datamodule)

        # Train on this fold, then score the predictions for the validation part.
        tabular_model.train(model, datamodule)
        pred_df = tabular_model.predict(val_fold)
        acc_metrics.append(_accuracy(val_fold["target"], pred_df))
        roc_metrics.append(_roc_auc_score(val_fold["target"], pred_df))
        print(
            f"[bold red]Fold:[/bold red] {fold} | [bold green]Accuracy:[/bold green]"
            f" {acc_metrics[-1]} | [bold green]AUC:[/bold green] {roc_metrics[-1]}"
        )

        # Reset the trained weights before the next fold starts.
        tabular_model.model.reset_weights()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Fold: 0 | Accuracy: 0.9293333333333333 | AUC: 0.9807391279599271

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Fold: 1 | Accuracy: 0.9146666666666666 | AUC: 0.9736274684219891

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Fold: 2 | Accuracy: 0.924 | AUC: 0.9730588808512757

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Fold: 3 | Accuracy: 0.922 | AUC: 0.9757440627005844

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Fold: 4 | Accuracy: 0.9166666666666666 | AUC: 0.9743804540010267

# Report mean and standard deviation of both metrics across the folds.
# `print` here is the one imported from `rich` earlier in this section.
print(
    f"KFold Accuracy Mean: {np.mean(acc_metrics)} | KFold Accuracy SD:"
    f" {np.std(acc_metrics)}"
)
print(f"KFold AUC Mean: {np.mean(roc_metrics)} | KFold AUC SD: {np.std(roc_metrics)}")

KFold Accuracy Mean: 0.9213333333333333 | KFold Accuracy SD: 0.005249338582674566

KFold AUC Mean: 0.9755099987869607 | KFold AUC SD: 0.002765008099674828