Skip to content
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
from pytorch_tabular.utils import load_covertype_dataset
from rich import print
# %load_ext autoreload
# %autoreload 2
# Load the Covertype data together with the column-name lists and the target
# column name that the configs below consume.
data, cat_col_names, num_col_names, target_col = load_covertype_dataset()

# Two successive splits with the same seed: first hold out a test set, then
# carve a validation set out of the remaining rows.
SPLIT_SEED = 42
train_val, test = train_test_split(data, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, random_state=SPLIT_SEED)

导入库

from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

测试时增强

测试时增强(TTA)是计算机视觉中一种流行的技术。TTA的目的是通过在推理阶段使用数据增强来提高模型的准确性。TTA背后的理念很简单:对于每一张测试图像,我们创建多个与原始图像稍有不同的版本(例如,裁剪或翻转)。接下来,我们对测试图像及其创建的副本进行预测,并对每张图像的多个版本的模型预测结果进行平均。这通常有助于提高准确性,而不管基础模型如何。

有关更多细节,请参考此链接:表格数据的测试时增强

results = []  # not referenced again in the visible part of this script

# The target must always be given as a list. Multiple targets are supported
# only for regression; multi-task classification is not implemented yet.
target_columns = [target_col]
data_config = DataConfig(
    categorical_cols=cat_col_names,
    continuous_cols=num_col_names,
    target=target_columns,
)
# Early stopping and checkpointing both track the same metric: validation loss.
MONITORED_METRIC = "valid_loss"
trainer_config = TrainerConfig(
    max_epochs=100,
    batch_size=1024,
    early_stopping=MONITORED_METRIC,  # monitor validation loss for early stopping
    early_stopping_mode="min",  # "min" because a lower validation loss is better
    early_stopping_patience=5,  # epochs of degraded training to wait before stopping
    checkpoints=MONITORED_METRIC,  # save the best checkpoint, judged by validation loss
    load_best=True,  # after training, load the best checkpoint
    # Optional switches, left disabled here:
    #   progress_bar="none",  # turn off the progress bar
    #   trainer_kwargs=dict(
    #       enable_model_summary=False,  # turn off the model summary
    #   ),
)
optimizer_config = OptimizerConfig()

# The head has no extra layers (layers=""): only a mapping layer whose output
# dimension is output_dim.
linear_head = LinearHeadConfig(layers="", dropout=0.1, initialization="kaiming")
# Convert to a plain dict before handing it to the model config
# (OmegaConf does not accept objects).
head_config = vars(linear_head)

# Settings shared by both model configs below; the two are built from exactly
# the same arguments.
mlp_settings = dict(
    task="classification",
    layers="1024-512-512",  # number of nodes in each layer
    activation="LeakyReLU",  # activation between layers
    learning_rate=1e-3,
    head="LinearHead",  # linear head
    head_config=head_config,  # linear head configuration
)

# NOTE(review): only `model_config` is passed to TabularModel in the visible
# code; `model1_config` looks unused here — confirm before removing it.
model1_config = CategoryEmbeddingModelConfig(**mlp_settings)
model_config = CategoryEmbeddingModelConfig(**mlp_settings)

# Assemble the TabularModel from the data, model, optimizer and trainer configs.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False
)
# Step-by-step training: build the datamodule from the train/validation frames,
# build the model from that datamodule, then train the two together.
datamodule = tabular_model.prepare_dataloader(train=train, validation=val, seed=42)
model = tabular_model.prepare_model(datamodule)
# Left as the last expression of the cell: the captured output that follows
# shows the returned pytorch_lightning Trainer object.
tabular_model.train(model, datamodule)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

┏━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    Name              Type                       Params ┃
┡━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ _backbone        │ CategoryEmbeddingBackbone │  823 K │
│ 1 │ _embedding_layer │ Embedding1dLayer          │    896 │
│ 2 │ head             │ LinearHead                │  3.6 K │
│ 3 │ loss             │ CrossEntropyLoss          │      0 │
└───┴──────────────────┴───────────────────────────┴────────┘
Trainable params: 827 K                                                                                            
Non-trainable params: 0                                                                                            
Total params: 827 K                                                                                                
Total estimated model params size (MB): 3                                                                          
Output()
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connecto
rs/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider 
increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connecto
rs/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider 
increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


<pytorch_lightning.trainer.trainer.Trainer at 0x7f56e2806610>
# Predict on the same test frame twice: once as-is, and once with test-time
# augmentation (TTA) switched on.
pred_df = tabular_model.predict(test)
tta_pred_df = tabular_model.predict(
    test,
    test_time_augmentation=True,
    num_tta=5,
    alpha_tta=0.005,
)

# Compute metrics against the same ground-truth labels.
y_true = test[target_col].values
orig_acc = accuracy_score(y_true, pred_df["prediction"].values)
tta_acc = accuracy_score(y_true, tta_pred_df["prediction"].values)
print(f"Original Accuracy: {orig_acc} | TTA Accuracy: {tta_acc}")
Original Accuracy: 0.9450889138262205 | TTA Accuracy: 0.9450062993535417