Skip to content

实验跟踪在机器学习中是至关重要的,因为它使数据科学家和研究人员能够有效地管理和重现他们的实验。通过跟踪实验的各个方面,例如超参数、模型架构和训练数据,更容易理解和解释结果。实验跟踪还允许团队成员之间更好的协作和知识共享,因为它提供了一个实验及其相关元数据的集中存储库。此外,跟踪实验有助于调试和故障排除,因为它可以识别导致成功或失败结果的特定设置或条件。总体而言,实验跟踪在确保机器学习工作流的透明性、可重现性和持续改进方面发挥着至关重要的作用。

现在,让我们看看如何使用PyTorch Tabular和Tensorboard免费获得所有这些好处(Tensorboard在PyTorch Lightning中预安装)。虽然不如Weights and Biases功能丰富,但Tensorboard是一个经典的离线跟踪解决方案。

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
from pytorch_tabular.utils import load_covertype_dataset, print_metrics
import pandas as pd
import wandb

# %load_ext autoreload
# %autoreload 2
# Load the Forest Covertype dataset together with its column metadata:
# categorical column names, continuous column names, and the target column.
data, cat_col_names, num_col_names, target_col = load_covertype_dataset()
# Hold out a test set, then split the remainder into train/validation.
# Fixed random_state makes the splits reproducible across runs.
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

导入库

from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig,
    FTTransformerConfig,
    TabNetModelConfig,
    GANDALFConfig,
)
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)
from pytorch_tabular.models.common.heads import LinearHeadConfig

常见配置

# Shared configs reused by every model run below (only model_config and
# experiment_config change per run).
data_config = DataConfig(
    target=[
        target_col
    ],  # target should always be a list. Multi-targets are only supported for regression. Multi-task classification is not implemented yet.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # Monitor valid_loss for early stopping
    early_stopping_mode="min",  # Set mode to min because lower valid_loss is better
    early_stopping_patience=5,  # No. of epochs of degradation training will wait before terminating
    checkpoints="valid_loss",  # Save best checkpoint monitoring valid_loss
    load_best=True,  # After training, load the best checkpoint
)
optimizer_config = OptimizerConfig()

head_config = LinearHeadConfig(
    layers="",  # No additional layer in the head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

# Shared project name so all runs below land in the same Tensorboard logdir.
EXP_PROJECT_NAME = "pytorch-tabular-covertype"

类别嵌入模型

# Run 1: an MLP over learned categorical embeddings, logged to Tensorboard.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layer
    learning_rate=1e-3,
    head="LinearHead",  # Linear Head
    head_config=head_config,  # Linear Head Config
)

experiment_config = ExperimentConfig(
    project_name=EXP_PROJECT_NAME,
    run_name="CategoryEmbeddingModel",  # distinguishes this run within the project
    log_target="tensorboard",  # log to Tensorboard (offline) instead of W&B
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
    verbose=False,
    suppress_lightning_logger=True,
)
tabular_model.fit(train=train, validation=val)
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]
┏━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    Name              Type                       Params ┃
┡━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ _backbone        │ CategoryEmbeddingBackbone │  823 K │
│ 1 │ _embedding_layer │ Embedding1dLayer          │    896 │
│ 2 │ head             │ LinearHead                │  3.6 K │
│ 3 │ loss             │ CrossEntropyLoss          │      0 │
└───┴──────────────────┴───────────────────────────┴────────┘
Trainable params: 827 K                                                                                            
Non-trainable params: 0                                                                                            
Total params: 827 K                                                                                                
Total estimated model params size (MB): 3                                                                          
Output()


<pytorch_lightning.trainer.trainer.Trainer at 0x7f485edf3690>
# Compute metrics (test_accuracy, test_loss) on the held-out test split.
result = tabular_model.evaluate(test)
Output()
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│       test_accuracy           0.9159672856330872     │
│         test_loss             0.21389885246753693    │
└───────────────────────────┴───────────────────────────┘


FT Transformer

# Run 2: FT-Transformer, logged under the same Tensorboard project for comparison.
model_config = FTTransformerConfig(
    task="classification",
    num_attn_blocks=3,
    num_heads=4,
    learning_rate=1e-3,
    head="LinearHead",  # Linear Head
    head_config=head_config,  # Linear Head Config
)

experiment_config = ExperimentConfig(
    project_name=EXP_PROJECT_NAME,
    run_name="FTTransformer",  # distinguishes this run within the project
    log_target="tensorboard",
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
    verbose=False,
    suppress_lightning_logger=True,
)
tabular_model.fit(train=train, validation=val)
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]
┏━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    Name              Type                   Params ┃
┡━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ _backbone        │ FTTransformerBackbone │ 86.5 K │
│ 1 │ _embedding_layer │ Embedding2dLayer      │  2.2 K │
│ 2 │ _head            │ LinearHead            │    231 │
│ 3 │ loss             │ CrossEntropyLoss      │      0 │
└───┴──────────────────┴───────────────────────┴────────┘
Trainable params: 89.0 K                                                                                           
Non-trainable params: 0                                                                                            
Total params: 89.0 K                                                                                               
Total estimated model params size (MB): 0                                                                          
Output()


<pytorch_lightning.trainer.trainer.Trainer at 0x7f485f1105d0>
# Compute metrics (test_accuracy, test_loss) on the held-out test split.
result = tabular_model.evaluate(test)
Output()
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│       test_accuracy           0.9120706915855408     │
│         test_loss             0.21334321796894073    │
└───────────────────────────┴───────────────────────────┘


GANDALF 模型

# Run 3: GANDALF, logged under the same Tensorboard project for comparison.
model_config = GANDALFConfig(
    task="classification",
    gflu_stages=10,
    learning_rate=1e-3,
    head="LinearHead",  # Linear Head
    head_config=head_config,  # Linear Head Config
)

experiment_config = ExperimentConfig(
    project_name=EXP_PROJECT_NAME,
    run_name="GANDALF",  # distinguishes this run within the project
    log_target="tensorboard",
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
    verbose=False,
    suppress_lightning_logger=True,
)
tabular_model.fit(train=train, validation=val)
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]
┏━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    Name              Type              Params ┃
┡━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ _backbone        │ GANDALFBackbone  │ 70.7 K │
│ 1 │ _embedding_layer │ Embedding1dLayer │    896 │
│ 2 │ _head            │ Sequential       │    252 │
│ 3 │ loss             │ CrossEntropyLoss │      0 │
└───┴──────────────────┴──────────────────┴────────┘
Trainable params: 71.9 K                                                                                           
Non-trainable params: 0                                                                                            
Total params: 71.9 K                                                                                               
Total estimated model params size (MB): 0                                                                          
Output()


<pytorch_lightning.trainer.trainer.Trainer at 0x7f485f0f6990>
# Compute metrics (test_accuracy, test_loss) on the held-out test split.
result = tabular_model.evaluate(test)
Output()
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│       test_accuracy           0.8669493794441223     │
│         test_loss             0.32519233226776123    │
└───────────────────────────┴───────────────────────────┘


访问实验

我们可以通过以下步骤来访问运行结果:

  1. 打开终端,并导航到您保存笔记本的目录。
  2. 运行以下命令: tensorboard --logdir <项目名称>。在本例中,项目名称是 pytorch-tabular-covertype,因此命令为 tensorboard --logdir pytorch-tabular-covertype。这将启动一个本地服务器,提供该文件夹中所有运行的结果,并给出一个访问 Tensorboard UI 的链接。
  3. 在浏览器中打开该链接。