使用 PyTorch Lightning 与 Tune
PyTorch Lightning 是一个将结构引入 PyTorch 模型训练的框架。它旨在避免样板代码,因此在构建新模型时,无需重复编写相同的训练循环。
PyTorch Lightning 的主要抽象是 LightningModule 类,你的应用程序应当继承(扩展)该类。有一篇优秀的文章介绍了如何将模型从普通 PyTorch 迁移到 Lightning。
PyTorch Lightning 的类结构使得定义和调整模型参数变得非常容易。本教程将以训练一个 MNIST 分类器为例,向你展示如何将 Ray Train 的 TorchTrainer 与 Tune 结合使用,以找到最佳参数集。值得注意的是,LightningModule 本身完全不需要为此做任何修改——只要模型的参数是可配置的,你就可以将其即插即用地用于现有模型。
要运行此示例,需要先安装以下依赖:
$ pip install "ray[tune]" torch torchvision pytorch_lightning
MNIST 的 PyTorch Lightning 分类器
让我们首先从一个基本的 PyTorch Lightning 实现的 MNIST 分类器开始。此分类器此时不包含任何调优代码。
import os
import torch
import tempfile
import pytorch_lightning as pl
import torch.nn.functional as F
from filelock import FileLock
from torchmetrics import Accuracy
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms
我们的示例基于之前提到的博客文章中的 MNIST 示例。我们将原始的模型和数据集定义改写为 MNISTClassifier 和 MNISTDataModule。
class MNISTClassifier(pl.LightningModule):
    """Fully connected MNIST classifier with two configurable hidden layers.

    Args:
        config: Mapping that provides ``"layer_1_size"`` and
            ``"layer_2_size"`` (hidden layer widths) and ``"lr"`` (the
            learning rate handed to Adam).
    """

    def __init__(self, config):
        super().__init__()
        self.accuracy = Accuracy(task="multiclass", num_classes=10, top_k=1)

        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]

        # MNIST images are (1, 28, 28) (channels, width, height)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)

        # Per-batch validation results; filled by validation_step and
        # averaged, then emptied, in on_validation_epoch_end.
        self.eval_loss = []
        self.eval_accuracy = []

    def cross_entropy_loss(self, logits, labels):
        """Return the negative log-likelihood loss.

        ``forward`` already applies ``log_softmax``, so NLL on its output is
        the cross-entropy of the underlying logits.
        """
        return F.nll_loss(logits, labels)

    def forward(self, x):
        """Flatten a 4-D image batch and return per-class log-probabilities."""
        # The 4-way unpack also rejects inputs that are not 4-dimensional.
        batch_size, _channels, _width, _height = x.size()

        x = x.view(batch_size, -1)
        x = torch.relu(self.layer_1(x))
        x = torch.relu(self.layer_2(x))
        x = self.layer_3(x)
        return torch.log_softmax(x, dim=1)

    def training_step(self, train_batch, batch_idx):
        """Compute and log loss/accuracy for one training batch; return the loss."""
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Evaluate one validation batch and record its loss and accuracy."""
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        # Without these appends on_validation_epoch_end would call
        # torch.stack on an empty list and fail.
        self.eval_loss.append(loss)
        self.eval_accuracy.append(accuracy)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def on_validation_epoch_end(self):
        """Log the epoch-mean validation loss and accuracy, then reset the buffers."""
        avg_loss = torch.stack(self.eval_loss).mean()
        avg_acc = torch.stack(self.eval_accuracy).mean()
        self.log("ptl/val_loss", avg_loss, sync_dist=True)
        self.log("ptl/val_accuracy", avg_acc, sync_dist=True)

        # Start the next epoch from empty buffers so that earlier epochs do
        # not leak into later averages (and the lists do not grow unbounded).
        self.eval_loss.clear()
        self.eval_accuracy.clear()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
class MNISTDataModule(pl.LightningDataModule):
    """Data module that downloads MNIST into a fresh temporary directory.

    Args:
        batch_size: Batch size used by the train, validation and test loaders.
    """

    def __init__(self, batch_size=128):
        # The base-class initializer must run so Lightning can set up the
        # data module's internal state.
        super().__init__()
        self.data_dir = tempfile.mkdtemp()
        self.batch_size = batch_size
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def setup(self, stage=None):
        """Download MNIST and build the train/validation/test datasets.

        ``stage`` is accepted for hook compatibility and is not used.
        """
        # The file lock serializes download and extraction for any processes
        # that share the same data directory.
        with FileLock(f"{self.data_dir}.lock"):
            mnist = MNIST(
                self.data_dir, train=True, download=True, transform=self.transform
            )
            self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])

            self.mnist_test = MNIST(
                self.data_dir, train=False, download=True, transform=self.transform
            )

    def train_dataloader(self):
        """Return the loader over the 55000-sample training split."""
        return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)

    def val_dataloader(self):
        """Return the loader over the 5000-sample validation split."""
        return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        """Return the loader over the MNIST test set."""
        return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
# Baseline hyperparameters for MNISTClassifier (hidden layer widths and
# learning rate) outside of any tuning run.
default_config = {
    "layer_1_size": 128,
    "layer_2_size": 256,
    "lr": 1e-3,
}
定义一个训练函数,该函数使用Ray Train工具创建模型、数据模块和Lightning训练器。
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)


def train_func(config):
    """Train one MNISTClassifier on a Ray Train worker.

    Args:
        config: Hyperparameter mapping. ``"batch_size"`` is consumed by the
            data module; the whole mapping is forwarded to ``MNISTClassifier``
            (which reads ``"layer_1_size"``, ``"layer_2_size"`` and ``"lr"``).
    """
    dm = MNISTDataModule(batch_size=config["batch_size"])
    model = MNISTClassifier(config)

    trainer = pl.Trainer(
        devices="auto",
        accelerator="auto",
        # Ray Train's Lightning integration: DDP strategy, cluster
        # environment plugin, and a callback that reports metrics and
        # checkpoints back to Ray.
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, datamodule=dm)
from ray import tune
from ray.tune.schedulers import ASHAScheduler
现在我们配置参数搜索空间。我们希望在不同的层维度、学习率和批处理大小之间进行选择。学习率应该在 0.0001 和 0.1 之间采样:tune.loguniform() 会在对数尺度上均匀采样,因此跨越多个数量级的取值(包括较小的值)都能被取到。层维度和批处理大小则通过 tune.choice() 从给定的选项中选取。
# Hyperparameter search space sampled by Tune for every trial.
search_space = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "layer_2_size": tune.choice([64, 128, 256]),
    # Log-uniform so that small learning rates are sampled as readily as
    # large ones across the 1e-4 .. 1e-1 range.
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64]),
}
# Maximum number of training epochs
num_epochs = 5

# Number of samples drawn from the parameter space
num_samples = 10

# ASHA scheduler capped at num_epochs
scheduler = ASHAScheduler(
    max_t=num_epochs,
    grace_period=1,
    reduction_factor=2,
)
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
# Each trial runs on three workers, each reserving one CPU and one GPU.
scaling_config = ScalingConfig(
    num_workers=3, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 1}
)
# Keep only the two best checkpoints per trial, ranked by validation accuracy.
run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_accuracy",
        checkpoint_score_order="max",
    ),
)
from ray.train.torch import TorchTrainer
# Define a TorchTrainer without hyperparameters for the Tuner; the per-trial
# hyperparameters are supplied later through the Tuner's param_space.
ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)
最后,我们需要创建一个 Tuner() 对象,并使用 tuner.fit() 启动 Ray Tune。完整的代码如下:
def tune_mnist_asha(num_samples=10):
    """Run the hyperparameter search with an ASHA scheduler.

    Args:
        num_samples: Number of configurations to sample from ``search_space``.

    Returns:
        Whatever ``tuner.fit()`` returns for the completed search.
    """
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        # TorchTrainer passes "train_loop_config" to train_func as its
        # config argument, so the search space is nested under that key.
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="ptl/val_accuracy",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()


results = tune_mnist_asha(num_samples=num_samples)
Tune Status
Current time: | 2023-09-07 14:03:52 |
Running for: | 00:05:13.92 |
Memory: | 20.5/186.6 GiB |
System Info
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 4.000: 0.9709362387657166 | Iter 2.000: 0.9617255330085754 | Iter 1.000: 0.9477165043354034
Logical resource usage: 4.0/48 CPUs, 3.0/4 GPUs
Logical resource usage: 4.0/48 CPUs, 3.0/4 GPUs (0.0/1.0 accelerator_type:None)
Trial Status
Trial name | status | loc | train_loop_config/batch_size | train_loop_config/layer_1_size | train_loop_config/layer_2_size | train_loop_config/lr | iter | total time (s) | ptl/train_loss | ptl/train_accuracy | ptl/val_loss |
TorchTrainer_5144b_00000 | TERMINATED | | 32 | 64 | 256 | 0.0316233 | 5 | 29.3336 | 0.973613 | 0.766667 | 0.580943 |
TorchTrainer_5144b_00001 | TERMINATED | | 64 | 128 | 64 | 0.0839278 | 1 | 12.2275 | 2.19514 | 0.266667 | 1.56644 |
TorchTrainer_5144b_00002 | TERMINATED | | 32 | 64 | 256 | 0.000233034 | 5 | 29.1314 | 0.146903 | 0.933333 | 0.114229 |
TorchTrainer_5144b_00003 | TERMINATED | | 64 | 128 | 64 | 0.00109259 | 5 | 21.6534 | 0.0474913 | 0.966667 | 0.0714878 |
TorchTrainer_5144b_00004 | TERMINATED | | 32 | 32 | 128 | 0.00114083 | 5 | 29.6367 | 0.0990443 | 0.966667 | 0.0891999 |
TorchTrainer_5144b_00005 | TERMINATED | | 32 | 64 | 64 | 0.00924264 | 4 | 25.7089 | 0.0349707 | 1 | 0.153937 |
TorchTrainer_5144b_00006 | TERMINATED | | 32 | 128 | 256 | 0.00325671 | 5 | 29.5763 | 0.0708755 | 0.966667 | 0.0820903 |
TorchTrainer_5144b_00007 | TERMINATED | | 32 | 32 | 64 | 0.000123766 | 1 | 13.9326 | 0.27464 | 0.966667 | 0.401102 |
TorchTrainer_5144b_00008 | TERMINATED | | 64 | 128 | 256 | 0.00371762 | 5 | 21.8337 | 0.00108961 | 1 | 0.0579874 |
TorchTrainer_5144b_00009 | TERMINATED | | 32 | 128 | 128 | 0.00397956 | 5 | 29.8334 | 0.00940019 | 1 | 0.0685028 |
(autoscaler +7m33s) [autoscaler] Current infeasible resource requests: [multiple bundle groups listed]
(RayTrainWorker pid=64101) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00000_0_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0316_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 42147187.54it/s] [repeated 11x across cluster]
(RayTrainWorker pid=64101) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=64101) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=64102) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00000_0_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0316_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(RayTrainWorker pid=64102) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00000_0_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0316_2023-09-07_13-58-38/checkpoint_000004) [repeated 6x across cluster]
(TorchTrainer pid=71294) Starting distributed worker processes: ['71407 (', '71408 (', '71409 (']
(RayTrainWorker pid=71407) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=71407) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=71407) rank_zero_warn(
(RayTrainWorker pid=71407) GPU available: True, used: True
(RayTrainWorker pid=71407) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=71407) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=71407) HPU available: False, using: 0 HPUs
(RayTrainWorker pid=71408) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00001_1_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0839_2023-09-07_13-58-38/lightning_logs
(RayTrainWorker pid=71408) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=71407) | Name | Type | Params
(RayTrainWorker pid=71407) ------------------------------------------------
(RayTrainWorker pid=71407) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=71407) 1 | layer_1 | Linear | 100 K
(RayTrainWorker pid=71407) 2 | layer_2 | Linear | 8.3 K
(RayTrainWorker pid=71407) 3 | layer_3 | Linear | 650
(RayTrainWorker pid=71407) ------------------------------------------------
(RayTrainWorker pid=71407) 109 K Trainable params
(RayTrainWorker pid=71407) 0 Non-trainable params
(RayTrainWorker pid=71407) 109 K Total params
(RayTrainWorker pid=71407) 0.438 Total estimated model params size (MB)
(RayTrainWorker pid=71407) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=71408) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00001_1_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0839_2023-09-07_13-58-38/checkpoint_000000)
(TorchTrainer pid=73540) Starting distributed worker processes: ['73647 (', '73648 (', '73649 (']
(RayTrainWorker pid=73647) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=73648) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00002_2_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0002_2023-09-07_13-58-38/lightning_logs
(RayTrainWorker pid=73647) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=73647) rank_zero_warn(
(RayTrainWorker pid=73647) GPU available: True, used: True
(RayTrainWorker pid=73647) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=73647) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=73647) HPU available: False, using: 0 HPUs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 88411180.44it/s]
60%|█████▉ | 5931008/9912422 [00:00<00:00, 57942493.14it/s]
(RayTrainWorker pid=73648) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=73647) | Name | Type | Params
(RayTrainWorker pid=73647) ------------------------------------------------
(RayTrainWorker pid=73647) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=73647) 1 | layer_1 | Linear | 50.2 K
(RayTrainWorker pid=73647) 2 | layer_2 | Linear | 16.6 K
(RayTrainWorker pid=73647) 3 | layer_3 | Linear | 2.6 K
(RayTrainWorker pid=73647) ------------------------------------------------
(RayTrainWorker pid=73647) 69.5 K Trainable params
(RayTrainWorker pid=73647) 0 Non-trainable params
(RayTrainWorker pid=73647) 69.5 K Total params
(RayTrainWorker pid=73647) 0.278 Total estimated model params size (MB)
(RayTrainWorker pid=73648) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=73648) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00002_2_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0002_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=73647) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00002_2_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0002_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 45575427.67it/s] [repeated 11x across cluster]
(RayTrainWorker pid=73647) [repeated 4x across cluster]
(RayTrainWorker pid=73647) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=73647) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=73648) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00002_2_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0002_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(RayTrainWorker pid=73648) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00002_2_batch_size=32,layer_1_size=64,layer_2_size=256,lr=0.0002_2023-09-07_13-58-38/checkpoint_000004) [repeated 6x across cluster]
(TorchTrainer pid=80840) Starting distributed worker processes: ['80950 (', '80951 (', '80952 (']
(RayTrainWorker pid=80950) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=80950) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=80950) rank_zero_warn(
(RayTrainWorker pid=80950) GPU available: True, used: True
(RayTrainWorker pid=80950) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=80950) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=80950) HPU available: False, using: 0 HPUs
(RayTrainWorker pid=80950) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00003_3_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0011_2023-09-07_13-58-38/lightning_logs
100%|██████████| 9912422/9912422 [00:00<00:00, 120421348.01it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 111998101.50it/s]
(RayTrainWorker pid=80950) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=80950) | Name | Type | Params
(RayTrainWorker pid=80950) ------------------------------------------------
(RayTrainWorker pid=80950) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=80950) 1 | layer_1 | Linear | 100 K
(RayTrainWorker pid=80950) 2 | layer_2 | Linear | 8.3 K
(RayTrainWorker pid=80950) 3 | layer_3 | Linear | 650
(RayTrainWorker pid=80950) ------------------------------------------------
(RayTrainWorker pid=80950) 109 K Trainable params
(RayTrainWorker pid=80950) 0 Non-trainable params
(RayTrainWorker pid=80950) 109 K Total params
(RayTrainWorker pid=80950) 0.438 Total estimated model params size (MB)
(RayTrainWorker pid=80950) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=80950) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00003_3_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0011_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=80952) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00003_3_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0011_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 39279440.76it/s] [repeated 11x across cluster]
(RayTrainWorker pid=80950)
(RayTrainWorker pid=80952) LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=80952) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=80950) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00003_3_batch_size=64,layer_1_size=128,layer_2_size=64,lr=0.0011_2023-09-07_13-58-38/checkpoint_000003) [repeated 9x across cluster]
(TorchTrainer pid=88077) Starting distributed worker processes: ['88184 (', '88185 (', '88186 (']
(RayTrainWorker pid=88184) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=88186) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00004_4_batch_size=32,layer_1_size=32,layer_2_size=128,lr=0.0011_2023-09-07_13-58-38/lightning_logs
(RayTrainWorker pid=88184) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=88184) rank_zero_warn(
(RayTrainWorker pid=88184) GPU available: True, used: True
(RayTrainWorker pid=88184) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=88184) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=88184) HPU available: False, using: 0 HPUs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 135946084.34it/s]
61%|██████▏ | 6094848/9912422 [00:00<00:00, 60581952.53it/s]
(RayTrainWorker pid=88186) LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=88184) | Name | Type | Params
(RayTrainWorker pid=88184) ------------------------------------------------
(RayTrainWorker pid=88184) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=88184) 1 | layer_1 | Linear | 25.1 K
(RayTrainWorker pid=88184) 2 | layer_2 | Linear | 4.2 K
(RayTrainWorker pid=88184) 3 | layer_3 | Linear | 1.3 K
(RayTrainWorker pid=88184) ------------------------------------------------
(RayTrainWorker pid=88184) 30.6 K Trainable params
(RayTrainWorker pid=88184) 0 Non-trainable params
(RayTrainWorker pid=88184) 30.6 K Total params
(RayTrainWorker pid=88184) 0.123 Total estimated model params size (MB)
(RayTrainWorker pid=88186) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=88186) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00004_4_batch_size=32,layer_1_size=32,layer_2_size=128,lr=0.0011_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=88184) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00004_4_batch_size=32,layer_1_size=32,layer_2_size=128,lr=0.0011_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 47154774.18it/s] [repeated 11x across cluster]
(RayTrainWorker pid=88184) [repeated 2x across cluster]
100%|██████████| 9912422/9912422 [00:00<00:00, 87231776.04it/s]
(RayTrainWorker pid=88184) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=88184) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=88186) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00004_4_batch_size=32,layer_1_size=32,layer_2_size=128,lr=0.0011_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(RayTrainWorker pid=88186) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00004_4_batch_size=32,layer_1_size=32,layer_2_size=128,lr=0.0011_2023-09-07_13-58-38/checkpoint_000004) [repeated 6x across cluster]
(TorchTrainer pid=95388) Starting distributed worker processes: ['95492 (', '95493 (', '95494 (']
(RayTrainWorker pid=95492) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=95492) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=95492) rank_zero_warn(
(RayTrainWorker pid=95492) GPU available: True, used: True
(RayTrainWorker pid=95492) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=95492) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=95492) HPU available: False, using: 0 HPUs
(RayTrainWorker pid=95492) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00005_5_batch_size=32,layer_1_size=64,layer_2_size=64,lr=0.0092_2023-09-07_13-58-38/lightning_logs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 117459779.70it/s]
74%|███████▍ | 7372800/9912422 [00:00<00:00, 73213483.02it/s]
(RayTrainWorker pid=95494) LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=95492) | Name | Type | Params
(RayTrainWorker pid=95492) ------------------------------------------------
(RayTrainWorker pid=95492) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=95492) 1 | layer_1 | Linear | 50.2 K
(RayTrainWorker pid=95492) 2 | layer_2 | Linear | 4.2 K
(RayTrainWorker pid=95492) 3 | layer_3 | Linear | 650
(RayTrainWorker pid=95492) ------------------------------------------------
(RayTrainWorker pid=95492) 55.1 K Trainable params
(RayTrainWorker pid=95492) 0 Non-trainable params
(RayTrainWorker pid=95492) 55.1 K Total params
(RayTrainWorker pid=95492) 0.220 Total estimated model params size (MB)
(RayTrainWorker pid=95494) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=95494) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00005_5_batch_size=32,layer_1_size=64,layer_2_size=64,lr=0.0092_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=95494) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00005_5_batch_size=32,layer_1_size=64,layer_2_size=64,lr=0.0092_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 48598287.67it/s] [repeated 10x across cluster]
(RayTrainWorker pid=95492) [repeated 4x across cluster]
(RayTrainWorker pid=95493) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=95493) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=95494) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00005_5_batch_size=32,layer_1_size=64,layer_2_size=64,lr=0.0092_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(TorchTrainer pid=101434) Starting distributed worker processes: ['101544 (', '101545 (', '101546 (']
(RayTrainWorker pid=101544) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=101545) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00006_6_batch_size=32,layer_1_size=128,layer_2_size=256,lr=0.0033_2023-09-07_13-58-38/lightning_logs
(RayTrainWorker pid=101544) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=101544) rank_zero_warn(
(RayTrainWorker pid=101544) GPU available: True, used: True
(RayTrainWorker pid=101544) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=101544) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=101544) HPU available: False, using: 0 HPUs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 104607984.65it/s]
(RayTrainWorker pid=101545) Extracting /tmp/tmpxobpdr_p/MNIST/raw/train-images-idx3-ubyte.gz to /tmp/tmpxobpdr_p/MNIST/raw
(RayTrainWorker pid=101545) Extracting /tmp/tmpxobpdr_p/MNIST/raw/train-labels-idx1-ubyte.gz to /tmp/tmpxobpdr_p/MNIST/raw
(RayTrainWorker pid=101545) Extracting /tmp/tmpxobpdr_p/MNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/tmpxobpdr_p/MNIST/raw
(RayTrainWorker pid=101545) Extracting /tmp/tmpxobpdr_p/MNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/tmpxobpdr_p/MNIST/raw
(RayTrainWorker pid=101545) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=101544) | Name | Type | Params
(RayTrainWorker pid=101544) ------------------------------------------------
(RayTrainWorker pid=101544) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=101544) 1 | layer_1 | Linear | 100 K
(RayTrainWorker pid=101544) 2 | layer_2 | Linear | 33.0 K
(RayTrainWorker pid=101544) 3 | layer_3 | Linear | 2.6 K
(RayTrainWorker pid=101544) ------------------------------------------------
(RayTrainWorker pid=101544) 136 K Trainable params
(RayTrainWorker pid=101544) 0 Non-trainable params
(RayTrainWorker pid=101544) 136 K Total params
(RayTrainWorker pid=101544) 0.544 Total estimated model params size (MB)
(RayTrainWorker pid=101545) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=101545) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00006_6_batch_size=32,layer_1_size=128,layer_2_size=256,lr=0.0033_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=101546) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00006_6_batch_size=32,layer_1_size=128,layer_2_size=256,lr=0.0033_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 38642046.18it/s] [repeated 11x across cluster]
(RayTrainWorker pid=101544) [repeated 3x across cluster]
(RayTrainWorker pid=101546) LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=101544) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=101545) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00006_6_batch_size=32,layer_1_size=128,layer_2_size=256,lr=0.0033_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(RayTrainWorker pid=101545) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00006_6_batch_size=32,layer_1_size=128,layer_2_size=256,lr=0.0033_2023-09-07_13-58-38/checkpoint_000004) [repeated 6x across cluster]
(TorchTrainer pid=108750) Starting distributed worker processes: ['108861 (', '108862 (', '108863 (']
(RayTrainWorker pid=108861) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=108861) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=108861) rank_zero_warn(
(RayTrainWorker pid=108861) GPU available: True, used: True
(RayTrainWorker pid=108861) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=108861) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=108861) HPU available: False, using: 0 HPUs
(RayTrainWorker pid=108862) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00007_7_batch_size=32,layer_1_size=32,layer_2_size=64,lr=0.0001_2023-09-07_13-58-38/lightning_logs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 111226266.99it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 89971437.39it/s]
(RayTrainWorker pid=108862) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=108861) | Name | Type | Params
(RayTrainWorker pid=108861) ------------------------------------------------
(RayTrainWorker pid=108861) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=108861) 1 | layer_1 | Linear | 25.1 K
(RayTrainWorker pid=108861) 2 | layer_2 | Linear | 2.1 K
(RayTrainWorker pid=108861) 3 | layer_3 | Linear | 650
(RayTrainWorker pid=108861) ------------------------------------------------
(RayTrainWorker pid=108861) 27.9 K Trainable params
(RayTrainWorker pid=108861) 0 Non-trainable params
(RayTrainWorker pid=108861) 27.9 K Total params
(RayTrainWorker pid=108861) 0.112 Total estimated model params size (MB)
(RayTrainWorker pid=108862) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=108862) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00007_7_batch_size=32,layer_1_size=32,layer_2_size=64,lr=0.0001_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=108861) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00007_7_batch_size=32,layer_1_size=32,layer_2_size=64,lr=0.0001_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 42054147.39it/s] [repeated 11x across cluster]
(autoscaler +11m23s) [workspace snapshot] New snapshot created successfully (Size: 327.01 KB)
(RayTrainWorker pid=108861) [repeated 3x across cluster]
(RayTrainWorker pid=108861) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=108861) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=108861) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00007_7_batch_size=32,layer_1_size=32,layer_2_size=64,lr=0.0001_2023-09-07_13-58-38/checkpoint_000000) [repeated 2x across cluster]
(TrainTrainable pid=111019) 2023-09-07 14:02:51.493509: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
(TorchTrainer pid=111019) Starting distributed worker processes: ['111129 (', '111130 (', '111131 (']
(RayTrainWorker pid=111129) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=111129) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=111129) rank_zero_warn(
(RayTrainWorker pid=111129) GPU available: True, used: True
(RayTrainWorker pid=111129) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=111129) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=111129) HPU available: False, using: 0 HPUs
(RayTrainWorker pid=111131) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00008_8_batch_size=64,layer_1_size=128,layer_2_size=256,lr=0.0037_2023-09-07_13-58-38/lightning_logs
100%|██████████| 9912422/9912422 [00:00<00:00, 109686001.97it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 81254614.76it/s]
100%|██████████| 1648877/1648877 [00:00<00:00, 35741410.23it/s]
(RayTrainWorker pid=111131) LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=111129) | Name | Type | Params
(RayTrainWorker pid=111129) ------------------------------------------------
(RayTrainWorker pid=111129) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=111129) 1 | layer_1 | Linear | 100 K
(RayTrainWorker pid=111129) 2 | layer_2 | Linear | 33.0 K
(RayTrainWorker pid=111129) 3 | layer_3 | Linear | 2.6 K
(RayTrainWorker pid=111129) ------------------------------------------------
(RayTrainWorker pid=111129) 136 K Trainable params
(RayTrainWorker pid=111129) 0 Non-trainable params
(RayTrainWorker pid=111129) 136 K Total params
(RayTrainWorker pid=111129) 0.544 Total estimated model params size (MB)
(RayTrainWorker pid=111131) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=111131) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00008_8_batch_size=64,layer_1_size=128,layer_2_size=256,lr=0.0037_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=111129) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00008_8_batch_size=64,layer_1_size=128,layer_2_size=256,lr=0.0037_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 37135533.66it/s] [repeated 11x across cluster]
100%|██████████| 9912422/9912422 [00:00<00:00, 92298990.88it/s]
(RayTrainWorker pid=111129) [repeated 2x across cluster]
(RayTrainWorker pid=111129) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=111129) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=111131) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00008_8_batch_size=64,layer_1_size=128,layer_2_size=256,lr=0.0037_2023-09-07_13-58-38/checkpoint_000003) [repeated 9x across cluster]
(TorchTrainer pid=118255) Starting distributed worker processes: ['118362 (', '118363 (', '118364 (']
(RayTrainWorker pid=118362) Setting up process group for: env:// [rank=0, world_size=3]
(RayTrainWorker pid=118363) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00009_9_batch_size=32,layer_1_size=128,layer_2_size=128,lr=0.0040_2023-09-07_13-58-38/lightning_logs
(RayTrainWorker pid=118362) /home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
(RayTrainWorker pid=118362) rank_zero_warn(
(RayTrainWorker pid=118362) GPU available: True, used: True
(RayTrainWorker pid=118362) TPU available: False, using: 0 TPU cores
(RayTrainWorker pid=118362) IPU available: False, using: 0 IPUs
(RayTrainWorker pid=118362) HPU available: False, using: 0 HPUs
0%| | 0/9912422 [00:00<?, ?it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 109752309.17it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 92575620.67it/s]
(RayTrainWorker pid=118363) LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2]
(RayTrainWorker pid=118362) | Name | Type | Params
(RayTrainWorker pid=118362) ------------------------------------------------
(RayTrainWorker pid=118362) 0 | accuracy | MulticlassAccuracy | 0
(RayTrainWorker pid=118362) 1 | layer_1 | Linear | 100 K
(RayTrainWorker pid=118362) 2 | layer_2 | Linear | 16.5 K
(RayTrainWorker pid=118362) 3 | layer_3 | Linear | 1.3 K
(RayTrainWorker pid=118362) ------------------------------------------------
(RayTrainWorker pid=118362) 118 K Trainable params
(RayTrainWorker pid=118362) 0 Non-trainable params
(RayTrainWorker pid=118362) 118 K Total params
(RayTrainWorker pid=118362) 0.473 Total estimated model params size (MB)
(RayTrainWorker pid=118363) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
(RayTrainWorker pid=118363) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00009_9_batch_size=32,layer_1_size=128,layer_2_size=128,lr=0.0040_2023-09-07_13-58-38/checkpoint_000000)
(RayTrainWorker pid=118362) Missing logger folder: /home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00009_9_batch_size=32,layer_1_size=128,layer_2_size=128,lr=0.0040_2023-09-07_13-58-38/lightning_logs [repeated 2x across cluster]
100%|██████████| 4542/4542 [00:00<00:00, 42810177.01it/s] [repeated 11x across cluster]
(RayTrainWorker pid=118362) [repeated 4x across cluster]
(RayTrainWorker pid=118362) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] [repeated 2x across cluster]
(RayTrainWorker pid=118362) [W reducer.cpp:1300] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 2x across cluster]
(RayTrainWorker pid=118363) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00009_9_batch_size=32,layer_1_size=128,layer_2_size=128,lr=0.0040_2023-09-07_13-58-38/checkpoint_000002) [repeated 6x across cluster]
(RayTrainWorker pid=118363) Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00009_9_batch_size=32,layer_1_size=128,layer_2_size=128,lr=0.0040_2023-09-07_13-58-38/checkpoint_000004) [repeated 6x across cluster]
2023-09-07 14:03:52,186 INFO tune.py:1143 -- Total run time: 313.94 seconds (313.92 seconds for the tuning loop).
results.get_best_result(metric="ptl/val_accuracy", mode="max")
metrics={'ptl/train_loss': 0.00108961365185678, 'ptl/train_accuracy': 1.0, 'ptl/val_loss': 0.05798737704753876, 'ptl/val_accuracy': 0.9820601940155029, 'epoch': 4, 'step': 1435},
checkpoint=Checkpoint(filesystem=local, path=/home/ray/ray_results/TorchTrainer_2023-09-07_13-58-38/TorchTrainer_5144b_00008_8_batch_size=64,layer_1_size=128,layer_2_size=256,lr=0.0037_2023-09-07_13-58-38/checkpoint_000004)