Pretraining Llama Models on Intel Gaudi#

In this Jupyter notebook, we pretrain a huggyllama/llama-7b model on Intel Gaudi accelerators.

We use PyTorch for model training and Ray for distributed training.

Intel Gaudi AI Processors (HPUs) are AI hardware accelerators designed by Habana Labs. For more information, see Gaudi Architecture and Gaudi Developer Documentation.

The basic features of this pretraining example are:

  • Running on HPUs, with support for three execution modes: "lazy", "eager", and "eager.compile".

  • Pretraining with the huggyllama/llama-7b model configuration.

  • GaudiTrainer-based training from optimum-habana.

  • DeepSpeed ZeRO-3 optimization.

Prepare the environment#

This example runs on a single node with 8 HPUs.

We recommend using a prebuilt container to run these examples. To run a container, you need Docker. See Install Docker Engine for installation instructions.

Next, follow Run Using Containers to install the Habana drivers and container runtime.

Get the Docker image#

# More available Docker images can be found here: https://vault.habana.ai/ui/native/gaudi-docker
docker pull vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest

Run the Docker image#

docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
# You may need to map your workspace volumes

Install dependencies#

# "optimum-habana>1.11.1" 如果执行模式为 "eager" 或 "eager.compile" 
# "ray>=2.20.0"
pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana

# Install DeepSpeed
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0

# This notebook was verified with the following package versions:
# transformers==4.38.2
# datasets==2.19.1
# evaluate==0.4.2
# peft==0.4.0
# accelerate==0.27.2
# scikit-learn==1.4.2
# optimum-habana==1.11.1

# deepspeed==0.12.4+hpu.synapse.v1.15.0

Import the necessary libraries#

#!/usr/bin/env python

import os
from typing import Any, Dict
from torch.utils.data import DataLoader

import transformers
from itertools import chain
from datasets import load_dataset
from transformers import default_data_collator
from transformers.testing_utils import CaptureLogger
from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments
from optimum.habana.utils import set_seed

Build the datasets#

Download and load the dataset from huggingface.co.

def load_datasets(config):
    dataset_name = config["name"]
    dataset_config_name = config["config_name"]
    # Fraction of "train" carved out for validation when the dataset ships
    # without a validation split (defaults to 5% if not set in the config).
    validation_split_percentage = config.get("validation_split_percentage", 5)

    # Download and load the dataset from the hub.
    raw_datasets = load_dataset(
        dataset_name,
        dataset_config_name,
        cache_dir=None,
        token=None,
        streaming=False,
    )
    if "validation" not in raw_datasets.keys():
        raw_datasets["validation"] = load_dataset(
            dataset_name,
            dataset_config_name,
            split=f"train[:{validation_split_percentage}%]",
            cache_dir=None,
            token=None,
            streaming=False,
        )
        raw_datasets["train"] = load_dataset(
            dataset_name,
            dataset_config_name,
            split=f"train[{validation_split_percentage}%:]",
            cache_dir=None,
            token=None,
            streaming=False,
        )

    return raw_datasets
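
For example, with the wikitext configuration used later in this notebook, the call looks like the sketch below (wikitext-2-raw-v1 already ships a validation split, so the fallback branch above is not taken):

raw_datasets = load_datasets({
    "name": "wikitext",
    "config_name": "wikitext-2-raw-v1",
})
print(raw_datasets)  # DatasetDict with "train", "validation", and "test" splits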

Load the tokenizer#

Download the vocabulary from huggingface.co.

def load_tokenizer(config):
    name = config["name"]
    tokenizer_kwargs = {
        "cache_dir": None,
        "use_fast": True,
        "revision": "main",
        "token": None,
        "trust_remote_code": False,
    }
    return transformers.AutoTokenizer.from_pretrained(name, **tokenizer_kwargs)

Tokenize the dataset#

Split words into token IDs.

def tokenize_dataset(datasets, tokenizer):
    column_names = list(datasets["train"].features)
    text_column_name = "text" if "text" in column_names else column_names[0]

    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # CLM inputs can be much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=None,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )

    return tokenized_datasets
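
As a quick sanity check, the two steps above can be wired together as follows (a sketch using the dataset and model names from this notebook; after tokenization, each split carries input_ids and attention_mask columns):

raw_datasets = load_datasets({"name": "wikitext", "config_name": "wikitext-2-raw-v1"})
tokenizer = load_tokenizer({"name": "huggyllama/llama-7b"})
tokenized_datasets = tokenize_dataset(raw_datasets, tokenizer)
print(tokenized_datasets["train"].column_names)  # ['input_ids', 'attention_mask']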

Group the dataset#

This preprocessing concatenates all the texts in the dataset and generates chunks of size block_size, which makes model pretraining faster.

def group_dataset(config, datasets, tokenizer):
    config_name = config["name"]
    auto_config = transformers.AutoConfig.from_pretrained(config_name)
    max_pos_embeddings = auto_config.max_position_embeddings
    block_size = tokenizer.model_max_length
    if block_size > max_pos_embeddings:
        print(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
        )
        if max_pos_embeddings > 0:
            block_size = min(1024, max_pos_embeddings)
        else:
            block_size = 1024

    # Main data processing function that concatenates all texts from the dataset and generates chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; if total_length < block_size, we exclude this batch and return an empty dict.
        # We could add padding instead of dropping if the model supported it; customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split into chunks of block_size.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_datasets = datasets.map(
        group_texts,
        batched=True,
        num_proc=None,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {block_size}",
    )
    return lm_datasets
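
To make the effect of group_texts concrete, here is a tiny self-contained illustration with a hypothetical block_size of 4 (in the real pipeline, block_size is derived from the tokenizer and model config as shown above):

from itertools import chain

examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}
block_size = 4  # hypothetical value, for illustration only

concatenated = {k: list(chain(*examples[k])) for k in examples}
# Drop the remainder: 9 tokens round down to 8, so token 9 is discarded.
total_length = (len(concatenated["input_ids"]) // block_size) * block_size
result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated.items()
}
print(result["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]]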

Load the model#

Download and load the pre-configured model from huggingface.co. The model's detailed configuration is in its config.json.

def load_model(config):
    name = config["name"]
    model_config = config.get("config", {})
    auto_config = transformers.AutoConfig.from_pretrained(
        pretrained_model_name_or_path=name, **model_config
    )
    model = transformers.AutoModelForCausalLM.from_config(auto_config, trust_remote_code=False)

    return model

Prepare the trainer#

Instantiate the trainer with the model, gaudi_config, training_args, and tokenizer.

No evaluation dataset is passed in; only training is performed.

def get_trainer(training_args, datasets, tokenizer, model):
    gaudi_config = GaudiConfig.from_pretrained(
        training_args.gaudi_config_name, revision="main",
    )

    trainer = GaudiTrainer(
        model=model,
        gaudi_config=gaudi_config,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )
    return trainer
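
If you do want periodic evaluation, a minimal variant of the call above could pass the validation split instead of None (a sketch; it assumes the grouped datasets still contain a "validation" split and that an evaluation strategy is set in the training arguments):

trainer = GaudiTrainer(
    model=model,
    gaudi_config=gaudi_config,
    args=training_args,  # assumed to set evaluation_strategy="steps" or similar
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],  # hypothetical: enables evaluation
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)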

Training function#

This function is executed by each worker during training. It goes through the following steps:

  • Prepare the GaudiTrainingArguments object.

  • Load the dataset from huggingface.co.

  • Load the pre-configured tokenizer from huggingface.co.

  • Tokenize the dataset with the loaded tokenizer.

  • Concatenate all the texts in the dataset and generate chunks of size block_size.

  • Instantiate a GaudiTrainer object with the training_args, datasets, tokenizer, and model.

  • Call the train method of the trainer.

  • Save the model.

def pretrain_llama(config: Dict[str, Any]):

    training_args = GaudiTrainingArguments(**config["training_args"])
    set_seed(training_args.seed)

    raw_datasets = load_datasets(config["datasets"])

    tokenizer = load_tokenizer(config["tokenizer"])

    tokenized_datasets = tokenize_dataset(raw_datasets, tokenizer)

    tokenized_datasets = group_dataset(config["model"], tokenized_datasets, tokenizer)

    model = load_model(config["model"])

    trainer = get_trainer(training_args, tokenized_datasets, tokenizer, model)

    result = trainer.train()
    trainer.save_model()
    print(result)

Main training function#

The main function sets up the distributed training environment with Ray and launches the training process. To enable training with HPUs, we only need the following changes:

  • Set the execution mode for training. The supported execution modes are listed below; a sketch of how each mode maps to runtime configuration follows this list.

    • "lazy": deferred execution of graphs, built op by op as delivered from the script, similar to Eager mode. It provides the Eager-mode experience while performing well on Gaudi. Unlike Eager mode with torch.compile, the graph is analyzed in every iteration, which leads to higher CPU usage.

    • "eager": op-by-op execution, as defined in a standard PyTorch Eager mode script.

    • "eager.compile": Eager mode extended with torch.compile, in which the whole model or parts of it (for example, a function) are wrapped into a graph; the parts that are not wrapped run eagerly.

    More detailed theory can be found here, and detailed performance results can be found here.

  • Require an HPU for each worker in ScalingConfig.

  • Set the backend to hccl in TorchConfig.
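
The following sketch shows how each execution mode maps to the two knobs used in this notebook, the PT_HPU_LAZY_MODE environment variable and the torch_compile_backend training argument (the helper name apply_execution_mode is hypothetical; the cells below set these values inline):

import os

def apply_execution_mode(execution_mode, training_args):
    # Helper for illustration only; mirrors the settings used later in this notebook.
    assert execution_mode in ("lazy", "eager", "eager.compile")
    # Lazy mode is selected through the PT_HPU_LAZY_MODE environment variable.
    os.environ["PT_HPU_LAZY_MODE"] = "1" if execution_mode == "lazy" else "0"
    # Eager mode with torch.compile requires an explicit compile backend.
    if execution_mode == "eager.compile":
        training_args["torch_compile_backend"] = "hpu_backend"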

def main(num_workers, execution_mode):
    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer, TorchConfig

    pretrain_config = {
        "datasets": {
            "name": "wikitext",
            "config_name": "wikitext-2-raw-v1",
        },
        "tokenizer": {
            "name": "huggyllama/llama-7b",
            "config": {}
        },
        "model": {
            "name": "huggyllama/llama-7b",
            "config": {
                "torch_dtype": "bfloat16",
            },
        },
        "training_args": {
            "per_device_train_batch_size": 1,
            "do_train": True,
            "save_strategy": "no",
            "output_dir": "/tmp/ray/pretrain-llama-2",
            "logging_steps": 1,
            "gaudi_config_name": "Habana/llama",
            "use_habana": True,
            "throughput_warmup_steps": 3,
            "use_lazy_mode": True,
            "overwrite_output_dir": True,
            "seed": 42,
            "bf16": True,
            "report_to":'tensorboard',
            "deepspeed": {
                "steps_per_print": 64,
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "gradient_accumulation_steps": "auto",
                "bf16": {
                    "enabled": True
                },
                "gradient_clipping": 1.0,
                "zero_optimization": {
                    "stage": 3,
                    "overlap_comm": False,
                    "reduce_scatter": False,
                    "contiguous_gradients": False,
                    "stage3_gather_16bit_weights_on_model_save": True
                }
            },
        },
    }

    # If the execution mode is eager with compile, a compile backend must be specified.
    if execution_mode == "eager.compile":
        pretrain_config["training_args"].update({"torch_compile_backend": "hpu_backend"})

    scaling_config = ScalingConfig(num_workers=num_workers,
                                   use_gpu=False,
                                   resources_per_worker={"CPU": 1, "HPU": 1})

    # Set the backend in TorchConfig to hccl
    torch_config = TorchConfig(backend="hccl")

    ray.init()

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=pretrain_llama,
        train_loop_config=pretrain_config,
        torch_config=torch_config,
        scaling_config=scaling_config
    )

    result = trainer.fit()
    print(result)

Start training#

Finally, we call the main function to launch the pretraining process.

Before calling the main function, you must set some environment variables.

  1. Visible devices. The environment variables HABANA_VISIBLE_DEVICES and HABANA_VISIBLE_MODULES control which HPU devices are visible to the application; you must set both correctly. For detailed usage of HABANA_VISIBLE_DEVICES and HABANA_VISIBLE_MODULES, see here.

  2. Execution mode. Different execution modes have different runtime performance. The default execution mode is lazy mode.

# Set some environment variables
os.environ["RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"] = "0"
# If you use the RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES environment variable,
# you need to set HABANA_VISIBLE_MODULES yourself, e.g.
# os.environ["HABANA_VISIBLE_MODULES"] = "0,1,2,3"

# execution_mode is one of ["lazy", "eager", "eager.compile"]
execution_mode = "lazy"
os.environ["PT_HPU_LAZY_MODE"] = "1" if execution_mode == "lazy" else "0"

main(num_workers=8, execution_mode=execution_mode)

Possible output#

...

(RayTrainWorker pid=289322) Setting up process group for: env:// [rank=0, world_size=8]
(TorchTrainer pid=288676) Started distributed worker processes:
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289322) world_rank=0, local_rank=0, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289323) world_rank=1, local_rank=1, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289324) world_rank=2, local_rank=2, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289325) world_rank=3, local_rank=3, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289327) world_rank=4, local_rank=4, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289326) world_rank=5, local_rank=5, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289328) world_rank=6, local_rank=6, node_rank=0
(TorchTrainer pid=288676) - (ip=100.83.111.228, pid=289329) world_rank=7, local_rank=7, node_rank=0

...

(RayTrainWorker pid=289322) ============================= HABANA PT BRIDGE CONFIGURATION ===========================
(RayTrainWorker pid=289322)  PT_HPU_LAZY_MODE = 1
(RayTrainWorker pid=289322)  PT_RECIPE_CACHE_PATH = 
(RayTrainWorker pid=289322)  PT_CACHE_FOLDER_DELETE = 0
(RayTrainWorker pid=289322)  PT_HPU_RECIPE_CACHE_CONFIG = 
(RayTrainWorker pid=289322)  PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
(RayTrainWorker pid=289322)  PT_HPU_LAZY_ACC_PAR_MODE = 1
(RayTrainWorker pid=289322)  PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
(RayTrainWorker pid=289322) ---------------------------: System Configuration :---------------------------
(RayTrainWorker pid=289322) Num CPU Cores : 152
(RayTrainWorker pid=289322) CPU RAM : 1056440348 KB
(RayTrainWorker pid=289322) ------------------------------------------------------------------------------

...

(RayTrainWorker pid=289322) {'loss': 11.1784, 'grad_norm': 11.160387992858887, 'learning_rate': 4.9903660886319845e-05, 'epoch': 0.01, 'memory_allocated (GB)': 26.34, 'max_memory_allocated (GB)': 66.83, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 11.1116, 'grad_norm': 11.13752555847168, 'learning_rate': 4.9807321772639694e-05, 'epoch': 0.01, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 71.35, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 10.8931, 'grad_norm': 11.067651748657227, 'learning_rate': 4.971098265895954e-05, 'epoch': 0.02, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 75.01, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 10.3421, 'grad_norm': 10.925484657287598, 'learning_rate': 4.9614643545279386e-05, 'epoch': 0.02, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 75.08, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 10.007, 'grad_norm': 9.689080238342285, 'learning_rate': 4.9518304431599236e-05, 'epoch': 0.03, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 75.08, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.8195, 'grad_norm': 18.040328979492188, 'learning_rate': 4.942196531791908e-05, 'epoch': 0.03, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 75.14, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.6815, 'grad_norm': 29.881019592285156, 'learning_rate': 4.932562620423892e-05, 'epoch': 0.04, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 75.14, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.4898, 'grad_norm': 12.468446731567383, 'learning_rate': 4.922928709055877e-05, 'epoch': 0.05, 'memory_allocated (GB)': 27.31, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.5611, 'grad_norm': 8.117713928222656, 'learning_rate': 4.913294797687861e-05, 'epoch': 0.05, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.2297, 'grad_norm': 14.138890266418457, 'learning_rate': 4.903660886319846e-05, 'epoch': 0.06, 'memory_allocated (GB)': 27.35, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.0812, 'grad_norm': 7.828359127044678, 'learning_rate': 4.894026974951831e-05, 'epoch': 0.06, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 9.9278, 'grad_norm': 40.32044219970703, 'learning_rate': 4.8843930635838154e-05, 'epoch': 0.07, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.5225, 'grad_norm': 7.01698637008667, 'learning_rate': 4.8747591522157996e-05, 'epoch': 0.08, 'memory_allocated (GB)': 27.36, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.3957, 'grad_norm': 9.207005500793457, 'learning_rate': 4.8651252408477846e-05, 'epoch': 0.08, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.3269, 'grad_norm': 15.509377479553223, 'learning_rate': 4.855491329479769e-05, 'epoch': 0.09, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.392, 'grad_norm': 11.741216659545898, 'learning_rate': 4.845857418111754e-05, 'epoch': 0.09, 'memory_allocated (GB)': 27.36, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.341, 'grad_norm': 13.54684066772461, 'learning_rate': 4.836223506743739e-05, 'epoch': 0.1, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 8.132, 'grad_norm': 6.200448513031006, 'learning_rate': 4.826589595375723e-05, 'epoch': 0.1, 'memory_allocated (GB)': 27.31, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.8102, 'grad_norm': 5.493015766143799, 'learning_rate': 4.816955684007707e-05, 'epoch': 0.11, 'memory_allocated (GB)': 27.3, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.6805, 'grad_norm': 7.432443141937256, 'learning_rate': 4.807321772639692e-05, 'epoch': 0.12, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.6716, 'grad_norm': 18.697616577148438, 'learning_rate': 4.7976878612716764e-05, 'epoch': 0.12, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.531, 'grad_norm': 9.172748565673828, 'learning_rate': 4.7880539499036607e-05, 'epoch': 0.13, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.4479, 'grad_norm': 7.693913459777832, 'learning_rate': 4.7784200385356456e-05, 'epoch': 0.13, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.4504, 'grad_norm': 4.102222442626953, 'learning_rate': 4.7687861271676305e-05, 'epoch': 0.14, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.2147, 'grad_norm': 4.539271831512451, 'learning_rate': 4.759152215799615e-05, 'epoch': 0.14, 'memory_allocated (GB)': 27.37, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.2953, 'grad_norm': 4.624892711639404, 'learning_rate': 4.7495183044316e-05, 'epoch': 0.15, 'memory_allocated (GB)': 27.37, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.279, 'grad_norm': 3.8493056297302246, 'learning_rate': 4.739884393063584e-05, 'epoch': 0.16, 'memory_allocated (GB)': 27.37, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.2769, 'grad_norm': 3.396097183227539, 'learning_rate': 4.730250481695568e-05, 'epoch': 0.16, 'memory_allocated (GB)': 27.31, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.2125, 'grad_norm': 4.0201640129089355, 'learning_rate': 4.720616570327553e-05, 'epoch': 0.17, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.1199, 'grad_norm': 4.433038234710693, 'learning_rate': 4.710982658959538e-05, 'epoch': 0.17, 'memory_allocated (GB)': 27.35, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 7.0391, 'grad_norm': 2.8623831272125244, 'learning_rate': 4.7013487475915223e-05, 'epoch': 0.18, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 79.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.8758, 'grad_norm': 3.1782188415527344, 'learning_rate': 4.6917148362235066e-05, 'epoch': 0.18, 'memory_allocated (GB)': 27.29, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.6878, 'grad_norm': 2.3016743659973145, 'learning_rate': 4.6820809248554915e-05, 'epoch': 0.19, 'memory_allocated (GB)': 27.37, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.637, 'grad_norm': 4.136375904083252, 'learning_rate': 4.672447013487476e-05, 'epoch': 0.2, 'memory_allocated (GB)': 27.33, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.8968, 'grad_norm': 3.34140682220459, 'learning_rate': 4.662813102119461e-05, 'epoch': 0.2, 'memory_allocated (GB)': 27.35, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.9145, 'grad_norm': 2.7163383960723877, 'learning_rate': 4.653179190751446e-05, 'epoch': 0.21, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.7147, 'grad_norm': 2.5218122005462646, 'learning_rate': 4.64354527938343e-05, 'epoch': 0.21, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.7815, 'grad_norm': 3.993046522140503, 'learning_rate': 4.633911368015414e-05, 'epoch': 0.22, 'memory_allocated (GB)': 27.32, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=289322) {'loss': 6.8765, 'grad_norm': 2.5143563747406006, 'learning_rate': 4.624277456647399e-05, 'epoch': 0.23, 'memory_allocated (GB)': 27.34, 'max_memory_allocated (GB)': 93.29, 'total_memory_available (GB)': 94.62}

...