Fine-tune a Llama-2 model with Intel Gaudi#

In this Jupyter notebook, we will:

  • Fine-tune the Llama-2-7b model on Intel Gaudi accelerators with the DDP method

  • Fine-tune the Llama-2-70b model on Intel Gaudi accelerators with DeepSpeed

We use PyTorch for model training and Ray for distributed training, with the tatsu-lab/alpaca dataset.

Intel Gaudi AI Processors (HPUs) are AI hardware accelerators designed by Habana Labs. For more information, see Gaudi Architecture and Gaudi Developer Docs.

The basic features of this fine-tuning example include:

  • LoRA fine-tuning

  • Three execution modes on HPUs: "lazy", "eager", and "eager.compile"

  • Two distributed training methods: DDP and DeepSpeed

Prepare the environment#

This example runs on a single node with 4 HPUs.

We recommend using a prebuilt container to run these examples. Running a container requires Docker; see Install Docker Engine for installation instructions.

Next, follow Run Using Containers to install the Habana drivers and the container runtime.

Get the docker image#

docker pull vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest

Run the docker image#

docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
# You may need to map your workspace volumes into the container
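
Once inside the container, you can optionally confirm that the accelerators are visible before installing the Python dependencies. This is a minimal sketch; it assumes the habana_frameworks package bundled with the Gaudi container, which exposes is_available() and device_count() under its torch.hpu module.

# Optional sanity check inside the container (assumes the bundled habana_frameworks package)
import habana_frameworks.torch.hpu as hthpu

print("HPU available:", hthpu.is_available())
print("HPU count:", hthpu.device_count())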

Install dependencies#

# Use "optimum-habana>1.11.1" when the execution mode is "eager" or "eager.compile"
# "ray>=2.20.0"
pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana

# Install DeepSpeed
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0

# This notebook has been verified with the following package versions:
# transformers==4.38.2
# datasets==2.19.1
# evaluate==0.4.2
# peft==0.4.0
# accelerate==0.27.2
# scikit-learn==1.4.2
# optimum-habana==1.11.1

# deepspeed==0.12.4+hpu.synapse.v1.15.0

Import necessary libraries#

import os
import copy
from typing import Dict

import torch

import datasets
import transformers
from transformers import DataCollatorForLanguageModeling

from tqdm import tqdm

import peft

from optimum.habana import GaudiTrainer, GaudiConfig, GaudiTrainingArguments
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

Prepare the dataset function#

Preprocess each row of the raw dataset into the specified prompt format.

def preprocess_dataset(raw_datasets):

    PROMPT_DICT = {
        "prompt_with_input": (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
        ),
        "prompt_without_input": (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:"
        ),
    }

    def create_prompts(examples):
        prompts = {}
        prompts["source"] = []
        prompts["target"] = []
        for example in examples:
            prompt_template = (
                PROMPT_DICT["prompt_with_input"] if example["input"] != "" else PROMPT_DICT["prompt_without_input"]
            )
            source = prompt_template.format_map(example)
            prompts["source"].append(source)
            prompts["target"].append(example["output"])
        return prompts

    # Preprocess the dataset.
    for key in raw_datasets:
        prompts = create_prompts(raw_datasets[key])
        columns_to_be_removed = list(raw_datasets[key].features.keys())
        raw_datasets[key] = raw_datasets[key].add_column("prompt_sources", prompts["source"])
        raw_datasets[key] = raw_datasets[key].add_column("prompt_targets", prompts["target"])
        raw_datasets[key] = raw_datasets[key].remove_columns(columns_to_be_removed)
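
To see the resulting prompt format, you can run the function on a tiny hand-written dataset. The two rows below are made up for illustration and only contain the columns the function actually uses.

# Hypothetical two-row dataset with the same columns used by preprocess_dataset
toy_datasets = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({
        "instruction": ["Translate to French.", "Name a prime number."],
        "input": ["Good morning", ""],
        "output": ["Bonjour", "7"],
    })
})
preprocess_dataset(toy_datasets)
print(toy_datasets["train"][0]["prompt_sources"])  # uses the "prompt_with_input" template
print(toy_datasets["train"][1]["prompt_sources"])  # uses the "prompt_without_input" template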

Dataset to tokenizer function#

Tokenize each row of the dataset with the model's tokenizer.

In the example code, rows of the dataset are concatenated together to speed up training.

All data is processed into a single "train" dataset; no evaluation dataset is sampled from the raw dataset.

def preprocess_dataset_to_tokenizer(raw_datasets, tokenizer):
    max_seq_length = 512
    tokenizer.pad_token_id = 0
    tokenizer.eos_token_id = 1
    tokenizer.bos_token_id = 2

    def tokenize(prompt, add_eos_token=True):
        results = tokenizer(
            prompt,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
            return_tensors=None,
        )
        for i in range(len(results["input_ids"])):
            if (
                results["input_ids"][i][-1] != tokenizer.eos_token_id
                and len(results["input_ids"][i]) < max_seq_length
                and add_eos_token
            ):
                results["input_ids"][i].append(tokenizer.eos_token_id)
                results["attention_mask"][i].append(1)

        results["labels"] = copy.deepcopy(results["input_ids"])
        results["input_id_len"] = [len(result) for result in results["input_ids"]]
        return results

    def preprocess_function(examples):
        keys = list(examples.data.keys())
        if len(keys) != 2:
            raise ValueError("Unsupported dataset format")

        st = [s + t for s, t in zip(examples[keys[0]], examples[keys[1]])]

        examples_tokenized = tokenize(st)
        input_ids = examples_tokenized["input_ids"]
        labels = examples_tokenized["labels"]
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": examples_tokenized["attention_mask"],
        }

    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
    )

    def concatenate_data(dataset, max_seq_length):
        concatenated_dataset = {}
        for column in dataset.features:
            concatenated_data = [item for sample in dataset[column] for item in sample]
            reshaped_data = [
                concatenated_data[i * max_seq_length : (i + 1) * max_seq_length]
                for i in range(len(concatenated_data) // max_seq_length)
            ]
            concatenated_dataset[column] = reshaped_data
        return datasets.Dataset.from_dict(concatenated_dataset)

    tokenized_datasets_ = tokenized_datasets["train"].remove_columns(["prompt_sources", "prompt_targets"])
    tokenized_datasets["train"] = concatenate_data(tokenized_datasets_, max_seq_length)

    return tokenized_datasets
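
The concatenate_data helper packs all tokenized samples end to end and slices the stream into fixed blocks of max_seq_length tokens, dropping any trailing remainder. The standalone toy sketch below reproduces just that slicing logic with plain Python lists:

# Toy illustration of the packing logic in concatenate_data
max_len = 4
token_stream = list(range(10))  # pretend these are 10 token ids concatenated from several samples
blocks = [
    token_stream[i * max_len : (i + 1) * max_len]
    for i in range(len(token_stream) // max_len)
]
print(blocks)  # [[0, 1, 2, 3], [4, 5, 6, 7]] -- the last 2 tokens are dropped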

Prepare training arguments#

Some arguments are hardcoded here; you can also pass them in through config.

def prepare_training_args(config: Dict):
    # Prepare the execution mode configuration
    execution_mode = config["execution_mode"]
    use_lazy_mode = True if execution_mode == "lazy" else False
    torch_compile_backend = "hpu_backend" if execution_mode == "eager.compile" else None

    deepspeed = config["deepspeed"] if "deepspeed" in config else None

    return GaudiTrainingArguments(deepspeed=deepspeed,
                                  output_dir=config["output"],
                                  do_train=True,
                                  do_eval=False,
                                  per_device_train_batch_size=config["batch_size_per_worker"],
                                  bf16=True,
                                  learning_rate=config["lr"],
                                  save_strategy="no",
                                  torch_compile_backend=torch_compile_backend,
                                  evaluation_strategy="no",
                                  lr_scheduler_type="cosine",
                                  num_train_epochs=config["epochs"],
                                  use_lazy_mode=use_lazy_mode,
                                  use_habana=True,
                                  pipelining_fwd_bwd=True,
                                  save_only_model=True,
                                  gradient_checkpointing=True,
                                  warmup_ratio=0.03,
                                  throughput_warmup_steps=3,
                                  logging_steps=5)
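
For reference, a minimal config dict that satisfies prepare_training_args might look like the sketch below; the values are placeholders, and the full config used in this notebook is assembled later in train_llama.

# Hypothetical minimal config for prepare_training_args
example_config = {
    "execution_mode": "lazy",       # one of "lazy", "eager", "eager.compile"
    "output": "/tmp/ray/",          # becomes output_dir
    "batch_size_per_worker": 8,
    "lr": 1e-4,
    "epochs": 2,
    # "deepspeed": {...},           # optional; omit (or set to None) for DDP
}
# training_args = prepare_training_args(example_config)  # requires the Habana runtime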

Prepare the model#

  1. Download the model from Hugging Face or load it from a local directory.

  2. Convert the model to a LoRA model.

  3. Move the model to the HPU device.

If you don't want to fine-tune with LoRA, simply remove the LoRA conversion step.

def prepare_model(config: Dict, device):
    # Load the pretrained model
    deepspeed = config["deepspeed"] if "deepspeed" in config else None
    if deepspeed is not None:
        auto_config = transformers.AutoConfig.from_pretrained(config["model"], use_cache=False, revision="main", use_auth_token=None, trust_remote_code=None)
        model = transformers.AutoModelForCausalLM.from_pretrained(config["model"], config=auto_config, **config["model_config"])
        model.generation_config.attn_softmax_bf16 = True
        model.generation_config.use_flash_attention = True
    else:
        model = transformers.AutoModelForCausalLM.from_pretrained(config["model"], **config["model_config"])
    model.enable_input_require_grads()

    # Convert to a PEFT model for LoRA training
    peft_config = peft.LoraConfig(**config["lora_config"])
    model = peft.get_peft_model(model, peft_config)

    model.to(dtype=config["model_config"]["torch_dtype"], device=device)

    return model
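
Since get_peft_model returns a PEFT model, you can check how few parameters LoRA leaves trainable with print_trainable_parameters(). The commented sketch below uses the same LoRA settings as this notebook; the model path is a placeholder.

# Optional sketch: inspect how many parameters LoRA leaves trainable.
# base = transformers.AutoModelForCausalLM.from_pretrained("/path/to/llama-2-7b", torch_dtype=torch.bfloat16)
# lora = peft.get_peft_model(base, peft.LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=32,
#                                                  lora_dropout=0.1, target_modules=["q_proj", "v_proj"]))
# lora.print_trainable_parameters()  # reports trainable vs. total parameter counts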

Training function#

This function runs on each worker and performs the following steps:

  • Prepare training arguments, an instance of GaudiTrainingArguments.

  • Load and preprocess the dataset, reading only the first 4096 entries as the training dataset.

  • Load the tokenizer from the pretrained model and process the dataset with it.

  • Load the pretrained model.

  • Prepare the data collator and gaudi_config.

  • Instantiate a GaudiTrainer.

  • Call the train() method to train the model.

  • Save the model results.

Compared with a training function for GPUs, migrating to HPUs requires no changes. Internally, Ray Train does the following:

  • Detects HPUs and sets the device.

  • Initializes the Habana PyTorch backend.

  • Initializes the Habana distributed backend.
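
Conceptually, the per-worker setup that Ray Train performs corresponds roughly to the sketch below. This is only an illustration of the idea, not Ray Train's actual implementation; the habana_frameworks imports are assumed to be available in the Gaudi container.

# Illustrative sketch of what Ray Train handles automatically on each HPU worker
import torch
import habana_frameworks.torch.core as htcore            # loads the Habana PyTorch backend
# import habana_frameworks.torch.distributed.hccl        # registers the HCCL distributed backend

device = torch.device("hpu")                              # device selection for the worker
# torch.distributed.init_process_group(backend="hccl")   # distributed initialization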

def train_func_per_worker(config: Dict):
    # Adapt transformers to Gaudi
    adapt_transformers_to_gaudi()

    # Prepare training arguments
    training_args = prepare_training_args(config)

    # Prepare the dataset
    # Here we use the "tatsu-lab/alpaca" dataset from Hugging Face
    raw_datasets = datasets.DatasetDict({"train": datasets.load_dataset("tatsu-lab/alpaca", split='train[0:4096]')})
    preprocess_dataset(raw_datasets)

    # Prepare the tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(config["model"])
    tokenized_datasets = preprocess_dataset_to_tokenizer(raw_datasets, tokenizer)

    # Prepare the model
    model = prepare_model(config, training_args.device)

    # Prepare the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False)

    # Prepare the Gaudi config
    gaudi_config = GaudiConfig()
    gaudi_config.use_fused_adam = True
    gaudi_config.use_fused_clip_norm = True

    # Instantiate GaudiTrainer
    trainer = GaudiTrainer(
        model=model,
        gaudi_config=gaudi_config,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=None,
        preprocess_logits_for_metrics=None,
    )

    train_result = trainer.train()
    print(f"train_result = {train_result}")
    trainer.save_model()

    return train_result

Main training function#

The train_llama function sets up the distributed training environment with Ray and launches the training process. To enable training with HPUs, we only need to make the following changes:

  • Set the execution mode for training. The supported execution modes are:

    • "lazy": Deferred execution of graphs, consisting of ops delivered from the script op by op, similar to Eager mode. It gives the Eager-mode experience on Gaudi. Unlike Eager mode with torch.compile, the graph is analyzed in each iteration, which leads to higher CPU usage.

    • "eager": Op-by-op execution as in a standard PyTorch Eager-mode script.

    • "eager.compile": Eager mode extended with torch.compile, which wraps the whole model or parts of it (for example, a function) into a graph. The parts that are not wrapped execute eagerly.

    For more detail on the theory, see here; for detailed performance results, see here.

  • Set the training method. The supported methods are:

    • "ddp"

    • "deepspeed"

  • Request one HPU for each worker in ScalingConfig

  • Set the backend to hccl in TorchConfig

def train_llama(num_workers, execution_mode, training_method):
    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer, TorchConfig

    # DeepSpeed config; you can also put this into a separate config file
    deepspeed_config = {
        "steps_per_print": 64,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        "bf16": {
            "enabled": True
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 3,
            "overlap_comm": False,
            "contiguous_gradients": False,
            "stage3_gather_16bit_weights_on_model_save": True
        }
    }

    # Prepare the training config
    train_config = {
        "execution_mode": execution_mode,
        "model": "/root/models/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080/",
        "model_config": {"torch_dtype": torch.bfloat16, "trust_remote_code": False, "use_auth_token": None},
        "lora_config": {"task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "target_modules": ["q_proj", "v_proj"]},
        "lr": 1e-4,
        "epochs": 2,
        "batch_size_per_worker": 8,
        "output": "/tmp/ray/",
        "deepspeed": deepspeed_config if training_method == "deepspeed" else None,
    }

    # Configure computation resources
    # In ScalingConfig, require an HPU for each worker.
    scaling_config = ScalingConfig(num_workers=num_workers, resources_per_worker={"CPU": 1, "HPU": 1})
    # Set the backend to hccl in TorchConfig
    torch_config = TorchConfig(backend="hccl")

    # Start your Ray cluster
    ray.init()

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        torch_config=torch_config,
        scaling_config=scaling_config,
    )

    result = trainer.fit()
    print(f"Training result: {result}")

Start training#

Finally, we call the train_llama function to start the training process. You can adjust the number of workers and the HPU execution mode.

# Set some environment variables
os.environ["RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"] = "0"
# If you use the RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES environment variable,
# you need to set HABANA_VISIBLE_DEVICES yourself, for example:
# os.environ["HABANA_VISIBLE_DEVICES"] = "0,1,2,3"

# execution_mode is one of ["lazy", "eager", "eager.compile"]
execution_mode = "lazy"
os.environ["PT_HPU_LAZY_MODE"] = "1" if execution_mode == "lazy" else "0"

# training_method is one of ["ddp", "deepspeed"]
training_method = "deepspeed"
if training_method == "deepspeed":
    os.environ["PT_HPU_MAX_COMPOUND_OP_SIZE"] = "10"
    os.environ["DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED"] = "1"

# Use 4 HPUs here.
train_llama(num_workers=4, execution_mode=execution_mode, training_method=training_method)
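
To reproduce the DDP run from the introduction instead, switch the settings along the following lines. Note that train_config["model"] is hardcoded inside train_llama to a local Llama-2-70b snapshot, so for DDP you would first point it at a Llama-2-7b checkpoint; the call below is a commented-out sketch.

# Hypothetical DDP run with Llama-2-7b (adjust train_config["model"] inside train_llama first)
# execution_mode = "lazy"           # or "eager" / "eager.compile"
# os.environ["PT_HPU_LAZY_MODE"] = "1" if execution_mode == "lazy" else "0"
# train_llama(num_workers=4, execution_mode=execution_mode, training_method="ddp")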

Final output#

DDP on HPUs#

  • Llama-2-7b-chat-hf

  • 4 HPUs

  • LoRA

(RayTrainWorker pid=123181) {'loss': 1.8051, 'grad_norm': 0.6015625, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.6754, 'grad_norm': 0.408203125, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.568, 'grad_norm': 0.4453125, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.4934, 'grad_norm': 0.4609375, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.3965, 'grad_norm': 0.3515625, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.3461, 'grad_norm': 0.34765625, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2924, 'grad_norm': 0.32421875, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2643, 'grad_norm': 0.33203125, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.263, 'grad_norm': 0.318359375, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2502, 'grad_norm': 0.275390625, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2161, 'grad_norm': 0.2734375, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2517, 'grad_norm': 0.294921875, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}

DeepSpeed on HPUs#

  • Llama-2-70b-chat-hf

  • 4 HPUs

  • LoRA

(RayTrainWorker pid=110856) {'loss': 1.6627, 'grad_norm': 0.35921376943588257, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.6085, 'grad_norm': 0.35271379351615906, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.5051, 'grad_norm': 0.4277978837490082, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.4157, 'grad_norm': 0.5138524770736694, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.3233, 'grad_norm': 0.3451262414455414, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.2728, 'grad_norm': 0.38564223051071167, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.1989, 'grad_norm': 0.36078131198883057, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.1552, 'grad_norm': 0.47946077585220337, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.1413, 'grad_norm': 0.3357600271701813, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.129, 'grad_norm': 0.2777070701122284, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.0876, 'grad_norm': 0.25669950246810913, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=110856) {'loss': 1.1238, 'grad_norm': 0.2423330545425415, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}