Author: Apoorv Nandan
Date created: 2020/05/23
Last modified: 2020/05/23
Description: Fine-tune pretrained BERT from HuggingFace Transformers on SQuAD.
This demonstration uses SQuAD (the Stanford Question-Answering Dataset). In SQuAD, an input consists of a question and a paragraph that serves as context. The goal is to find the span of text in the paragraph that answers the question. We evaluate performance on this data with the "Exact Match" metric, which measures the percentage of predictions that exactly match one of the ground-truth answers.
We fine-tune a BERT model to perform this task: the context and the question are fed to BERT together, and two dense layers on top of BERT's sequence output predict, for every token, the probability of being the start and the end of the answer span.
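For illustration, each SQuAD record pairs a context paragraph with a question and one or more reference answers, each annotated with the character offset at which it starts in the context. The record below is made up (it is not taken from the dataset) and only shows the nested layout that the preprocessing code further down expects:

# A made-up record in the SQuAD JSON layout (illustrative only)
sample_record = {
    "context": "The Amazon rainforest covers much of the Amazon basin of South America.",
    "qas": [
        {
            "question": "What does the Amazon rainforest cover?",
            "answers": [
                {"text": "much of the Amazon basin of South America", "answer_start": 29}
            ],
        }
    ],
}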
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
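As a quick sanity check (added here, not part of the original script), the fast tokenizer returns an Encoding object whose ids, tokens and offsets fields are exactly what the preprocessing below relies on:

# Illustrative only: inspect what the fast tokenizer produces
enc = tokenizer.encode("BERT answers questions.")
print(enc.ids)      # WordPiece ids, including [CLS] and [SEP]
print(enc.tokens)   # e.g. ['[CLS]', 'bert', 'answers', 'questions', '.', '[SEP]']
print(enc.offsets)  # character (start, end) offsets into the input string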
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
Preprocess the data: go through the JSON file and store every record as a SquadExample object, then go through each SquadExample and create x_train, y_train, x_eval, y_eval.

class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of the answer in the context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in the context that belong to the answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize the context
        tokenized_context = tokenizer.encode(context)

        # Find the tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token indexes of the answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize the question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation would be needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets
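To see what preprocess produces, here is a small hypothetical usage sketch; the strings are made up, and the real examples are built from the SQuAD files below:

# Hypothetical example, for illustration only
eg = SquadExample(
    question="What does the Amazon rainforest cover?",
    context="The Amazon rainforest covers much of the Amazon basin of South America.",
    start_char_idx=29,
    answer_text="much of the Amazon basin of South America",
    all_answers=["much of the Amazon basin of South America"],
)
eg.preprocess()
if not eg.skip:
    print(len(eg.input_ids))                     # max_len (384) after padding
    print(eg.start_token_idx, eg.end_token_idx)  # answer span in token space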
with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)
def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples
def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y
train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} 个训练点已创建。")
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} 个评估点已创建。")
87599 training points created.
10570 evaluation points created.
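Note that these counts include records that preprocessing marked to skip (answers falling outside the context, or inputs longer than max_len); only the non-skipped ones end up in the arrays. A quick optional check, added here for clarity rather than taken from the original:

print(x_train[0].shape)  # (num_kept_examples, 384) padded input_ids
print(y_train[0].shape)  # (num_kept_examples,) start token indices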
Create the Question-Answering Model using BERT and the Functional API
def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model
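Each of the two heads outputs a softmax over the max_len token positions, and the targets are simply the correct start and end token indices, which is why SparseCategoricalCrossentropy (with from_logits=False, since the model already applies a softmax) is used for both outputs. The tiny sketch below is added only to illustrate this label format:

# A uniform dummy prediction over 384 positions vs. a label "the answer starts at token 42"
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
dummy_probs = np.full((1, max_len), 1.0 / max_len)
print(float(loss_fn(np.array([42]), dummy_probs)))  # ≈ log(384) ≈ 5.95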
This code should preferably be run on a Google Colab TPU runtime. With a Colab TPU, each epoch will take 5-6 minutes.
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()
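If you are not sure whether a TPU is available, a common alternative pattern (shown here as an optional sketch, not part of the original script) is to auto-detect it and fall back to the default CPU/GPU strategy:

# Sketch: auto-detect a TPU instead of hard-coding use_tpu
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:  # no TPU found
    strategy = tf.distribute.get_strategy()

with strategy.scope():
    model = create_model()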
INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0
INFO:tensorflow:Initializing the TPU system: grpc://10.48.159.170:8470
INFO:tensorflow:Clearing out eager caches
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_1 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_3 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) ((None, 384, 768), ( 109482240 input_1[0][0]
__________________________________________________________________________________________________
start_logit (Dense) (None, 384, 1) 768 tf_bert_model[0][0]
__________________________________________________________________________________________________
end_logit (Dense) (None, 384, 1) 768 tf_bert_model[0][0]
__________________________________________________________________________________________________
flatten (Flatten) (None, 384) 0 start_logit[0][0]
__________________________________________________________________________________________________
flatten_1 (Flatten) (None, 384) 0 end_logit[0][0]
__________________________________________________________________________________________________
activation_7 (Activation) (None, 384) 0 flatten[0][0]
__________________________________________________________________________________________________
activation_8 (Activation) (None, 384) 0 flatten_1[0][0]
==================================================================================================
Total params: 109,483,776
Trainable params: 109,483,776
Non-trainable params: 0
__________________________________________________________________________________________________
This callback will compute the exact match score using the validation data after every epoch.
def normalize_text(text):
    text = text.lower()

    # Remove punctuation
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text
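As a quick illustration of what this normalization does (an added example, not part of the original):

print(normalize_text("The Eiffel Tower!"))  # -> "eiffel tower"
print(normalize_text("an   apple"))         # -> "apple"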
class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character-level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)
epoch=1, exact match score=0.78
1346/1346 - 350s - activation_7_loss: 1.3488 - loss: 2.5905 - activation_8_loss: 1.2417
<tensorflow.python.keras.callbacks.History at 0x7fc78b4458d0>
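After training, the same preprocessing and offset bookkeeping can be reused to answer a new question. The snippet below is a usage sketch, not part of the original example: the question/context strings are made up, and it assumes the model and tokenizer defined above are still in scope.

def answer_question(question, context):
    # Build one padded input exactly as in SquadExample.preprocess (no answer needed)
    context = " ".join(str(context).split())
    question = " ".join(str(question).split())
    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)
    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
    attention_mask = [1] * len(input_ids)
    padding_length = max_len - len(input_ids)
    if padding_length < 0:
        return None  # would require truncation; skipped here for simplicity
    input_ids += [0] * padding_length
    attention_mask += [0] * padding_length
    token_type_ids += [0] * padding_length

    x = [np.array([input_ids]), np.array([token_type_ids]), np.array([attention_mask])]
    pred_start, pred_end = model.predict(x)
    start, end = int(np.argmax(pred_start[0])), int(np.argmax(pred_end[0]))

    offsets = tokenized_context.offsets
    if start >= len(offsets):
        return None  # predicted start falls outside the context tokens
    char_start = offsets[start][0]
    char_end = offsets[end][1] if end < len(offsets) else len(context)
    return context[char_start:char_end]

print(answer_question(
    "What does the Amazon rainforest cover?",
    "The Amazon rainforest covers much of the Amazon basin of South America.",
))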