
Text Extraction with BERT

Author: Apoorv Nandan
Date created: 2020/05/23
Last modified: 2020/05/23

View in Colab • GitHub source

Description: Fine-tune pretrained BERT from HuggingFace Transformers on SQuAD.


Introduction

This demonstration uses SQuAD (the Stanford Question-Answering Dataset). In SQuAD, an input consists of a question and a paragraph that serves as context. The goal is to find the span of text in the paragraph that answers the question. We evaluate performance on this data with the "Exact Match" metric, which measures the percentage of predictions that exactly match one of the ground-truth answers.

We fine-tune a BERT model to perform this task as follows:

  1. Feed the context and the question as inputs to BERT.
  2. Take two vectors S and T with dimensions equal to that of the hidden states in BERT.
  3. Compute the probability of each token being the start and the end of the answer span. The probability of a token being the start of the answer is given by a dot product between S and the representation of that token in the last layer of BERT, followed by a softmax over all tokens. The probability of a token being the end of the answer is computed analogously with the vector T (see the sketch right after this list).
  4. Fine-tune BERT and learn S and T along the way.
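
As a minimal sketch of step 3 (illustrative only; in the actual model below, S and T are realized as the weights of two Dense layers), the start and end probabilities are dot products with the token representations followed by a softmax over the token axis:

import numpy as np

def span_probabilities(hidden_states, S, T):
    # hidden_states: (seq_len, hidden_dim) last-layer BERT outputs
    # S, T: (hidden_dim,) learned start/end vectors
    start_scores = hidden_states @ S  # one score per token
    end_scores = hidden_states @ T
    start_probs = np.exp(start_scores) / np.exp(start_scores).sum()  # softmax over tokens
    end_probs = np.exp(end_scores) / np.exp(end_scores).sum()
    return start_probs, end_probs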

References:

BERT (https://arxiv.org/abs/1810.04805)
SQuAD (https://arxiv.org/abs/1606.05250)

Setup

import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT

Set-up BERT tokenizer

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
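
For reference, the fast tokenizer's `encode` method returns an `Encoding` object whose `offsets` field maps every WordPiece token back to character positions in the original string; the preprocessing step below relies on exactly these offsets. A quick illustrative check (the sample sentence is arbitrary):

sample_encoding = tokenizer.encode("BERT extracts answer spans from context.")
print(sample_encoding.tokens)   # e.g. ['[CLS]', 'bert', 'extracts', ...]
print(sample_encoding.offsets)  # (start_char, end_char) pairs, one per token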

Load the data

train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
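
Both files follow the nested SQuAD v1.1 layout (data -> paragraphs -> qas -> answers) that the preprocessing code below walks through. An illustrative peek at the first record (field names come from the SQuAD format):

with open(train_path) as f:
    first_paragraph = json.load(f)["data"][0]["paragraphs"][0]
print(first_paragraph["context"][:80])  # the paragraph used as context
first_qa = first_paragraph["qas"][0]
print(first_qa["question"])
print(first_qa["answers"][0]["text"], first_qa["answers"][0]["answer_start"])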

Preprocess the data

  1. Go through the JSON file and store every record as a SquadExample object.
  2. Go through each SquadExample and create x_train, y_train, x_eval, y_eval.

class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find the end character index of the answer in the context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in the context that belong to the answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from the answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)


def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if not item.skip:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} 个训练点已创建。")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} 个评估点已创建。")
87599 training points created.
10570 evaluation points created.

Create the Question-Answering Model using BERT and the Functional API

def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

This code should preferably be run on a Google Colab TPU runtime. With Colab TPUs, each epoch will take 5-6 minutes.

use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)

    # Create the model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()
INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0

INFO:tensorflow:Initializing the TPU system: grpc://10.48.159.170:8470

INFO:tensorflow:Clearing out eager caches

INFO:tensorflow:Finished initializing TPU system.

INFO:tensorflow:Found TPU system:

INFO:tensorflow:*** Num TPU Cores: 8

INFO:tensorflow:*** Num TPU Workers: 1

INFO:tensorflow:*** Num TPU Cores Per Worker: 8

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 384, 768), ( 109482240   input_1[0][0]                    
__________________________________________________________________________________________________
start_logit (Dense)             (None, 384, 1)       768         tf_bert_model[0][0]              
__________________________________________________________________________________________________
end_logit (Dense)               (None, 384, 1)       768         tf_bert_model[0][0]              
__________________________________________________________________________________________________
flatten (Flatten)               (None, 384)          0           start_logit[0][0]                
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 384)          0           end_logit[0][0]                  
__________________________________________________________________________________________________
activation_7 (Activation)       (None, 384)          0           flatten[0][0]                    
__________________________________________________________________________________________________
activation_8 (Activation)       (None, 384)          0           flatten_1[0][0]                  
==================================================================================================
Total params: 109,483,776
Trainable params: 109,483,776
Non-trainable params: 0
__________________________________________________________________________________________________

Create evaluation callback

This callback will compute the exact match score using the validation data after every epoch.

def normalize_text(text):
    text = text.lower()

    # Remove punctuation
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text
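
A tiny illustrative example of what this normalization produces before predicted and ground-truth spans are compared:

print(normalize_text("The Norman  Conquest, in 1066!"))  # -> "norman conquest in 1066"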


class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character-level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from the model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if not _.skip]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")

Train and Evaluate

exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)
epoch=1, exact match score=0.78
1346/1346 - 350s - activation_7_loss: 1.3488 - loss: 2.5905 - activation_8_loss: 1.2417

<tensorflow.python.keras.callbacks.History at 0x7fc78b4458d0>
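
As a closing sketch (not part of the original example), the trained model could answer a new question against a new context by reusing the preprocessing conventions from `SquadExample.preprocess`; the helper name below is hypothetical and assumes the combined input fits within `max_len` tokens:

def answer_question(question, context):
    # Tokenize exactly as in SquadExample.preprocess
    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)
    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
    attention_mask = [1] * len(input_ids)

    # Pad up to max_len (assumes no truncation is needed)
    padding_length = max_len - len(input_ids)
    input_ids = input_ids + [0] * padding_length
    attention_mask = attention_mask + [0] * padding_length
    token_type_ids = token_type_ids + [0] * padding_length

    x = [np.array([input_ids]), np.array([token_type_ids]), np.array([attention_mask])]
    pred_start, pred_end = model.predict(x)
    start, end = int(np.argmax(pred_start[0])), int(np.argmax(pred_end[0]))

    # Map the predicted token span back to characters in the context
    offsets = tokenized_context.offsets
    if start >= len(offsets):
        return ""
    char_start = offsets[start][0]
    char_end = offsets[end][1] if end < len(offsets) else len(context)
    return context[char_start:char_end]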