3. 训练一个专门用于问答的微调模型
import openai
import pandas as pd
# Load the question/answer dataset; the preview below shows the columns
# title, heading, content, tokens, context, questions and answers.
df = pd.read_csv('olympics-data/olympics_qa.csv')
# ID of the search file; used as the default `file_id` of
# get_random_similar_contexts and passed to answer_question further down.
olympics_search_fileid = "file-c3shd8wqF3vSCKaukW4Jr1TT"
title | heading | content | tokens | context | questions | answers | |
0 | 2020 Summer Olympics | Summary | The 2020 Summer Olympics (Japanese: 2020年夏季オリン... | 713 | 2020 Summer Olympics Summary The 2020 Summ... | 1. What is the 2020 Summer Olympics? 2. When ... | 1. The 2020 Summer Olympics is an internationa... |
1 | 2020 Summer Olympics | Host city selection | The International Olympic Committee (IOC) vote... | 126 | 2020 Summer Olympics Host city selection T... | 1. 2. 3. 4. | 1. What is the International Olympic Committee... |
2 | 2020 Summer Olympics | Impact of the COVID-19 pandemic | In January 2020, concerns were raised about th... | 369 | 2020 Summer Olympics Impact of the COVID-19 p... | 1. What was the COVID-19 pandemic? 2. How did... | 1. The COVID-19 pandemic was a pandemic that o... |
3 | 2020 Summer Olympics | Qualifying event cancellation and postponement | Concerns about the pandemic began to affect qu... | 298 | 2020 Summer Olympics Qualifying event cancell... | 1. What was the original location of the Asia ... | 1. The original location of the Asia & Oceania... |
4 | 2020 Summer Olympics | Effect on doping tests | Mandatory doping tests were being severely res... | 163 | 2020 Summer Olympics Effect on doping tests ... | 1. What was the COVID-19 pandemic? 2. What di... | 1. The COVID-19 pandemic was a pandemic that o... |
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Notebook cell output: number of rows in each split.
len(train_df), len(test_df)
(3014, 754)
3.1 为问答和鉴别器模型创建微调数据集
微调数据集是按以下方式创建的。对于每个相应的问题、答案和上下文对,我们创建: - 正例:正确的问题、答案、上下文对 - 负例: - 随机负例,其中随机上下文与问题配对 - 两个困难负例 - 一个来自相同的维基百科文章 - 另一个是与正确上下文最相似的
我们对鉴别器模型和问答模型都应用相同的数据集创建过程。我们分别对训练集和测试集应用该过程,以确保训练集中的示例不会出现在测试集中。
import random
def get_random_similar_contexts(question, context, file_id=olympics_search_fileid, search_model='ada', max_rerank=10):
    """
    Pick a random context that the search considers similar to the question
    but that is not the correct context.

    Parameters
    ----------
    question: str
        The question used as the search query
    context: str
        The correct context; it is excluded from the candidates
    file_id: str
        The ID of the file to search in
    search_model: str
        The engine used for the search
    max_rerank: int
        The maximum number of documents to rerank

    Returns
    -------
    str
        One of the top search results other than `context`, or "" when the
        search fails or yields no other candidate
    """
    # NOTE(review): the arguments of the search call were missing from this
    # copy of the notebook; they are reconstructed from the function's
    # parameters and should be confirmed against the original.
    try:
        # TODO: openai.Engine(search_model) is deprecated.
        results = openai.Engine(search_model).search(
            query=question,
            max_rerank=max_rerank,
            file=file_id,
        )
        # Only the three best matches are considered, minus the correct context.
        candidates = [
            result['text']
            for result in results['data'][:3]
            if result['text'] != context
        ]
    except Exception:
        # Best effort: a failed search simply means no hard negative is available.
        return ""

    if not candidates:
        return ""
    return random.choice(candidates)
def create_fine_tuning_dataset(df, discriminator=False, n_negative=1, add_related=False):
    """
    Create a dataset for fine tuning the OpenAI model; either for a discriminator model,
    or a model specializing in Q&A, where it says if no relevant context is found.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the question, answer and context pairs
    discriminator: bool
        Whether to create a dataset for the discriminator
    n_negative: int
        The number of random negative samples to add (using a random context)
    add_related: bool
        Whether to add the related contexts to the correct context. These are hard negative examples

    Returns
    -------
    pd.DataFrame
        The dataframe containing the prompts and completions, ready for fine-tuning
    """
    no_context_answer = " No appropriate context found to answer the question."

    def make_row(context, question, answer, related):
        # The discriminator learns " yes"/" no"; the Q&A model learns the answer text.
        if discriminator:
            return {"prompt": f"{context}\nQuestion: {question}\n Related:", "completion": related}
        return {"prompt": f"{context}\nQuestion: {question}\nAnswer:", "completion": answer}

    rows = []

    # Positive examples: every question paired with its own context and answer.
    # NOTE(review): "1." is prepended before splitting and the first two
    # characters of every item are dropped again; confirm that this matches
    # how the numbering is stored in the `questions`/`answers` columns.
    for _, row in df.iterrows():
        questions = ("1." + row.questions).split('\n')
        answers = ("1." + row.answers).split('\n')
        for q, a in zip(questions, answers):
            if len(q) > 10 and len(a) > 10:
                rows.append(make_row(row.context, q[2:].strip(), f" {a[2:].strip()}", " yes"))

    # Negative examples: the same questions paired with a wrong context.
    n_candidates = n_negative + (2 if add_related else 0)
    for _, row in df.iterrows():
        # Rows whose context differs from the correct one; computed once per row.
        other = df[df.context != row.context]
        same_article = other[other.title == row.title] if add_related else None
        for q in ("1." + row.questions).split('\n'):
            if len(q) <= 10:
                continue
            question = q[2:].strip()
            for j in range(n_candidates):
                if j == 0 and add_related:
                    # Hard negative: another context from the same Wikipedia page.
                    if same_article.empty:
                        continue
                    wrong_context = same_article.sample(1).iloc[0].context
                elif j == 1 and add_related:
                    # Hard negative: the most similar context according to the search.
                    wrong_context = get_random_similar_contexts(question, row.context, search_model='ada', max_rerank=10)
                else:
                    # Random negative: any context that is not the correct one.
                    # Sampling from `other` cannot loop forever when all
                    # contexts are identical.
                    if other.empty:
                        continue
                    wrong_context = other.sample(1).iloc[0].context
                rows.append(make_row(wrong_context, question, no_context_answer, " no"))

    return pd.DataFrame(rows)
# Build the discriminator and Q&A datasets for both splits and write each one
# as a JSON Lines file named "<model>_<split>.jsonl".
# NOTE(review): the files are written to the current directory, while the
# fine_tunes.create commands below read them from "olympics-data/"; confirm
# which location is intended.
for name, is_disc in [('discriminator', True), ('qa', False)]:
    for train_test, dt in [('train', train_df), ('test', test_df)]:
        ft = create_fine_tuning_dataset(dt, discriminator=is_disc, n_negative=1, add_related=True)
        ft.to_json(f'{name}_{train_test}.jsonl', orient='records', lines=True)
我们根据微调工具的建议对数据进行了格式化,可以使用以下命令进行操作: > openai tools fine_tunes.prepare_data -f qa_train.jsonl
3.2 提交数据集进行微调
!openai api fine_tunes.create -t "olympics-data/discriminator_train.jsonl" -v "olympics-data/discriminator_test.jsonl" --batch_size 16 --compute_classification_metrics --classification_positive_class " yes" --model ada
!openai api fine_tunes.create -t "olympics-data/qa_train.jsonl" -v "olympics-data/qa_test.jsonl" --batch_size 16
3.3 使用微调后的模型
# Names of the fine-tuned models used by the examples below: the discriminator
# and the question-answering model.
ft_discriminator = "curie:ft-openai-internal-2021-08-23-23-58-57"
ft_qa = "curie:ft-openai-internal-2021-08-23-17-54-10"
def apply_ft_discriminator(context, question, discriminator_model):
    """
    Apply the fine-tuned discriminator to a question, to assess whether it
    can be answered from the context.

    Parameters
    ----------
    context: str
        The text the question should be answered from
    question: str
        The question to assess
    discriminator_model: str
        The name of the fine-tuned discriminator model

    Returns
    -------
    list
        The top log-probabilities for each generated token; since a single
        token is requested, the list holds one mapping from token to logprob
    """
    prompt = f"{context}\nQuestion: {question}\n Related:"
    # The fine-tuned model is prompted with plain text, so it has to go through
    # the Completions endpoint: the chat endpoint takes `messages`, not `prompt`.
    result = openai.completions.create(model=discriminator_model, prompt=prompt, max_tokens=1, temperature=0, top_p=1, n=1, logprobs=2)
    return result.choices[0].logprobs.top_logprobs
# Example: the context contains the answer, and the output below shows " yes"
# with a log-probability close to 0.
apply_ft_discriminator('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
'What was the first human-made object in space?', ft_discriminator)
[<OpenAIObject at 0x7fe812e602b0> JSON: {
" no": -10.819577,
" yes": -2.045765e-05
def apply_ft_qa_answer(context, question, answering_model):
    """
    Apply the fine-tuned Q&A model to a question, answering it from the context.

    Parameters
    ----------
    context: str
        The text the question should be answered from
    question: str
        The question to answer
    answering_model: str
        The name of the fine-tuned Q&A model

    Returns
    -------
    str
        The generated answer; generation stops at the first '.' or newline,
        or after 30 tokens
    """
    prompt = f"{context}\nQuestion: {question}\nAnswer:"
    # The fine-tuned model is prompted with plain text, so it has to go through
    # the Completions endpoint: the chat endpoint takes `messages`, not `prompt`,
    # and its choices carry no `text` field.
    result = openai.completions.create(model=answering_model, prompt=prompt, max_tokens=30, temperature=0, top_p=1, n=1, stop=['.', '\n'])
    return result.choices[0].text
# Example: a question whose answer is stated in the context.
apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
'What was the first human-made object in space?', ft_qa)
' The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957'
# Example: a more open-ended question about the same context.
apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
'What is impressive about the Soviet Union?', ft_qa)
' The Soviet Union was the first country to successfully launch a satellite into space'
# Example: a question the context says nothing about; the output below is the
# "No appropriate context found" reply.
apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
'How many cars were produced in the Soviet Union in 1970?', ft_qa)
' No appropriate context found to answer the question'
我们可以看到,模型知道何时应该回答问题,何时应该指出上下文不足以回答问题。
def answer_question_conditionally(answering_model, discriminator_model, context, question, discriminator_logprob_yes_modifier=0):
    """
    Answer the question from the context, but only if the discriminator
    considers the context related to the question.

    Parameters
    ----------
    answering_model: str
        The name of the fine-tuned Q&A model
    discriminator_model: str
        The name of the fine-tuned discriminator model
    context: str
        The text the question should be answered from
    question: str
        The question to answer
    discriminator_logprob_yes_modifier: float
        Added to the " yes" log-probability before it is compared with " no";
        positive values make the discriminator more permissive

    Returns
    -------
    str
        The Q&A model's answer, or a fixed message when the discriminator
        rejects the context
    """
    top_logprobs = apply_ft_discriminator(context, question, discriminator_model)
    # The discriminator returns one token->logprob mapping per generated token,
    # wrapped in a list. Testing `' yes' in <list>` is always False, which
    # silently disabled the discriminator, so unwrap the first token's mapping.
    if isinstance(top_logprobs, (list, tuple)):
        token_logprobs = top_logprobs[0] if top_logprobs else {}
    else:
        token_logprobs = top_logprobs or {}

    # A token missing from the top candidates is treated as very unlikely.
    yes_logprob = token_logprobs.get(' yes', -100)
    no_logprob = token_logprobs.get(' no', -100)
    if yes_logprob + discriminator_logprob_yes_modifier < no_logprob:
        return " No appropriate context found to answer the question based on the discriminator."
    return apply_ft_qa_answer(context, question, answering_model)
# Example: the context mentions weather-related concerns, so the question can
# be answered from it.
answer_question_conditionally(ft_qa, ft_discriminator,
"Crowdless games are a rare although not unheard-of occurrence in sports. \
When they do occur, it is usually the result of events beyond the control \
of the teams or fans, such as weather-related concerns, public health concerns, \
or wider civil disturbances unrelated to the game. For instance, \
the COVID-19 pandemic caused many sports leagues around the world \
to be played behind closed doors.",
"Could weather cause a sport event to have no crowd?")
' Weather could cause a sport event to have no crowd'
现在让我们来看一下答案端点的工作原理 - 结合搜索从知识库中检索相关内容,然后使用经过精细调整的问答模型来回答问题。
3.4 根据知识库回答问题
最后,我们可以使用类似于 /answers 端点的逻辑:首先搜索相关的上下文,然后请求问答模型根据该上下文回答问题。如果您想查看实现细节,请查看 answers_with_ft.py 文件。
from answers_with_ft import answer_question
# Example: answer a question using the search file and the fine-tuned Q&A model.
answer_question(olympics_search_fileid, ft_qa, "Which country won the Women's football tournament at the 2020 Olympic games?")
" Canada won the Women's football tournament at the 2020 Olympic games"