3. 训练一个专门用于问答的微调模型



import openai
import pandas as pd
# ID of the uploaded search file; used as the default `file_id` when looking up
# similar contexts and passed to `answer_question` at the end of this notebook.
olympics_search_fileid = "file-c3shd8wqF3vSCKaukW4Jr1TT"

# Load the Olympics Q&A table (one row per article section, with its context
# and the generated questions/answers).
df = pd.read_csv(filepath_or_buffer='olympics-data/olympics_qa.csv')

title heading content tokens context questions answers
0 2020 Summer Olympics Summary The 2020 Summer Olympics (Japanese: 2020年夏季オリン... 713 2020 Summer Olympics Summary The 2020 Summ... 1. What is the 2020 Summer Olympics? 2. When ... 1. The 2020 Summer Olympics is an internationa...
1 2020 Summer Olympics Host city selection The International Olympic Committee (IOC) vote... 126 2020 Summer Olympics Host city selection T... 1. 2. 3. 4. 1. What is the International Olympic Committee...
2 2020 Summer Olympics Impact of the COVID-19 pandemic In January 2020, concerns were raised about th... 369 2020 Summer Olympics Impact of the COVID-19 p... 1. What was the COVID-19 pandemic? 2. How did... 1. The COVID-19 pandemic was a pandemic that o...
3 2020 Summer Olympics Qualifying event cancellation and postponement Concerns about the pandemic began to affect qu... 298 2020 Summer Olympics Qualifying event cancell... 1. What was the original location of the Asia ... 1. The original location of the Asia & Oceania...
4 2020 Summer Olympics Effect on doping tests Mandatory doping tests were being severely res... 163 2020 Summer Olympics Effect on doping tests ... 1. What was the COVID-19 pandemic? 2. What di... 1. The COVID-19 pandemic was a pandemic that o...


from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
# Notebook display: sizes of the two splits.
(len(train_df), len(test_df))

(3014, 754)




3.1 为问答和鉴别器模型创建微调数据集

微调数据集按以下方式创建。对于每一组对应的问题、答案和上下文,我们创建:

- 正例:正确的问题、答案、上下文组合
- 负例:
  - 随机负例:将随机选取的上下文与问题配对
  - 两个困难负例:
    - 一个来自同一篇维基百科文章
    - 另一个是与正确上下文最相似的上下文



import random

def get_random_similar_contexts(question, context, file_id=olympics_search_fileid, search_model='ada', max_rerank=10):
    """
    Pick, at random, one context that the search endpoint ranks close to the question.

    The question is searched against the uploaded file and, among the first
    three results, those whose text equals the known-correct ``context`` are
    discarded. One of the remaining texts is returned at random so it can be
    used as a hard negative example.

    question: str
        The question to search for
    context: str
        The correct context for the question; never returned
    file_id: str
        ID of the uploaded file to search in
    search_model: str
        The engine used for the search
    max_rerank: int
        Maximum number of documents the search endpoint re-ranks

    Returns the text of one similar context, or "" when the search fails or
    yields no context other than the correct one.
    """
    try:
        # TODO: openai.Engine(search_model) is deprecated.
        # NOTE(review): keyword arguments follow the legacy Search endpoint
        # (query / file / max_rerank) - confirm against the installed SDK.
        results = openai.Engine(search_model).search(
            query=question,
            file=file_id,
            max_rerank=max_rerank,
        )
        candidates = [
            result['text']
            for result in results['data'][:3]
            if result['text'] != context
        ]
    except Exception:
        # Deliberate best-effort: a failed search must not abort dataset
        # creation, so the caller simply receives an empty context.
        return ""

    if not candidates:
        # Every top result was the correct context (or nothing was returned).
        return ""
    return random.choice(candidates)

def create_fine_tuning_dataset(df, discriminator=False, n_negative=1, add_related=False):
    """
    Create a dataset for fine tuning the OpenAI model; either for a discriminator model,
    or a model specializing in Q&A, where it says if no relevant context is found.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the question, answer and context pairs
        (columns ``context``, ``questions``, ``answers`` and, when
        ``add_related`` is set, ``title``)
    discriminator: bool
        Whether to create a dataset for the discriminator
    n_negative: int
        The number of random negative samples to add (using a random context)
    add_related: bool
        Whether to add the related contexts to the correct context. These are hard negative examples

    Returns
    -------
    pd.DataFrame
        The dataframe containing the prompts and completions, ready for fine-tuning
    """
    rows = []

    # Positive examples: every question paired with its own context.
    for _, row in df.iterrows():
        context = row["context"]
        # The stored text starts right after the leading "1." of the numbered
        # list, so the prefix is restored before splitting into lines.
        questions = ("1." + row["questions"]).split('\n')
        answers = ("1." + row["answers"]).split('\n')
        for q, a in zip(questions, answers):
            if len(q) > 10 and len(a) > 10:
                question = q[2:].strip()  # drop the "N." numbering
                if discriminator:
                    rows.append({"prompt": f"{context}\nQuestion: {question}\n Related:", "completion": " yes"})
                else:
                    rows.append({"prompt": f"{context}\nQuestion: {question}\nAnswer:", "completion": f" {a[2:].strip()}"})

    # Negative examples: every question paired with contexts that do not answer it.
    negatives_per_question = n_negative + (2 if add_related else 0)
    for _, row in df.iterrows():
        context = row["context"]
        # Candidate pools are computed once per row instead of once per sample.
        other_contexts = df.loc[df["context"] != context, "context"]
        if add_related:
            same_article = df.loc[(df["title"] == row["title"]) & (df["context"] != context), "context"]
        for q in ("1." + row["questions"]).split('\n'):
            if len(q) <= 10:
                continue
            question = q[2:].strip()
            for j in range(negatives_per_question):
                if add_related and j == 0:
                    # Hard negative: another section of the same Wikipedia page.
                    if same_article.empty:
                        continue
                    negative_context = same_article.sample(1).iloc[0]
                elif add_related and j == 1:
                    # Hard negative: the most similar context according to the search.
                    # This is "" when the search fails or finds nothing else.
                    negative_context = get_random_similar_contexts(question, context, search_model='ada', max_rerank=10)
                else:
                    # Random context that is not the correct one. Sampling from the
                    # pre-filtered pool cannot loop forever when no other context exists.
                    if other_contexts.empty:
                        continue
                    negative_context = other_contexts.sample(1).iloc[0]

                if discriminator:
                    rows.append({"prompt": f"{negative_context}\nQuestion: {question}\n Related:", "completion": " no"})
                else:
                    rows.append({"prompt": f"{negative_context}\nQuestion: {question}\nAnswer:", "completion": " No appropriate context found to answer the question."})

    return pd.DataFrame(rows)


# Write four JSONL files: {discriminator, qa} x {train, test}.
for name, is_disc in {'discriminator': True, 'qa': False}.items():
    for train_test, dt in (('train', train_df), ('test', test_df)):
        ft = create_fine_tuning_dataset(
            dt,
            discriminator=is_disc,
            n_negative=1,
            add_related=True,
        )
        # One JSON record per line, as expected for fine-tuning uploads.
        ft.to_json(f'{name}_{train_test}.jsonl', orient='records', lines=True)

我们按照微调工具的建议对数据进行了格式化;该工具可以通过以下命令运行:

`openai tools fine_tunes.prepare_data -f qa_train.jsonl`


3.2 提交数据集进行微调

!openai api fine_tunes.create -t "olympics-data/discriminator_train.jsonl" -v "olympics-data/discriminator_test.jsonl" --batch_size 16  --compute_classification_metrics --classification_positive_class " yes" --model ada

!openai api fine_tunes.create -t "olympics-data/qa_train.jsonl" -v "olympics-data/qa_test.jsonl" --batch_size 16

3.3 使用微调后的模型


# Identifiers of the fine-tuned models used by the examples below.
# NOTE(review): both names start with "curie:" although the discriminator
# fine-tune command in this file passes `--model ada` - confirm which base
# model these IDs actually correspond to.
ft_discriminator = "curie:ft-openai-internal-2021-08-23-23-58-57"  # passed as `discriminator_model`
ft_qa = "curie:ft-openai-internal-2021-08-23-17-54-10"  # passed as `answering_model`

def apply_ft_discriminator(context, question, discriminator_model):
    """
    Ask the fine-tuned discriminator whether the question is related to the context.

    context: str
        The candidate context
    question: str
        The question to check against the context
    discriminator_model: str
        Name of the fine-tuned discriminator model

    Returns the ``top_logprobs`` field of the first choice: a list holding,
    for the single generated token, a mapping from the two most likely
    tokens (expected to be " yes" / " no") to their log probabilities.
    """
    # Same prompt layout as the discriminator training examples.
    prompt = f"{context}\nQuestion: {question}\n Related:"
    # The fine-tuned model is a prompt-based completion model, so the
    # Completions endpoint is used; the chat endpoint takes `messages`, not
    # `prompt`. One token is enough because the answer is " yes" or " no".
    result = openai.Completion.create(model=discriminator_model, prompt=prompt, max_tokens=1, temperature=0, top_p=1, n=1, logprobs=2)
    return result['choices'][0]['logprobs']['top_logprobs']

# Example: the context directly answers the question, so " yes" should dominate.
apply_ft_discriminator(
    context='The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
    question='What was the first human-made object in space?',
    discriminator_model=ft_discriminator,
)

[<OpenAIObject at 0x7fe812e602b0> JSON: {
" no": -10.819577,
" yes": -2.045765e-05


def apply_ft_qa_answer(context, question, answering_model):
    """
    Answer the question from the given context with the fine-tuned Q&A model.

    context: str
        The context the answer should be based on
    question: str
        The question to answer
    answering_model: str
        Name of the fine-tuned Q&A model

    Returns the generated text of the first choice. Generation stops at the
    first "." or newline, so the answer is at most one sentence and carries
    no trailing period.
    """
    # Same prompt layout as the Q&A training examples.
    prompt = f"{context}\nQuestion: {question}\nAnswer:"
    # The fine-tuned model is a prompt-based completion model, so the
    # Completions endpoint is used; the chat endpoint takes `messages`, not `prompt`.
    result = openai.Completion.create(model=answering_model, prompt=prompt, max_tokens=30, temperature=0, top_p=1, n=1, stop=['.','\n'])
    return result['choices'][0]['text']

# Example: the answer can be read directly from the context.
apply_ft_qa_answer(
    context='The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
    question='What was the first human-made object in space?',
    answering_model=ft_qa,
)

' The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957'


# Example: the question is only loosely covered by the context.
apply_ft_qa_answer(
    context='The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
    question='What is impressive about the Soviet Union?',
    answering_model=ft_qa,
)

' The Soviet Union was the first country to successfully launch a satellite into space'
# Example: the context says nothing about the question, so the model is
# expected to fall back to its "no appropriate context" answer.
apply_ft_qa_answer(
    context='The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',
    question='How many cars were produced in the Soviet Union in 1970?',
    answering_model=ft_qa,
)

' No appropriate context found to answer the question'



def answer_question_conditionally(answering_model, discriminator_model, context, question, discriminator_logprob_yes_modifier=0):
    """
    Answer the question only if the discriminator judges the context relevant.

    answering_model: str
        Name of the fine-tuned Q&A model
    discriminator_model: str
        Name of the fine-tuned discriminator model
    context: str
        The context the answer should be based on
    question: str
        The question to answer
    discriminator_logprob_yes_modifier: float
        Added to the " yes" log probability before comparing it with " no";
        positive values make the discriminator more permissive

    Returns the Q&A model's answer, or a fixed message when the discriminator
    rates " no" above the (adjusted) " yes".
    """
    top_logprobs = apply_ft_discriminator(context, question, discriminator_model)
    # The discriminator returns one token->logprob mapping per generated
    # token, wrapped in a list. Looking up ' yes' in that list never matches,
    # so the mapping of the first (and only) token is unwrapped first. A bare
    # mapping is still accepted for backward compatibility.
    if isinstance(top_logprobs, (list, tuple)):
        token_logprobs = top_logprobs[0] if top_logprobs else {}
    else:
        token_logprobs = top_logprobs or {}

    # A token missing from the top candidates is treated as very unlikely.
    yes_logprob = token_logprobs[' yes'] if ' yes' in token_logprobs else -100
    no_logprob = token_logprobs[' no'] if ' no' in token_logprobs else -100
    if yes_logprob + discriminator_logprob_yes_modifier < no_logprob:
        return " No appropriate context found to answer the question based on the discriminator."
    return apply_ft_qa_answer(context, question, answering_model)
# Example: run the discriminator first, then the Q&A model, on a context
# that does mention weather as a cause of crowdless games.
answer_question_conditionally(
    answering_model=ft_qa,
    discriminator_model=ft_discriminator,
    context=(
        "Crowdless games are a rare although not unheard-of occurrence in sports. "
        "When they do occur, it is usually the result of events beyond the control "
        "of the teams or fans, such as weather-related concerns, public health concerns, "
        "or wider civil disturbances unrelated to the game. For instance, "
        "the COVID-19 pandemic caused many sports leagues around the world "
        "to be played behind closed doors."
    ),
    question="Could weather cause a sport event to have no crowd?",
)

' Weather could cause a sport event to have no crowd'


现在让我们来看一下答案端点的工作原理:先结合搜索从知识库中检索相关内容,然后使用微调后的问答模型来回答问题。

3.4 根据知识库回答问题


from answers_with_ft import answer_question
# Example: answer a question against the knowledge base, passing the search
# file ID and the fine-tuned Q&A model to the imported helper.
answer_question(
    olympics_search_fileid,
    ft_qa,
    "Which country won the Women's football tournament at the 2020 Olympic games?",
)

" Canada won the Women's football tournament at the 2020 Olympic games"