%pip install llama-index-readers-wikipedia
%pip install llama-index-finetuning
%pip install llama-index-llms-openai
%pip install llama-index-finetuning-callbacks
%pip install llama-index-llms-huggingface
# NOTE: this notebook makes several API calls to generate text with OpenAI GPT
# models as well as models hosted on HuggingFace. If you prefer not to wait for
# these generations, the data used in this notebook can be fetched with the
# `wget` command below.
# !wget "https://www.dropbox.com/scl/fo/m7skpjdbpb0g3p76y6epe/h?rlkey=omh2ysgh9qqqztf81qvjlivu2&dl=1" -O pairwise.zip
import nest_asyncio

# Patch the running event loop so top-level `await` works inside the notebook.
nest_asyncio.apply()
import os

# Models hosted on HuggingFace serve as our LLM answer generators.
HUGGING_FACE_TOKEN = os.getenv("HUGGING_FACE_TOKEN")

# GPT-4 and GPT-3.5 (plus OpenAI fine-tuning) are used for judging.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
import pandas as pd


def display_eval_df(question, source, answer_a, answer_b, result) -> None:
    """Pretty-print a question/answer pair together with the judge's verdict."""
    row = {
        "Question": question,
        "Source": source,
        "Model A": answer_a["model"],
        "Answer A": answer_a["text"],
        "Model B": answer_b["model"],
        "Answer B": answer_b["text"],
        "Score": result.score,
        "Judgement": result.feedback,
    }
    eval_df = pd.DataFrame(row, index=[0])
    # Constrain the long answer columns so they wrap in the notebook display.
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "300px",
            "overflow-wrap": "break-word",
        },
        subset=["Answer A", "Answer B"]
    )
    display(eval_df)
步骤1 生成数据集:train_dataset
和 test_dataset
¶
对于我们将生成问题并提示各种LLM回答的数据集,我们将使用WikipediaReader
来读取几个城市的“历史”页面。我们将构建两个数据集:一个用于train_dataset
,另一个用于test_dataset
。
!pip install wikipedia -q
[notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip
# Wikipedia pages
from llama_index.readers.wikipedia import WikipediaReader

train_cities = [
    "旧金山",
    "多伦多",
    "纽约",
    "温哥华",
    "蒙特利尔",
    "波士顿",
]
test_cities = [
    "东京",
    "新加坡",
    "巴黎",
]

# Load the "history of <city>" page for every city in each split.
train_documents = WikipediaReader().load_data(
    pages=[f"{x}的历史" for x in train_cities]
)
test_documents = WikipediaReader().load_data(
    pages=[f"{x}的历史" for x in test_cities]
)
使用DatasetGenerator
构建train_dataset
和test_dataset
¶
现在我们已经有了Document
的训练集和测试集,下一步是生成问题。为此,我们将使用DatasetGenerator
,它使用LLM从给定的文档集生成问题。
在这个部分,我们将生成一些问题,以便进行问答练习。
# Prompt handed to the DatasetGenerator: asks the LLM to produce exactly one
# quiz question grounded in the provided context chunk.
QUESTION_GEN_PROMPT = (
    "You are a Teacher/ Professor. Your task is to setup "
    "a quiz/examination. Using the provided context, formulate "
    "a single question that captures an important fact from the "
    "context. Restrict the question to the context information provided."
)
搞定这一切之后,让我们开始行动吧。首先,我们将基于前面加载的维基百科文档,生成一组问题。
# Generate questions from document chunks.
from llama_index.core.evaluation import DatasetGenerator
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.3)

# One DatasetGenerator each for the train and test document sets.
train_dataset_generator = DatasetGenerator.from_documents(
    train_documents,
    question_gen_query=QUESTION_GEN_PROMPT,
    llm=llm,
    show_progress=True,
    num_questions_per_chunk=25,
)
test_dataset_generator = DatasetGenerator.from_documents(
    test_documents,
    question_gen_query=QUESTION_GEN_PROMPT,
    llm=llm,
    show_progress=True,
    num_questions_per_chunk=25,
)
# Create questions from the nodes via the DatasetGenerator.
train_questions = train_dataset_generator.generate_questions_from_nodes(num=200)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:02<00:00, 36.34it/s]
# Generate questions for the test split as well.
test_questions = test_dataset_generator.generate_questions_from_nodes(num=150)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:02<00:00, 29.98it/s]
# Inspect how many questions were actually generated for each split
# (may be fewer than requested, depending on the number of nodes).
len(train_questions), len(test_questions)
(75, 64)
# Let's take a look at a few of them.
train_questions[:3]
['What event in 1906 caused significant damage to San Francisco but was followed by a quick rebuild?', 'What was the name of the first significant homestead established outside the immediate vicinity of Mission Dolores in San Francisco?', "What event in 1855 led to the establishment of San Francisco's first county hospital and the development of California's system of county hospitals for the poor?"]
# And a few from the test split.
test_questions[:3]
['Question: What was the name of the oldest Buddhist temple in Tokyo, founded in 628?', 'What event marked the end of the samurai system and feudal class divisions in Tokyo?', 'Question: What role did the Tokyo Imperial University play in the Meiji Era?']
生成问题的答案¶
接下来的步骤是使用LLM生成答案。请记住,重点是评判这些生成的答案。因此,我们稍后将使用GPT模型来评判这些答案。
但是为了生成这些问题的答案,我们将使用另外两个LLM,即:Llama-2和Mistral。为了做到这一点,我们首先要为我们的文档创建一个向量存储和一个相关的检索器,这两个LLM答案生成器都将使用。
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Vector index and retriever over the train documents.
train_index = VectorStoreIndex.from_documents(documents=train_documents)
train_retriever = VectorIndexRetriever(
    index=train_index,
    similarity_top_k=2,
)

# Vector index and retriever over the test documents (used later on).
test_index = VectorStoreIndex.from_documents(documents=test_documents)
test_retriever = VectorIndexRetriever(
    index=test_index,
    similarity_top_k=2,
)
从这里开始,我们将构建RetrieverQueryEngine
,它将接收我们的查询(即问题)进行处理。请注意,我们使用HuggingFaceInferenceAPI
来进行LLM答案生成,而Llama-2需要权限。如果您还没有获得对这些模型的访问权限,那么可以随意将Llama-2替换为您选择的其他模型。
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.huggingface import HuggingFaceInferenceAPI


def create_query_engine(
    hf_name: str, retriever: VectorIndexRetriever, hf_llm_generators: dict
) -> RetrieverQueryEngine:
    """Build a RetrieverQueryEngine backed by a HuggingFaceInferenceAPI LLM.

    `hf_name` must be a key of `hf_llm_generators`, which maps a short model
    alias to the full HuggingFace model name; raises KeyError otherwise.
    """
    if hf_name not in hf_llm_generators:
        raise KeyError("模型未在hf_llm_generators中列出")
    llm = HuggingFaceInferenceAPI(
        model_name=hf_llm_generators[hf_name],
        context_window=2048,  # needed for the refine response mode
        token=HUGGING_FACE_TOKEN,
    )
    return RetrieverQueryEngine.from_args(retriever=retriever, llm=llm)
# Define our LLM answer generators (query engines).
hf_llm_generators = {
    "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
    "llama2-7b-chat": "meta-llama/Llama-2-7b-chat-hf",
}

train_query_engines = {
    name: create_query_engine(name, train_retriever, hf_llm_generators)
    for name in hf_llm_generators
}
test_query_engines = {
    name: create_query_engine(name, test_retriever, hf_llm_generators)
    for name in hf_llm_generators
}
我们现在准备从各种LLM中生成答案。我们现在将为train_dataset
执行此操作,并暂时不为test_dataset
执行此操作,直到我们使用它的时候。
注意:生成答案可能需要一些时间。如果您不想等待,您可以选择加载包含每个问题的Llama-2和Mistral答案的train_qa.jsonl
文件。
import tqdm
import random

train_dataset = []
for q in tqdm.tqdm(train_questions):
    # Randomly select two LLMs to generate answers for this question.
    model_versus = random.sample(list(train_query_engines.items()), 2)

    data_entry = {"question": q}
    responses = []
    source = None

    # Generate an answer from each of the two chosen models.
    for name, engine in model_versus:
        response = engine.query(q)
        response_struct = {"model": name, "text": str(response)}
        # Both engines share the same retriever, so the top source node
        # should match; keep a truncated snippet of it as the reference.
        snippet = response.source_nodes[0].node.text[:1000] + "..."
        if source is not None:
            assert source == snippet
        else:
            source = snippet
        responses.append(response_struct)

    data_entry["answers"] = responses
    data_entry["source"] = source
    train_dataset.append(data_entry)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [07:40<00:00, 6.14s/it]
获取Mistral和LLama-2答案的GPT-4评估¶
正如之前多次提到的,本指南的目的是从GPT-4法官中微调LLM法官。因此,为了完成我们的train_dataset
,我们现在需要实例化我们的GPT-4法官,并让它评估其他LLM(Llama-2和Mistral)提供的答案。为此,我们将使用PairwiseComparisonEvaluator
类。然后,这个法官将比较这两个答案,并给出一个裁决,判断Llama-2的答案更好,Mistral的答案更好,还是平局。
这里有一点额外的细微之处,因为在两两评估中,我们必须注意“位置偏见”的潜在问题。这是指法官偏向于首先呈现给它的第一个答案(在提示/上下文中)。为了考虑这种位置偏见,我们要求GPT-4法官对每个样本进行两次评估,在第二次评估中,我们交换两个答案的呈现顺序(即第一次评估:Llama-2然后Mistral,第二次评估:Mistral然后Llama-2)。
最后,我们还使用OpenAIFineTuningHandler
,它将收集我们最终需要微调GPT-3.5的所有聊天记录。
注意:生成裁决需要一些时间。同样,您可以选择将train_qa.jsonl
加载为train_dataset
。此外,我们还存储了传递给OpenAI用于微调GPT-3.5的JSONL文件。
# Instantiate the GPT-4 judge.
from llama_index.llms.openai import OpenAI
from llama_index.finetuning.callbacks import OpenAIFineTuningHandler
from llama_index.core.callbacks import CallbackManager
from llama_index.core.evaluation import PairwiseComparisonEvaluator
from llama_index.core import Settings

# NOTE: this finetuning_handler will collect 2x chat histories for every
# query: one for the original ordering and one for the flipped ordering.
main_finetuning_handler = OpenAIFineTuningHandler()
callback_manager = CallbackManager([main_finetuning_handler])
Settings.callback_manager = callback_manager

llm_4 = OpenAI(temperature=0, model="gpt-4", callback_manager=callback_manager)

# BUG FIX: the judge must be backed by the GPT-4 LLM just created (`llm_4`),
# not `llm` — the gpt-3.5 model defined earlier for question generation.
gpt4_judge = PairwiseComparisonEvaluator(llm=llm_4)
for data_entry in tqdm.tqdm(train_dataset): final_eval_result = await gpt4_judge.aevaluate( query=data_entry["question"], response=data_entry["answers"][0]["text"], second_response=data_entry["answers"][1]["text"], reference=data_entry["source"], ) # 保存最终结果 judgement = {} judgement["llm"] = "gpt_4" judgement["score"] = final_eval_result.score judgement["text"] = final_eval_result.response judgement["source"] = final_eval_result.pairwise_source data_entry["evaluations"] = [judgement]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [48:04<00:00, 38.46s/it]
让我们看看GPT-4评估中的一个示例。
# Let's see the last one.
display_eval_df(
    question=data_entry["question"],
    source=data_entry["source"],
    answer_a=data_entry["answers"][0],
    answer_b=data_entry["answers"][1],
    result=final_eval_result,
)
Question | Source | Model A | Answer A | Model B | Answer B | Score | Judgement | |
---|---|---|---|---|---|---|---|---|
0 | What resources are available in Boston for studying and preserving historical artifacts and remains? | In 1963, Boston Mayor John F. Collins and Boston Redevelopment Authority (BRA) executive Edward J. Logue organized a consortium of savings banks, cooperatives, and federal and state savings and loan associations in the city called the Boston Banks Urban Renewal Group (B-BURG) that would reverse redline parts of Dorchester, Roxbury, and Mattapan along Blue Hill Avenue. Despite the passage of legislation by the 156th Massachusetts General Court banning racial discrimination or segregation in housing in 1950, as well as the issuance of Executive Order 11063 by President John F. Kennedy in 1962 requiring all federal agencies to prevent racial discrimination in all federally-funded subsidized housing in the United States, the Boston Housing Authority (BHA) Board actively segregated the public housing developments in the city during the Collins administration as well, with BHA departments engaging in bureaucratic resistance against integration through at least 1966 and the Board retaining co... | llama2-7b-chat | There are several resources available in Boston for studying and preserving historical artifacts and remains. Some of these include: 1. The Boston Historical Society: This organization provides access to historical artifacts, photographs, and documents related to Boston's history. 2. Boston Mapjunction: This website provides over 200 historical maps of Boston dating back to 1630, as well as aerial photos compared with maps of today. 3. City of Boston Archaeology Program and Lab: The City of Boston has a City Archaeologist on staff to oversee any lots of land to be developed for historical artifacts and significance, and to manage the archaeological remains located on public land in Boston. The program also has an Archaeology Laboratory and Education and Curation Center. 4. 
The Freedom House Photographs Collection: This collection contains over 2,000 images of Roxbury people, places, and events from 1950 to 1975. 5. Vital Records of Boston: This resource provides access to historical records related to births, marriages, and deaths in Boston. 6. Reading and Everyday Life in | mistral-7b-instruct | The City of Boston has a City Archaeologist on staff to oversee any lots of land to be developed for historical artifacts and significance, and to manage the archaeological remains located on public land in Boston, and also has a City Archaeology Program and an Archaeology Laboratory, Education and Curation Center. The Freedom House Photographs Collection contains over 2,000 images of Roxbury people, places and events, 1950–1975 (Archives and Special Collections of the Northeastern University Libraries in Boston, MA). | 1.000000 | Assistant A provides a more comprehensive answer, listing several resources available in Boston for studying and preserving historical artifacts and remains. These include the Boston Historical Society, Boston Mapjunction, the City of Boston Archaeology Program and Lab, the Freedom House Photographs Collection, and Vital Records of Boston. This answer is more detailed and provides a wider range of resources for the user to explore. Assistant B, on the other hand, only mentions the City of Boston Archaeology Program and Lab and the Freedom House Photographs Collection. While these are relevant resources, the answer lacks the depth and variety of Assistant A's response. Therefore, based on the depth, variety, and level of detail in the responses, Assistant A's answer is superior. Final Verdict: [[A]] |
对JSONL进行特别注意¶
由于存在两种评估(一种是LLM答案原始呈现顺序的评估,另一种是翻转顺序的评估),我们需要小心选择正确的评估结果,以保留在我们的微调数据集中。这意味着我们需要挑选出由我们的 OpenAIFineTuningHandler
收集的正确事件,然后只使用这些事件来准备我们将传递给OpenAI微调API的JSONL。
# Persist every collected chat history (original + flipped per query) as JSONL.
main_finetuning_handler.save_finetuning_events(
    "pairwise_finetuning_events.jsonl"
)
Wrote 150 examples to pairwise_finetuning_events.jsonl
import json

# Load the master dataset of fine-tuning examples.
with open("pairwise_finetuning_events.jsonl") as f:
    combined_finetuning_events = [json.loads(line) for line in f]
# The judge always evaluated the original presentation order first, then the
# flipped order, so the events alternate: even indices hold the original
# ordering, odd indices hold the flipped re-judgement.
finetuning_events = combined_finetuning_events[::2]
flipped_finetuning_events = combined_finetuning_events[1::2]
# Sanity check: every original-order event has a flipped counterpart.
assert len(finetuning_events) == len(flipped_finetuning_events)
# Keep only the chat history whose presentation order the judge resolved on.
resolved_finetuning_events = []
for ix, data_entry in enumerate(train_dataset):
    origin = data_entry["evaluations"][0]["source"]
    if origin == "original":
        resolved_finetuning_events.append(finetuning_events[ix])
    elif origin == "flipped":
        resolved_finetuning_events.append(flipped_finetuning_events[ix])
    # Inconclusive judgements are dropped entirely.
# Write the resolved events to the JSONL file passed to OpenAI fine-tuning.
with open("resolved_pairwise_finetuning_events.jsonl", "w") as outfile:
    for entry in resolved_finetuning_events:
        outfile.write(json.dumps(entry) + "\n")
第2步 执行知识蒸馏¶
好的,现在是时候从GPT-4中提炼一些知识到GPT-3.5了。为了做到这一点,我们将使用OpenAIFinetuneEngine
类以及刚刚创建的resolved_pairwise_finetuning_events.jsonl
文件。
from llama_index.finetuning import OpenAIFinetuneEngine

# Distill the GPT-4 judgements into gpt-3.5-turbo via OpenAI fine-tuning,
# using the resolved JSONL produced above.
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    "resolved_pairwise_finetuning_events.jsonl",
)
# Kick off the fine-tuning job (runs asynchronously on OpenAI's side).
finetune_engine.finetune()
Num examples: 72 First example: {'role': 'system', 'content': "Please act as an impartial judge and evaluate the quality of the responses provided by two AI question-answering assistants to the user question perhaps with added reference which are displayed below. You should choose the assistant that follows the user’s instructions and answers the user’s question better using the provided context. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: '[[A]]' if assistant A is better, '[[B]]' if assistant B is better, and '[[C]]' for a tie.\n"} {'role': 'user', 'content': "[User Question]\nWhat event in 1906 caused significant damage to San Francisco but was followed by a quick rebuild?\n\n[The Start of Reference]\n=== Reconstruction ===\nAlmost immediately after the quake re-planning and reconstruction plans were hatched to quickly rebuild the city. One of the more famous and ambitious plans, proposed before the fire, came from famed urban planner, Daniel Burnham. His bold plan called for Haussmann style avenues, boulevards, and arterial thoroughfares that radiated across the city, a massive civic center complex with classical structures, what would have been the largest urban park in the world, stretching from Twin Peaks to Lake Merced with a large athenaeum at its peak, and various other proposals. This plan was dismissed by critics (both at the time and now), as impractical and unrealistic to municipal supply and demand. 
Property owners and the Real Estate industry were against the idea as well due to the amounts of their land the city would have to purchase to realize such proposals. While the original street grid was restored, many of Burnham's proposals eventually saw the light of day such ...\n[The End of Reference]\n\n[The Start of Assistant A’s Answer]\n\n\nThe 1906 earthquake and fire caused significant damage to San Francisco, with over 80% of the city destroyed, including almost all of the downtown core. The earthquake, which had a magnitude of 7.8 on the Richter scale, resulted from the rupture of over 270 miles of the San Andreas Fault, centered immediately offshore of San Francisco. Water mains ruptured throughout the city, and the fires that followed burned out of control for days, trapping many residents and causing widespread destruction. Despite the devastation, the city quickly rebuilt, with many of the buildings constructed after the disaster being more earthquake-resistant than those that had come before. The official death toll from this event was initially reported as 478, but was later revised to over 3,000.\n[The End of Assistant A’s Answer]\n\n[The Start of Assistant B’s Answer]\n1906 earthquake and fire.\n[The End of Assistant B’s Answer]"} {'role': 'assistant', 'content': "Assistant A provides a detailed and comprehensive response to the user's question, explaining the event that caused significant damage to San Francisco in 1906, which was the earthquake and fire. It also provides additional information about the earthquake's magnitude, the extent of the damage, and the city's quick rebuilding efforts. On the other hand, Assistant B's response is very brief and only mentions the earthquake and fire, without providing any additional information or context. Therefore, Assistant A's response is more helpful, relevant, accurate, and detailed. 
\n\nFinal Verdict: [[A]]"} No errors found Num examples missing system message: 0 Num examples missing user message: 0 #### Distribution of num_messages_per_example: min / max: 3, 3 mean / median: 3.0, 3.0 p5 / p95: 3.0, 3.0 #### Distribution of num_total_tokens_per_example: min / max: 579, 1198 mean / median: 818.9305555555555, 772.0 p5 / p95: 625.9, 1076.0 #### Distribution of num_assistant_tokens_per_example: min / max: 66, 248 mean / median: 129.26388888888889, 117.5 p5 / p95: 81.0, 193.9 0 examples may be over the 4096 token limit, they will be truncated during fine-tuning Dataset has ~58963 tokens that will be charged for during training By default, you'll train for 3 epochs on this dataset By default, you'll be charged for ~176889 tokens As of August 22, 2023, fine-tuning gpt-3.5-turbo is $0.008 / 1K Tokens. This means your total cost for training will be $0.471704 per epoch.
# We can check the status of our current job as follows.
# This may take some time...
finetune_engine.get_current_job()
<FineTuningJob fine_tuning.job id=ftjob-jLxZggQbHz2F98IlhQEI9KIw at 0x2e6b91170> JSON: { "object": "fine_tuning.job", "id": "ftjob-jLxZggQbHz2F98IlhQEI9KIw", "model": "gpt-3.5-turbo-0613", "created_at": 1698817329, "finished_at": 1698817949, "fine_tuned_model": "ft:gpt-3.5-turbo-0613:llamaindex::8FyRSSOl", "organization_id": "org-1ZDAvajC6v2ZtAP9hLEIsXRz", "result_files": [ "file-qLTnxGSZX2rHP0Q7wJIDDNWX" ], "status": "succeeded", "validation_file": null, "training_file": "file-xsAaOBjQ949ti0qk1xHHLOiF", "hyperparameters": { "n_epochs": 3 }, "trained_tokens": 176457, "error": null }
3 评估在测试数据集上对GPT-3.5进行微调¶
现在我们已经有了我们的微调过的GPT-3.5,让我们看看它在测试集上的表现如何。但首先,请记住我们说过要推迟创建test_dataset
直到我们需要它的时候?现在是时候了。因此,我们将在这里重复创建train_dataset
的过程,但这次是为了test_dataset
。
注意:生成这些答案和评估需要一些时间。您可以选择加载test_qa_complete.jsonl
,其中包含了三个考虑的LLM评估的所有内容。您可以将其加载为test_dataset
,然后运行下面“指标”小节中的代码。
import random

# Generate answers to the test queries using the Llama-2 and Mistral LLMs.
test_dataset = []
for q in tqdm.tqdm(test_questions):
    # Randomly select two LLMs to generate answers for this question.
    model_versus = random.sample(list(test_query_engines.items()), 2)

    data_entry = {"question": q}
    responses = []
    source = None

    # Generate an answer from each of the two chosen models.
    for name, engine in model_versus:
        response = engine.query(q)
        response_struct = {"model": name, "text": str(response)}
        # Both engines share the same retriever, so the top source node
        # should match; keep a truncated snippet of it as the reference.
        snippet = response.source_nodes[0].node.text[:1000] + "..."
        if source is not None:
            assert source == snippet
        else:
            source = snippet
        responses.append(response_struct)

    data_entry["answers"] = responses
    data_entry["source"] = source
    test_dataset.append(data_entry)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [28:23<00:00, 26.62s/it]
# 获取Mistal和Llama-2答案的GPT-4判断for data_entry in tqdm.tqdm(test_dataset): final_eval_result = await gpt4_judge.aevaluate( query=data_entry["question"], response=data_entry["answers"][0]["text"], second_response=data_entry["answers"][1]["text"], reference=data_entry["source"], ) # 保存最终结果 judgement = {} judgement["llm"] = "gpt_4" judgement["score"] = final_eval_result.score judgement["text"] = final_eval_result.response judgement["source"] = final_eval_result.pairwise_source data_entry["evaluations"] = [judgement]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [43:21<00:00, 40.66s/it]
from llama_index.core.evaluation import EvaluationResult# 使用我们经过精细调整的 GPT-3.5 来评估答案ft_llm = finetune_engine.get_finetuned_model()ft_gpt_3p5_judge = PairwiseComparisonEvaluator(llm=ft_llm)for data_entry in tqdm.tqdm(test_dataset): try: final_eval_result = await ft_gpt_3p5_judge.aevaluate( query=data_entry["question"], response=data_entry["answers"][0]["text"], second_response=data_entry["answers"][1]["text"], reference=data_entry["source"], ) except: final_eval_result = EvaluationResult( query=data_entry["question"], response="", passing=None, score=0.5, feedback="", pairwise_source="output-cannot-be-parsed", ) # 保存最终结果 judgement = {} judgement["llm"] = "ft_gpt_3p5" judgement["score"] = final_eval_result.score judgement["text"] = final_eval_result.response judgement["source"] = final_eval_result.pairwise_source data_entry["evaluations"] += [judgement]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [04:08<00:00, 3.88s/it]
# 同样地,使用一个未经微调的评判器来评估答案gpt_3p5_llm = OpenAI(model="gpt-3.5-turbo")gpt_3p5_judge = PairwiseComparisonEvaluator(llm=gpt_3p5_llm)for data_entry in tqdm.tqdm(test_dataset): try: final_eval_result = await gpt_3p5_judge.evaluate( query=data_entry["question"], response=data_entry["answers"][0]["text"], second_response=data_entry["answers"][1]["text"], reference=data_entry["source"], ) except: final_eval_result = EvaluationResult( query=data_entry["question"], response="", passing=None, score=0.5, feedback="", pairwise_source="output-cannot-be-parsed", ) # 保存最终结果 judgement = {} judgement["llm"] = "gpt_3p5" judgement["score"] = final_eval_result.score judgement["text"] = final_eval_result.response judgement["source"] = final_eval_result.pairwise_source data_entry["evaluations"] += [judgement]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [09:32<00:00, 8.95s/it]
指标¶
哇!现在我们已经生成了LLM法官对测试查询中Llama-2/Mistral答案的所有评估。现在让我们从定量的角度来看一下fine-tuned GPT-3.5与GPT-4有多接近。
为此,我们报告几个指标,即:
- 与GPT-4评估的一致率
- 与GPT-4评估的相关性
- 与GPT-4评估的Jaccard相似度
我们还报告了“不确定”的次数,这是指当LLM法官在呈现了Llama-2和Mistral答案的翻转顺序后改变了决定的情况。更高的不确定次数表明LLM法官容易受到位置偏见的影响,这是不好的!
!pip install scikit-learn -q
[notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip
import numpy as np

# Collect per-sample scores and an "inconclusive" flag for each LLM judge.
scores = {"gpt_4": [], "gpt_3p5": [], "ft_gpt_3p5": []}
inconclusives = {"gpt_4": [], "gpt_3p5": [], "ft_gpt_3p5": []}

for d in test_dataset:
    for e in d["evaluations"]:
        scores[e["llm"]].append(e["score"])
        # A judgement is inconclusive when the verdict changed after the
        # answer order was flipped (i.e. neither "original" nor "flipped").
        inconclusives[e["llm"]].append(
            e["source"] not in ("original", "flipped")
        )
# Template for the per-judge comparison report printed in the metrics cell.
REPORT_FMT_STR = (
    "{model}\n"
    "-----------------\n"
    "Number of inconclusives: {inconclusive}\n"
    "Number of agreements with GPT-4: {agreement} out of {total}\n"
    "Agreement rate: {agreement_rate}\n"
    "Correlation: {corr}\n"
    "Jaccard: {jacc}\n\n"
)
from sklearn.metrics import jaccard_score

# Convert the score lists to numpy arrays.
np_scores_gpt_4 = np.array(scores["gpt_4"])
np_scores_gpt_3p5 = np.array(scores["gpt_3p5"])
np_scores_ft_gpt_3p5 = np.array(scores["ft_gpt_3p5"])

# A comparison is only meaningful where BOTH judges were conclusive.
ft_mask = ~np.array(inconclusives["gpt_4"]) * ~np.array(
    inconclusives["ft_gpt_3p5"]
)
no_ft_mask = ~np.array(inconclusives["gpt_4"]) * ~np.array(
    inconclusives["gpt_3p5"]
)

# Agreement rates with the GPT-4 judge.
agreement_ft = sum(np_scores_gpt_4[ft_mask] == np_scores_ft_gpt_3p5[ft_mask])
agreement_rate_ft = agreement_ft / sum(ft_mask)
agreement_no_ft = sum(
    np_scores_gpt_4[no_ft_mask] == np_scores_gpt_3p5[no_ft_mask]
)
agreement_rate_no_ft = agreement_no_ft / sum(no_ft_mask)

# Correlations with the GPT-4 judge.
corr_ft = np.corrcoef(
    np_scores_gpt_4[ft_mask], np_scores_ft_gpt_3p5[ft_mask]
)[0, 1]
corr_no_ft = np.corrcoef(
    np_scores_gpt_4[no_ft_mask], np_scores_gpt_3p5[no_ft_mask]
)[0, 1]

# Jaccard similarities with the GPT-4 judge (scores cast to str labels).
jaccard_ft = jaccard_score(
    np_scores_gpt_4[ft_mask].astype(str),
    np_scores_ft_gpt_3p5[ft_mask].astype(str),
    average="weighted",
)
jaccard_no_ft = jaccard_score(
    np_scores_gpt_4[no_ft_mask].astype(str),
    np_scores_gpt_3p5[no_ft_mask].astype(str),
    average="weighted",
)

print(
    REPORT_FMT_STR.format(
        model="GPT-3.5 w/ fine-tuning",
        inconclusive=sum(inconclusives["ft_gpt_3p5"]),
        agreement=agreement_ft,
        total=sum(ft_mask),
        agreement_rate=agreement_rate_ft,
        corr=corr_ft,
        jacc=jaccard_ft,
    ))
print(
    REPORT_FMT_STR.format(
        model="GPT-3.5 w/out fine-tuning",
        inconclusive=sum(inconclusives["gpt_3p5"]),
        agreement=agreement_no_ft,
        total=sum(no_ft_mask),
        agreement_rate=agreement_rate_no_ft,
        corr=corr_no_ft,
        jacc=jaccard_no_ft,
    ))
print(
    f"GPT-4\n-----------------\nInconclusive Count: {sum(inconclusives['gpt_4'])}"
)
GPT-3.5 w/ fine-tuning ----------------- Number of inconclusives: 15 Number of agreements with GPT-4: 41 out of 47 Agreement rate: 0.8723404255319149 Correlation: 0.765365523658036 Jaccard: 0.773126734505088 GPT-3.5 w/out fine-tuning ----------------- Number of inconclusives: 24 Number of agreements with GPT-4: 32 out of 38 Agreement rate: 0.8421052631578947 Correlation: 0.671929323262293 Jaccard: 0.7308712958867757 GPT-4 ----------------- Inconclusive Count: 4
结论¶
从上面的数据中,我们可以看到,对GPT-3.5评判进行微调会产生比未经微调的GPT-3.5评判更高的一致性得分、相关性和Jaccard相似度。更重要的是,我们还看到经过微调后,无法得出结论的数量也有所减少。总的来说,我们看到在这里进行微调帮助我们获得了一个更接近GPT-4评判(因此间接地更接近人类判断)的GPT-3.5评判,同时也帮助纠正了未经微调的GPT-3.5可能存在的位置偏见。