设置¶
我们加载一些数据,并定义一个非常简单的RAG查询引擎,我们将对其进行评估(使用top-k检索)。
%pip install llama-index-readers-file pymupdf
%pip install llama-index-llms-openai
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
mkdir: data: File exists --2023-09-19 00:05:14-- https://arxiv.org/pdf/2307.09288.pdf Resolving arxiv.org (arxiv.org)... 128.84.21.199 Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 13661300 (13M) [application/pdf] Saving to: ‘data/llama2.pdf’ data/llama2.pdf 100%[===================>] 13.03M 1.56MB/s in 9.3s 2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
# Load the downloaded PDF through the PyMuPDF reader.
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
# LLM handed to the query engine below.
llm = OpenAI(model="gpt-4")
# Split the loaded documents into nodes (chunk_size=1024).
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
# Build a vector index over the nodes and expose it as the query engine under evaluation.
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)
数据集生成¶
我们首先进行一个生成合成评估数据集的练习。我们通过从现有上下文中合成生成一组问题来实现这一点。然后,我们将每个问题连同对应的上下文一起输入一个强大的LLM(例如GPT-4),以生成“标准答案”(ground truth)回答。
定义函数¶
我们定义将用于数据集生成的函数:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
# GPT-4 client passed to the dataset-generation functions.
llm = OpenAI(model="gpt-4")
我们定义generate_answers_for_questions
来从给定上下文中生成问题的答案。
# Prompt used to answer one question from one context string.
# NOTE(review): the instruction line reads "...and non-prior knowledge"; the
# intent is presumably "...and not prior knowledge". The text is sent to the
# model verbatim, so confirm the wording before changing it.
QA_PROMPT = PromptTemplate(
    "以下是上下文信息。\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "根据上下文信息和非先验知识,回答问题。\n"
    "问题: {query_str}\n"
    "答案: "
)


def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Generate one answer per question from the given context.

    Args:
        questions: Questions to answer.
        context: Context text substituted into QA_PROMPT for every question.
        llm: LLM whose ``complete`` method is called once per question.

    Returns:
        The stringified completions, in the same order as ``questions``.
    """
    return [
        str(
            llm.complete(
                QA_PROMPT.format(context_str=context, query_str=question)
            )
        )
        for question in questions
    ]
我们定义generate_qa_pairs
函数来在整个节点列表上生成问答对。
# User message: supplies the context and asks for questions about it.
QUESTION_GEN_USER_TMPL = (
    "以下是上下文信息。\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "根据上下文信息和非先验知识,生成相关问题。"
)

# System message. The trailing backslashes are line continuations, so the
# resulting prompt is a single line of text.
QUESTION_GEN_SYS_TMPL = """\
您是一名老师/教授。您的任务是为即将到来的测验/考试设置 \
{num_questions_per_chunk} 个问题。这些问题在文档中应该具有多样性。将问题限制在 \
所提供的上下文信息中。\
"""

question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)


def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10
) -> List[Tuple[str, str]]:
    """Generate (question, answer) pairs for every node.

    For each node the LLM is first asked for questions about the node's
    content, then each question is answered against that same content via
    ``generate_answers_for_questions``.

    Args:
        nodes: Nodes whose content is used as context.
        llm: LLM used for both question generation and answering.
        num_questions_per_chunk: Number of questions requested per node.

    Returns:
        A flat list of (question, answer) tuples across all nodes.
    """
    qa_pairs = []
    for idx, node in enumerate(nodes):
        print(f"节点 {idx}/{len(nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            # Honor the caller's value instead of a hard-coded 10.
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content

        # One question per output line; strip a leading "1." / "1)" / "1 ".
        stripped_lines = (
            re.sub(r"^\d+[\).\s]", "", line).strip()
            for line in str(raw_output).strip().split("\n")
        )
        # Blank lines between questions would otherwise become empty
        # questions that are sent to the LLM and stored in the dataset.
        cleaned_questions = [question for question in stripped_lines if question]

        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        qa_pairs.extend(zip(cleaned_questions, answers))
    return qa_pairs
qa_pairs
[('What is the main focus of the work described in the document?', 'The main focus of the work described in the document is the development and release of Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. The fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. The document also provides a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat.'), ('What is the range of parameters for the large language models (LLMs) developed in this work?', 'The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion.'), ('What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?', 'The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat.'), ('How do the models developed in this work compare to open-source chat models based on the benchmarks tested?', 'The models developed in this work, specifically the fine-tuned LLMs called Llama 2-Chat, outperform open-source chat models on most benchmarks tested.'), ('What are the two key areas of human evaluation mentioned in the document for the developed models?', 'The two key areas of human evaluation mentioned in the document for the developed models are helpfulness and safety.'), ('What is the purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat?', 'The purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat is to enable the community to build on their work and contribute to the responsible development of Large Language Models (LLMs).'), ('What is the intended benefit for the community from this work?', 'The intended benefit for the community from this work is to enable them to build on the work and contribute to the responsible development of 
large language models (LLMs). The team provides a detailed description of their approach to fine-tuning and safety improvements of Llama 2-Chat for this purpose.'), ('Who are the corresponding authors of this work and how can they be contacted?', 'The corresponding authors of this work are Thomas Scialom and Hugo Touvron. They can be contacted via email at tscialom@meta.com and htouvron@meta.com respectively.'), ('What is the source of the document and how many pages does it contain?', 'The source of the document is "1" and it contains 77 pages.'), ('Where can the contributions of all the authors be found in the document?', 'The contributions of all the authors can be found in Section A.1 of the document.')]
在数据集上获取配对¶
注意:这可能需要很长时间。为了加快速度,请尝试输入节点的子集。
# Generate QA pairs over every node. To try this out quickly, pass the
# commented-out subset below instead of the full node list.
qa_pairs = generate_qa_pairs(
    # nodes[:1],
    nodes,
    llm,
    num_questions_per_chunk=10,
)
[可选] 定义保存/加载¶
# Save the generated QA pairs to disk.
import pickle

# The context manager closes the file even if pickling raises.
with open("eval_dataset.pkl", "wb") as f:
    pickle.dump(qa_pairs, f)
# Load previously saved QA pairs from disk.
import pickle

# NOTE: unpickling can execute arbitrary code; only load a file you created
# yourself.
with open("eval_dataset.pkl", "rb") as f:
    qa_pairs = pickle.load(f)
评估生成结果¶
在本节中,我们将介绍一些评估生成结果的方法。在高层次上,我们使用一个“评估LLM”来衡量生成结果的质量。我们在带标签设置和不带标签设置下进行评估。
我们将介绍以下评估算法:
- 正确性:将生成的答案与真实答案进行比较。
- 忠实度:评估回复是否忠实于上下文(无标签)。
构建一个正确性评估器¶
正确性评估器通过比较生成的答案和参考标准答案来评估查询的正确性。我们输出一个介于1和5之间的分数,其中1表示最差,5表示最佳。
我们通过聊天接口,使用一条系统提示和一条用户提示来实现这一点。
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict
# System prompt for the correctness judge: describes the three inputs, the
# required output layout (score on one line, reasoning on another) and the
# 1-5 scoring guidelines.
CORRECTNESS_SYS_TMPL = """您是一个问题回答聊天机器人的专家评估系统。

您将获得以下信息:
- 用户查询,
- 参考答案,以及
- 生成的答案。

您的任务是判断生成的答案的相关性和正确性。
输出一个代表全面评估的单一分数。
您必须在一行中返回您的响应,只包括分数。
不要以任何其他格式返回答案。
在另一行中提供您对分数的理由。

遵循以下评分指南:
- 您的分数必须在1到5之间,其中1是最差,5是最好。
- 如果生成的答案与用户查询不相关,您应该给出1分。
- 如果生成的答案相关但包含错误,您应该给出2到3分之间的分数。
- 如果生成的答案相关且完全正确,您应该给出4到5分之间的分数。"""

# User prompt: each input sits under its own "##" header so the query, the
# reference answer and the generated answer cannot run into each other.
CORRECTNESS_USER_TMPL = """## 用户查询
{query}

## 参考答案
{reference_answer}

## 生成的答案
{generated_answer}"""
# Chat prompt for the correctness judge: the system message carries the
# instructions, the user message carries the query and the two answers.
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=role, content=content)
        for role, content in (
            (MessageRole.SYSTEM, CORRECTNESS_SYS_TMPL),
            (MessageRole.USER, CORRECTNESS_USER_TMPL),
        )
    ]
)
现在我们已经定义了提示模板,让我们定义一个评估函数,将提示输入到LLM中,并将输出解析为结果字典。
from llama_index.llms.openai import OpenAI


def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: OpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Score a generated answer against a reference answer with an LLM judge.

    Args:
        query_str: The user query.
        reference_answer: The reference answer for the query.
        generated_answer: The answer being evaluated.
        llm: Chat LLM that acts as the judge.
        threshold: Minimum score for the result to count as passing.

    Returns:
        A dict with keys ``"passing"`` (score >= threshold), ``"score"``
        (float) and ``"reason"`` (the text following the score line; empty
        when the judge returned only a score).

    Raises:
        ValueError: If the first line of the judge's output is not a number.
    """
    fmt_messages = eval_chat_template.format_messages(
        llm=llm,
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content

    # The prompt asks for the score on the first line and the reasoning on a
    # later line. Stripping first tolerates a leading blank line, and
    # partition() (unlike unpacking split("\n", 1)) tolerates a reply that
    # contains only the score.
    score_str, _, reasoning_str = str(raw_output).strip().partition("\n")
    score = float(score_str)
    reasoning = reasoning_str.lstrip("\n")
    return {"passing": score >= threshold, "score": score, "reason": reasoning}
现在让我们使用聊天模型(GPT-4)在一些示例输入上运行该评估函数。
# Judge LLM for the examples below.
llm = OpenAI(model="gpt-4")

# Alternative example pair (swap in for the assignments below):
# query_str = "这项工作中开发的大型语言模型(LLMs)的参数范围是什么?"
# reference_answer = "这项工作中开发的大型语言模型(LLMs)的参数范围是70亿到700亿。"
query_str = (
    "为对话使用情况进行优化的精细调整LLMs被赋予了什么具体名称?"
)
reference_answer = (
    "为对话使用情况进行优化的精细调整LLMs被赋予了Llama 2-Chat的具体名称。"
)

# Answer produced by the query engine under evaluation.
generated_answer = str(query_engine.query(query_str))
print(generated_answer)
The fine-tuned Large Language Models (LLMs) optimized for dialogue use cases are specifically called Llama 2-Chat.
# Judge the generated answer against the reference answer; a score of at
# least 4.0 counts as passing.
eval_results = run_correctness_eval(
    query_str=query_str,
    reference_answer=reference_answer,
    generated_answer=generated_answer,
    llm=llm,
    threshold=4.0,
)
display(eval_results)
{'passing': True, 'score': 5.0, 'reason': 'The generated answer is completely relevant to the user query and matches the reference answer in terms of information. It correctly identifies "Llama 2-Chat" as the specific name given to the fine-tuned LLMs optimized for dialogue use cases.'}
构建一个忠实度评估器¶
忠实度评估器用于评估回复是否忠实于检索到的任何上下文。
这比正确性评估器更复杂一些。由于上下文集可能相当长,它们可能会超出上下文窗口。我们需要想出如何实现一种响应合成策略,以便按顺序迭代上下文。
我们有一个相应的教程,向您展示如何从头开始构建响应合成。我们还有开箱即用的响应合成模块。在本指南中,我们将使用开箱即用的模块。
# Few-shot YES/NO prompt: asks whether the information in {query_str} is
# supported by {context_str}. It is passed to Refine as the text-QA template,
# and the generated answer being checked is what fills {query_str}.
EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies tastes bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)
# Follow-up prompt passed to Refine as the refine template: given the existing
# YES/NO answer and more context in {context_msg}, keep YES if it was already
# YES, answer YES if the new context contains the information, otherwise NO.
EVAL_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
注意:在当前的响应合成器设置中,我们不会为聊天端点区分系统消息和用户消息,因此我们只是使用我们的标准 llm.complete
来完成文本。
我们现在在下面定义我们的函数。由于我们为给定的上下文定义了标准的评估模板,同时也为后续的上下文定义了一个优化模板,我们实现了我们的“创建和优化”响应合成策略来获得答案。
from llama_index.core.response_synthesizers import Refine
from typing import List, Dict


def run_faithfulness_eval(
    generated_answer: str,
    contexts: List[str],
    llm: OpenAI,
) -> Dict:
    """Check whether a generated answer is supported by the given contexts.

    A ``Refine`` synthesizer is configured with ``EVAL_TEMPLATE`` as its
    text-QA template and ``EVAL_REFINE_TEMPLATE`` as its refine template, and
    is asked for a response with the generated answer as the query.

    Args:
        generated_answer: The answer whose faithfulness is being checked.
        contexts: Context strings the answer is checked against.
        llm: LLM used by the synthesizer.

    Returns:
        A dict with ``"passing"`` (True when the response text contains
        "yes", case-insensitively) and ``"reason"`` (the response text).
    """
    synthesizer = Refine(
        llm=llm,
        text_qa_template=EVAL_TEMPLATE,
        refine_template=EVAL_REFINE_TEMPLATE,
    )
    verdict = str(synthesizer.get_response(generated_answer, contexts))
    return {"passing": "yes" in verdict.lower(), "reason": verdict}
让我们尝试在一些数据上进行操作。
# Reuses the query_str already defined for the correctness example.
response = query_engine.query(query_str)
generated_answer = str(response)

# Faithfulness is judged against the contexts retrieved for this response.
context_list = [n.get_content() for n in response.source_nodes]
eval_results = run_faithfulness_eval(
    generated_answer,
    contexts=context_list,
    llm=llm,
)
display(eval_results)
{'passing': True, 'reason': 'YES'}
import random
# Number of QA pairs to evaluate.
sample_size = 5
# Clamp to the dataset size: random.sample raises ValueError when asked for
# more items than the population contains.
qa_pairs_sample = random.sample(qa_pairs, min(sample_size, len(qa_pairs)))
import pandas as pd
def run_evals(qa_pairs: List[Tuple[str, str]], llm: OpenAI, query_engine):
    """Run the correctness and faithfulness evaluators over QA pairs.

    Args:
        qa_pairs: (question, reference_answer) tuples.
        llm: LLM used as the judge by both evaluators.
        query_engine: Engine queried with each question; its response must
            expose ``source_nodes``.

    Returns:
        A DataFrame with one row per pair and the boolean columns
        ``"correctness"`` and ``"faithfulness"``.
    """
    results_list = []
    for question, reference_answer in qa_pairs:
        response = query_engine.query(question)
        generated_answer = str(response)
        # Judge faithfulness against the contexts retrieved for THIS
        # question, not against a context list left over from another query.
        contexts = [n.get_content() for n in response.source_nodes]

        correctness_results = run_correctness_eval(
            # Pass the question being evaluated, not a module-level query.
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )
        faithfulness_results = run_faithfulness_eval(
            generated_answer,
            contexts=contexts,
            llm=llm,
        )
        results_list.append(
            {
                "correctness": correctness_results["passing"],
                "faithfulness": faithfulness_results["passing"],
            }
        )
    # Naming the columns keeps both present even when qa_pairs is empty.
    return pd.DataFrame(results_list, columns=["correctness", "faithfulness"])
# Evaluate the sampled QA pairs with both evaluators.
evals_df = run_evals(qa_pairs_sample, llm, query_engine)
# Fraction of sampled pairs that passed the correctness check.
evals_df["correctness"].mean()
0.4
# Fraction of sampled pairs that passed the faithfulness check.
evals_df["faithfulness"].mean()
0.6