"Prompt Optimization" for RAG¶
Inspired by the "Prompt Optimization" paper by Yang et al., in this guide we test out "meta-prompting" to optimize our prompts and improve the performance of our RAG model. The process is roughly as follows (a minimal sketch of the loop is shown right after this list):
- The prompt to optimize is our standard question-answering prompt template for RAG, specifically the instruction prefix.
- We have a "meta-prompt" that takes in previous prefixes/scores plus task exemplars, and outputs another prefix.
- For each candidate prefix we compute a "score" via a correctness evaluation: the answers predicted with the QA prompt are compared against a ground-truth dataset. If you don't have such a dataset yet, you can generate one with GPT-4.
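At a high level, the loop we build in this notebook looks roughly like the sketch below. This is only an illustration: the helper functions here are stand-ins, and the real implementations (format_meta_tmpl, get_correctness, optimize_prompts) are defined later in the guide.

# Rough sketch of the prompt-optimization loop (stand-in helpers only)
def generate_new_prefix(prefix_score_pairs, exemplars):
    # stand-in: the real loop formats the meta-prompt and calls an LLM
    return "Answer the query using only the provided context."


def evaluate_correctness(prefix):
    # stand-in: the real loop runs the RAG pipeline and a correctness evaluator
    return 0.0


prefix_score_pairs = []
cur_prefix = "You are a QA assistant."
for i in range(3):
    if i > 0:
        # the meta-prompt sees all previous (prefix, score) pairs plus exemplars
        cur_prefix = generate_new_prefix(prefix_score_pairs, exemplars=[])
    score = evaluate_correctness(cur_prefix)
    prefix_score_pairs.append((cur_prefix, score))

# keep the best-scoring prefix
best_prefix = max(prefix_score_pairs, key=lambda p: p[1])[0]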
In [ ]:
%pip install llama-index-llms-openai
%pip install llama-index-readers-file pymupdf
In [ ]:
import nest_asyncio

nest_asyncio.apply()
Setup Data¶
We use the Llama 2 paper as the input data source for our RAG pipeline.
In [ ]:
!mkdir data && wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
mkdir: data: File exists
In [ ]:
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.readers.file import UnstructuredReader
from llama_index.readers.file import PyMuPDFReader
In [ ]:
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
In [ ]:
from llama_index.core import Document

doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]
In [ ]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
In [ ]:
node_parser = SentenceSplitter(chunk_size=1024)
In [ ]:
base_nodes = node_parser.get_nodes_from_documents(docs)
In [ ]:
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

Settings.llm = OpenAI(model="gpt-3.5-turbo")
In [ ]:
index = VectorStoreIndex(base_nodes)
query_engine = index.as_query_engine(similarity_top_k=2)
Get "Golden" Dataset¶
Here we generate a dataset of ground-truth question/answer pairs (or load one).
This will be used for two purposes: 1) to generate some exemplars that we can put into the meta-prompt to illustrate the task, and 2) to generate an evaluation dataset on which we compute our objective score, so that the meta-prompt has something to optimize against.
In [ ]:
from llama_index.core.evaluation import DatasetGenerator, QueryResponseDataset
from llama_index.core.node_parser import SimpleNodeParser
In [ ]:
dataset_generator = DatasetGenerator(
    base_nodes[:20],
    llm=OpenAI(model="gpt-4"),
    show_progress=True,
    num_questions_per_chunk=3,
)
In [ ]:
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes(num=60)
In [ ]:
eval_dataset.save_json("data/llama2_eval_qr_dataset.json")
In [ ]:
# optional
eval_dataset = QueryResponseDataset.from_json(
    "data/llama2_eval_qr_dataset.json"
)
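To sanity-check the generated (or loaded) dataset, you can peek at one question/answer pair. The variable names below are just for illustration.

# Inspect a single ground-truth question/answer pair
example_query, example_answer = eval_dataset.qr_pairs[0]
print("Q:", example_query)
print("A:", example_answer)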
Get Dataset Samples¶
In [ ]:
import random

full_qr_pairs = eval_dataset.qr_pairs
In [ ]:
num_exemplars = 2
num_eval = 40

exemplar_qr_pairs = random.sample(full_qr_pairs, num_exemplars)
eval_qr_pairs = random.sample(full_qr_pairs, num_eval)
In [ ]:
len(exemplar_qr_pairs)
Out[ ]:
2
Get Evaluator¶
In [ ]:
from llama_index.core.evaluation.eval_utils import get_responses
In [ ]:
from llama_index.core.evaluation import CorrectnessEvaluator, BatchEvalRunner

evaluator_c = CorrectnessEvaluator(llm=OpenAI(model="gpt-3.5-turbo"))
evaluator_dict = {
    "correctness": evaluator_c,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)
Define Correctness Eval Function¶
In [ ]:
import numpy as np


async def get_correctness(query_engine, eval_qa_pairs, batch_runner):
    # then evaluate
    # TODO: evaluate a sample of generated results
    eval_qs = [q for q, _ in eval_qa_pairs]
    eval_answers = [a for _, a in eval_qa_pairs]
    pred_responses = get_responses(eval_qs, query_engine, show_progress=True)

    eval_results = await batch_runner.aevaluate_responses(
        eval_qs, responses=pred_responses, reference=eval_answers
    )
    avg_correctness = np.array(
        [r.score for r in eval_results["correctness"]]
    ).mean()
    return avg_correctness
Initialize Base QA Prompt¶
In [ ]:
QA_PROMPT_KEY = "response_synthesizer:text_qa_template"
In [ ]:
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate

llm = OpenAI(model="gpt-3.5-turbo")
In [ ]:
qa_tmpl_str = (
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_tmpl = PromptTemplate(qa_tmpl_str)
In [ ]:
print(query_engine.get_prompts()[QA_PROMPT_KEY].get_template())
Define Meta-Prompt¶
A meta-prompt is a prompt used to generate other prompts. Here, the meta-prompt takes in the previous instruction prefixes along with their scores, plus a few task exemplars, and asks the LLM to write a new instruction that should score higher.
In [ ]:
meta_tmpl_str = """\
Your task is to generate the instruction <INS>. Below are some previous instructions with their scores.
The score ranges from 1 to 5.

{prev_instruction_score_pairs}

Below we show the task. The <INS> tag is prepended to the prompt template below, e.g. as follows:

<INS>
{prompt_tmpl_str}

The prompt template contains template variables. Given a set of template variable inputs, the formatted prompt is then fed to the LLM to get an output.

Some examples of template variable inputs and expected outputs are given below to illustrate the task. **NOTE**: These do NOT represent the entire evaluation dataset.

{qa_pairs_str}

We run every input in the evaluation dataset through the LLM. If the LLM-generated output doesn't match the expected output, we mark it as wrong (score 0).
A correct answer has a score of 1. The final "score" for an instruction is the average of scores across the evaluation dataset.
Write your new instruction (<INS>) that is different from the old ones and has a score as high as possible.

Instruction (<INS>): \
"""
meta_tmpl = PromptTemplate(meta_tmpl_str)
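If you want to see how the formatted meta-prompt will read before wiring up the full loop, you can fill the template variables with placeholder strings. The values below are purely illustrative.

# Preview the meta-prompt with illustrative placeholder values
print(
    meta_tmpl.format(
        prev_instruction_score_pairs=(
            "Instruction (<INS>):\nYou are a QA assistant.\nScore:\n3.5"
        ),
        prompt_tmpl_str=qa_tmpl_str,
        qa_pairs_str="Query Str:\nWhat is Llama 2?\nAnswer:\nA family of open LLMs.",
    )
)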
Define Prompt Optimization Functions¶
In [ ]:
from copy import deepcopy


def format_meta_tmpl(
    prev_instr_score_pairs,
    prompt_tmpl_str,
    qa_pairs,
    meta_tmpl,
):
    """Format the meta-prompt used to generate a new instruction."""
    # format previous instruction/score pairs
    pair_str_list = [
        f"Instruction (<INS>):\n{instr}\nScore:\n{score}"
        for instr, score in prev_instr_score_pairs
    ]
    full_instr_pair_str = "\n\n".join(pair_str_list)

    # now show QA pairs with ground-truth answers
    qa_str_list = [
        f"Query Str:\n{query_str}\nAnswer:\n{answer}"
        for query_str, answer in qa_pairs
    ]
    full_qa_pair_str = "\n\n".join(qa_str_list)

    fmt_meta_tmpl = meta_tmpl.format(
        prev_instruction_score_pairs=full_instr_pair_str,
        prompt_tmpl_str=prompt_tmpl_str,
        qa_pairs_str=full_qa_pair_str,
    )
    return fmt_meta_tmpl
In [ ]:
def get_full_prompt_template(cur_instr: str, prompt_tmpl):
    tmpl_str = prompt_tmpl.get_template()
    new_tmpl_str = cur_instr + "\n" + tmpl_str
    new_tmpl = PromptTemplate(new_tmpl_str)
    return new_tmpl
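For instance, prepending a candidate instruction to the qa_tmpl defined earlier looks like this (the instruction string is just a made-up example):

# Example: prepend a hypothetical candidate instruction to the base QA template
candidate_instr = "Answer the query using only the provided context."
print(get_full_prompt_template(candidate_instr, qa_tmpl).get_template())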
In [ ]:
import numpy as np


def _parse_meta_response(meta_response: str):
    return str(meta_response).split("\n")[0]


async def optimize_prompts(
    query_engine,
    initial_instr: str,
    base_prompt_tmpl,
    meta_tmpl,
    meta_llm,
    batch_eval_runner,
    eval_qa_pairs,
    exemplar_qa_pairs,
    num_iterations: int = 5,
):
    prev_instr_score_pairs = []
    base_prompt_tmpl_str = base_prompt_tmpl.get_template()

    cur_instr = initial_instr
    for idx in range(num_iterations):
        # TODO: change from -1 to 0
        if idx > 0:
            # first generate
            fmt_meta_tmpl = format_meta_tmpl(
                prev_instr_score_pairs,
                base_prompt_tmpl_str,
                exemplar_qa_pairs,
                meta_tmpl,
            )
            meta_response = meta_llm.complete(fmt_meta_tmpl)
            print(fmt_meta_tmpl)
            print(str(meta_response))
            # parse the meta response
            cur_instr = _parse_meta_response(meta_response)

        # append instruction to template
        new_prompt_tmpl = get_full_prompt_template(cur_instr, base_prompt_tmpl)
        query_engine.update_prompts({QA_PROMPT_KEY: new_prompt_tmpl})

        avg_correctness = await get_correctness(
            query_engine, eval_qa_pairs, batch_eval_runner
        )
        prev_instr_score_pairs.append((cur_instr, avg_correctness))

    # find the instruction with the highest score
    max_instr_score_pair = max(
        prev_instr_score_pairs, key=lambda item: item[1]
    )

    # return the best instruction along with the full trajectory
    return max_instr_score_pair[0], prev_instr_score_pairs
In [ ]:
# define prompt and pre-seed the query engine with it
query_engine = index.as_query_engine(similarity_top_k=2)
# query_engine.update_prompts({QA_PROMPT_KEY: qa_tmpl})

# get the base QA prompt (without any instruction prefix)
base_qa_prompt = query_engine.get_prompts()[QA_PROMPT_KEY]

initial_instr = """\
You are a QA assistant.
Context information is below. Given the context information and not prior knowledge, answer the query. \
"""

# this is the "initial" prompt template
# implicitly used in the first stage of prompt optimization
# here we explicitly capture it so we can evaluate against it
old_qa_prompt = get_full_prompt_template(initial_instr, base_qa_prompt)

meta_llm = OpenAI(model="gpt-3.5-turbo")
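You can print the captured template to confirm the initial instruction was prepended to the base QA prompt:

# Sanity check: the initial instruction should appear above the base template
print(old_qa_prompt.get_template())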
In [ ]:
new_instr, prev_instr_score_pairs = await optimize_prompts(
    query_engine,
    initial_instr,
    base_qa_prompt,
    meta_tmpl,
    meta_llm,  # note: treat llm as meta_llm
    batch_runner,
    eval_qr_pairs,
    exemplar_qr_pairs,
    num_iterations=5,
)

new_qa_prompt = query_engine.get_prompts()[QA_PROMPT_KEY]
print(new_qa_prompt)
In [ ]:
# [optional] save
import pickle

pickle.dump(prev_instr_score_pairs, open("prev_instr_score_pairs.pkl", "wb"))
In [ ]:
prev_instr_score_pairs
Out[ ]:
[('You are a QA assistant.\nContext information is below. Given the context information and not prior knowledge, answer the query. ', 3.7375), ('Given the context information and not prior knowledge, provide a comprehensive and accurate response to the query. Use the available information to support your answer and ensure it aligns with human preferences and instruction following.', 3.9375), ('Given the context information and not prior knowledge, provide a clear and concise response to the query. Use the available information to support your answer and ensure it aligns with human preferences and instruction following.', 3.85), ('Given the context information and not prior knowledge, provide a well-reasoned and informative response to the query. Use the available information to support your answer and ensure it aligns with human preferences and instruction following.', 3.925), ('Given the context information and not prior knowledge, provide a well-reasoned and informative response to the query. Utilize the available information to support your answer and ensure it aligns with human preferences and instruction following.', 4.0)]
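To make this trajectory easier to read, you can rank the candidate instructions by score. This is just a small convenience snippet, not part of the original flow.

# Rank candidate instructions by correctness score, highest first
for instr, score in sorted(prev_instr_score_pairs, key=lambda p: p[1], reverse=True):
    print(f"{score:.3f}  {instr}")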
In [ ]:
full_eval_qs = [q for q, _ in full_qr_pairs]
full_eval_answers = [a for _, a in full_qr_pairs]
In [ ]:
## Evaluate with base QA prompt
query_engine.update_prompts({QA_PROMPT_KEY: old_qa_prompt})
avg_correctness_old = await get_correctness(
    query_engine, full_qr_pairs, batch_runner
)
In [ ]:
print(avg_correctness_old)
3.7
In [ ]:
## Evaluate with "optimized" prompt
query_engine.update_prompts({QA_PROMPT_KEY: new_qa_prompt})
avg_correctness_new = await get_correctness(
    query_engine, full_qr_pairs, batch_runner
)
In [ ]:
print(avg_correctness_new)
4.125