In [ ]:
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-readers-file pymupdf
%pip install llama-index-experimental-param-tuner
In [ ]:
!pip install llama-index llama-hub
In [ ]:
!mkdir data && wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
--2023-11-04 00:16:34--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’

data/llama2.pdf     100%[===================>]  13.03M   533KB/s    in 36s

2023-11-04 00:17:10 (376 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
In [ ]:
import nest_asyncio

nest_asyncio.apply()
In [ ]:
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.readers.file import UnstructuredReader
from llama_index.readers.file import PyMuPDFReader
In [ ]:
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
In [ ]:
from llama_index.core import Document

doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]
In [ ]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import IndexNode
Load the "Golden" Evaluation Dataset¶

Here we set up a "golden" evaluation dataset over the Llama 2 paper.

NOTE: we pull this dataset from Dropbox. For details on how to generate a dataset, please see our DatasetGenerator module.
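If you would rather build the dataset yourself instead of downloading it, a minimal sketch along the following lines should work. This uses the DatasetGenerator API mentioned above; the model, chunk size, and number of questions are illustrative choices, not values taken from this notebook.

from llama_index.core.evaluation import DatasetGenerator
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI

# sketch: parse the paper into nodes, then have an LLM write
# question / reference-answer pairs over those nodes
nodes = SimpleNodeParser.from_defaults(chunk_size=1024).get_nodes_from_documents(docs)
dataset_generator = DatasetGenerator(
    nodes,
    llm=OpenAI(model="gpt-4"),  # illustrative model choice
    show_progress=True,
    num_questions_per_chunk=2,
)
eval_dataset = dataset_generator.generate_dataset_from_nodes(num=60)
eval_dataset.save_json("data/llama2_eval_qr_dataset.json")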
In [ ]:
!wget "https://www.dropbox.com/scl/fi/fh9vsmmm8vu0j50l3ss38/llama2_eval_qr_dataset.json?rlkey=kkoaez7aqeb4z25gzc06ak6kb&dl=1" -O data/llama2_eval_qr_dataset.json
In [ ]:
from llama_index.core.evaluation import QueryResponseDataset
In [ ]:
# optional
eval_dataset = QueryResponseDataset.from_json(
    "data/llama2_eval_qr_dataset.json"
)
In [ ]:
eval_qs = eval_dataset.questions
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]
In [ ]:
from llama_index.core import (
    VectorStoreIndex,
    load_index_from_storage,
    StorageContext,
)
from llama_index.experimental.param_tuner import ParamTuner
from llama_index.core.param_tuner.base import TunedResult, RunResult
from llama_index.core.evaluation.eval_utils import (
    get_responses,
    aget_responses,
)
from llama_index.core.evaluation import (
    SemanticSimilarityEvaluator,
    BatchEvalRunner,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

import os
import numpy as np
from pathlib import Path
Helper Functions¶
In [ ]:
def _build_index(chunk_size, docs):
    index_out_path = f"./storage_{chunk_size}"
    if not os.path.exists(index_out_path):
        Path(index_out_path).mkdir(parents=True, exist_ok=True)
        # parse the documents into nodes
        node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
        base_nodes = node_parser.get_nodes_from_documents(docs)

        # build the index
        index = VectorStoreIndex(base_nodes)
        # save the index to disk
        index.storage_context.persist(index_out_path)
    else:
        # rebuild the storage context
        storage_context = StorageContext.from_defaults(
            persist_dir=index_out_path
        )
        # load the index
        index = load_index_from_storage(
            storage_context,
        )
    return index


def _get_eval_batch_runner():
    evaluator_s = SemanticSimilarityEvaluator(embed_model=OpenAIEmbedding())
    eval_batch_runner = BatchEvalRunner(
        {"semantic_similarity": evaluator_s}, workers=2, show_progress=True
    )
    return eval_batch_runner
Objective Function (Sync)¶
In [ ]:
def objective_function(params_dict):
    chunk_size = params_dict["chunk_size"]
    docs = params_dict["docs"]
    top_k = params_dict["top_k"]
    eval_qs = params_dict["eval_qs"]
    ref_response_strs = params_dict["ref_response_strs"]

    # build index
    index = _build_index(chunk_size, docs)

    # query engine
    query_engine = index.as_query_engine(similarity_top_k=top_k)

    # get predicted responses
    pred_response_objs = get_responses(
        eval_qs, query_engine, show_progress=True
    )

    # run evaluator
    # NOTE: can uncomment other evaluators
    eval_batch_runner = _get_eval_batch_runner()
    eval_results = eval_batch_runner.evaluate_responses(
        eval_qs, responses=pred_response_objs, reference=ref_response_strs
    )

    # get the semantic similarity metric
    mean_score = np.array(
        [r.score for r in eval_results["semantic_similarity"]]
    ).mean()

    return RunResult(score=mean_score, params=params_dict)
Objective Function (Async)¶
In [ ]:
async def aobjective_function(params_dict):
    chunk_size = params_dict["chunk_size"]
    docs = params_dict["docs"]
    top_k = params_dict["top_k"]
    eval_qs = params_dict["eval_qs"]
    ref_response_strs = params_dict["ref_response_strs"]

    # build index
    index = _build_index(chunk_size, docs)

    # query engine
    query_engine = index.as_query_engine(similarity_top_k=top_k)

    # get predicted responses
    pred_response_objs = await aget_responses(
        eval_qs, query_engine, show_progress=True
    )

    # run evaluator
    # NOTE: can uncomment other evaluators
    eval_batch_runner = _get_eval_batch_runner()
    eval_results = await eval_batch_runner.aevaluate_responses(
        eval_qs, responses=pred_response_objs, reference=ref_response_strs
    )

    # get the semantic similarity metric
    mean_score = np.array(
        [r.score for r in eval_results["semantic_similarity"]]
    ).mean()

    return RunResult(score=mean_score, params=params_dict)
Parameters¶

We define the parameters to grid-search over in param_dict, along with the fixed parameters in fixed_param_dict.
In [ ]:
param_dict = {"chunk_size": [256, 512, 1024], "top_k": [1, 2, 5]}
# param_dict = {
#     "chunk_size": [256],
#     "top_k": [1]
# }
fixed_param_dict = {
    "docs": docs,
    "eval_qs": eval_qs[:10],
    "ref_response_strs": ref_response_strs[:10],
}
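The tuner sweeps the Cartesian product of the lists in param_dict. If you want to preview the combinations it will run, a small illustrative snippet (not part of the original notebook) is:

from itertools import product

# enumerate the 3 x 3 = 9 (chunk_size, top_k) combinations the grid search will cover
for chunk_size, top_k in product(param_dict["chunk_size"], param_dict["top_k"]):
    print(f"chunk_size={chunk_size}, top_k={top_k}")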
Run ParamTuner (default)¶

Here we run our default parameter tuner, which iterates through every hyperparameter combination, either synchronously or asynchronously.
In [ ]:
from llama_index.experimental.param_tuner import ParamTuner
In [ ]:
param_tuner = ParamTuner(
    param_fn=objective_function,
    param_dict=param_dict,
    fixed_param_dict=fixed_param_dict,
    show_progress=True,
)
In [ ]:
results = param_tuner.tune()
In [ ]:
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]

print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9490885841089257
Top-k: 2
Chunk size: 512
In [ ]:
# adjust test_idx for additional testing
test_idx = 6
p = results.run_results[test_idx].params

(results.run_results[test_idx].score, p["top_k"], p["chunk_size"])
Out[ ]:
(0.9263373628377412, 1, 256)
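Rather than indexing individual entries, you can also compare every run at once by sorting the run results by score (a small sketch built on the same run_results list used above):

# list all tuning runs, best score first
for run in sorted(results.run_results, key=lambda r: r.score, reverse=True):
    print(run.score, run.params["chunk_size"], run.params["top_k"])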
Run ParamTuner (Async)¶

Run the async version.
In [ ]:
from llama_index.experimental.param_tuner import AsyncParamTuner
In [ ]:
aparam_tuner = AsyncParamTuner(
    aparam_fn=aobjective_function,
    param_dict=param_dict,
    fixed_param_dict=fixed_param_dict,
    num_workers=2,
    show_progress=True,
)
In [ ]:
results = await aparam_tuner.atune()
In [ ]:
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]

print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9521222054806685
Top-k: 2
Chunk size: 512
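Run ParamTuner (Ray Tune)¶

Here we run the parameter tuner powered by Ray Tune, which can distribute the tuning runs across workers.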
In [ ]:
from llama_index.experimental.param_tuner import RayTuneParamTuner
In [ ]:
param_tuner = RayTuneParamTuner(
    param_fn=objective_function,
    param_dict=param_dict,
    fixed_param_dict=fixed_param_dict,
    run_config_dict={"storage_path": "/tmp/custom/ray_tune", "name": "my_exp"},
)
In [ ]:
results = param_tuner.tune()
In [ ]:
results.best_run_result.params.keys()
Out[ ]:
dict_keys(['docs', 'eval_qs', 'ref_response_strs', 'chunk_size', 'top_k'])
In [ ]:
results.best_idx
Out[ ]:
0
In [ ]:
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]

print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9486126773392092
Top-k: 2
Chunk size: 512