设置¶
如果您在colab上打开这个笔记本,您可能需要安装LlamaIndex 🦙。
In [ ]:
Copied!
!pip install llama-index
!pip install llama-index
In [ ]:
Copied!
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
# 下载数据
在这个部分,我们将学习如何下载数据。
In [ ]:
Copied!
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Will not apply HSTS. The HSTS database must be a regular and non-world-writable file. ERROR: could not open HSTS store at '/home/loganm/.wget-hsts'. HSTS will be disabled. --2023-11-23 12:54:37-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 75042 (73K) [text/plain] Saving to: ‘data/paul_graham/paul_graham_essay.txt’ data/paul_graham/pa 100%[===================>] 73.28K --.-KB/s in 0.04s 2023-11-23 12:54:37 (1.77 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]
加载数据¶
我们首先展示如何将一个文档转换为一组节点,并插入到文档存储中。
In [ ]:
Copied!
from llama_index.core import SimpleDirectoryReader
# 加载文档
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
from llama_index.core import SimpleDirectoryReader
# 加载文档
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
In [ ]:
Copied!
from llama_index.core import Settings
nodes = Settings.node_parser.get_nodes_from_documents(documents)
from llama_index.core import Settings
nodes = Settings.node_parser.get_nodes_from_documents(documents)
In [ ]:
Copied!
from llama_index.core import StorageContext
# 初始化存储上下文(默认情况下是内存中的)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)
from llama_index.core import StorageContext
# 初始化存储上下文(默认情况下是内存中的)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)
定义相同数据上的向量索引和关键词表索引¶
我们在相同的文档存储上构建了一个向量索引和关键词表索引。
In [ ]:
Copied!
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
In [ ]:
Copied!
# from llama_index.core import QueryBundle
from llama_index.core import QueryBundle
# from llama_index.core.schema import NodeWithScore
from llama_index.core.schema import NodeWithScore
# 检索器
from llama_index.core.retrievers import (
BaseRetriever,
VectorIndexRetriever,
KeywordTableSimpleRetriever
)
from typing import List
# from llama_index.core import QueryBundle
from llama_index.core import QueryBundle
# from llama_index.core.schema import NodeWithScore
from llama_index.core.schema import NodeWithScore
# 检索器
from llama_index.core.retrievers import (
BaseRetriever,
VectorIndexRetriever,
KeywordTableSimpleRetriever
)
from typing import List
In [ ]:
Copied!
class CustomRetriever(BaseRetriever):
"""自定义检索器,执行语义搜索和混合搜索。"""
def __init__(
self,
vector_retriever: VectorIndexRetriever,
keyword_retriever: KeywordTableSimpleRetriever,
mode: str = "AND",
) -> None:
"""初始化参数。"""
self._vector_retriever = vector_retriever
self._keyword_retriever = keyword_retriever
if mode not in ("AND", "OR"):
raise ValueError("无效的模式。")
self._mode = mode
super().__init__()
def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
"""根据查询检索节点。"""
vector_nodes = self._vector_retriever.retrieve(query_bundle)
keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
vector_ids = {n.node.node_id for n in vector_nodes}
keyword_ids = {n.node.node_id for n in keyword_nodes}
combined_dict = {n.node.node_id: n for n in vector_nodes}
combined_dict.update({n.node.node_id: n for n in keyword_nodes})
if self._mode == "AND":
retrieve_ids = vector_ids.intersection(keyword_ids)
else:
retrieve_ids = vector_ids.union(keyword_ids)
retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
return retrieve_nodes
class CustomRetriever(BaseRetriever):
"""自定义检索器,执行语义搜索和混合搜索。"""
def __init__(
self,
vector_retriever: VectorIndexRetriever,
keyword_retriever: KeywordTableSimpleRetriever,
mode: str = "AND",
) -> None:
"""初始化参数。"""
self._vector_retriever = vector_retriever
self._keyword_retriever = keyword_retriever
if mode not in ("AND", "OR"):
raise ValueError("无效的模式。")
self._mode = mode
super().__init__()
def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
"""根据查询检索节点。"""
vector_nodes = self._vector_retriever.retrieve(query_bundle)
keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
vector_ids = {n.node.node_id for n in vector_nodes}
keyword_ids = {n.node.node_id for n in keyword_nodes}
combined_dict = {n.node.node_id: n for n in vector_nodes}
combined_dict.update({n.node.node_id: n for n in keyword_nodes})
if self._mode == "AND":
retrieve_ids = vector_ids.intersection(keyword_ids)
else:
retrieve_ids = vector_ids.union(keyword_ids)
retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
return retrieve_nodes
将插件检索器嵌入查询引擎¶
将插件检索器嵌入查询引擎,并运行一些查询。
In [ ]:
Copied!
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
# 定义自定义检索器
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)
custom_retriever = CustomRetriever(vector_retriever, keyword_retriever)
# 定义响应合成器
response_synthesizer = get_response_synthesizer()
# 组装查询引擎
custom_query_engine = RetrieverQueryEngine(
retriever=custom_retriever,
response_synthesizer=response_synthesizer,
)
# 向量查询引擎
vector_query_engine = RetrieverQueryEngine(
retriever=vector_retriever,
response_synthesizer=response_synthesizer,
)
# 关键词查询引擎
keyword_query_engine = RetrieverQueryEngine(
retriever=keyword_retriever,
response_synthesizer=response_synthesizer,
)
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
# 定义自定义检索器
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)
custom_retriever = CustomRetriever(vector_retriever, keyword_retriever)
# 定义响应合成器
response_synthesizer = get_response_synthesizer()
# 组装查询引擎
custom_query_engine = RetrieverQueryEngine(
retriever=custom_retriever,
response_synthesizer=response_synthesizer,
)
# 向量查询引擎
vector_query_engine = RetrieverQueryEngine(
retriever=vector_retriever,
response_synthesizer=response_synthesizer,
)
# 关键词查询引擎
keyword_query_engine = RetrieverQueryEngine(
retriever=keyword_retriever,
response_synthesizer=response_synthesizer,
)
In [ ]:
Copied!
response = custom_query_engine.query(
"What did the author do during his time at YC?"
)
response = custom_query_engine.query(
"What did the author do during his time at YC?"
)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" INFO:llama_index.indices.keyword_table.retrievers:> Starting query: What did the author do during his time at YC?
> Starting query: What did the author do during his time at YC? INFO:llama_index.indices.keyword_table.retrievers:query keywords: ['author', 'yc', 'time'] query keywords: ['author', 'yc', 'time'] INFO:llama_index.indices.keyword_table.retrievers:> Extracted keywords: ['yc', 'time'] > Extracted keywords: ['yc', 'time'] INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
In [ ]:
Copied!
print(response)
print(response)
During his time at YC, the author worked on various projects, including writing essays and working on YC itself. He also wrote all of YC's internal software in Arc. Additionally, he mentioned that he dealt with urgent problems, with about a 60% chance of them being related to Hacker News (HN), and a 40% chance of them being related to everything else combined. The author also mentioned that YC was different from other kinds of work he had done, as the problems of the startups in each batch became their problems, and he worked hard even at the parts of the job he didn't like.
In [ ]:
Copied!
# 混合搜索可以使我们不检索与问题无关的节点
# 耶鲁大学在文章中从未被提及
response = custom_query_engine.query(
"作者在耶鲁大学期间做了什么?"
)
# 混合搜索可以使我们不检索与问题无关的节点
# 耶鲁大学在文章中从未被提及
response = custom_query_engine.query(
"作者在耶鲁大学期间做了什么?"
)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" INFO:llama_index.indices.keyword_table.retrievers:> Starting query: What did the author do during his time at Yale? > Starting query: What did the author do during his time at Yale? INFO:llama_index.indices.keyword_table.retrievers:query keywords: ['author', 'yale', 'time'] query keywords: ['author', 'yale', 'time'] INFO:llama_index.indices.keyword_table.retrievers:> Extracted keywords: ['time'] > Extracted keywords: ['time']
In [ ]:
Copied!
print(str(response))
len(response.source_nodes)
print(str(response))
len(response.source_nodes)
Empty Response
Out[ ]:
0
In [ ]:
Copied!
# 相比之下,向量搜索将返回一个答案
response = vector_query_engine.query(
"作者在耶鲁大学期间做了什么?"
)
# 相比之下,向量搜索将返回一个答案
response = vector_query_engine.query(
"作者在耶鲁大学期间做了什么?"
)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK" HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
In [ ]:
Copied!
print(str(response))
len(response.source_nodes)
print(str(response))
len(response.source_nodes)
The context information does not provide any information about the author's time at Yale.
Out[ ]:
2