Recency Filtering¶
In [ ]:
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
In [ ]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.response.notebook_utils import display_response
Parse Documents into Nodes, add to Docstore¶
In this example there are 3 different versions of PG's essay. They are largely identical except for one specific section, which details the amount of seed funding raised for Viaweb.

V1: 50k, V2: 30k, V3: 10k

V1: 2020-01-01, V2: 2020-02-03, V3: 2022-04-12

The idea is to encourage the index to fetch the most recent information (i.e. V3).
In [ ]:
# load documents
from llama_index.core import StorageContext


def get_file_metadata(file_name: str):
    """Get file metadata."""
    if "v1" in file_name:
        return {"date": "2020-01-01"}
    elif "v2" in file_name:
        return {"date": "2020-02-03"}
    elif "v3" in file_name:
        return {"date": "2022-04-12"}
    else:
        raise ValueError("invalid file")


documents = SimpleDirectoryReader(
    input_files=[
        "test_versioned_data/paul_graham_essay_v1.txt",
        "test_versioned_data/paul_graham_essay_v2.txt",
        "test_versioned_data/paul_graham_essay_v3.txt",
    ],
    file_metadata=get_file_metadata,
).load_data()

# define settings
from llama_index.core import Settings

Settings.text_splitter = SentenceSplitter(chunk_size=512)

# use the node parser to parse into nodes
nodes = Settings.text_splitter.get_nodes_from_documents(documents)

# add to docstore
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

storage_context = StorageContext.from_defaults(docstore=docstore)
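Before building the index, it is worth sanity-checking that the `date` metadata from `get_file_metadata` propagated to the parsed nodes, since both recency postprocessors key off it. A minimal check (no assumptions beyond the `date` field used above):

In [ ]:
# each node should carry the "date" metadata inherited from its source document
print(nodes[0].metadata)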
In [ ]:
print(documents[2].get_text())
Build Index¶
In [ ]:
# build index
index = VectorStoreIndex(nodes, storage_context=storage_context)
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 84471 tokens
Define Recency Postprocessors¶
In [ ]:
node_postprocessor = FixedRecencyPostprocessor()
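`FixedRecencyPostprocessor` sorts the retrieved nodes by their date metadata and keeps only the most recent ones. As a hedged sketch, the constructor also accepts `top_k` and `date_key` arguments (names assumed from the current API; defaults may vary by version), so the cell above could be written more explicitly as:

In [ ]:
# keep only the single most recent node, reading dates from the "date" metadata key
node_postprocessor = FixedRecencyPostprocessor(top_k=1, date_key="date")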
In [ ]:
node_postprocessor_emb = EmbeddingRecencyPostprocessor()
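`EmbeddingRecencyPostprocessor` goes one step further: it drops older nodes whose content is a near-duplicate (by embedding similarity) of a more recent node. A hedged sketch of its main knobs, assuming the `date_key` and `similarity_cutoff` parameter names from the current API:

In [ ]:
# nodes more similar than the cutoff are treated as duplicates; only the newest is kept
node_postprocessor_emb = EmbeddingRecencyPostprocessor(
    date_key="date", similarity_cutoff=0.7
)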
Query Index¶
In [ ]:
# simple query
query_engine = index.as_query_engine(
    similarity_top_k=3,
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?",
)
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1813 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
In [ ]:
# query using the fixed recency node postprocessor
query_engine = index.as_query_engine(
    similarity_top_k=3, node_postprocessors=[node_postprocessor]
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?",
)
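To check that the answer now reflects the most recent version of the essay, render it with `display_response` (imported above), or simply `print(response)`:

In [ ]:
display_response(response)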
In [ ]:
# query using the embedding-based recency node postprocessor
query_engine = index.as_query_engine(
    similarity_top_k=3, node_postprocessors=[node_postprocessor_emb]
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?",
)
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
Query Index (Lower-Level Usage)¶
In this example we first get the full set of nodes from a query call, then send them to the node postprocessor, and finally synthesize a response through a summary index.
In [ ]:
from llama_index.core import SummaryIndex
In [ ]:
query_str = (
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
In [ ]:
query_engine = index.as_query_engine(
    similarity_top_k=3, response_mode="no_text"
)
init_response = query_engine.query(
    query_str,
)
resp_nodes = [n.node for n in init_response.source_nodes]
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
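Because `response_mode="no_text"` skips synthesis, `resp_nodes` holds the raw retrieved nodes. Printing their dates shows which essay versions were fetched before any recency filtering, a small illustrative check:

In [ ]:
# show the date metadata and a short snippet of each retrieved node
for n in resp_nodes:
    print(n.metadata.get("date"), "-", n.get_content()[:80])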
In [ ]:
summary_index = SummaryIndex(resp_nodes)
query_engine = summary_index.as_query_engine(
    node_postprocessors=[node_postprocessor]
)
response = query_engine.query(query_str)
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
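As a final check you can print the synthesized answer and the date of the source node that survived postprocessing (attribute names follow the standard `Response` object; adjust if your version differs):

In [ ]:
print(response)
# list the dates of the nodes that were actually used for the final answer
for source in response.source_nodes:
    print(source.node.metadata.get("date"))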