44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189 | class TreeIndex(BaseIndex[IndexGraph]):
"""树索引。
树索引是一种树形结构的索引,其中每个节点都是子节点的摘要。在构建索引过程中,树是自底向上构建的,直到最终得到一组根节点。
在查询时有几种不同的选项(参见::ref:`Ref-Query`)。
主要选项是从根节点向下遍历树。
次要答案是直接从根节点合成答案。
Args:
summary_template (Optional[BasePromptTemplate]): 摘要提示(参见::ref:`Prompt-Templates`)。
insert_prompt (Optional[BasePromptTemplate]): 树插入提示(参见::ref:`Prompt-Templates`)。
num_children (int): 每个节点应该具有的子节点数。
build_tree (bool): 是否在构建索引时构建树。
show_progress (bool): 是否显示进度条。默认为False。"""
index_struct_cls = IndexGraph
def __init__(
self,
nodes: Optional[Sequence[BaseNode]] = None,
objects: Optional[Sequence[IndexNode]] = None,
index_struct: Optional[IndexGraph] = None,
llm: Optional[LLM] = None,
summary_template: Optional[BasePromptTemplate] = None,
insert_prompt: Optional[BasePromptTemplate] = None,
num_children: int = 10,
build_tree: bool = True,
use_async: bool = False,
show_progress: bool = False,
# deprecated
service_context: Optional[ServiceContext] = None,
**kwargs: Any,
) -> None:
"""初始化参数。"""
# need to set parameters before building index in base class.
self.num_children = num_children
self.summary_template = summary_template or DEFAULT_SUMMARY_PROMPT
self.insert_prompt: BasePromptTemplate = insert_prompt or DEFAULT_INSERT_PROMPT
self.build_tree = build_tree
self._use_async = use_async
self._llm = llm or llm_from_settings_or_context(Settings, service_context)
super().__init__(
nodes=nodes,
index_struct=index_struct,
service_context=service_context,
show_progress=show_progress,
objects=objects,
**kwargs,
)
def as_retriever(
self,
retriever_mode: Union[str, TreeRetrieverMode] = TreeRetrieverMode.SELECT_LEAF,
embed_model: Optional[BaseEmbedding] = None,
**kwargs: Any,
) -> BaseRetriever:
# NOTE: lazy import
from llama_index.core.indices.tree.all_leaf_retriever import (
TreeAllLeafRetriever,
)
from llama_index.core.indices.tree.select_leaf_embedding_retriever import (
TreeSelectLeafEmbeddingRetriever,
)
from llama_index.core.indices.tree.select_leaf_retriever import (
TreeSelectLeafRetriever,
)
from llama_index.core.indices.tree.tree_root_retriever import (
TreeRootRetriever,
)
self._validate_build_tree_required(TreeRetrieverMode(retriever_mode))
if retriever_mode == TreeRetrieverMode.SELECT_LEAF:
return TreeSelectLeafRetriever(self, object_map=self._object_map, **kwargs)
elif retriever_mode == TreeRetrieverMode.SELECT_LEAF_EMBEDDING:
embed_model = embed_model or embed_model_from_settings_or_context(
Settings, self._service_context
)
return TreeSelectLeafEmbeddingRetriever(
self, embed_model=embed_model, object_map=self._object_map, **kwargs
)
elif retriever_mode == TreeRetrieverMode.ROOT:
return TreeRootRetriever(self, object_map=self._object_map, **kwargs)
elif retriever_mode == TreeRetrieverMode.ALL_LEAF:
return TreeAllLeafRetriever(self, object_map=self._object_map, **kwargs)
else:
raise ValueError(f"Unknown retriever mode: {retriever_mode}")
def _validate_build_tree_required(self, retriever_mode: TreeRetrieverMode) -> None:
"""检查索引是否支持需要树的模式。"""
if retriever_mode in REQUIRE_TREE_MODES and not self.build_tree:
raise ValueError(
"Index was constructed without building trees, "
f"but retriever mode {retriever_mode} requires trees."
)
def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexGraph:
"""从节点构建索引。"""
index_builder = GPTTreeIndexBuilder(
self.num_children,
self.summary_template,
service_context=self.service_context,
llm=self._llm,
use_async=self._use_async,
show_progress=self._show_progress,
docstore=self._docstore,
)
return index_builder.build_from_nodes(nodes, build_tree=self.build_tree)
def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
"""插入一个文档。"""
# TODO: allow to customize insert prompt
inserter = TreeIndexInserter(
self.index_struct,
service_context=self.service_context,
llm=self._llm,
num_children=self.num_children,
insert_prompt=self.insert_prompt,
summary_prompt=self.summary_template,
docstore=self._docstore,
)
inserter.insert(nodes)
def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
"""删除一个节点。"""
raise NotImplementedError("Delete not implemented for tree index.")
@property
def ref_doc_info(self) -> Dict[str, RefDocInfo]:
"""获取已摄取文档及其节点和元数据的字典映射。"""
node_doc_ids = list(self.index_struct.all_nodes.values())
nodes = self.docstore.get_nodes(node_doc_ids)
all_ref_doc_info = {}
for node in nodes:
ref_node = node.source_node
if not ref_node:
continue
ref_doc_info = self.docstore.get_ref_doc_info(ref_node.node_id)
if not ref_doc_info:
continue
all_ref_doc_info[ref_node.node_id] = ref_doc_info
return all_ref_doc_info
|