22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class RecursiveRetriever(BaseRetriever):
    """Recursive retriever.

    This retriever recursively explores links from nodes to other
    retrievers/query engines. For any retrieved nodes, if a node is an
    ``IndexNode``, the retriever/query engine (or node) registered under its
    ``index_id`` is looked up and queried in turn.

    Args:
        root_id (str): Id of the root of the query graph. Must be a key of
            ``retriever_dict``.
        retriever_dict (Dict[str, BaseRetriever]): Mapping from id to
            retriever.
        query_engine_dict (Optional[Dict[str, BaseQueryEngine]]): Mapping
            from id to query engine. Its keys must not overlap with
            ``retriever_dict``.
        node_dict (Optional[Dict[str, BaseNode]]): Mapping from id to node.
            Consulted before the retriever and query engine mappings.
        callback_manager (Optional[CallbackManager]): Forwarded to the base
            class constructor.
        query_response_tmpl (Optional[str]): Template formatted with
            ``query_str`` and ``response`` to build the text of a node that
            wraps a query engine response. Defaults to
            ``DEFAULT_QUERY_RESPONSE_TMPL``.
        verbose (bool): Forwarded to the base class constructor; progress
            printing in this class is gated on ``self._verbose``.

    Raises:
        ValueError: If ``root_id`` is not in ``retriever_dict`` or if the
            retriever and query engine ids overlap.
    """

    def __init__(
        self,
        root_id: str,
        retriever_dict: Dict[str, BaseRetriever],
        query_engine_dict: Optional[Dict[str, BaseQueryEngine]] = None,
        node_dict: Optional[Dict[str, BaseNode]] = None,
        callback_manager: Optional[CallbackManager] = None,
        query_response_tmpl: Optional[str] = None,
        verbose: bool = False,
    ) -> None:
        """Validate the id mappings and store the configuration."""
        self._root_id = root_id
        if root_id not in retriever_dict:
            raise ValueError(
                f"Root id {root_id} not in retriever_dict, it must be a retriever."
            )
        self._retriever_dict = retriever_dict
        self._query_engine_dict = query_engine_dict or {}
        self._node_dict = node_dict or {}

        # An id must resolve to exactly one of retriever / query engine.
        if set(self._retriever_dict.keys()) & set(self._query_engine_dict.keys()):
            raise ValueError("Retriever and query engine ids must not overlap.")

        self._query_response_tmpl = query_response_tmpl or DEFAULT_QUERY_RESPONSE_TMPL
        super().__init__(callback_manager, verbose=verbose)

    def _deduplicate_nodes(
        self, nodes_with_score: List[NodeWithScore]
    ) -> List[NodeWithScore]:
        """Deduplicate nodes by node id.

        The first occurrence of each node id is kept and input order is
        preserved; scores are not compared.
        """
        seen_ids = set()
        unique_nodes = []
        for node_with_score in nodes_with_score:
            node_id = node_with_score.node.id_
            if node_id in seen_ids:
                continue
            seen_ids.add(node_id)
            unique_nodes.append(node_with_score)
        return unique_nodes

    def _query_retrieved_nodes(
        self, query_bundle: QueryBundle, nodes_with_score: List[NodeWithScore]
    ) -> Tuple[List[NodeWithScore], List[NodeWithScore]]:
        """Resolve each retrieved node.

        An ``IndexNode`` is followed recursively through ``_retrieve_rec``
        using its ``index_id``; any other node must be a ``TextNode`` and is
        returned unchanged.

        Returns:
            A ``(nodes_to_add, additional_nodes)`` tuple, each deduplicated
            by node id.
        """
        nodes_to_add = []
        additional_nodes = []
        # Index ids already followed, so that several IndexNodes referencing
        # the same target only trigger one recursive query.
        visited_ids = set()

        for node_with_score in nodes_with_score:
            node = node_with_score.node
            if isinstance(node, IndexNode):
                if node.index_id in visited_ids:
                    continue
                visited_ids.add(node.index_id)
                if self._verbose:
                    print_text(
                        "Retrieved node with id, entering: " f"{node.index_id}\n",
                        color="pink",
                    )
                cur_retrieved_nodes, cur_additional_nodes = self._retrieve_rec(
                    query_bundle,
                    query_id=node.index_id,
                    cur_similarity=node_with_score.score,
                )
            else:
                assert isinstance(node, TextNode)
                if self._verbose:
                    print_text(
                        "Retrieving text node: " f"{node.get_content()}\n",
                        color="pink",
                    )
                cur_retrieved_nodes = [node_with_score]
                cur_additional_nodes = []
            nodes_to_add.extend(cur_retrieved_nodes)
            additional_nodes.extend(cur_additional_nodes)

        # The same node may be reachable through multiple sources.
        nodes_to_add = self._deduplicate_nodes(nodes_to_add)
        additional_nodes = self._deduplicate_nodes(additional_nodes)
        return nodes_to_add, additional_nodes

    def _get_object(self, query_id: str) -> RQN_TYPE:
        """Return the node, retriever or query engine registered under an id.

        The lookup order is ``node_dict``, then ``retriever_dict``, then
        ``query_engine_dict``.

        Raises:
            ValueError: If ``query_id`` is not found in any mapping.
        """
        for registry in (
            self._node_dict,
            self._retriever_dict,
            self._query_engine_dict,
        ):
            obj = registry.get(query_id, None)
            if obj is not None:
                return obj
        raise ValueError(
            f"Query id {query_id} not found in either `retriever_dict` "
            "or `query_engine_dict`."
        )

    def _retrieve_rec(
        self,
        query_bundle: QueryBundle,
        query_id: Optional[str] = None,
        cur_similarity: Optional[float] = None,
    ) -> Tuple[List[NodeWithScore], List[NodeWithScore]]:
        """Query recursively, starting from ``query_id``.

        Args:
            query_bundle: The query to run at every level.
            query_id: Id of the object to query; falls back to the root id
                when not given.
            cur_similarity: Score assigned to nodes created at this level
                (a directly mapped node or a wrapped query engine response).
                Defaults to 1.0 when ``None``.

        Returns:
            A ``(nodes_to_add, additional_nodes)`` tuple. For a query engine,
            ``additional_nodes`` is the response's ``source_nodes``.

        Raises:
            ValueError: If the id is unknown or resolves to an unsupported
                object type.
        """
        if self._verbose:
            print_text(
                f"Retrieving with query id {query_id}: {query_bundle.query_str}\n",
                color="blue",
            )
        query_id = query_id or self._root_id
        # Only default a missing score: `cur_similarity or 1.0` would also
        # overwrite a legitimate score of 0.0 with 1.0.
        if cur_similarity is None:
            cur_similarity = 1.0

        obj = self._get_object(query_id)
        if isinstance(obj, BaseNode):
            nodes_to_add = [NodeWithScore(node=obj, score=cur_similarity)]
            additional_nodes: List[NodeWithScore] = []
        elif isinstance(obj, BaseRetriever):
            with self.callback_manager.event(
                CBEventType.RETRIEVE,
                payload={EventPayload.QUERY_STR: query_bundle.query_str},
            ) as event:
                nodes = obj.retrieve(query_bundle)
                event.on_end(payload={EventPayload.NODES: nodes})
            nodes_to_add, additional_nodes = self._query_retrieved_nodes(
                query_bundle, nodes
            )
        elif isinstance(obj, BaseQueryEngine):
            sub_resp = obj.query(query_bundle)
            if self._verbose:
                print_text(
                    f"Got response: {sub_resp!s}\n",
                    color="green",
                )
            # Format the node text with both the query and the response.
            node_text = self._query_response_tmpl.format(
                query_str=query_bundle.query_str, response=str(sub_resp)
            )
            node = TextNode(text=node_text)
            nodes_to_add = [NodeWithScore(node=node, score=cur_similarity)]
            additional_nodes = sub_resp.source_nodes
        else:
            raise ValueError("Must be a retriever or query engine.")

        return nodes_to_add, additional_nodes

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve from the root and return only the primary nodes."""
        retrieved_nodes, _ = self._retrieve_rec(query_bundle, query_id=None)
        return retrieved_nodes

    def retrieve_all(
        self, query_bundle: QueryBundle
    ) -> Tuple[List[NodeWithScore], List[NodeWithScore]]:
        """Retrieve all nodes.

        Unlike the default ``retrieve`` method, this also returns the
        additional source nodes collected during the recursion.
        """
        return self._retrieve_rec(query_bundle, query_id=None)
|