class JSONNodeParser(NodeParser):
    """JSON node parser.

    Splits documents into nodes using custom JSON splitting logic: each JSON
    object is flattened into one line per leaf value, and the joined lines
    become the text of a node.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships
    """

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "JSONNodeParser":
        """Build a parser, using ``CallbackManager([])`` when none is supplied."""
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get the class name."""
        return "JSONNodeParser"

    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Parse every input node and return the concatenated results.

        Args:
            nodes: the nodes whose content should be parsed as JSON.
            show_progress: forwarded to ``get_tqdm_iterable``.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            All nodes produced by ``get_nodes_from_node``, in input order.
        """
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")
        for node in nodes_with_progress:
            # Extend directly instead of rebinding the ``nodes`` parameter.
            all_nodes.extend(self.get_nodes_from_node(node))
        return all_nodes

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """Get nodes from a single document node.

        The node content (without metadata) is parsed as JSON. A top-level
        object yields one split; a top-level array yields one split per
        element. Each split is passed to ``build_nodes_from_splits``.

        Args:
            node: the source node whose content is expected to be JSON text.

        Returns:
            The nodes built from the JSON content, or an empty list when the
            content cannot be decoded as JSON.

        Raises:
            ValueError: if the decoded JSON is neither an object nor an array.
        """
        text = node.get_content(metadata_mode=MetadataMode.NONE)
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Best effort: undecodable content contributes no nodes.
            return []

        # Normalise both accepted shapes to "a sequence of JSON objects" so
        # the node-building code exists only once.
        if isinstance(data, dict):
            json_objects = [data]
        elif isinstance(data, list):
            json_objects = data
        else:
            raise ValueError("JSON is invalid")

        json_nodes = []
        for json_object in json_objects:
            split_text = "\n".join(self._depth_first_yield(json_object, 0, []))
            json_nodes.extend(
                build_nodes_from_splits([split_text], node, id_func=self.id_func)
            )
        return json_nodes

    def _depth_first_yield(
        self, json_data: Any, levels_back: int, path: List[str]
    ) -> Generator[str, None, None]:
        """Do a depth-first traversal of the JSON data, yielding every leaf.

        Each yielded string is the key path leading to the leaf followed by
        the leaf value (converted with ``str``), joined by single spaces.
        List elements reuse their parent's path; no index is added.

        Args:
            json_data: the current JSON value (dict, list, or scalar leaf).
            levels_back: how many trailing path components to keep for each
                leaf. If set to 0, all levels are kept.
            path: the keys traversed so far; it is never mutated.

        Yields:
            One space-joined line per leaf value.
        """
        if isinstance(json_data, dict):
            for key, value in json_data.items():
                # Build a fresh list so sibling branches do not share state.
                yield from self._depth_first_yield(value, levels_back, path + [key])
        elif isinstance(json_data, list):
            for value in json_data:
                yield from self._depth_first_yield(value, levels_back, path)
        else:
            # ``path[-0:]`` is ``path[0:]``, i.e. a full copy, which is what
            # makes ``levels_back == 0`` keep every level.
            leaf_parts = path[-levels_back:]
            leaf_parts.append(str(json_data))
            yield " ".join(leaf_parts)