18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
class UnstructuredElementNodeParser(BaseElementNodeParser):
    """Unstructured element node parser.

    Splits a document into text nodes and index nodes corresponding to
    embedded objects (e.g. tables).
    """

    partitioning_parameters: Optional[Dict[str, Any]] = Field(
        default={},
        description="Extra dictionary representing parameters of the partitioning process.",
    )

    def __init__(
        self,
        callback_manager: Optional[CallbackManager] = None,
        llm: Optional[Any] = None,
        summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
        partitioning_parameters: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Initialize the parser.

        Args:
            callback_manager: Forwarded to the base class; a new
                ``CallbackManager([])`` is used when this is falsy.
            llm: Forwarded unchanged to the base class.
            summary_query_str: Forwarded unchanged to the base class
                (presumably the query used to summarize tables -- confirm
                against ``BaseElementNodeParser``).
            partitioning_parameters: Extra keyword arguments that
                ``extract_elements`` passes to ``partition_html``. ``None``
                (the default) is stored as an empty dict.

        Raises:
            ImportError: If ``unstructured`` or ``lxml`` cannot be imported.
        """
        # Fail early with an actionable message if the optional dependencies
        # are missing; the imports themselves are not otherwise used here.
        try:
            import lxml  # noqa  # pants: no-infer-dep
            import unstructured  # noqa  # pants: no-infer-dep
        except ImportError as err:
            raise ImportError(
                "You must install the `unstructured` and `lxml` "
                "package to use this node parser."
            ) from err

        super().__init__(
            callback_manager=callback_manager or CallbackManager([]),
            llm=llm,
            summary_query_str=summary_query_str,
            # Use a fresh dict per call rather than a shared mutable default.
            partitioning_parameters=(
                {} if partitioning_parameters is None else partitioning_parameters
            ),
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this parser class."""
        return "UnstructuredElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Split one node's content into nodes.

        Args:
            node: The node whose content (``node.get_content()``) is parsed.

        Returns:
            The nodes built from the extracted elements, each linked back to
            the source of ``node`` and carrying ``node``'s metadata.
        """
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # Extract summaries over table elements. The return value is unused,
        # so the summaries are presumably stored on the elements themselves.
        self.extract_table_summaries(table_elements)
        return self._nodes_with_source(elements, node)

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Async variant of ``get_nodes_from_node``.

        Identical except that table summaries are extracted by awaiting
        ``aextract_table_summaries``.

        Args:
            node: The node whose content (``node.get_content()``) is parsed.

        Returns:
            The nodes built from the extracted elements, each linked back to
            the source of ``node`` and carrying ``node``'s metadata.
        """
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # Extract summaries over table elements.
        await self.aextract_table_summaries(table_elements)
        return self._nodes_with_source(elements, node)

    def _nodes_with_source(
        self, elements: List[Element], node: TextNode
    ) -> List[BaseNode]:
        """Convert elements into nodes and link each one back to ``node``."""
        # Convert into nodes; this returns a list of Nodes and Index Nodes.
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        # Prefer the node's own source; fall back to the node itself.
        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_elements(
        self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
    ) -> List[Element]:
        """Extract elements from HTML text.

        The text is partitioned with ``unstructured``'s ``partition_html``
        using ``self.partitioning_parameters`` as extra keyword arguments.

        Args:
            text: The HTML text to partition.
            table_filters: Callables applied to each table element; a table
                is kept as a table only if every filter returns a truthy
                value. ``None`` means no filters.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            One ``Element`` per partitioned element, with id ``id_<index>``.
            Tables passing all filters have type ``"table"`` and a parsed
            ``table``; everything else has type ``"text"``.
        """
        from unstructured.partition.html import partition_html  # pants: no-infer-dep

        table_filters = table_filters or []
        # The field is Optional, so guard against None before unpacking.
        partitioning_parameters = self.partitioning_parameters or {}
        elements = partition_html(text=text, **partitioning_parameters)

        output_els = []
        for idx, element in enumerate(elements):
            element_id = f"id_{idx}"

            # Tables are detected by type name rather than isinstance.
            if "unstructured.documents.html.HTMLTable" not in str(type(element)):
                output_els.append(
                    Element(id=element_id, type="text", element=element)
                )
                continue

            if all(tf(element) for tf in table_filters):
                table_df = html_to_df(str(element.metadata.text_as_html))
                output_els.append(
                    Element(
                        id=element_id,
                        type="table",
                        element=element,
                        table=table_df,
                    )
                )
            else:
                # The table was rejected by a filter: keep its content as
                # text, as we don't want to lose context. Imported lazily so
                # it is only required when this branch is actually reached.
                from unstructured.documents.html import HTMLText

                new_element = HTMLText(str(element), tag=element.tag)
                output_els.append(
                    Element(id=element_id, type="text", element=new_element)
                )
        return output_els

    def filter_table(self, table_element: Any) -> bool:
        """Return whether a table element should be kept as a table.

        Args:
            table_element: An element exposing ``metadata.text_as_html``.

        Returns:
            ``True`` if the HTML parses to a table that is not ``None``, is
            not empty, and has more than one column.
        """
        table_df = html_to_df(table_element.metadata.text_as_html)

        # Keep only tables that parsed successfully, are non-empty, and have
        # more than one column.
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1
|