
Unstructured element

Node parser.

UnstructuredElementNodeParser #

Bases: BaseElementNodeParser

Unstructured element node parser.

Splits a document into Text Nodes and Index Nodes corresponding to embedded objects (e.g. tables).

Source code in llama_index/core/node_parser/relational/unstructured_element.py
class UnstructuredElementNodeParser(BaseElementNodeParser):
    """无结构元素节点解析器。

将文档分割为文本节点和与嵌入对象(例如表格)对应的索引节点。"""

    partitioning_parameters: Optional[Dict[str, Any]] = Field(
        default={},
        description="Extra dictionary representing parameters of the partitioning process.",
    )

    def __init__(
        self,
        callback_manager: Optional[CallbackManager] = None,
        llm: Optional[Any] = None,
        summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
        partitioning_parameters: Optional[Dict[str, Any]] = {},
    ) -> None:
        """初始化。"""
        try:
            import lxml  # noqa  # pants: no-infer-dep
            import unstructured  # noqa  # pants: no-infer-dep
        except ImportError:
            raise ImportError(
                "You must install the `unstructured` and `lxml` "
                "package to use this node parser."
            )
        callback_manager = callback_manager or CallbackManager([])

        return super().__init__(
            callback_manager=callback_manager,
            llm=llm,
            summary_query_str=summary_query_str,
            partitioning_parameters=partitioning_parameters,
        )

    @classmethod
    def class_name(cls) -> str:
        return "UnstructuredElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """从节点中获取节点。"""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """从节点中获取节点。"""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        await self.aextract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_elements(
        self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
    ) -> List[Element]:
        """从文本中提取元素。"""
        from unstructured.partition.html import partition_html  # pants: no-infer-dep

        table_filters = table_filters or []
        elements = partition_html(text=text, **self.partitioning_parameters)
        output_els = []
        for idx, element in enumerate(elements):
            if "unstructured.documents.html.HTMLTable" in str(type(element)):
                should_keep = all(tf(element) for tf in table_filters)
                if should_keep:
                    table_df = html_to_df(str(element.metadata.text_as_html))
                    output_els.append(
                        Element(
                            id=f"id_{idx}",
                            type="table",
                            element=element,
                            table=table_df,
                        )
                    )
                else:
                    # table did not pass the filter; keep it as text so we don't lose context
                    from unstructured.documents.html import HTMLText

                    newElement = HTMLText(str(element), tag=element.tag)
                    output_els.append(
                        Element(id=f"id_{idx}", type="text", element=newElement)
                    )
            else:
                output_els.append(Element(id=f"id_{idx}", type="text", element=element))
        return output_els

    def filter_table(self, table_element: Any) -> bool:
        """过滤表格。"""
        table_df = html_to_df(table_element.metadata.text_as_html)

        # check if table_df is not None, has more than one row, and more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1
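
A minimal usage sketch (not from the source): it assumes the unstructured and lxml packages are installed and that an LLM is configured (e.g. via Settings.llm) for the table-summary step, and it feeds the parser a Document whose text is HTML.

from llama_index.core import Document
from llama_index.core.node_parser import UnstructuredElementNodeParser

html_text = (
    "<html><body>"
    "<p>Quarterly results.</p>"
    "<table>"
    "<tr><th>Quarter</th><th>Revenue</th></tr>"
    "<tr><td>Q1</td><td>10</td></tr>"
    "<tr><td>Q2</td><td>12</td></tr>"
    "</table>"
    "</body></html>"
)

parser = UnstructuredElementNodeParser()
# Table summaries are generated with the configured LLM, so one must be
# available before parsing.
nodes = parser.get_nodes_from_documents([Document(text=html_text)])
for node in nodes:
    print(type(node).__name__)  # a mix of TextNode and IndexNode objects
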

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

Get nodes from node.

Source code in llama_index/core/node_parser/relational/unstructured_element.py
def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """从节点中获取节点。"""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes
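
A short sketch of calling this method directly on a single TextNode (a hypothetical example, assuming the same setup as the class-level sketch above): every returned node carries a SOURCE relationship back to the input node and inherits its metadata.

from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.schema import IndexNode, TextNode

parser = UnstructuredElementNodeParser()
node = TextNode(
    text=(
        "<p>Team roster.</p>"
        "<table>"
        "<tr><th>Name</th><th>Role</th></tr>"
        "<tr><td>Alice</td><td>Lead</td></tr>"
        "</table>"
    ),
    metadata={"source_url": "https://example.com/roster"},  # hypothetical metadata
)
sub_nodes = parser.get_nodes_from_node(node)
for n in sub_nodes:
    kind = "index" if isinstance(n, IndexNode) else "text"
    # Each sub-node inherits the parent metadata and points back to the parent.
    print(kind, n.metadata["source_url"])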

aget_nodes_from_node async #

aget_nodes_from_node(node: TextNode) -> List[BaseNode]

Get nodes from node.

Source code in llama_index/core/node_parser/relational/unstructured_element.py
async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """从节点中获取节点。"""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    await self.aextract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes
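
The async variant can be awaited; a sketch with asyncio follows (same assumptions as above). Only the table-summary extraction is awaited; element extraction itself is still synchronous.

import asyncio

from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.schema import TextNode


async def parse_async() -> None:
    parser = UnstructuredElementNodeParser()
    node = TextNode(
        text=(
            "<p>Inventory.</p>"
            "<table>"
            "<tr><th>Item</th><th>Count</th></tr>"
            "<tr><td>Widget</td><td>3</td></tr>"
            "</table>"
        )
    )
    # Same output as get_nodes_from_node, but the LLM summary calls are awaited.
    sub_nodes = await parser.aget_nodes_from_node(node)
    print(len(sub_nodes))


asyncio.run(parse_async())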

extract_elements #

extract_elements(
    text: str,
    table_filters: Optional[List[Callable]] = None,
    **kwargs: Any
) -> List[Element]

Extract elements from text.

Source code in llama_index/core/node_parser/relational/unstructured_element.py
def extract_elements(
    self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
) -> List[Element]:
    """从文本中提取元素。"""
    from unstructured.partition.html import partition_html  # pants: no-infer-dep

    table_filters = table_filters or []
    elements = partition_html(text=text, **self.partitioning_parameters)
    output_els = []
    for idx, element in enumerate(elements):
        if "unstructured.documents.html.HTMLTable" in str(type(element)):
            should_keep = all(tf(element) for tf in table_filters)
            if should_keep:
                table_df = html_to_df(str(element.metadata.text_as_html))
                output_els.append(
                    Element(
                        id=f"id_{idx}",
                        type="table",
                        element=element,
                        table=table_df,
                    )
                )
            else:
                # table did not pass the filter; keep it as text so we don't lose context
                from unstructured.documents.html import HTMLText

                newElement = HTMLText(str(element), tag=element.tag)
                output_els.append(
                    Element(id=f"id_{idx}", type="text", element=newElement)
                )
        else:
            output_els.append(Element(id=f"id_{idx}", type="text", element=element))
    return output_els
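
A sketch of calling extract_elements directly on a hypothetical HTML string (only unstructured and lxml are needed for this step, no LLM): qualifying tables come back as Element objects of type "table" with a pandas DataFrame attached, everything else as type "text".

from llama_index.core.node_parser import UnstructuredElementNodeParser

parser = UnstructuredElementNodeParser()
html_text = (
    "<p>Prices below.</p>"
    "<table>"
    "<tr><th>Item</th><th>Price</th></tr>"
    "<tr><td>Apple</td><td>1</td></tr>"
    "</table>"
)
elements = parser.extract_elements(html_text, table_filters=[parser.filter_table])
for el in elements:
    if el.type == "table":
        print(el.id, el.table.shape)  # the table is already a pandas DataFrame
    else:
        print(el.id, el.type)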

filter_table #

filter_table(table_element: Any) -> bool

Filter tables.

Source code in llama_index/core/node_parser/relational/unstructured_element.py
def filter_table(self, table_element: Any) -> bool:
    """过滤表格。"""
    table_df = html_to_df(table_element.metadata.text_as_html)

    # check if table_df is not None, has more than one row, and more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1
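
A sketch of the filter criterion in isolation, assuming an unstructured version whose partition_html attaches text_as_html metadata to table elements (the same assumption the parser itself relies on): single-column or empty tables are rejected, wider non-empty tables are kept.

from unstructured.partition.html import partition_html

from llama_index.core.node_parser import UnstructuredElementNodeParser

parser = UnstructuredElementNodeParser()
one_col = "<table><tr><th>only</th></tr><tr><td>value</td></tr></table>"
two_col = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"

for html in (one_col, two_col):
    table_element = partition_html(text=html)[0]
    print(parser.filter_table(table_element))
# Expected: False for the single-column table, True for the two-column one.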