# Source code for langchain_community.document_transformers.beautiful_soup_transformer

from typing import Any, Iterator, List, Sequence, Tuple, Union, cast

from langchain_core.documents import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """  # noqa: E501

    def __init__(self) -> None:
        """Initialize the transformer.

        This checks that the BeautifulSoup4 package is installed.

        Raises:
            ImportError: If BeautifulSoup4 cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # Chain the original failure so the root cause stays visible.
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: Union[List[str], Tuple[str, ...]] = ("script", "style"),
        tags_to_extract: Union[List[str], Tuple[str, ...]] = ("p", "li", "div", "a"),
        remove_lines: bool = True,
        *,
        unwanted_classnames: Union[Tuple[str, ...], List[str]] = (),
        remove_comments: bool = False,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a list of Document objects by cleaning their HTML content.

        Each document is processed in this order: elements with unwanted class
        names are removed, unwanted tags are removed, text is extracted from
        the requested tags, and finally (optionally) unnecessary lines are
        removed. The ``page_content`` of every document is overwritten in
        place.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: A list of tags to be removed from the HTML.
            tags_to_extract: A list of tags whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be removed.
            unwanted_classnames: A list of class names to be removed from the
                HTML.
            remove_comments: If set to True, comments will be removed.
            **kwargs: Accepted but not used by this method.

        Returns:
            The same ``documents`` sequence that was passed in, with each
            document's ``page_content`` replaced by the transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_classnames(
                cleaned_content, unwanted_classnames
            )

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(
                cleaned_content, tags_to_extract, remove_comments=remove_comments
            )

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            # The input document is mutated rather than copied.
            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_classnames(
        html_content: str, unwanted_classnames: Union[List[str], Tuple[str, ...]]
    ) -> str:
        """Remove elements with unwanted class names from the given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_classnames: A list of class names; every element found
                with one of these classes is removed from the HTML.

        Returns:
            A cleaned HTML string with the matching elements removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for classname in unwanted_classnames:
            for element in soup.find_all(class_=classname):
                element.decompose()
        return str(soup)

    @staticmethod
    def remove_unwanted_tags(
        html_content: str, unwanted_tags: Union[List[str], Tuple[str, ...]]
    ) -> str:
        """Remove unwanted tags from the given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: A list of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with the unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(
        html_content: str,
        tags: Union[List[str], Tuple[str, ...]],
        *,
        remove_comments: bool = False,
    ) -> str:
        """Extract the text of specific tags from the given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: A list of tags to be extracted from the HTML.
            remove_comments: Forwarded to ``get_navigable_strings``; if True,
                comments are left out of the extracted text.

        Returns:
            A string combining the content of the extracted tags, with the
            individual pieces joined by single spaces.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts: List[str] = []
        for element in soup.find_all():
            if element.name in tags:
                # Extract all navigable strings recursively from this element.
                text_parts += get_navigable_strings(
                    element, remove_comments=remove_comments
                )

                # To avoid duplicate text, remove all descendants from the soup.
                # NOTE(review): the find_all() result was built before this
                # call, so already-listed descendants are still visited;
                # presumably they no longer report a matching name once
                # decomposed - confirm against the installed bs4 version.
                element.decompose()

        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """Clean up the content by removing unnecessary lines.

        The content is split on newlines, each line is stripped of
        surrounding whitespace, empty lines are dropped, and the remaining
        lines are joined with single spaces.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        cleaned_content = " ".join(non_empty_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not implemented for this transformer.

        Args:
            documents: A sequence of Document objects (unused).
            **kwargs: Unused.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


def get_navigable_strings(
    element: Any, *, remove_comments: bool = False
) -> Iterator[str]:
    """Get all navigable strings from a BeautifulSoup element.

    Child tags are walked recursively. Every string child is yielded with
    surrounding whitespace stripped. A string that is a direct child of an
    ``<a>`` element with a truthy ``href`` attribute is yielded as
    ``"<text> (<href>)"`` so the link target is kept alongside its text.

    Args:
        element: A BeautifulSoup element.
        remove_comments: If True, ``Comment`` nodes are skipped.

    Yields:
        The stripped strings, in the order the children are iterated.
    """

    from bs4 import Comment, NavigableString, Tag

    for child in cast(Tag, element).children:
        # Comment is tested before the generic NavigableString branch below,
        # so a skipped comment never reaches it.
        if isinstance(child, Comment) and remove_comments:
            continue
        if isinstance(child, Tag):
            yield from get_navigable_strings(child, remove_comments=remove_comments)
        elif isinstance(child, NavigableString):
            if (element.name == "a") and (href := element.get("href")):
                yield f"{child.strip()} ({href})"
            else:
                yield child.strip()