Source code for langchain_community.document_transformers.beautiful_soup_transformer

from typing import Any, Iterator, List, Sequence, Tuple, Union, cast

from langchain_core.documents import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """  # noqa: E501

    def __init__(self) -> None:
        """Initialize the transformer.

        This only verifies that the BeautifulSoup4 package can be imported.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            # Chain the original failure so the root cause stays visible in
            # the traceback as the explicit cause of this error.
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            ) from err

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: Union[List[str], Tuple[str, ...]] = ("script", "style"),
        tags_to_extract: Union[List[str], Tuple[str, ...]] = ("p", "li", "div", "a"),
        remove_lines: bool = True,
        *,
        unwanted_classnames: Union[Tuple[str, ...], List[str]] = (),
        remove_comments: bool = False,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Clean the HTML held in each document's ``page_content``.

        Each document is processed in this order: elements with unwanted
        class names are removed, unwanted tags are removed, the text of the
        requested tags is extracted, and finally (optionally) line breaks and
        blank lines are collapsed.

        The documents are modified in place: ``page_content`` is overwritten
        on every document and the same ``documents`` sequence is returned.

        Args:
            documents: Sequence of Document objects containing HTML content.
            unwanted_tags: Tag names to remove from the HTML.
            tags_to_extract: Tag names whose text content is extracted.
            remove_lines: If True, the extracted text is passed through
                ``remove_unnecessary_lines``.
            unwanted_classnames: Class names whose elements are removed from
                the HTML.
            remove_comments: If True, HTML comments are left out of the
                extracted text.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The same sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_classnames(
                cleaned_content, unwanted_classnames
            )

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(
                cleaned_content, tags_to_extract, remove_comments=remove_comments
            )

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_classnames(
        html_content: str, unwanted_classnames: Union[List[str], Tuple[str, ...]]
    ) -> str:
        """Remove every element that carries one of the given class names.

        Note that the matching elements themselves are decomposed (removed
        from the tree), not just their ``class`` attribute.

        Args:
            html_content: The original HTML content string.
            unwanted_classnames: Class names whose elements are removed.

        Returns:
            The re-serialized HTML string without the matching elements.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for classname in unwanted_classnames:
            for element in soup.find_all(class_=classname):
                element.decompose()
        return str(soup)

    @staticmethod
    def remove_unwanted_tags(
        html_content: str, unwanted_tags: Union[List[str], Tuple[str, ...]]
    ) -> str:
        """Remove every element whose tag name is in ``unwanted_tags``.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: Tag names to remove from the HTML.

        Returns:
            The re-serialized HTML string without the unwanted tags.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(
        html_content: str,
        tags: Union[List[str], Tuple[str, ...]],
        *,
        remove_comments: bool = False,
    ) -> str:
        """Extract the text of specific tags from the given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: Tag names whose text is extracted.
            remove_comments: If True, HTML comments are skipped while
                collecting text.

        Returns:
            The collected text fragments joined with single spaces.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts: List[str] = []
        for element in soup.find_all():
            if element.name in tags:
                # Extract all navigable strings recursively from this element.
                text_parts.extend(
                    get_navigable_strings(element, remove_comments=remove_comments)
                )

                # To avoid duplicate text, remove the element and all of its
                # descendants from the soup.
                # NOTE(review): descendants were already collected by
                # find_all() and are still visited by this loop; presumably a
                # decomposed element no longer reports a matching name, so
                # nested matches are not emitted twice -- confirm against bs4.
                element.decompose()

        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """Collapse the content onto one line, dropping blank lines.

        The content is split on newlines, each line is stripped of
        surrounding whitespace, empty lines are discarded, and the remaining
        lines are joined with single spaces.

        Args:
            content: A string that may contain unnecessary lines or spaces.

        Returns:
            The cleaned, single-line string.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        cleaned_content = " ".join(non_empty_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
def get_navigable_strings(
    element: Any, *, remove_comments: bool = False
) -> Iterator[str]:
    """Yield the stripped text of every string nested under a BeautifulSoup element.

    Children are visited in document order and nested tags are walked
    recursively. A string that is a direct child of an ``<a>`` element with a
    non-empty ``href`` is yielded as ``"<text> (<href>)"``.

    Args:
        element: A BeautifulSoup element (treated as a ``bs4.Tag``).
        remove_comments: If True, comment nodes are skipped.

    Returns:
        A generator of strings.
    """
    from bs4 import Comment, NavigableString, Tag

    node = cast(Tag, element)
    for item in node.children:
        # Comments are a kind of navigable string, so filter them out first.
        if remove_comments and isinstance(item, Comment):
            continue

        if isinstance(item, Tag):
            # Descend into nested tags and forward whatever they produce.
            yield from get_navigable_strings(item, remove_comments=remove_comments)
            continue

        if not isinstance(item, NavigableString):
            continue

        text = item.strip()
        # Only anchors contribute a link target; an empty or missing href
        # falls back to the bare text.
        href = element.get("href") if element.name == "a" else None
        yield f"{text} ({href})" if href else text