Source code for langchain_community.document_transformers.beautiful_soup_transformer
from typing import Any, Iterator, List, Sequence, Tuple, Union, cast
from langchain_core.documents import BaseDocumentTransformer, Document
[docs]class BeautifulSoupTransformer(BaseDocumentTransformer):
"""Transform HTML content by extracting specific tags and removing unwanted ones.

Example:
    .. code-block:: python

        from langchain_community.document_transformers import BeautifulSoupTransformer

        bs4_transformer = BeautifulSoupTransformer()
        docs_transformed = bs4_transformer.transform_documents(docs)
""" # noqa: E501
def __init__(self) -> None:
    """Initialize the transformer.

    Checks that the BeautifulSoup4 package (``bs4``) can be imported so that
    a missing dependency is reported at construction time rather than on the
    first transformation.

    Raises:
        ImportError: If ``bs4`` cannot be imported. The original import
            failure is attached as the direct cause of the raised error.
    """
    try:
        import bs4  # noqa: F401
    except ImportError as e:
        # Chain explicitly so the traceback shows why the import failed
        # (for example a broken install rather than a missing package).
        raise ImportError(
            "BeautifulSoup4 is required for BeautifulSoupTransformer. "
            "Please install it with `pip install beautifulsoup4`."
        ) from e
def transform_documents(
    self,
    documents: Sequence[Document],
    unwanted_tags: Union[List[str], Tuple[str, ...]] = ("script", "style"),
    tags_to_extract: Union[List[str], Tuple[str, ...]] = ("p", "li", "div", "a"),
    remove_lines: bool = True,
    *,
    unwanted_classnames: Union[Tuple[str, ...], List[str]] = (),
    remove_comments: bool = False,
    **kwargs: Any,
) -> Sequence[Document]:
    """Clean the HTML held in each document's ``page_content``.

    Every document is processed in four steps: elements carrying an unwanted
    class name are removed, unwanted tags are removed, the text of the tags
    to extract is collected, and (optionally) line breaks and blank lines
    are collapsed. The result is written back to ``page_content``, so the
    documents passed in are modified in place.

    Args:
        documents: Documents whose ``page_content`` contains HTML.
        unwanted_tags: Tag names to strip from the HTML.
        tags_to_extract: Tag names whose text content is kept.
        remove_lines: If True, collapse the extracted text with
            ``remove_unnecessary_lines``.
        unwanted_classnames: Class names whose elements are stripped from
            the HTML.
        remove_comments: Forwarded to ``extract_tags``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The same ``documents`` sequence, with updated ``page_content``.
    """
    for document in documents:
        # Each stage consumes the string produced by the previous one.
        html = self.remove_unwanted_classnames(
            document.page_content, unwanted_classnames
        )
        html = self.remove_unwanted_tags(html, unwanted_tags)
        text = self.extract_tags(
            html, tags_to_extract, remove_comments=remove_comments
        )
        document.page_content = (
            self.remove_unnecessary_lines(text) if remove_lines else text
        )
    return documents
@staticmethod
def remove_unwanted_classnames(
    html_content: str, unwanted_classnames: Union[List[str], Tuple[str, ...]]
) -> str:
    """Strip every element that carries one of the given class names.

    Args:
        html_content: The original HTML string.
        unwanted_classnames: Class names whose elements should be removed.

    Returns:
        The HTML, re-serialized from the parsed tree after the matching
        elements have been removed.
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html_content, "html.parser")
    for class_name in unwanted_classnames:
        # Search again for each class name so the lookup reflects removals
        # already made for earlier names.
        matches = parsed.find_all(class_=class_name)
        for node in matches:
            node.decompose()
    return str(parsed)
@staticmethod
def remove_unwanted_tags(
    html_content: str, unwanted_tags: Union[List[str], Tuple[str, ...]]
) -> str:
    """Strip every element whose tag name is in ``unwanted_tags``.

    Args:
        html_content: The original HTML string.
        unwanted_tags: Tag names to remove from the HTML.

    Returns:
        The HTML, re-serialized from the parsed tree after the matching
        elements have been removed.
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html_content, "html.parser")
    for tag_name in unwanted_tags:
        # Search again for each tag name so the lookup reflects removals
        # already made for earlier names.
        matches = parsed.find_all(tag_name)
        for node in matches:
            node.decompose()
    return str(parsed)
@staticmethod
def extract_tags(
    html_content: str,
    tags: Union[List[str], Tuple[str, ...]],
    *,
    remove_comments: bool = False,
) -> str:
    """Collect the text found inside the given tags.

    Args:
        html_content: The original HTML string.
        tags: Tag names whose content should be extracted.
        remove_comments: Forwarded unchanged to ``get_navigable_strings``.
            NOTE(review): presumably controls whether HTML comment text is
            dropped; confirm against that helper, which is defined
            elsewhere in this module.

    Returns:
        The extracted text pieces joined with single spaces.
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html_content, "html.parser")
    collected: List[str] = []
    for node in parsed.find_all():
        if node.name not in tags:
            continue
        # Gather every navigable string below this element.
        collected.extend(
            get_navigable_strings(node, remove_comments=remove_comments)
        )
        # Destroy the element afterwards so nested matching tags do not
        # contribute the same text a second time.
        node.decompose()
    return " ".join(collected)
[docs] @staticmethod
def remove_unnecessary_lines(content: str) -> str:
"""清理内容,删除不必要的行。
参数:
content:一个字符串,可能包含不必要的行或空格。
返回:
删除不必要行的清理过的字符串。
"""
lines = content.split("\n")
stripped_lines = [line.strip() for line in lines]
non_empty_lines = [line for line in stripped_lines if line]
cleaned_content = " ".join(non_empty_lines)
return cleaned_content
async def atransform_documents(
    self,
    documents: Sequence[Document],
    **kwargs: Any,
) -> Sequence[Document]:
    """Asynchronous variant of ``transform_documents``; not implemented.

    Args:
        documents: Documents that would be transformed.
        **kwargs: Unused.

    Raises:
        NotImplementedError: Always, when the coroutine is awaited.
    """
    raise NotImplementedError()