Source code for langchain_community.document_loaders.html_bs
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class BSHTMLLoader(BaseLoader):
    """Load an `HTML` file and parse it with `beautiful soup`."""

    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with a path, and optionally the file encoding to use and
        any kwargs to pass to the BeautifulSoup object.

        Args:
            file_path: Path of the file to load.
            open_encoding: Encoding to use when opening the file. ``None`` is
                passed straight through to ``open``.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object. When
                ``None``, ``{"features": "lxml"}`` is used.
            get_text_separator: Separator to use when calling ``get_text`` on
                the soup.

        Raises:
            ImportError: If the ``bs4`` package cannot be imported.
        """
        # Fail early, at construction time, if the parser dependency is absent.
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def lazy_load(self) -> Iterator[Document]:
        """Load the HTML document into a Document object.

        Yields:
            A single ``Document`` whose ``page_content`` is the text extracted
            from the parsed file and whose metadata holds the ``source`` path
            and the page ``title`` (an empty string when there is no title).
        """
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title = ""
        if soup.title:
            title_string = soup.title.string
            if title_string is not None:
                title = str(title_string)
            else:
                # `.string` is None when <title> does not wrap exactly one
                # string; str(None) would store the literal text "None" as
                # the title, so fall back to the tag's text content instead.
                title = soup.title.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": str(self.file_path),
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)