# Module: langchain_community.document_loaders.html_bs

import logging
from pathlib import Path
from typing import Dict, Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class BSHTMLLoader(BaseLoader):
    """Load an `HTML` file and parse it with `beautiful soup`."""

    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with a path, an optional file encoding, and BeautifulSoup kwargs.

        Args:
            file_path: Path of the file to load.
            open_encoding: Encoding to use when opening the file.
            bs_kwargs: Keyword arguments passed to the BeautifulSoup
                constructor. When None, ``{"features": "lxml"}`` is used.
            get_text_separator: Separator passed to ``get_text`` when
                extracting text from the soup.

        Raises:
            ImportError: If the ``bs4`` package cannot be imported.
        """
        try:
            # Imported only to fail fast at construction time if bs4 is missing.
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def lazy_load(self) -> Iterator[Document]:
        """Parse the HTML file and yield a single Document.

        Yields:
            A Document whose ``page_content`` is the text extracted from the
            parsed HTML and whose metadata holds ``source`` (the file path as
            a string) and ``title`` (the title text, or "" if unavailable).
        """
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        # `soup.title.string` is None when <title> is empty or has several
        # children; calling str() on it would store the literal text "None",
        # so fall back to an empty string just like the no-title case.
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            title = str(title_tag.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": str(self.file_path),
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
# Source code for langchain_community.document_loaders.html_bs