# langchain_community.document_loaders.mhtml

import email
import logging
from pathlib import Path
from typing import Dict, Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class MHTMLLoader(BaseLoader):
    """Parse `MHTML` files with `BeautifulSoup`."""

    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with a path, and optionally the file encoding to use and
        any kwargs to pass to the BeautifulSoup object.

        Args:
            file_path: Path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
                Defaults to ``{"features": "lxml"}`` when not given.
            get_text_separator: The separator to use when getting the text
                from the soup.

        Raises:
            ImportError: If the ``bs4`` package cannot be imported.
        """
        # Fail early, at construction time, if BeautifulSoup is missing.
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def lazy_load(self) -> Iterator[Document]:
        """Load the MHTML file and yield its first HTML part as a Document.

        The file is parsed as a MIME message. The first part whose content
        type is ``text/html`` is decoded, parsed with BeautifulSoup, and
        yielded as a single Document whose metadata holds ``source`` (the
        file path) and ``title`` (the text of ``<title>``, or ``""``).
        Nothing is yielded when the message contains no HTML part.
        """

        from bs4 import BeautifulSoup

        # Read everything up front so the file handle is not kept open while
        # the generator is suspended at ``yield``.
        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            message = email.message_from_string(f.read())

        # walk() yields the message itself and then every sub-part depth
        # first, so single-part messages, flat multipart messages and nested
        # multipart containers are all handled by the same loop.
        for part in message.walk():
            if part.get_content_type() != "text/html":
                continue

            payload = part.get_payload(decode=True)
            if not isinstance(payload, bytes):
                # No decodable body on this part; look at the next one.
                continue

            # Honour the charset declared on the part; fall back to UTF-8
            # when it is absent, unknown to Python, or does not match the
            # actual bytes.
            charset = part.get_content_charset() or "utf-8"
            try:
                html = payload.decode(charset)
            except (LookupError, UnicodeDecodeError):
                html = payload.decode("utf-8")

            soup = BeautifulSoup(html, **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)

            # ``.string`` is None for an empty <title> or one with several
            # children; use "" then instead of the literal string "None".
            title_tag = soup.title
            if title_tag is not None and title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = ""

            metadata: Dict[str, Union[str, None]] = {
                "source": str(self.file_path),
                "title": title,
            }
            yield Document(page_content=text, metadata=metadata)
            # Only the first HTML part is turned into a Document.
            return
# Source code for langchain_community.document_loaders.mhtml