Source code for langchain_community.document_loaders.url

"""使用unstructured加载HTML文件的加载器。"""
import logging
from typing import Any, List

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


[docs]class UnstructuredURLLoader(BaseLoader):
    """使用`Unstructured`从远程URL加载文件。

使用非结构化分区函数来检测MIME类型，并将文件路由到适当的分区器。

您可以以两种模式之一运行加载程序："single"和"elements"。如果使用"single"模式，文档将作为单个langchain Document对象返回。如果使用"elements"模式，非结构化库将文档拆分为诸如Title和NarrativeText之类的元素。您可以在模式之后传递额外的非结构化kwargs以应用不同的非结构化设置。

示例
--------
from langchain_community.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(
    urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
)
docs = loader.load()

参考
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition"""

[docs]    def __init__(
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        mode: str = "single",
        show_progress_bar: bool = False,
        **unstructured_kwargs: Any,
    ):
        """使用文件路径进行初始化。"""
        try:
            import unstructured  # noqa:F401
            from unstructured.__version__ import __version__ as __unstructured_version__

            self.__version = __unstructured_version__
        except ImportError:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )

        self._validate_mode(mode)
        self.mode = mode

        headers = unstructured_kwargs.pop("headers", {})
        if len(headers.keys()) != 0:
            warn_about_headers = False
            if self.__is_non_html_available():
                warn_about_headers = not self.__is_headers_available_for_non_html()
            else:
                warn_about_headers = not self.__is_headers_available_for_html()

            if warn_about_headers:
                logger.warning(
                    "You are using an old version of unstructured. "
                    "The headers parameter is ignored"
                )

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.headers = headers
        self.unstructured_kwargs = unstructured_kwargs
        self.show_progress_bar = show_progress_bar

    def _validate_mode(self, mode: str) -> None:
        _valid_modes = {"single", "elements"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )

    def __is_headers_available_for_html(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

        return unstructured_version >= (0, 5, 7)

    def __is_headers_available_for_non_html(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

        return unstructured_version >= (0, 5, 13)

    def __is_non_html_available(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

        return unstructured_version >= (0, 5, 12)

[docs]    def load(self) -> List[Document]:
        """加载文件。"""
        from unstructured.partition.auto import partition
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        if self.show_progress_bar:
            try:
                from tqdm import tqdm
            except ImportError as e:
                raise ImportError(
                    "Package tqdm must be installed if show_progress_bar=True. "
                    "Please install with 'pip install tqdm' or set "
                    "show_progress_bar=False."
                ) from e

            urls = tqdm(self.urls)
        else:
            urls = self.urls

        for url in urls:
            try:
                if self.__is_non_html_available():
                    if self.__is_headers_available_for_non_html():
                        elements = partition(
                            url=url, headers=self.headers, **self.unstructured_kwargs
                        )
                    else:
                        elements = partition(url=url, **self.unstructured_kwargs)
                else:
                    if self.__is_headers_available_for_html():
                        elements = partition_html(
                            url=url, headers=self.headers, **self.unstructured_kwargs
                        )
                    else:
                        elements = partition_html(url=url, **self.unstructured_kwargs)
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, exception: {e}")
                    continue
                else:
                    raise e

            if self.mode == "single":
                text = "\n\n".join([str(el) for el in elements])
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
            elif self.mode == "elements":
                for element in elements:
                    metadata = element.metadata.to_dict()
                    metadata["category"] = element.category
                    docs.append(Document(page_content=str(element), metadata=metadata))

        return docs