Source code for langchain_community.document_loaders.url

"""使用unstructured加载HTML文件的加载器。"""
import logging
from typing import Any, List

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


[docs]class UnstructuredURLLoader(BaseLoader): """使用`Unstructured`从远程URL加载文件。 使用非结构化分区函数来检测MIME类型,并将文件路由到适当的分区器。 您可以以两种模式之一运行加载程序:"single"和"elements"。如果使用"single"模式,文档将作为单个langchain Document对象返回。如果使用"elements"模式,非结构化库将文档拆分为诸如Title和NarrativeText之类的元素。您可以在模式之后传递额外的非结构化kwargs以应用不同的非结构化设置。 示例 -------- from langchain_community.document_loaders import UnstructuredURLLoader loader = UnstructuredURLLoader( urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast", ) docs = loader.load() 参考 ---------- https://unstructured-io.github.io/unstructured/bricks.html#partition"""
[docs] def __init__( self, urls: List[str], continue_on_failure: bool = True, mode: str = "single", show_progress_bar: bool = False, **unstructured_kwargs: Any, ): """使用文件路径进行初始化。""" try: import unstructured # noqa:F401 from unstructured.__version__ import __version__ as __unstructured_version__ self.__version = __unstructured_version__ except ImportError: raise ImportError( "unstructured package not found, please install it with " "`pip install unstructured`" ) self._validate_mode(mode) self.mode = mode headers = unstructured_kwargs.pop("headers", {}) if len(headers.keys()) != 0: warn_about_headers = False if self.__is_non_html_available(): warn_about_headers = not self.__is_headers_available_for_non_html() else: warn_about_headers = not self.__is_headers_available_for_html() if warn_about_headers: logger.warning( "You are using an old version of unstructured. " "The headers parameter is ignored" ) self.urls = urls self.continue_on_failure = continue_on_failure self.headers = headers self.unstructured_kwargs = unstructured_kwargs self.show_progress_bar = show_progress_bar
def _validate_mode(self, mode: str) -> None: _valid_modes = {"single", "elements"} if mode not in _valid_modes: raise ValueError( f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" ) def __is_headers_available_for_html(self) -> bool: _unstructured_version = self.__version.split("-")[0] unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) return unstructured_version >= (0, 5, 7) def __is_headers_available_for_non_html(self) -> bool: _unstructured_version = self.__version.split("-")[0] unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) return unstructured_version >= (0, 5, 13) def __is_non_html_available(self) -> bool: _unstructured_version = self.__version.split("-")[0] unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) return unstructured_version >= (0, 5, 12)
[docs] def load(self) -> List[Document]: """加载文件。""" from unstructured.partition.auto import partition from unstructured.partition.html import partition_html docs: List[Document] = list() if self.show_progress_bar: try: from tqdm import tqdm except ImportError as e: raise ImportError( "Package tqdm must be installed if show_progress_bar=True. " "Please install with 'pip install tqdm' or set " "show_progress_bar=False." ) from e urls = tqdm(self.urls) else: urls = self.urls for url in urls: try: if self.__is_non_html_available(): if self.__is_headers_available_for_non_html(): elements = partition( url=url, headers=self.headers, **self.unstructured_kwargs ) else: elements = partition(url=url, **self.unstructured_kwargs) else: if self.__is_headers_available_for_html(): elements = partition_html( url=url, headers=self.headers, **self.unstructured_kwargs ) else: elements = partition_html(url=url, **self.unstructured_kwargs) except Exception as e: if self.continue_on_failure: logger.error(f"Error fetching or processing {url}, exception: {e}") continue else: raise e if self.mode == "single": text = "\n\n".join([str(el) for el in elements]) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) elif self.mode == "elements": for element in elements: metadata = element.metadata.to_dict() metadata["category"] = element.category docs.append(Document(page_content=str(element), metadata=metadata)) return docs