# Source code for langchain_community.document_loaders.unstructured

"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


def satisfies_min_unstructured_version(min_version: str) -> bool:
    """Check whether the installed `unstructured` version meets a minimum.

    Only the leading numeric release segment of each version string is
    compared, so pre-release or local suffixes such as ``-dev1``, ``.dev1``,
    ``rc1`` or ``+local`` are ignored on both sides.

    Args:
        min_version: Minimum required version, e.g. ``"0.5.4"``.

    Returns:
        True if the installed release tuple is greater than or equal to the
        required release tuple, False otherwise.

    Raises:
        ImportError: If `unstructured` cannot be imported.
        ValueError: If a version string does not start with a numeric
            release segment.
    """
    import re

    from unstructured.__version__ import __version__ as installed_version

    def _release_tuple(version: str) -> tuple:
        # Keep only the leading "N(.N)*" part of the version string.
        match = re.match(r"\d+(?:\.\d+)*", version.strip())
        if match is None:
            raise ValueError(f"Cannot parse version string: {version!r}")
        return tuple(int(part) for part in match.group().split("."))

    required = _release_tuple(min_version)
    # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
    # versions of unstructured like 0.4.17-dev1
    installed = _release_tuple(installed_version)
    return installed >= required
def validate_unstructured_version(min_unstructured_version: str) -> None:
    """Raise an error if the installed `unstructured` is older than required.

    Args:
        min_unstructured_version: Minimum acceptable version, e.g.
            ``"0.10.15"``.

    Raises:
        ValueError: If ``satisfies_min_unstructured_version`` reports that
            the installed version is below ``min_unstructured_version``.
    """
    if satisfies_min_unstructured_version(min_unstructured_version):
        return
    raise ValueError(
        f"unstructured>={min_unstructured_version} is required in this loader."
    )
class UnstructuredBaseLoader(BaseLoader, ABC):
    """Base loader that uses `Unstructured`.

    Subclasses provide the partitioned elements (``_get_elements``) and the
    base metadata (``_get_metadata``). This class applies the optional
    post-processors and assembles ``Document`` objects according to ``mode``.
    """

    def __init__(
        self,
        mode: str = "single",
        post_processors: Optional[List[Callable]] = None,
        **unstructured_kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            mode: One of ``"single"``, ``"elements"`` or ``"paged"``.
            post_processors: Optional callables applied to every element via
                ``element.apply``. Defaults to no post-processing.
            **unstructured_kwargs: Extra keyword arguments stored on
                ``self.unstructured_kwargs`` for subclasses to use.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
            ValueError: If ``mode`` is not one of the valid modes.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as exc:
            # Chain the original error so the underlying import failure
            # stays visible in the traceback.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from exc
        _valid_modes = {"single", "elements", "paged"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )
        self.mode = mode

        if not satisfies_min_unstructured_version("0.5.4"):
            # `strategy` is dropped when the installed unstructured is older
            # than 0.5.4 (presumably because those releases do not accept it).
            unstructured_kwargs.pop("strategy", None)

        self.unstructured_kwargs = unstructured_kwargs
        self.post_processors = post_processors or []

    @abstractmethod
    def _get_elements(self) -> List:
        """Get elements."""

    @abstractmethod
    def _get_metadata(self) -> dict:
        """Get metadata."""

    def _post_process_elements(self, elements: list) -> list:
        """Apply post-processing functions to extracted unstructured elements.

        Post-processing functions are str -> str callables passed in using
        the ``post_processors`` kwarg when the loader is instantiated.

        Args:
            elements: Elements to process; each is modified through its own
                ``apply`` method.

        Returns:
            The same ``elements`` list that was passed in.
        """
        for element in elements:
            for post_processor in self.post_processors:
                element.apply(post_processor)
        return elements

    def lazy_load(self) -> Iterator[Document]:
        """Load the file and yield Documents according to ``self.mode``.

        * ``"elements"``: one Document per element. Metadata is the base
          metadata updated with the element's metadata and category when the
          element exposes them.
        * ``"paged"``: one Document per distinct ``page_number`` found in the
          metadata (defaulting to 1). Each element's text is followed by a
          blank line and metadata of elements on the same page is merged.
        * ``"single"``: one Document holding all element texts joined by
          blank lines, with the base metadata only.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        elements = self._get_elements()
        self._post_process_elements(elements)
        if self.mode == "elements":
            for element in elements:
                metadata = self._get_metadata()
                # NOTE(MthwRobinson) - the attribute check is for backward compatibility
                # with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                if hasattr(element, "category"):
                    metadata["category"] = element.category
                yield Document(page_content=str(element), metadata=metadata)
        elif self.mode == "paged":
            page_parts: Dict[int, List[str]] = {}
            meta_dict: Dict[int, Dict] = {}

            for element in elements:
                metadata = self._get_metadata()
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                page_number = metadata.get("page_number", 1)

                if page_number not in page_parts:
                    # First element seen for this page: start its text and metadata.
                    page_parts[page_number] = [str(element)]
                    meta_dict[page_number] = metadata
                else:
                    # Page already known: add the text and merge the metadata.
                    page_parts[page_number].append(str(element))
                    meta_dict[page_number].update(metadata)

            # Every element's text is terminated by a blank line, including the last.
            for page_number, parts in page_parts.items():
                yield Document(
                    page_content="".join(f"{part}\n\n" for part in parts),
                    metadata=meta_dict[page_number],
                )
        elif self.mode == "single":
            metadata = self._get_metadata()
            text = "\n\n".join([str(el) for el in elements])
            yield Document(page_content=text, metadata=metadata)
        else:
            raise ValueError(f"mode of {self.mode} not supported.")
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Load files using `Unstructured`.

    The file loader uses the unstructured ``partition`` function and will
    automatically detect the file type. With ``mode="single"`` the document
    is returned as a single langchain Document object. With
    ``mode="elements"`` the unstructured library splits the document into
    elements such as Title and NarrativeText. ``mode`` is validated by
    ``UnstructuredBaseLoader``. You can pass additional unstructured kwargs
    after ``mode`` to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredFileLoader

    loader = UnstructuredFileLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    """

    def __init__(
        self,
        file_path: Union[str, List[str], Path, List[Path], None],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file path or a list of file paths.

        Args:
            file_path: Path, or list of paths, handed to ``partition``.
            mode: Forwarded to ``UnstructuredBaseLoader.__init__``.
            **unstructured_kwargs: Forwarded to
                ``UnstructuredBaseLoader.__init__``.
        """
        self.file_path = file_path
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the configured file(s) and return the elements.

        For a list of paths, the elements of every file are concatenated in
        list order. For a single ``Path``, ``self.file_path`` is replaced by
        its string form before partitioning.
        """
        from unstructured.partition.auto import partition

        if not isinstance(self.file_path, list):
            if isinstance(self.file_path, Path):
                # The attribute itself is rewritten, so later readers of
                # self.file_path (e.g. _get_metadata) see a str.
                self.file_path = str(self.file_path)
            return partition(filename=self.file_path, **self.unstructured_kwargs)

        collected: List = []
        for entry in self.file_path:
            name = str(entry) if isinstance(entry, Path) else entry
            collected.extend(partition(filename=name, **self.unstructured_kwargs))
        return collected

    def _get_metadata(self) -> dict:
        """Return base metadata: the current ``file_path`` under ``"source"``."""
        return {"source": self.file_path}
def get_elements_from_api(
    file_path: Union[str, List[str], Path, List[Path], None] = None,
    file: Union[IO, Sequence[IO], None] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    **unstructured_kwargs: Any,
) -> List:
    """Retrieve a list of elements from the `Unstructured API`.

    When ``file_path`` is a list or ``file`` is a sequence, the request goes
    through ``partition_multiple_via_api`` and the per-document element lists
    are flattened into one list. Otherwise ``partition_via_api`` is called
    for a single document.

    Args:
        file_path: Path or list of paths. List entries are converted with
            ``str``; a single non-None path is converted with ``str`` on the
            single-document route.
        file: File object or sequence of file objects.
        api_url: Endpoint passed through as ``api_url``.
        api_key: Key passed through as ``api_key``.
        **unstructured_kwargs: Extra keyword arguments passed to the
            partition function.

    Returns:
        A flat list of elements.
    """
    # Imported explicitly: a bare `import collections` does not by itself
    # guarantee that the `collections.abc` submodule has been loaded.
    from collections.abc import Sequence as AbcSequence

    is_list = isinstance(file_path, list)
    if is_list:
        file_path = [str(path) for path in file_path]

    if is_list or isinstance(file, AbcSequence):
        from unstructured.partition.api import partition_multiple_via_api

        per_document = partition_multiple_via_api(
            filenames=file_path,
            files=file,
            api_key=api_key,
            api_url=api_url,
            **unstructured_kwargs,
        )
        # One inner list per document; flatten while keeping document order.
        return [element for doc_elements in per_document for element in doc_elements]

    from unstructured.partition.api import partition_via_api

    return partition_via_api(
        filename=str(file_path) if file_path is not None else None,
        file=file,
        api_key=api_key,
        api_url=api_url,
        **unstructured_kwargs,
    )
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    """Load files using the `Unstructured` API.

    By default the loader calls the hosted Unstructured API. If you are
    running the Unstructured API locally, point the loader at it by passing
    the ``url`` parameter when initializing the loader. The hosted
    Unstructured API requires an API key; see
    https://www.unstructured.io/api-key/ if you need to generate one.

    With ``mode="single"`` the document is returned as a single langchain
    Document object. With ``mode="elements"`` the unstructured library splits
    the document into elements such as Title and NarrativeText. You can pass
    additional unstructured kwargs after ``mode`` to apply different
    unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredAPIFileLoader

    loader = UnstructuredAPIFileLoader(
        "example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
    )
    docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    https://www.unstructured.io/api-key/
    https://github.com/Unstructured-IO/unstructured-api
    """

    def __init__(
        self,
        file_path: Union[str, List[str], None] = "",
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file path, API endpoint and API key.

        Args:
            file_path: Path or list of paths sent to the API.
            mode: Forwarded to the parent initializer.
            url: API endpoint used for partitioning.
            api_key: API key sent with the request.
            **unstructured_kwargs: Forwarded to the parent initializer.

        Raises:
            ValueError: If the installed `unstructured` is older than 0.10.15.
        """
        validate_unstructured_version(min_unstructured_version="0.10.15")

        self.url = url
        self.api_key = api_key

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_metadata(self) -> dict:
        """Return base metadata: ``file_path`` under ``"source"``."""
        return {"source": self.file_path}

    def _get_elements(self) -> List:
        """Fetch the elements for ``file_path`` through ``get_elements_from_api``."""
        return get_elements_from_api(
            file_path=self.file_path,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Load file objects using `Unstructured`.

    The loader uses the unstructured ``partition`` function and will
    automatically detect the file type. With ``mode="single"`` the document
    is returned as a single langchain Document object. With
    ``mode="elements"`` the unstructured library splits the document into
    elements such as Title and NarrativeText. ``mode`` is validated by
    ``UnstructuredBaseLoader``. You can pass additional unstructured kwargs
    after ``mode`` to apply different unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredFileIOLoader(
            f, mode="elements", strategy="fast",
        )
        docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    """

    def __init__(
        self,
        file: Union[IO, Sequence[IO]],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file object.

        Args:
            file: Open file object (or sequence of them) stored on
                ``self.file``.
            mode: Forwarded to ``UnstructuredBaseLoader.__init__``.
            **unstructured_kwargs: Forwarded to
                ``UnstructuredBaseLoader.__init__``.
        """
        self.file = file
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition ``self.file`` and return the resulting elements."""
        from unstructured.partition.auto import partition

        extra_options = self.unstructured_kwargs
        return partition(file=self.file, **extra_options)

    def _get_metadata(self) -> dict:
        """Return base metadata, which is empty for file-object input."""
        return dict()
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
    """Load file objects using the `Unstructured` API.

    By default the loader calls the hosted Unstructured API. If you are
    running the Unstructured API locally, point the loader at it by passing
    the ``url`` parameter when initializing the loader. The hosted
    Unstructured API requires an API key; see
    https://www.unstructured.io/api-key/ if you need to generate one.

    With ``mode="single"`` the document is returned as a single langchain
    Document object. With ``mode="elements"`` the unstructured library splits
    the document into elements such as Title and NarrativeText. You can pass
    additional unstructured kwargs after ``mode`` to apply different
    unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredAPIFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredAPIFileIOLoader(
            f, mode="elements", strategy="fast", api_key="MY_API_KEY",
        )
        docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    https://www.unstructured.io/api-key/
    https://github.com/Unstructured-IO/unstructured-api
    """

    def __init__(
        self,
        file: Union[IO, Sequence[IO]],
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file object, API endpoint and API key.

        Args:
            file: Open file object or sequence of file objects.
            mode: Forwarded to the parent initializer.
            url: API endpoint used for partitioning.
            api_key: API key sent with the request.
            **unstructured_kwargs: Forwarded to the parent initializer.

        Raises:
            ValueError: If the installed `unstructured` is older than 0.6.3
                when ``file`` is a sequence, or older than 0.6.2 when
                ``file`` is truthy.
        """
        # Imported explicitly: a bare `import collections` does not by itself
        # guarantee that the `collections.abc` submodule has been loaded.
        from collections.abc import Sequence as AbcSequence

        if isinstance(file, AbcSequence):
            validate_unstructured_version(min_unstructured_version="0.6.3")
        if file:
            validate_unstructured_version(min_unstructured_version="0.6.2")

        self.url = url
        self.api_key = api_key

        super().__init__(file=file, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Fetch the elements for ``self.file`` through ``get_elements_from_api``."""
        return get_elements_from_api(
            file=self.file,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )