"""使用unstructured加载文件的加载器。"""
import collections
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
def satisfies_min_unstructured_version(min_version: str) -> bool:
    """Check whether the installed ``unstructured`` meets a minimum version.

    Only the leading dotted-integer release segment of the installed version
    is compared, so pre-release or local suffixes such as ``0.4.17-dev1``,
    ``0.10.0.dev1`` or ``0.5.4rc1`` are ignored instead of raising.

    Args:
        min_version: Minimum required version written as dot-separated
            integers, e.g. ``"0.5.4"``.

    Returns:
        True if the installed release tuple is greater than or equal to the
        ``min_version`` tuple, False otherwise.

    Raises:
        ImportError: If ``unstructured`` cannot be imported.
        ValueError: If ``min_version`` has a non-integer component, or if the
            installed version string does not start with a numeric release.
    """
    import re

    from unstructured.__version__ import __version__ as __unstructured_version__

    min_version_tuple = tuple(int(part) for part in min_version.split("."))

    # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
    # versions of unstructured like 0.4.17-dev1
    release = re.match(r"\d+(?:\.\d+)*", __unstructured_version__.strip())
    if release is None:
        raise ValueError(
            f"Could not parse unstructured version {__unstructured_version__!r}"
        )
    unstructured_version_tuple = tuple(
        int(part) for part in release.group().split(".")
    )
    return unstructured_version_tuple >= min_version_tuple
def validate_unstructured_version(min_unstructured_version: str) -> None:
    """Raise if the installed ``unstructured`` is older than the given minimum.

    Args:
        min_unstructured_version: Minimum acceptable version, e.g. ``"0.6.2"``.

    Raises:
        ValueError: If ``satisfies_min_unstructured_version`` reports that the
            installed version is below ``min_unstructured_version``.
    """
    if satisfies_min_unstructured_version(min_unstructured_version):
        return
    raise ValueError(
        f"unstructured>={min_unstructured_version} is required in this loader."
    )
class UnstructuredBaseLoader(BaseLoader, ABC):
    """Base loader that uses ``unstructured``.

    Subclasses provide the partitioned elements (``_get_elements``) and the
    base metadata (``_get_metadata``); ``lazy_load`` turns them into
    ``Document`` objects according to ``mode``.
    """

    def __init__(
        self,
        mode: str = "single",
        post_processors: Optional[List[Callable]] = None,
        **unstructured_kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            mode: One of ``"single"``, ``"elements"`` or ``"paged"``.
            post_processors: Optional callables applied to every element via
                ``element.apply`` before documents are built.
            **unstructured_kwargs: Extra keyword arguments stored on the
                instance for subclasses to forward to ``unstructured``.

        Raises:
            ImportError: If the ``unstructured`` package is not installed.
            ValueError: If ``mode`` is not one of the supported modes.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        _valid_modes = {"single", "elements", "paged"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )
        self.mode = mode

        # `strategy` is dropped when the installed unstructured is older than
        # 0.5.4 (presumably the kwarg is not supported before that release).
        if not satisfies_min_unstructured_version("0.5.4"):
            unstructured_kwargs.pop("strategy", None)

        self.unstructured_kwargs = unstructured_kwargs
        self.post_processors = post_processors or []

    @abstractmethod
    def _get_elements(self) -> List:
        """Get elements."""

    @abstractmethod
    def _get_metadata(self) -> dict:
        """Get metadata."""

    def _post_process_elements(self, elements: list) -> list:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed in using the
        ``post_processors`` kwarg when the loader is instantiated. Every
        processor is applied to every element through ``element.apply``, and
        the same list is returned.
        """
        for element in elements:
            for post_processor in self.post_processors:
                element.apply(post_processor)
        return elements

    def lazy_load(self) -> Iterator[Document]:
        """Load the file and lazily yield ``Document`` objects.

        * ``"elements"``: one document per element; element metadata and
          category (when present) are merged over the base metadata.
        * ``"paged"``: one document per ``page_number`` (default page 1), with
          each element's text followed by a blank line.
        * ``"single"``: one document with all element texts joined by blank
          lines.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        elements = self._get_elements()
        self._post_process_elements(elements)
        if self.mode == "elements":
            for element in elements:
                # A fresh metadata dict per element, since it is updated below.
                metadata = self._get_metadata()
                # NOTE(MthwRobinson) - the attribute check is for backward
                # compatibility with unstructured<0.4.9. The metadata attribute
                # was added in 0.4.9.
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                if hasattr(element, "category"):
                    metadata["category"] = element.category
                yield Document(page_content=str(element), metadata=metadata)
        elif self.mode == "paged":
            text_dict: Dict[int, str] = {}
            meta_dict: Dict[int, Dict] = {}

            for element in elements:
                metadata = self._get_metadata()
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                page_number = metadata.get("page_number", 1)

                if page_number not in text_dict:
                    # First element seen for this page: start its text/metadata.
                    text_dict[page_number] = str(element) + "\n\n"
                    meta_dict[page_number] = metadata
                else:
                    # Page already started: append text and merge metadata.
                    text_dict[page_number] += str(element) + "\n\n"
                    meta_dict[page_number].update(metadata)

            # Emit one document per page, in first-seen page order.
            for page_number, page_text in text_dict.items():
                yield Document(
                    page_content=page_text, metadata=meta_dict[page_number]
                )
        elif self.mode == "single":
            metadata = self._get_metadata()
            text = "\n\n".join([str(el) for el in elements])
            yield Document(page_content=text, metadata=metadata)
        else:
            raise ValueError(f"mode of {self.mode} not supported.")
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Load files using ``Unstructured``.

    The file loader uses the unstructured ``partition`` function and will
    automatically detect the file type. The loader runs in one of the modes
    accepted by the base loader: "single", "elements" or "paged". In "single"
    mode the document is returned as a single langchain Document object. In
    "elements" mode the unstructured library splits the document into elements
    such as Title and NarrativeText. You can pass in additional unstructured
    kwargs after ``mode`` to apply different unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredFileLoader

    loader = UnstructuredFileLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    """

    def __init__(
        self,
        file_path: Union[str, List[str], Path, List[Path], None],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file path or a list of file paths."""
        self.file_path = file_path
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the configured file(s) and return the elements."""
        from unstructured.partition.auto import partition

        if not isinstance(self.file_path, list):
            # A single Path is normalised to str and stored back on the
            # instance, so _get_metadata reports the string form as well.
            if isinstance(self.file_path, Path):
                self.file_path = str(self.file_path)
            return partition(filename=self.file_path, **self.unstructured_kwargs)

        collected: List = []
        for entry in self.file_path:
            filename = str(entry) if isinstance(entry, Path) else entry
            collected.extend(partition(filename=filename, **self.unstructured_kwargs))
        return collected

    def _get_metadata(self) -> dict:
        """Return base metadata holding the configured path(s) as ``source``."""
        return {"source": self.file_path}
def get_elements_from_api(
    file_path: Union[str, List[str], Path, List[Path], None] = None,
    file: Union[IO, Sequence[IO], None] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    **unstructured_kwargs: Any,
) -> List:
    """Retrieve a list of elements from the ``Unstructured API``.

    Args:
        file_path: A single path or a list of paths; list entries are
            converted to ``str`` before being sent.
        file: A single file-like object or a sequence of them.
        api_url: Endpoint of the Unstructured API.
        api_key: API key forwarded to the API.
        **unstructured_kwargs: Extra keyword arguments forwarded to the
            ``unstructured`` API partition function.

    Returns:
        The elements returned by the API. When several documents are sent
        (a list of paths or a sequence of files), their per-document element
        lists are concatenated into one flat list.
    """
    has_path_list = isinstance(file_path, list)
    if has_path_list:
        file_path = [str(path) for path in file_path]

    # Single-document request: neither a list of paths nor a sequence of files.
    if not (isinstance(file, collections.abc.Sequence) or has_path_list):
        from unstructured.partition.api import partition_via_api

        filename = None if file_path is None else str(file_path)
        return partition_via_api(
            filename=filename,
            file=file,
            api_key=api_key,
            api_url=api_url,
            **unstructured_kwargs,
        )

    from unstructured.partition.api import partition_multiple_via_api

    per_document = partition_multiple_via_api(
        filenames=file_path,
        files=file,
        api_key=api_key,
        api_url=api_url,
        **unstructured_kwargs,
    )
    # Flatten the per-document lists, preserving document order.
    return [element for document in per_document for element in document]
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    """Load files using the ``Unstructured`` API.

    By default, the loader makes a call to the hosted Unstructured API.
    If you are running the Unstructured API locally, you can point the loader
    at it by passing in the ``url`` parameter when you initialize the loader.
    The hosted Unstructured API requires an API key. See
    https://www.unstructured.io/api-key/ if you need to generate a key.

    The loader runs in one of the modes accepted by the base loader: "single",
    "elements" or "paged". In "single" mode the document is returned as a
    single langchain Document object. In "elements" mode the unstructured
    library splits the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after ``mode`` to apply
    different unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredAPIFileLoader

    loader = UnstructuredAPIFileLoader(
        "example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
    )
    docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    https://www.unstructured.io/api-key/
    https://github.com/Unstructured-IO/unstructured-api
    """

    def __init__(
        self,
        file_path: Union[str, List[str], None] = "",
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file path, API endpoint and API key.

        Raises:
            ValueError: If the installed ``unstructured`` is older than
                0.10.15, or ``mode`` is not supported.
        """
        validate_unstructured_version(min_unstructured_version="0.10.15")

        self.url = url
        self.api_key = api_key

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_metadata(self) -> dict:
        """Return base metadata holding the configured path(s) as ``source``."""
        return {"source": self.file_path}

    def _get_elements(self) -> List:
        """Fetch the elements for the configured path(s) from the API."""
        return get_elements_from_api(
            file_path=self.file_path,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Load file-like objects using ``Unstructured``.

    The file loader uses the unstructured ``partition`` function and will
    automatically detect the file type. The loader runs in one of the modes
    accepted by the base loader: "single", "elements" or "paged". In "single"
    mode the document is returned as a single langchain Document object. In
    "elements" mode the unstructured library splits the document into elements
    such as Title and NarrativeText. You can pass in additional unstructured
    kwargs after ``mode`` to apply different unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredFileIOLoader(
            f, mode="elements", strategy="fast",
        )
        docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    """

    def __init__(
        self,
        file: Union[IO, Sequence[IO]],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with an open file object (or a sequence of them)."""
        self.file = file
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the stored file object and return the elements."""
        from unstructured.partition.auto import partition

        parsed = partition(file=self.file, **self.unstructured_kwargs)
        return parsed

    def _get_metadata(self) -> dict:
        """Return empty base metadata; no source path is recorded here."""
        return {}
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
    """Load file-like objects using the ``Unstructured`` API.

    By default, the loader makes a call to the hosted Unstructured API.
    If you are running the Unstructured API locally, you can point the loader
    at it by passing in the ``url`` parameter when you initialize the loader.
    The hosted Unstructured API requires an API key. See
    https://www.unstructured.io/api-key/ if you need to generate a key.

    The loader runs in one of the modes accepted by the base loader: "single",
    "elements" or "paged". In "single" mode the document is returned as a
    single langchain Document object. In "elements" mode the unstructured
    library splits the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after ``mode`` to apply
    different unstructured settings.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredAPIFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredAPIFileIOLoader(
            f, mode="elements", strategy="fast", api_key="MY_API_KEY",
        )
        docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    https://www.unstructured.io/api-key/
    https://github.com/Unstructured-IO/unstructured-api
    """

    def __init__(
        self,
        file: Union[IO, Sequence[IO]],
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with file object(s), API endpoint and API key.

        Raises:
            ValueError: If the installed ``unstructured`` is too old for the
                kind of ``file`` given, or ``mode`` is not supported.
        """
        # A sequence of files requires unstructured>=0.6.3; any other truthy
        # file object requires >=0.6.2. Passing the 0.6.3 check implies the
        # 0.6.2 one, so a single if/elif covers both original checks.
        if isinstance(file, collections.abc.Sequence):
            validate_unstructured_version(min_unstructured_version="0.6.3")
        elif file:
            validate_unstructured_version(min_unstructured_version="0.6.2")

        self.url = url
        self.api_key = api_key

        super().__init__(file=file, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Fetch the elements for the stored file object(s) from the API."""
        return get_elements_from_api(
            file=self.file,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )