Source code for langchain_community.document_loaders.powerpoint

import os
from typing import List

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader


class UnstructuredPowerPointLoader(UnstructuredFileLoader):
    """Load `Microsoft PowerPoint` files using `Unstructured`.

    Works with both `.ppt` and `.pptx` files.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document is returned as a single
    `langchain Document` object. If you use "elements" mode, the
    `unstructured` library splits the document into elements such as
    `Title` and `NarrativeText`. You can pass additional
    `unstructured kwargs` after the mode to apply different
    `unstructured settings`.

    Examples
    --------
    ```python
    from langchain_community.document_loaders import UnstructuredPowerPointLoader

    loader = UnstructuredPowerPointLoader(
        "example.pptx", mode="elements", strategy="fast",
    )
    docs = loader.load()
    ```

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
    """

    def _get_elements(self) -> List:
        """Partition the PowerPoint file at ``self.file_path`` into elements.

        The file is dispatched to ``partition_ppt`` when it is detected as a
        legacy ``.ppt`` file, and to ``partition_pptx`` otherwise.
        ``self.unstructured_kwargs`` is forwarded to whichever partition
        function is used.

        Returns:
            The list returned by the selected ``unstructured`` partition
            function.

        Raises:
            ValueError: If the file is a ``.ppt`` file and the installed
                ``unstructured`` version is older than 0.4.11.
        """
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Pre-release builds carry a "-<tag>" suffix (e.g. "0.4.17-dev1").
        # Drop it before parsing so int() does not fail on the last component.
        release = __unstructured_version__.split("-")[0]
        unstructured_version = tuple(int(part) for part in release.split("."))

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_ppt = detect_filetype(self.file_path) == FileType.PPT
        except ImportError:
            _, extension = os.path.splitext(str(self.file_path))
            # Compare case-insensitively so "SLIDES.PPT" is also treated as .ppt.
            is_ppt = extension.lower() == ".ppt"

        if not is_ppt:
            from unstructured.partition.pptx import partition_pptx

            return partition_pptx(filename=self.file_path, **self.unstructured_kwargs)

        if unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .ppt files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        from unstructured.partition.ppt import partition_ppt

        return partition_ppt(filename=self.file_path, **self.unstructured_kwargs)