Source code for langchain_community.document_loaders.word_document

"""Loaders for Microsoft Word documents."""
import os
import tempfile
from abc import ABC
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader


class Docx2txtLoader(BaseLoader, ABC):
    """Load a `DOCX` file using `docx2txt` and return it as a single document.

    The given path is first checked as a local file. If it is not a local
    file but parses as a URL, the content is downloaded into a temporary file
    and that file is used for loading. The temporary file is closed when the
    loader is finalized.
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a local file path or a URL.

        Args:
            file_path: Path to a local file, or a URL to download the file
                from. Paths containing `~` are passed through
                `os.path.expanduser`.

        Raises:
            ValueError: If the download does not return status code 200, or
                if the path is neither an existing file nor a valid URL.
        """
        self.file_path = str(file_path)
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # Local files need no further preparation.
        if os.path.isfile(self.file_path):
            return

        if not self._is_valid_url(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file or url")

        # The path is a web path: download it to a temporary file and use that.
        response = requests.get(self.file_path)
        if response.status_code != 200:
            raise ValueError(
                "Check the url of your file; returned status code "
                f"{response.status_code}"
            )

        self.web_path = self.file_path
        self.temp_file = tempfile.NamedTemporaryFile()
        self.temp_file.write(response.content)
        # `load` reopens the file by name, so the buffered bytes must reach
        # the file before anything reads it; otherwise small downloads are
        # seen as empty or truncated.
        self.temp_file.flush()
        self.file_path = self.temp_file.name

    def __del__(self) -> None:
        # `temp_file` only exists when the document was downloaded.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def load(self) -> List[Document]:
        """Load the file at `self.file_path` as a single document.

        Returns:
            A one-element list whose document holds the text extracted by
            `docx2txt.process`, with `self.file_path` as its `source` metadata.
        """
        # Imported lazily so the module can be imported without docx2txt.
        import docx2txt

        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True if `url` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)


class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Load `Microsoft Word` files using `Unstructured`.

    Works with both `.docx` and `.doc` files.
    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    `langchain Document` object.
    If you use "elements" mode, the `unstructured` library will split the
    document into elements such as titles and narrative text.
    You can pass in additional `unstructured kwargs` after mode to apply
    different `unstructured settings`.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    loader = UnstructuredWordDocumentLoader(
        "example.docx", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
    """

    def _get_elements(self) -> List:
        """Partition the Word file into `unstructured` elements.

        Returns:
            The result of `partition_doc` when the file is detected as a
            `.doc` file, otherwise the result of `partition_docx`.

        Raises:
            ValueError: If the file is a `.doc` file and the installed
                `unstructured` version is older than 0.4.11.
        """
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Pre-release versions look like "0.4.17-dev1"; drop the suffix so
        # that every remaining component parses as an integer.
        release = __unstructured_version__.split("-")[0]
        unstructured_version = tuple(int(part) for part in release.split("."))

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(str(self.file_path))
            # Compare case-insensitively so that "REPORT.DOC" is also
            # recognized as a legacy Word document.
            is_doc = extension.lower() == ".doc"

        if not is_doc:
            from unstructured.partition.docx import partition_docx

            return partition_docx(filename=self.file_path, **self.unstructured_kwargs)

        if unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        from unstructured.partition.doc import partition_doc

        return partition_doc(filename=self.file_path, **self.unstructured_kwargs)