Source code for langchain_community.document_loaders.word_document

"""Load Word documents."""
import os
import tempfile
from abc import ABC
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader


class Docx2txtLoader(BaseLoader, ABC):
    """Load a `DOCX` file using `docx2txt`.

    The given path is first checked as a local file. If it is not a local
    file but parses as a URL, the content is downloaded into a temporary
    file, that temporary file is used for loading, and it is closed again
    when the loader object is finalized.
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path or URL.

        Args:
            file_path: Local path (a ``~`` is expanded to the user's home
                directory) or URL of the document to load.

        Raises:
            ValueError: If the download does not return HTTP status 200, or
                if the path is neither an existing file nor a valid URL.
        """
        self.file_path = str(file_path)
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    f"Check the url of your file; returned status code {r.status_code}"
                )

            self.web_path = self.file_path
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # The file is re-opened by name in load(); without a flush the
            # downloaded bytes may still sit in the write buffer and the
            # reader would see an empty or truncated document.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file or url")

    def __del__(self) -> None:
        """Close the temporary download file, if one was created."""
        # __init__ may have raised before temp_file was assigned.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def load(self) -> List[Document]:
        """Load the file as a single document.

        Returns:
            A one-element list whose document holds the text extracted by
            ``docx2txt.process`` and a ``source`` metadata entry.
        """
        import docx2txt

        # For downloaded files, report the URL rather than the path of the
        # throwaway temporary file, which is meaningless to the caller.
        source = getattr(self, "web_path", self.file_path)
        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": source},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True if ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Load `Microsoft Word` file using `Unstructured`.

    Works with both .docx and .doc files.
    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    loader = UnstructuredWordDocumentLoader(
        "example.docx", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
    """

    def _get_elements(self) -> List:
        """Partition the file with the `.doc` or `.docx` partitioner.

        Returns:
            The elements returned by ``partition_doc`` or ``partition_docx``.

        Raises:
            ValueError: If the file is a `.doc` file and the installed
                `unstructured` version is older than 0.4.11.
        """
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Pre-release versions look like "0.4.17-dev1"; drop the suffix so
        # the numeric components can be parsed without a ValueError.
        base_version = __unstructured_version__.split("-")[0]
        unstructured_version = tuple(int(x) for x in base_version.split("."))

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(str(self.file_path))
            # Compare case-insensitively so "REPORT.DOC" is not misrouted to
            # the docx partitioner.
            is_doc = extension.lower() == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        if is_doc:
            from unstructured.partition.doc import partition_doc

            return partition_doc(filename=self.file_path, **self.unstructured_kwargs)
        else:
            from unstructured.partition.docx import partition_docx

            return partition_docx(filename=self.file_path, **self.unstructured_kwargs)