Source code for langchain_community.document_loaders.base_o365

"""所有使用O365 Package的加载器的基类"""
from __future__ import annotations

import logging
import os
import tempfile
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union

from langchain_core.pydantic_v1 import (
    BaseModel,
    BaseSettings,
    Field,
    FilePath,
    SecretStr,
)

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders.file_system import (
    FileSystemBlobLoader,
)
from langchain_community.document_loaders.blob_loaders.schema import Blob

if TYPE_CHECKING:
    from O365 import Account
    from O365.drive import Drive, Folder

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default value for the loaders' ``chunk_size`` setting: 5 * 1024 * 1024
# bytes (5 MiB) requested per download call.
CHUNK_SIZE = 1024 * 1024 * 5


class _O365Settings(BaseSettings):
    """Credentials for the Office365 API client.

    Both fields are required. ``client_id`` is read from the
    ``O365_CLIENT_ID`` environment variable and ``client_secret`` from
    ``O365_CLIENT_SECRET``; a ``.env`` file is also consulted.
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Was misspelled ``case_sentive``, which pydantic silently ignored.
        case_sensitive = False
        env_file = ".env"


class _O365TokenStorage(BaseSettings):
    """Settings model holding the filesystem location of the O365 token file."""

    token_path: FilePath = Path.home().joinpath(".credentials", "o365_token.txt")


class _FileType(str, Enum):
    """File types that ``fetch_mime_types`` can map to a MIME type."""

    DOC = "doc"
    DOCX = "docx"
    PDF = "pdf"


def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Fetch the MIME types for the specified file types.

    Args:
        file_types: File types to look up; only each item's ``value`` is used.

    Returns:
        A dict mapping each recognised file-type value (``"doc"``,
        ``"docx"``, ``"pdf"``) to its MIME type. Unrecognised values are
        skipped.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document"
        ),
        "pdf": "application/pdf",
    }
    return {
        entry.value: known_mime_types[entry.value]
        for entry in file_types
        if entry.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that use the O365 package."""

    settings: _O365Settings = Field(default_factory=_O365Settings)  # type: ignore[arg-type]
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each API call to the server.
    int or 'auto'."""
    recursive: bool = False
    """Should the loader recursively load subfolders?"""

    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return the supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict mapping supported file types to their MIME types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return the required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of the configured MIME types from a folder.

        Matching files are downloaded into a temporary directory and then
        yielded as blobs by a ``FileSystemBlobLoader`` pointed at that
        directory. When ``self.recursive`` is true, every child folder is
        processed the same way afterwards.

        Args:
            folder: The Folder instance whose files are to be loaded.

        Yields:
            Blob instances for the files downloaded from the folder.
        """
        # Build the lookup once instead of once per item.
        supported_mime_types = set(self._fetch_mime_types.values())
        items = folder.get_items()
        # The temporary directory is removed when this block exits, i.e.
        # after all blobs of this folder have been yielded.
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in items:
                if file.is_file and file.mime_type in supported_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()
        if self.recursive:
            for subfolder in folder.get_child_folders():
                yield from self._load_from_folder(subfolder)

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load the files identified by their object IDs from a drive.

        Each ID is looked up with ``drive.get_item``. IDs that resolve to
        nothing are logged and skipped; files with a supported MIME type are
        downloaded into a temporary directory and yielded as blobs.

        Args:
            drive: The Drive instance from which the files are loaded.
            object_ids: Object IDs of the files to load.

        Yields:
            Blob instances for the files downloaded from the drive.
        """
        # Build the lookup once instead of once per object ID.
        supported_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # Use the module logger (not the root logger) and keep
                    # the space that the original concatenation dropped.
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                if file.is_file and file.mime_type in supported_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _auth(self) -> Account:
        """Authenticate the O365 API client.

        Returns:
            The Account object built from the configured credentials.

        Raises:
            ImportError: If the ``O365`` package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as err:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from err

        if self.auth_with_token:
            # Point the backend at the configured token file.
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            **{"raise_http_errors": False},
        )
        # NOTE(review): the authentication flow is run only when no token
        # file is used; confirm this matches the intended branch placement.
        if not self.auth_with_token:
            # make the auth
            account.authenticate()
        return account