Source code for langchain_community.document_loaders.base_o365
"""Base class for all loaders that use the O365 package."""
from __future__ import annotations
import logging
import os
import tempfile
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
from langchain_core.pydantic_v1 import (
BaseModel,
BaseSettings,
Field,
FilePath,
SecretStr,
)
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders.file_system import (
FileSystemBlobLoader,
)
from langchain_community.document_loaders.blob_loaders.schema import Blob
if TYPE_CHECKING:
from O365 import Account
from O365.drive import Drive, Folder
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default value of O365BaseLoader.chunk_size: 5 * 1024 * 1024 bytes (5 MiB).
CHUNK_SIZE = 1024 * 1024 * 5
class _O365Settings(BaseSettings):
    """Client credentials for the Office 365 API.

    Both fields are required and are populated from the environment
    variables ``O365_CLIENT_ID`` and ``O365_CLIENT_SECRET`` (or from a
    ``.env`` file, see ``Config``).
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Was misspelled ``case_sentive``, an unknown option that pydantic
        # ignores. ``False`` is also the library default, so the effective
        # behaviour is unchanged; the option is now actually recognised.
        case_sensitive = False
        env_file = ".env"
class _O365TokenStorage(BaseSettings):
    """Settings describing where the O365 token file lives on disk."""

    # Default location: <home directory>/.credentials/o365_token.txt
    token_path: FilePath = Path.home().joinpath(".credentials", "o365_token.txt")
class _FileType(str, Enum):
    """File types known to this module.

    Members are also ``str`` instances, so each compares equal to its
    lowercase extension string.
    """

    # Word document (legacy format)
    DOC = "doc"
    # Word document (Office Open XML)
    DOCX = "docx"
    # PDF document
    PDF = "pdf"
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Fetch the MIME types for the specified file types.

    Args:
        file_types: File types to look up. Only each item's ``value`` is read.

    Returns:
        A dict mapping each file-type value (``"doc"``, ``"docx"``, ``"pdf"``)
        to its MIME type. Items with any other value are left out.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa: E501
        "pdf": "application/pdf",
    }
    return {
        kind.value: known_mime_types[kind.value]
        for kind in file_types
        if kind.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that use the O365 package.

    Subclasses must provide ``_file_types`` (which file types to load) and
    ``_scopes`` (which permission scopes to request when authenticating).
    """

    settings: _O365Settings = Field(default_factory=_O365Settings)  # type: ignore[arg-type]
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each API call to the server.
    int or 'auto'."""
    recursive: bool = False
    """Should the loader recursively load subfolders?"""

    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return the supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict mapping supported file types to their MIME types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return the required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of the configured MIME types from a folder.

        Matching files are downloaded into a temporary directory, yielded as
        blobs, and the directory is removed afterwards. When ``recursive`` is
        set, every child folder is then processed the same way.

        Args:
            folder: The Folder instance whose items should be loaded.

        Yields:
            Blob instances for the files downloaded from the folder.
        """
        # Build the lookup once instead of once per file.
        allowed_mime_types = set(self._fetch_mime_types.values())
        items = folder.get_items()
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in items:
                if file.is_file and file.mime_type in allowed_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # Must run inside the ``with`` block: the downloaded files are
            # deleted as soon as the temporary directory is closed.
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()
        if self.recursive:
            for subfolder in folder.get_child_folders():
                yield from self._load_from_folder(subfolder)

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files from a drive, selected by their object IDs.

        Each ID is looked up with ``drive.get_item``. IDs that resolve to
        nothing are logged as a warning and skipped; items that are not
        files, or whose MIME type is not configured, are silently skipped.

        Args:
            drive: The Drive instance to load the files from.
            object_ids: Object ID strings identifying the items to load.

        Yields:
            Blob instances for the files downloaded from the drive.
        """
        # Build the lookup once instead of once per file.
        allowed_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # Use the module logger (not the root logger); the message
                    # previously lacked a space ("withobject_id").
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                if file.is_file and file.mime_type in allowed_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # Must run inside the ``with`` block: the downloaded files are
            # deleted as soon as the temporary directory is closed.
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _auth(self) -> Account:
        """Build the O365 ``Account`` used by the loader.

        When ``auth_with_token`` is true, the token backend points at the
        file given by ``_O365TokenStorage().token_path`` and
        ``authenticate()`` is NOT called (presumably a token stored there is
        reused -- confirm against the O365 package). Otherwise the backend
        points at ``~/.credentials`` and ``account.authenticate()`` is
        called before returning.

        Returns:
            The Account object.

        Raises:
            ImportError: If the ``O365`` package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as err:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from err

        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        # The account is constructed identically in both modes; only the
        # token backend differs.
        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            raise_http_errors=False,
        )
        if not self.auth_with_token:
            # make the auth
            account.authenticate()
        return account