Source code for langchain_community.document_loaders.email

import os
from pathlib import Path
from typing import Any, Iterator, List, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
    satisfies_min_unstructured_version,
)


class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Load email files using `Unstructured`.

    Works with both `.eml` and `.msg` files. Attachments can be processed in
    addition to the e-mail message itself by passing
    `process_attachments=True` to the loader's constructor. By default,
    attachments are processed with the unstructured `partition` function. If
    the document types of the attachments are already known, another
    partitioning function can be supplied through the
    `attachment_partitioner` keyword argument.

    Example
    -------
    ```python
    from langchain_community.document_loaders import UnstructuredEmailLoader

    loader = UnstructuredEmailLoader("example_data/fake-email.eml", mode="elements")
    loader.load()
    ```

    Example
    -------
    ```python
    from langchain_community.document_loaders import UnstructuredEmailLoader

    loader = UnstructuredEmailLoader(
        "example_data/fake-email-attachment.eml",
        mode="elements",
        process_attachments=True,
    )
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            file_path: Path to the email file to load.
            mode: Loading mode, forwarded unchanged to the parent loader.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                parent loader. When `process_attachments` is truthy and no
                `attachment_partitioner` is given, the unstructured
                auto-`partition` function is filled in as the default.
        """
        wants_attachments = unstructured_kwargs.get("process_attachments")
        chosen_partitioner = unstructured_kwargs.get("attachment_partitioner")

        if wants_attachments and chosen_partitioner is None:
            # Imported lazily so `unstructured` is only needed when used.
            from unstructured.partition.auto import partition

            unstructured_kwargs["attachment_partitioner"] = partition

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the email file into unstructured elements.

        The file type is detected from `self.file_path`; EML files go through
        `partition_email`, and MSG files go through `partition_msg` when the
        installed `unstructured` version satisfies the "0.5.8" minimum.

        Returns:
            The list of elements produced by the matching partition function.

        Raises:
            ValueError: If neither of the two branches above applies.
        """
        from unstructured.file_utils.filetype import FileType, detect_filetype

        detected = detect_filetype(self.file_path)

        if detected == FileType.EML:
            from unstructured.partition.email import partition_email

            return partition_email(filename=self.file_path, **self.unstructured_kwargs)

        # Version check first, exactly as before, so short-circuiting is kept.
        if satisfies_min_unstructured_version("0.5.8") and detected == FileType.MSG:
            from unstructured.partition.msg import partition_msg

            return partition_msg(filename=self.file_path, **self.unstructured_kwargs)

        raise ValueError(
            f"Filetype {detected} is not supported in UnstructuredEmailLoader."
        )
class OutlookMessageLoader(BaseLoader):
    """Load an Outlook message file using `extract_msg`.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path.

        Args:
            file_path: Path to the Outlook message file.

        Raises:
            ValueError: If `file_path` does not point to an existing file.
            ImportError: If the `extract_msg` package is not installed.
        """
        self.file_path = str(file_path)

        if not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file")

        # Probe the optional dependency now so a missing package is reported
        # at construction time rather than on the first load.
        try:
            import extract_msg  # noqa:F401
        except ImportError as e:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            ) from e

    def lazy_load(self) -> Iterator[Document]:
        """Yield a single `Document` built from the message.

        The message body becomes `page_content`; the source path, subject,
        sender and date are stored in `metadata`.

        Yields:
            One `Document` for the message file.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Read everything needed while the file is open, and build the
            # document before yielding so the handle is released even if the
            # caller never resumes the generator.
            document = Document(
                page_content=msg.body,
                metadata={
                    "source": self.file_path,
                    "subject": msg.subject,
                    "sender": msg.sender,
                    "date": msg.date,
                },
            )
        finally:
            msg.close()

        yield document