Source code for langchain_community.document_loaders.email
import os
from pathlib import Path
from typing import Any, Iterator, List, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Load email files using `Unstructured`.

    Works with both `.eml` and `.msg` files. Pass `process_attachments=True`
    to the constructor to process attachments in addition to the email
    message itself. When no `attachment_partitioner` keyword argument is
    given, the constructor falls back to `unstructured.partition.auto.partition`
    for attachments; pass `attachment_partitioner` explicitly to use a
    different partitioning function (for example when the attachment
    document type is already known).

    Example
    -------
    ```python
    from langchain_community.document_loaders import UnstructuredEmailLoader

    loader = UnstructuredEmailLoader("example_data/fake-email.eml", mode="elements")
    loader.load()
    ```

    Example
    -------
    ```python
    from langchain_community.document_loaders import UnstructuredEmailLoader

    loader = UnstructuredEmailLoader(
        "example_data/fake-email-attachment.eml",
        mode="elements",
        process_attachments=True,
    )
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            file_path: Path of the email file to load.
            mode: Forwarded unchanged to the parent loader.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                parent loader. If `process_attachments` is truthy and
                `attachment_partitioner` is missing or `None`, the latter is
                filled in with `unstructured.partition.auto.partition`.
        """
        wants_attachments = unstructured_kwargs.get("process_attachments")
        has_partitioner = unstructured_kwargs.get("attachment_partitioner") is not None

        if wants_attachments and not has_partitioner:
            # Imported lazily so `unstructured` is only needed when used.
            from unstructured.partition.auto import partition

            unstructured_kwargs["attachment_partitioner"] = partition

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the email file with the partitioner matching its type.

        Returns:
            The result of `partition_email` for EML files, or of
            `partition_msg` for MSG files when the installed `unstructured`
            version satisfies the 0.5.8 minimum.

        Raises:
            ValueError: If the detected file type is neither of the above
                (including MSG when the version check fails).
        """
        from unstructured.file_utils.filetype import FileType, detect_filetype

        detected = detect_filetype(self.file_path)

        if detected == FileType.EML:
            from unstructured.partition.email import partition_email

            return partition_email(filename=self.file_path, **self.unstructured_kwargs)

        # The version check is evaluated first, exactly as a short-circuit
        # `and` would, before the file type is compared against MSG.
        if satisfies_min_unstructured_version("0.5.8") and detected == FileType.MSG:
            from unstructured.partition.msg import partition_msg

            return partition_msg(filename=self.file_path, **self.unstructured_kwargs)

        raise ValueError(
            f"Filetype {detected} is not supported in UnstructuredEmailLoader."
        )
class OutlookMessageLoader(BaseLoader):
    """Load Outlook message files using `extract_msg`.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path.

        Args:
            file_path: Path of the Outlook message file.

        Raises:
            ValueError: If `file_path` does not point to an existing file.
            ImportError: If the `extract_msg` package is not installed.
        """
        self.file_path = str(file_path)

        if not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file")

        # Fail at construction time rather than on first load if the
        # optional dependency is missing.
        try:
            import extract_msg  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            ) from exc

    def lazy_load(self) -> Iterator[Document]:
        """Yield a single `Document` built from the Outlook message.

        The document's `page_content` is the message body; its metadata holds
        the `source` path plus the message `subject`, `sender` and `date`.

        Yields:
            One `Document` for the message at `self.file_path`.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Read every field before closing: the message object is not
            # used after this point, so the yielded document must already
            # hold all the values it needs.
            document = Document(
                page_content=msg.body,
                metadata={
                    "source": self.file_path,
                    "subject": msg.subject,
                    "sender": msg.sender,
                    "date": msg.date,
                },
            )
        finally:
            # Release the message even if reading a field raises, and do so
            # before yielding so cleanup does not depend on the consumer
            # exhausting the generator.
            msg.close()

        yield document