Source code for langchain_community.document_loaders.evernote
"""Load documents from Evernote.

https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class EverNoteLoader(BaseLoader):
    """Load from `EverNote`.

    Loads an EverNote notebook export file, e.g. my_notebook.enex, into Documents.
    Instructions on producing this file can be found at
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

    Currently only the plain text in the note is extracted and stored as the contents
    of the Document. Any non-content metadata tags on the note (e.g. 'author',
    'created', 'updated' etc., but not 'content-raw' or 'resource') will be extracted
    and stored as metadata on the Document.

    Args:
        file_path (str): The path to the notebook export with a .enex extension
        load_single_document (bool): Whether or not to concatenate the content of all
            notes into a single long Document.
            If this is set to True (default) then the only metadata on the document
            will be the 'source' which contains the file name of the export.
    """  # noqa: E501

    # Note keys that are never copied into a Document's metadata.
    _NON_METADATA_KEYS = ("content", "content-raw", "resource")
    # Format used to parse the text of 'created' / 'updated' elements.
    _TIMESTAMP_FORMAT = "%Y%m%dT%H%M%SZ"

    def __init__(
        self, file_path: Union[str, Path], load_single_document: bool = True
    ):
        """Initialize with file path."""
        self.file_path = str(file_path)
        self.load_single_document = load_single_document

    def _lazy_load(self) -> Iterator[Document]:
        """Yield one Document per parsed note whose 'content' is not None.

        The Document's metadata holds every note key except 'content',
        'content-raw' and 'resource', plus 'source' set to the export path.
        """
        for note in self._parse_note_xml(self.file_path):
            if note.get("content") is None:
                continue
            metadata = {
                key: value
                for key, value in note.items()
                if key not in self._NON_METADATA_KEYS
            }
            # 'source' always wins over any same-named key coming from the note.
            metadata["source"] = self.file_path
            yield Document(page_content=note["content"], metadata=metadata)

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from EverNote export file.

        Yields one Document per note when ``load_single_document`` is False;
        otherwise yields a single Document whose content is the contents of all
        notes joined without a separator and whose only metadata is 'source'.
        """
        if not self.load_single_document:
            yield from self._lazy_load()
            return
        yield Document(
            page_content="".join(
                document.page_content for document in self._lazy_load()
            ),
            metadata={"source": self.file_path},
        )

    @staticmethod
    def _parse_content(content: Optional[str]) -> str:
        """Convert a note's HTML body to stripped plain text using html2text.

        Args:
            content: The text of the note's 'content' element, or None when the
                element has no text.

        Returns:
            The converted text with surrounding whitespace stripped, or an empty
            string when ``content`` is None.

        Raises:
            ImportError: If `html2text` is not installed.
        """
        # Only the import is guarded, so an ImportError raised while converting
        # is not misreported as a missing package.
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e
        # An element's text can be None (see _parse_resource); map that to an
        # empty body instead of handing None to html2text.
        if content is None:
            return ""
        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Collect the child elements of a 'resource' element into a dict.

        The 'data' child is base64-decoded to bytes and its MD5 hex digest is
        stored under 'hash'; every other child is stored as ``tag -> text``.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
                rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text
        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Convert a 'note' element (or a nested attribute element) to a dict.

        Args:
            note: An iterable of child elements exposing ``tag`` and ``text``.
            prefix: When given, every returned key is rewritten to
                ``"<prefix>.<key>"``; used for the nested 'note-attributes' tag.

        Returns:
            A dict with:
            - 'content' (plain text) and 'content-raw' (original text),
            - 'created' / 'updated' parsed with ``time.strptime`` (None when the
              element has no text),
            - 'resource' as a list of dicts when at least one resource exists,
            - the flattened, prefixed children of 'note-attributes',
            - any other child stored as ``tag -> text``.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        for elem in note:
            tag = elem.tag
            if tag == "content":
                note_dict[tag] = EverNoteLoader._parse_content(elem.text)
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif tag in ("created", "updated"):
                # An empty timestamp element would make strptime raise; keep
                # the key but record the value as missing.
                note_dict[tag] = (
                    strptime(elem.text, EverNoteLoader._TIMESTAMP_FORMAT)
                    if elem.text
                    else None
                )
            elif tag == "note-attributes":
                # Recursively enter the note-attributes tag
                note_dict.update(EverNoteLoader._parse_note(elem, tag))
            else:
                note_dict[tag] = elem.text

        if resources:
            note_dict["resource"] = resources

        if prefix is None:
            return note_dict
        return {f"{prefix}.{key}": value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse Evernote xml, yielding one dict per 'note' element.

        Raises:
            ImportError: If `lxml` is not installed (the error is also logged).
        """
        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be "&nbsp;", which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        try:
            from lxml import etree
        except ImportError:
            logger.error(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            )
            raise

        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The yielded dict holds only values copied out of the element,
                # so the subtree can be released to limit memory on big exports.
                elem.clear()