# Source code for langchain_community.document_loaders.evernote

import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)

class EverNoteLoader(BaseLoader):
    """Load documents from an `EverNote` notebook export.

    Loads an EverNote notebook export file (for example ``my_notebook.enex``)
    into Documents. Instructions on producing such a file are available in
    Evernote's help documentation.

    Currently only the plain text of each note is extracted and stored as the
    content of the Document. Every other tag found on the note (e.g. 'author',
    'created', 'updated', but not 'content-raw' or 'resource') is stored as
    metadata on the Document.

    Args:
        file_path: Path to the notebook export, with a ``.enex`` extension.
        load_single_document: Whether to concatenate the content of all notes
            into a single long Document. If True (the default), the only
            metadata on that Document is 'source', holding the export path.
    """

    def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
        """Initialize with the path of the export file."""
        # Stored as a plain string so it can be used directly as metadata.
        self.file_path = str(file_path)
        self.load_single_document = load_single_document

    def _lazy_load(self) -> Iterator[Document]:
        """Yield one Document per note that has a ``content`` entry."""
        # These keys hold the note body / attachments, not metadata.
        excluded_keys = ("content", "content-raw", "resource")
        for note in self._parse_note_xml(self.file_path):
            if note.get("content") is None:
                continue
            metadata = {
                key: value for key, value in note.items() if key not in excluded_keys
            }
            # 'source' always reflects the export file, overriding any note key.
            metadata["source"] = self.file_path
            yield Document(page_content=note["content"], metadata=metadata)

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from the EverNote export file.

        Yields:
            One Document per note when ``load_single_document`` is False,
            otherwise a single Document whose content is every note's text
            joined without a separator and whose only metadata is 'source'.
        """
        if not self.load_single_document:
            yield from self._lazy_load()
            return

        yield Document(
            page_content="".join(
                document.page_content for document in self._lazy_load()
            ),
            metadata={"source": self.file_path},
        )

    @staticmethod
    def _parse_content(content: str) -> str:
        """Convert a note's markup to text with ``html2text`` and strip it.

        Raises:
            ImportError: If the optional ``html2text`` package is missing.
        """
        # Only the import is guarded, so an error raised while converting the
        # content is never misreported as a missing dependency.
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e

        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Turn the children of a ``resource`` element into a dict.

        The ``data`` child is base64-decoded to bytes and its MD5 hex digest
        is stored under ``hash``; every other child is stored as its text.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
                rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text

        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Turn the children of a ``note`` element into a flat dict.

        Args:
            note: Iterable of child elements (each with ``tag`` and ``text``).
            prefix: When given, every returned key becomes ``"<prefix>.<key>"``;
                used for the nested ``note-attributes`` element.

        Returns:
            A dict keyed by tag name. ``content`` holds the converted text and
            ``content-raw`` the original markup; ``created`` / ``updated`` are
            parsed with ``strptime``; parsed resources, if any, are collected
            in a list under ``resource``.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        def add_prefix(element_tag: str) -> str:
            if prefix is None:
                return element_tag
            return f"{prefix}.{element_tag}"

        for elem in note:
            if elem.tag == "content":
                note_dict[elem.tag] = EverNoteLoader._parse_content(elem.text)
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif elem.tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif elem.tag in ("created", "updated"):
                note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
            elif elem.tag == "note-attributes":
                # Recursively enter the note-attributes tag
                additional_attributes = EverNoteLoader._parse_note(elem, elem.tag)
                note_dict.update(additional_attributes)
            else:
                note_dict[elem.tag] = elem.text

        if resources:
            note_dict["resource"] = resources

        return {add_prefix(key): value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse the Evernote XML export, yielding one dict per note.

        Raises:
            ImportError: If the optional ``lxml`` package is missing.
        """
        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be "&nbsp;", which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        try:
            from lxml import etree
        except ImportError:
            logger.error(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            )
            raise

        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _action, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The note has been copied into a plain dict, so drop its
                # subtree to keep memory bounded on large exports.
                elem.clear()