# Source code for langchain_community.document_loaders.evernote

"""Load documents from Evernote.

https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class EverNoteLoader(BaseLoader):
    """Load documents from an `EverNote` notebook export.

    Reads an EverNote notebook export file (for example ``my_notebook.enex``)
    and turns its notes into Documents. Instructions for producing such a file
    can be found at:
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

    Only the plain text of each note is extracted and stored as the Document
    content. Every other tag on the note (e.g. 'author', 'created', 'updated',
    but not 'content-raw' or 'resource') is stored as Document metadata.

    Args:
        file_path (str): Path to the notebook export, with a .enex extension.
        load_single_document (bool): Whether to concatenate the content of all
            notes into one long Document. When True (the default), the only
            metadata on that Document is 'source', holding the export path.
    """  # noqa: E501

    def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
        """Initialize with the path of the export file."""
        # Stored as a plain string so it can be used directly as metadata.
        self.file_path = str(file_path)
        self.load_single_document = load_single_document

    def _lazy_load(self) -> Iterator[Document]:
        """Yield one Document for every parsed note that has content."""
        # Keys that carry the note body or binary attachments, not metadata.
        excluded_keys = ("content", "content-raw", "resource")
        for note in self._parse_note_xml(self.file_path):
            if note.get("content") is None:
                continue
            metadata = {
                key: value for key, value in note.items() if key not in excluded_keys
            }
            # 'source' always wins over a same-named tag found in the note.
            metadata["source"] = self.file_path
            yield Document(page_content=note["content"], metadata=metadata)

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from the EverNote export file.

        Yields one Document per note, or a single Document holding the
        concatenated content of all notes when ``load_single_document`` is set.
        """
        if not self.load_single_document:
            yield from self._lazy_load()
            return

        yield Document(
            page_content="".join(doc.page_content for doc in self._lazy_load()),
            metadata={"source": self.file_path},
        )

    @staticmethod
    def _parse_content(content: str) -> str:
        """Convert a note's markup into stripped plain text using `html2text`.

        Raises:
            ImportError: If the optional `html2text` package is not installed.
        """
        # Keep the try body to the import so only a missing package is rewrapped.
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e

        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Collect the child elements of a resource element into a dict.

        The 'data' child is base64-decoded to bytes and its MD5 hex digest is
        stored under 'hash'; every other child is stored as its text, keyed by
        tag name.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                payload = b64decode(elem.text) if elem.text else b""
                rsc_dict[elem.tag] = payload
                # Content fingerprint of the attachment bytes.
                rsc_dict["hash"] = hashlib.md5(payload).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text

        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Flatten the child elements of a note element into a dict.

        Args:
            note: Iterable of child elements, each exposing ``tag`` and ``text``.
            prefix: When given, every returned key is rewritten as
                ``"<prefix>.<key>"``; used for the nested 'note-attributes' tag.

        Returns:
            A dict keyed by tag name. 'content' holds the plain-text body and
            'content-raw' the original markup, 'created'/'updated' are parsed
            into ``time.struct_time``, and 'resource' (when present) is a list
            of parsed resources.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        def add_prefix(element_tag: str) -> str:
            if prefix is None:
                return element_tag
            return f"{prefix}.{element_tag}"

        for elem in note:
            tag = elem.tag
            if tag == "content":
                # An empty <content> element has no text; leave 'content' unset
                # so the note is skipped instead of crashing the converter.
                if elem.text is not None:
                    note_dict[tag] = EverNoteLoader._parse_content(elem.text)
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif tag in ("created", "updated"):
                note_dict[tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
            elif tag == "note-attributes":
                # Recursively enter the note-attributes tag
                note_dict.update(EverNoteLoader._parse_note(elem, tag))
            else:
                note_dict[tag] = elem.text

        if resources:
            note_dict["resource"] = resources

        return {add_prefix(key): value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse an Evernote XML export and yield one dict per note.

        Raises:
            ImportError: If the optional `lxml` package is not installed.
        """
        try:
            from lxml import etree
        except ImportError:
            logger.error(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            )
            raise

        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be "&nbsp;", which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The yielded dict holds only copied strings/bytes, so the
                # element can be emptied to keep memory flat on large exports.
                elem.clear()