Source code for langchain_community.document_loaders.acreom

import re
from pathlib import Path
from typing import Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class AcreomLoader(BaseLoader):
    """Load an `acreom` vault from a directory.

    Every ``*.md`` file found recursively under the given path is turned into
    one ``Document``. Its metadata holds the file name (``source``), the file
    path (``path``) and, when ``collect_metadata`` is enabled, the key/value
    pairs parsed from the front matter block that opens the file.
    """

    # Regex matching a front matter block delimited by ``---`` lines.
    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)

    # Unchecked task items, with or without a leading list dash.
    _TASK_REGEX = re.compile(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*")
    # ``[[...]]`` links to other documents.
    _DOCLINK_REGEX = re.compile(r"\[\[.*?\]\]")

    def __init__(
        self,
        path: Union[str, Path],
        encoding: str = "UTF-8",
        collect_metadata: bool = True,
    ):
        """Initialize the loader.

        Args:
            path: Directory containing the markdown files.
            encoding: Encoding to use when reading the files.
            collect_metadata: Whether to collect metadata from the front
                matter. When False, front matter is neither parsed nor
                removed from the document text.
        """
        # Path to the directory containing the markdown files.
        self.file_path = path
        # Encoding to use when reading the files.
        self.encoding = encoding
        # Whether to collect metadata from the front matter.
        self.collect_metadata = collect_metadata

    def _match_front_matter(self, content: str) -> Union["re.Match[str]", None]:
        """Return the match for a front matter block opening ``content``.

        Front matter only counts when it is the first thing in the file.
        Searching the whole text (the regex is MULTILINE) would also pick up
        any pair of ``---`` horizontal rules in the body, so the match is
        anchored right after any leading whitespace instead.
        """
        start = len(content) - len(content.lstrip())
        return self.FRONT_MATTER_REGEX.match(content, start)

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict.

        Each ``key: value`` line of the leading front matter block becomes an
        entry (split on the first colon, both sides stripped). Lines without a
        colon are skipped. Returns an empty dict when metadata collection is
        disabled or the content has no leading front matter block.
        """
        if not self.collect_metadata:
            return {}

        match = self._match_front_matter(content)
        if match is None:
            return {}

        front_matter = {}
        for line in match.group(1).split("\n"):
            key, sep, value = line.partition(":")
            if sep:  # skip lines without a colon
                front_matter[key.strip()] = value.strip()
        return front_matter

    def _remove_front_matter(self, content: str) -> str:
        """Remove the leading front matter block from the given content.

        The content is returned unchanged when metadata collection is disabled
        or when it does not open with a front matter block.
        """
        if not self.collect_metadata:
            return content

        match = self._match_front_matter(content)
        if match is None:
            return content
        # Cut out only the matched block; later ``---`` sections are body text.
        return content[: match.start()] + content[match.end() :]

    def _process_acreom_content(self, content: str) -> str:
        """Strip acreom-specific elements from the content.

        Removes elements that do not contribute to the context of the current
        document: unchecked tasks, ``#`` characters and ``[[...]]`` doclinks.
        """
        content = self._TASK_REGEX.sub("", content)  # rm tasks
        content = content.replace("#", "")  # rm hashtags
        content = self._DOCLINK_REGEX.sub("", content)  # rm doclinks
        return content

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one ``Document`` per markdown file under the path.

        Files are discovered recursively with the ``**/*.md`` glob and read
        with the configured encoding. Front matter entries are merged into the
        metadata last, so they override ``source`` and ``path`` on key clashes.
        """
        # Iterate the glob directly: no need to materialize every path first.
        for p in Path(self.file_path).glob("**/*.md"):
            text = p.read_text(encoding=self.encoding)

            front_matter = self._parse_front_matter(text)
            text = self._remove_front_matter(text)
            text = self._process_acreom_content(text)

            metadata = {
                "source": str(p.name),
                "path": str(p),
                **front_matter,
            }
            yield Document(page_content=text, metadata=metadata)