# Source code for langchain_community.document_loaders.onenote

"""Load data from OneNote notebooks."""

from pathlib import Path
from typing import Dict, Iterator, List, Optional
from urllib.parse import quote

import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import (
    BaseModel,
    BaseSettings,
    Field,
    FilePath,
    SecretStr,
)

from langchain_community.document_loaders.base import BaseLoader


class _OneNoteGraphSettings(BaseSettings):
    """Microsoft Graph client credentials used by the OneNote loader.

    Both fields are required. Their ``env=`` arguments name the environment
    variables ``MS_GRAPH_CLIENT_ID`` and ``MS_GRAPH_CLIENT_SECRET``, and
    ``Config.env_file`` additionally points at a local ``.env`` file.
    """

    client_id: str = Field(..., env="MS_GRAPH_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="MS_GRAPH_CLIENT_SECRET")

    class Config:
        """Configuration for _OneNoteGraphSettings."""

        env_prefix = ""
        # Previously misspelled as ``case_sentive``, which is not an option
        # name and therefore never expressed the intended setting.
        case_sensitive = False
        env_file = ".env"


class OneNoteLoader(BaseLoader, BaseModel):
    """Load pages from OneNote notebooks through the Microsoft Graph API.

    Pages are selected either explicitly via ``object_ids`` or by listing the
    ``/pages`` endpoint, optionally filtered by notebook name, section name
    and page title. The HTML of each page is reduced to plain text with
    BeautifulSoup.
    """

    settings: _OneNoteGraphSettings = Field(default_factory=_OneNoteGraphSettings)  # type: ignore[arg-type]
    """Settings for the Microsoft Graph API client."""
    auth_with_token: bool = False
    """If True, read the access token from ``token_path`` instead of running
    the interactive consent flow. Defaults to False."""
    access_token: str = ""
    """Personal access token. When non-empty, authentication is skipped."""
    onenote_api_base_url: str = "https://graph.microsoft.com/v1.0/me/onenote"
    """URL of the Microsoft Graph API for OneNote."""
    authority_url: str = "https://login.microsoftonline.com/consumers/"
    """A URL that identifies the token authority."""
    token_path: FilePath = Path.home() / ".credentials" / "onenote_graph_token.txt"
    """Path of the file in which the access token is stored."""
    notebook_name: Optional[str] = None
    """Filter pages by notebook name."""
    section_name: Optional[str] = None
    """Filter pages by section name."""
    page_title: Optional[str] = None
    """Filter pages by page title."""
    object_ids: Optional[List[str]] = None
    """IDs of the pages to load. When set, the name/title filters are not used."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazily fetch pages from OneNote and yield them as documents.

        Yields:
            One ``Document`` per page, with the page text as ``page_content``
            and ``{"title": <page title>}`` as metadata. The title is an
            empty string when the page HTML has no ``<title>`` tag.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
            requests.HTTPError: If a Graph API request returns an error status.
        """
        # Check the optional dependency first so a missing package is reported
        # before the (possibly interactive) authentication flow starts.
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install bs4`"
            ) from e

        self._auth()

        for page_id in self._iter_page_ids():
            soup = BeautifulSoup(self._get_page_content(page_id), "html.parser")
            title_tag = soup.title
            page_title = title_tag.get_text(strip=True) if title_tag else ""
            # The text is extracted for every page, whether or not it has a
            # title tag, so each document carries its own content.
            page_content = soup.get_text(separator="\n", strip=True)
            yield Document(page_content=page_content, metadata={"title": page_title})

    def _iter_page_ids(self) -> Iterator[str]:
        """Yield the IDs of the pages to load.

        Uses ``object_ids`` when given; otherwise lists pages from the API,
        following ``@odata.nextLink`` until no further link is returned.
        """
        if self.object_ids is not None:
            yield from self.object_ids
            return

        request_url = self._url
        while request_url:
            response = requests.get(request_url, headers=self._headers, timeout=10)
            response.raise_for_status()
            pages = response.json()

            for page in pages["value"]:
                yield page["id"]

            request_url = pages.get("@odata.nextLink", "")

    def _get_page_content(self, page_id: str) -> str:
        """Return the body of the content response for one page.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
        response = requests.get(request_url, headers=self._headers, timeout=10)
        response.raise_for_status()
        return response.text

    @property
    def _headers(self) -> Dict[str, str]:
        """Return the headers sent with every OneNote API request."""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @property
    def _scopes(self) -> List[str]:
        """Return the scopes requested during authentication."""
        return ["Notes.Read"]

    def _auth(self) -> None:
        """Ensure ``access_token`` is populated.

        Does nothing when a token is already set. With ``auth_with_token``
        the token is read from ``token_path``; otherwise an interactive MSAL
        authorization-code flow is run and the resulting token is written to
        ``token_path``.

        Raises:
            ImportError: If ``msal`` is needed but not installed.
            Exception: If the folder for the token file cannot be created.
        """
        if self.access_token != "":
            return

        if self.auth_with_token:
            with self.token_path.open("r") as token_file:
                # Strip surrounding whitespace: a trailing newline in the
                # file would otherwise end up inside the Authorization header.
                self.access_token = token_file.read().strip()
            return

        try:
            from msal import ConfidentialClientApplication
        except ImportError as e:
            raise ImportError(
                "MSAL package not found, please install it with `pip install msal`"
            ) from e

        client_instance = ConfidentialClientApplication(
            client_id=self.settings.client_id,
            client_credential=self.settings.client_secret.get_secret_value(),
            authority=self.authority_url,
        )

        authorization_request_url = client_instance.get_authorization_request_url(
            self._scopes
        )
        print("Visit the following url to give consent:")  # noqa: T201
        print(authorization_request_url)  # noqa: T201
        authorization_url = input("Paste the authenticated url here:\n")

        # Take the text between "code=" and the next "&" in the pasted URL.
        authorization_code = authorization_url.split("code=")[1].split("&")[0]
        access_token_json = client_instance.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self._scopes
        )
        self.access_token = access_token_json["access_token"]

        try:
            # exist_ok avoids the check-then-create race of a separate exists().
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            raise Exception(
                f"Could not create the folder {self.token_path.parent} "
                + "to store the access token."
            ) from e

        with self.token_path.open("w") as token_file:
            token_file.write(self.access_token)

    @property
    def _url(self) -> str:
        """Build the URL used to list page IDs from the OneNote API.

        The query always selects only ``id``. Filters on notebook name,
        section name and page title are added when the matching attribute is
        set, together with the ``$expand`` entries added for the notebook and
        section filters.
        """

        def _eq_clause(field: str, value: str) -> str:
            # Double single quotes for the OData string literal, then
            # percent-encode the value so characters such as "&", "#" or "+"
            # cannot break the query string.
            literal = quote(value.replace("'", "''"), safe="")
            return f"{field}%20eq%20'{literal}'"

        filter_list = []
        expand_list = []

        if self.notebook_name is not None:
            filter_list.append(
                _eq_clause("parentNotebook/displayName", self.notebook_name)
            )
            expand_list.append("parentNotebook")
        if self.section_name is not None:
            filter_list.append(
                _eq_clause("parentSection/displayName", self.section_name)
            )
            expand_list.append("parentSection")
        if self.page_title is not None:
            filter_list.append(_eq_clause("title", self.page_title))

        query_params_list = ["$select=id"]
        if expand_list:
            query_params_list.append("$expand=" + ",".join(expand_list))
        if filter_list:
            query_params_list.append("$filter=" + "%20and%20".join(filter_list))

        # "$select=id" is always present, so the query string is never empty.
        query_params = "&".join(query_params_list)
        return f"{self.onenote_api_base_url}/pages?{query_params}"