Source code for langchain_community.document_loaders.onenote

"""Loads data from OneNote Notebooks"""

from pathlib import Path
from typing import Dict, Iterator, List, Optional
from urllib.parse import quote

import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import (
    BaseModel,
    BaseSettings,
    Field,
    FilePath,
    SecretStr,
)

from langchain_community.document_loaders.base import BaseLoader


class _OneNoteGraphSettings(BaseSettings):
    """Microsoft Graph client credentials used by the OneNote loader.

    Both fields are required and are populated from the environment
    variables named in their ``env`` arguments, or from a ``.env`` file
    (see ``Config.env_file``).
    """

    # Application (client) id, read from MS_GRAPH_CLIENT_ID.
    client_id: str = Field(..., env="MS_GRAPH_CLIENT_ID")
    # Client secret, read from MS_GRAPH_CLIENT_SECRET; wrapped in SecretStr so
    # it has to be unwrapped explicitly with ``get_secret_value()``.
    client_secret: SecretStr = Field(..., env="MS_GRAPH_CLIENT_SECRET")

    class Config:
        """Config for _OneNoteGraphSettings."""

        env_prefix = ""
        # Was misspelled ``case_sentive``, which pydantic silently ignores.
        case_sensitive = False
        env_file = ".env"


class OneNoteLoader(BaseLoader, BaseModel):
    """Load pages from OneNote notebooks through the Microsoft Graph API.

    Pages are selected either explicitly via ``object_ids`` or by filtering
    on ``notebook_name`` / ``section_name`` / ``page_title``. Each page's HTML
    is fetched, reduced to plain text with BeautifulSoup and yielded as a
    ``Document`` whose metadata carries the page title.
    """

    settings: _OneNoteGraphSettings = Field(default_factory=_OneNoteGraphSettings)  # type: ignore[arg-type]
    """Settings for the Microsoft Graph API client."""
    auth_with_token: bool = False
    """Whether to read the access token from ``token_path`` instead of running
    the interactive consent flow. Defaults to False."""
    access_token: str = ""
    """Personal access token. When non-empty, no authentication is performed."""
    onenote_api_base_url: str = "https://graph.microsoft.com/v1.0/me/onenote"
    """URL of the Microsoft Graph API for OneNote."""
    authority_url: str = "https://login.microsoftonline.com/consumers/"
    """A URL that identifies a token authority."""
    token_path: FilePath = Path.home() / ".credentials" / "onenote_graph_token.txt"
    """Path to the file where the access token is stored."""
    notebook_name: Optional[str] = None
    """Filter on notebook name."""
    section_name: Optional[str] = None
    """Filter on section name."""
    page_title: Optional[str] = None
    """Filter on page title."""
    object_ids: Optional[List[str]] = None
    """The IDs of the pages to load. When set, the name/title filters are
    not used."""

    def lazy_load(self) -> Iterator[Document]:
        """Get pages from OneNote notebooks.

        Yields:
            One ``Document`` per page with:
                - page_content: the page text, one text node per line
                - metadata
                    - title: text of the page's ``<title>`` tag, or ``""``

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
            requests.HTTPError: if a Graph API request fails.
        """
        # Check the optional dependency before authenticating so a missing
        # package is reported before the (possibly interactive) consent flow.
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install bs4`"
            ) from e

        self._auth()

        for page_id in self._iter_page_ids():
            page_content_html = self._get_page_content(page_id)
            soup = BeautifulSoup(page_content_html, "html.parser")
            title_tag = soup.title
            page_title = title_tag.get_text(strip=True) if title_tag else ""
            # The text is extracted whether or not the page has a title.
            page_content = soup.get_text(separator="\n", strip=True)
            yield Document(page_content=page_content, metadata={"title": page_title})

    def _iter_page_ids(self) -> Iterator[str]:
        """Yield the ids of the pages to load.

        Uses ``object_ids`` verbatim when provided; otherwise queries the
        filtered page listing and follows ``@odata.nextLink`` until the
        listing is exhausted.
        """
        if self.object_ids is not None:
            yield from self.object_ids
            return

        request_url = self._url
        while request_url != "":
            response = requests.get(request_url, headers=self._headers, timeout=10)
            response.raise_for_status()
            pages = response.json()

            for page in pages["value"]:
                yield page["id"]

            # An absent next link marks the last page of results.
            request_url = pages.get("@odata.nextLink", "")

    def _get_page_content(self, page_id: str) -> str:
        """Get the raw HTML content of a page from the OneNote API."""
        request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
        response = requests.get(request_url, headers=self._headers, timeout=10)
        response.raise_for_status()
        return response.text

    @property
    def _headers(self) -> Dict[str, str]:
        """Return the headers for requests to the OneNote API."""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @property
    def _scopes(self) -> List[str]:
        """Return the required scopes."""
        return ["Notes.Read"]

    def _auth(self) -> None:
        """Authenticate with the Microsoft Graph API.

        Does nothing if ``access_token`` is already set. With
        ``auth_with_token`` the token is read from ``token_path``; otherwise
        an interactive MSAL authorization-code flow is run on stdin/stdout
        and the resulting token is written to ``token_path``.

        Raises:
            ImportError: if ``msal`` is needed but not installed.
            Exception: if the folder for ``token_path`` cannot be created.
        """
        if self.access_token != "":
            return

        if self.auth_with_token:
            with self.token_path.open("r") as token_file:
                # Strip surrounding whitespace: a trailing newline in the file
                # would otherwise end up inside the Authorization header.
                self.access_token = token_file.read().strip()
            return

        try:
            from msal import ConfidentialClientApplication
        except ImportError as e:
            raise ImportError(
                "MSAL package not found, please install it with `pip install msal`"
            ) from e

        client_instance = ConfidentialClientApplication(
            client_id=self.settings.client_id,
            client_credential=self.settings.client_secret.get_secret_value(),
            authority=self.authority_url,
        )

        authorization_request_url = client_instance.get_authorization_request_url(
            self._scopes
        )
        print("Visit the following url to give consent:")  # noqa: T201
        print(authorization_request_url)  # noqa: T201
        authorization_url = input("Paste the authenticated url here:\n")

        # The pasted redirect URL carries the code as a ``code=...`` parameter.
        authorization_code = authorization_url.split("code=")[1].split("&")[0]
        access_token_json = client_instance.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self._scopes
        )
        self.access_token = access_token_json["access_token"]

        try:
            if not self.token_path.parent.exists():
                self.token_path.parent.mkdir(parents=True)
        except Exception as e:
            raise Exception(
                f"Could not create the folder {self.token_path.parent} "
                + "to store the access token."
            ) from e

        with self.token_path.open("w") as token_file:
            token_file.write(self.access_token)

    @staticmethod
    def _odata_eq_filter(field: str, value: str) -> str:
        """Build a URL-encoded OData ``<field> eq '<value>'`` clause.

        Single quotes in ``value`` are doubled (OData string-literal escaping)
        and the value is fully percent-encoded, so characters such as ``&``,
        ``#``, ``+`` or ``%`` cannot break the query string. Spaces still
        become ``%20``.
        """
        escaped = quote(value.replace("'", "''"), safe="")
        return f"{field}%20eq%20'{escaped}'"

    @property
    def _url(self) -> str:
        """Create the URL for getting page ids from the OneNote API.

        Only the page ``id`` is selected. Each of ``notebook_name``,
        ``section_name`` and ``page_title`` that is set adds an equality
        filter; the filters are combined with ``and``.
        """
        query_params_list = ["$select=id"]
        filter_list = []
        expand_list = []

        if self.notebook_name is not None:
            filter_list.append(
                self._odata_eq_filter("parentNotebook/displayName", self.notebook_name)
            )
            expand_list.append("parentNotebook")
        if self.section_name is not None:
            filter_list.append(
                self._odata_eq_filter("parentSection/displayName", self.section_name)
            )
            expand_list.append("parentSection")
        if self.page_title is not None:
            filter_list.append(self._odata_eq_filter("title", self.page_title))

        if expand_list:
            query_params_list.append("$expand=" + ",".join(expand_list))
        if filter_list:
            query_params_list.append("$filter=" + "%20and%20".join(filter_list))

        # ``$select=id`` is always present, so the query string is never empty.
        query_params = "?" + "&".join(query_params_list)
        return f"{self.onenote_api_base_url}/pages{query_params}"