# Source code for langchain_community.document_loaders.rspace

import os
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.base import BaseLoader


class RSpaceLoader(BaseLoader):
    """Load content from RSpace notebooks, folders, documents or PDF Gallery files.

    RSpace documents are mapped to LangChain documents one-to-one. PDF files
    are imported using PyPDF.

    Requires rspace_client (`pip install rspace_client`) and, when importing
    PDF documents, PyPDF (`pip install pypdf`).
    """

    def __init__(
        self, global_id: str, api_key: Optional[str] = None, url: Optional[str] = None
    ):
        """Store the validated connection settings and the resource to load.

        Args:
            global_id: Global ID of the resource to load, e.g. 'SD12344'
                (a single document); 'GL12345' (a PDF file in the gallery);
                'NB4567' (a notebook); 'FL12244' (a folder).
            api_key: RSpace API key - can also be supplied as the environment
                variable 'RSPACE_API_KEY'.
            url: URL of your RSpace instance - can also be supplied as the
                environment variable 'RSPACE_URL'.
        """
        args: Dict[str, Optional[str]] = {
            "api_key": api_key,
            "url": url,
            "global_id": global_id,
        }
        verified_args: Dict[str, str] = RSpaceLoader.validate_environment(args)
        self.api_key = verified_args["api_key"]
        self.url = verified_args["url"]
        self.global_id: str = verified_args["global_id"]

    @classmethod
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key and URL exist in `values` or the environment.

        `values` is updated in place and also returned.

        Raises:
            ValueError: If no `global_id` is supplied. (Missing key/URL errors
                come from `get_from_dict_or_env`; its exception type is not
                visible here.)
        """
        values["api_key"] = get_from_dict_or_env(values, "api_key", "RSPACE_API_KEY")
        values["url"] = get_from_dict_or_env(values, "url", "RSPACE_URL")
        if values.get("global_id") is None:
            raise ValueError(
                "No value supplied for global_id. Please supply an RSpace global ID"
            )
        return values

    def _create_rspace_client(self) -> Any:
        """Create an RSpace client.

        Returns:
            A `(client, FieldContent)` pair: the connected ELN client and the
            class used to wrap field content.

        Raises:
            ImportError: If `rspace_client` is not installed.
            Exception: If the client cannot be created or its status check fails.
        """
        try:
            from rspace_client.eln import eln, field_content
        except ImportError as e:
            raise ImportError("You must run `pip install rspace_client`") from e

        try:
            # Use a distinct name so the imported `eln` module is not shadowed.
            client = eln.ELNClient(self.url, self.api_key)
            # Status call doubles as a connectivity / credentials check.
            client.get_status()
        except Exception as e:
            # Keep the broad Exception type callers may already catch, but
            # chain the original error so the root cause is not lost.
            raise Exception(
                f"Unable to initialize client - is url {self.url} or "
                f"api key correct?"
            ) from e

        return client, field_content.FieldContent

    def _get_doc(self, cli: Any, field_content: Any, d_id: Union[str, int]) -> Document:
        """Fetch one RSpace document and flatten it into a LangChain Document.

        The page content is an `<h2>` heading with the document name followed,
        for every field, by the field name and the text obtained from
        `field_content(...).get_text()`.
        """
        doc = cli.get_document(d_id)
        # The closing tag was previously emitted as the malformed "<h2/>".
        parts: List[str] = [f"<h2>{doc['name']}</h2>"]
        for field in doc["fields"]:
            parts.append(f"{field['name']}\n")
            parts.append(field_content(field["content"]).get_text())
            parts.append("\n")
        return Document(
            metadata={"source": f"rspace: {doc['name']}-{doc['globalId']}"},
            page_content="".join(parts),
        )

    def _load_structured_doc(self) -> Iterator[Document]:
        """Yield the single structured document identified by `global_id`."""
        cli, field_content = self._create_rspace_client()
        yield self._get_doc(cli, field_content, self.global_id)

    def _load_folder_tree(self) -> Iterator[Document]:
        """Yield every document listed under the folder/notebook `global_id`."""
        if not self.global_id:
            # Nothing to list; previously this path hit an unbound local.
            return
        cli, field_content = self._create_rspace_client()
        # Strip the two-letter type prefix to pass the remainder as folder id.
        docs_in_folder = cli.list_folder_tree(
            folder_id=self.global_id[2:], typesToInclude=["document"]
        )
        for record in docs_in_folder["records"]:
            yield self._get_doc(cli, field_content, record["id"])

    def _load_pdf(self) -> Iterator[Document]:
        """Download a gallery file and, if it is a PDF, yield its pages.

        Files whose name does not end in `.pdf` (case-insensitive) yield nothing.
        """
        cli, _ = self._create_rspace_client()
        file_info = cli.get_file_info(self.global_id)
        _, ext = os.path.splitext(file_info["name"])
        if ext.lower() != ".pdf":
            return
        # NOTE(review): the file is written to the current working directory
        # and is not removed afterwards; kept as-is because the path feeds
        # PyPDFLoader's own metadata.
        outfile = f"{self.global_id}.pdf"
        cli.download_file(self.global_id, outfile)
        pdf_loader = PyPDFLoader(outfile)
        for pdf in pdf_loader.lazy_load():
            pdf.metadata["rspace_src"] = self.global_id
            yield pdf

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents according to the type prefix of `global_id`.

        'GL' loads a gallery PDF, 'SD' a single structured document, and
        'FL' / 'NB' every document in a folder or notebook.

        Raises:
            ValueError: If the global ID does not start with a known prefix.
        """
        # Dispatch on the leading two characters for every type, rather than
        # a substring match anywhere in the ID.
        prefix = self.global_id[:2] if self.global_id else ""
        if prefix == "GL":
            yield from self._load_pdf()
        elif prefix == "SD":
            yield from self._load_structured_doc()
        elif prefix in ("FL", "NB"):
            yield from self._load_folder_tree()
        else:
            raise ValueError("Unknown global ID type")