# Source code for langchain_community.utilities.arxiv

"""Utilities for calling the arXiv API."""
import logging
import os
import re
from typing import Any, Dict, Iterator, List, Optional

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class ArxivAPIWrapper(BaseModel):
    """Wrapper around the arXiv API.

    To use, you should have the ``arxiv`` python package installed.
    https://lukasschwab.me/arxiv.py/index.html

    This wrapper uses the arXiv API to run a search and fetch document
    summaries. By default it returns the summaries of the top-k results.
    If the query is made up of arXiv identifiers
    (see https://info.arxiv.org/help/find/index.html), it returns the papers
    corresponding to those identifiers.
    Document content is limited by doc_content_chars_max.
    Set doc_content_chars_max to None if you don't want to limit the content
    size.

    Attributes:
        top_k_results: number of results requested by ``run`` and
            ``get_summaries_as_docs``.
        ARXIV_MAX_QUERY_LENGTH: the query is cut to this many characters
            before it is sent to arXiv.
        continue_on_failure (bool): if True, ``lazy_load`` logs and skips a
            result whose PDF download or parsing raises an unexpected
            exception instead of re-raising it.
        load_max_docs: number of results requested by ``load`` and
            ``lazy_load``.
        load_all_available_meta:
            if True: the ``metadata`` of the loaded Documents contains all
            available meta info
            (see https://lukasschwab.me/arxiv.py/index.html#Result),
            if False: the ``metadata`` contains only the published date,
            title, authors and summary.
        doc_content_chars_max: an optional cut limit for the length of a
            document's content.

    Example:
        .. code-block:: python

            from langchain_community.utilities.arxiv import ArxivAPIWrapper
            arxiv = ArxivAPIWrapper(
                top_k_results = 3,
                ARXIV_MAX_QUERY_LENGTH = 300,
                load_max_docs = 3,
                load_all_available_meta = False,
                doc_content_chars_max = 40000
            )
            arxiv.run("tree of thought llm")
    """

    arxiv_search: Any  #: :meta private:
    arxiv_exceptions: Any  # :meta private:
    top_k_results: int = 3
    ARXIV_MAX_QUERY_LENGTH: int = 300
    continue_on_failure: bool = False
    load_max_docs: int = 100
    load_all_available_meta: bool = False
    doc_content_chars_max: Optional[int] = 4000

    def is_arxiv_identifier(self, query: str) -> bool:
        """Check whether the query consists solely of arXiv identifiers.

        Only the first ``ARXIV_MAX_QUERY_LENGTH`` characters are inspected.
        The query is split on whitespace and every token must match the
        identifier pattern in full.

        Args:
            query: the raw query string.

        Returns:
            True if there is at least one token and all tokens are
            identifiers; False otherwise (including for an empty or
            whitespace-only query).
        """
        arxiv_identifier_pattern = r"\d{2}(0[1-9]|1[0-2])\.\d{4,5}(v\d+|)|\d{7}.*"
        query_items = query[: self.ARXIV_MAX_QUERY_LENGTH].split()
        if not query_items:
            # A query without any token is not an identifier query.
            return False
        return all(
            re.fullmatch(arxiv_identifier_pattern, query_item) is not None
            for query_item in query_items
        )

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the ``arxiv`` python package exists in the environment.

        Stores the package's ``Search`` class, its exception types and its
        ``Result`` class in ``values``.

        Raises:
            ImportError: if the ``arxiv`` package cannot be imported.
        """
        try:
            import arxiv

            values["arxiv_search"] = arxiv.Search
            values["arxiv_exceptions"] = (
                arxiv.ArxivError,
                arxiv.UnexpectedEmptyPageError,
                arxiv.HTTPError,
            )
            values["arxiv_result"] = arxiv.Result
        except ImportError as err:
            raise ImportError(
                "Could not import arxiv python package. "
                "Please install it with `pip install arxiv`."
            ) from err
        return values

    def _search(self, query: str, max_results: int) -> Any:
        """Build the arXiv search for ``query`` and return its results iterable.

        The query is first cut to ``ARXIV_MAX_QUERY_LENGTH`` characters. If
        what remains is made up of arXiv identifiers it is sent as an
        ``id_list``; otherwise it is sent as a free-text query.
        """
        truncated_query = query[: self.ARXIV_MAX_QUERY_LENGTH]
        if self.is_arxiv_identifier(truncated_query):
            return self.arxiv_search(
                id_list=truncated_query.split(),
                max_results=max_results,
            ).results()
        return self.arxiv_search(  # type: ignore
            truncated_query, max_results=max_results
        ).results()

    def get_summaries_as_docs(self, query: str) -> List[Document]:
        """Run an arXiv search and return Documents with summaries as content.

        Wrapper for https://lukasschwab.me/arxiv.py/index.html#Search

        Args:
            query: a plaintext search query

        Returns:
            One Document per result, with the summary as ``page_content`` and
            "Entry ID", "Published", "Title" and "Authors" in ``metadata``.
            If an arXiv exception occurs, a single Document whose content is
            the error text. If nothing is found, an empty list.
        """  # noqa: E501
        try:
            # Consume the results inside the try block: if the iterable is
            # lazy, API errors are only raised while iterating over it.
            results = list(self._search(query, self.top_k_results))
        except self.arxiv_exceptions as ex:
            return [Document(page_content=f"Arxiv exception: {ex}")]
        return [
            Document(
                page_content=result.summary,
                metadata={
                    "Entry ID": result.entry_id,
                    # The "Published" key is filled from the result's
                    # ``updated`` timestamp.
                    "Published": result.updated.date(),
                    "Title": result.title,
                    "Authors": ", ".join(a.name for a in result.authors),
                },
            )
            for result in results
        ]

    def run(self, query: str) -> str:
        """Run an arXiv search and return the results as a single string.

        Wrapper for https://lukasschwab.me/arxiv.py/index.html#Search

        Args:
            query: a plaintext search query

        Returns:
            The published date, title, authors and summary of each result,
            with results separated by two newlines and the whole string cut
            to ``doc_content_chars_max`` characters. If an arXiv exception
            occurs or nothing is found, an error text instead.
        """  # noqa: E501
        try:
            # Consume the results inside the try block: if the iterable is
            # lazy, API errors are only raised while iterating over it.
            results = list(self._search(query, self.top_k_results))
        except self.arxiv_exceptions as ex:
            return f"Arxiv exception: {ex}"
        docs = [
            f"Published: {result.updated.date()}\n"
            f"Title: {result.title}\n"
            f"Authors: {', '.join(a.name for a in result.authors)}\n"
            f"Summary: {result.summary}"
            for result in results
        ]
        if docs:
            # Slicing with None (no limit configured) keeps the full string.
            return "\n\n".join(docs)[: self.doc_content_chars_max]
        else:
            return "No good Arxiv Result was found"

    def load(self, query: str) -> List[Document]:
        """Run an arXiv search and return article texts with meta information.

        See https://lukasschwab.me/arxiv.py/index.html#Search

        Eagerly collects everything produced by ``lazy_load``: the search
        results are downloaded as PDFs, converted to text and returned as a
        list of Documents.

        Args:
            query: a plaintext search query

        Returns:
            A list of Documents with the article text as ``page_content``.
        """
        return list(self.lazy_load(query))

    def lazy_load(self, query: str) -> Iterator[Document]:
        """Run an arXiv search and lazily yield article texts with meta information.

        See https://lukasschwab.me/arxiv.py/index.html#Search

        Requests up to ``load_max_docs`` results, downloads each one as a
        PDF, extracts its text and yields it as a Document. The downloaded
        PDF is deleted as soon as its text has been extracted or the
        extraction has failed.

        Args:
            query: a plaintext search query

        Yields:
            Documents whose ``page_content`` is the article text cut to
            ``doc_content_chars_max`` characters.

        Raises:
            ImportError: if PyMuPDF (``fitz``) is not installed.
            Exception: any unexpected download or parsing error, unless
                ``continue_on_failure`` is True.
        """
        try:
            import fitz
        except ImportError as err:
            raise ImportError(
                "PyMuPDF package not found, please install it with "
                "`pip install pymupdf`"
            ) from err

        # Remove the ":" and "-" from the query, as they can cause search problems
        query = query.replace(":", "").replace("-", "")
        try:
            # NOTE(review): the results are consumed lazily below, so if the
            # iterable is lazy, API errors raised during iteration are not
            # handled by this except clause - confirm whether that is intended.
            results = self._search(query, self.load_max_docs)
        except self.arxiv_exceptions as ex:
            logger.debug("Error on arxiv: %s", ex)
            return

        for result in results:
            doc_file_name: Optional[str] = None
            try:
                doc_file_name = result.download_pdf()
                with fitz.open(doc_file_name) as doc_file:
                    text: str = "".join(page.get_text() for page in doc_file)
            except (FileNotFoundError, fitz.fitz.FileDataError) as f_ex:
                logger.debug(f_ex)
                continue
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(e)
                    continue
                raise
            finally:
                # The text is already in memory (or extraction failed), so the
                # downloaded file is removed here on every path, including
                # when the consumer never resumes this generator.
                if doc_file_name is not None and os.path.exists(doc_file_name):
                    os.remove(doc_file_name)

            if self.load_all_available_meta:
                extra_metadata = {
                    "entry_id": result.entry_id,
                    "published_first_time": str(result.published.date()),
                    "comment": result.comment,
                    "journal_ref": result.journal_ref,
                    "doi": result.doi,
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "links": [link.href for link in result.links],
                }
            else:
                extra_metadata = {}
            metadata = {
                "Published": str(result.updated.date()),
                "Title": result.title,
                "Authors": ", ".join(a.name for a in result.authors),
                "Summary": result.summary,
                **extra_metadata,
            }
            yield Document(
                page_content=text[: self.doc_content_chars_max], metadata=metadata
            )