"""调用Arxiv的工具。"""
import logging
import os
import re
from typing import Any, Dict, Iterator, List, Optional
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
logger = logging.getLogger(__name__)
class ArxivAPIWrapper(BaseModel):
    """Wrapper around the arXiv API.

    To use it, the ``arxiv`` python package must be installed.
    https://lukasschwab.me/arxiv.py/index.html

    The wrapper runs a search through the arXiv API and fetches document
    summaries. By default it returns the summaries of the top-k results.
    If the query consists of arXiv identifiers
    (see https://info.arxiv.org/help/find/index.html), the papers matching
    those identifiers are requested instead.
    Document content is limited by ``doc_content_chars_max``; set it to
    ``None`` to disable the limit.

    Attributes:
        top_k_results: maximum number of results requested by ``run`` and
            ``get_summaries_as_docs``.
        ARXIV_MAX_QUERY_LENGTH: number of leading characters of the query
            that are actually used; the rest is cut off.
        continue_on_failure (bool): if True, ``lazy_load`` logs an unexpected
            error for a result and moves on to the next one instead of
            raising.
        load_max_docs: maximum number of results requested by ``load`` and
            ``lazy_load``.
        load_all_available_meta:
            if True: the ``metadata`` of loaded documents also contains the
            extra meta information listed in ``lazy_load``
            (see https://lukasschwab.me/arxiv.py/index.html#Result),
            if False: the ``metadata`` only contains the published date,
            title, authors and summary.
        doc_content_chars_max: optional cut-off limit on the length of the
            returned content.

    Example:
        .. code-block:: python

            from langchain_community.utilities.arxiv import ArxivAPIWrapper
            arxiv = ArxivAPIWrapper(
                top_k_results = 3,
                ARXIV_MAX_QUERY_LENGTH = 300,
                load_max_docs = 3,
                load_all_available_meta = False,
                doc_content_chars_max = 40000
            )
            arxiv.run("tree of thought llm")
    """

    arxiv_search: Any  #: :meta private:
    arxiv_exceptions: Any  # :meta private:
    top_k_results: int = 3
    ARXIV_MAX_QUERY_LENGTH: int = 300
    continue_on_failure: bool = False
    load_max_docs: int = 100
    load_all_available_meta: bool = False
    doc_content_chars_max: Optional[int] = 4000

    def is_arxiv_identifier(self, query: str) -> bool:
        """Check whether the query consists solely of arXiv identifiers.

        Only the first ``ARXIV_MAX_QUERY_LENGTH`` characters are inspected.
        The query is split on whitespace and every item has to match the
        identifier pattern in full.

        Args:
            query: a plaintext search query.

        Returns:
            True if there is at least one item and every item is an arXiv
            identifier; False otherwise (including an empty or
            whitespace-only query).
        """
        arxiv_identifier_pattern = r"\d{2}(0[1-9]|1[0-2])\.\d{4,5}(v\d+|)|\d{7}.*"
        query_items = query[: self.ARXIV_MAX_QUERY_LENGTH].split()
        if not query_items:
            # A query without any item is not an identifier query.
            return False
        return all(
            re.fullmatch(arxiv_identifier_pattern, query_item) is not None
            for query_item in query_items
        )

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the ``arxiv`` python package exists in the environment.

        Stores the package's search class, its exception types and its result
        class in ``values``.

        Raises:
            ImportError: if the ``arxiv`` package cannot be imported.
        """
        try:
            import arxiv
        except ImportError as ex:
            raise ImportError(
                "Could not import arxiv python package. "
                "Please install it with `pip install arxiv`."
            ) from ex

        values["arxiv_search"] = arxiv.Search
        values["arxiv_exceptions"] = (
            arxiv.ArxivError,
            arxiv.UnexpectedEmptyPageError,
            arxiv.HTTPError,
        )
        values["arxiv_result"] = arxiv.Result
        return values

    def _fetch_results(self, query: str, max_results: int) -> Any:
        """Create the arxiv search for ``query`` and return its results iterable.

        The query is always cut to ``ARXIV_MAX_QUERY_LENGTH`` characters, which
        is the same part that ``is_arxiv_identifier`` validates.
        """
        truncated_query = query[: self.ARXIV_MAX_QUERY_LENGTH]
        if self.is_arxiv_identifier(query):
            return self.arxiv_search(
                id_list=truncated_query.split(),
                max_results=max_results,
            ).results()
        return self.arxiv_search(  # type: ignore
            truncated_query, max_results=max_results
        ).results()

    def get_summaries_as_docs(self, query: str) -> List[Document]:
        """Run an arxiv search and return Documents with the summary as content.

        Wrapper around https://lukasschwab.me/arxiv.py/index.html#Search

        Args:
            query: a plaintext search query.

        Returns:
            One Document per result, with the entry id, date, title and
            authors in its metadata. If one of the arxiv exceptions occurs,
            a single Document carrying the error text is returned instead.
            The list is empty when nothing is found.
        """  # noqa: E501
        try:
            # Consume the results inside the try block so that errors raised
            # while iterating are handled as well.
            # NOTE(review): presumably the arxiv package fetches results
            # lazily - confirm against the installed version.
            results = list(self._fetch_results(query, self.top_k_results))
        except self.arxiv_exceptions as ex:
            return [Document(page_content=f"Arxiv exception: {ex}")]
        return [
            Document(
                page_content=result.summary,
                metadata={
                    "Entry ID": result.entry_id,
                    "Published": result.updated.date(),
                    "Title": result.title,
                    "Authors": ", ".join(a.name for a in result.authors),
                },
            )
            for result in results
        ]

    def run(self, query: str) -> str:
        """Run an arxiv search and return the results as a single string.

        Each article is rendered with its date, title, authors and summary;
        articles are separated by two newlines and the whole string is cut
        to ``doc_content_chars_max`` characters.
        Wrapper around https://lukasschwab.me/arxiv.py/index.html#Search

        Args:
            query: a plaintext search query.

        Returns:
            The formatted results, an error text if one of the arxiv
            exceptions occurs, or a fixed message if nothing is found.
        """  # noqa: E501
        try:
            # Consume the results inside the try block so that errors raised
            # while iterating are handled as well.
            # NOTE(review): presumably the arxiv package fetches results
            # lazily - confirm against the installed version.
            results = list(self._fetch_results(query, self.top_k_results))
        except self.arxiv_exceptions as ex:
            return f"Arxiv exception: {ex}"
        docs = [
            f"Published: {result.updated.date()}\n"
            f"Title: {result.title}\n"
            f"Authors: {', '.join(a.name for a in result.authors)}\n"
            f"Summary: {result.summary}"
            for result in results
        ]
        if docs:
            return "\n\n".join(docs)[: self.doc_content_chars_max]
        return "No good Arxiv Result was found"

    def load(self, query: str) -> List[Document]:
        """Run an arxiv search and get the article texts plus meta information.

        See https://lukasschwab.me/arxiv.py/index.html#Search
        Eager variant of ``lazy_load``: requests up to ``load_max_docs``
        results, downloads them as PDF and converts them to Documents.

        Args:
            query: a plaintext search query.

        Returns:
            A list of Documents whose ``page_content`` is the article text.
        """
        return list(self.lazy_load(query))

    def lazy_load(self, query: str) -> Iterator[Document]:
        """Run an arxiv search and lazily yield article texts with meta information.

        See https://lukasschwab.me/arxiv.py/index.html#Search
        Requests up to ``load_max_docs`` results, downloads each one as PDF,
        extracts its text and yields it as a Document. The downloaded file is
        deleted as soon as its text has been extracted.

        Args:
            query: a plaintext search query.

        Yields:
            Documents whose ``page_content`` is the article text, cut to
            ``doc_content_chars_max`` characters.

        Raises:
            ImportError: if the PyMuPDF package is not installed.
            Exception: any unexpected download/parsing error, unless
                ``continue_on_failure`` is True.
        """
        try:
            import fitz
        except ImportError:
            raise ImportError(
                "PyMuPDF package not found, please install it with "
                "`pip install pymupdf`"
            )

        # Resolve FileDataError up front: a missing attribute inside the
        # ``except`` clause would raise AttributeError and hide the real error.
        # NOTE(review): the attribute path appears to differ between PyMuPDF
        # versions - confirm which ones need to be supported.
        skippable_errors: tuple = (FileNotFoundError,)
        for namespace in (fitz, getattr(fitz, "fitz", None)):
            file_data_error = getattr(namespace, "FileDataError", None)
            if file_data_error is not None:
                skippable_errors += (file_data_error,)
                break

        try:
            # Remove the ":" and "-" from the query, as they can cause search problems
            query = query.replace(":", "").replace("-", "")
            results = self._fetch_results(query, self.load_max_docs)
        except self.arxiv_exceptions as ex:
            logger.debug("Error on arxiv: %s", ex)
            return

        for result in results:
            doc_file_name: Optional[str] = None
            try:
                doc_file_name = result.download_pdf()
                with fitz.open(doc_file_name) as doc_file:
                    text: str = "".join(page.get_text() for page in doc_file)
            except skippable_errors as f_ex:
                logger.debug(f_ex)
                continue
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(e)
                    continue
                raise
            finally:
                # Clean up on every path (success, skip or error) so that the
                # downloaded PDF never outlives the text extraction.
                if doc_file_name is not None and os.path.exists(doc_file_name):
                    os.remove(doc_file_name)

            if self.load_all_available_meta:
                extra_metadata = {
                    "entry_id": result.entry_id,
                    "published_first_time": str(result.published.date()),
                    "comment": result.comment,
                    "journal_ref": result.journal_ref,
                    "doi": result.doi,
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "links": [link.href for link in result.links],
                }
            else:
                extra_metadata = {}
            metadata = {
                "Published": str(result.updated.date()),
                "Title": result.title,
                "Authors": ", ".join(a.name for a in result.authors),
                "Summary": result.summary,
                **extra_metadata,
            }
            yield Document(
                page_content=text[: self.doc_content_chars_max], metadata=metadata
            )