Source code for langchain_community.utilities.pubmed
import json
import logging
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterator, List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

logger = logging.getLogger(__name__)


class PubMedAPIWrapper(BaseModel):
    """Wrapper around the PubMed API.

    This wrapper will use the PubMed API to conduct searches and fetch
    document summaries. By default, it will return the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of the top-scored documents used for the PubMed tool.
        MAX_QUERY_LENGTH: maximum length of the query.
            Default is 300 characters.
        doc_content_chars_max: maximum length of the document content.
            Content will be truncated if it exceeds this length.
            Default is 2000 characters.
        max_retry: maximum number of retries for a request. Default is 5.
        sleep_time: time to wait between retries.
            Default is 0.2 seconds.
        email: email address to be used for the PubMed API.
    """
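
    # Usage sketch (assumes the ``xmltodict`` package is installed and the NCBI
    # E-utilities endpoints are reachable; the query is an arbitrary example):
    #
    #     wrapper = PubMedAPIWrapper(top_k_results=5, email="me@example.org")
    #     print(wrapper.run("crispr gene editing"))
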
    parse: Any  #: :meta private:

    base_url_esearch: str = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    )
    base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    max_retry: int = 5
    sleep_time: float = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    MAX_QUERY_LENGTH: int = 300
    doc_content_chars_max: int = 2000
    email: str = "your_email@example.com"

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the required python package exists in the environment."""
        try:
            import xmltodict

            values["parse"] = xmltodict.parse
        except ImportError:
            raise ImportError(
                "Could not import xmltodict python package. "
                "Please install it with `pip install xmltodict`."
            )
        return values
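
    # For reference, ``xmltodict.parse`` maps XML onto nested dicts, e.g.
    # ``xmltodict.parse("<a><b>1</b></a>")`` yields ``{"a": {"b": "1"}}``;
    # this is the shape that ``_parse_article`` traverses below.
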
    def run(self, query: str) -> str:
        """Run PubMed search and get the article meta information.

        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of article meta information.
        """
        try:
            # Retrieve the top-k results for the query
            docs = [
                f"Published: {result['Published']}\n"
                f"Title: {result['Title']}\n"
                f"Copyright Information: {result['Copyright Information']}\n"
                f"Summary:\n{result['Summary']}"
                for result in self.load(query[: self.MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            return f"PubMed exception: {ex}"
    def lazy_load(self, query: str) -> Iterator[dict]:
        """Search PubMed for documents matching the query.

        Return an iterator of dictionaries containing the document metadata.
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        result = urllib.request.urlopen(url)
        text = result.read().decode("utf-8")
        json_text = json.loads(text)
        webenv = json_text["esearchresult"]["webenv"]
        for uid in json_text["esearchresult"]["idlist"]:
            yield self.retrieve_article(uid, webenv)
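
    # With the defaults above, the esearch URL built for the query
    # "heart attack" (an arbitrary example) is:
    #
    #     https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed
    #         &term=heart%20attack&retmode=json&retmax=3&usehistory=y
    #
    # The JSON response provides ``esearchresult.idlist`` (the PubMed UIDs) and
    # ``esearchresult.webenv`` (a history token passed to ``retrieve_article``).
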
    def load(self, query: str) -> List[dict]:
        """Search PubMed for documents matching the query.

        Return a list of dictionaries containing the document metadata.
        """
        return list(self.lazy_load(query))

    def _dict2document(self, doc: dict) -> Document:
        # Use the abstract as the page content; the remaining fields become
        # the document metadata.
        summary = doc.pop("Summary")
        return Document(page_content=summary, metadata=doc)

    def lazy_load_docs(self, query: str) -> Iterator[Document]:
        for d in self.lazy_load(query=query):
            yield self._dict2document(d)

    def load_docs(self, query: str) -> List[Document]:
        return list(self.lazy_load_docs(query=query))
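
    # Sketch of consuming ``load_docs`` (the query is an arbitrary example):
    #
    #     docs = PubMedAPIWrapper().load_docs("heart attack")
    #     docs[0].page_content  # the abstract ("Summary")
    #     docs[0].metadata      # uid, Title, Published, Copyright Information
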
    def retrieve_article(self, uid: str, webenv: str) -> dict:
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests errors
                    # wait for an exponentially increasing amount of time
                    print(  # noqa: T201
                        f"Too Many Requests, "
                        f"waiting for {self.sleep_time:.2f} seconds..."
                    )
                    time.sleep(self.sleep_time)
                    self.sleep_time *= 2
                    retry += 1
                else:
                    raise e

        xml_text = result.read().decode("utf-8")
        text_dict = self.parse(xml_text)
        return self._parse_article(uid, text_dict)
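
    # With the defaults (sleep_time=0.2, max_retry=5), repeated HTTP 429
    # responses are retried after 0.2, 0.4, 0.8, 1.6 and 3.2 seconds before the
    # error is re-raised. Note that ``self.sleep_time`` is mutated in place, so
    # the backoff carries over to later calls on the same wrapper instance.
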
    def _parse_article(self, uid: str, text_dict: dict) -> dict:
        try:
            ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
                "Article"
            ]
        except KeyError:
            ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]
        abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
        summaries = [
            f"{txt['@Label']}: {txt['#text']}"
            for txt in abstract_text
            if "#text" in txt and "@Label" in txt
        ]
        summary = (
            "\n".join(summaries)
            if summaries
            else (
                abstract_text
                if isinstance(abstract_text, str)
                else (
                    "\n".join(str(value) for value in abstract_text.values())
                    if isinstance(abstract_text, dict)
                    else "No abstract available"
                )
            )
        )
        a_d = ar.get("ArticleDate", {})
        pub_date = "-".join(
            [a_d.get("Year", ""), a_d.get("Month", ""), a_d.get("Day", "")]
        )

        return {
            "uid": uid,
            "Title": ar.get("ArticleTitle", ""),
            "Published": pub_date,
            "Copyright Information": ar.get("Abstract", {}).get(
                "CopyrightInformation", ""
            ),
            "Summary": summary,
        }
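

# Minimal end-to-end sketch, assuming ``xmltodict`` is installed and the NCBI
# E-utilities endpoints are reachable; the query string is an arbitrary example.
if __name__ == "__main__":
    wrapper = PubMedAPIWrapper(top_k_results=1)
    print(wrapper.run("covid-19 vaccine"))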