Source code for langchain_community.utilities.pubmed

import json
import logging
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterator, List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

logger = logging.getLogger(__name__)


class PubMedAPIWrapper(BaseModel):
    """Wrapper around the PubMed API.

    This wrapper will use the PubMed API to conduct searches and fetch
    document summaries. By default, it will return the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of the top-scored documents used for the PubMed tool.
        MAX_QUERY_LENGTH: maximum length of the query.
            Default is 300 characters.
        doc_content_chars_max: maximum length of the document content.
            Content will be truncated if it exceeds this length.
            Default is 2000 characters.
        max_retry: maximum number of retries for a request. Default is 5.
        sleep_time: time to wait between retries.
            Default is 0.2 seconds.
        email: email address to be used for the PubMed API.
    """

    parse: Any  #: :meta private:

    base_url_esearch: str = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    )
    base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    max_retry: int = 5
    sleep_time: float = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    MAX_QUERY_LENGTH: int = 300
    doc_content_chars_max: int = 2000
    email: str = "your_email@example.com"

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the required Python package exists in the environment."""
        try:
            import xmltodict

            values["parse"] = xmltodict.parse
        except ImportError:
            raise ImportError(
                "Could not import xmltodict python package. "
                "Please install it with `pip install xmltodict`."
            )
        return values
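A minimal usage sketch, assuming xmltodict is installed (`pip install xmltodict`) and network access to the NCBI E-utilities endpoints; the query and parameter values below are illustrative:

    from langchain_community.utilities.pubmed import PubMedAPIWrapper

    # Tighter limits than the defaults; validate_environment wires up
    # xmltodict.parse when the model is constructed.
    wrapper = PubMedAPIWrapper(top_k_results=1, doc_content_chars_max=1000)
    print(wrapper.run("chronic pain management"))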
    def run(self, query: str) -> str:
        """Run a PubMed search and get the article meta information.

        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of the article meta information.
        """
        try:
            # Retrieve the top-k results for the query
            docs = [
                f"Published: {result['Published']}\n"
                f"Title: {result['Title']}\n"
                f"Copyright Information: {result['Copyright Information']}\n"
                f"Summary:\n{result['Summary']}"
                for result in self.load(query[: self.MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            return f"PubMed exception: {ex}"
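For a single hit, the string assembled above looks like the sketch below; the field values are placeholders, not real PubMed data:

    result = PubMedAPIWrapper(top_k_results=1).run("aspirin")
    # result resembles:
    #
    # Published: 2021-05-10
    # Title: <article title>
    # Copyright Information: <copyright notice>
    # Summary:
    # BACKGROUND: ...
    # CONCLUSIONS: ...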
    def lazy_load(self, query: str) -> Iterator[dict]:
        """Search PubMed for documents matching the query.

        Return an iterator of dictionaries containing the document metadata.
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        result = urllib.request.urlopen(url)
        text = result.read().decode("utf-8")
        json_text = json.loads(text)

        webenv = json_text["esearchresult"]["webenv"]
        for uid in json_text["esearchresult"]["idlist"]:
            yield self.retrieve_article(uid, webenv)
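lazy_load drives the ESearch endpoint; a trimmed sketch of the JSON it consumes, shown as the parsed Python dict (the IDs and WebEnv token are placeholders, and a live response carries additional fields):

    json_text = {
        "esearchresult": {
            "count": "42",  # total hits for the term
            "retmax": "3",  # capped by top_k_results
            "webenv": "MCID_...",  # history token forwarded to EFetch
            "idlist": ["00000001", "00000002", "00000003"],
        }
    }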
    def load(self, query: str) -> List[dict]:
        """Search PubMed for documents matching the query.

        Return a list of dictionaries containing the document metadata.
        """
        return list(self.lazy_load(query))
    def _dict2document(self, doc: dict) -> Document:
        # Promote the abstract to page_content; everything else stays as metadata.
        summary = doc.pop("Summary")
        return Document(page_content=summary, metadata=doc)
    def lazy_load_docs(self, query: str) -> Iterator[Document]:
        for d in self.lazy_load(query=query):
            yield self._dict2document(d)
    def load_docs(self, query: str) -> List[Document]:
        return list(self.lazy_load_docs(query=query))
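The two Document-producing entry points differ only in eagerness; a short sketch, reusing an illustrative query:

    wrapper = PubMedAPIWrapper(top_k_results=2)

    # Eager: all top-k articles are fetched and converted up front.
    docs = wrapper.load_docs("chronic pain management")

    # Lazy: one EFetch round-trip per iteration, so stopping early is cheap.
    for doc in wrapper.lazy_load_docs("chronic pain management"):
        print(doc.metadata["Title"])
        break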
    def retrieve_article(self, uid: str, webenv: str) -> dict:
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests errors
                    # wait for an exponentially increasing amount of time
                    print(  # noqa: T201
                        f"Too Many Requests, "
                        f"waiting for {self.sleep_time:.2f} seconds..."
                    )
                    time.sleep(self.sleep_time)
                    self.sleep_time *= 2
                    retry += 1
                else:
                    raise e

        xml_text = result.read().decode("utf-8")
        text_dict = self.parse(xml_text)
        return self._parse_article(uid, text_dict)
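With the defaults (sleep_time=0.2, max_retry=5), each HTTP 429 doubles the wait, giving the schedule below; a standalone sketch of the arithmetic:

    sleep_time, max_retry = 0.2, 5
    waits = []
    for retry in range(max_retry):
        waits.append(sleep_time)
        sleep_time *= 2

    print(waits)       # [0.2, 0.4, 0.8, 1.6, 3.2]
    print(sum(waits))  # about 6.2 seconds of waiting in total

Note that the doubling mutates self.sleep_time in place, so the backoff state persists across calls on the same wrapper instance.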
    def _parse_article(self, uid: str, text_dict: dict) -> dict:
        try:
            ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
                "Article"
            ]
        except KeyError:
            ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]
        abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
        summaries = [
            f"{txt['@Label']}: {txt['#text']}"
            for txt in abstract_text
            if "#text" in txt and "@Label" in txt
        ]
        summary = (
            "\n".join(summaries)
            if summaries
            else (
                abstract_text
                if isinstance(abstract_text, str)
                else (
                    "\n".join(str(value) for value in abstract_text.values())
                    if isinstance(abstract_text, dict)
                    else "No abstract available"
                )
            )
        )
        a_d = ar.get("ArticleDate", {})
        pub_date = "-".join(
            [a_d.get("Year", ""), a_d.get("Month", ""), a_d.get("Day", "")]
        )

        return {
            "uid": uid,
            "Title": ar.get("ArticleTitle", ""),
            "Published": pub_date,
            "Copyright Information": ar.get("Abstract", {}).get(
                "CopyrightInformation", ""
            ),
            "Summary": summary,
        }
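The structured-abstract branch expects xmltodict output in which each labeled section is a dict with "@Label" and "#text" keys (xmltodict's conventions for XML attributes and text nodes); the section names and texts below are placeholders:

    abstract_text = [
        {"@Label": "BACKGROUND", "#text": "..."},
        {"@Label": "METHODS", "#text": "..."},
        {"@Label": "CONCLUSIONS", "#text": "..."},
    ]
    # -> summary == "BACKGROUND: ...\nMETHODS: ...\nCONCLUSIONS: ..."

Unlabeled abstracts arrive as a plain string or a single dict, which the fallback branches handle.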