Source code for langchain_community.utilities.google_scholar

"""调用谷歌学术搜索的工具。"""
from typing import Dict, Optional

from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator
from langchain_core.utils import get_from_dict_or_env


class GoogleScholarAPIWrapper(BaseModel):
    """Wrapper for the Google Scholar API (via SerpApi).

    You can create a serpapi key by signing up at:
    https://serpapi.com/users/sign_up.

    The wrapper uses the serpapi python package:
    https://serpapi.com/integrations/python#search-google-scholar

    To use, you should have the environment variable ``SERP_API_KEY``
    set with your API key, or pass `serp_api_key` as a named parameter
    to the constructor.

    Attributes:
        top_k_results: number of results to return from a google-scholar
            query. By default the top 10 results are returned.
        hl: language to use for the Google Scholar search, as a
            two-letter language code (e.g. en for English, es for Spanish,
            fr for French). See the Google languages page for the full list
            of supported languages: https://serpapi.com/google-languages
        lr: one or multiple languages to limit the search to. It uses
            lang_{two-letter language code} to specify languages and | as a
            delimiter (e.g. lang_fr|lang_de will only search French and
            German pages). See the Google lr languages page for the full
            list of supported languages:
            https://serpapi.com/google-lr-languages
        serp_api_key: the SerpApi key; when not passed it is read from the
            ``SERP_API_KEY`` environment variable.

    Example:
        .. code-block:: python

            from langchain_community.utilities import GoogleScholarAPIWrapper
            google_scholar = GoogleScholarAPIWrapper()
            google_scholar.run('langchain')
    """

    top_k_results: int = 10
    hl: str = "en"
    lr: str = "lang_en"
    serp_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key and the python package are available.

        Args:
            values: the field values collected by pydantic for this model.

        Returns:
            The same dict, extended with the resolved key under
            ``"SERP_API_KEY"`` and the search class under
            ``"google_scholar_engine"``.

        Raises:
            ImportError: if the ``serpapi`` package cannot be imported.
        """
        serp_api_key = get_from_dict_or_env(values, "serp_api_key", "SERP_API_KEY")
        values["SERP_API_KEY"] = serp_api_key

        try:
            from serpapi import GoogleScholarSearch
        except ImportError as exc:
            raise ImportError(
                "google-search-results is not installed. "
                "Please install it with `pip install google-search-results"
                ">=2.4.2`"
            ) from exc

        # NOTE: the key is assigned on the imported class itself, not on an
        # instance, so it is visible to every user of that class.
        GoogleScholarSearch.SERP_API_KEY = serp_api_key
        values["google_scholar_engine"] = GoogleScholarSearch

        return values

    def run(self, query: str) -> str:
        """Run a query through Google Scholar and format the results.

        Results are fetched page by page until ``top_k_results`` results have
        been requested or a page comes back empty. If ``top_k_results`` is not
        positive no request is made.

        Args:
            query: the search string sent as the ``q`` parameter.

        Returns:
            One text entry per result (title, authors, summary and total
            citations) separated by blank lines, or
            ``"No good Google Scholar Result was found"`` when nothing was
            returned.
        """
        # We request up to 20 results per page, which is the max, in order to
        # reduce the number of API calls. ``start`` is the result offset:
        # 0 is the first page, 20 the second page, 40 the third page, etc.
        page_size = 20
        total_results = []
        start = 0
        while start < self.top_k_results:
            results = (
                self.google_scholar_engine(  # type: ignore
                    {
                        "q": query,
                        "start": start,
                        "hl": self.hl,
                        # The last page only needs whatever is still missing.
                        "num": min(self.top_k_results - start, page_size),
                        "lr": self.lr,
                    }
                )
                .get_dict()
                .get("organic_results", [])
            )
            if not results:
                # No need to ask for further pages once a page is empty.
                break
            total_results.extend(results)
            start += page_size

        if not total_results:
            return "No good Google Scholar Result was found"

        docs = []
        for result in total_results:
            publication_info = result.get("publication_info", {})
            authors = ",".join(
                [author.get("name") for author in publication_info.get("authors", [])]
            )
            cited_by = result.get("inline_links", {}).get("cited_by", {})
            docs.append(
                f"Title: {result.get('title', '')}\n"
                f"Authors: {authors}\n"
                f"Summary: {publication_info.get('summary', '')}\n"
                f"Total-Citations: {cited_by.get('total', '')}"
            )
        return "\n\n".join(docs)