Source code for langchain_community.utilities.google_scholar

"""调用谷歌学术搜索的工具。"""
from typing import Dict, Optional

from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator
from langchain_core.utils import get_from_dict_or_env


class GoogleScholarAPIWrapper(BaseModel):
    """Wrapper for the Google Scholar API (via SerpApi).

    You can create a SerpApi key by signing up at:
    https://serpapi.com/users/sign_up.

    The wrapper uses the serpapi python package:
    https://serpapi.com/integrations/python#search-google-scholar

    To use, you should have the environment variable ``SERP_API_KEY``
    set with your API key, or pass `serp_api_key` as a named parameter
    to the constructor.

    Attributes:
        top_k_results: number of results to return from a google-scholar
            query. Returns the top 10 results by default.
        hl: language to use for the Google Scholar search. It is a
            two-letter language code (e.g. en for English, es for Spanish,
            fr for French). See the Google languages page for the full
            list of supported languages:
            https://serpapi.com/google-languages
        lr: one or more languages to limit the search to. It uses
            lang_{two-letter language code} to specify a language and | as
            the delimiter (e.g. lang_fr|lang_de only searches French and
            German pages). See the Google lr languages page for the full
            list of supported languages:
            https://serpapi.com/google-lr-languages
        serp_api_key: SerpApi key; when omitted it is looked up under the
            ``SERP_API_KEY`` environment variable name during validation.

    Example:
        .. code-block:: python

            from langchain_community.utilities import GoogleScholarAPIWrapper
            google_scholar = GoogleScholarAPIWrapper()
            google_scholar.run('langchain')
    """

    top_k_results: int = 10
    hl: str = "en"
    lr: str = "lang_en"
    serp_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key and the python package are available.

        Args:
            values: the model's field values collected by pydantic.

        Returns:
            ``values`` extended with the resolved key (under
            ``"SERP_API_KEY"``) and the search class (under
            ``"google_scholar_engine"``), which ``run`` reads from the
            instance.

        Raises:
            ImportError: if the ``serpapi`` package cannot be imported.
        """
        # Presumably raises when the key is found neither in `values` nor in
        # the environment -- behaviour is defined by `get_from_dict_or_env`.
        serp_api_key = get_from_dict_or_env(values, "serp_api_key", "SERP_API_KEY")
        # Kept under the upper-case key exactly as before so existing
        # readers of this value keep working.
        values["SERP_API_KEY"] = serp_api_key

        try:
            from serpapi import GoogleScholarSearch

        except ImportError as e:
            raise ImportError(
                "google-search-results is not installed. "
                "Please install it with `pip install google-search-results"
                ">=2.4.2`"
            ) from e
        # NOTE: the key is assigned on the search *class*, so it is shared by
        # every search object created from it, not just by this wrapper.
        GoogleScholarSearch.SERP_API_KEY = serp_api_key
        values["google_scholar_engine"] = GoogleScholarSearch

        return values

    def run(self, query: str) -> str:
        """Run a query through Google Scholar and format the results.

        Args:
            query: the search string sent as the ``q`` parameter.

        Returns:
            One text entry per result (title, authors, summary and total
            citations), entries separated by a blank line, or a fixed
            message when no result was found.
        """
        # 20 results per call is the maximum; asking for full pages keeps the
        # number of API calls as low as possible.
        page_size = 20
        total_results: list = []

        # Offsets are 0 for the first page, 20 for the second, 40 for the
        # third, etc. Walking range(0, top_k_results, page_size) yields exactly
        # the pages needed, so the last (possibly partial) page is never
        # skipped regardless of how top_k_results relates to page_size.
        for start in range(0, self.top_k_results, page_size):
            results = (
                self.google_scholar_engine(  # type: ignore
                    {
                        "q": query,
                        "start": start,
                        "hl": self.hl,
                        # Only request what is still missing on the last page.
                        "num": min(page_size, self.top_k_results - start),
                        "lr": self.lr,
                    }
                )
                .get_dict()
                .get("organic_results", [])
            )
            if not results:
                # No need to request further pages once one comes back empty.
                break
            total_results.extend(results)

        # Guard against a response carrying more items than were requested.
        total_results = total_results[: self.top_k_results]

        if not total_results:
            return "No good Google Scholar Result was found"

        docs = []
        for result in total_results:
            publication_info = result.get("publication_info", {})
            author_names = [
                author.get("name") for author in publication_info.get("authors", [])
            ]
            # Skip authors without a name; joining None would raise TypeError.
            authors = ",".join(name for name in author_names if name)
            citations = (
                result.get("inline_links", {}).get("cited_by", {}).get("total", "")
            )
            docs.append(
                f"Title: {result.get('title', '')}\n"
                f"Authors: {authors}\n"
                f"Summary: {publication_info.get('summary', '')}\n"
                f"Total-Citations: {citations}"
            )
        return "\n\n".join(docs)