Semanticscholar

SemanticScholarReader #

Bases: BaseReader

一个用于从Semantic Scholar API读取和处理数据的类 ...

方法:#

init(): 实例化SemanticScholar对象

load_data(query: str, limit: int = 10, returned_fields: list = ["title", "abstract", "venue", "year", "paperId", "citationCount", "openAccessPdf", "authors"]) -> list: 根据查询和返回字段从Semantic Scholar加载数据

Source code in llama_index/readers/semanticscholar/base.py

class SemanticScholarReader(BaseReader):
    """一个用于从Semantic Scholar API读取和处理数据的类
...

方法:
-------
__init__():
    实例化SemanticScholar对象

load_data(query: str, limit: int = 10, returned_fields: list = ["title", "abstract", "venue", "year", "paperId", "citationCount", "openAccessPdf", "authors"]) -> list:
    根据查询和返回字段从Semantic Scholar加载数据"""

    def __init__(self, timeout=10, api_key=None, base_dir="pdfs") -> None:
        """
        实例化SemanticScholar对象。
        """
        import arxiv

        from semanticscholar import SemanticScholar

        self.arxiv = arxiv
        self.base_dir = base_dir
        self.s2 = SemanticScholar(timeout, api_key)
        # check for base dir
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)

    def _clear_cache(self):
        """
        删除 .citation* 文件夹。
        """
        import shutil

        shutil.rmtree("./.citation*")

    def _download_pdf(self, paper_id, url: str, base_dir="pdfs"):
        logger = logging.getLogger()
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,"
                " like Gecko) Chrome/58.0.3029.110 Safari/537.3"
            )
        }
        # Making a GET request
        response = requests.get(url, headers=headers, stream=True)
        content_type = response.headers["Content-Type"]

        # As long as the content-type is application/pdf, this will download the file
        if "application/pdf" in content_type:
            os.makedirs(base_dir, exist_ok=True)
            file_path = os.path.join(base_dir, f"{paper_id}.pdf")
            # check if the file already exists
            if os.path.exists(file_path):
                logger.info(f"{file_path} already exists")
                return file_path
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
            logger.info(f"Downloaded pdf from {url}")
            return file_path
        else:
            logger.warning(f"{url} was not downloaded: protected")
            return None

    def _get_full_text_docs(self, documents: List[Document]) -> List[Document]:
        from PyPDF2 import PdfReader

        """
        Gets the full text of the documents from Semantic Scholar

        Parameters
        ----------
        documents: list
            The list of Document object that contains the search results

        Returns
        -------
        list
            The list of Document object that contains the search results with full text

        Raises
        ------
        Exception
            If there is an error while getting the full text

        """
        full_text_docs = []
        for paper in documents:
            metadata = paper.extra_info
            url = metadata["openAccessPdf"]
            externalIds = metadata["externalIds"]
            paper_id = metadata["paperId"]
            file_path = None
            persist_dir = os.path.join(self.base_dir, f"{paper_id}.pdf")
            if url and not os.path.exists(persist_dir):
                # Download the document first
                file_path = self._download_pdf(metadata["paperId"], url, persist_dir)

            if (
                not url
                and externalIds
                and "ArXiv" in externalIds
                and not os.path.exists(persist_dir)
            ):
                # download the pdf from arxiv
                file_path = self._download_pdf_from_arxiv(
                    paper_id, externalIds["ArXiv"]
                )

            # Then, check if it's a valid PDF. If it's not, skip to the next document.
            if file_path:
                try:
                    pdf = PdfReader(open(file_path, "rb"))
                except Exception as e:
                    logging.error(
                        f"Failed to read pdf with exception: {e}. Skipping document..."
                    )
                    continue

                text = ""
                for page in pdf.pages:
                    text += page.extract_text()
                full_text_docs.append(Document(text=text, extra_info=metadata))

        return full_text_docs

    def _download_pdf_from_arxiv(self, paper_id, arxiv_id):
        paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results())
        paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf")
        return os.path.join(self.base_dir, f"{paper_id}.pdf")

    def load_data(
        self,
        query,
        limit,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
            "venue",
            "year",
            "paperId",
            "citationCount",
            "openAccessPdf",
            "authors",
            "externalIds",
        ],
    ) -> List[Document]:
        """从Semantic Scholar基于输入的查询和返回字段加载数据。

参数
----------
query: str
    文章的搜索查询
limit: int, optional
    返回的最大结果数（默认为10）
returned_fields: list, optional
    从搜索中返回的字段列表

返回:
-------
list
    包含搜索结果的Document对象列表

Raises:
------
Exception
    如果执行搜索时出现错误
"""
        try:
            results = self.s2.search_paper(query, limit=limit, fields=returned_fields)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as e:
            logging.error(
                "Failed to fetch data from Semantic Scholar with exception: %s", e
            )
            raise
        except Exception as e:
            logging.error("An unexpected error occurred: %s", e)
            raise

        documents = []

        for item in results[:limit]:
            openAccessPdf = getattr(item, "openAccessPdf", None)
            abstract = getattr(item, "abstract", None)
            title = getattr(item, "title", None)
            text = None
            # concat title and abstract
            if abstract and title:
                text = title + " " + abstract
            elif not abstract:
                text = title

            metadata = {
                "title": title,
                "venue": getattr(item, "venue", None),
                "year": getattr(item, "year", None),
                "paperId": getattr(item, "paperId", None),
                "citationCount": getattr(item, "citationCount", None),
                "openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
                "authors": [author["name"] for author in getattr(item, "authors", [])],
                "externalIds": getattr(item, "externalIds", None),
            }
            documents.append(Document(text=text, extra_info=metadata))

        if full_text:
            full_text_documents = self._get_full_text_docs(documents)
            documents.extend(full_text_documents)
        return documents

load_data #

load_data(
    query,
    limit,
    full_text=False,
    returned_fields=[
        "title",
        "abstract",
        "venue",
        "year",
        "paperId",
        "citationCount",
        "openAccessPdf",
        "authors",
        "externalIds",
    ],
) -> List[Document]

从Semantic Scholar基于输入的查询和返回字段加载数据。

参数#

query: str 文章的搜索查询 limit: int, optional 返回的最大结果数（默认为10） returned_fields: list, optional 从搜索中返回的字段列表

返回:#

list 包含搜索结果的Document对象列表

Raises:#

Exception 如果执行搜索时出现错误

Source code in llama_index/readers/semanticscholar/base.py

    def load_data(
        self,
        query,
        limit,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
            "venue",
            "year",
            "paperId",
            "citationCount",
            "openAccessPdf",
            "authors",
            "externalIds",
        ],
    ) -> List[Document]:
        """从Semantic Scholar基于输入的查询和返回字段加载数据。

参数
----------
query: str
    文章的搜索查询
limit: int, optional
    返回的最大结果数（默认为10）
returned_fields: list, optional
    从搜索中返回的字段列表

返回:
-------
list
    包含搜索结果的Document对象列表

Raises:
------
Exception
    如果执行搜索时出现错误
"""
        try:
            results = self.s2.search_paper(query, limit=limit, fields=returned_fields)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as e:
            logging.error(
                "Failed to fetch data from Semantic Scholar with exception: %s", e
            )
            raise
        except Exception as e:
            logging.error("An unexpected error occurred: %s", e)
            raise

        documents = []

        for item in results[:limit]:
            openAccessPdf = getattr(item, "openAccessPdf", None)
            abstract = getattr(item, "abstract", None)
            title = getattr(item, "title", None)
            text = None
            # concat title and abstract
            if abstract and title:
                text = title + " " + abstract
            elif not abstract:
                text = title

            metadata = {
                "title": title,
                "venue": getattr(item, "venue", None),
                "year": getattr(item, "year", None),
                "paperId": getattr(item, "paperId", None),
                "citationCount": getattr(item, "citationCount", None),
                "openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
                "authors": [author["name"] for author in getattr(item, "authors", [])],
                "externalIds": getattr(item, "externalIds", None),
            }
            documents.append(Document(text=text, extra_info=metadata))

        if full_text:
            full_text_documents = self._get_full_text_docs(documents)
            documents.extend(full_text_documents)
        return documents