"""PDF document loaders (langchain_community.document_loaders.pdf)."""

import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
    AmazonTextractPDFParser,
    DocumentIntelligenceParser,
    PDFMinerParser,
    PDFPlumberParser,
    PyMuPDFParser,
    PyPDFium2Parser,
    PyPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

if TYPE_CHECKING:
    from textractor.data.text_linearization_config import TextLinearizationConfig

# Module-level logger. Use __name__ (the dotted module path) rather than
# __file__ so the logger participates in the normal package logging
# hierarchy instead of being keyed by a filesystem path.
logger = logging.getLogger(__name__)


class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Load `PDF` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    """

    def _get_elements(self) -> List:
        # Imported lazily: `unstructured` is an optional dependency.
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(filename=self.file_path, **self.unstructured_kwargs)
        return elements
class BasePDFLoader(BaseLoader, ABC):
    """Base Loader class for `PDF` files.

    If the file is a web path, it will download it to a temporary file,
    use it, then clean up the temporary file after completion.
    """

    def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a
                web path.

        Raises:
            ValueError: If the path is neither an existing local file nor a
                valid URL, or if downloading from a web path fails.
        """
        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path or S3, download it to a temporary file,
        # and use that. The temporary directory is cleaned up in __del__.
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            self.temp_dir = tempfile.TemporaryDirectory()
            _, suffix = os.path.splitext(self.file_path)
            if self._is_s3_presigned_url(self.file_path):
                # Pre-signed URLs carry query parameters after the extension;
                # take the object key from the URL path instead.
                suffix = urlparse(self.file_path).path.split("/")[-1]
            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
            self.web_path = self.file_path
            if not self._is_s3_url(self.file_path):
                r = requests.get(self.file_path, headers=self.headers)
                if r.status_code != 200:
                    raise ValueError(
                        "Check the url of your file; returned status code %s"
                        % r.status_code
                    )
                with open(temp_pdf, mode="wb") as f:
                    f.write(r.content)
            # NOTE: for s3:// URLs nothing is downloaded here; the parser
            # (e.g. Textract) is expected to read directly from S3.
            self.file_path = str(temp_pdf)
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        # Remove the downloaded temporary file, if any.
        if hasattr(self, "temp_dir"):
            self.temp_dir.cleanup()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid (has both a scheme and a network location)."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _is_s3_url(url: str) -> bool:
        """Check if the url is an ``s3://`` URL."""
        try:
            result = urlparse(url)
            if result.scheme == "s3" and result.netloc:
                return True
            return False
        except ValueError:
            return False

    @staticmethod
    def _is_s3_presigned_url(url: str) -> bool:
        """Check if the url is a presigned S3 url.

        Matches both the legacy global endpoint
        (``bucket.s3.amazonaws.com``) and virtual-hosted regional endpoints
        such as ``bucket.s3.us-west-2.amazonaws.com``.
        """
        try:
            result = urlparse(url)
            # Optional "[.-]<region-or-feature>" label between ".s3" and
            # ".amazonaws.com" covers regional/dualstack hostnames; the
            # previous pattern only matched the legacy global endpoint.
            return bool(
                re.search(r"\.s3(?:[.-][a-z0-9-]+)?\.amazonaws\.com$", result.netloc)
            )
        except ValueError:
            return False

    @property
    def source(self) -> str:
        # Prefer the original web path (when the file was downloaded) so the
        # reported source points at where the document actually came from.
        return self.web_path if self.web_path is not None else self.file_path
class OnlinePDFLoader(BasePDFLoader):
    """Load online `PDF`."""

    def load(self) -> List[Document]:
        """Load documents."""
        # The base class has already downloaded the file to a local temp
        # path, so delegate parsing to UnstructuredPDFLoader.
        return UnstructuredPDFLoader(str(self.file_path)).load()
class PyPDFLoader(BasePDFLoader):
    """Load PDF using pypdf into a list of documents.

    The loader chunks by page and stores page numbers in metadata.
    """

    def __init__(
        self,
        file_path: str,
        password: Optional[Union[str, bytes]] = None,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            password: Password for an encrypted PDF, if any.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with " "`pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(password=password, extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFium2Loader(BasePDFLoader):
    """Load `PDF` using `pypdfium2` and chunks at character level."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ):
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.
        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFium2Parser(extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFDirectoryLoader(BaseLoader):
    """Load a directory with `PDF` files using `pypdf` and chunks at character level.

    The loader also stores page numbers in metadata.
    """

    def __init__(
        self,
        path: Union[str, Path],
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
        extract_images: bool = False,
    ):
        """Initialize with a directory path.

        Args:
            path: Directory to scan for PDF files.
            glob: Glob pattern used to match files.
            silent_errors: If True, log per-file failures instead of raising.
            load_hidden: If True, also load files under dot-prefixed paths.
            recursive: If True, match the glob recursively (``rglob``).
            extract_images: Whether to extract text from images in the PDFs.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors
        self.extract_images = extract_images

    @staticmethod
    def _is_visible(path: Path) -> bool:
        # A path is "hidden" if any of its components starts with a dot.
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> List[Document]:
        """Load every matching PDF under the directory as Documents."""
        p = Path(self.path)
        docs = []
        items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
        for i in items:
            if i.is_file():
                if self._is_visible(i.relative_to(p)) or self.load_hidden:
                    try:
                        loader = PyPDFLoader(str(i), extract_images=self.extract_images)
                        sub_docs = loader.load()
                        for doc in sub_docs:
                            doc.metadata["source"] = str(i)
                        docs.extend(sub_docs)
                    except Exception as e:
                        if self.silent_errors:
                            logger.warning(e)
                        else:
                            # Bare `raise` preserves the original traceback
                            # (instead of `raise e`, which re-wraps it).
                            raise
        return docs
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one a
                single document. Otherwise, return one document per page.

        Raises:
            ImportError: If the ``pdfminer.six`` package is not installed.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PDFMinerParser(
            extract_images=extract_images, concatenate_pages=concatenate_pages
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Load `PDF` files as HTML content using `PDFMiner`."""

    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
        """Initialize with a file path."""
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )
        super().__init__(file_path, headers=headers)

    def lazy_load(self) -> Iterator[Document]:
        """Load file."""
        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams
        from pdfminer.utils import open_filename

        # Render the whole PDF into an in-memory HTML buffer.
        html_buffer = StringIO()
        with open_filename(self.file_path, "rb") as fp:
            extract_text_to_fp(
                fp,
                html_buffer,
                codec="",
                laparams=LAParams(),
                output_type="html",
            )
        source = self.file_path if self.web_path is None else self.web_path
        yield Document(
            page_content=html_buffer.getvalue(),
            metadata={"source": source},
        )
class PyMuPDFLoader(BasePDFLoader):
    """Load `PDF` files using `PyMuPDF`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.
            **kwargs: Keyword arguments forwarded to PyMuPDF's text
                extraction as ``text_kwargs``.

        Raises:
            ImportError: If the ``pymupdf`` package is not installed.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ImportError(
                "`PyMuPDF` package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path, headers=headers)
        self.extract_images = extract_images
        self.text_kwargs = kwargs

    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
        # Runtime kwargs are still accepted for backwards compatibility,
        # but initialization-time kwargs are preferred.
        if kwargs:
            logger.warning(
                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                f" is deprecated. Please pass arguments during initialization instead."
            )

        # Runtime kwargs override initialization-time kwargs.
        text_kwargs = {**self.text_kwargs, **kwargs}
        parser = PyMuPDFParser(
            text_kwargs=text_kwargs, extract_images=self.extract_images
        )
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from parser.lazy_parse(blob)

    def load(self, **kwargs: Any) -> List[Document]:
        """Load the PDF as a list of Documents (runtime kwargs are deprecated)."""
        return list(self._lazy_load(**kwargs))

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load the PDF page by page."""
        yield from self._lazy_load()
# MathpixPDFLoader implementation taken largely from Daniel Gross's: # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
    """Load `PDF` files using `Mathpix` service."""

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "md",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        extra_request_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "md".
            max_wait_time_seconds: a maximum time to wait for the response from
                the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            extra_request_data: Extra request data.
            **kwargs: additional keyword arguments. ``mathpix_api_key`` /
                ``mathpix_api_id`` may be passed here; otherwise they are read
                from the MATHPIX_API_KEY / MATHPIX_API_ID environment variables.
        """
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )

        # The base class isn't expecting these and doesn't collect **kwargs
        kwargs.pop("mathpix_api_key", None)
        kwargs.pop("mathpix_api_id", None)

        super().__init__(file_path, **kwargs)
        self.processed_file_format = processed_file_format
        self.extra_request_data = (
            extra_request_data if extra_request_data is not None else {}
        )
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    @property
    def _mathpix_headers(self) -> Dict[str, str]:
        # Mathpix authenticates via app_id/app_key request headers.
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

    @property
    def url(self) -> str:
        return "https://api.mathpix.com/v3/pdf"

    @property
    def data(self) -> dict:
        options = {
            "conversion_formats": {self.processed_file_format: True},
            **self.extra_request_data,
        }
        return {"options_json": json.dumps(options)}

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix and return the assigned pdf_id.

        Raises:
            ValueError: If Mathpix reports an error or returns no pdf_id.
        """
        with open(self.file_path, "rb") as f:
            files = {"file": f}
            response = requests.post(
                self.url, headers=self._mathpix_headers, files=files, data=self.data
            )
            response_data = response.json()
            if "error" in response_data:
                raise ValueError(f"Mathpix request failed: {response_data['error']}")
            if "pdf_id" in response_data:
                pdf_id = response_data["pdf_id"]
                return pdf_id
            else:
                raise ValueError("Unable to send PDF to Mathpix.")

    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete, polling every 5 seconds.

        Args:
            pdf_id: a PDF id.

        Returns: None

        Raises:
            ValueError: If Mathpix reports an error for the request or the PDF.
            TimeoutError: If processing does not finish within
                ``max_wait_time_seconds``.
        """
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self._mathpix_headers)
            response_data = response.json()

            # This indicates an error with the request (e.g. auth problems)
            error = response_data.get("error", None)
            error_info = response_data.get("error_info", None)

            if error is not None:
                error_msg = f"Unable to retrieve PDF from Mathpix: {error}"

                if error_info is not None:
                    error_msg += f" ({error_info['id']})"

                raise ValueError(error_msg)

            status = response_data.get("status", None)

            if status == "completed":
                return
            elif status == "error":
                # This indicates an error with the PDF processing
                raise ValueError("Unable to retrieve PDF from Mathpix")
            else:
                print(f"Status: {status}, waiting for processing to complete")  # noqa: T201
                time.sleep(5)
        raise TimeoutError

    def get_processed_pdf(self, pdf_id: str) -> str:
        """Block until processing completes, then download the converted file."""
        self.wait_for_processing(pdf_id)
        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
        response = requests.get(url, headers=self._mathpix_headers)
        return response.content.decode("utf-8")

    def clean_pdf(self, contents: str) -> str:
        """Clean the PDF file.

        Args:
            contents: a PDF file contents.

        Returns:
            The cleaned-up markdown contents.
        """
        # Drop image placeholders emitted by Mathpix.
        contents = "\n".join(
            [line for line in contents.split("\n") if not line.startswith("![]")]
        )
        # replace \section{Title} with # Title. A targeted regex is used here:
        # the previous implementation chained `.replace("}", "")`, which
        # stripped every closing brace in the document, not just the one
        # terminating \section{...}.
        contents = re.sub(r"\\section\{([^}]*)\}", r"# \1", contents)
        # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
        contents = (
            contents.replace(r"\$", "$")
            .replace(r"\%", "%")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        return contents

    def load(self) -> List[Document]:
        """Upload, convert and return the PDF as a single Document."""
        pdf_id = self.send_pdf()
        contents = self.get_processed_pdf(pdf_id)
        if self.should_clean_pdf:
            contents = self.clean_pdf(contents)
        metadata = {"source": self.source, "file_path": self.source, "pdf_id": pdf_id}
        return [Document(page_content=contents, metadata=metadata)]
class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            text_kwargs: Keyword arguments forwarded to pdfplumber's text
                extraction.
            dedupe: Whether to run pdfplumber's character de-duplication.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.

        Raises:
            ImportError: If the ``pdfplumber`` package is not installed.
        """
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ImportError(
                "pdfplumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path, headers=headers)
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def load(self) -> List[Document]:
        """Load file."""
        parser = PDFPlumberParser(
            text_kwargs=self.text_kwargs,
            dedupe=self.dedupe,
            extract_images=self.extract_images,
        )
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        return parser.parse(blob)
class AmazonTextractPDFLoader(BasePDFLoader):
    """Load `PDF` files from a local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import AmazonTextractPDFLoader

            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        headers: Optional[Dict] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file.
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg.
            client: boto3 textract client (Optional).
            credentials_profile_name: AWS profile name, if not default (Optional).
            region_name: AWS region, eg us-east-1 (Optional).
            endpoint_url: endpoint url for the textract service (Optional).
            headers: Headers for the download GET request (web paths only).
            linearization_config: Config to be used for linearization of the
                output; should be an instance of TextLinearizationConfig from
                the `textractor` pkg.

        Raises:
            ImportError: If `amazon-textract-caller` (or `boto3`, when needed)
                is not installed.
            ValueError: If AWS credentials cannot be loaded.
        """
        super().__init__(file_path, headers=headers)

        try:
            import textractcaller as tc
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Only build our own client when connection details were given;
        # otherwise the (possibly user-provided) `client` is used as-is.
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)

            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    f"profile name are valid. {e}"
                ) from e
        self.parser = AmazonTextractPDFParser(
            textract_features=features,
            client=client,
            linearization_config=linearization_config,
        )

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents"""
        # the self.file_path is local, but the blob has to include
        # the S3 location if the file originated from S3 for multi-page documents
        # raises ValueError when multi-page and not on S3
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, \
                    but not stored on S3. \
                    Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:  # type: ignore[valid-type]
        """Return the page count of the blob based on its mime type.

        Raises:
            ImportError: If ``pypdf`` or ``Pillow`` is not installed.
            ValueError: For unsupported mime types.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            raise ImportError(
                # Fixed typo: the message previously said "Pilloe".
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":  # type: ignore[attr-defined]
            with blob.as_bytes_io() as input_pdf_file:  # type: ignore[attr-defined]
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":  # type: ignore[attr-defined]
            # TIFF files can contain multiple frames; count them.
            num_pages = 0
            img = Image.open(blob.as_bytes())  # type: ignore[attr-defined]
            for _, _ in enumerate(ImageSequence.Iterator(img)):
                num_pages += 1
            return num_pages
        elif blob.mimetype in ["image/png", "image/jpeg"]:  # type: ignore[attr-defined]
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")  # type: ignore[attr-defined]
class DocumentIntelligenceLoader(BasePDFLoader):
    """Load a PDF with Azure Document Intelligence"""

    def __init__(
        self,
        file_path: str,
        client: Any,
        model: str = "prebuilt-document",
        headers: Optional[Dict] = None,
    ) -> None:
        """Initialize the object for file processing with Azure Document
        Intelligence (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be
        used for parsing files using the Azure Document Intelligence API. The
        load method generates one Document node per page, including metadata
        (source blob and page number).

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """
        self.parser = DocumentIntelligenceParser(client=client, model=model)
        super().__init__(file_path, headers=headers)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load given path as pages."""
        # Everything is delegated to the Azure-backed parser.
        yield from self.parser.parse(
            Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        )
# Legacy: only for backwards compatibility. Use PyPDFLoader instead PagedPDFSplitter = PyPDFLoader