# Source code for langchain_community.document_loaders.parsers.pdf

"""Module contains common parsers for PDFs."""
from __future__ import annotations

import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import numpy as np
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    import fitz.fitz
    import pdfminer.layout
    import pdfplumber.page
    import pypdf._page
    import pypdfium2._helpers.page
    from textractor.data.text_linearization_config import TextLinearizationConfig


# Filters whose decoded stream is an already-encoded image (JPEG / JPEG 2000
# families).  The parsers below pass these streams to OCR as raw bytes.
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
# Lossless filters: the decoded stream is an uncompressed pixel buffer, so the
# parsers reshape it into a (height, width, -1) numpy array before OCR.
_PDF_FILTER_WITHOUT_LOSS = [
    "LZWDecode",
    "LZW",
    "FlateDecode",
    "Fl",
    "ASCII85Decode",
    "A85",
    "ASCIIHexDecode",
    "AHx",
    "RunLengthDecode",
    "RL",
    "CCITTFaxDecode",
    "CCF",
    "JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from the images, concatenated.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as e:
        # Chain the original exception so the real import failure stays
        # visible in the traceback.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from e
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            # Each OCR row is (box, recognized_text, confidence); keep only
            # the text.  (The original comprehension named its loop variable
            # `text`, shadowing the accumulator — renamed for clarity.)
            recognized = [row[1] for row in result]
            text += "\n".join(recognized)
    return text
class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`."""

    def __init__(
        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
    ):
        """Initialize the parser.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob, one Document per page."""
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            yield from [
                Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                # /Filter may be a single name ("/FlateDecode") or an array of
                # names when filters are chained.  The original code sliced the
                # value with [1:] unconditionally, which for an array produced
                # a *list* that never matched the known-filter string lists and
                # triggered a spurious "Unknown PDF Filter!" warning.  Use the
                # last filter in a chain (the image codec) and strip the "/".
                img_filter = xObject[obj]["/Filter"]
                if isinstance(img_filter, list):
                    filter_name = img_filter[-1][1:]
                else:
                    filter_name = img_filter[1:]
                if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif filter_name in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one
                single document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                if self.concatenate_pages:
                    # Whole PDF as a single Document.
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
                    yield Document(page_content=text, metadata=metadata)
                else:
                    # One Document per page; re-extract per page number.
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            # Two devices run over each page: one collects text into text_io,
            # the other aggregates the layout so images can be OCR'd.
            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    # Reset the text buffer so the next page starts clean.
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            # Depth-first search for the first LTImage in the layout tree.
            # BUG FIX: the original recursed only into the *first* child of a
            # container and returned its result even when it was None, so any
            # image nested outside the first branch was silently skipped.
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                for child in layout_object:
                    found = get_image(child)
                    if found is not None:
                        return found
            return None

        images = []
        for img in list(filter(bool, map(get_image, page))):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import fitz

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            # Path-backed blobs open from disk; in-memory blobs from the stream.
            if blob.data is None:  # type: ignore[attr-defined]
                doc = fitz.open(file_path)
            else:
                doc = fitz.open(stream=file_path, filetype="pdf")

            # Build every page Document before yielding (matches the original
            # eager list behavior).
            page_docs = []
            for page in doc:
                page_text = page.get_text(**self.text_kwargs)
                ocr_text = self._extract_images_from_page(doc, page)
                # Only str/int document metadata entries are carried over.
                file_metadata = {
                    key: doc.metadata[key]
                    for key in doc.metadata
                    if type(doc.metadata[key]) in [str, int]
                }
                metadata = {
                    "source": blob.source,  # type: ignore[attr-defined]
                    "file_path": blob.source,  # type: ignore[attr-defined]
                    "page": page.number,
                    "total_pages": len(doc),
                    **file_metadata,
                }
                page_docs.append(
                    Document(page_content=page_text + ocr_text, metadata=metadata)
                )
            yield from page_docs

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        pixel_arrays = []
        for img_info in page.get_images():
            xref = img_info[0]
            pixmap = fitz.Pixmap(doc, xref)
            pixel_arrays.append(
                np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.height, pixmap.width, -1
                )
            )
        return extract_from_images_with_rapidocr(pixel_arrays)
class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser.

        Args:
            extract_images: Whether to run OCR on images embedded in pages.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob, one Document per page."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for index, pdf_page in enumerate(reader):
                    # Extract text, then close the text page before touching
                    # images to keep pypdfium2's resource order happy.
                    text_page = pdf_page.get_textpage()
                    page_text = text_page.get_text_range()
                    text_page.close()
                    page_text += "\n" + self._extract_images_from_page(pdf_page)
                    pdf_page.close()
                    yield Document(
                        page_content=page_text,
                        metadata={"source": blob.source, "page": index},  # type: ignore[attr-defined]
                    )
            finally:
                reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
        bitmaps = [obj.get_bitmap().to_numpy() for obj in image_objects]
        return extract_from_images_with_rapidocr(bitmaps)
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: Avoiding the error of duplicated characters if `dedupe=True`.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            doc = pdfplumber.open(file_path)  # open document

            # Build every page Document before yielding (matches the original
            # eager list behavior).
            page_docs = []
            for page in doc.pages:
                text = self._process_page_content(page)
                ocr_text = self._extract_images_from_page(page)
                # Only str/int document metadata entries are carried over.
                file_metadata = {
                    key: doc.metadata[key]
                    for key in doc.metadata
                    if type(doc.metadata[key]) in [str, int]
                }
                metadata = {
                    "source": blob.source,  # type: ignore[attr-defined]
                    "file_path": blob.source,  # type: ignore[attr-defined]
                    "page": page.page_number - 1,
                    "total_pages": len(doc.pages),
                    **file_metadata,
                }
                page_docs.append(
                    Document(page_content=text + "\n" + ocr_text, metadata=metadata)
                )
            yield from page_docs

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        source_page = page.dedupe_chars() if self.dedupe else page
        return source_page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            stream = img["stream"]
            filter_name = stream["Filter"].name
            if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(stream.get_data(), dtype=np.uint8).reshape(
                        stream["Height"], stream["Width"], -1
                    )
                )
            elif filter_name in _PDF_FILTER_WITH_LOSS:
                images.append(stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or output the
    key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see the `amazon-textract-caller` pkg.
            client: boto3 textract client.
            linearization_config: Config to be used for linearization of the
                output; should be an instance of TextLinearizationConfig from
                the `textractor` pkg.
        """
        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            # Coerce plain ints into the caller package's feature enum.
            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []

            # Default linearization produces markdown-style headings/lists.
            if linearization_config is not None:
                self.linearization_config = linearization_config
            else:
                self.linearization_config = self.textractor.TextLinearizationConfig(
                    hide_figure_layout=True,
                    title_prefix="# ",
                    section_header_prefix="## ",
                    list_element_prefix="*",
                )
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. If it is a multi-page document,
        blob.path has to be set to the S3 URI; for single-page docs the
        blob.data is taken.
        """
        parsed_url = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]

        # Either call with S3 path (multi-page) or with bytes (single-page)
        is_s3_document = bool(
            parsed_url and parsed_url.scheme == "s3" and parsed_url.netloc
        )
        if is_s3_document:
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),  # type: ignore[attr-defined]
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),  # type: ignore[attr-defined]
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        # Page numbers in metadata are 1-based, matching Textract's own pages.
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=self.linearization_config),
                metadata={"source": blob.source, "page": idx + 1},  # type: ignore[attr-defined]
            )
class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer)
    and chunks at character level."""

    def __init__(self, client: Any, model: str):
        """Initialize the parser.

        Args:
            client: An Azure ``DocumentAnalysisClient`` (or compatible object
                exposing ``begin_analyze_document``).
            model: The Document Intelligence model id to analyze with.
        """
        # BUG FIX: the first two string fragments of the original message were
        # concatenated without a separating space, producing
        # "...DocumentIntelligenceParserand langchain_community...".
        warnings.warn(
            "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser"
            " and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
            " are deprecated. Please upgrade to "
            "langchain_community.document_loaders.DocumentIntelligenceLoader "
            "for any file parsing purpose using Azure Document Intelligence "
            "service."
        )
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
        # One Document per analyzed page; page text is the space-joined lines.
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])
            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,  # type: ignore[attr-defined]
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        with blob.as_bytes_io() as file_obj:  # type: ignore[attr-defined]
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs