# Source code for langchain_community.document_loaders.parsers.pdf

"""Module contains common parsers for PDFs."""
from __future__ import annotations

import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import numpy as np
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    import fitz.fitz
    import pdfminer.layout
    import pdfplumber.page
    import pypdf._page
    import pypdfium2._helpers.page
    from textractor.data.text_linearization_config import TextLinearizationConfig


# Filters whose decoded stream is an already-encoded image (JPEG / JPEG 2000
# families).  The parsers below pass these streams to OCR as raw bytes.
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
# Lossless filters: the decoded stream is an uncompressed pixel buffer, so the
# parsers reshape it into a (height, width, -1) numpy array before OCR.
_PDF_FILTER_WITHOUT_LOSS = [
    "LZWDecode",
    "LZW",
    "FlateDecode",
    "Fl",
    "ASCII85Decode",
    "A85",
    "ASCIIHexDecode",
    "AHx",
    "RunLengthDecode",
    "RL",
    "CCITTFaxDecode",
    "CCF",
    "JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from the images, concatenated.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as e:
        # Chain the original exception so the real import failure stays
        # visible in the traceback.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from e
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            # Each OCR row is (box, recognized_text, confidence); keep only
            # the text.  (The original comprehension named its loop variable
            # `text`, shadowing the accumulator — renamed for clarity.)
            recognized = [row[1] for row in result]
            text += "\n".join(recognized)
    return text
class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`."""

    def __init__(
        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
    ):
        """Initialize the parser.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob, one Document per page."""
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            yield from [
                Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                # /Filter may be a single name ("/FlateDecode") or an array of
                # names when filters are chained.  The original code sliced the
                # value with [1:] unconditionally, which for an array produced
                # a *list* that never matched the known-filter string lists and
                # triggered a spurious "Unknown PDF Filter!" warning.  Use the
                # last filter in a chain (the image codec) and strip the "/".
                img_filter = xObject[obj]["/Filter"]
                if isinstance(img_filter, list):
                    filter_name = img_filter[-1][1:]
                else:
                    filter_name = img_filter[1:]
                if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif filter_name in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one
                single document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                if self.concatenate_pages:
                    # Whole PDF as a single Document.
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
                    yield Document(page_content=text, metadata=metadata)
                else:
                    # One Document per page; re-extract per page number.
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            # Two devices run over each page: one collects text into text_io,
            # the other aggregates the layout so images can be OCR'd.
            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    # Reset the text buffer so the next page starts clean.
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            # Depth-first search for the first LTImage in the layout tree.
            # BUG FIX: the original recursed only into the *first* child of a
            # container and returned its result even when it was None, so any
            # image nested outside the first branch was silently skipped.
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                for child in layout_object:
                    found = get_image(child)
                    if found is not None:
                        return found
            return None

        images = []
        for img in list(filter(bool, map(get_image, page))):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import fitz

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            # Path-backed blobs open from disk; in-memory blobs from the stream.
            if blob.data is None:  # type: ignore[attr-defined]
                doc = fitz.open(file_path)
            else:
                doc = fitz.open(stream=file_path, filetype="pdf")

            # Build every page Document before yielding (matches the original
            # eager list behavior).
            page_docs = []
            for page in doc:
                page_text = page.get_text(**self.text_kwargs)
                ocr_text = self._extract_images_from_page(doc, page)
                # Only str/int document metadata entries are carried over.
                file_metadata = {
                    key: doc.metadata[key]
                    for key in doc.metadata
                    if type(doc.metadata[key]) in [str, int]
                }
                metadata = {
                    "source": blob.source,  # type: ignore[attr-defined]
                    "file_path": blob.source,  # type: ignore[attr-defined]
                    "page": page.number,
                    "total_pages": len(doc),
                    **file_metadata,
                }
                page_docs.append(
                    Document(page_content=page_text + ocr_text, metadata=metadata)
                )
            yield from page_docs

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        pixel_arrays = []
        for img_info in page.get_images():
            xref = img_info[0]
            pixmap = fitz.Pixmap(doc, xref)
            pixel_arrays.append(
                np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.height, pixmap.width, -1
                )
            )
        return extract_from_images_with_rapidocr(pixel_arrays)
class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser.

        Args:
            extract_images: Whether to run OCR on images embedded in pages.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob, one Document per page."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for index, pdf_page in enumerate(reader):
                    # Extract text, then close the text page before touching
                    # images to keep pypdfium2's resource order happy.
                    text_page = pdf_page.get_textpage()
                    page_text = text_page.get_text_range()
                    text_page.close()
                    page_text += "\n" + self._extract_images_from_page(pdf_page)
                    pdf_page.close()
                    yield Document(
                        page_content=page_text,
                        metadata={"source": blob.source, "page": index},  # type: ignore[attr-defined]
                    )
            finally:
                reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
        bitmaps = [obj.get_bitmap().to_numpy() for obj in image_objects]
        return extract_from_images_with_rapidocr(bitmaps)
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: Avoiding the error of duplicated characters if `dedupe=True`.
            extract_images: Whether to run OCR on images embedded in pages.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            doc = pdfplumber.open(file_path)  # open document

            # Build every page Document before yielding (matches the original
            # eager list behavior).
            page_docs = []
            for page in doc.pages:
                text = self._process_page_content(page)
                ocr_text = self._extract_images_from_page(page)
                # Only str/int document metadata entries are carried over.
                file_metadata = {
                    key: doc.metadata[key]
                    for key in doc.metadata
                    if type(doc.metadata[key]) in [str, int]
                }
                metadata = {
                    "source": blob.source,  # type: ignore[attr-defined]
                    "file_path": blob.source,  # type: ignore[attr-defined]
                    "page": page.page_number - 1,
                    "total_pages": len(doc.pages),
                    **file_metadata,
                }
                page_docs.append(
                    Document(page_content=text + "\n" + ocr_text, metadata=metadata)
                )
            yield from page_docs

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        source_page = page.dedupe_chars() if self.dedupe else page
        return source_page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            stream = img["stream"]
            filter_name = stream["Filter"].name
            if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(stream.get_data(), dtype=np.uint8).reshape(
                        stream["Height"], stream["Width"], -1
                    )
                )
            elif filter_name in _PDF_FILTER_WITH_LOSS:
                images.append(stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or output the
    key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see the `amazon-textract-caller` pkg.
            client: boto3 textract client.
            linearization_config: Config to be used for linearization of the
                output; should be an instance of TextLinearizationConfig from
                the `textractor` pkg.
        """
        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            # Coerce plain ints into the caller package's feature enum.
            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []

            # Default linearization produces markdown-style headings/lists.
            if linearization_config is not None:
                self.linearization_config = linearization_config
            else:
                self.linearization_config = self.textractor.TextLinearizationConfig(
                    hide_figure_layout=True,
                    title_prefix="# ",
                    section_header_prefix="## ",
                    list_element_prefix="*",
                )
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. If it is a multi-page document,
        blob.path has to be set to the S3 URI; for single-page docs the
        blob.data is taken.
        """
        parsed_url = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]

        # Either call with S3 path (multi-page) or with bytes (single-page)
        is_s3_document = bool(
            parsed_url and parsed_url.scheme == "s3" and parsed_url.netloc
        )
        if is_s3_document:
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),  # type: ignore[attr-defined]
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),  # type: ignore[attr-defined]
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        # Page numbers in metadata are 1-based, matching Textract's own pages.
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=self.linearization_config),
                metadata={"source": blob.source, "page": idx + 1},  # type: ignore[attr-defined]
            )
class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer)
    and chunks at character level."""

    def __init__(self, client: Any, model: str):
        """Initialize the parser.

        Args:
            client: An Azure ``DocumentAnalysisClient`` (or compatible object
                exposing ``begin_analyze_document``).
            model: The Document Intelligence model id to analyze with.
        """
        # BUG FIX: the first two string fragments of the original message were
        # concatenated without a separating space, producing
        # "...DocumentIntelligenceParserand langchain_community...".
        warnings.warn(
            "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser"
            " and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
            " are deprecated. Please upgrade to "
            "langchain_community.document_loaders.DocumentIntelligenceLoader "
            "for any file parsing purpose using Azure Document Intelligence "
            "service."
        )
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
        # One Document per analyzed page; page text is the space-joined lines.
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])
            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,  # type: ignore[attr-defined]
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        with blob.as_bytes_io() as file_obj:  # type: ignore[attr-defined]
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs