"""PDF document loaders (langchain_community.document_loaders.pdf)."""

import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
    AmazonTextractPDFParser,
    DocumentIntelligenceParser,
    PDFMinerParser,
    PDFPlumberParser,
    PyMuPDFParser,
    PyPDFium2Parser,
    PyPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

if TYPE_CHECKING:
    from textractor.data.text_linearization_config import TextLinearizationConfig

# Module-level logger. Use __name__ (the dotted module path) rather than
# __file__ so the logger participates in the normal package logging
# hierarchy instead of being keyed by a filesystem path.
logger = logging.getLogger(__name__)


class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Load `PDF` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    """

    def _get_elements(self) -> List:
        # Imported lazily: `unstructured` is an optional dependency.
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(filename=self.file_path, **self.unstructured_kwargs)
        return elements
class BasePDFLoader(BaseLoader, ABC):
    """Base Loader class for `PDF` files.

    If the file is a web path, it will download it to a temporary file,
    use it, then clean up the temporary file after completion.
    """

    def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a
                web path.

        Raises:
            ValueError: If the path is neither an existing local file nor a
                valid URL, or if downloading from a web path fails.
        """
        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path or S3, download it to a temporary file,
        # and use that. The temporary directory is cleaned up in __del__.
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            self.temp_dir = tempfile.TemporaryDirectory()
            _, suffix = os.path.splitext(self.file_path)
            if self._is_s3_presigned_url(self.file_path):
                # Pre-signed URLs carry query parameters after the extension;
                # take the object key from the URL path instead.
                suffix = urlparse(self.file_path).path.split("/")[-1]
            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
            self.web_path = self.file_path
            if not self._is_s3_url(self.file_path):
                r = requests.get(self.file_path, headers=self.headers)
                if r.status_code != 200:
                    raise ValueError(
                        "Check the url of your file; returned status code %s"
                        % r.status_code
                    )
                with open(temp_pdf, mode="wb") as f:
                    f.write(r.content)
            # NOTE: for s3:// URLs nothing is downloaded here; the parser
            # (e.g. Textract) is expected to read directly from S3.
            self.file_path = str(temp_pdf)
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        # Remove the downloaded temporary file, if any.
        if hasattr(self, "temp_dir"):
            self.temp_dir.cleanup()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid (has both a scheme and a network location)."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _is_s3_url(url: str) -> bool:
        """Check if the url is an ``s3://`` URL."""
        try:
            result = urlparse(url)
            if result.scheme == "s3" and result.netloc:
                return True
            return False
        except ValueError:
            return False

    @staticmethod
    def _is_s3_presigned_url(url: str) -> bool:
        """Check if the url is a presigned S3 url.

        Matches both the legacy global endpoint
        (``bucket.s3.amazonaws.com``) and virtual-hosted regional endpoints
        such as ``bucket.s3.us-west-2.amazonaws.com``.
        """
        try:
            result = urlparse(url)
            # Optional "[.-]<region-or-feature>" label between ".s3" and
            # ".amazonaws.com" covers regional/dualstack hostnames; the
            # previous pattern only matched the legacy global endpoint.
            return bool(
                re.search(r"\.s3(?:[.-][a-z0-9-]+)?\.amazonaws\.com$", result.netloc)
            )
        except ValueError:
            return False

    @property
    def source(self) -> str:
        # Prefer the original web path (when the file was downloaded) so the
        # reported source points at where the document actually came from.
        return self.web_path if self.web_path is not None else self.file_path
class OnlinePDFLoader(BasePDFLoader):
    """Load online `PDF`."""

    def load(self) -> List[Document]:
        """Load documents."""
        # The base class has already downloaded the file to a local temp
        # path, so delegate parsing to UnstructuredPDFLoader.
        return UnstructuredPDFLoader(str(self.file_path)).load()
class PyPDFLoader(BasePDFLoader):
    """Load PDF using pypdf into a list of documents.

    The loader chunks by page and stores page numbers in metadata.
    """

    def __init__(
        self,
        file_path: str,
        password: Optional[Union[str, bytes]] = None,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            password: Password for an encrypted PDF, if any.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with " "`pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(password=password, extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFium2Loader(BasePDFLoader):
    """Load `PDF` using `pypdfium2` and chunks at character level."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ):
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.
        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFium2Parser(extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFDirectoryLoader(BaseLoader):
    """Load a directory with `PDF` files using `pypdf` and chunks at character level.

    The loader also stores page numbers in metadata.
    """

    def __init__(
        self,
        path: Union[str, Path],
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
        extract_images: bool = False,
    ):
        """Initialize with a directory path.

        Args:
            path: Directory to scan for PDF files.
            glob: Glob pattern used to match files.
            silent_errors: If True, log per-file failures instead of raising.
            load_hidden: If True, also load files under dot-prefixed paths.
            recursive: If True, match the glob recursively (``rglob``).
            extract_images: Whether to extract text from images in the PDFs.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors
        self.extract_images = extract_images

    @staticmethod
    def _is_visible(path: Path) -> bool:
        # A path is "hidden" if any of its components starts with a dot.
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> List[Document]:
        """Load every matching PDF under the directory as Documents."""
        p = Path(self.path)
        docs = []
        items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
        for i in items:
            if i.is_file():
                if self._is_visible(i.relative_to(p)) or self.load_hidden:
                    try:
                        loader = PyPDFLoader(str(i), extract_images=self.extract_images)
                        sub_docs = loader.load()
                        for doc in sub_docs:
                            doc.metadata["source"] = str(i)
                        docs.extend(sub_docs)
                    except Exception as e:
                        if self.silent_errors:
                            logger.warning(e)
                        else:
                            # Bare `raise` preserves the original traceback
                            # (instead of `raise e`, which re-wraps it).
                            raise
        return docs
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one a
                single document. Otherwise, return one document per page.

        Raises:
            ImportError: If the ``pdfminer.six`` package is not installed.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PDFMinerParser(
            extract_images=extract_images, concatenate_pages=concatenate_pages
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents."""
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Load `PDF` files as HTML content using `PDFMiner`."""

    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
        """Initialize with a file path."""
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )
        super().__init__(file_path, headers=headers)

    def lazy_load(self) -> Iterator[Document]:
        """Load file."""
        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams
        from pdfminer.utils import open_filename

        # Render the whole PDF into an in-memory HTML buffer.
        html_buffer = StringIO()
        with open_filename(self.file_path, "rb") as fp:
            extract_text_to_fp(
                fp,
                html_buffer,
                codec="",
                laparams=LAParams(),
                output_type="html",
            )
        source = self.file_path if self.web_path is None else self.web_path
        yield Document(
            page_content=html_buffer.getvalue(),
            metadata={"source": source},
        )
class PyMuPDFLoader(BasePDFLoader):
    """Load `PDF` files using `PyMuPDF`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.
            **kwargs: Keyword arguments forwarded to PyMuPDF's text
                extraction as ``text_kwargs``.

        Raises:
            ImportError: If the ``pymupdf`` package is not installed.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ImportError(
                "`PyMuPDF` package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path, headers=headers)
        self.extract_images = extract_images
        self.text_kwargs = kwargs

    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
        # Runtime kwargs are still accepted for backwards compatibility,
        # but initialization-time kwargs are preferred.
        if kwargs:
            logger.warning(
                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                f" is deprecated. Please pass arguments during initialization instead."
            )

        # Runtime kwargs override initialization-time kwargs.
        text_kwargs = {**self.text_kwargs, **kwargs}
        parser = PyMuPDFParser(
            text_kwargs=text_kwargs, extract_images=self.extract_images
        )
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from parser.lazy_parse(blob)

    def load(self, **kwargs: Any) -> List[Document]:
        """Load the PDF as a list of Documents (runtime kwargs are deprecated)."""
        return list(self._lazy_load(**kwargs))

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load the PDF page by page."""
        yield from self._lazy_load()
# MathpixPDFLoader implementation taken largely from Daniel Gross's: # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
    """Load `PDF` files using `Mathpix` service."""

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "md",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        extra_request_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "md".
            max_wait_time_seconds: a maximum time to wait for the response from
                the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            extra_request_data: Extra request data.
            **kwargs: additional keyword arguments. ``mathpix_api_key`` /
                ``mathpix_api_id`` may be passed here; otherwise they are read
                from the MATHPIX_API_KEY / MATHPIX_API_ID environment variables.
        """
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )

        # The base class isn't expecting these and doesn't collect **kwargs
        kwargs.pop("mathpix_api_key", None)
        kwargs.pop("mathpix_api_id", None)

        super().__init__(file_path, **kwargs)
        self.processed_file_format = processed_file_format
        self.extra_request_data = (
            extra_request_data if extra_request_data is not None else {}
        )
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    @property
    def _mathpix_headers(self) -> Dict[str, str]:
        # Mathpix authenticates via app_id/app_key request headers.
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

    @property
    def url(self) -> str:
        return "https://api.mathpix.com/v3/pdf"

    @property
    def data(self) -> dict:
        options = {
            "conversion_formats": {self.processed_file_format: True},
            **self.extra_request_data,
        }
        return {"options_json": json.dumps(options)}

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix and return the assigned pdf_id.

        Raises:
            ValueError: If Mathpix reports an error or returns no pdf_id.
        """
        with open(self.file_path, "rb") as f:
            files = {"file": f}
            response = requests.post(
                self.url, headers=self._mathpix_headers, files=files, data=self.data
            )
            response_data = response.json()
            if "error" in response_data:
                raise ValueError(f"Mathpix request failed: {response_data['error']}")
            if "pdf_id" in response_data:
                pdf_id = response_data["pdf_id"]
                return pdf_id
            else:
                raise ValueError("Unable to send PDF to Mathpix.")

    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete, polling every 5 seconds.

        Args:
            pdf_id: a PDF id.

        Returns: None

        Raises:
            ValueError: If Mathpix reports an error for the request or the PDF.
            TimeoutError: If processing does not finish within
                ``max_wait_time_seconds``.
        """
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self._mathpix_headers)
            response_data = response.json()

            # This indicates an error with the request (e.g. auth problems)
            error = response_data.get("error", None)
            error_info = response_data.get("error_info", None)

            if error is not None:
                error_msg = f"Unable to retrieve PDF from Mathpix: {error}"

                if error_info is not None:
                    error_msg += f" ({error_info['id']})"

                raise ValueError(error_msg)

            status = response_data.get("status", None)

            if status == "completed":
                return
            elif status == "error":
                # This indicates an error with the PDF processing
                raise ValueError("Unable to retrieve PDF from Mathpix")
            else:
                print(f"Status: {status}, waiting for processing to complete")  # noqa: T201
                time.sleep(5)
        raise TimeoutError

    def get_processed_pdf(self, pdf_id: str) -> str:
        """Block until processing completes, then download the converted file."""
        self.wait_for_processing(pdf_id)
        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
        response = requests.get(url, headers=self._mathpix_headers)
        return response.content.decode("utf-8")

    def clean_pdf(self, contents: str) -> str:
        """Clean the PDF file.

        Args:
            contents: a PDF file contents.

        Returns:
            The cleaned-up markdown contents.
        """
        # Drop image placeholders emitted by Mathpix.
        contents = "\n".join(
            [line for line in contents.split("\n") if not line.startswith("![]")]
        )
        # replace \section{Title} with # Title. A targeted regex is used here:
        # the previous implementation chained `.replace("}", "")`, which
        # stripped every closing brace in the document, not just the one
        # terminating \section{...}.
        contents = re.sub(r"\\section\{([^}]*)\}", r"# \1", contents)
        # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
        contents = (
            contents.replace(r"\$", "$")
            .replace(r"\%", "%")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        return contents

    def load(self) -> List[Document]:
        """Upload, convert and return the PDF as a single Document."""
        pdf_id = self.send_pdf()
        contents = self.get_processed_pdf(pdf_id)
        if self.should_clean_pdf:
            contents = self.clean_pdf(contents)
        metadata = {"source": self.source, "file_path": self.source, "pdf_id": pdf_id}
        return [Document(page_content=contents, metadata=metadata)]
class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Local or web path of the PDF file.
            text_kwargs: Keyword arguments forwarded to pdfplumber's text
                extraction.
            dedupe: Whether to run pdfplumber's character de-duplication.
            headers: Headers for the download GET request (web paths only).
            extract_images: Whether to extract text from images in the PDF.

        Raises:
            ImportError: If the ``pdfplumber`` package is not installed.
        """
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ImportError(
                "pdfplumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path, headers=headers)
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def load(self) -> List[Document]:
        """Load file."""
        parser = PDFPlumberParser(
            text_kwargs=self.text_kwargs,
            dedupe=self.dedupe,
            extract_images=self.extract_images,
        )
        if self.web_path:
            # Use a context manager so the file handle is closed promptly
            # instead of leaking until garbage collection.
            with open(self.file_path, "rb") as f:
                blob = Blob.from_data(f.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        return parser.parse(blob)
class AmazonTextractPDFLoader(BasePDFLoader):
    """Load `PDF` files from a local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import AmazonTextractPDFLoader

            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        headers: Optional[Dict] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file.
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg.
            client: boto3 textract client (Optional).
            credentials_profile_name: AWS profile name, if not default (Optional).
            region_name: AWS region, eg us-east-1 (Optional).
            endpoint_url: endpoint url for the textract service (Optional).
            headers: Headers for the download GET request (web paths only).
            linearization_config: Config to be used for linearization of the
                output; should be an instance of TextLinearizationConfig from
                the `textractor` pkg.

        Raises:
            ImportError: If `amazon-textract-caller` (or `boto3`, when needed)
                is not installed.
            ValueError: If AWS credentials cannot be loaded.
        """
        super().__init__(file_path, headers=headers)

        try:
            import textractcaller as tc
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Only build our own client when connection details were given;
        # otherwise the (possibly user-provided) `client` is used as-is.
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)

            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    f"profile name are valid. {e}"
                ) from e
        self.parser = AmazonTextractPDFParser(
            textract_features=features,
            client=client,
            linearization_config=linearization_config,
        )

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents"""
        # the self.file_path is local, but the blob has to include
        # the S3 location if the file originated from S3 for multi-page documents
        # raises ValueError when multi-page and not on S3
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, \
                    but not stored on S3. \
                    Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:  # type: ignore[valid-type]
        """Return the page count of the blob based on its mime type.

        Raises:
            ImportError: If ``pypdf`` or ``Pillow`` is not installed.
            ValueError: For unsupported mime types.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            raise ImportError(
                # Fixed typo: the message previously said "Pilloe".
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":  # type: ignore[attr-defined]
            with blob.as_bytes_io() as input_pdf_file:  # type: ignore[attr-defined]
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":  # type: ignore[attr-defined]
            # TIFF files can contain multiple frames; count them.
            num_pages = 0
            img = Image.open(blob.as_bytes())  # type: ignore[attr-defined]
            for _, _ in enumerate(ImageSequence.Iterator(img)):
                num_pages += 1
            return num_pages
        elif blob.mimetype in ["image/png", "image/jpeg"]:  # type: ignore[attr-defined]
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")  # type: ignore[attr-defined]
class DocumentIntelligenceLoader(BasePDFLoader):
    """Load a PDF with Azure Document Intelligence"""

    def __init__(
        self,
        file_path: str,
        client: Any,
        model: str = "prebuilt-document",
        headers: Optional[Dict] = None,
    ) -> None:
        """Initialize the object for file processing with Azure Document
        Intelligence (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be
        used for parsing files using the Azure Document Intelligence API. The
        load method generates one Document node per page, including metadata
        (source blob and page number).

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """
        self.parser = DocumentIntelligenceParser(client=client, model=model)
        super().__init__(file_path, headers=headers)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load given path as pages."""
        # Everything is delegated to the Azure-backed parser.
        yield from self.parser.parse(
            Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        )
# Legacy: only for backwards compatibility. Use PyPDFLoader instead PagedPDFSplitter = PyPDFLoader