import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
List,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse
import requests
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
if TYPE_CHECKING:
from textractor.data.text_linearization_config import TextLinearizationConfig
# Module-level logger. It is named after the importable module (``__name__``)
# rather than the file path so that it takes part in the dotted logger
# hierarchy and can be configured through its parent package logger.
logger = logging.getLogger(__name__)
class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Load `PDF` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    """

    def _get_elements(self) -> List:
        """Partition the PDF at ``self.file_path`` using ``partition_pdf``."""
        # Imported lazily so `unstructured` is only needed when actually used.
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(
            filename=self.file_path,
            **self.unstructured_kwargs,
        )
        return elements
class BasePDFLoader(BaseLoader, ABC):
    """Base loader class for `PDF` files.

    If the file is a web path, it is downloaded to a temporary file which is
    used for loading; the temporary directory is cleaned up when the loader
    is garbage collected.
    """

    def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Local, S3 or web path to the PDF file.
            headers: Headers to use for the GET request that downloads a file
                from a web path.

        Raises:
            ValueError: If the download does not return status code 200, or
                if ``file_path`` is neither an existing file nor a valid URL.
        """
        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path or S3, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            self.temp_dir = tempfile.TemporaryDirectory()
            _, suffix = os.path.splitext(self.file_path)
            if self._is_s3_presigned_url(self.file_path):
                # For presigned URLs the last path component is used in place
                # of the extension computed above.
                suffix = urlparse(self.file_path).path.split("/")[-1]
            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
            self.web_path = self.file_path
            # `s3://` URLs are not downloaded here; `file_path` keeps the URL.
            if not self._is_s3_url(self.file_path):
                r = requests.get(self.file_path, headers=self.headers)
                if r.status_code != 200:
                    raise ValueError(
                        "Check the url of your file; returned status code %s"
                        % r.status_code
                    )

                with open(temp_pdf, mode="wb") as f:
                    f.write(r.content)
                self.file_path = str(temp_pdf)
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        # `temp_dir` only exists when a remote path was given.
        if hasattr(self, "temp_dir"):
            self.temp_dir.cleanup()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid (has both a scheme and a netloc)."""
        # `urlparse` raises ValueError on malformed input (for example an
        # unbalanced IPv6 bracket). Treat that as "not a URL", like the
        # sibling S3 checks do, so the caller reports its own error message.
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _is_s3_url(url: str) -> bool:
        """Check if the url uses the ``s3`` scheme and has a netloc."""
        try:
            result = urlparse(url)
            if result.scheme == "s3" and result.netloc:
                return True
            return False
        except ValueError:
            return False

    @staticmethod
    def _is_s3_presigned_url(url: str) -> bool:
        """Check if the url is a presigned S3 url.

        The check is that the netloc ends with ``.s3.amazonaws.com``.
        """
        try:
            result = urlparse(url)
            return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc))
        except ValueError:
            return False

    @property
    def source(self) -> str:
        """Return the original web path if there is one, else the file path."""
        return self.web_path if self.web_path is not None else self.file_path
class OnlinePDFLoader(BasePDFLoader):
    """Load online `PDF`."""

    def load(self) -> List[Document]:
        """Load documents by delegating to `UnstructuredPDFLoader`."""
        return UnstructuredPDFLoader(str(self.file_path)).load()
class PyPDFLoader(BasePDFLoader):
    """Load a PDF file into a list of documents using pypdf.

    The loader chunks by page and stores page numbers in metadata.
    """

    def __init__(
        self,
        file_path: str,
        password: Optional[Union[str, bytes]] = None,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path or URL of the PDF, forwarded to the base class.
            password: Forwarded to `PyPDFParser`.
            headers: Headers forwarded to the base class for web downloads.
            extract_images: Forwarded to `PyPDFParser`.

        Raises:
            ImportError: If `pypdf` is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with " "`pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(password=password, extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load the given path as pages."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # immediately instead of lingering until garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFium2Loader(BasePDFLoader):
    """Load `PDF` using `pypdfium2` and chunk at character level."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path or URL of the PDF, forwarded to the base class.
            headers: Headers forwarded to the base class for web downloads.
            extract_images: Forwarded to `PyPDFium2Parser`.
        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFium2Parser(extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load the given path as pages."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # immediately instead of lingering until garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFDirectoryLoader(BaseLoader):
    """Load a directory with `PDF` files using `pypdf` and chunk at character level.

    The loader also stores page numbers in metadata.
    """

    def __init__(
        self,
        path: Union[str, Path],
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
        extract_images: bool = False,
    ):
        """Store the directory path and the traversal/loading options.

        Args:
            path: Directory to search.
            glob: Glob pattern used to select files under ``path``.
            silent_errors: If True, log loading errors as warnings instead of
                raising them.
            load_hidden: If True, also load files whose relative path has a
                component starting with ".".
            recursive: If True, search with ``rglob`` instead of ``glob``.
            extract_images: Forwarded to each `PyPDFLoader`.
        """
        self.path = path
        self.glob = glob
        self.silent_errors = silent_errors
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.extract_images = extract_images

    @staticmethod
    def _is_visible(path: Path) -> bool:
        """Return True when no component of ``path`` starts with a dot."""
        for part in path.parts:
            if part.startswith("."):
                return False
        return True

    def load(self) -> List[Document]:
        """Load every matching PDF under ``self.path`` into one list."""
        root = Path(self.path)
        collected = []
        if self.recursive:
            matches = root.rglob(self.glob)
        else:
            matches = root.glob(self.glob)

        for candidate in matches:
            if not candidate.is_file():
                continue
            # Hidden entries are skipped unless explicitly requested.
            if not (self._is_visible(candidate.relative_to(root)) or self.load_hidden):
                continue
            try:
                pdf_loader = PyPDFLoader(
                    str(candidate), extract_images=self.extract_images
                )
                pages = pdf_loader.load()
                for page in pages:
                    page.metadata["source"] = str(candidate)
                collected.extend(pages)
            except Exception as e:
                if not self.silent_errors:
                    raise e
                logger.warning(e)
        return collected
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path or URL of the PDF, forwarded to the base class.
            headers: Headers forwarded to the base class for web downloads.
            extract_images: Whether to extract images from the PDF.
            concatenate_pages: If True, concatenate all PDF pages into one
                single document. Otherwise, return one document per page.

        Raises:
            ImportError: If `pdfminer.six` is not installed.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)
        self.parser = PDFMinerParser(
            extract_images=extract_images, concatenate_pages=concatenate_pages
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # immediately instead of lingering until garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Load `PDF` files as HTML content using `PDFMiner`."""

    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Raises:
            ImportError: If `pdfminer.six` is not installed.
        """
        # Fail early, at construction time, when pdfminer is unavailable.
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)

    def lazy_load(self) -> Iterator[Document]:
        """Load the file and yield a single document holding the HTML output."""
        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams
        from pdfminer.utils import open_filename

        html_buffer = StringIO()
        with open_filename(self.file_path, "rb") as pdf_fp:
            extract_text_to_fp(
                pdf_fp,
                html_buffer,
                codec="",
                laparams=LAParams(),
                output_type="html",
            )

        # Report the original web path as the source when there is one.
        if self.web_path is None:
            origin = self.file_path
        else:
            origin = self.web_path
        yield Document(
            page_content=html_buffer.getvalue(), metadata={"source": origin}
        )
class PyMuPDFLoader(BasePDFLoader):
    """Load `PDF` files using `PyMuPDF`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path or URL of the PDF, forwarded to the base class.
            headers: Headers forwarded to the base class for web downloads.
            extract_images: Forwarded to `PyMuPDFParser`.
            **kwargs: Stored and passed to `PyMuPDFParser` as ``text_kwargs``.

        Raises:
            ImportError: If `PyMuPDF` is not installed.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ImportError(
                "`PyMuPDF` package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path, headers=headers)
        self.extract_images = extract_images
        self.text_kwargs = kwargs

    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
        """Parse the PDF lazily; runtime ``kwargs`` override the stored ones."""
        if kwargs:
            logger.warning(
                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                f" is deprecated. Please pass arguments during initialization instead."
            )

        text_kwargs = {**self.text_kwargs, **kwargs}
        parser = PyMuPDFParser(
            text_kwargs=text_kwargs, extract_images=self.extract_images
        )
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # immediately instead of lingering until garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from parser.lazy_parse(blob)

    def load(self, **kwargs: Any) -> List[Document]:
        """Load all documents eagerly; passing ``kwargs`` here is deprecated."""
        return list(self._lazy_load(**kwargs))

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield the parsed documents."""
        yield from self._lazy_load()
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
    """Load `PDF` files using the `Mathpix` service."""

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "md",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        extra_request_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: A file for loading.
            processed_file_format: A format of the processed file.
                Default is "md".
            max_wait_time_seconds: A maximum time to wait for the response
                from the server. Default is 500.
            should_clean_pdf: A flag to clean the PDF file. Default is False.
            extra_request_data: Additional request data.
            **kwargs: Additional keyword arguments. ``mathpix_api_key`` and
                ``mathpix_api_id`` are consumed here; the rest is forwarded
                to the base class.
        """
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )

        # The base class isn't expecting these and doesn't collect **kwargs
        kwargs.pop("mathpix_api_key", None)
        kwargs.pop("mathpix_api_id", None)

        super().__init__(file_path, **kwargs)
        self.processed_file_format = processed_file_format
        self.extra_request_data = (
            extra_request_data if extra_request_data is not None else {}
        )
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    @property
    def _mathpix_headers(self) -> Dict[str, str]:
        """Authentication headers sent with every Mathpix request."""
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

    @property
    def url(self) -> str:
        """Base URL of the Mathpix PDF endpoint."""
        return "https://api.mathpix.com/v3/pdf"

    @property
    def data(self) -> dict:
        """Form data for the upload request, with options JSON-encoded."""
        options = {
            "conversion_formats": {self.processed_file_format: True},
            **self.extra_request_data,
        }
        return {"options_json": json.dumps(options)}

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix and return the ``pdf_id`` it assigns.

        Raises:
            ValueError: If the response contains an ``error`` entry or does
                not contain a ``pdf_id``.
        """
        with open(self.file_path, "rb") as f:
            files = {"file": f}
            response = requests.post(
                self.url, headers=self._mathpix_headers, files=files, data=self.data
            )
        response_data = response.json()
        if "error" in response_data:
            raise ValueError(f"Mathpix request failed: {response_data['error']}")
        if "pdf_id" in response_data:
            pdf_id = response_data["pdf_id"]
            return pdf_id
        else:
            raise ValueError("Unable to send PDF to Mathpix.")

    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete.

        The status endpoint is polled once per 5-second step of
        ``max_wait_time_seconds``.

        Args:
            pdf_id: a PDF id.

        Returns: None

        Raises:
            ValueError: If the response reports an error or an "error" status.
            TimeoutError: If no poll reports the "completed" status.
        """
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self._mathpix_headers)
            response_data = response.json()

            # This indicates an error with the request (e.g. auth problems)
            error = response_data.get("error", None)
            error_info = response_data.get("error_info", None)

            if error is not None:
                error_msg = f"Unable to retrieve PDF from Mathpix: {error}"

                if error_info is not None:
                    error_msg += f" ({error_info['id']})"

                raise ValueError(error_msg)

            status = response_data.get("status", None)

            if status == "completed":
                return
            elif status == "error":
                # This indicates an error with the PDF processing
                raise ValueError("Unable to retrieve PDF from Mathpix")
            else:
                print(f"Status: {status}, waiting for processing to complete")  # noqa: T201
                time.sleep(5)
        # Give the caller something actionable instead of a bare exception.
        raise TimeoutError(
            f"Mathpix did not finish processing PDF {pdf_id} within "
            f"max_wait_time_seconds={self.max_wait_time_seconds}"
        )

    def get_processed_pdf(self, pdf_id: str) -> str:
        """Wait for processing, then download the result as UTF-8 text."""
        self.wait_for_processing(pdf_id)
        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
        response = requests.get(url, headers=self._mathpix_headers)
        return response.content.decode("utf-8")

    def clean_pdf(self, contents: str) -> str:
        """Clean the processed PDF contents.

        Drops lines starting with ``![]``, rewrites ``\\section{Title}`` as
        ``# Title`` and removes the backslash escapes in front of ``$``,
        ``%``, ``(`` and ``)``.

        Args:
            contents: the processed contents of a PDF file.

        Returns:
            The cleaned contents.
        """
        contents = "\n".join(
            [line for line in contents.split("\n") if not line.startswith("![]")]
        )
        # replace \section{Title} with # Title
        # Only the braces belonging to the \section command are removed (one
        # level of nested braces inside the title is tolerated). Stripping
        # every "}" in the document would corrupt unrelated markup.
        contents = re.sub(
            r"\\section\{((?:[^{}]|\{[^{}]*\})*)\}", r"# \1", contents
        )
        # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
        contents = (
            contents.replace(r"\$", "$")
            .replace(r"\%", "%")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        return contents

    def load(self) -> List[Document]:
        """Send the PDF to Mathpix and return the result as one document."""
        pdf_id = self.send_pdf()
        contents = self.get_processed_pdf(pdf_id)
        if self.should_clean_pdf:
            contents = self.clean_pdf(contents)
        metadata = {"source": self.source, "file_path": self.source, "pdf_id": pdf_id}
        return [Document(page_content=contents, metadata=metadata)]
class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path or URL of the PDF, forwarded to the base class.
            text_kwargs: Forwarded to `PDFPlumberParser`; defaults to ``{}``.
            dedupe: Forwarded to `PDFPlumberParser`.
            headers: Headers forwarded to the base class for web downloads.
            extract_images: Forwarded to `PDFPlumberParser`.

        Raises:
            ImportError: If `pdfplumber` is not installed.
        """
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ImportError(
                "pdfplumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path, headers=headers)
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def load(self) -> List[Document]:
        """Load the file."""
        parser = PDFPlumberParser(
            text_kwargs=self.text_kwargs,
            dedupe=self.dedupe,
            extract_images=self.extract_images,
        )
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # immediately instead of lingering until garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        return parser.parse(blob)
class DocumentIntelligenceLoader(BasePDFLoader):
    """Load a PDF with Azure Document Intelligence."""

    def __init__(
        self,
        file_path: str,
        client: Any,
        model: str = "prebuilt-document",
        headers: Optional[Dict] = None,
    ) -> None:
        """Initialize the object for file processing with Azure Document
        Intelligence (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be
        used for parsing files using the Azure Document Intelligence API. The
        load method generates a Document node including metadata (source blob
        and page number) for each page.

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob.
        model : str
            The model name or ID to be used for form recognition in Azure.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """
        # The parser is built before the base initializer runs.
        self.parser = DocumentIntelligenceParser(client=client, model=model)
        super().__init__(file_path, headers=headers)

    def load(self) -> List[Document]:
        """Load the given path as pages."""
        return [page for page in self.lazy_load()]

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load the given path as pages."""
        source_blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        pages = self.parser.parse(source_blob)
        yield from pages
# Legacy alias: kept only for backwards compatibility with code that still
# refers to the old name. Use PyPDFLoader instead.
PagedPDFSplitter = PyPDFLoader