# Module: langchain_community.document_loaders.doc_intelligence

from typing import Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    AzureAIDocumentIntelligenceParser,
)


class AzureAIDocumentIntelligenceLoader(BaseLoader):
    """Load a file, given by local path or URL, using Azure AI Document Intelligence."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        file_path: Optional[str] = None,
        url_path: Optional[str] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        *,
        analysis_features: Optional[List[str]] = None,
    ) -> None:
        """Initialize the loader with Azure Document Intelligence settings.

        Azure Document Intelligence was formerly known as Form Recognizer.
        This constructor creates an AzureAIDocumentIntelligenceParser that is
        used to parse the file through the Azure Document Intelligence API.
        The content representation of the documents produced by loading is
        determined by the ``mode`` parameter.

        Parameters:
        -----------
        api_endpoint: str
            The API endpoint used to construct the DocumentIntelligenceClient.
        api_key: str
            The API key used to construct the DocumentIntelligenceClient.
        file_path : Optional[str]
            The path to the file that needs to be loaded.
            Either file_path or url_path must be specified.
        url_path : Optional[str]
            The URL of the file that needs to be loaded.
            Either file_path or url_path must be specified.
        api_version: Optional[str]
            The API version for the DocumentIntelligenceClient. Set to None to
            use the default value from the `azure-ai-documentintelligence`
            package.
        api_model: str
            Unique document model name. Default value is "prebuilt-layout".
            Note that overriding this default may result in unsupported
            behavior.
        mode: str
            The type of content representation of the generated documents.
            Use one of "single", "page", or "markdown".
            Default value is "markdown".
        analysis_features: Optional[List[str]]
            List of optional analysis features; each feature should be passed
            as a str that conforms to the enum `DocumentAnalysisFeature` in
            the `azure-ai-documentintelligence` package.
            Default value is None.

        Raises:
        -------
        AssertionError
            If neither file_path nor url_path is provided.

        Examples:
        ---------
        >>> obj = AzureAIDocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     api_endpoint="https://endpoint.azure.com",
        ...     api_key="APIKEY",
        ...     api_version="2023-10-31-preview",
        ...     api_model="prebuilt-layout",
        ...     mode="markdown"
        ... )
        """
        if file_path is None and url_path is None:
            # Raised explicitly rather than through an `assert` statement so
            # the check still runs under `python -O`. The exception type and
            # message are kept so existing callers observe the same error.
            raise AssertionError("file_path or url_path must be provided")
        self.file_path = file_path
        self.url_path = url_path

        self.parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
            api_endpoint=api_endpoint,
            api_key=api_key,
            api_version=api_version,
            api_model=api_model,
            mode=mode,
            analysis_features=analysis_features,
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily parse the configured source and yield the resulting documents.

        When ``file_path`` is set it takes precedence: the file is wrapped in
        a Blob and handed to the parser. Otherwise ``url_path`` is passed to
        the parser's ``parse_url`` method.
        """
        if self.file_path is not None:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
            yield from self.parser.parse(blob)
        else:
            # The constructor guarantees url_path is set when file_path is None.
            yield from self.parser.parse_url(self.url_path)  # type: ignore[arg-type]
# Source code for langchain_community.document_loaders.doc_intelligence