Source code for langchain_community.retrievers.google_cloud_documentai_warehouse

"""Retriever wrapper for Google Cloud Document AI Warehouse."""

from typing import TYPE_CHECKING, Any, Dict, List, Optional

from langchain_core._api.deprecation import deprecated
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever
from langchain_core.utils import get_from_dict_or_env

from langchain_community.utilities.vertexai import get_client_info

if TYPE_CHECKING:
    from google.cloud.contentwarehouse_v1 import (
        DocumentServiceClient,
        RequestMetadata,
        SearchDocumentsRequest,
    )
    from google.cloud.contentwarehouse_v1.services.document_service.pagers import (
        SearchDocumentsPager,
    )


@deprecated(
    since="0.0.32",
    removal="0.3.0",
    alternative_import="langchain_google_community.DocumentAIWarehouseRetriever",
)
class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
    """Retriever backed by Google Cloud Document AI Warehouse.

    Documents are expected to be created and uploaded in a separate flow;
    this retriever only searches for relevant documents, optionally
    restricted to the Document AI Warehouse schema given by ``schema_id``.

    Every search must be issued on behalf of a user: callers have to pass a
    ``user_ldap`` keyword argument, otherwise a ``ValueError`` is raised.

    More info: https://cloud.google.com/document-ai-warehouse.
    """

    location: str = "us"
    """Google Cloud location where Document AI Warehouse is placed."""
    project_number: str
    """Google Cloud project number; should contain digits only."""
    schema_id: Optional[str] = None
    """Document AI Warehouse schema to query against.
    If nothing is provided, all documents in the project are searched."""
    qa_size_limit: int = 5
    """Limit on the number of documents returned (forwarded as the
    ``qa_size_limit`` field of the search request)."""
    client: "DocumentServiceClient" = None  #: :meta private:

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and build the service client.

        Resolves ``project_number`` from ``values`` or the ``PROJECT_NUMBER``
        environment variable and stores a freshly created
        ``DocumentServiceClient`` under ``values["client"]``.

        Args:
            values: Field values collected by pydantic.

        Returns:
            The same mapping with ``project_number`` and ``client`` populated.

        Raises:
            ImportError: If ``google-cloud-contentwarehouse`` is not installed.
        """
        try:
            # Imported lazily so the dependency stays optional at module import.
            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
        except ImportError as exc:
            # The trailing space matters: adjacent literals are concatenated
            # verbatim, and without it the two sentences run together.
            raise ImportError(
                "google.cloud.contentwarehouse is not installed. "
                "Please install it with pip install google-cloud-contentwarehouse"
            ) from exc

        values["project_number"] = get_from_dict_or_env(
            values, "project_number", "PROJECT_NUMBER"
        )
        values["client"] = DocumentServiceClient(
            client_info=get_client_info(module="document-ai-warehouse")
        )
        return values

    def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
        """Build the request metadata identifying the calling user.

        Args:
            user_ldap: User identifier; sent to the service as
                ``user:<user_ldap>``.

        Returns:
            A ``RequestMetadata`` carrying the corresponding ``UserInfo``.
        """
        from google.cloud.contentwarehouse_v1 import RequestMetadata, UserInfo

        user_info = UserInfo(id=f"user:{user_ldap}")
        return RequestMetadata(user_info=user_info)

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any
    ) -> List[Document]:
        """Search Document AI Warehouse and return the matching documents.

        Args:
            query: Query text, sent as a natural-language query.
            run_manager: Callback manager for this retriever run (not used
                by this implementation).
            **kwargs: Must contain ``user_ldap``.

        Returns:
            One ``Document`` per matching document in the search response.

        Raises:
            ValueError: If ``user_ldap`` is missing from ``kwargs``.
        """
        request = self._prepare_search_request(query, **kwargs)
        response = self.client.search_documents(request=request)
        return self._parse_search_response(response=response)

    def _prepare_search_request(
        self, query: str, **kwargs: Any
    ) -> "SearchDocumentsRequest":
        """Assemble the ``SearchDocumentsRequest`` for ``query``.

        Args:
            query: Query text, sent with ``is_nl_query=True``.
            **kwargs: Must contain ``user_ldap``.

        Returns:
            The request scoped to this retriever's project and location,
            restricted to ``schema_id`` when one is configured.

        Raises:
            ValueError: If ``user_ldap`` is missing from ``kwargs``.
        """
        from google.cloud.contentwarehouse_v1 import (
            DocumentQuery,
            SearchDocumentsRequest,
        )

        try:
            user_ldap = kwargs["user_ldap"]
        except KeyError:
            # ``from None`` hides the internal KeyError: the caller's mistake
            # is the missing argument, not a dictionary lookup.
            raise ValueError("Argument user_ldap should be provided!") from None

        request_metadata = self._prepare_request_metadata(user_ldap=user_ldap)

        # An empty schema list leaves the query unrestricted by schema.
        schemas = []
        if self.schema_id:
            schemas.append(
                self.client.document_schema_path(
                    project=self.project_number,
                    location=self.location,
                    document_schema=self.schema_id,
                )
            )

        return SearchDocumentsRequest(
            parent=self.client.common_location_path(self.project_number, self.location),
            request_metadata=request_metadata,
            document_query=DocumentQuery(
                query=query, is_nl_query=True, document_schema_names=schemas
            ),
            qa_size_limit=self.qa_size_limit,
        )

    def _parse_search_response(
        self, response: "SearchDocumentsPager"
    ) -> List[Document]:
        """Convert a search response into LangChain documents.

        Args:
            response: Search result exposing ``matching_documents``.

        Returns:
            One ``Document`` per match, with the search text snippet as
            ``page_content`` and ``title`` / ``source`` (the raw document
            path) as metadata.
        """
        return [
            Document(
                page_content=match.search_text_snippet,
                metadata={
                    "title": match.document.title,
                    "source": match.document.raw_document_path,
                },
            )
            for match in response.matching_documents
        ]