# Source code for langchain_community.document_transformers.doctran_text_extract

from typing import Any, List, Optional, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.utils import get_from_env


class DoctranPropertyExtractor(BaseDocumentTransformer):
    """Extract properties from text documents using doctran.

    Arguments:
        properties: A list of the properties to extract. Each entry is a
            dict that is unpacked as keyword arguments into doctran's
            ``ExtractProperty``.
        openai_api_key: OpenAI API key. Can also be specified via the
            environment variable ``OPENAI_API_KEY``.
        openai_api_model: OpenAI model name handed to doctran. When omitted
            it is looked up through ``get_from_env`` using the environment
            variable ``OPENAI_API_MODEL``.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import DoctranPropertyExtractor

            properties = [
                {
                    "name": "category",
                    "description": "What type of email this is.",
                    "type": "string",
                    "enum": ["update", "action_item", "customer_feedback", "announcement", "other"],
                    "required": True,
                },
                {
                    "name": "mentions",
                    "description": "A list of all people mentioned in this email.",
                    "type": "array",
                    "items": {
                        "name": "full_name",
                        "description": "The full name of the person mentioned.",
                        "type": "string",
                    },
                    "required": True,
                },
                {
                    "name": "eli5",
                    "description": "Explain this email to me like I'm 5 years old.",
                    "type": "string",
                    "required": True,
                },
            ]

            # Pass in openai_api_key or set env var OPENAI_API_KEY
            property_extractor = DoctranPropertyExtractor(properties)
            transformed_document = await property_extractor.atransform_documents(documents)
    """  # noqa: E501

    def __init__(
        self,
        properties: List[dict],
        openai_api_key: Optional[str] = None,
        openai_api_model: Optional[str] = None,
    ) -> None:
        """Store the property schemas and resolve the OpenAI credentials.

        Explicit arguments take precedence; a falsy argument (``None`` or an
        empty string) falls back to the ``get_from_env`` lookup.

        Args:
            properties: Property definitions to extract from each document.
            openai_api_key: OpenAI API key, or ``None`` to read
                ``OPENAI_API_KEY`` via ``get_from_env``.
            openai_api_model: OpenAI model name, or ``None`` to read
                ``OPENAI_API_MODEL`` via ``get_from_env``.
        """
        self.properties = properties
        self.openai_api_key = openai_api_key or get_from_env(
            "openai_api_key", "OPENAI_API_KEY"
        )
        self.openai_api_model = openai_api_model or get_from_env(
            "openai_api_model", "OPENAI_API_MODEL"
        )

    def _extract_properties(
        self, documents: Sequence[Document]
    ) -> Sequence[Document]:
        """Run doctran property extraction over ``documents`` in place.

        Shared implementation behind ``transform_documents`` and
        ``atransform_documents``, which previously duplicated this logic.

        Args:
            documents: Documents whose ``page_content`` is parsed by doctran.

        Returns:
            The same ``documents`` sequence that was passed in; each
            document's ``metadata["extracted_properties"]`` has been set to
            the value doctran reports for it.

        Raises:
            ImportError: If the ``doctran`` package cannot be imported.
        """
        # Only the import is guarded, so an ImportError raised while building
        # the client is not mislabelled as "doctran is not installed".
        try:
            from doctran import Doctran, ExtractProperty
        except ImportError as exc:
            raise ImportError(
                "Install doctran to use this parser. (pip install doctran)"
            ) from exc

        client = Doctran(
            openai_api_key=self.openai_api_key, openai_model=self.openai_api_model
        )
        # Build the schema objects once; they are reused for every document.
        schemas = [ExtractProperty(**spec) for spec in self.properties]
        for doc in documents:
            parsed = (
                client.parse(content=doc.page_content)
                .extract(properties=schemas)
                .execute()
            )
            doc.metadata["extracted_properties"] = parsed.extracted_properties
        return documents

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Extract properties from text documents using doctran.

        Args:
            documents: Documents to annotate; they are modified in place.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The input ``documents`` with ``metadata["extracted_properties"]``
            populated.

        Raises:
            ImportError: If the ``doctran`` package cannot be imported.
        """
        # NOTE(review): the extraction is invoked without ``await``, exactly
        # as before, so it presumably runs synchronously inside the
        # coroutine -- confirm against the installed doctran version.
        return self._extract_properties(documents)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Extract properties from text documents using doctran.

        Args:
            documents: Documents to annotate; they are modified in place.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The input ``documents`` with ``metadata["extracted_properties"]``
            populated.

        Raises:
            ImportError: If the ``doctran`` package cannot be imported.
        """
        return self._extract_properties(documents)