# Source code for langchain_community.document_transformers.openai_functions

"""使用OpenAI Functions模型的文档转换器"""
from typing import Any, Dict, Optional, Sequence, Type, Union

from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel


class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel):
    """Extract metadata tags from document contents using OpenAI functions.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import OpenAIMetadataTagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": {"type": "string"},
                    "critic": {"type": "string"},
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"],
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie",
                    },
                },
                "required": ["movie_title", "critic", "tone"],
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
            tagging_chain = create_tagging_chain(schema, llm)
            document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain)
            original_documents = [
                Document(
                    page_content="Review of The Bee Movie By Roger Ebert "
                    "This is the greatest movie ever made. 4 out of 5 stars."
                ),
                Document(
                    page_content="Review of The Godfather By Anonymous "
                    "This movie was super boring. 1 out of 5 stars.",
                    metadata={"reliable": False},
                ),
            ]

            enhanced_documents = document_transformer.transform_documents(
                original_documents
            )
    """  # noqa: E501

    tagging_chain: Any
    """The chain used to extract metadata from each document."""

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Automatically extract and populate metadata
        for each document according to the provided schema."""
        tagged_docs = []
        for doc in documents:
            # Run the tagging chain over the raw text to get a metadata dict.
            tags: Dict = self.tagging_chain.run(doc.page_content)  # type: ignore[assignment]  # noqa: E501
            # On key collisions, the document's pre-existing metadata wins
            # over the extracted tags.
            tagged_docs.append(
                Document(
                    page_content=doc.page_content,
                    metadata={**tags, **doc.metadata},
                )
            )
        return tagged_docs

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        # Async transformation is not supported by this transformer.
        raise NotImplementedError
def create_metadata_tagger(
    metadata_schema: Union[Dict[str, Any], Type[BaseModel]],
    llm: BaseLanguageModel,
    prompt: Optional[ChatPromptTemplate] = None,
    *,
    tagging_chain_kwargs: Optional[Dict] = None,
) -> OpenAIMetadataTagger:
    """Create a DocumentTransformer that automatically tags documents with
    metadata based on their content and an input schema, using an OpenAI
    functions chain.

    Args:
        metadata_schema: Either a dictionary or pydantic.BaseModel class.
            If a dictionary is passed in, it is assumed to already be a valid
            JsonSchema. For best results, pydantic.BaseModels should have
            docstrings describing what the schema represents and descriptions
            for the parameters.
        llm: Language model to use, assumed to support the OpenAI
            function-calling API. Defaults to use "gpt-3.5-turbo-0613".
        prompt: BasePromptTemplate to pass to the model.
        tagging_chain_kwargs: Additional keyword arguments forwarded to
            ``create_tagging_chain``.

    Returns:
        An ``OpenAIMetadataTagger`` document transformer wrapping the tagging
        chain built from the given schema and model.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import create_metadata_tagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": {"type": "string"},
                    "critic": {"type": "string"},
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"],
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie",
                    },
                },
                "required": ["movie_title", "critic", "tone"],
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

            document_transformer = create_metadata_tagger(schema, llm)
            original_documents = [
                Document(page_content="Review of The Bee Movie By Roger Ebert This is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather By Anonymous This movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501
    # Imported lazily to avoid a hard dependency on `langchain` at module
    # import time.
    from langchain.chains.openai_functions import create_tagging_chain

    # Accept either a ready-made JsonSchema dict or a pydantic model class;
    # the latter is converted via its (pydantic v1) `.schema()` method.
    metadata_schema = (
        metadata_schema
        if isinstance(metadata_schema, dict)
        else metadata_schema.schema()
    )
    _tagging_chain_kwargs = tagging_chain_kwargs or {}
    tagging_chain = create_tagging_chain(
        metadata_schema, llm, prompt=prompt, **_tagging_chain_kwargs
    )
    return OpenAIMetadataTagger(tagging_chain=tagging_chain)