# Source code for langchain_community.document_transformers.openai_functions
"""使用OpenAI Functions模型的文档转换器"""
from typing import Any, Dict, Optional, Sequence, Type, Union
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel):
    """Extract metadata tags from document contents using OpenAI functions.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import OpenAIMetadataTagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
            tagging_chain = create_tagging_chain(schema, llm)
            document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501

    tagging_chain: Any
    """The chain used to extract metadata from each document."""

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Automatically extract and populate metadata
        for each document according to the provided schema.

        Args:
            documents: The documents to tag.

        Returns:
            New ``Document`` objects with the same ``page_content`` and the
            extracted metadata merged into each document's existing metadata.
            Pre-existing metadata keys take precedence over extracted ones.
        """
        new_documents = []

        for document in documents:
            extracted_metadata: Dict = self.tagging_chain.run(document.page_content)  # type: ignore[assignment]  # noqa: E501
            # Original metadata is spread last so it wins on key collisions.
            new_document = Document(
                page_content=document.page_content,
                metadata={**extracted_metadata, **document.metadata},
            )
            new_documents.append(new_document)
        return new_documents

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronous tagging is not supported.

        Raises:
            NotImplementedError: Always; use :meth:`transform_documents`.
        """
        raise NotImplementedError
def create_metadata_tagger(
    metadata_schema: Union[Dict[str, Any], Type[BaseModel]],
    llm: BaseLanguageModel,
    prompt: Optional[ChatPromptTemplate] = None,
    *,
    tagging_chain_kwargs: Optional[Dict] = None,
) -> OpenAIMetadataTagger:
    """Create a DocumentTransformer that automatically tags documents with
    metadata based on their content and an input schema, using an
    OpenAI-functions tagging chain.

    Args:
        metadata_schema: Either a dictionary or pydantic.BaseModel class. If a
            dictionary is passed in, it's assumed to already be a valid
            JsonSchema. For best results, pydantic.BaseModels should have
            docstrings describing what the schema represents and descriptions
            for the parameters.
        llm: The language model to use, assumed to support the OpenAI
            function-calling API. Defaults to use "gpt-3.5-turbo-0613".
        prompt: BasePromptTemplate to pass to the model.
        tagging_chain_kwargs: Additional keyword arguments forwarded to
            ``create_tagging_chain``.

    Returns:
        An ``OpenAIMetadataTagger`` document transformer wrapping the
        tagging chain built from the given schema and model.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import create_metadata_tagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

            document_transformer = create_metadata_tagger(schema, llm)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501
    # Imported locally so langchain_community does not hard-depend on
    # langchain at module import time.
    from langchain.chains.openai_functions import create_tagging_chain

    # Normalize a pydantic model class into its JsonSchema dict form.
    metadata_schema = (
        metadata_schema
        if isinstance(metadata_schema, dict)
        else metadata_schema.schema()
    )
    _tagging_chain_kwargs = tagging_chain_kwargs or {}
    tagging_chain = create_tagging_chain(
        metadata_schema, llm, prompt=prompt, **_tagging_chain_kwargs
    )
    return OpenAIMetadataTagger(tagging_chain=tagging_chain)