# Source code for langchain_community.document_transformers.openai_functions
"""使用OpenAI Functions模型的文档转换器"""
from typing import Any, Dict, Optional, Sequence, Type, Union
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel):
    """Extract metadata tags from document contents using OpenAI functions.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import OpenAIMetadataTagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
            tagging_chain = create_tagging_chain(schema, llm)
            document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501

    tagging_chain: Any
    """The chain used to extract metadata from each document."""

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Automatically extract and populate metadata
        for each document according to the provided schema.

        Args:
            documents: The documents to tag.

        Returns:
            New ``Document`` objects with the same ``page_content`` and the
            extracted metadata merged into each document's existing metadata.
            Pre-existing metadata keys take precedence over extracted ones.
        """
        new_documents = []

        for document in documents:
            extracted_metadata: Dict = self.tagging_chain.run(document.page_content)  # type: ignore[assignment]  # noqa: E501
            # Original metadata is spread last so it wins on key collisions.
            new_document = Document(
                page_content=document.page_content,
                metadata={**extracted_metadata, **document.metadata},
            )
            new_documents.append(new_document)
        return new_documents

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronous tagging is not supported.

        Raises:
            NotImplementedError: Always; use :meth:`transform_documents`.
        """
        raise NotImplementedError
def create_metadata_tagger(
    metadata_schema: Union[Dict[str, Any], Type[BaseModel]],
    llm: BaseLanguageModel,
    prompt: Optional[ChatPromptTemplate] = None,
    *,
    tagging_chain_kwargs: Optional[Dict] = None,
) -> OpenAIMetadataTagger:
    """Create a DocumentTransformer that automatically tags documents with
    metadata based on their content and an input schema, using an
    OpenAI-functions tagging chain.

    Args:
        metadata_schema: Either a dictionary or pydantic.BaseModel class. If a
            dictionary is passed in, it's assumed to already be a valid
            JsonSchema. For best results, pydantic.BaseModels should have
            docstrings describing what the schema represents and descriptions
            for the parameters.
        llm: The language model to use, assumed to support the OpenAI
            function-calling API. Defaults to use "gpt-3.5-turbo-0613".
        prompt: BasePromptTemplate to pass to the model.
        tagging_chain_kwargs: Additional keyword arguments forwarded to
            ``create_tagging_chain``.

    Returns:
        An ``OpenAIMetadataTagger`` document transformer wrapping the
        tagging chain built from the given schema and model.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import create_metadata_tagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

            document_transformer = create_metadata_tagger(schema, llm)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501
    # Imported locally so langchain_community does not hard-depend on
    # langchain at module import time.
    from langchain.chains.openai_functions import create_tagging_chain

    # Normalize a pydantic model class into its JsonSchema dict form.
    metadata_schema = (
        metadata_schema
        if isinstance(metadata_schema, dict)
        else metadata_schema.schema()
    )
    _tagging_chain_kwargs = tagging_chain_kwargs or {}
    tagging_chain = create_tagging_chain(
        metadata_schema, llm, prompt=prompt, **_tagging_chain_kwargs
    )
    return OpenAIMetadataTagger(tagging_chain=tagging_chain)