# Source code for langchain.evaluation.schema

"""Interfaces to be implemented by general evaluators."""
from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Optional, Sequence, Tuple, Union
from warnings import warn

from langchain_core.agents import AgentAction
from langchain_core.language_models import BaseLanguageModel
from langchain_core.runnables.config import run_in_executor

from langchain.chains.base import Chain

logger = logging.getLogger(__name__)


class EvaluatorType(str, Enum):
    """The types of the evaluators.

    Each member is also a ``str``, so it compares equal to its string value.
    """

    QA = "qa"
    """Question answering evaluator, which grades answers to questions
    directly using an LLM."""
    COT_QA = "cot_qa"
    """Chain of thought question answering evaluator, which grades answers to
    questions using chain of thought 'reasoning'."""
    CONTEXT_QA = "context_qa"
    """Question answering evaluator that incorporates 'context' in the
    response."""
    PAIRWISE_STRING = "pairwise_string"
    """The pairwise string evaluator, which predicts the preferred prediction
    from between two models."""
    SCORE_STRING = "score_string"
    """The scored string evaluator, which gives a score between 1 and 10 to a
    prediction."""
    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """The labeled pairwise string evaluator, which predicts the preferred
    prediction from between two models based on a ground truth reference
    label."""
    LABELED_SCORE_STRING = "labeled_score_string"
    """The labeled scored string evaluator, which gives a score between 1 and
    10 to a prediction based on a ground truth reference label."""
    AGENT_TRAJECTORY = "trajectory"
    """The agent trajectory evaluator, which grades the agent's intermediate
    steps."""
    CRITERIA = "criteria"
    """The criteria evaluator, which evaluates a model based on a custom set
    of criteria without any reference labels."""
    LABELED_CRITERIA = "labeled_criteria"
    """The labeled criteria evaluator, which evaluates a model based on a
    custom set of criteria, with a reference label."""
    STRING_DISTANCE = "string_distance"
    """Compare predictions to a reference answer using string edit
    distances."""
    EXACT_MATCH = "exact_match"
    """Compare predictions to a reference answer using exact matching."""
    REGEX_MATCH = "regex_match"
    """Compare predictions to a reference answer using regular expressions."""
    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
    """Compare predictions based on string edit distances."""
    EMBEDDING_DISTANCE = "embedding_distance"
    """Compare a prediction to a reference label using embedding distance."""
    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
    """Compare two predictions using embedding distance."""
    JSON_VALIDITY = "json_validity"
    """Check if a prediction is valid JSON."""
    JSON_EQUALITY = "json_equality"
    """Check if a prediction is equal to a reference JSON."""
    JSON_EDIT_DISTANCE = "json_edit_distance"
    """Compute the edit distance between two JSON strings after
    canonicalization."""
    JSON_SCHEMA_VALIDATION = "json_schema_validation"
    """Check if a prediction is valid JSON according to a JSON schema."""


class LLMEvalChain(Chain):
    """A base class for evaluators that use an LLM."""

    @classmethod
    @abstractmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> LLMEvalChain:
        """Create a new evaluator from an LLM.

        Args:
            llm: The language model the evaluator is built from.
            **kwargs: Additional keyword arguments; their meaning is left to
                the implementing subclass.

        Returns:
            LLMEvalChain: The newly created evaluator.
        """


class _EvalArgsMixin:
    """Mixin that validates the ``reference`` and ``input`` evaluation arguments."""

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return False

    @property
    def _skip_input_warning(self) -> str:
        """Warning text to show when a supplied input is ignored."""
        evaluator_name = self.__class__.__name__
        return f"Ignoring input in {evaluator_name}, as it is not expected."

    @property
    def _skip_reference_warning(self) -> str:
        """Warning text to show when a supplied reference is ignored."""
        evaluator_name = self.__class__.__name__
        return f"Ignoring reference in {evaluator_name}, as it is not expected."

    def _check_evaluation_args(
        self,
        reference: Optional[str] = None,
        input: Optional[str] = None,
    ) -> None:
        """Check that the evaluation arguments are valid.

        Args:
            reference (Optional[str], optional): The reference label.
            input (Optional[str], optional): The input string.

        Raises:
            ValueError: If the evaluator requires an input string but none is
                provided, or if the evaluator requires a reference label but
                none is provided.
        """
        # The input is handled before the reference, so when both are missing
        # and required, the input error is the one that surfaces.
        if input is None:
            if self.requires_input:
                raise ValueError(
                    f"{self.__class__.__name__} requires an input string."
                )
        elif not self.requires_input:
            # A value was supplied that this evaluator does not use.
            warn(self._skip_input_warning)

        if reference is None:
            if self.requires_reference:
                raise ValueError(
                    f"{self.__class__.__name__} requires a reference string."
                )
        elif not self.requires_reference:
            warn(self._skip_reference_warning)


class StringEvaluator(_EvalArgsMixin, ABC):
    """Grade, tag, or otherwise evaluate predictions relative to their inputs
    and/or reference labels."""

    @property
    def evaluation_name(self) -> str:
        """The name of the evaluation (the class name by default)."""
        return self.__class__.__name__

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @abstractmethod
    def _evaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            **kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following
                keys:
                     - score: the score of the evaluation, if applicable.
                     - value: the string value of the evaluation, if
                       applicable.
                     - reasoning: the reasoning for the evaluation, if
                       applicable.
        """

    async def _aevaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional
        input and label.

        This default implementation dispatches the synchronous
        ``_evaluate_strings`` through ``run_in_executor``.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            **kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following
                keys:
                     - score: the score of the evaluation, if applicable.
                     - value: the string value of the evaluation, if
                       applicable.
                     - reasoning: the reasoning for the evaluation, if
                       applicable.
        """
        return await run_in_executor(
            None,
            self._evaluate_strings,
            prediction=prediction,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            **kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )

    async def aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional
        input and label.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            **kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )


class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
    """Compare the output of two models (or two outputs of the same model)."""

    @abstractmethod
    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        This default implementation dispatches the synchronous
        ``_evaluate_string_pairs`` through ``run_in_executor``.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        return await run_in_executor(
            None,
            self._evaluate_string_pairs,
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )


class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
    """Interface for evaluating agent trajectories."""

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return True

    @abstractmethod
    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation result.
        """

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        This default implementation dispatches the synchronous
        ``_evaluate_agent_trajectory`` through ``run_in_executor``.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation result.
        """
        return await run_in_executor(
            None,
            self._evaluate_agent_trajectory,
            prediction=prediction,
            agent_trajectory=agent_trajectory,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )

    async def aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        The ``reference`` and ``input`` arguments are validated with
        ``_check_evaluation_args`` before the evaluation runs.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )