# Source code for langchain.smith.evaluation.string_run_evaluator

"""运行字符串评估器的评估器包装器。"""
from __future__ import annotations

from abc import abstractmethod
from typing import Any, Dict, List, Optional

from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain_core.load.dump import dumpd
from langchain_core.load.load import load
from langchain_core.load.serializable import Serializable
from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run

from langchain.chains.base import Chain
from langchain.evaluation.schema import StringEvaluator
from langchain.schema import RUN_KEY


def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
    """Convert a list of serialized message dicts into ``BaseMessage`` objects.

    Empty input yields an empty list. Messages carrying the ``"lc"`` marker are
    round-tripped through the langchain_core serializer; anything else is
    treated as the legacy message-dict format.
    """
    if not messages:
        return []
    # The "lc" key marks langchain_core-serialized payloads; assume the whole
    # list shares the format of the first entry.
    if "lc" in messages[0]:
        return [load(dumpd(msg)) for msg in messages]
    return messages_from_dict(messages)


class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

    @abstractmethod
    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""

    def __call__(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        # A run without outputs cannot be scored; fail loudly rather than
        # passing empty predictions downstream.
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return self.map(run)
class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""

    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        if isinstance(messages, list) and messages:
            head = messages[0]
            if isinstance(head, dict):
                chat_messages = _get_messages_from_run_dict(messages)
            elif isinstance(head, list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(head)
            else:
                raise ValueError(f"Could not extract messages to evaluate {messages}")
            return get_buffer_string(chat_messages)
        raise ValueError(f"Could not extract messages to evaluate {messages}")

    def serialize_inputs(self, inputs: Dict) -> str:
        """Render an LLM run's inputs as a single prompt string."""
        if "prompts" in inputs:  # Should we even accept this?
            return "\n\n".join(inputs["prompts"])
        if "prompt" in inputs:
            return inputs["prompt"]
        if "messages" in inputs:
            return self.serialize_chat_messages(inputs["messages"])
        raise ValueError("LLM Run must have either messages or prompts as inputs.")

    def serialize_outputs(self, outputs: Dict) -> str:
        """Render the first generation of an LLM run as a string."""
        if not outputs.get("generations"):
            raise ValueError("Cannot evaluate LLM Run without generations.")
        generations: List[Dict] = outputs["generations"]
        if not generations:
            raise ValueError("Cannot evaluate LLM run with empty generations.")
        first_generation: Dict = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            first_generation = first_generation[0]
        if "message" in first_generation:
            return self.serialize_chat_messages([first_generation["message"]])
        return first_generation["text"]

    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run is not an LLM run, has no outputs, or its
                inputs/outputs cannot be parsed into strings.
        """
        if run.run_type != "llm":
            raise ValueError("LLM RunMapper only supports LLM runs.")
        if not run.outputs:
            if run.error:
                raise ValueError(
                    f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                )
            raise ValueError(
                f"Run {run.id} has no outputs. Cannot evaluate this run."
            )
        try:
            inputs = self.serialize_inputs(run.inputs)
        except Exception as e:
            raise ValueError(
                f"Could not parse LM input from run inputs {run.inputs}"
            ) from e
        try:
            output_ = self.serialize_outputs(run.outputs)
        except Exception as e:
            raise ValueError(
                f"Could not parse LM prediction from run outputs {run.outputs}"
            ) from e
        return {"input": inputs, "prediction": output_}
class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: Optional[str] = None
    """The key from the model Run's inputs to use as the eval input.
    If not provided, will use the only input key or raise an
    error if there are multiple."""
    prediction_key: Optional[str] = None
    """The key from the model Run's outputs to use as the eval prediction.
    If not provided, will use the only output key or raise an error
    if there are multiple."""

    def _get_key(self, source: Dict, key: Optional[str], which: str) -> str:
        """Select ``key`` from ``source``, or the sole value when no key is set."""
        if key is not None:
            return source[key]
        elif len(source) == 1:
            return next(iter(source.values()))
        else:
            raise ValueError(
                f"Could not map run {which} with multiple keys: "
                f"{source}\nPlease manually specify a {which}_key"
            )

    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run has no outputs, or the configured
                input_key/prediction_key is absent from the run.
        """
        if not run.outputs:
            raise ValueError(
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
        if self.input_key is not None and self.input_key not in run.inputs:
            raise ValueError(
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
                f" input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
        elif self.prediction_key is not None and self.prediction_key not in run.outputs:
            available_keys = ", ".join(run.outputs.keys())
            raise ValueError(
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                # Fixed: original message read "ensure the Run object's outputs
                # the expected key" (missing verb).
                " ensure the Run object's outputs include the expected key."
            )
        else:
            input_ = self._get_key(run.inputs, self.input_key, "input")
            prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
            return {
                "input": input_,
                "prediction": prediction,
            }
class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""

    def map(self, run: Run) -> Dict[str, str]:
        """Map a tool run's "input"/"output" fields to the eval dict."""
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}
class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: Optional[str] = None
    """The key in the example outputs to use as the reference.
    If not provided, the sole output key is used; an error is raised when
    there are multiple outputs."""

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["reference"]

    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

    def map(self, example: Example) -> Dict[str, str]:
        """Maps the Example, or dataset row, to a dictionary.

        Raises:
            ValueError: If the example has no outputs, has multiple outputs
                without a reference_key, or is missing the reference_key.
        """
        if not example.outputs:
            raise ValueError(
                f"Example {example.id} has no outputs to use as a reference."
            )
        if self.reference_key is None:
            if len(example.outputs) > 1:
                raise ValueError(
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
            else:
                output = list(example.outputs.values())[0]
        elif self.reference_key not in example.outputs:
            raise ValueError(
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
        else:
            output = example.outputs[self.reference_key]
        # Serialize the output as chat text when it looks like a serialized
        # message dict (has both "type" and "data"); pass it through otherwise.
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output
        }

    def __call__(self, example: Example) -> Dict[str, str]:
        """Maps the Run and Example to a dictionary."""
        if not example.outputs:
            raise ValueError(
                # Fixed typo: was "use as areference label."
                f"Example {example.id} has no outputs to use as a reference label."
            )
        return self.map(example)
class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: Optional[StringExampleMapper] = None
    """Maps the Example (dataset row) to a dictionary with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    def input_keys(self) -> List[str]:
        return ["run", "example"]

    @property
    def output_keys(self) -> List[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Build the kwargs for the string evaluator from a run (+ example)."""
        run: Run = inputs["run"]
        example: Optional[Example] = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            raise ValueError(
                # Fixed grammar: was "requires an reference".
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
        return evaluate_strings_inputs

    def _prepare_output(self, output: Dict[str, Any]) -> Dict[str, Any]:
        """Wrap the evaluator's raw output dict in an EvaluationResult."""
        evaluation_result = EvaluationResult(
            key=self.name, comment=output.get("reasoning"), **output
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: Dict[str, Any]) -> EvaluationResult:
        """Attach run info to the feedback before returning it."""
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            # Best-effort: surface evaluation failures as feedback rather
            # than crashing the evaluation loop.
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )

    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        try:
            result = await self.acall(
                {"run": run, "example": example}, include_run_info=True
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            # Best-effort: mirror evaluate_run's error handling.
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )

    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
        reference_key: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> StringRunEvaluatorChain:
        """Create a StringRunEvaluatorChain from an evaluator and run/dataset types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain
        from an evaluator plus information about the run type and dataset type.
        LLM and chain runs are supported.

        Args:
            evaluator (StringEvaluator): The string evaluator to use.
            run_type (str): The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type (DataType): The type of dataset used in the run.
            input_key (str, optional): The key used to map the input from the run.
            prediction_key (str, optional): The key used to map the prediction
                from the run.
            reference_key (str, optional): The key used to map the reference from
                the dataset.
            tags (List[str], optional): List of tags to attach to the
                evaluation chain.

        Returns:
            StringRunEvaluatorChain: The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported, or if the evaluator
                requires a reference from the dataset but the reference key
                is not provided.
        """  # noqa: E501
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key, prediction_key=prediction_key
            )
        else:
            raise ValueError(
                f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            )

        # Configure how example rows are fed as a reference string to the evaluator
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        elif evaluator.requires_reference:
            # NOTE(review): this branch is unreachable — `evaluator.requires_reference`
            # is already part of the condition above, so a reference-requiring
            # evaluator always gets a mapper (with reference_key possibly None)
            # and this ValueError never fires. Kept for backward compatibility;
            # confirm intended behavior before removing.
            raise ValueError(
                f"Evaluator {evaluator.evaluation_name} requires a reference"
                " example from the dataset. Please specify the reference key from"
                " amongst the dataset outputs keys."
            )
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )