Source code for langchain.evaluation.agents.trajectory_eval_chain

"""用于评估ReAct风格代理的链条。

通过推理代理所采取的行动序列及其结果来评估ReAct风格代理。它使用语言模型链(LLMChain)来生成推理和分数。
"""

import re
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Sequence,
    Tuple,
    TypedDict,
    Union,
    cast,
)

from langchain_core.agents import AgentAction
from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
    Callbacks,
)
from langchain_core.exceptions import OutputParserException
from langchain_core.language_models import BaseLanguageModel
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import Extra, Field
from langchain_core.tools import BaseTool

from langchain.chains.llm import LLMChain
from langchain.evaluation.agents.trajectory_eval_prompt import (
    EVAL_CHAT_PROMPT,
    TOOL_FREE_EVAL_CHAT_PROMPT,
)
from langchain.evaluation.schema import AgentTrajectoryEvaluator, LLMEvalChain


class TrajectoryEval(TypedDict):
    """A TypedDict containing the score and reasoning for a trajectory."""

    score: float
    """The score for the trajectory, normalized from 0 to 1."""
    reasoning: str
    """The reasoning for the score."""

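# Illustrative sketch (not part of the original module): a TrajectoryEval is a
# plain dict at runtime, so it can be constructed and read like one.
#
# >>> result = TrajectoryEval(score=0.75, reasoning="The agent used the right tool.")
# >>> result["score"]
# 0.75
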
class TrajectoryOutputParser(BaseOutputParser):
    """Trajectory output parser."""

    @property
    def _type(self) -> str:
        return "agent_trajectory"

    def parse(self, text: str) -> TrajectoryEval:
        """Parse the output text and extract the score and reasoning.

        Args:
            text (str): The output text to parse.

        Returns:
            TrajectoryEval: A TypedDict containing the normalized score
                and reasoning.

        Raises:
            OutputParserException: If the score is not found in the output
                text, or if the LLM's score is not a digit in the range 1-5.
        """
        if "Score:" not in text:
            raise OutputParserException(
                f"Could not find score in model eval output: {text}"
            )

        reasoning, score_str = text.split("Score: ", maxsplit=1)

        reasoning, score_str = reasoning.strip(), score_str.strip()

        # Use regex to extract the score.
        # This will get the number in the string, even if it is a float
        # or more than 10. E.g. "Score: 1" will return 1, "Score: 3.5"
        # will return 3.5, and "Score: 10" will return 10.
        # The score should be an integer digit in the range 1-5.
        _score = re.search(r"(\d+(\.\d+)?)", score_str)
        # If the score is not found or is a float, raise an exception.
        if _score is None or "." in _score.group(1):
            raise OutputParserException(
                f"Score is not an integer digit in the range 1-5: {text}"
            )
        score = int(_score.group(1))
        # If the score is not in the range 1-5, raise an exception.
        if not 1 <= score <= 5:
            raise OutputParserException(
                f"Score is not a digit in the range 1-5: {text}"
            )
        normalized_score = (score - 1) / 4
        return TrajectoryEval(score=normalized_score, reasoning=reasoning)

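# Illustrative sketch (not part of the original module): how parse() maps a
# hypothetical model response onto a normalized score via (score - 1) / 4.
#
# >>> parser = TrajectoryOutputParser()
# >>> out = parser.parse("The agent used the correct tool.\nScore: 4")
# >>> out["reasoning"]
# 'The agent used the correct tool.'
# >>> out["score"]  # (4 - 1) / 4
# 0.75
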
class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
    """A chain for evaluating ReAct style agents.

    This chain is used to evaluate ReAct style agents by reasoning about
    the sequence of actions taken and their outcomes.

    Example:
        .. code-block:: python

            from langchain.agents import AgentType, initialize_agent
            from langchain_community.chat_models import ChatOpenAI
            from langchain.evaluation import TrajectoryEvalChain
            from langchain.tools import tool

            @tool
            def geography_answers(country: str, question: str) -> str:
                \"\"\"Very helpful answers to geography questions.\"\"\"
                return f"{country}? IDK - We may never know {question}."

            llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
            agent = initialize_agent(
                tools=[geography_answers],
                llm=llm,
                agent=AgentType.OPENAI_FUNCTIONS,
                return_intermediate_steps=True,
            )

            question = "How many dwell in the largest minor region in Argentina?"
            response = agent(question)

            eval_chain = TrajectoryEvalChain.from_llm(
                llm=llm, agent_tools=[geography_answers], return_reasoning=True
            )

            result = eval_chain.evaluate_agent_trajectory(
                input=question,
                agent_trajectory=response["intermediate_steps"],
                prediction=response["output"],
                reference="Paris",
            )
            print(result["score"])  # noqa: T201
            # 0
    """  # noqa: E501

    agent_tools: Optional[List[BaseTool]] = None
    """A list of tools available to the agent."""
    eval_chain: LLMChain
    """The language model chain used for evaluation."""
    output_parser: TrajectoryOutputParser = Field(
        default_factory=TrajectoryOutputParser
    )
    """The output parser used to parse the output."""
    return_reasoning: bool = False  # :meta private:
    """DEPRECATED. Reasoning is always returned."""

    class Config:
        """Configuration for the TrajectoryEvalChain."""

        extra = Extra.ignore

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @property
    def _tools_description(self) -> str:
        """Get the description of the agent tools.

        Returns:
            str: The description of the agent tools.
        """
        if self.agent_tools is None:
            return ""
        return "\n\n".join(
            [
                f"""Tool {i}: {tool.name}
Description: {tool.description}"""
                for i, tool in enumerate(self.agent_tools, 1)
            ]
        )

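    # Illustrative sketch (not part of the original module): for two
    # hypothetical tools named "search" and "calculator", _tools_description
    # renders a numbered, blank-line-separated list such as:
    #
    #   Tool 1: search
    #   Description: Search the web for a query.
    #
    #   Tool 2: calculator
    #   Description: Evaluate arithmetic expressions.
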
    @staticmethod
    def get_agent_trajectory(
        steps: Union[str, Sequence[Tuple[AgentAction, str]]],
    ) -> str:
        """Get the agent trajectory as a formatted string.

        Args:
            steps (Union[str, List[Tuple[AgentAction, str]]]): The agent
                trajectory.

        Returns:
            str: The formatted agent trajectory.
        """
        if isinstance(steps, str):
            return steps

        return "\n\n".join(
            [
                f"""Step {i}:
Tool used: {action.tool}
Tool input: {action.tool_input}
Tool output: {output}"""
                for i, (action, output) in enumerate(steps, 1)
            ]
        )

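    # Illustrative sketch (not part of the original module): formatting a
    # single hypothetical (AgentAction, observation) step.
    #
    # >>> from langchain_core.agents import AgentAction
    # >>> step = (AgentAction(tool="search", tool_input="capital of France", log=""), "Paris")
    # >>> print(TrajectoryEvalChain.get_agent_trajectory([step]))
    # Step 1:
    # Tool used: search
    # Tool input: capital of France
    # Tool output: Paris
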
    @staticmethod
    def _format_reference(reference: Optional[str]) -> str:
        """Format the reference text.

        Args:
            reference (str): The reference text.

        Returns:
            str: The formatted reference text.
        """
        if not reference:
            return ""
        return f"""

The following is the expected answer. Use this to measure correctness:
[GROUND_TRUTH]
{reference}
[END_GROUND_TRUTH]
"""

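    # Illustrative sketch (not part of the original module): a falsy reference
    # yields an empty string; otherwise the text is wrapped in ground-truth
    # markers for the eval prompt, e.g. for reference="Paris":
    #
    #   The following is the expected answer. Use this to measure correctness:
    #   [GROUND_TRUTH]
    #   Paris
    #   [END_GROUND_TRUTH]
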
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        agent_tools: Optional[Sequence[BaseTool]] = None,
        output_parser: Optional[TrajectoryOutputParser] = None,
        **kwargs: Any,
    ) -> "TrajectoryEvalChain":
        """Create a TrajectoryEvalChain object from a language model chain.

        Args:
            llm (BaseChatModel): The language model chain.
            agent_tools (Optional[Sequence[BaseTool]]): A list of tools
                available to the agent.
            output_parser (Optional[TrajectoryOutputParser]): The output parser
                used to parse the chain output into a score.

        Returns:
            TrajectoryEvalChain: The TrajectoryEvalChain object.
        """
        if not isinstance(llm, BaseChatModel):
            raise NotImplementedError(
                "Only chat models supported by the current trajectory eval"
            )
        if agent_tools:
            prompt = EVAL_CHAT_PROMPT
        else:
            prompt = TOOL_FREE_EVAL_CHAT_PROMPT
        eval_chain = LLMChain(llm=llm, prompt=prompt)
        return cls(
            agent_tools=agent_tools,  # type: ignore[arg-type]
            eval_chain=eval_chain,
            output_parser=output_parser or TrajectoryOutputParser(),
            **kwargs,
        )

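    # Illustrative sketch (not part of the original module), assuming
    # langchain_community's ChatOpenAI is available: the tool-aware prompt is
    # selected only when agent_tools is provided, otherwise the tool-free one.
    #
    # >>> from langchain_community.chat_models import ChatOpenAI
    # >>> llm = ChatOpenAI(model="gpt-4", temperature=0)
    # >>> eval_chain = TrajectoryEvalChain.from_llm(llm=llm)  # tool-free prompt
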
    @property
    def input_keys(self) -> List[str]:
        """Get the input keys of the chain.

        Returns:
            List[str]: The input keys.
        """
        return ["question", "agent_trajectory", "answer", "reference"]

    @property
    def output_keys(self) -> List[str]:
        """Get the output keys of the chain.

        Returns:
            List[str]: The output keys.
        """
        return ["score", "reasoning"]

    def prep_inputs(self, inputs: Union[Dict[str, Any], Any]) -> Dict[str, str]:
        """Validate and prep inputs."""
        # Default the reference to an empty string when the caller did not
        # supply one, so the eval prompt renders without a ground-truth block.
        if "reference" not in inputs:
            inputs["reference"] = self._format_reference(inputs.get("reference"))
        return super().prep_inputs(inputs)

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the chain and generate the output.

        Args:
            inputs (Dict[str, str]): The input values for the chain.
            run_manager (Optional[CallbackManagerForChainRun]): The callback
                manager for the chain run.

        Returns:
            Dict[str, Any]: The output values of the chain.
        """
        chain_input = {**inputs}
        if self.agent_tools:
            chain_input["tool_descriptions"] = self._tools_description
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        raw_output = self.eval_chain.run(
            chain_input, callbacks=_run_manager.get_child()
        )
        return cast(dict, self.output_parser.parse(raw_output))

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the chain and generate the output.

        Args:
            inputs (Dict[str, str]): The input values for the chain.
            run_manager (Optional[AsyncCallbackManagerForChainRun]): The
                callback manager for the chain run.

        Returns:
            Dict[str, Any]: The output values of the chain.
        """
        chain_input = {**inputs}
        if self.agent_tools:
            chain_input["tool_descriptions"] = self._tools_description
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        raw_output = await self.eval_chain.arun(
            chain_input, callbacks=_run_manager.get_child()
        )
        return cast(dict, self.output_parser.parse(raw_output))

    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            input (str): The input to the agent.
            agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate
                steps forming the agent trajectory.
            reference (Optional[str]): The reference answer.
            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result, which includes the score and,
                optionally, the reasoning for reaching that score.
        """
        inputs = {
            "question": input,
            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
            "answer": prediction,
            "reference": reference,
        }
        return self.__call__(
            inputs=inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
            return_only_outputs=True,
        )

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            input (str): The input to the agent.
            agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate
                steps forming the agent trajectory.
            reference (Optional[str]): The reference answer.
            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result, which includes the score and,
                optionally, the reasoning for reaching that score.
        """
        inputs = {
            "question": input,
            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
            "answer": prediction,
            "reference": reference,
        }
        return await self.acall(
            inputs=inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
            return_only_outputs=True,
        )
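
# Illustrative sketch (not part of the original module): the inputs dict that
# _evaluate_agent_trajectory assembles before invoking the chain. The keys
# match input_keys above; the values here are hypothetical.
#
# {
#     "question": "How many dwell in the largest minor region in Argentina?",
#     "agent_trajectory": "Step 1:\nTool used: ...",
#     "answer": "We may never know.",
#     "reference": None,
# }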