"""用于评估ReAct风格代理的链条。
通过推理代理所采取的行动序列及其结果来评估ReAct风格代理。它使用语言模型链(LLMChain)来生成推理和分数。
"""
import re
from typing import (
Any,
Dict,
List,
Optional,
Sequence,
Tuple,
TypedDict,
Union,
cast,
)
from langchain_core.agents import AgentAction
from langchain_core.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain_core.exceptions import OutputParserException
from langchain_core.language_models import BaseLanguageModel
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import Extra, Field
from langchain_core.tools import BaseTool
from langchain.chains.llm import LLMChain
from langchain.evaluation.agents.trajectory_eval_prompt import (
EVAL_CHAT_PROMPT,
TOOL_FREE_EVAL_CHAT_PROMPT,
)
from langchain.evaluation.schema import AgentTrajectoryEvaluator, LLMEvalChain
class TrajectoryEval(TypedDict):
    """A TypedDict containing the score and reasoning for a trajectory."""

    score: float
    """The score for the trajectory, normalized from 0 to 1."""
    reasoning: str
    """The reasoning for the score."""
class TrajectoryOutputParser(BaseOutputParser):
    """Trajectory output parser."""
@property
def _type(self) -> str:
return "agent_trajectory"
    def parse(self, text: str) -> TrajectoryEval:
        """Parse the output text and extract the score and reasoning.

        Args:
            text (str): The output text to parse.

        Returns:
            TrajectoryEval: A dictionary containing the normalized score and
                the reasoning.

        Raises:
            OutputParserException: If the score is not found in the output
                text, or if the score from the LLM is not a digit in the
                range 1-5.
        """
if "Score:" not in text:
raise OutputParserException(
f"Could not find score in model eval output: {text}"
)
reasoning, score_str = text.split("Score: ", maxsplit=1)
reasoning, score_str = reasoning.strip(), score_str.strip()
# Use regex to extract the score.
# This will get the number in the string, even if it is a float or more than 10.
# E.g. "Score: 1" will return 1, "Score: 3.5" will return 3.5, and
# "Score: 10" will return 10.
# The score should be an integer digit in the range 1-5.
_score = re.search(r"(\d+(\.\d+)?)", score_str)
# If the score is not found or is a float, raise an exception.
if _score is None or "." in _score.group(1):
raise OutputParserException(
f"Score is not an integer digit in the range 1-5: {text}"
)
score = int(_score.group(1))
# If the score is not in the range 1-5, raise an exception.
if not 1 <= score <= 5:
raise OutputParserException(
f"Score is not a digit in the range 1-5: {text}"
)
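        # Map the 1-5 integer score onto [0, 1]: 1 -> 0.0, 3 -> 0.5, 5 -> 1.0.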
normalized_score = (score - 1) / 4
return TrajectoryEval(score=normalized_score, reasoning=reasoning)
class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
    """A chain for evaluating ReAct style agents.

    This chain is used to evaluate ReAct style agents by reasoning about
    the sequence of actions taken and their outcomes.

    Example:
.. code-block:: python
from langchain.agents import AgentType, initialize_agent
from langchain_community.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"对地理问题的非常有帮助的答案。\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"]) # noqa: T201
# 0
""" # noqa: E501
    agent_tools: Optional[List[BaseTool]] = None
    """A list of tools available to the agent."""
    eval_chain: LLMChain
    """The language model chain used for the evaluation."""
    output_parser: TrajectoryOutputParser = Field(
        default_factory=TrajectoryOutputParser
    )
    """The output parser used to parse the output."""
    return_reasoning: bool = False  # :meta private:
    """DEPRECATED. Reasoning is always returned."""
    class Config:
        """Configuration for the TrajectoryEvalChain."""

        extra = Extra.ignore
@property
def requires_reference(self) -> bool:
"""评估器是否需要参考标签。"""
return False
@property
def _tools_description(self) -> str:
"""获取代理工具的描述。
返回:
str:代理工具的描述。
"""
if self.agent_tools is None:
return ""
return "\n\n".join(
[
f"""Tool {i}: {tool.name}
Description: {tool.description}"""
for i, tool in enumerate(self.agent_tools, 1)
]
)
    @staticmethod
def get_agent_trajectory(
steps: Union[str, Sequence[Tuple[AgentAction, str]]],
) -> str:
"""获取代理器的轨迹,返回格式化后的字符串。
参数:
steps(Union[str, List[Tuple[AgentAction, str]]]):代理器的轨迹。
返回:
str:格式化后的代理器轨迹。
"""
if isinstance(steps, str):
return steps
return "\n\n".join(
[
f"""Step {i}:
Tool used: {action.tool}
Tool input: {action.tool_input}
Tool output: {output}"""
for i, (action, output) in enumerate(steps, 1)
]
)
@staticmethod
def _format_reference(reference: Optional[str]) -> str:
"""格式化参考文本。
参数:
reference (str): 参考文本。
返回:
str: 格式化后的参考文本。
"""
if not reference:
return ""
return f"""
The following is the expected answer. Use this to measure correctness:
[GROUND_TRUTH]
{reference}
[END_GROUND_TRUTH]
"""
    @classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
agent_tools: Optional[Sequence[BaseTool]] = None,
output_parser: Optional[TrajectoryOutputParser] = None,
**kwargs: Any,
) -> "TrajectoryEvalChain":
"""从语言模型链中创建一个TrajectoryEvalChain对象。
参数:
llm(BaseChatModel):语言模型链。
agent_tools(可选[BaseTool]序列):代理可用的工具列表。
output_parser(可选[TrajectoryOutputParser]):用于将链输出解析为分数的输出解析器。
返回:
TrajectoryEvalChain:TrajectoryEvalChain对象。
"""
if not isinstance(llm, BaseChatModel):
raise NotImplementedError(
"Only chat models supported by the current trajectory eval"
)
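        # The tool-aware prompt embeds the tool descriptions; fall back to the
        # tool-free prompt variant when no tools are provided.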
if agent_tools:
prompt = EVAL_CHAT_PROMPT
else:
prompt = TOOL_FREE_EVAL_CHAT_PROMPT
eval_chain = LLMChain(llm=llm, prompt=prompt)
return cls(
agent_tools=agent_tools, # type: ignore[arg-type]
eval_chain=eval_chain,
output_parser=output_parser or TrajectoryOutputParser(),
**kwargs,
)
@property
def input_keys(self) -> List[str]:
"""获取链的输入键。
返回:
List[str]: 输入键。
"""
return ["question", "agent_trajectory", "answer", "reference"]
@property
def output_keys(self) -> List[str]:
"""获取链的输出键。
返回:
List[str]:输出键。
"""
return ["score", "reasoning"]
def _call(
self,
inputs: Dict[str, str],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""运行链并生成输出。
参数:
inputs (Dict[str, str]): 链的输入数值。
run_manager (Optional[CallbackManagerForChainRun]): 链运行的回调管理器。
返回:
Dict[str, Any]: 链的输出数值。
"""
chain_input = {**inputs}
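        # The tool-aware prompt expects a "tool_descriptions" input variable;
        # inject it only when agent tools were provided.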
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
raw_output = self.eval_chain.run(
chain_input, callbacks=_run_manager.get_child()
)
return cast(dict, self.output_parser.parse(raw_output))
async def _acall(
self,
inputs: Dict[str, str],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""运行链并生成输出。
参数:
inputs (Dict[str, str]): 链的输入数值。
run_manager (Optional[CallbackManagerForChainRun]): 链运行的回调管理器。
返回:
Dict[str, Any]: 链的输出数值。
"""
chain_input = {**inputs}
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
raw_output = await self.eval_chain.arun(
chain_input, callbacks=_run_manager.get_child()
)
return cast(dict, self.output_parser.parse(raw_output))
def _evaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""评估一个轨迹。
参数:
prediction (str): 最终预测的响应。
input (str): 代理的输入。
agent_trajectory (List[Tuple[AgentAction, str]]):
形成代理轨迹的中间步骤。
reference (Optional[str]): 参考答案。
callbacks (Callbacks): 用于此链式运行的回调。
返回:
dict: 评估结果,包括分数和可选的达到该分数的推理。
"""
inputs = {
"question": input,
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
"answer": prediction,
"reference": reference,
}
return self.__call__(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
async def _aevaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""异步评估一个轨迹。
参数:
prediction (str): 最终预测的响应。
input (str): 代理的输入。
agent_trajectory (List[Tuple[AgentAction, str]]):
形成代理轨迹的中间步骤。
reference (Optional[str]): 参考答案。
callbacks (Callbacks): 用于此链式运行的回调函数。
返回:
dict: 评估结果,包括分数和达到该分数的推理(可选)。
"""
inputs = {
"question": input,
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
"answer": prediction,
"reference": reference,
}
return await self.acall(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)