Bases: BaseEvaluator
Embedding similarity evaluator.
Evaluates the quality of a question answering system by comparing the similarity between embeddings of the generated answer and the reference answer.
Inspired by this paper:
- Semantic Answer Similarity for Evaluating Question Answering Models
  https://arxiv.org/pdf/2108.06130.pdf
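For orientation, a minimal usage sketch follows. It assumes `SemanticSimilarityEvaluator` is importable from `llama_index.core.evaluation` (consistent with the source path below) and that a default embedding model is available via `Settings` (e.g. an OpenAI API key is configured); the sample texts are illustrative.

```python
import asyncio

from llama_index.core.evaluation import SemanticSimilarityEvaluator


async def main() -> None:
    # similarity_threshold defaults to 0.8; no embed_model is passed here,
    # so the evaluator falls back to the model configured in Settings.
    evaluator = SemanticSimilarityEvaluator()
    result = await evaluator.aevaluate(
        response="The Eiffel Tower is located in Paris, France.",
        reference="The Eiffel Tower stands in Paris.",
    )
    print(result.score, result.passing)


asyncio.run(main())
```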
Source code in llama_index/core/evaluation/semantic_similarity.py
```python
class SemanticSimilarityEvaluator(BaseEvaluator):
    """Embedding similarity evaluator.

    Evaluates the quality of a question answering system by comparing
    the similarity between embeddings of the generated answer and the
    reference answer.

    Inspired by this paper:
    - Semantic Answer Similarity for Evaluating Question Answering Models
      https://arxiv.org/pdf/2108.06130.pdf

    Args:
        service_context (Optional[ServiceContext]): Service context.
        similarity_threshold (float): Embedding similarity threshold for
            "passing". Defaults to 0.8.
    """

    def __init__(
        self,
        embed_model: Optional[BaseEmbedding] = None,
        similarity_fn: Optional[Callable[..., float]] = None,
        similarity_mode: Optional[SimilarityMode] = None,
        similarity_threshold: float = 0.8,
        # deprecated
        service_context: Optional[ServiceContext] = None,
    ) -> None:
        self._embed_model = embed_model or embed_model_from_settings_or_context(
            Settings, service_context
        )

        if similarity_fn is None:
            similarity_mode = similarity_mode or SimilarityMode.DEFAULT
            self._similarity_fn = lambda x, y: similarity(x, y, mode=similarity_mode)
        else:
            if similarity_mode is not None:
                raise ValueError(
                    "Cannot specify both similarity_fn and similarity_mode"
                )
            self._similarity_fn = similarity_fn

        self._similarity_threshold = similarity_threshold

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {}

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        del query, contexts, kwargs  # Unused

        if response is None or reference is None:
            raise ValueError("Must specify both response and reference")

        response_embedding = await self._embed_model.aget_text_embedding(response)
        reference_embedding = await self._embed_model.aget_text_embedding(reference)

        similarity_score = self._similarity_fn(response_embedding, reference_embedding)
        passing = similarity_score >= self._similarity_threshold
        return EvaluationResult(
            score=similarity_score,
            passing=passing,
            feedback=f"Similarity score: {similarity_score}",
        )
```
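To override the default similarity measure, a custom `similarity_fn` can be passed instead of a `similarity_mode` (the constructor raises `ValueError` if both are given). A hedged sketch; the dot-product function below is an illustrative assumption, not part of the library:

```python
import numpy as np

from llama_index.core.evaluation import SemanticSimilarityEvaluator


def dot_product(x: list, y: list) -> float:
    # Receives the two embedding vectors that aevaluate() computes for
    # response and reference, and must return a float score.
    return float(np.dot(x, y))


evaluator = SemanticSimilarityEvaluator(
    similarity_fn=dot_product,
    similarity_threshold=0.9,  # "passing" requires score >= 0.9
)
```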