
Tonic Validate

AnswerConsistencyBinaryEvaluator #

Bases: BaseEvaluator

Tonic Validate's answer consistency binary metric.

The output score is a float that is either 0.0 or 1.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)
Source code in llama_index/evaluation/tonic_validate/answer_consistency_binary.py
class AnswerConsistencyBinaryEvaluator(BaseEvaluator):
    """Tonic Validate的答案一致性二元指标。

输出分数是一个浮点数,要么是0.0,要么是1.0。

更多详情请参见https://docs.tonic.ai/validate/。

Args:
    openai_service(OpenAIService): 要使用的OpenAI服务。指定要用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = AnswerConsistencyBinaryMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return
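
A minimal usage sketch, not taken from the source above: it assumes the tonic-validate package is installed, OPENAI_API_KEY is set in the environment, and that the evaluator is re-exported from llama_index.evaluation.tonic_validate (the import path mirrors the source path shown above). aevaluate is driven with asyncio.run, matching the async signature in the source.

import asyncio

from llama_index.evaluation.tonic_validate import AnswerConsistencyBinaryEvaluator

# With no openai_service argument, __init__ falls back to OpenAIService("gpt-4").
evaluator = AnswerConsistencyBinaryEvaluator()

result = asyncio.run(
    evaluator.aevaluate(
        query="What is the capital of France?",
        response="The capital of France is Paris.",
        contexts=["Paris is the capital and largest city of France."],
    )
)

print(result.score)  # either 0.0 or 1.0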

AnswerConsistencyEvaluator #

Bases: BaseEvaluator

Tonic Validate's answer consistency metric.

The output score is a float between 0.0 and 1.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)

Source code in llama_index/evaluation/tonic_validate/answer_consistency.py
class AnswerConsistencyEvaluator(BaseEvaluator):
    """Tonic Validate的答案一致性指标。

输出分数是一个介于0.0和1.0之间的浮点数。

更多详情请参见https://docs.tonic.ai/validate/。

Args:
openai_service(OpenAIService):要使用的OpenAI服务。指定用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = AnswerConsistencyMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return
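
A sketch of passing an explicit openai_service instead of relying on the default. The tonic_validate import path for OpenAIService and the model name are assumptions; the constructor call itself mirrors OpenAIService("gpt-4") in __init__ above.

import asyncio

# The exact import path for OpenAIService is an assumption about tonic_validate's layout.
from tonic_validate.services.openai_service import OpenAIService

from llama_index.evaluation.tonic_validate import AnswerConsistencyEvaluator

# Use a different chat completion model as the LLM evaluator (model name is illustrative).
evaluator = AnswerConsistencyEvaluator(openai_service=OpenAIService("gpt-4-turbo"))

result = asyncio.run(
    evaluator.aevaluate(
        query="Who wrote The Old Man and the Sea?",
        response="Ernest Hemingway wrote it.",
        contexts=["The Old Man and the Sea is a 1952 novella by Ernest Hemingway."],
    )
)

print(result.score)  # float between 0.0 and 1.0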

AnswerSimilarityEvaluator #

Bases: BaseEvaluator

Tonic Validate's answer similarity metric.

The output score is a float between 0.0 and 5.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)
Source code in llama_index/evaluation/tonic_validate/answer_similarity.py
class AnswerSimilarityEvaluator(BaseEvaluator):
    """Tonic Validate的答案相似度度量。

输出分数是一个介于0.0和5.0之间的浮点数。

请参阅https://docs.tonic.ai/validate/了解更多详情。

Args:
    openai_service(OpenAIService): 要使用的OpenAI服务。指定要用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = AnswerSimilarityMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference_response: Optional[str] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query, answer=reference_response)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return
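
Unlike the consistency evaluators, aevaluate here also accepts a reference_response, which is placed on the BenchmarkItem as the ground-truth answer. A minimal sketch under the same assumptions as above (package installed, API key set, class re-exported from llama_index.evaluation.tonic_validate):

import asyncio

from llama_index.evaluation.tonic_validate import AnswerSimilarityEvaluator

evaluator = AnswerSimilarityEvaluator()

result = asyncio.run(
    evaluator.aevaluate(
        query="When did Apollo 11 land on the Moon?",
        response="Apollo 11 landed on the Moon in July 1969.",
        contexts=["Apollo 11 landed on the Moon on July 20, 1969."],
        reference_response="Apollo 11 landed on the Moon on July 20, 1969.",
    )
)

print(result.score)  # float between 0.0 and 5.0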

AugmentationAccuracyEvaluator #

Bases: BaseEvaluator

Tonic Validate's augmentation accuracy metric.

The output score is a float between 0.0 and 1.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)
Source code in llama_index/evaluation/tonic_validate/augmentation_accuracy.py
class AugmentationAccuracyEvaluator(BaseEvaluator):
    """Tonic Validate的增强准确度指标。

输出得分为0.0到1.0之间的浮点数。

更多详情请参阅https://docs.tonic.ai/validate/。

Args:
    openai_service(OpenAIService): 要使用的OpenAI服务。指定用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = AugmentationAccuracyMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return

AugmentationPrecisionEvaluator #

Bases: BaseEvaluator

Tonic Validate's augmentation precision metric.

The output score is a float between 0.0 and 1.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)
Source code in llama_index/evaluation/tonic_validate/augmentation_precision.py
class AugmentationPrecisionEvaluator(BaseEvaluator):
    """Tonic Validate的增强精度指标。

输出分数是一个介于0.0和1.0之间的浮点数。

请参阅https://docs.tonic.ai/validate/获取更多详细信息。

Args:
    openai_service(OpenAIService): 要使用的OpenAI服务。指定用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = AugmentationPrecisionMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return

RetrievalPrecisionEvaluator #

Bases: BaseEvaluator

Tonic Validate's retrieval precision metric.

The output score is a float between 0.0 and 1.0.

See https://docs.tonic.ai/validate/ for more details.

Parameters:

Name: openai_service
Type: OpenAIService
Description: The OpenAI service to use. Specifies the chat completion model to use as the LLM evaluator. Defaults to "gpt-4".
Default: None (an OpenAIService using "gpt-4" is created when omitted)
Source code in llama_index/evaluation/tonic_validate/retrieval_precision.py
class RetrievalPrecisionEvaluator(BaseEvaluator):
    """Tonic Validate的检索精度指标。

输出分数是一个介于0.0和1.0之间的浮点数。

更多详情请参阅https://docs.tonic.ai/validate/。

Args:
    openai_service(OpenAIService): 要使用的OpenAI服务。指定用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(self, openai_service: Optional[Any] = None):
        if openai_service is None:
            openai_service = OpenAIService("gpt-4")
        self.openai_service = openai_service
        self.metric = RetrievalPrecisionMetric()

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any
    ) -> EvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query, answer=response)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        score = self.metric.score(llm_response, self.openai_service)

        return EvaluationResult(
            query=query, contexts=contexts, response=response, score=score
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return
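
Because AnswerConsistencyBinaryEvaluator, AnswerConsistencyEvaluator, AugmentationAccuracyEvaluator, AugmentationPrecisionEvaluator, and RetrievalPrecisionEvaluator all share the same aevaluate signature, a single response can be scored against several of them in one pass. A sketch under the same import assumptions as above:

import asyncio

from llama_index.evaluation.tonic_validate import (
    AugmentationAccuracyEvaluator,
    AugmentationPrecisionEvaluator,
    RetrievalPrecisionEvaluator,
)

query = "What is the boiling point of water at sea level?"
response = "Water boils at 100 degrees Celsius at sea level."
contexts = ["At sea level, water boils at 100 °C (212 °F)."]

async def score_all() -> dict:
    evaluators = {
        "augmentation_accuracy": AugmentationAccuracyEvaluator(),
        "augmentation_precision": AugmentationPrecisionEvaluator(),
        "retrieval_precision": RetrievalPrecisionEvaluator(),
    }
    scores = {}
    for name, evaluator in evaluators.items():
        result = await evaluator.aevaluate(
            query=query, response=response, contexts=contexts
        )
        scores[name] = result.score  # each score is between 0.0 and 1.0
    return scores

print(asyncio.run(score_all()))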

TonicValidateEvaluator #

Bases: BaseEvaluator

Tonic Validate's validate scorer. Calculates all of Tonic Validate's metrics.

See https://docs.tonic.ai/validate/ for more details.

Source code in llama_index/evaluation/tonic_validate/tonic_validate_evaluator.py
class TonicValidateEvaluator(BaseEvaluator):
    """Tonic Validate的验证评分器。计算所有Tonic Validate的指标。

    有关更多详细信息,请参见https://docs.tonic.ai/validate/。

    Args:
        metrics(List[Metric]):要使用的指标。默认为Tonic Validate的所有指标。
        model_evaluator(str):要使用的OpenAI服务。指定要用作LLM评估器的聊天完成模型。默认为"gpt-4"。"""

    def __init__(
        self, metrics: Optional[List[Any]] = None, model_evaluator: str = "gpt-4"
    ):
        if metrics is None:
            metrics = [
                AnswerConsistencyMetric(),
                AnswerSimilarityMetric(),
                AugmentationAccuracyMetric(),
                AugmentationPrecisionMetric(),
                RetrievalPrecisionMetric(),
            ]

        self.metrics = metrics
        self.model_evaluator = model_evaluator
        self.validate_scorer = ValidateScorer(metrics, model_evaluator)

    def _calculate_average_score(self, run: Any) -> float:
        from tonic_validate.metrics.answer_similarity_metric import (
            AnswerSimilarityMetric,
        )

        ave_score = 0.0
        metric_cnt = 0
        for metric_name, score in run.overall_scores.items():
            if metric_name == AnswerSimilarityMetric.name:
                ave_score += score / 5
            else:
                ave_score += score
            metric_cnt += 1
        return ave_score / metric_cnt

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference_response: Optional[str] = None,
        **kwargs: Any,
    ) -> TonicValidateEvaluationResult:
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        benchmark_item = BenchmarkItem(question=query, answer=reference_response)

        llm_response = LLMResponse(
            llm_answer=response,
            llm_context_list=contexts,
            benchmark_item=benchmark_item,
        )

        responses = [llm_response]

        run = self.validate_scorer.score_run(responses)

        ave_score = self._calculate_average_score(run)

        return TonicValidateEvaluationResult(
            query=query,
            contexts=contexts,
            response=response,
            score=ave_score,
            score_dict=run.run_data[0].scores,
        )

    async def aevaluate_run(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: List[List[str]],
        reference_responses: List[str],
        **kwargs: Any,
    ) -> Any:
        """评估一批响应。

返回一个Tonic Validate Run对象,可以将其记录到Tonic Validate UI中。更多详情请参阅https://docs.tonic.ai/validate/。
"""
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        llm_responses = []

        for query, response, contexts, reference_response in zip(
            queries, responses, contexts_list, reference_responses
        ):
            benchmark_item = BenchmarkItem(question=query, answer=reference_response)

            llm_response = LLMResponse(
                llm_answer=response,
                llm_context_list=contexts,
                benchmark_item=benchmark_item,
            )

            llm_responses.append(llm_response)

        return self.validate_scorer.score_run(llm_responses)

    def evaluate_run(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: List[List[str]],
        reference_responses: List[str],
        **kwargs: Any,
    ) -> Any:
        """评估一批响应。

返回一个Tonic Validate Run对象,可以将其记录到Tonic Validate UI中。更多详情请参阅https://docs.tonic.ai/validate/。
"""
        return asyncio.run(
            self.aevaluate_run(
                queries=queries,
                responses=responses,
                contexts_list=contexts_list,
                reference_responses=reference_responses,
                **kwargs,
            )
        )

    def _get_prompts(self) -> PromptDictType:
        return {}

    def _get_prompt_modules(self) -> PromptMixinType:
        return {}

    def _update_prompts(self, prompts_dict: PromptDictType) -> None:
        return
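
A sketch of running every metric at once with TonicValidateEvaluator. The overall score is the average computed by _calculate_average_score (answer similarity is rescaled from the 0–5 range to 0–1), and the per-metric scores come back in score_dict. Same import assumptions as the earlier examples.

import asyncio

from llama_index.evaluation.tonic_validate import TonicValidateEvaluator

evaluator = TonicValidateEvaluator()  # all five metrics, "gpt-4" as the evaluator model

result = asyncio.run(
    evaluator.aevaluate(
        query="What year was the Eiffel Tower completed?",
        response="The Eiffel Tower was completed in 1889.",
        contexts=["The Eiffel Tower was completed in 1889 for the World's Fair."],
        reference_response="The Eiffel Tower was completed in 1889.",
    )
)

print(result.score)       # average of all metrics, normalized to 0.0-1.0
print(result.score_dict)  # per-metric scores keyed by metric name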

aevaluate_run async #

aevaluate_run(
    queries: List[str],
    responses: List[str],
    contexts_list: List[List[str]],
    reference_responses: List[str],
    **kwargs: Any
) -> Any

Evaluates a batch of responses.

Returns a Tonic Validate Run object, which can be logged to the Tonic Validate UI. See https://docs.tonic.ai/validate/ for more details.

Source code in llama_index/evaluation/tonic_validate/tonic_validate_evaluator.py
    async def aevaluate_run(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: List[List[str]],
        reference_responses: List[str],
        **kwargs: Any,
    ) -> Any:
        """评估一批响应。

返回一个Tonic Validate Run对象,可以将其记录到Tonic Validate UI中。更多详情请参阅https://docs.tonic.ai/validate/。
"""
        from tonic_validate.classes.benchmark import BenchmarkItem
        from tonic_validate.classes.llm_response import LLMResponse

        llm_responses = []

        for query, response, contexts, reference_response in zip(
            queries, responses, contexts_list, reference_responses
        ):
            benchmark_item = BenchmarkItem(question=query, answer=reference_response)

            llm_response = LLMResponse(
                llm_answer=response,
                llm_context_list=contexts,
                benchmark_item=benchmark_item,
            )

            llm_responses.append(llm_response)

        return self.validate_scorer.score_run(llm_responses)

evaluate_run #

evaluate_run(
    queries: List[str],
    responses: List[str],
    contexts_list: List[List[str]],
    reference_responses: List[str],
    **kwargs: Any
) -> Any

Evaluates a batch of responses.

Returns a Tonic Validate Run object, which can be logged to the Tonic Validate UI. See https://docs.tonic.ai/validate/ for more details.

Source code in llama_index/evaluation/tonic_validate/tonic_validate_evaluator.py
    def evaluate_run(
        self,
        queries: List[str],
        responses: List[str],
        contexts_list: List[List[str]],
        reference_responses: List[str],
        **kwargs: Any,
    ) -> Any:
        """评估一批响应。

返回一个Tonic Validate Run对象,可以将其记录到Tonic Validate UI中。更多详情请参阅https://docs.tonic.ai/validate/。
"""
        return asyncio.run(
            self.aevaluate_run(
                queries=queries,
                responses=responses,
                contexts_list=contexts_list,
                reference_responses=reference_responses,
                **kwargs,
            )
        )
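
A batch-scoring sketch with evaluate_run, under the same import assumptions as the earlier examples; it also assumes the returned Run object exposes overall_scores, as used in _calculate_average_score above. Since evaluate_run wraps aevaluate_run with asyncio.run, it should be called from synchronous code.

from llama_index.evaluation.tonic_validate import TonicValidateEvaluator

evaluator = TonicValidateEvaluator()

run = evaluator.evaluate_run(
    queries=["What is the capital of France?", "Who wrote Hamlet?"],
    responses=["Paris is the capital of France.", "Hamlet was written by Shakespeare."],
    contexts_list=[
        ["Paris is the capital and largest city of France."],
        ["Hamlet is a tragedy written by William Shakespeare."],
    ],
    reference_responses=["Paris.", "William Shakespeare."],
)

# The Run object aggregates scores across the batch; overall_scores is keyed by
# metric name, matching its use in _calculate_average_score above.
print(run.overall_scores)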