
Evaluator benchmarker

EvaluatorBenchmarkerPack

Bases: BaseLlamaPack

A pack for benchmarking/evaluating your own evaluator.

Parameters:

Name          Type                                                          Description                                                 Default
evaluator     BaseEvaluator                                                 The evaluator to benchmark/evaluate.                        required
eval_dataset  LabelledEvaluatorDataset | LabelledPairwiseEvaluatorDataset   The labelled evaluation dataset to run the benchmark on.    required
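
As a quick orientation, here is a minimal usage sketch. It assumes the llama-index-packs-evaluator-benchmarker and llama-index-llms-openai packages are installed; the dataset path and judge model are placeholders, and PairwiseComparisonEvaluator is just one evaluator you might benchmark. `arun` must be awaited from an async context (e.g. a notebook):

from llama_index.core.evaluation import PairwiseComparisonEvaluator
from llama_index.core.llama_dataset import LabelledPairwiseEvaluatorDataset
from llama_index.llms.openai import OpenAI
from llama_index.packs.evaluator_benchmarker import EvaluatorBenchmarkerPack

# Placeholder dataset path and judge model -- substitute your own.
eval_dataset = LabelledPairwiseEvaluatorDataset.from_json("./pairwise_eval_dataset.json")
evaluator = PairwiseComparisonEvaluator(llm=OpenAI(model="gpt-4"))

pack = EvaluatorBenchmarkerPack(
    evaluator=evaluator,
    eval_dataset=eval_dataset,
    show_progress=True,
)

# In an async context: makes predictions, computes metrics,
# writes benchmark.csv, and returns a one-row DataFrame of results.
benchmark_df = await pack.arun(batch_size=5, sleep_time_in_seconds=1)
print(benchmark_df)
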
Source code in llama_index/packs/evaluator_benchmarker/base.py
class EvaluatorBenchmarkerPack(BaseLlamaPack):
    """一个用于对自己的评估器进行基准测试/评估的包。

    Args:
        evaluator (BaseEvaluator): 要评估/基准测试的评估器。
        eval_dataset (LabelledEvaluatorDataset | LabelledPairwiseEvaluatorDataset): 用于运行基准测试的标记评估数据集。"""

    def __init__(
        self,
        evaluator: BaseEvaluator,
        eval_dataset: Union[LabelledEvaluatorDataset, LabelledPairwiseEvaluatorDataset],
        show_progress: bool = True,
    ):
        self.evaluator = evaluator
        self.eval_dataset = eval_dataset
        self._num_examples = len(self.eval_dataset.examples)
        self.show_progress = show_progress
        self.prediction_dataset = None

    async def _amake_predictions(
        self,
        batch_size: int = 20,
        sleep_time_in_seconds: int = 1,
    ):
        """使用评估器进行异步预测。"""
        self.prediction_dataset: Union[
            EvaluatorPredictionDataset, PairwiseEvaluatorPredictionDataset
        ] = await self.eval_dataset.amake_predictions_with(
            predictor=self.evaluator,
            show_progress=self.show_progress,
            batch_size=batch_size,
            sleep_time_in_seconds=sleep_time_in_seconds,
        )

    def make_predictions(self, batch_size: int = 20, sleep_time_in_seconds: int = 1):
        """与评估器同步进行预测。"""
        self.prediction_dataset: Union[
            EvaluatorPredictionDataset, PairwiseEvaluatorPredictionDataset
        ] = self.eval_dataset.make_predictions_with(
            predictor=self.evaluator,
            show_progress=self.show_progress,
            batch_size=batch_size,
            sleep_time_in_seconds=sleep_time_in_seconds,
        )

    def _prepare_and_save_benchmark_results_pairwise_grading(self) -> pd.DataFrame:
        """计算成对评估的基准指标。"""
        inconclusive_counts = 0
        agreements_with_ties = 0
        agreements_without_ties = 0
        ties = 0
        invalid_counts = 0
        for example, prediction in zip(
            self.eval_dataset[:], self.prediction_dataset[:]
        ):
            if prediction.invalid_prediction:
                invalid_counts += 1
                continue

            # don't count inconclusive results
            if prediction.evaluation_source == "neither":
                inconclusive_counts += 1
                continue

            if prediction.score == 0.5 or example.reference_score == 0.5:
                ties += 1
            else:
                agreements_without_ties += int(
                    example.reference_score == prediction.score
                )
            agreements_with_ties += int(example.reference_score == prediction.score)

        agreement_rate_with_ties = agreements_with_ties / (
            len(self.prediction_dataset[:]) - inconclusive_counts - invalid_counts
        )
        agreement_rate_without_ties = agreements_without_ties / (
            len(self.prediction_dataset[:])
            - inconclusive_counts
            - ties
            - invalid_counts
        )

        df_data = {
            "number_examples": [len(self.prediction_dataset[:])],
            "invalid_predictions": [invalid_counts],
            "inconclusives": [inconclusive_counts],
            "ties": [ties],
            "agreement_rate_with_ties": [agreement_rate_with_ties],
            "agreement_rate_without_ties": [agreement_rate_without_ties],
        }
        benchmark_df = pd.DataFrame(df_data)
        benchmark_df.to_csv("benchmark.csv")
        return benchmark_df

    def _prepare_and_save_benchmark_results_single_grading(self) -> pd.DataFrame:
        """计算单个评分评估的基准指标。"""
        invalid_counts = sum([p.invalid_prediction for p in self.prediction_dataset[:]])
        np_preds = np.array([p.score for p in self.prediction_dataset[:]])
        np_refs = np.array([e.reference_score for e in self.eval_dataset[:]])
        invalid_mask = ~np.array(
            [p.invalid_prediction for p in self.prediction_dataset[:]]
        )

        # metrics
        mae = np.mean(np.abs(np_preds[invalid_mask] - np_refs[invalid_mask]))
        corr = np.corrcoef(
            np_preds[invalid_mask].astype(float), np_refs[invalid_mask].astype(float)
        )[0, 1]
        hamming = np.sum(np_preds[invalid_mask] == np_refs[invalid_mask])

        df_data = {
            "number_examples": [len(self.prediction_dataset[:])],
            "invalid_predictions": [invalid_counts],
            "correlation": [corr],
            "mae": [mae],
            "hamming": [hamming],
        }
        benchmark_df = pd.DataFrame(df_data)
        benchmark_df.to_csv("benchmark.csv")
        return benchmark_df

    def _make_evaluations(self) -> pd.DataFrame:
        """返回benchmark_df。"""
        if isinstance(self.eval_dataset, LabelledPairwiseEvaluatorDataset):
            return self._prepare_and_save_benchmark_results_pairwise_grading()
        else:
            return self._prepare_and_save_benchmark_results_single_grading()

    async def arun(self, batch_size: int = 10, sleep_time_in_seconds: int = 1):
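        """Run the benchmark: make predictions (if not already made), then compute metrics and write benchmark.csv."""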
        if batch_size > 10:
            warnings.warn(
                "You've set a large batch_size (>10). If using OpenAI GPT-4 as "
                " `judge_llm` (which is the default judge_llm),"
                " you may experience a RateLimitError. Previous successful eval "
                " responses are cached per batch. So hitting a RateLimitError"
                " would mean you'd lose all of the current batches successful "
                " GPT-4 calls."
            )

        # make predictions
        if self.prediction_dataset is None:
            await self._amake_predictions(batch_size, sleep_time_in_seconds)

        # produce metrics
        return self._make_evaluations()
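
To make the pairwise agreement metrics concrete, here is a standalone sketch of the same arithmetic on a hand-made set of reference and predicted scores. The values are invented for illustration, and the pack additionally excludes invalid and inconclusive predictions from the denominators before dividing:

# Toy pairwise scores: 1.0 = candidate A preferred, 0.0 = B preferred, 0.5 = tie.
reference = [1.0, 0.0, 0.5, 1.0, 0.0]
predicted = [1.0, 0.0, 0.5, 0.0, 0.0]

ties = sum(1 for r, p in zip(reference, predicted) if r == 0.5 or p == 0.5)
agreements_with_ties = sum(1 for r, p in zip(reference, predicted) if r == p)
agreements_without_ties = sum(
    1 for r, p in zip(reference, predicted) if r == p and r != 0.5 and p != 0.5
)

agreement_rate_with_ties = agreements_with_ties / len(reference)                  # 4 / 5 = 0.80
agreement_rate_without_ties = agreements_without_ties / (len(reference) - ties)   # 3 / 4 = 0.75
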

make_predictions

make_predictions(
    batch_size: int = 20, sleep_time_in_seconds: int = 1
)

Synchronously make predictions with the evaluator.

Source code in llama_index/packs/evaluator_benchmarker/base.py
def make_predictions(self, batch_size: int = 20, sleep_time_in_seconds: int = 1):
    """与评估器同步进行预测。"""
    self.prediction_dataset: Union[
        EvaluatorPredictionDataset, PairwiseEvaluatorPredictionDataset
    ] = self.eval_dataset.make_predictions_with(
        predictor=self.evaluator,
        show_progress=self.show_progress,
        batch_size=batch_size,
        sleep_time_in_seconds=sleep_time_in_seconds,
    )
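
If you prefer to drive the pack synchronously, a sketch continuing the hypothetical `pack` instance from the earlier example (`_make_evaluations` is the private helper that `arun` calls to build the benchmark DataFrame):

# Populate pack.prediction_dataset in place, without an event loop.
pack.make_predictions(batch_size=5, sleep_time_in_seconds=2)

# Turn the predictions into the one-row benchmark DataFrame and write benchmark.csv.
benchmark_df = pack._make_evaluations()
print(benchmark_df)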