Dataset generation

Evaluation modules.

DatasetGenerator #

Bases: PromptMixin

Generate a dataset (questions, or question-answer pairs) from the given documents.

NOTE: this is a beta feature, subject to change!
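A minimal usage sketch (the document text and parameters are illustrative; assumes an LLM is configured, e.g. on `Settings.llm`):

from llama_index.core import Document
from llama_index.core.evaluation import DatasetGenerator

# hypothetical toy corpus; replace with your own documents
docs = [Document(text="LlamaIndex is a data framework for LLM applications.")]

# build the generator and produce up to 5 questions per 512-word chunk
generator = DatasetGenerator.from_documents(docs, num_questions_per_chunk=5)
questions = generator.generate_questions_from_nodes(num=5)
print(questions)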

Source code in llama_index/core/evaluation/dataset_generation.py
@deprecated(
    "Deprecated in favor of `RagDatasetGenerator` which should be used instead.",
    action="always",
)
class DatasetGenerator(PromptMixin):
    """根据给定的文档生成数据集(问题/问题-答案对)。

注意:这是一个测试版功能,可能会有变化!

Args:
    nodes(List[Node]):节点列表。(可选)
    llm(LLM):语言模型。
    callback_manager(CallbackManager):回调管理器。
    num_questions_per_chunk:每个块生成的问题数量。每个文档被分块为512个单词大小的块。
    text_question_template:问题生成模板。
    question_gen_query:问题生成查询。
"""

    def __init__(
        self,
        nodes: List[BaseNode],
        llm: Optional[LLM] = None,
        callback_manager: Optional[CallbackManager] = None,
        num_questions_per_chunk: int = 10,
        text_question_template: BasePromptTemplate | None = None,
        text_qa_template: BasePromptTemplate | None = None,
        question_gen_query: str | None = None,
        metadata_mode: MetadataMode = MetadataMode.NONE,
        show_progress: bool = False,
        # deprecated
        service_context: ServiceContext | None = None,
    ) -> None:
        """初始化参数。"""
        self.llm = llm or llm_from_settings_or_context(Settings, service_context)
        self.callback_manager = (
            callback_manager
            or callback_manager_from_settings_or_context(Settings, service_context)
        )
        self.text_question_template = text_question_template or PromptTemplate(
            DEFAULT_QUESTION_GENERATION_PROMPT
        )
        self.text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
        self.question_gen_query = (
            question_gen_query
            or f"You are a Teacher/Professor. Your task is to setup \
                        {num_questions_per_chunk} questions for an upcoming \
                        quiz/examination. The questions should be diverse in nature \
                            across the document. Restrict the questions to the \
                                context information provided."
        )
        self.nodes = nodes
        self._metadata_mode = metadata_mode
        self._show_progress = show_progress

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        llm: Optional[LLM] = None,
        transformations: Optional[List[TransformComponent]] = None,
        callback_manager: Optional[CallbackManager] = None,
        num_questions_per_chunk: int = 10,
        text_question_template: BasePromptTemplate | None = None,
        text_qa_template: BasePromptTemplate | None = None,
        question_gen_query: str | None = None,
        required_keywords: List[str] | None = None,
        exclude_keywords: List[str] | None = None,
        show_progress: bool = False,
        # deprecated
        service_context: ServiceContext | None = None,
    ) -> DatasetGenerator:
        """从文档中生成数据集。"""
        llm = llm or llm_from_settings_or_context(Settings, service_context)
        transformations = transformations or transformations_from_settings_or_context(
            Settings, service_context
        )
        callback_manager = (
            callback_manager
            or callback_manager_from_settings_or_context(Settings, service_context)
        )

        nodes = run_transformations(
            documents, transformations, show_progress=show_progress
        )

        # use node postprocessor to filter nodes
        required_keywords = required_keywords or []
        exclude_keywords = exclude_keywords or []
        node_postprocessor = KeywordNodePostprocessor(
            callback_manager=callback_manager,
            required_keywords=required_keywords,
            exclude_keywords=exclude_keywords,
        )
        node_with_scores = [NodeWithScore(node=node) for node in nodes]
        node_with_scores = node_postprocessor.postprocess_nodes(node_with_scores)
        nodes = [node_with_score.node for node_with_score in node_with_scores]

        return cls(
            nodes=nodes,
            llm=llm,
            callback_manager=callback_manager,
            num_questions_per_chunk=num_questions_per_chunk,
            text_question_template=text_question_template,
            text_qa_template=text_qa_template,
            question_gen_query=question_gen_query,
            show_progress=show_progress,
            service_context=service_context,
        )

    async def _agenerate_dataset(
        self,
        nodes: List[BaseNode],
        num: int | None = None,
        generate_response: bool = False,
    ) -> QueryResponseDataset:
        """节点问题生成器。"""
        query_tasks: List[Coroutine] = []
        queries: Dict[str, str] = {}
        responses_dict: Dict[str, str] = {}

        if self._show_progress:
            from tqdm.asyncio import tqdm_asyncio

            async_module = tqdm_asyncio
        else:
            async_module = asyncio

        summary_indices: List[SummaryIndex] = []
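        # build a one-node SummaryIndex per source node; its query engine
        # generates the questions for that chunk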
        for node in nodes:
            if num is not None and len(query_tasks) >= num:
                break
            index = SummaryIndex.from_documents(
                [
                    Document(
                        text=node.get_content(metadata_mode=self._metadata_mode),
                        metadata=node.metadata,
                    )
                ],
                callback_manager=self.callback_manager,
            )

            query_engine = index.as_query_engine(
                llm=self.llm,
                text_qa_template=self.text_question_template,
                use_async=True,
            )
            task = query_engine.aquery(
                self.question_gen_query,
            )
            query_tasks.append(task)
            summary_indices.append(index)

        responses = await async_module.gather(*query_tasks)
        for idx, response in enumerate(responses):
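            # split the raw LLM output into lines and strip leading
            # enumeration such as "1." or "2)"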
            result = str(response).strip().split("\n")
            cleaned_questions = [
                re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
            ]
            cleaned_questions = [
                question for question in cleaned_questions if len(question) > 0
            ]
            cur_queries = {
                str(uuid.uuid4()): question for question in cleaned_questions
            }
            queries.update(cur_queries)

            if generate_response:
                index = summary_indices[idx]
                qr_tasks = []
                cur_query_items = list(cur_queries.items())
                cur_query_keys = [query_id for query_id, _ in cur_query_items]
                for query_id, query in cur_query_items:
                    qa_query_engine = index.as_query_engine(
                        llm=self.llm,
                        text_qa_template=self.text_qa_template,
                    )
                    qr_task = qa_query_engine.aquery(query)
                    qr_tasks.append(qr_task)
                qr_responses = await async_module.gather(*qr_tasks)
                for query_id, qa_response in zip(cur_query_keys, qr_responses):
                    responses_dict[query_id] = str(qa_response)

        query_ids = list(queries.keys())
        if num is not None:
            query_ids = query_ids[:num]
            # truncate queries, responses to the subset of query ids
            queries = {query_id: queries[query_id] for query_id in query_ids}
            if generate_response:
                responses_dict = {
                    query_id: responses_dict[query_id] for query_id in query_ids
                }

        return QueryResponseDataset(queries=queries, responses=responses_dict)

    async def agenerate_questions_from_nodes(self, num: int | None = None) -> List[str]:
        """为每个文档生成问题。"""
        dataset = await self._agenerate_dataset(
            self.nodes, num=num, generate_response=False
        )
        return dataset.questions

    async def agenerate_dataset_from_nodes(
        self, num: int | None = None
    ) -> QueryResponseDataset:
        """为每个文档生成问题。"""
        return await self._agenerate_dataset(
            self.nodes, num=num, generate_response=True
        )

    def generate_questions_from_nodes(self, num: int | None = None) -> List[str]:
        """为每个文档生成问题。"""
        return asyncio_run(self.agenerate_questions_from_nodes(num=num))

    def generate_dataset_from_nodes(
        self, num: int | None = None
    ) -> QueryResponseDataset:
        """为每个文档生成问题。"""
        return asyncio_run(self.agenerate_dataset_from_nodes(num=num))

    def _get_prompts(self) -> PromptDictType:
        """获取提示。"""
        return {
            "text_question_template": self.text_question_template,
            "text_qa_template": self.text_qa_template,
        }

    def _get_prompt_modules(self) -> PromptMixinType:
        """获取提示模块。"""
        return {}

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """更新提示。"""
        if "text_question_template" in prompts:
            self.text_question_template = prompts["text_question_template"]
        if "text_qa_template" in prompts:
            self.text_qa_template = prompts["text_qa_template"]

from_documents classmethod #

from_documents(
    documents: List[Document],
    llm: Optional[LLM] = None,
    transformations: Optional[
        List[TransformComponent]
    ] = None,
    callback_manager: Optional[CallbackManager] = None,
    num_questions_per_chunk: int = 10,
    text_question_template: (
        BasePromptTemplate | None
    ) = None,
    text_qa_template: BasePromptTemplate | None = None,
    question_gen_query: str | None = None,
    required_keywords: List[str] | None = None,
    exclude_keywords: List[str] | None = None,
    show_progress: bool = False,
    service_context: ServiceContext | None = None,
) -> DatasetGenerator

Generate a dataset from documents.

Source code in llama_index/core/evaluation/dataset_generation.py
@classmethod
def from_documents(
    cls,
    documents: List[Document],
    llm: Optional[LLM] = None,
    transformations: Optional[List[TransformComponent]] = None,
    callback_manager: Optional[CallbackManager] = None,
    num_questions_per_chunk: int = 10,
    text_question_template: BasePromptTemplate | None = None,
    text_qa_template: BasePromptTemplate | None = None,
    question_gen_query: str | None = None,
    required_keywords: List[str] | None = None,
    exclude_keywords: List[str] | None = None,
    show_progress: bool = False,
    # deprecated
    service_context: ServiceContext | None = None,
) -> DatasetGenerator:
    """从文档中生成数据集。"""
    llm = llm or llm_from_settings_or_context(Settings, service_context)
    transformations = transformations or transformations_from_settings_or_context(
        Settings, service_context
    )
    callback_manager = (
        callback_manager
        or callback_manager_from_settings_or_context(Settings, service_context)
    )

    nodes = run_transformations(
        documents, transformations, show_progress=show_progress
    )

    # use node postprocessor to filter nodes
    required_keywords = required_keywords or []
    exclude_keywords = exclude_keywords or []
    node_postprocessor = KeywordNodePostprocessor(
        callback_manager=callback_manager,
        required_keywords=required_keywords,
        exclude_keywords=exclude_keywords,
    )
    node_with_scores = [NodeWithScore(node=node) for node in nodes]
    node_with_scores = node_postprocessor.postprocess_nodes(node_with_scores)
    nodes = [node_with_score.node for node_with_score in node_with_scores]

    return cls(
        nodes=nodes,
        llm=llm,
        callback_manager=callback_manager,
        num_questions_per_chunk=num_questions_per_chunk,
        text_question_template=text_question_template,
        text_qa_template=text_qa_template,
        question_gen_query=question_gen_query,
        show_progress=show_progress,
        service_context=service_context,
    )
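A hedged sketch of the keyword-filtering path (documents and keywords are illustrative; assumes an LLM is configured on `Settings`). Nodes whose text lacks a required keyword, or contains an excluded one, are dropped before question generation:

from llama_index.core import Document
from llama_index.core.evaluation import DatasetGenerator

docs = [
    Document(text="Transformers rely on attention."),
    Document(text="CNNs rely on convolutions."),
]
# keeps only the first document's chunk
generator = DatasetGenerator.from_documents(
    docs,
    required_keywords=["attention"],
    exclude_keywords=["convolutions"],
)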

agenerate_questions_from_nodes async #

agenerate_questions_from_nodes(
    num: int | None = None,
) -> List[str]

Generates questions for each document.

Source code in llama_index/core/evaluation/dataset_generation.py
async def agenerate_questions_from_nodes(self, num: int | None = None) -> List[str]:
    """为每个文档生成问题。"""
    dataset = await self._agenerate_dataset(
        self.nodes, num=num, generate_response=False
    )
    return dataset.questions
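The `a*` variants are coroutines and must be awaited; a minimal sketch (placeholder document text, LLM assumed configured):

import asyncio

from llama_index.core import Document
from llama_index.core.evaluation import DatasetGenerator

async def main() -> None:
    generator = DatasetGenerator.from_documents(
        [Document(text="Sample text about your domain.")]
    )
    questions = await generator.agenerate_questions_from_nodes(num=3)
    print(questions)

asyncio.run(main())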

agenerate_dataset_from_nodes async #

agenerate_dataset_from_nodes(
    num: int | None = None,
) -> QueryResponseDataset

Generates questions for each document.

Source code in llama_index/core/evaluation/dataset_generation.py
async def agenerate_dataset_from_nodes(
    self, num: int | None = None
) -> QueryResponseDataset:
    """为每个文档生成问题。"""
    return await self._agenerate_dataset(
        self.nodes, num=num, generate_response=True
    )

generate_questions_from_nodes #

generate_questions_from_nodes(
    num: int | None = None,
) -> List[str]

Generates questions for each document.

Source code in llama_index/core/evaluation/dataset_generation.py
def generate_questions_from_nodes(self, num: int | None = None) -> List[str]:
    """为每个文档生成问题。"""
    return asyncio_run(self.agenerate_questions_from_nodes(num=num))

generate_dataset_from_nodes #

generate_dataset_from_nodes(
    num: int | None = None,
) -> QueryResponseDataset

Generates questions for each document.

Source code in llama_index/core/evaluation/dataset_generation.py
def generate_dataset_from_nodes(
    self, num: int | None = None
) -> QueryResponseDataset:
    """为每个文档生成问题。"""
    return asyncio_run(self.agenerate_dataset_from_nodes(num=num))
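Unlike the questions-only variant, this also answers each generated question against its source chunk and returns a QueryResponseDataset; a sketch with illustrative text:

from llama_index.core import Document
from llama_index.core.evaluation import DatasetGenerator

generator = DatasetGenerator.from_documents(
    [Document(text="Paris is the capital of France.")]
)
dataset = generator.generate_dataset_from_nodes(num=2)
for query, response in dataset.qr_pairs:
    print(query, "->", response)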

QueryResponseDataset #

Bases: BaseModel

Query Response Dataset.

The response can be empty if the dataset is generated from documents.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| queries | Dict[str, str] | Query id -> query. | required |
| responses | Dict[str, str] | Query id -> response. | required |
Source code in llama_index/core/evaluation/dataset_generation.py
@deprecated(
    "Deprecated in favor of `LabelledRagDataset` which should be used instead.",
    action="always",
)
class QueryResponseDataset(BaseModel):
    """查询响应数据集。

    如果数据集是从文档中生成的,响应可能为空。

    Args:
        queries (Dict[str, str]): 查询id -> 查询。
        responses (Dict[str, str]): 查询id -> 响应。"""

    queries: Dict[str, str] = Field(
        default_factory=dict, description="Query id -> query"
    )
    responses: Dict[str, str] = Field(
        default_factory=dict, description="Query id -> response"
    )

    @classmethod
    def from_qr_pairs(
        cls,
        qr_pairs: List[Tuple[str, str]],
    ) -> QueryResponseDataset:
        """从二维码对创建。"""
        # define ids as simple integers
        queries = {str(idx): query for idx, (query, _) in enumerate(qr_pairs)}
        responses = {str(idx): response for idx, (_, response) in enumerate(qr_pairs)}
        return cls(queries=queries, responses=responses)

    @property
    def qr_pairs(self) -> List[Tuple[str, str]]:
        """获取成对的元素。"""
        # if query_id not in response, throw error
        for query_id in self.queries:
            if query_id not in self.responses:
                raise ValueError(f"Query id {query_id} not in responses")

        return [
            (self.queries[query_id], self.responses[query_id])
            for query_id in self.queries
        ]

    @property
    def questions(self) -> List[str]:
        """获取问题。"""
        return list(self.queries.values())

    def save_json(self, path: str) -> None:
        """保存json。"""
        with open(path, "w") as f:
            json.dump(self.dict(), f, indent=4)

    @classmethod
    def from_json(cls, path: str) -> QueryResponseDataset:
        """加载json。"""
        with open(path) as f:
            data = json.load(f)
        return cls(**data)
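A small round-trip sketch ("qr" here means query-response pairs, not QR codes; the file path is illustrative):

from llama_index.core.evaluation import QueryResponseDataset

pairs = [("What is 2 + 2?", "4"), ("What is the capital of France?", "Paris")]
dataset = QueryResponseDataset.from_qr_pairs(pairs)

dataset.save_json("qr_dataset.json")  # persist to disk
loaded = QueryResponseDataset.from_json("qr_dataset.json")
assert loaded.qr_pairs == pairs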

qr_pairs property #

qr_pairs: List[Tuple[str, str]]

Get pairs.

questions property #

questions: List[str]

Get questions.

from_qr_pairs classmethod #

from_qr_pairs(
    qr_pairs: List[Tuple[str, str]]
) -> QueryResponseDataset

Create from query-response pairs.

Source code in llama_index/core/evaluation/dataset_generation.py
@classmethod
def from_qr_pairs(
    cls,
    qr_pairs: List[Tuple[str, str]],
) -> QueryResponseDataset:
    """从二维码对创建。"""
    # define ids as simple integers
    queries = {str(idx): query for idx, (query, _) in enumerate(qr_pairs)}
    responses = {str(idx): response for idx, (_, response) in enumerate(qr_pairs)}
    return cls(queries=queries, responses=responses)

save_json #

save_json(path: str) -> None

Save json.

Source code in llama_index/core/evaluation/dataset_generation.py
def save_json(self, path: str) -> None:
    """保存json。"""
    with open(path, "w") as f:
        json.dump(self.dict(), f, indent=4)

from_json classmethod #

from_json(path: str) -> QueryResponseDataset

Load json.

Source code in llama_index/core/evaluation/dataset_generation.py
@classmethod
def from_json(cls, path: str) -> QueryResponseDataset:
    """加载json。"""
    with open(path) as f:
        data = json.load(f)
    return cls(**data)