Question

QuestionsAnsweredExtractor

Bases: BaseExtractor

问题回答提取器。节点级提取器。提取questions_this_excerpt_can_answer元数据字段。

Parameters:

Name	Type	Description	Default
`llm`	`Optional[LLM]`	LLM	`None`
`questions`	`int`	要提取的问题数量	`5`
`prompt_template`	`str`	用于问题提取的模板	`DEFAULT_QUESTION_GEN_TMPL`
`embedding_only`	`bool`	是否仅将元数据用于嵌入	`True`

Source code in llama_index/core/extractors/metadata_extractors.py

class QuestionsAnsweredExtractor(BaseExtractor):
    """Node-level extractor for the questions an excerpt can answer.

    For each node, the LLM is prompted with the node content and the stripped
    response is stored under the `questions_this_excerpt_can_answer` metadata
    key.

    Args:
        llm (Optional[LLM]): LLM used for generation. When falsy, the
            constructor falls back to `llm_predictor`, then to `Settings.llm`.
        llm_predictor (Optional[LLMPredictorType]): Deprecated alternative
            to `llm`.
        questions (int): Number of questions to generate; must be at least 1.
        prompt_template (str): Template used to build the question prompt.
        embedding_only (bool): Whether to use the metadata for embeddings only.
        num_workers (int): Worker count handed to `run_jobs` by `aextract`.
    """

    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
    questions: int = Field(
        default=5, gt=0, description="The number of questions to generate."
    )
    prompt_template: str = Field(
        default=DEFAULT_QUESTION_GEN_TMPL,
        description="Prompt template to use when generating questions.",
    )
    embedding_only: bool = Field(
        default=True,
        description="Whether to use metadata for emebddings only.",
    )

    def __init__(
        self,
        llm: Optional[LLM] = None,
        # TODO: llm_predictor arg is deprecated
        llm_predictor: Optional[LLMPredictorType] = None,
        questions: int = 5,
        prompt_template: str = DEFAULT_QUESTION_GEN_TMPL,
        embedding_only: bool = True,
        num_workers: int = DEFAULT_NUM_WORKERS,
        **kwargs: Any,
    ) -> None:
        """Validate the arguments and initialise the extractor fields.

        Raises:
            ValueError: If `questions` is smaller than 1.
        """
        if questions < 1:
            raise ValueError("questions must be >= 1")

        # Resolved only after validation, so an invalid `questions` never
        # touches `Settings.llm`.
        resolved_llm = llm or llm_predictor or Settings.llm

        super().__init__(
            llm=resolved_llm,
            questions=questions,
            prompt_template=prompt_template,
            embedding_only=embedding_only,
            num_workers=num_workers,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this extractor class."""
        return "QuestionsAnsweredExtractor"

    async def _aextract_questions_from_node(self, node: BaseNode) -> Dict[str, str]:
        """Generate questions for one node and return them as a metadata dict.

        Returns an empty dict when `is_text_node_only` is set and `node` is
        not a `TextNode`. Otherwise returns a single-entry dict whose
        `questions_this_excerpt_can_answer` value is the stripped LLM output.
        """
        skip_node = self.is_text_node_only and not isinstance(node, TextNode)
        if skip_node:
            return {}

        excerpt = node.get_content(metadata_mode=self.metadata_mode)
        template = PromptTemplate(template=self.prompt_template)
        generated = await self.llm.apredict(
            template,
            num_questions=self.questions,
            context_str=excerpt,
        )

        return {"questions_this_excerpt_can_answer": generated.strip()}

    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        """Extract question metadata for every node in `nodes`.

        One extraction coroutine is created per node and the batch is handed
        to `run_jobs` with this extractor's `show_progress` and `num_workers`
        settings; the result of `run_jobs` is returned as-is.
        """
        pending = [self._aextract_questions_from_node(node) for node in nodes]
        return await run_jobs(
            pending,
            show_progress=self.show_progress,
            workers=self.num_workers,
        )