Skip to content

Marvin

MarvinMetadataExtractor #

Bases: BaseExtractor

Source code in llama_index/extractors/marvin/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class MarvinMetadataExtractor(BaseExtractor):
    """Metadata extractor for custom metadata using Marvin.

    Node-level extractor. Extracts the ``marvin_metadata`` metadata field.

    Args:
        marvin_model: The Marvin model to use for extracting metadata.
        llm_model_string: (optional) The LLM model string to use for
            extracting metadata.

    Usage:
        # create a list of extractors
        extractors = [
            TitleExtractor(nodes=1, llm=llm),
            MarvinMetadataExtractor(marvin_model=YourMarvinMetadataModel),
        ]

        # create a node parser to parse nodes from documents
        node_parser = SentenceSplitter(
            text_splitter=text_splitter
        )

        # use the node_parser to get nodes from documents
        from llama_index.ingestion import run_transformations
        nodes = run_transformations(documents, [node_parser] + extractors)
        print(nodes)
    """

    # Forward reference to handle circular imports
    marvin_model: Type["ai_model"] = Field(
        description="The Marvin model to use for extracting custom metadata"
    )
    llm_model_string: Optional[str] = Field(
        description="The LLM model string to use for extracting custom metadata"
    )

    def __init__(
        self,
        marvin_model: Type[BaseModel],
        llm_model_string: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params and validate the supplied Marvin model.

        Raises:
            ValueError: if ``marvin_model`` is not a subclass of
                marvin's ``ai_model``.
        """
        # Imported lazily so marvin is only required when this extractor
        # is actually constructed.
        import marvin
        from marvin import ai_model

        if not issubclass(marvin_model, ai_model):
            raise ValueError("marvin_model must be a subclass of ai_model")

        # NOTE: this mutates marvin's global settings, affecting all
        # subsequent marvin calls in the process.
        if llm_model_string:
            marvin.settings.llm_model = llm_model_string

        super().__init__(
            marvin_model=marvin_model, llm_model_string=llm_model_string, **kwargs
        )

    @classmethod
    def class_name(cls) -> str:
        # NOTE(review): the registered name ("MarvinEntityExtractor") does not
        # match the class name; kept as-is for serialization backward
        # compatibility — renaming it would break persisted configs.
        return "MarvinEntityExtractor"

    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        """Extract ``marvin_metadata`` for each node.

        Non-text nodes (when ``is_text_node_only`` is set) get an empty dict
        so the returned list stays aligned with ``nodes``.
        """
        from marvin import ai_model

        # Bind to a distinct local name; the original shadowed the imported
        # `ai_model` symbol here.
        model = cast(ai_model, self.marvin_model)
        metadata_list: List[Dict] = []

        nodes_queue: Iterable[BaseNode] = get_tqdm_iterable(
            nodes, self.show_progress, "Extracting marvin metadata"
        )
        for node in nodes_queue:
            if self.is_text_node_only and not isinstance(node, TextNode):
                metadata_list.append({})
                continue

            # TODO: Does marvin support async? Extraction is synchronous for now.
            metadata = model(node.get_content())

            metadata_list.append({"marvin_metadata": metadata.dict()})
        return metadata_list

llm_model_string class-attribute instance-attribute #

llm_model_string: Optional[str] = Field(
    description="The LLM model string to use for extracting custom metadata"
)

Metadata extractor for custom metadata using Marvin. Node-level extractor; extracts the `marvin_metadata` metadata field.

Args: marvin_model: The Marvin model to use for extracting metadata. llm_model_string: (optional) The LLM model string to use for extracting metadata.

Usage:

    # create a list of extractors
    extractors = [
        TitleExtractor(nodes=1, llm=llm),
        MarvinMetadataExtractor(marvin_model=YourMarvinMetadataModel),
    ]

#创建节点解析器以从文档中解析节点
node_parser = SentenceSplitter(
    text_splitter=text_splitter
)

#使用node_parser从文档中获取节点
from llama_index.ingestion import run_transformations
nodes = run_transformations(documents, [node_parser] + extractors)
print(nodes)