Index

IngestionPipeline #

Bases: BaseModel

一个可以应用于数据的摄取管道。

Parameters:

Name	Type	Description	Default
`name`	`str`	摄取管道的唯一名称。默认为DEFAULT_PIPELINE_NAME。	`DEFAULT_PIPELINE_NAME`
`project_name`	`str`	项目的唯一名称。默认为DEFAULT_PROJECT_NAME。	`DEFAULT_PROJECT_NAME`
`transformations`	`List[TransformComponent]`	要应用于数据的转换。默认为None。	`None`
`documents`	`Optional[Sequence[Document]]`	要摄取的文档。默认为None。	`None`
`readers`	`Optional[List[ReaderConfig]]`	用于读取数据的读取器。默认为None。	`None`
`vector_store`	`Optional[BasePydanticVectorStore]`	用于存储数据的向量存储。默认为None。	`None`
`cache`	`Optional[IngestionCache]`	用于存储数据的缓存。默认为None。	`None`
`docstore`	`Optional[BaseDocumentStore]`	用于与向量存储进行去重的文档存储。默认为None。	`None`
`docstore_strategy`	`DocstoreStrategy`	文档去重策略。默认为DocstoreStrategy.UPSERTS。	`UPSERTS`
`disable_cache`	`bool`	禁用缓存。默认为False。	`False`
`base_url`	`str`	LlamaCloud API的基本URL。未提供时先读取环境变量LLAMA_CLOUD_BASE_URL，若仍未设置则使用DEFAULT_BASE_URL。	`None`
`app_url`	`str`	LlamaCloud应用的基本URL。未提供时先读取环境变量LLAMA_CLOUD_APP_URL，若仍未设置则使用DEFAULT_APP_URL。	`None`
`api_key`	`Optional[str]`	LlamaCloud API密钥。默认为None。	`None`

Examples:

from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
        OpenAIEmbedding(),
    ],
)

nodes = pipeline.run(documents=documents)

Source code in llama_index/core/ingestion/pipeline.py

class IngestionPipeline(BaseModel):
    """一个可以应用于数据的摄取管道。

    Args:
        name (str, optional):
            摄取管道的唯一名称。默认为DEFAULT_PIPELINE_NAME。
        project_name (str, optional):
            项目的唯一名称。默认为DEFAULT_PROJECT_NAME。
        transformations (List[TransformComponent], optional):
            要应用于数据的转换。默认为None。
        documents (Optional[Sequence[Document]], optional):
            要摄取的文档。默认为None。
        readers (Optional[List[ReaderConfig]], optional):
            用于读取数据的读取器。默认为None。
        vector_store (Optional[BasePydanticVectorStore], optional):
            用于存储数据的向量存储。默认为None。
        cache (Optional[IngestionCache], optional):
            用于存储数据的缓存。默认为None。
        docstore (Optional[BaseDocumentStore], optional):
            用于与向量存储进行去重的文档存储。默认为None。
        docstore_strategy (DocstoreStrategy, optional):
            文档去重策略。默认为DocstoreStrategy.UPSERTS。
        disable_cache (bool, optional):
            禁用缓存。默认为False。
        base_url (str, optional):
            LlamaCloud API的基本URL。默认为DEFAULT_BASE_URL。
        app_url (str, optional):
            LlamaCloud应用的基本URL。默认为DEFAULT_APP_URL。
        api_key (Optional[str], optional):
            LlamaCloud API密钥。默认为None。

    Examples:
        ```python
        from llama_index.core.ingestion import IngestionPipeline
        from llama_index.core.node_parser import SentenceSplitter
        from llama_index.embeddings.openai import OpenAIEmbedding

        pipeline = IngestionPipeline(
            transformations=[
                SentenceSplitter(chunk_size=512, chunk_overlap=20),
                OpenAIEmbedding(),
            ],
        )

        nodes = pipeline.run(documents=documents)
        ```"""

    name: str = Field(
        default=DEFAULT_PIPELINE_NAME,
        description="Unique name of the ingestion pipeline",
    )
    project_name: str = Field(
        default=DEFAULT_PROJECT_NAME, description="Unique name of the project"
    )

    transformations: List[TransformComponent] = Field(
        description="Transformations to apply to the data"
    )

    documents: Optional[Sequence[Document]] = Field(description="Documents to ingest")
    readers: Optional[List[ReaderConfig]] = Field(
        description="Reader to use to read the data"
    )
    vector_store: Optional[BasePydanticVectorStore] = Field(
        description="Vector store to use to store the data"
    )
    cache: IngestionCache = Field(
        default_factory=IngestionCache,
        description="Cache to use to store the data",
    )
    docstore: Optional[BaseDocumentStore] = Field(
        default=None,
        description="Document store to use for de-duping with a vector store.",
    )
    docstore_strategy: DocstoreStrategy = Field(
        default=DocstoreStrategy.UPSERTS, description="Document de-dup strategy."
    )
    disable_cache: bool = Field(default=False, description="Disable the cache")

    base_url: str = Field(
        default=DEFAULT_BASE_URL, description="Base URL for the LlamaCloud API"
    )
    app_url: str = Field(
        default=DEFAULT_APP_URL, description="Base URL for the LlamaCloud app"
    )
    api_key: Optional[str] = Field(default=None, description="LlamaCloud API key")

    class Config:
        arbitrary_types_allowed = True

    def __init__(
        self,
        name: str = DEFAULT_PIPELINE_NAME,
        project_name: str = DEFAULT_PROJECT_NAME,
        transformations: Optional[List[TransformComponent]] = None,
        readers: Optional[List[ReaderConfig]] = None,
        documents: Optional[Sequence[Document]] = None,
        vector_store: Optional[BasePydanticVectorStore] = None,
        cache: Optional[IngestionCache] = None,
        docstore: Optional[BaseDocumentStore] = None,
        docstore_strategy: DocstoreStrategy = DocstoreStrategy.UPSERTS,
        base_url: Optional[str] = None,
        app_url: Optional[str] = None,
        api_key: Optional[str] = None,
        disable_cache: bool = False,
    ) -> None:
        """Initialise the pipeline, resolving unset arguments to their fallbacks.

        Args:
            name: Unique name of the ingestion pipeline.
            project_name: Unique name of the project.
            transformations: Transformations to apply. When ``None``, the list
                returned by ``_get_default_transformations()`` is used.
            readers: Reader configurations used to read data.
            documents: Documents to ingest.
            vector_store: Vector store used to store the data.
            cache: Ingestion cache. A falsy value is replaced by a new
                ``IngestionCache()``.
            docstore: Document store used for de-duplication.
            docstore_strategy: De-duplication strategy.
            base_url: LlamaCloud API base URL. Falls back to the
                ``LLAMA_CLOUD_BASE_URL`` environment variable, then to
                ``DEFAULT_BASE_URL``.
            app_url: LlamaCloud app base URL. Falls back to the
                ``LLAMA_CLOUD_APP_URL`` environment variable, then to
                ``DEFAULT_APP_URL``.
            api_key: LlamaCloud API key. Falls back to the
                ``LLAMA_CLOUD_API_KEY`` environment variable.
            disable_cache: Whether the cache is disabled.
        """
        chosen_transformations = (
            self._get_default_transformations()
            if transformations is None
            else transformations
        )

        # Explicit arguments win; otherwise consult the LLAMA_CLOUD_* variables.
        env = os.environ
        resolved_api_key = api_key or env.get("LLAMA_CLOUD_API_KEY", None)
        resolved_base_url = base_url or env.get(
            "LLAMA_CLOUD_BASE_URL", DEFAULT_BASE_URL
        )
        resolved_app_url = app_url or env.get("LLAMA_CLOUD_APP_URL", DEFAULT_APP_URL)

        resolved_cache = cache or IngestionCache()

        super().__init__(
            name=name,
            project_name=project_name,
            transformations=chosen_transformations,
            readers=readers,
            documents=documents,
            vector_store=vector_store,
            cache=resolved_cache,
            docstore=docstore,
            docstore_strategy=docstore_strategy,
            base_url=resolved_base_url,
            app_url=resolved_app_url,
            api_key=resolved_api_key,
            disable_cache=disable_cache,
        )

    @classmethod
    def from_pipeline_name(
        cls,
        name: str,
        project_name: str = DEFAULT_PROJECT_NAME,
        base_url: Optional[str] = None,
        cache: Optional[IngestionCache] = None,
        api_key: Optional[str] = None,
        app_url: Optional[str] = None,
        vector_store: Optional[BasePydanticVectorStore] = None,
        disable_cache: bool = False,
    ) -> "IngestionPipeline":
        """Create an ingestion pipeline from a pipeline looked up by name.

        The project and pipeline are fetched through the LlamaCloud client, and
        the pipeline's configured transformations and data sources are
        deserialized into local components.

        Args:
            name: Name of the pipeline to search for.
            project_name: Name of the project the pipeline belongs to.
            base_url: API base URL. Falls back to ``LLAMA_CLOUD_BASE_URL`` and
                then ``DEFAULT_BASE_URL``.
            cache: Cache handed to the new pipeline.
            api_key: API key. Falls back to ``LLAMA_CLOUD_API_KEY``.
            app_url: App base URL. Falls back to ``LLAMA_CLOUD_APP_URL`` and
                then ``DEFAULT_APP_URL``.
            vector_store: Vector store handed to the new pipeline.
            disable_cache: Whether the new pipeline disables its cache.

        Returns:
            A new pipeline built from the first matching remote pipeline.

        Raises:
            ValueError: If no project or no pipeline matches the given names.
        """
        base_url = base_url or os.environ.get("LLAMA_CLOUD_BASE_URL", DEFAULT_BASE_URL)
        assert base_url is not None

        api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY", None)
        app_url = app_url or os.environ.get("LLAMA_CLOUD_APP_URL", DEFAULT_APP_URL)

        client = get_client(api_key=api_key, base_url=base_url)

        projects: List[Project] = client.project.list_projects(
            project_name=project_name
        )
        # An empty result must be rejected here; `len(...) < 0` can never be
        # true and would let `projects[0]` fail with an IndexError instead.
        if not projects:
            raise ValueError(f"Project with name {project_name} not found")

        project = projects[0]
        assert project.id is not None, "Project ID should not be None"

        pipelines: List[Pipeline] = client.pipeline.search_pipelines(
            project_name=project_name, pipeline_name=name
        )
        if not pipelines:
            raise ValueError(f"Pipeline with name {name} not found")

        pipeline = pipelines[0]

        transformations: List[TransformComponent] = []
        for configured_transformation in pipeline.configured_transformations:
            component_dict = cast(dict, configured_transformation.component)
            transformation_component_type = (
                configured_transformation.configurable_transformation_type
            )
            transformation = deserialize_transformation_component(
                component_dict, transformation_component_type
            )
            transformations.append(transformation)

        documents = []
        readers = []
        for data_source in pipeline.data_sources:
            component_dict = cast(dict, data_source.component)
            source_component_type = data_source.source_type

            if source_component_type == ConfigurableDataSourceNames.READER:
                source_component = deserialize_source_component(
                    component_dict, source_component_type
                )
                readers.append(source_component)
            elif source_component_type == ConfigurableDataSourceNames.DOCUMENT:
                source_component = deserialize_source_component(
                    component_dict, source_component_type
                )
                # Only keep document sources that are nodes with content.
                if (
                    isinstance(source_component, BaseNode)
                    and source_component.get_content()
                ):
                    documents.append(source_component)

        return cls(
            name=name,
            project_name=project_name,
            transformations=transformations,
            readers=readers,
            documents=documents,
            vector_store=vector_store,
            base_url=base_url,
            cache=cache,
            disable_cache=disable_cache,
            api_key=api_key,
            app_url=app_url,
        )

    def register(
        self,
        verbose: bool = True,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
    ) -> str:
        """Upsert this pipeline and its project through the LlamaCloud client.

        Args:
            verbose: When true, print the playground URL of the pipeline.
            documents: Extra documents to include as pipeline inputs.
            nodes: Extra nodes to include as pipeline inputs.

        Returns:
            The ID of the upserted pipeline.
        """
        api_client = get_client(api_key=self.api_key, base_url=self.base_url)
        payload_nodes = self._prepare_inputs(documents, nodes)

        project_request = ProjectCreate(name=self.project_name)
        project = api_client.project.upsert_project(request=project_request)
        assert project.id is not None, "Project ID should not be None"

        # Imported here rather than at module level to avoid a circular import.
        from llama_index.core.ingestion.api_utils import get_pipeline_create

        create_request = get_pipeline_create(
            self.name,
            api_client,
            PipelineType.PLAYGROUND,
            project_name=self.project_name,
            transformations=self.transformations,
            input_nodes=payload_nodes,
            readers=self.readers,
        )

        # Upload the pipeline definition under the project.
        pipeline = api_client.project.upsert_pipeline_for_project(
            project.id,
            request=create_request,
        )
        assert pipeline.id is not None, "Pipeline ID should not be None"

        if not verbose:
            return pipeline.id

        # Callers such as run_remote pass verbose=False to suppress this line.
        print(
            f"Pipeline available at: {self.app_url}/project/{project.id}/playground/{pipeline.id}"
        )
        return pipeline.id

    def run_remote(
        self,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
    ) -> str:
        """Register the pipeline and create a playground job for it.

        Args:
            documents: Extra documents to include as pipeline inputs.
            nodes: Extra nodes to include as pipeline inputs.

        Returns:
            The ID of the created pipeline execution.
        """
        api_client = get_client(api_key=self.api_key, base_url=self.base_url)

        # Registration is silent here; the results URL is printed below instead.
        registered_id = self.register(documents=documents, nodes=nodes, verbose=False)

        pipeline_execution = api_client.pipeline.create_playground_job(registered_id)
        assert (
            pipeline_execution.id is not None
        ), "Pipeline execution ID should not be None"

        message = (
            f"Find your remote results here: {self.app_url}/"
            f"pipelines/execution?id={pipeline_execution.id}"
        )
        print(message)

        return pipeline_execution.id

    def persist(
        self,
        persist_dir: str = "./pipeline_storage",
        fs: Optional[AbstractFileSystem] = None,
        cache_name: str = DEFAULT_CACHE_NAME,
        docstore_name: str = DOCSTORE_FNAME,
    ) -> None:
        """Persist the cache, and the docstore if one is set, under ``persist_dir``.

        Args:
            persist_dir: Directory the files are written into.
            fs: Optional filesystem passed through to the stores' ``persist``.
            cache_name: File name used for the cache.
            docstore_name: File name used for the docstore.
        """
        if fs is None:
            # Local case: let pathlib join the path components.
            root = Path(persist_dir)
            docstore_target = str(root / docstore_name)
            cache_target = str(root / cache_name)
        else:
            # NOTE: doesn't support Windows here
            root_str = str(persist_dir)
            docstore_target = concat_dirs(root_str, docstore_name)
            cache_target = concat_dirs(root_str, cache_name)

        self.cache.persist(cache_target, fs=fs)

        if self.docstore is None:
            return
        self.docstore.persist(docstore_target, fs=fs)

    def load(
        self,
        persist_dir: str = "./pipeline_storage",
        fs: Optional[AbstractFileSystem] = None,
        cache_name: str = DEFAULT_CACHE_NAME,
        docstore_name: str = DOCSTORE_FNAME,
    ) -> None:
        """Load the cache, and the docstore if its file exists, from ``persist_dir``.

        The cache is always replaced. The docstore is replaced by a
        ``SimpleDocumentStore`` only when the docstore file is present.

        Args:
            persist_dir: Directory the files are read from.
            fs: Optional filesystem to read from; the local disk is used when
                ``None``.
            cache_name: File name of the persisted cache.
            docstore_name: File name of the persisted docstore.
        """
        if fs is not None:
            self.cache = IngestionCache.from_persist_path(
                concat_dirs(persist_dir, cache_name), fs=fs
            )
            persist_docstore_path = concat_dirs(persist_dir, docstore_name)
            # The existence check has to go through the supplied filesystem;
            # os.path.exists would look at the local disk instead.
            if fs.exists(persist_docstore_path):
                self.docstore = SimpleDocumentStore.from_persist_path(
                    persist_docstore_path, fs=fs
                )
        else:
            self.cache = IngestionCache.from_persist_path(
                str(Path(persist_dir) / cache_name)
            )
            persist_docstore_path = str(Path(persist_dir) / docstore_name)
            if os.path.exists(persist_docstore_path):
                self.docstore = SimpleDocumentStore.from_persist_path(
                    persist_docstore_path
                )

    def _get_default_transformations(self) -> List[TransformComponent]:
        """Return the transformations used when none are supplied.

        The list is a fresh ``SentenceSplitter`` followed by
        ``Settings.embed_model``.
        """
        splitter = SentenceSplitter()
        embed_model = Settings.embed_model
        return [splitter, embed_model]

    def _prepare_inputs(
        self, documents: Optional[List[Document]], nodes: Optional[List[BaseNode]]
    ) -> List[Document]:
        input_nodes: List[BaseNode] = []
        if documents is not None:
            input_nodes += documents

        if nodes is not None:
            input_nodes += nodes

        if self.documents is not None:
            input_nodes += self.documents

        if self.readers is not None:
            for reader in self.readers:
                input_nodes += reader.read()

        return input_nodes

    def _handle_duplicates(
        self,
        nodes: List[BaseNode],
        store_doc_text: bool = True,
    ) -> List[BaseNode]:
        """通过检查所有哈希值来处理文档存储的重复项。"""
        assert self.docstore is not None

        existing_hashes = self.docstore.get_all_document_hashes()
        current_hashes = []
        nodes_to_run = []
        for node in nodes:
            if node.hash not in existing_hashes and node.hash not in current_hashes:
                self.docstore.set_document_hash(node.id_, node.hash)
                nodes_to_run.append(node)
                current_hashes.append(node.hash)

        self.docstore.add_documents(nodes_to_run, store_text=store_doc_text)

        return nodes_to_run

    def _handle_upserts(
        self,
        nodes: List[BaseNode],
        store_doc_text: bool = True,
    ) -> List[BaseNode]:
        """Select new or changed nodes by comparing hashes per document ID.

        Each node is keyed by its ``ref_doc_id`` (or ``id_`` when that is
        falsy). Nodes with no stored hash are kept. Nodes whose stored hash
        differs are kept after the old entry is deleted from the docstore and,
        if set, the vector store. Unchanged nodes are skipped. Under
        ``UPSERTS_AND_DELETE``, IDs known before the call but absent from
        ``nodes`` are deleted as well.

        Args:
            nodes: Candidate nodes, processed in order.
            store_doc_text: Forwarded to the docstore as ``store_text``.

        Returns:
            The nodes to run, at most one per document ID (the last one wins).
        """
        assert self.docstore is not None

        known_doc_ids = set(self.docstore.get_all_document_hashes().values())
        seen_doc_ids = set()
        pending = {}
        for node in nodes:
            doc_id = node.ref_doc_id or node.id_
            seen_doc_ids.add(doc_id)
            stored_hash = self.docstore.get_document_hash(doc_id)

            if stored_hash and stored_hash == node.hash:
                # Document exists and is unchanged, so skip it.
                continue

            if stored_hash:
                # Document changed: remove the stale copies before re-adding.
                self.docstore.delete_ref_doc(doc_id, raise_error=False)

                if self.vector_store is not None:
                    self.vector_store.delete(doc_id)

            self.docstore.set_document_hash(doc_id, node.hash)
            pending[doc_id] = node

        if self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
            # Anything known before but missing from this run gets removed
            # from the docstore and the vector store.
            for stale_id in known_doc_ids - seen_doc_ids:
                self.docstore.delete_document(stale_id)

                if self.vector_store is not None:
                    self.vector_store.delete(stale_id)

        nodes_to_run = list(pending.values())
        self.docstore.add_documents(nodes_to_run, store_text=store_doc_text)

        return nodes_to_run

    @staticmethod
    def _node_batcher(
        num_batches: int, nodes: Union[List[BaseNode], List[Document]]
    ) -> Generator[Union[List[BaseNode], List[Document]], Any, Any]:
        """从lst中产生连续的大小为n的块。"""
        batch_size = max(1, int(len(nodes) / num_batches))
        for i in range(0, len(nodes), batch_size):
            yield nodes[i : i + batch_size]

    def run(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
        cache_collection: Optional[str] = None,
        in_place: bool = True,
        store_doc_text: bool = True,
        num_workers: Optional[int] = None,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """Run the configured transformations over the collected inputs.

        Inputs come from ``documents``, ``nodes``, the pipeline's own documents
        and its readers. If a docstore is set, inputs are de-duplicated first
        according to ``docstore_strategy``. If a vector store is set, resulting
        nodes that have an embedding are added to it.

        When a docstore is set without a vector store, an upsert strategy is
        switched to ``DUPLICATES_ONLY`` on the instance and a notice is printed.

        Args:
            show_progress: Show a progress bar. Only forwarded on the
                sequential path.
            documents: Documents to transform. Defaults to None.
            nodes: Nodes to transform. Defaults to None.
            cache_collection: Cache collection used for the transformations.
                Defaults to None.
            in_place: Whether the transformations modify the list passed to
                ``run_transformations`` or create a new list for the
                transformed nodes. Defaults to True.
            store_doc_text: Forwarded to the docstore as ``store_text`` during
                de-duplication. Defaults to True.
            num_workers: Number of parallel processes. Values of None, 0 or 1
                run sequentially. Values above the CPU count are lowered to
                it. Defaults to None.
            **kwargs: Forwarded to ``run_transformations`` on the sequential
                path only.

        Returns:
            The transformed nodes/documents.

        Raises:
            ValueError: If both stores are set and the strategy is not one of
                the handled ``DocstoreStrategy`` values.
        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                nodes_to_run = self._handle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = self._handle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upserts need a vector store to delete from, so fall back.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                print(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                print(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = self._handle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )

        else:
            nodes_to_run = input_nodes

        if num_workers and num_workers > 1:
            cpu_total = multiprocessing.cpu_count()
            if num_workers > cpu_total:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Apply the cap that the warning above announces.
                num_workers = cpu_total

            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                node_batches = self._node_batcher(
                    num_batches=num_workers, nodes=nodes_to_run
                )
                nodes_parallel = p.starmap(
                    run_transformations,
                    zip(
                        node_batches,
                        repeat(self.transformations),
                        repeat(in_place),
                        repeat(self.cache if not self.disable_cache else None),
                        repeat(cache_collection),
                    ),
                )
                # Flatten the per-batch results back into one list.
                nodes = reduce(lambda x, y: x + y, nodes_parallel, [])
        else:
            nodes = run_transformations(
                nodes_to_run,
                self.transformations,
                show_progress=show_progress,
                cache=self.cache if not self.disable_cache else None,
                cache_collection=cache_collection,
                in_place=in_place,
                **kwargs,
            )

        if self.vector_store is not None:
            self.vector_store.add([n for n in nodes if n.embedding is not None])

        return nodes

    # ------ async methods ------

    async def _ahandle_duplicates(
        self,
        nodes: List[BaseNode],
        store_doc_text: bool = True,
    ) -> List[BaseNode]:
        """通过检查所有哈希值来处理文档存储的重复项。"""
        assert self.docstore is not None

        existing_hashes = await self.docstore.aget_all_document_hashes()
        current_hashes = []
        nodes_to_run = []
        for node in nodes:
            if node.hash not in existing_hashes and node.hash not in current_hashes:
                await self.docstore.aset_document_hash(node.id_, node.hash)
                nodes_to_run.append(node)
                current_hashes.append(node.hash)

        await self.docstore.async_add_documents(nodes_to_run, store_text=store_doc_text)

        return nodes_to_run

    async def _ahandle_upserts(
        self,
        nodes: List[BaseNode],
        store_doc_text: bool = True,
    ) -> List[BaseNode]:
        """Async variant: select new or changed nodes by hash per document ID.

        Each node is keyed by its ``ref_doc_id`` (or ``id_`` when that is
        falsy). Nodes with no stored hash are kept. Nodes whose stored hash
        differs are kept after the old entry is deleted from the docstore and,
        if set, the vector store. Unchanged nodes are skipped. Under
        ``UPSERTS_AND_DELETE``, IDs known before the call but absent from
        ``nodes`` are deleted as well.

        Args:
            nodes: Candidate nodes, processed in order.
            store_doc_text: Forwarded to the docstore as ``store_text``.

        Returns:
            The nodes to run, at most one per document ID (the last one wins).
        """
        assert self.docstore is not None

        stored_hashes = await self.docstore.aget_all_document_hashes()
        known_doc_ids = set(stored_hashes.values())
        seen_doc_ids = set()
        pending = {}
        for node in nodes:
            doc_id = node.ref_doc_id or node.id_
            seen_doc_ids.add(doc_id)
            stored_hash = await self.docstore.aget_document_hash(doc_id)

            if stored_hash and stored_hash == node.hash:
                # Document exists and is unchanged, so skip it.
                continue

            if stored_hash:
                # Document changed: remove the stale copies before re-adding.
                await self.docstore.adelete_ref_doc(doc_id, raise_error=False)

                if self.vector_store is not None:
                    await self.vector_store.adelete(doc_id)

            await self.docstore.aset_document_hash(doc_id, node.hash)
            pending[doc_id] = node

        if self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
            # Anything known before but missing from this run gets removed
            # from the docstore and the vector store.
            for stale_id in known_doc_ids - seen_doc_ids:
                await self.docstore.adelete_document(stale_id)

                if self.vector_store is not None:
                    await self.vector_store.adelete(stale_id)

        nodes_to_run = list(pending.values())
        await self.docstore.async_add_documents(nodes_to_run, store_text=store_doc_text)

        return nodes_to_run

    async def arun(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
        cache_collection: Optional[str] = None,
        in_place: bool = True,
        store_doc_text: bool = True,
        num_workers: Optional[int] = None,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """Asynchronously run the configured transformations over the inputs.

        Inputs come from ``documents``, ``nodes``, the pipeline's own documents
        and its readers. If a docstore is set, inputs are de-duplicated first
        according to ``docstore_strategy``. If a vector store is set, resulting
        nodes that have an embedding are added to it.

        When a docstore is set without a vector store, an upsert strategy is
        switched to ``DUPLICATES_ONLY`` on the instance and a notice is printed.

        Args:
            show_progress: Show a progress bar. Only forwarded on the
                sequential path.
            documents: Documents to transform. Defaults to None.
            nodes: Nodes to transform. Defaults to None.
            cache_collection: Cache collection used for the transformations.
                Defaults to None.
            in_place: Whether the transformations modify the list passed to
                them or create a new list for the transformed nodes.
                Defaults to True.
            store_doc_text: Forwarded to the docstore as ``store_text`` during
                de-duplication. Defaults to True.
            num_workers: Number of parallel processes. Values of None, 0 or 1
                run sequentially. Values above the CPU count are lowered to
                it. Defaults to None.
            **kwargs: Forwarded to ``arun_transformations`` on the sequential
                path only.

        Returns:
            The transformed nodes/documents.

        Raises:
            ValueError: If both stores are set and the strategy is not one of
                the handled ``DocstoreStrategy`` values.
        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                nodes_to_run = await self._ahandle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = await self._ahandle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upserts need a vector store to delete from, so fall back.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                print(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                print(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = await self._ahandle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )

        else:
            nodes_to_run = input_nodes

        if num_workers and num_workers > 1:
            cpu_total = multiprocessing.cpu_count()
            if num_workers > cpu_total:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Apply the cap that the warning above announces.
                num_workers = cpu_total

            loop = asyncio.get_event_loop()
            with ProcessPoolExecutor(max_workers=num_workers) as p:
                node_batches = self._node_batcher(
                    num_batches=num_workers, nodes=nodes_to_run
                )
                # One executor task per batch; each receives its batch as the
                # positional argument of the partially-applied wrapper.
                tasks = [
                    loop.run_in_executor(
                        p,
                        partial(
                            arun_transformations_wrapper,
                            transformations=self.transformations,
                            in_place=in_place,
                            cache=self.cache if not self.disable_cache else None,
                            cache_collection=cache_collection,
                        ),
                        batch,
                    )
                    for batch in node_batches
                ]
                result: List[List[BaseNode]] = await asyncio.gather(*tasks)
                # Flatten the per-batch results back into one list.
                nodes = reduce(lambda x, y: x + y, result, [])
        else:
            nodes = await arun_transformations(
                nodes_to_run,
                self.transformations,
                show_progress=show_progress,
                cache=self.cache if not self.disable_cache else None,
                cache_collection=cache_collection,
                in_place=in_place,
                **kwargs,
            )

        if self.vector_store is not None:
            await self.vector_store.async_add(
                [n for n in nodes if n.embedding is not None]
            )

        return nodes

from_pipeline_name `classmethod` #

from_pipeline_name(
    name: str,
    project_name: str = DEFAULT_PROJECT_NAME,
    base_url: Optional[str] = None,
    cache: Optional[IngestionCache] = None,
    api_key: Optional[str] = None,
    app_url: Optional[str] = None,
    vector_store: Optional[BasePydanticVectorStore] = None,
    disable_cache: bool = False,
) -> IngestionPipeline

根据管道名称，通过LlamaCloud客户端查找对应的项目和管道，并据此创建一个摄取管道。

Source code in llama_index/core/ingestion/pipeline.py

@classmethod
def from_pipeline_name(
    cls,
    name: str,
    project_name: str = DEFAULT_PROJECT_NAME,
    base_url: Optional[str] = None,
    cache: Optional[IngestionCache] = None,
    api_key: Optional[str] = None,
    app_url: Optional[str] = None,
    vector_store: Optional[BasePydanticVectorStore] = None,
    disable_cache: bool = False,
) -> "IngestionPipeline":
    """Create an ingestion pipeline from a pipeline looked up by name.

    The project and pipeline are fetched through the LlamaCloud client, and
    the pipeline's configured transformations and data sources are
    deserialized into local components.

    Args:
        name: Name of the pipeline to search for.
        project_name: Name of the project the pipeline belongs to.
        base_url: API base URL. Falls back to ``LLAMA_CLOUD_BASE_URL`` and
            then ``DEFAULT_BASE_URL``.
        cache: Cache handed to the new pipeline.
        api_key: API key. Falls back to ``LLAMA_CLOUD_API_KEY``.
        app_url: App base URL. Falls back to ``LLAMA_CLOUD_APP_URL`` and
            then ``DEFAULT_APP_URL``.
        vector_store: Vector store handed to the new pipeline.
        disable_cache: Whether the new pipeline disables its cache.

    Returns:
        A new pipeline built from the first matching remote pipeline.

    Raises:
        ValueError: If no project or no pipeline matches the given names.
    """
    base_url = base_url or os.environ.get("LLAMA_CLOUD_BASE_URL", DEFAULT_BASE_URL)
    assert base_url is not None

    api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY", None)
    app_url = app_url or os.environ.get("LLAMA_CLOUD_APP_URL", DEFAULT_APP_URL)

    client = get_client(api_key=api_key, base_url=base_url)

    projects: List[Project] = client.project.list_projects(
        project_name=project_name
    )
    # An empty result must be rejected here; `len(...) < 0` can never be
    # true and would let `projects[0]` fail with an IndexError instead.
    if not projects:
        raise ValueError(f"Project with name {project_name} not found")

    project = projects[0]
    assert project.id is not None, "Project ID should not be None"

    pipelines: List[Pipeline] = client.pipeline.search_pipelines(
        project_name=project_name, pipeline_name=name
    )
    if not pipelines:
        raise ValueError(f"Pipeline with name {name} not found")

    pipeline = pipelines[0]

    transformations: List[TransformComponent] = []
    for configured_transformation in pipeline.configured_transformations:
        component_dict = cast(dict, configured_transformation.component)
        transformation_component_type = (
            configured_transformation.configurable_transformation_type
        )
        transformation = deserialize_transformation_component(
            component_dict, transformation_component_type
        )
        transformations.append(transformation)

    documents = []
    readers = []
    for data_source in pipeline.data_sources:
        component_dict = cast(dict, data_source.component)
        source_component_type = data_source.source_type

        if source_component_type == ConfigurableDataSourceNames.READER:
            source_component = deserialize_source_component(
                component_dict, source_component_type
            )
            readers.append(source_component)
        elif source_component_type == ConfigurableDataSourceNames.DOCUMENT:
            source_component = deserialize_source_component(
                component_dict, source_component_type
            )
            # Only keep document sources that are nodes with content.
            if (
                isinstance(source_component, BaseNode)
                and source_component.get_content()
            ):
                documents.append(source_component)

    return cls(
        name=name,
        project_name=project_name,
        transformations=transformations,
        readers=readers,
        documents=documents,
        vector_store=vector_store,
        base_url=base_url,
        cache=cache,
        disable_cache=disable_cache,
        api_key=api_key,
        app_url=app_url,
    )

register #

register(
    verbose: bool = True,
    documents: Optional[List[Document]] = None,
    nodes: Optional[List[BaseNode]] = None,
) -> str

使用LlamaCloud API注册该摄取管道，并返回管道ID。

Source code in llama_index/core/ingestion/pipeline.py

def register(
    self,
    verbose: bool = True,
    documents: Optional[List[Document]] = None,
    nodes: Optional[List[BaseNode]] = None,
) -> str:
    """Register this pipeline with the LlamaCloud API.

    Upserts the project named ``self.project_name``, builds a playground
    pipeline definition from this pipeline's transformations, readers and
    prepared input nodes, and upserts that definition under the project.

    Args:
        verbose: When true, print the playground URL of the pipeline.
        documents: Optional documents forwarded to ``_prepare_inputs``.
        nodes: Optional nodes forwarded to ``_prepare_inputs``.

    Returns:
        The ID of the upserted pipeline.
    """
    api_client = get_client(api_key=self.api_key, base_url=self.base_url)
    prepared_nodes = self._prepare_inputs(documents, nodes)

    # Make sure the project exists remotely before attaching a pipeline to it.
    cloud_project = api_client.project.upsert_project(
        request=ProjectCreate(name=self.project_name)
    )
    assert cloud_project.id is not None, "Project ID should not be None"

    # Imported here (not at module level) to avoid a circular import.
    from llama_index.core.ingestion.api_utils import get_pipeline_create

    create_request = get_pipeline_create(
        self.name,
        api_client,
        PipelineType.PLAYGROUND,
        project_name=self.project_name,
        transformations=self.transformations,
        input_nodes=prepared_nodes,
        readers=self.readers,
    )

    # Upload the pipeline definition.
    cloud_pipeline = api_client.project.upsert_pipeline_for_project(
        cloud_project.id,
        request=create_request,
    )
    assert cloud_pipeline.id is not None, "Pipeline ID should not be None"

    # Tell the user where the playground lives, unless asked to stay quiet.
    if verbose:
        playground_url = (
            f"{self.app_url}/project/{cloud_project.id}"
            f"/playground/{cloud_pipeline.id}"
        )
        print(f"Pipeline available at: {playground_url}")

    return cloud_pipeline.id

persist #

persist(
    persist_dir: str = "./pipeline_storage",
    fs: Optional[AbstractFileSystem] = None,
    cache_name: str = DEFAULT_CACHE_NAME,
    docstore_name: str = DOCSTORE_FNAME,
) -> None

将管道持久化到磁盘。

Source code in llama_index/core/ingestion/pipeline.py

def persist(
    self,
    persist_dir: str = "./pipeline_storage",
    fs: Optional[AbstractFileSystem] = None,
    cache_name: str = DEFAULT_CACHE_NAME,
    docstore_name: str = DOCSTORE_FNAME,
) -> None:
    """Persist the pipeline's cache and, if set, its docstore to disk.

    Args:
        persist_dir: Directory under which both files are written.
        fs: Optional filesystem passed through to the ``persist`` calls.
            When given, paths are joined with ``concat_dirs``; otherwise
            ``pathlib.Path`` is used.
        cache_name: File name of the cache inside ``persist_dir``.
        docstore_name: File name of the docstore inside ``persist_dir``.
    """
    if fs is None:
        local_root = Path(persist_dir)
        docstore_target = str(local_root / docstore_name)
        cache_target = str(local_root / cache_name)
    else:
        # NOTE: doesn't support Windows here
        remote_root = str(persist_dir)
        docstore_target = concat_dirs(remote_root, docstore_name)
        cache_target = concat_dirs(remote_root, cache_name)

    self.cache.persist(cache_target, fs=fs)

    # The docstore is optional; nothing more to write when it is absent.
    if self.docstore is None:
        return
    self.docstore.persist(docstore_target, fs=fs)

load #

load(
    persist_dir: str = "./pipeline_storage",
    fs: Optional[AbstractFileSystem] = None,
    cache_name: str = DEFAULT_CACHE_NAME,
    docstore_name: str = DOCSTORE_FNAME,
) -> None

从磁盘加载管道。

Source code in llama_index/core/ingestion/pipeline.py

def load(
    self,
    persist_dir: str = "./pipeline_storage",
    fs: Optional[AbstractFileSystem] = None,
    cache_name: str = DEFAULT_CACHE_NAME,
    docstore_name: str = DOCSTORE_FNAME,
) -> None:
    """Load the pipeline's cache and, if present, its docstore from disk.

    The cache is always loaded and replaces ``self.cache``. The docstore is
    only loaded (replacing ``self.docstore``) when its file exists;
    otherwise ``self.docstore`` is left untouched.

    Args:
        persist_dir: Directory the pipeline was persisted to.
        fs: Optional filesystem to read from. When given, paths are joined
            with ``concat_dirs`` and existence is checked on ``fs`` itself.
        cache_name: File name of the cache inside ``persist_dir``.
        docstore_name: File name of the docstore inside ``persist_dir``.
    """
    if fs is not None:
        self.cache = IngestionCache.from_persist_path(
            concat_dirs(persist_dir, cache_name), fs=fs
        )
        persist_docstore_path = concat_dirs(persist_dir, docstore_name)
        # Ask the supplied filesystem, not the local disk: with a remote
        # `fs`, os.path.exists() would look in the wrong place and the
        # docstore would silently never be loaded.
        if fs.exists(persist_docstore_path):
            self.docstore = SimpleDocumentStore.from_persist_path(
                persist_docstore_path, fs=fs
            )
    else:
        local_root = Path(persist_dir)
        self.cache = IngestionCache.from_persist_path(str(local_root / cache_name))
        persist_docstore_path = str(local_root / docstore_name)
        if os.path.exists(persist_docstore_path):
            self.docstore = SimpleDocumentStore.from_persist_path(
                persist_docstore_path
            )

run #

run(
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[List[BaseNode]] = None,
    cache_collection: Optional[str] = None,
    in_place: bool = True,
    store_doc_text: bool = True,
    num_workers: Optional[int] = None,
    **kwargs: Any
) -> Sequence[BaseNode]

对一组节点运行一系列转换。

如果提供了向量存储，带有嵌入的节点将被添加到向量存储中。

如果同时提供了向量存储和文档存储，文档存储将用于对文档去重。

Returns:

Type	Description
`Sequence[BaseNode]`	Sequence[BaseNode]：转换后的节点/文档集合

Source code in llama_index/core/ingestion/pipeline.py

    def run(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
        cache_collection: Optional[str] = None,
        in_place: bool = True,
        store_doc_text: bool = True,
        num_workers: Optional[int] = None,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """Run a series of transformations on a set of nodes.

        If a vector store is provided, nodes with embeddings will be added to
        the vector store.

        If a vector store + docstore are provided, the docstore will be used
        to de-duplicate documents.

        If only a docstore is provided, the ``UPSERTS`` and
        ``UPSERTS_AND_DELETE`` strategies cannot be applied: a notice is
        printed and ``self.docstore_strategy`` is switched to
        ``DUPLICATES_ONLY`` (this mutates the pipeline).

        Args:
            show_progress (bool, optional): Shows execution progress bar(s).
                Only used on the sequential path. Defaults to False.
            documents (Optional[List[Document]], optional): Set of documents
                to be transformed. Defaults to None.
            nodes (Optional[List[BaseNode]], optional): Set of nodes to be
                transformed. Defaults to None.
            cache_collection (Optional[str], optional): Cache for
                transformations. Defaults to None.
            in_place (bool, optional): Whether transformations create a new
                list for the transformed nodes or modify the array passed to
                `run_transformations`. Defaults to True.
            store_doc_text (bool, optional): Forwarded to the docstore
                de-duplication handlers. Defaults to True.
            num_workers (Optional[int], optional): The number of parallel
                processes to use. Values above the machine's CPU count are
                lowered to the CPU count. If None or not greater than 1,
                sequential compute is used. Defaults to None.
            **kwargs: Extra keyword arguments forwarded to
                `run_transformations` on the sequential path only.

        Returns:
            Sequence[BaseNode]: The set of transformed nodes/documents.

        Raises:
            ValueError: If ``self.docstore_strategy`` is not a known strategy
                when both a docstore and a vector store are set.
        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                nodes_to_run = self._handle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = self._handle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upsert strategies are not applied without a vector store, so
            # fall back to plain duplicate detection.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                print(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                print(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = self._handle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )

        else:
            nodes_to_run = input_nodes

        if num_workers and num_workers > 1:
            num_cpus = multiprocessing.cpu_count()
            if num_workers > num_cpus:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the cap announced by the warning.
                num_workers = num_cpus

            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                node_batches = self._node_batcher(
                    num_batches=num_workers, nodes=nodes_to_run
                )
                # Positional order must match run_transformations' signature:
                # (nodes, transformations, in_place, cache, cache_collection).
                nodes_parallel = p.starmap(
                    run_transformations,
                    zip(
                        node_batches,
                        repeat(self.transformations),
                        repeat(in_place),
                        repeat(self.cache if not self.disable_cache else None),
                        repeat(cache_collection),
                    ),
                )
                # Flatten the per-batch results in linear time.
                nodes = [node for batch in nodes_parallel for node in batch]
        else:
            nodes = run_transformations(
                nodes_to_run,
                self.transformations,
                show_progress=show_progress,
                cache=self.cache if not self.disable_cache else None,
                cache_collection=cache_collection,
                in_place=in_place,
                **kwargs,
            )

        if self.vector_store is not None:
            nodes_with_embeddings = [n for n in nodes if n.embedding is not None]
            # Skip the store call entirely when there is nothing to insert.
            if nodes_with_embeddings:
                self.vector_store.add(nodes_with_embeddings)

        return nodes

arun `async` #

arun(
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[List[BaseNode]] = None,
    cache_collection: Optional[str] = None,
    in_place: bool = True,
    store_doc_text: bool = True,
    num_workers: Optional[int] = None,
    **kwargs: Any
) -> Sequence[BaseNode]

对一组节点运行一系列转换。

如果提供了向量存储，带有嵌入的节点将被添加到向量存储中。

如果同时提供了向量存储和文档存储，文档存储将用于对文档去重。

Returns:

Type	Description
`Sequence[BaseNode]`	Sequence[BaseNode]：转换后的节点/文档集合

Source code in llama_index/core/ingestion/pipeline.py

    async def arun(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[List[BaseNode]] = None,
        cache_collection: Optional[str] = None,
        in_place: bool = True,
        store_doc_text: bool = True,
        num_workers: Optional[int] = None,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """Asynchronously run a series of transformations on a set of nodes.

        If a vector store is provided, nodes with embeddings will be added to
        the vector store.

        If a vector store + docstore are provided, the docstore will be used
        to de-duplicate documents.

        If only a docstore is provided, the ``UPSERTS`` and
        ``UPSERTS_AND_DELETE`` strategies cannot be applied: a notice is
        printed and ``self.docstore_strategy`` is switched to
        ``DUPLICATES_ONLY`` (this mutates the pipeline).

        Args:
            show_progress (bool, optional): Shows execution progress bar(s).
                Only used on the sequential path. Defaults to False.
            documents (Optional[List[Document]], optional): Set of documents
                to be transformed. Defaults to None.
            nodes (Optional[List[BaseNode]], optional): Set of nodes to be
                transformed. Defaults to None.
            cache_collection (Optional[str], optional): Cache for
                transformations. Defaults to None.
            in_place (bool, optional): Whether transformations create a new
                list for the transformed nodes or modify the array passed to
                `run_transformations`. Defaults to True.
            store_doc_text (bool, optional): Forwarded to the docstore
                de-duplication handlers. Defaults to True.
            num_workers (Optional[int], optional): The number of parallel
                processes to use. Values above the machine's CPU count are
                lowered to the CPU count. If None or not greater than 1,
                sequential compute is used. Defaults to None.
            **kwargs: Extra keyword arguments forwarded to
                `arun_transformations` on the sequential path only.

        Returns:
            Sequence[BaseNode]: The set of transformed nodes/documents.

        Raises:
            ValueError: If ``self.docstore_strategy`` is not a known strategy
                when both a docstore and a vector store are set.
        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                nodes_to_run = await self._ahandle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = await self._ahandle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upsert strategies are not applied without a vector store, so
            # fall back to plain duplicate detection.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                print(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                print(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = await self._ahandle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )

        else:
            nodes_to_run = input_nodes

        if num_workers and num_workers > 1:
            num_cpus = multiprocessing.cpu_count()
            if num_workers > num_cpus:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the cap announced by the warning.
                num_workers = num_cpus

            # We are inside a coroutine, so a loop is guaranteed to be running.
            loop = asyncio.get_running_loop()
            with ProcessPoolExecutor(max_workers=num_workers) as p:
                node_batches = self._node_batcher(
                    num_batches=num_workers, nodes=nodes_to_run
                )
                # One executor job per batch; the batch is passed as the
                # wrapper's single positional argument.
                tasks = [
                    loop.run_in_executor(
                        p,
                        partial(
                            arun_transformations_wrapper,
                            transformations=self.transformations,
                            in_place=in_place,
                            cache=self.cache if not self.disable_cache else None,
                            cache_collection=cache_collection,
                        ),
                        batch,
                    )
                    for batch in node_batches
                ]
                result: List[List[BaseNode]] = await asyncio.gather(*tasks)
                # Flatten the per-batch results in linear time.
                nodes = [node for batch_nodes in result for node in batch_nodes]
        else:
            nodes = await arun_transformations(
                nodes_to_run,
                self.transformations,
                show_progress=show_progress,
                cache=self.cache if not self.disable_cache else None,
                cache_collection=cache_collection,
                in_place=in_place,
                **kwargs,
            )

        if self.vector_store is not None:
            nodes_with_embeddings = [n for n in nodes if n.embedding is not None]
            # Skip the store call entirely when there is nothing to insert.
            if nodes_with_embeddings:
                await self.vector_store.async_add(nodes_with_embeddings)

        return nodes

DocstoreStrategy #

Bases: str, Enum

文档去重策略通过比较存储在文档存储中的哈希值或id来工作。它们需要设置一个必须在管道运行中持久化的文档存储。

属性： UPSERTS: ('upserts') 使用upserts来处理重复项。根据文档的id检查文档是否已存在于文档存储中。如果不存在，或者文档的哈希值已发生变化，则更新文档存储并运行转换。 DUPLICATES_ONLY: ('duplicates_only') 仅处理重复项。检查文档的哈希值是否已存在于文档存储中。只有在哈希值尚不存在时，才会将文档添加到文档存储并运行转换。 UPSERTS_AND_DELETE: ('upserts_and_delete') 使用upserts和删除来处理重复项。类似于upserts策略，但它还会从文档存储中删除已不存在的文档。

Source code in llama_index/core/ingestion/pipeline.py

class DocstoreStrategy(str, Enum):
    """Document de-duplication strategies.

    The strategies work by comparing the hashes or ids stored in the
    document store. They require a document store to be set, and that
    document store must be persisted across pipeline runs.

    Attributes:
        UPSERTS:
            ('upserts') Use upserts to handle duplicates. Checks whether a
            document is already in the doc store based on its id. If it is
            not, or if the hash of the document has changed, the doc store
            is updated and the transformations are run.
        DUPLICATES_ONLY:
            ('duplicates_only') Only handle duplicates. Checks whether the
            hash of a document is already in the doc store and, based on
            that check, adds the document to the doc store and runs the
            transformations. NOTE(review): presumably documents whose hash
            is already stored are skipped; confirm against
            `_handle_duplicates`.
        UPSERTS_AND_DELETE:
            ('upserts_and_delete') Use upserts and deletes to handle
            duplicates. Like the upserts strategy, but it also deletes
            non-existing documents from the doc store.
    """

    UPSERTS = "upserts"
    DUPLICATES_ONLY = "duplicates_only"
    UPSERTS_AND_DELETE = "upserts_and_delete"

Index

IngestionPipeline #

from_pipeline_name classmethod #

register #

persist #

load #

run #

arun async #

DocstoreStrategy #

from_pipeline_name `classmethod` #

arun `async` #