Elasticsearch

ElasticsearchStore #

Bases: BasePydanticVectorStore

Elasticsearch向量存储。

引发： ConnectionError：如果AsyncElasticsearch客户端无法连接到Elasticsearch。 ValueError：如果既未提供es_client，也未提供es_url或es_cloud_id。

示例： pip install llama-index-vector-stores-elasticsearch

```python
# The integration package installs the class under the
# ``llama_index.vector_stores.elasticsearch`` module.
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Additional setup for the ElasticsearchStore class
index_name = "my_index"
es_url = "http://localhost:9200"
es_cloud_id = "<cloud-id>"  # Found within the deployment page
es_user = "elastic"
es_password = "<password>"  # Provided when creating the deployment, or can be reset
es_api_key = "<api-key>"  # Create an API key within Kibana (Security -> API Keys)

# Connect to a local Elasticsearch instance
es_local = ElasticsearchStore(
    index_name=index_name,
    es_url=es_url,
)

# Connect to Elastic Cloud with username and password
es_cloud_user_pass = ElasticsearchStore(
    index_name=index_name,
    es_cloud_id=es_cloud_id,
    es_user=es_user,
    es_password=es_password,
)

# Connect to Elastic Cloud with an API key
es_cloud_api_key = ElasticsearchStore(
    index_name=index_name,
    es_cloud_id=es_cloud_id,
    es_api_key=es_api_key,
)
```

Source code in llama_index/vector_stores/elasticsearch/base.py

class ElasticsearchStore(BasePydanticVectorStore):
    """Elasticsearch vector store.

    Wraps an ``AsyncVectorStore`` and exposes both synchronous and
    asynchronous add / delete / query operations. The synchronous methods
    drive their async counterparts on the current event loop.

    Args:
        index_name: Name of the Elasticsearch index.
        es_client: Optional. Pre-existing AsyncElasticsearch client.
        es_url: Optional. Elasticsearch URL.
        es_cloud_id: Optional. Elasticsearch cloud ID.
        es_api_key: Optional. Elasticsearch API key.
        es_user: Optional. Elasticsearch username.
        es_password: Optional. Elasticsearch password.
        text_field: Optional. Name of the Elasticsearch field that stores
            the text.
        vector_field: Optional. Name of the Elasticsearch field that stores
            the embedding.
        batch_size: Optional. Batch size for bulk indexing. Defaults to 200.
        distance_strategy: Optional. Distance strategy used for similarity
            search. Defaults to "COSINE".
        retrieval_strategy: Retrieval strategy to use. AsyncBM25Strategy /
            AsyncSparseVectorStrategy / AsyncDenseVectorStrategy /
            AsyncRetrievalStrategy. Defaults to AsyncDenseVectorStrategy.

    Raises:
        ConnectionError: If the AsyncElasticsearch client cannot connect to
            Elasticsearch.
        ValueError: If neither es_client nor es_url nor es_cloud_id is
            provided.

    Examples:
        `pip install llama-index-vector-stores-elasticsearch`

        ```python
        from llama_index.vector_stores.elasticsearch import ElasticsearchStore

        # Additional setup for the ElasticsearchStore class
        index_name = "my_index"
        es_url = "http://localhost:9200"
        es_cloud_id = "<cloud-id>"  # Found within the deployment page
        es_user = "elastic"
        es_password = "<password>"  # Provided at deployment creation, or reset
        es_api_key = "<api-key>"  # Create in Kibana (Security -> API Keys)

        # Connect to a local Elasticsearch instance
        es_local = ElasticsearchStore(
            index_name=index_name,
            es_url=es_url,
        )

        # Connect to Elastic Cloud with username and password
        es_cloud_user_pass = ElasticsearchStore(
            index_name=index_name,
            es_cloud_id=es_cloud_id,
            es_user=es_user,
            es_password=es_password,
        )

        # Connect to Elastic Cloud with an API key
        es_cloud_api_key = ElasticsearchStore(
            index_name=index_name,
            es_cloud_id=es_cloud_id,
            es_api_key=es_api_key,
        )
        ```
    """

    class Config:
        # allow pydantic to tolerate its inability to validate AsyncRetrievalStrategy
        arbitrary_types_allowed = True

    stores_text: bool = True
    index_name: str
    es_client: Optional[Any]
    es_url: Optional[str]
    es_cloud_id: Optional[str]
    es_api_key: Optional[str]
    es_user: Optional[str]
    es_password: Optional[str]
    text_field: str = "content"
    vector_field: str = "embedding"
    batch_size: int = 200
    distance_strategy: Optional[DISTANCE_STRATEGIES] = "COSINE"
    retrieval_strategy: AsyncRetrievalStrategy

    # Underlying AsyncVectorStore; assigned in __init__.
    _store = PrivateAttr()

    def __init__(
        self,
        index_name: str,
        es_client: Optional[Any] = None,
        es_url: Optional[str] = None,
        es_cloud_id: Optional[str] = None,
        es_api_key: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        text_field: str = "content",
        vector_field: str = "embedding",
        batch_size: int = 200,
        distance_strategy: Optional[DISTANCE_STRATEGIES] = "COSINE",
        retrieval_strategy: Optional[AsyncRetrievalStrategy] = None,
    ) -> None:
        """Build the client (if needed) and the underlying AsyncVectorStore.

        See the class docstring for the meaning of each argument.
        """
        # The sync wrappers call run_until_complete(); nest_asyncio lets that
        # work when an event loop is already running.
        nest_asyncio.apply()

        if not es_client:
            es_client = get_elasticsearch_client(
                url=es_url,
                cloud_id=es_cloud_id,
                api_key=es_api_key,
                username=es_user,
                password=es_password,
            )

        if retrieval_strategy is None:
            retrieval_strategy = AsyncDenseVectorStrategy(
                distance=DistanceMetric[distance_strategy]
            )

        # Explicit keyword mappings for the document-id metadata fields.
        metadata_mappings = {
            "document_id": {"type": "keyword"},
            "doc_id": {"type": "keyword"},
            "ref_doc_id": {"type": "keyword"},
        }

        self._store = AsyncVectorStore(
            user_agent=get_user_agent(),
            client=es_client,
            index=index_name,
            retrieval_strategy=retrieval_strategy,
            text_field=text_field,
            vector_field=vector_field,
            metadata_mappings=metadata_mappings,
        )

        super().__init__(
            index_name=index_name,
            es_client=es_client,
            es_url=es_url,
            es_cloud_id=es_cloud_id,
            es_api_key=es_api_key,
            es_user=es_user,
            es_password=es_password,
            text_field=text_field,
            vector_field=vector_field,
            batch_size=batch_size,
            distance_strategy=distance_strategy,
            retrieval_strategy=retrieval_strategy,
        )

    @property
    def client(self) -> Any:
        """Get the async Elasticsearch client."""
        return self._store.client

    def close(self) -> None:
        """Close the underlying store, blocking until the close completes."""
        return asyncio.get_event_loop().run_until_complete(self._store.close())

    def add(
        self,
        nodes: List[BaseNode],
        *,
        create_index_if_not_exists: bool = True,
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to the Elasticsearch index.

        Synchronous wrapper around ``async_add``.

        Args:
            nodes: List of nodes with embeddings.
            create_index_if_not_exists: Optional. Whether to create the
                Elasticsearch index if it doesn't already exist.
                Defaults to True.
            **add_kwargs: Extra keyword arguments forwarded to ``async_add``.

        Returns:
            List of node IDs that were added to the index.

        Raises:
            ImportError: If the elasticsearch['async'] python package is not
                installed.
            BulkIndexError: If AsyncElasticsearch async_bulk indexing fails.
        """
        # Forward add_kwargs so the sync and async entry points behave alike.
        return asyncio.get_event_loop().run_until_complete(
            self.async_add(
                nodes,
                create_index_if_not_exists=create_index_if_not_exists,
                **add_kwargs,
            )
        )

    async def async_add(
        self,
        nodes: List[BaseNode],
        *,
        create_index_if_not_exists: bool = True,
        **add_kwargs: Any,
    ) -> List[str]:
        """Asynchronously add nodes to the Elasticsearch index.

        Args:
            nodes: List of nodes with embeddings.
            create_index_if_not_exists: Optional. Whether to create the
                AsyncElasticsearch index if it doesn't already exist.
                Defaults to True.
            **add_kwargs: Passed to the store's ``add_texts`` as
                ``bulk_kwargs``.

        Returns:
            List of node IDs that were added to the index. An empty list is
            returned without contacting the store when ``nodes`` is empty.

        Raises:
            ImportError: If the elasticsearch python package is not installed.
            BulkIndexError: If AsyncElasticsearch async_bulk indexing fails.
        """
        if not nodes:
            return []

        embeddings: List[List[float]] = []
        texts: List[str] = []
        metadatas: List[dict] = []
        ids: List[str] = []
        for node in nodes:
            ids.append(node.node_id)
            embeddings.append(node.get_embedding())
            texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
            metadatas.append(node_to_metadata_dict(node, remove_text=True))

        # Take the vector dimensionality from the first embedding when the
        # store does not have one set yet.
        if not self._store.num_dimensions:
            self._store.num_dimensions = len(embeddings[0])

        return await self._store.add_texts(
            texts=texts,
            metadatas=metadatas,
            vectors=embeddings,
            ids=ids,
            create_index_if_not_exists=create_index_if_not_exists,
            bulk_kwargs=add_kwargs,
        )

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes from the Elasticsearch index.

        Synchronous wrapper around ``adelete``.

        Args:
            ref_doc_id: Reference document ID whose nodes are deleted.
            **delete_kwargs: Optional. Additional arguments to pass to
                Elasticsearch delete_by_query.

        Raises:
            Exception: If Elasticsearch delete_by_query fails.
        """
        return asyncio.get_event_loop().run_until_complete(
            self.adelete(ref_doc_id, **delete_kwargs)
        )

    async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Asynchronously delete nodes from the Elasticsearch index.

        Args:
            ref_doc_id: Reference document ID whose nodes are deleted.
            **delete_kwargs: Optional. Additional arguments to pass to
                AsyncElasticsearch delete_by_query.

        Raises:
            Exception: If AsyncElasticsearch delete_by_query fails.
        """
        # Match on the ref_doc_id stored in each document's metadata.
        await self._store.delete(
            query={"term": {"metadata.ref_doc_id": ref_doc_id}}, **delete_kwargs
        )

    def query(
        self,
        query: VectorStoreQuery,
        custom_query: Optional[
            Callable[[Dict, Union[VectorStoreQuery, None]], Dict]
        ] = None,
        es_filter: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top k most similar nodes.

        Synchronous wrapper around ``aquery``.

        Args:
            query: The vector store query (embedding, query string, top-k,
                filters, mode).
            custom_query: Optional. Custom query function that takes the es
                query body and returns a modified query body. This can be
                used to add additional query parameters to the Elasticsearch
                query.
            es_filter: Optional. Elasticsearch filter to apply to the query.
                Ignored if filters are provided in ``query``.
            **kwargs: Forwarded to ``aquery``.

        Returns:
            VectorStoreQueryResult: Result of the query.

        Raises:
            Exception: If the Elasticsearch query fails.
        """
        return asyncio.get_event_loop().run_until_complete(
            self.aquery(query, custom_query, es_filter, **kwargs)
        )

    async def aquery(
        self,
        query: VectorStoreQuery,
        custom_query: Optional[
            Callable[[Dict, Union[VectorStoreQuery, None]], Dict]
        ] = None,
        es_filter: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Asynchronously query the index for the top k most similar nodes.

        Args:
            query: The vector store query (embedding, query string, top-k,
                filters, mode).
            custom_query: Optional. Custom query function that takes the es
                query body and returns a modified query body. This can be
                used to add additional query parameters to the
                AsyncElasticsearch query.
            es_filter: Optional. AsyncElasticsearch filter to apply to the
                query. Ignored if filters are provided in ``query``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            VectorStoreQueryResult: Result of the query.

        Raises:
            Exception: If the AsyncElasticsearch query fails.
        """
        _mode_must_match_retrieval_strategy(query.mode, self.retrieval_strategy)

        # Filters carried by the query take precedence over es_filter.
        if query.filters is not None and len(query.filters.legacy_filters()) > 0:
            es_filters = [_to_elasticsearch_filter(query.filters)]
        else:
            es_filters = es_filter or []

        hits = await self._store.search(
            query=query.query_str,
            query_vector=query.query_embedding,
            k=query.similarity_top_k,
            # Candidate pool is ten times the requested top-k.
            num_candidates=query.similarity_top_k * 10,
            filter=es_filters,
            custom_query=custom_query,
        )

        top_k_nodes = []
        top_k_ids = []
        top_k_scores = []
        for hit in hits:
            source = hit["_source"]
            metadata = source.get("metadata", None)
            text = source.get(self.text_field, None)
            node_id = hit["_id"]

            try:
                node = metadata_dict_to_node(metadata)
                node.text = text
            except Exception:
                # Legacy support for old metadata format
                # Log the value already fetched: indexing the hit again would
                # raise KeyError here when "metadata" is missing.
                logger.warning(f"Could not parse metadata from hit {metadata}")
                node_info = source.get("node_info")
                relationships = source.get("relationships", {})
                start_char_idx = None
                end_char_idx = None
                if isinstance(node_info, dict):
                    start_char_idx = node_info.get("start", None)
                    end_char_idx = node_info.get("end", None)

                node = TextNode(
                    text=text,
                    metadata=metadata,
                    id_=node_id,
                    start_char_idx=start_char_idx,
                    end_char_idx=end_char_idx,
                    relationships=relationships,
                )
            top_k_nodes.append(node)
            top_k_ids.append(node_id)
            top_k_scores.append(hit.get("_rank", hit["_score"]))

        if (
            isinstance(self.retrieval_strategy, AsyncDenseVectorStrategy)
            and self.retrieval_strategy.hybrid
        ):
            # Turn ranks (lower is better) into scores (higher is better).
            # The parentheses matter: without them this evaluates as
            # total_rank - (rank / total_rank).
            total_rank = sum(top_k_scores)
            top_k_scores = [(total_rank - rank) / total_rank for rank in top_k_scores]

        return VectorStoreQueryResult(
            nodes=top_k_nodes,
            ids=top_k_ids,
            similarities=_to_llama_similarities(top_k_scores),
        )

client `property` #

client: Any

获取异步的elasticsearch客户端。

add #

add(
    nodes: List[BaseNode],
    *,
    create_index_if_not_exists: bool = True,
    **add_kwargs: Any
) -> List[str]

将节点添加到Elasticsearch索引。

Returns:

Type	Description
`List[str]`	已添加到索引的节点ID列表。

引发： ImportError：如果未安装elasticsearch['async'] python包。 BulkIndexError：如果AsyncElasticsearch async_bulk索引失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    def add(
        self,
        nodes: List[BaseNode],
        *,
        create_index_if_not_exists: bool = True,
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to the Elasticsearch index.

        Synchronous wrapper that runs ``async_add`` on the current event loop.

        Args:
            nodes: List of nodes with embeddings.
            create_index_if_not_exists: Optional. Whether to create the
                Elasticsearch index if it doesn't already exist.
                Defaults to True.
            **add_kwargs: Extra keyword arguments forwarded to ``async_add``.

        Returns:
            List of node IDs that were added to the index.

        Raises:
            ImportError: If the elasticsearch['async'] python package is not
                installed.
            BulkIndexError: If AsyncElasticsearch async_bulk indexing fails.
        """
        # Forward add_kwargs; previously they were accepted but dropped.
        return asyncio.get_event_loop().run_until_complete(
            self.async_add(
                nodes,
                create_index_if_not_exists=create_index_if_not_exists,
                **add_kwargs,
            )
        )

async_add `async` #

async_add(
    nodes: List[BaseNode],
    *,
    create_index_if_not_exists: bool = True,
    **add_kwargs: Any
) -> List[str]

异步方法，用于将节点添加到Elasticsearch索引中。

Parameters:

Name	Type	Description	Default
`nodes`	`List[BaseNode]`	带有嵌入的节点列表。	required
`create_index_if_not_exists`	`bool`	可选。是否在索引不存在时创建AsyncElasticsearch索引。默认为True。	`True`

Returns:

Type	Description
`List[str]`	已添加到索引的节点ID列表。

引发： ImportError: 如果未安装elasticsearch python包。 BulkIndexError: 如果AsyncElasticsearch async_bulk索引失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    async def async_add(
        self,
        nodes: List[BaseNode],
        *,
        create_index_if_not_exists: bool = True,
        **add_kwargs: Any,
    ) -> List[str]:
        """Asynchronously add nodes to the Elasticsearch index.

        Args:
            nodes: List of nodes with embeddings.
            create_index_if_not_exists: Optional. Whether to create the
                AsyncElasticsearch index if it doesn't already exist.
                Defaults to True.
            **add_kwargs: Passed to the store's ``add_texts`` as
                ``bulk_kwargs``.

        Returns:
            List of node IDs that were added to the index. An empty list is
            returned without contacting the store when ``nodes`` is empty.

        Raises:
            ImportError: If the elasticsearch python package is not installed.
            BulkIndexError: If AsyncElasticsearch async_bulk indexing fails.
        """
        if not nodes:
            return []

        # Extract the four per-node fields in one pass, in the same order
        # per node: id, embedding, text, metadata.
        rows = [
            (
                node.node_id,
                node.get_embedding(),
                node.get_content(metadata_mode=MetadataMode.NONE),
                node_to_metadata_dict(node, remove_text=True),
            )
            for node in nodes
        ]
        ids: List[str] = [row[0] for row in rows]
        embeddings: List[List[float]] = [row[1] for row in rows]
        texts: List[str] = [row[2] for row in rows]
        metadatas: List[dict] = [row[3] for row in rows]

        # Take the vector dimensionality from the first embedding when the
        # store does not have one set yet.
        store = self._store
        if not store.num_dimensions:
            store.num_dimensions = len(embeddings[0])

        added_ids = await store.add_texts(
            texts=texts,
            metadatas=metadatas,
            vectors=embeddings,
            ids=ids,
            create_index_if_not_exists=create_index_if_not_exists,
            bulk_kwargs=add_kwargs,
        )
        return added_ids

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

从Elasticsearch索引中删除节点。

Parameters:

Name	Type	Description	Default
`ref_doc_id`	`str`	要删除的节点的ID。	required
`delete_kwargs`	`Any`	可选。传递给Elasticsearch delete_by_query的额外参数。	`{}`

引发

Exception: 如果Elasticsearch delete_by_query失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes from the Elasticsearch index.

        Synchronous wrapper that runs ``adelete`` on the current event loop.

        Args:
            ref_doc_id: Reference document ID whose nodes are deleted.
            **delete_kwargs: Optional. Additional arguments to pass to
                Elasticsearch delete_by_query.

        Raises:
            Exception: If Elasticsearch delete_by_query fails.
        """
        loop = asyncio.get_event_loop()
        pending = self.adelete(ref_doc_id, **delete_kwargs)
        return loop.run_until_complete(pending)

adelete `async` #

adelete(ref_doc_id: str, **delete_kwargs: Any) -> None

异步从Elasticsearch索引中删除节点。

引发： Exception：如果AsyncElasticsearch delete_by_query失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Asynchronously delete nodes from the Elasticsearch index.

        Args:
            ref_doc_id: Reference document ID whose nodes are deleted.
            **delete_kwargs: Optional. Additional arguments to pass to
                AsyncElasticsearch delete_by_query.

        Raises:
            Exception: If AsyncElasticsearch delete_by_query fails.
        """
        # Match on the ref_doc_id stored in each document's metadata.
        by_ref_doc = {"term": {"metadata.ref_doc_id": ref_doc_id}}
        await self._store.delete(query=by_ref_doc, **delete_kwargs)

query #

query(
    query: VectorStoreQuery,
    custom_query: Optional[
        Callable[
            [Dict, Union[VectorStoreQuery, None]], Dict
        ]
    ] = None,
    es_filter: Optional[List[Dict]] = None,
    **kwargs: Any
) -> VectorStoreQueryResult

查询前k个最相似节点的索引。

Returns:

Type	Description
`VectorStoreQueryResult`	查询结果。

引发： Exception：如果Elasticsearch查询失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    def query(
        self,
        query: VectorStoreQuery,
        custom_query: Optional[
            Callable[[Dict, Union[VectorStoreQuery, None]], Dict]
        ] = None,
        es_filter: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top k most similar nodes.

        Synchronous wrapper that runs ``aquery`` on the current event loop.

        Args:
            query: The vector store query to execute.
            custom_query: Optional. Custom query function that takes the es
                query body and returns a modified query body. This can be
                used to add additional query parameters to the Elasticsearch
                query.
            es_filter: Optional. Elasticsearch filter to apply to the query.
                Ignored if filters are provided in ``query``.
            **kwargs: Forwarded to ``aquery``.

        Returns:
            VectorStoreQueryResult: Result of the query.

        Raises:
            Exception: If the Elasticsearch query fails.
        """
        loop = asyncio.get_event_loop()
        pending = self.aquery(query, custom_query, es_filter, **kwargs)
        return loop.run_until_complete(pending)

aquery `async` #

aquery(
    query: VectorStoreQuery,
    custom_query: Optional[
        Callable[
            [Dict, Union[VectorStoreQuery, None]], Dict
        ]
    ] = None,
    es_filter: Optional[List[Dict]] = None,
    **kwargs: Any
) -> VectorStoreQueryResult

为前k个最相似节点异步查询索引。

Parameters:

Name	Type	Description	Default
`query`	`VectorStoreQuery`	查询对象，包含查询嵌入、查询文本、top-k数量及过滤器。	required
`custom_query`	`Optional[Callable[[Dict, Union[VectorStoreQuery, None]], Dict]]`	可选。自定义查询函数，接受es查询体并返回修改后的查询体。这可用于向AsyncElasticsearch查询添加额外的查询参数。	`None`
`es_filter`	`Optional[List[Dict]]`	可选。要应用于查询的AsyncElasticsearch过滤器。如果在查询中提供了过滤器，则将忽略此过滤器。	`None`

Returns:

Name	Type	Description
`VectorStoreQueryResult`	`VectorStoreQueryResult`	查询结果。

引发

Exception: 如果AsyncElasticsearch查询失败。

Source code in llama_index/vector_stores/elasticsearch/base.py

    async def aquery(
        self,
        query: VectorStoreQuery,
        custom_query: Optional[
            Callable[[Dict, Union[VectorStoreQuery, None]], Dict]
        ] = None,
        es_filter: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Asynchronously query the index for the top k most similar nodes.

        Args:
            query: The vector store query (embedding, query string, top-k,
                filters, mode).
            custom_query: Optional. Custom query function that takes the es
                query body and returns a modified query body. This can be
                used to add additional query parameters to the
                AsyncElasticsearch query.
            es_filter: Optional. AsyncElasticsearch filter to apply to the
                query. Ignored if filters are provided in ``query``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            VectorStoreQueryResult: Result of the query.

        Raises:
            Exception: If the AsyncElasticsearch query fails.
        """
        _mode_must_match_retrieval_strategy(query.mode, self.retrieval_strategy)

        # Filters carried by the query take precedence over es_filter.
        if query.filters is not None and len(query.filters.legacy_filters()) > 0:
            es_filters = [_to_elasticsearch_filter(query.filters)]
        else:
            es_filters = es_filter or []

        hits = await self._store.search(
            query=query.query_str,
            query_vector=query.query_embedding,
            k=query.similarity_top_k,
            # Candidate pool is ten times the requested top-k.
            num_candidates=query.similarity_top_k * 10,
            filter=es_filters,
            custom_query=custom_query,
        )

        top_k_nodes = []
        top_k_ids = []
        top_k_scores = []
        for hit in hits:
            source = hit["_source"]
            metadata = source.get("metadata", None)
            text = source.get(self.text_field, None)
            node_id = hit["_id"]

            try:
                node = metadata_dict_to_node(metadata)
                node.text = text
            except Exception:
                # Legacy support for old metadata format
                # Log the value already fetched: indexing the hit again would
                # raise KeyError here when "metadata" is missing.
                logger.warning(f"Could not parse metadata from hit {metadata}")
                node_info = source.get("node_info")
                relationships = source.get("relationships", {})
                start_char_idx = None
                end_char_idx = None
                if isinstance(node_info, dict):
                    start_char_idx = node_info.get("start", None)
                    end_char_idx = node_info.get("end", None)

                node = TextNode(
                    text=text,
                    metadata=metadata,
                    id_=node_id,
                    start_char_idx=start_char_idx,
                    end_char_idx=end_char_idx,
                    relationships=relationships,
                )
            top_k_nodes.append(node)
            top_k_ids.append(node_id)
            top_k_scores.append(hit.get("_rank", hit["_score"]))

        if (
            isinstance(self.retrieval_strategy, AsyncDenseVectorStrategy)
            and self.retrieval_strategy.hybrid
        ):
            # Turn ranks (lower is better) into scores (higher is better).
            # The parentheses matter: without them this evaluates as
            # total_rank - (rank / total_rank).
            total_rank = sum(top_k_scores)
            top_k_scores = [(total_rank - rank) / total_rank for rank in top_k_scores]

        return VectorStoreQueryResult(
            nodes=top_k_nodes,
            ids=top_k_ids,
            similarities=_to_llama_similarities(top_k_scores),
        )

Elasticsearch

ElasticsearchStore #

client property #

add #

async_add async #

delete #

adelete async #

query #

aquery async #

client `property` #

async_add `async` #

adelete `async` #

aquery `async` #