# Source code for langchain_community.vectorstores.docarray.hnsw

from __future__ import annotations

from typing import Any, List, Literal, Optional

from langchain_core.embeddings import Embeddings

from langchain_community.vectorstores.docarray.base import (
    DocArrayIndex,
    _check_docarray_import,
)


class DocArrayHnswSearch(DocArrayIndex):
    """`HnswLib` storage using the `DocArray` package.

    To use it, you should have the ``docarray`` package with version >=0.32.0
    installed. You can install it with `pip install "docarray[hnswlib]"`.
    """

    @classmethod
    def from_params(
        cls,
        embedding: Embeddings,
        work_dir: str,
        n_dim: int,
        dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
        **kwargs: Any,
    ) -> DocArrayHnswSearch:
        """Initialize a DocArrayHnswSearch store.

        Args:
            embedding (Embeddings): Embedding function.
            work_dir (str): Path to the location where all the data will be
                stored.
            n_dim (int): Dimension of an embedding.
            dist_metric (str): Distance metric for DocArrayHnswSearch; one of
                "cosine", "ip", and "l2". Defaults to "cosine".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): Defines a construction time/accuracy
                trade-off. Defaults to 200.
            ef (int): Parameter controlling the query time/accuracy trade-off.
                Defaults to 10.
            M (int): Parameter that defines the maximum number of outgoing
                connections in the graph. Defaults to 16.
            allow_replace_deleted (bool): Enables replacing deleted elements
                with newly added ones. Defaults to True.
            num_threads (int): Sets the number of CPU threads to use.
                Defaults to 1.
            **kwargs: Other keyword arguments, forwarded to the
                ``_get_doc_cls`` method.

        Returns:
            DocArrayHnswSearch: A store built from a new ``HnswDocumentIndex``
            (created with ``work_dir``) and the given ``embedding``.
        """
        _check_docarray_import()
        # Deferred import: only attempted after the docarray check above has run.
        from docarray.index import HnswDocumentIndex

        # ``n_dim`` and ``dist_metric`` are handed over under the names
        # ``dim`` and ``space``; every other option keeps its own name.
        doc_cls = cls._get_doc_cls(
            dim=n_dim,
            space=dist_metric,
            max_elements=max_elements,
            index=index,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
            **kwargs,
        )
        doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)  # type: ignore
        return cls(doc_index, embedding)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        work_dir: Optional[str] = None,
        n_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> DocArrayHnswSearch:
        """Create a DocArrayHnswSearch store and insert data.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text, if any.
                Defaults to None.
            work_dir (str): Path to the location where all the data will be
                stored. Required even though the signature defaults it to
                None.
            n_dim (int): Dimension of an embedding. Required even though the
                signature defaults it to None.
            **kwargs: Other keyword arguments, forwarded to ``from_params``.

        Returns:
            DocArrayHnswSearch: The vector store, after ``add_texts`` has been
            called with ``texts`` and ``metadatas``.

        Raises:
            ValueError: If ``work_dir`` or ``n_dim`` is None.
        """
        # Both arguments are optional in the signature but mandatory here;
        # reject a missing value before building the store.
        if work_dir is None:
            raise ValueError("`work_dir` parameter has not been set.")
        if n_dim is None:
            raise ValueError("`n_dim` parameter has not been set.")

        store = cls.from_params(embedding, work_dir, n_dim, **kwargs)
        store.add_texts(texts=texts, metadatas=metadatas)
        return store