Simple

简单的向量存储索引。

SimpleVectorStore #

Bases: BasePydanticVectorStore

简单的向量存储。

在这个向量存储中，嵌入向量被存储在一个简单的内存字典中。

Parameters:

Name	Type	Description	Default
`data`	`Optional[SimpleVectorStoreData]`	存储数据，包含嵌入向量和文档ID。详细信息请参阅SimpleVectorStoreData。未提供时会创建一个空的SimpleVectorStoreData。	`None`
`fs`	`Optional[AbstractFileSystem]`	持久化时默认使用的文件系统；未提供时使用本地文件系统。	`None`

Source code in llama_index/core/vector_stores/simple.py

class SimpleVectorStore(BasePydanticVectorStore):
    """Simple in-memory vector store.

    Embeddings are kept in plain in-memory dictionaries held by ``data``
    (a ``SimpleVectorStoreData``), keyed by node ID.

    Args:
        data (Optional[SimpleVectorStoreData]): Existing store data to wrap.
            A fresh ``SimpleVectorStoreData`` is created when omitted.
        fs (Optional[fsspec.AbstractFileSystem]): Filesystem used by
            ``persist`` when no explicit one is passed. Defaults to the
            local ("file") filesystem.
    """

    stores_text: bool = False

    data: SimpleVectorStoreData = Field(default_factory=SimpleVectorStoreData)
    _fs: fsspec.AbstractFileSystem = PrivateAttr()

    def __init__(
        self,
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the store.

        Args:
            data: Store data to wrap; a new ``SimpleVectorStoreData`` is
                used when this is omitted (or falsy).
            fs: Default filesystem for ``persist``; the local filesystem is
                used when omitted.
            **kwargs: Accepted but not forwarded to the base class.
        """
        super().__init__(data=data or SimpleVectorStoreData())
        self._fs = fs or fsspec.filesystem("file")

    @classmethod
    def from_persist_dir(
        cls,
        persist_dir: str = DEFAULT_PERSIST_DIR,
        namespace: Optional[str] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "SimpleVectorStore":
        """Load a store from a persist directory.

        Args:
            persist_dir: Directory that contains the persisted file.
            namespace: Optional namespace; when given, the file name is
                ``{namespace}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}``,
                otherwise just ``DEFAULT_PERSIST_FNAME``.
            fs: Filesystem to read from; forwarded to ``from_persist_path``.

        Returns:
            The store loaded by ``from_persist_path``.
        """
        if namespace:
            persist_fname = f"{namespace}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}"
        else:
            persist_fname = DEFAULT_PERSIST_FNAME

        # With an explicit filesystem the path is joined via concat_dirs
        # instead of the OS-specific os.path.join.
        if fs is not None:
            persist_path = concat_dirs(persist_dir, persist_fname)
        else:
            persist_path = os.path.join(persist_dir, persist_fname)
        return cls.from_persist_path(persist_path, fs=fs)

    @classmethod
    def from_namespaced_persist_dir(
        cls,
        persist_dir: str = DEFAULT_PERSIST_DIR,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> Dict[str, BasePydanticVectorStore]:
        """Load every namespaced store found in a persist directory.

        Each file whose name ends with ``DEFAULT_PERSIST_FNAME`` is loaded
        under the namespace taken from the part of the file name before the
        first ``NAMESPACE_SEP``. A file named exactly
        ``DEFAULT_PERSIST_FNAME`` is loaded as ``DEFAULT_VECTOR_STORE``.

        Args:
            persist_dir: Directory to scan.
            fs: Filesystem to list and read from; the local OS filesystem is
                used when omitted.

        Returns:
            Mapping from namespace to the loaded store.

        Raises:
            Exception: Whatever ``from_persist_dir`` raises if the final
                non-namespaced fallback load fails as well.
        """
        vector_stores: Dict[str, BasePydanticVectorStore] = {}

        try:
            if fs is None:
                fnames = os.listdir(persist_dir)
            else:
                # fsspec's listdir is an alias of ls: it defaults to
                # detail=True (dict entries) and reports full paths, so ask
                # for plain paths and reduce them to bare file names.
                fnames = [
                    os.path.basename(path.rstrip("/"))
                    for path in fs.listdir(persist_dir, detail=False)
                ]

            for fname in fnames:
                if fname.endswith(DEFAULT_PERSIST_FNAME):
                    namespace = fname.split(NAMESPACE_SEP)[0]

                    # handle backwards compatibility with stores that were
                    # persisted without a namespace prefix
                    if namespace == DEFAULT_PERSIST_FNAME:
                        vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                            persist_dir=persist_dir, fs=fs
                        )
                    else:
                        vector_stores[namespace] = cls.from_persist_dir(
                            persist_dir=persist_dir, namespace=namespace, fs=fs
                        )
        except Exception:
            # Deliberate best-effort: listing or loading failed, so assume
            # there is only one store and try the default namespace.
            try:
                vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                    persist_dir=persist_dir, fs=fs, namespace=DEFAULT_VECTOR_STORE
                )
            except Exception:
                # no namespace backwards compat
                vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                    persist_dir=persist_dir, fs=fs
                )

        return vector_stores

    @classmethod
    def class_name(cls) -> str:
        """Return the class name."""
        return "SimpleVectorStore"

    @property
    def client(self) -> None:
        """Return the client; this store has none, so this is always None."""
        return

    @property
    def _data(self) -> SimpleVectorStoreData:
        """Alias of ``data`` kept for backwards compatibility."""
        return self.data

    def get(self, text_id: str) -> List[float]:
        """Return the embedding stored for ``text_id``.

        Raises:
            KeyError: If ``text_id`` is not in the store.
        """
        return self.data.embedding_dict[text_id]

    def get_nodes(
        self,
        node_ids: Optional[List[str]] = None,
        filters: Optional[MetadataFilters] = None,
    ) -> List[BaseNode]:
        """Not supported.

        Raises:
            NotImplementedError: Always; this store does not keep nodes.
        """
        raise NotImplementedError("SimpleVectorStore does not store nodes directly.")

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to the store.

        For each node the embedding, the reference document ID and the
        metadata (without the ``_node_content`` entry) are recorded under
        the node's ID. An existing entry with the same ID is overwritten.

        Args:
            nodes: Nodes to add.
            **add_kwargs: Unused.

        Returns:
            The IDs of the given nodes, in input order.
        """
        for node in nodes:
            self.data.embedding_dict[node.node_id] = node.get_embedding()
            # A missing ref_doc_id is stored as the literal string "None".
            self.data.text_id_to_ref_doc_id[node.node_id] = node.ref_doc_id or "None"

            metadata = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=False
            )
            metadata.pop("_node_content", None)
            self.data.metadata_dict[node.node_id] = metadata
        return [node.node_id for node in nodes]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete all nodes that belong to ``ref_doc_id``.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
            **delete_kwargs: Unused.
        """
        text_ids_to_delete = set()
        for text_id, ref_doc_id_ in self.data.text_id_to_ref_doc_id.items():
            if ref_doc_id == ref_doc_id_:
                text_ids_to_delete.add(text_id)

        for text_id in text_ids_to_delete:
            del self.data.embedding_dict[text_id]
            del self.data.text_id_to_ref_doc_id[text_id]
            # Handle metadata_dict not being present in stores that were persisted
            # without metadata, or, not being present for nodes stored
            # prior to metadata functionality.
            if self.data.metadata_dict is not None:
                self.data.metadata_dict.pop(text_id, None)

    def delete_nodes(
        self,
        node_ids: Optional[List[str]] = None,
        filters: Optional[MetadataFilters] = None,
        **delete_kwargs: Any,
    ) -> None:
        """Delete stored nodes selected by ID and/or metadata filters.

        A node is removed when it passes the predicate built by
        ``_build_metadata_filter_fn`` from ``filters`` and, if ``node_ids``
        is given, its ID is also in ``node_ids``.

        Args:
            node_ids: Optional IDs restricting which nodes may be deleted.
            filters: Optional metadata filters the nodes must match.
            **delete_kwargs: Unused.
        """
        # NOTE(review): presumably the predicate accepts every node when
        # filters is None -- confirm against _build_metadata_filter_fn.
        filter_fn = _build_metadata_filter_fn(
            lambda node_id: self.data.metadata_dict[node_id], filters
        )

        if node_ids is not None:
            node_id_set = set(node_ids)

            def node_filter_fn(node_id: str) -> bool:
                return node_id in node_id_set and filter_fn(node_id)

        else:

            def node_filter_fn(node_id: str) -> bool:
                return filter_fn(node_id)

        # Iterate over a snapshot of the keys because entries are deleted
        # from the dictionary inside the loop.
        for node_id in list(self.data.embedding_dict.keys()):
            if node_filter_fn(node_id):
                del self.data.embedding_dict[node_id]
                del self.data.text_id_to_ref_doc_id[node_id]
                self.data.metadata_dict.pop(node_id, None)

    def clear(self) -> None:
        """Empty the store by replacing ``data`` with a fresh instance."""
        self.data = SimpleVectorStoreData()

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Run a similarity query against the stored embeddings.

        Stored embeddings are first narrowed by ``query.node_ids`` and
        ``query.filters``; ranking is then delegated to the ``get_top_k_*``
        helper selected by ``query.mode``.

        Args:
            query: Query embedding, mode, top-k, and optional restrictions.
            **kwargs: ``mmr_threshold`` is read in MMR mode.

        Returns:
            The similarities and node IDs returned by the ranking helper.

        Raises:
            ValueError: If filters are requested on a non-empty store that
                has no metadata, or if ``query.mode`` is not supported.
        """
        # Prevent metadata filtering on stores that were persisted without metadata.
        if (
            query.filters is not None
            and self.data.embedding_dict
            and not self.data.metadata_dict
        ):
            raise ValueError(
                "Cannot filter stores that were persisted without metadata. "
                "Please rebuild the store with metadata to enable filtering."
            )
        # Prefilter nodes based on the query filter and node ID restrictions.
        query_filter_fn = _build_metadata_filter_fn(
            lambda node_id: self.data.metadata_dict[node_id], query.filters
        )

        if query.node_ids is not None:
            available_ids = set(query.node_ids)

            def node_filter_fn(node_id: str) -> bool:
                return node_id in available_ids

        else:

            def node_filter_fn(node_id: str) -> bool:
                return True

        node_ids = []
        embeddings = []
        # TODO: consolidate with get_query_text_embedding_similarities
        for node_id, embedding in self.data.embedding_dict.items():
            if node_filter_fn(node_id) and query_filter_fn(node_id):
                node_ids.append(node_id)
                embeddings.append(embedding)

        # cast() only informs the type checker; no runtime check happens here.
        query_embedding = cast(List[float], query.query_embedding)

        if query.mode in LEARNER_MODES:
            top_similarities, top_ids = get_top_k_embeddings_learner(
                query_embedding,
                embeddings,
                similarity_top_k=query.similarity_top_k,
                embedding_ids=node_ids,
            )
        elif query.mode == MMR_MODE:
            mmr_threshold = kwargs.get("mmr_threshold", None)
            top_similarities, top_ids = get_top_k_mmr_embeddings(
                query_embedding,
                embeddings,
                similarity_top_k=query.similarity_top_k,
                embedding_ids=node_ids,
                mmr_threshold=mmr_threshold,
            )
        elif query.mode == VectorStoreQueryMode.DEFAULT:
            top_similarities, top_ids = get_top_k_embeddings(
                query_embedding,
                embeddings,
                similarity_top_k=query.similarity_top_k,
                embedding_ids=node_ids,
            )
        else:
            raise ValueError(f"Invalid query mode: {query.mode}")

        return VectorStoreQueryResult(similarities=top_similarities, ids=top_ids)

    def persist(
        self,
        persist_path: str = os.path.join(DEFAULT_PERSIST_DIR, DEFAULT_PERSIST_FNAME),
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Persist the store as JSON at ``persist_path``.

        The parent directory is created first when it does not exist.

        Args:
            persist_path: Path of the file to write.
            fs: Filesystem to write to; the filesystem given at construction
                time is used when omitted.
        """
        fs = fs or self._fs
        dirpath = os.path.dirname(persist_path)
        # A bare file name has no directory component, so there is nothing
        # to create in that case.
        if dirpath and not fs.exists(dirpath):
            # exist_ok=True tolerates the directory being created by someone
            # else between the exists() check and this call.
            fs.makedirs(dirpath, exist_ok=True)

        with fs.open(persist_path, "w") as f:
            json.dump(self.data.to_dict(), f)

    @classmethod
    def from_persist_path(
        cls, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
    ) -> "SimpleVectorStore":
        """Create a SimpleVectorStore from a persisted JSON file.

        Args:
            persist_path: Path of the persisted file.
            fs: Filesystem to read from; the local filesystem is used when
                omitted.

        Returns:
            A new store wrapping the loaded data. The store is built without
            ``fs``, so it falls back to the local filesystem as its default
            for later ``persist`` calls.

        Raises:
            ValueError: If ``persist_path`` does not exist on ``fs``.
        """
        fs = fs or fsspec.filesystem("file")
        if not fs.exists(persist_path):
            raise ValueError(
                f"No existing {__name__} found at {persist_path}, skipping load."
            )

        logger.debug(f"Loading {__name__} from {persist_path}.")
        with fs.open(persist_path, "rb") as f:
            data_dict = json.load(f)
            data = SimpleVectorStoreData.from_dict(data_dict)
        return cls(data)

    @classmethod
    def from_dict(cls, save_dict: dict) -> "SimpleVectorStore":
        """Build a store from a dictionary via ``SimpleVectorStoreData.from_dict``."""
        data = SimpleVectorStoreData.from_dict(save_dict)
        return cls(data)

    def to_dict(self) -> dict:
        """Return the store data as a dictionary (``data.to_dict()``)."""
        return self.data.to_dict()

client `property` #

client: None

获取客户端。

from_persist_dir `classmethod` #

from_persist_dir(
    persist_dir: str = DEFAULT_PERSIST_DIR,
    namespace: Optional[str] = None,
    fs: Optional[AbstractFileSystem] = None,
) -> SimpleVectorStore

从持久化目录加载。

Source code in llama_index/core/vector_stores/simple.py

@classmethod
def from_persist_dir(
    cls,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    namespace: Optional[str] = None,
    fs: Optional[fsspec.AbstractFileSystem] = None,
) -> "SimpleVectorStore":
    """Load a store from a persist directory.

    The file name is ``DEFAULT_PERSIST_FNAME``, prefixed with
    ``{namespace}{NAMESPACE_SEP}`` when a namespace is given. The resulting
    path and ``fs`` are handed to ``from_persist_path``.
    """
    fname = DEFAULT_PERSIST_FNAME
    if namespace:
        fname = f"{namespace}{NAMESPACE_SEP}{fname}"

    # An explicit filesystem joins via concat_dirs; otherwise the
    # OS-specific os.path.join is used.
    join_path = os.path.join if fs is None else concat_dirs
    return cls.from_persist_path(join_path(persist_dir, fname), fs=fs)

from_namespaced_persist_dir `classmethod` #

from_namespaced_persist_dir(
    persist_dir: str = DEFAULT_PERSIST_DIR,
    fs: Optional[AbstractFileSystem] = None,
) -> Dict[str, BasePydanticVectorStore]

从命名空间持久化目录加载。

Source code in llama_index/core/vector_stores/simple.py

@classmethod
def from_namespaced_persist_dir(
    cls,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    fs: Optional[fsspec.AbstractFileSystem] = None,
) -> Dict[str, BasePydanticVectorStore]:
    """Load every namespaced store found in a persist directory.

    Each file whose name ends with ``DEFAULT_PERSIST_FNAME`` is loaded under
    the namespace taken from the part of the file name before the first
    ``NAMESPACE_SEP``. A file named exactly ``DEFAULT_PERSIST_FNAME`` is
    loaded as ``DEFAULT_VECTOR_STORE``.

    Args:
        persist_dir: Directory to scan.
        fs: Filesystem to list and read from; the local OS filesystem is
            used when omitted.

    Returns:
        Mapping from namespace to the loaded store.

    Raises:
        Exception: Whatever ``from_persist_dir`` raises if the final
            non-namespaced fallback load fails as well.
    """
    vector_stores: Dict[str, BasePydanticVectorStore] = {}

    try:
        if fs is None:
            fnames = os.listdir(persist_dir)
        else:
            # fsspec's listdir is an alias of ls: it defaults to detail=True
            # (dict entries) and reports full paths, so ask for plain paths
            # and reduce them to bare file names.
            fnames = [
                os.path.basename(path.rstrip("/"))
                for path in fs.listdir(persist_dir, detail=False)
            ]

        for fname in fnames:
            if fname.endswith(DEFAULT_PERSIST_FNAME):
                namespace = fname.split(NAMESPACE_SEP)[0]

                # handle backwards compatibility with stores that were
                # persisted without a namespace prefix
                if namespace == DEFAULT_PERSIST_FNAME:
                    vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                        persist_dir=persist_dir, fs=fs
                    )
                else:
                    vector_stores[namespace] = cls.from_persist_dir(
                        persist_dir=persist_dir, namespace=namespace, fs=fs
                    )
    except Exception:
        # Deliberate best-effort: listing or loading failed, so assume there
        # is only one store and try the default namespace.
        try:
            vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                persist_dir=persist_dir, fs=fs, namespace=DEFAULT_VECTOR_STORE
            )
        except Exception:
            # no namespace backwards compat
            vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                persist_dir=persist_dir, fs=fs
            )

    return vector_stores

class_name `classmethod` #

class_name() -> str

类名。

Source code in llama_index/core/vector_stores/simple.py

@classmethod
def class_name(cls) -> str:
    """Return the fixed name identifying this vector store class."""
    name = "SimpleVectorStore"
    return name

get #

get(text_id: str) -> List[float]

获取嵌入。

Source code in llama_index/core/vector_stores/simple.py

def get(self, text_id: str) -> List[float]:
    """Return the embedding stored under ``text_id``.

    Raises:
        KeyError: If ``text_id`` is not in the store.
    """
    stored_embeddings = self.data.embedding_dict
    return stored_embeddings[text_id]

get_nodes #

get_nodes(
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
) -> List[BaseNode]

获取节点。

Source code in llama_index/core/vector_stores/simple.py

def get_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
) -> List[BaseNode]:
    """Not supported by this store.

    Raises:
        NotImplementedError: Always; this store does not keep nodes.
    """
    reason = "SimpleVectorStore does not store nodes directly."
    raise NotImplementedError(reason)

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

将节点添加到索引。

Source code in llama_index/core/vector_stores/simple.py

def add(
    self,
    nodes: List[BaseNode],
    **add_kwargs: Any,
) -> List[str]:
    """Add nodes to the store and return their IDs.

    For each node the embedding, the reference document ID and the metadata
    (without the ``_node_content`` entry) are recorded under the node's ID.
    """
    store = self.data
    for item in nodes:
        item_id = item.node_id
        store.embedding_dict[item_id] = item.get_embedding()
        # A missing ref_doc_id is stored as the literal string "None".
        store.text_id_to_ref_doc_id[item_id] = item.ref_doc_id or "None"

        item_metadata = node_to_metadata_dict(
            item, remove_text=True, flat_metadata=False
        )
        item_metadata.pop("_node_content", None)
        store.metadata_dict[item_id] = item_metadata
    return [item.node_id for item in nodes]

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

使用ref_doc_id删除节点。

Source code in llama_index/core/vector_stores/simple.py

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete every node that belongs to ``ref_doc_id``.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
        """
        store = self.data
        doomed_ids = {
            node_id
            for node_id, owner_doc_id in store.text_id_to_ref_doc_id.items()
            if owner_doc_id == ref_doc_id
        }

        for node_id in doomed_ids:
            del store.embedding_dict[node_id]
            del store.text_id_to_ref_doc_id[node_id]
            # metadata_dict may be absent (stores persisted without metadata)
            # or may lack this node (added before metadata was supported).
            if store.metadata_dict is not None:
                store.metadata_dict.pop(node_id, None)

clear #

clear() -> None

清空存储。

Source code in llama_index/core/vector_stores/simple.py

def clear(self) -> None:
    """Empty the store by replacing ``data`` with a fresh instance."""
    empty_data = SimpleVectorStoreData()
    self.data = empty_data

query #

query(
    query: VectorStoreQuery, **kwargs: Any
) -> VectorStoreQueryResult

获取响应的节点。

Source code in llama_index/core/vector_stores/simple.py

def query(
    self,
    query: VectorStoreQuery,
    **kwargs: Any,
) -> VectorStoreQueryResult:
    """Run a similarity query against the stored embeddings.

    Stored embeddings are first narrowed by ``query.node_ids`` and
    ``query.filters``; ranking is then delegated to the ``get_top_k_*``
    helper selected by ``query.mode``.

    Raises:
        ValueError: If filters are requested on a non-empty store that has
            no metadata, or if ``query.mode`` is not supported.
    """
    # Filtering needs metadata; a populated store without any cannot honor it.
    if (
        query.filters is not None
        and self.data.embedding_dict
        and not self.data.metadata_dict
    ):
        raise ValueError(
            "Cannot filter stores that were persisted without metadata. "
            "Please rebuild the store with metadata to enable filtering."
        )

    passes_metadata_filter = _build_metadata_filter_fn(
        lambda node_id: self.data.metadata_dict[node_id], query.filters
    )
    # None means "no ID restriction"; otherwise only these IDs are eligible.
    allowed_ids = None if query.node_ids is None else set(query.node_ids)

    candidate_ids = []
    candidate_embeddings = []
    # TODO: consolidate with get_query_text_embedding_similarities
    for candidate_id, vector in self.data.embedding_dict.items():
        if allowed_ids is not None and candidate_id not in allowed_ids:
            continue
        if not passes_metadata_filter(candidate_id):
            continue
        candidate_ids.append(candidate_id)
        candidate_embeddings.append(vector)

    # cast() only informs the type checker; no runtime check happens here.
    query_embedding = cast(List[float], query.query_embedding)

    if query.mode in LEARNER_MODES:
        scores, ranked_ids = get_top_k_embeddings_learner(
            query_embedding,
            candidate_embeddings,
            similarity_top_k=query.similarity_top_k,
            embedding_ids=candidate_ids,
        )
    elif query.mode == MMR_MODE:
        scores, ranked_ids = get_top_k_mmr_embeddings(
            query_embedding,
            candidate_embeddings,
            similarity_top_k=query.similarity_top_k,
            embedding_ids=candidate_ids,
            mmr_threshold=kwargs.get("mmr_threshold"),
        )
    elif query.mode == VectorStoreQueryMode.DEFAULT:
        scores, ranked_ids = get_top_k_embeddings(
            query_embedding,
            candidate_embeddings,
            similarity_top_k=query.similarity_top_k,
            embedding_ids=candidate_ids,
        )
    else:
        raise ValueError(f"Invalid query mode: {query.mode}")

    return VectorStoreQueryResult(similarities=scores, ids=ranked_ids)

persist #

persist(
    persist_path: str = os.path.join(
        DEFAULT_PERSIST_DIR, DEFAULT_PERSIST_FNAME
    ),
    fs: Optional[AbstractFileSystem] = None,
) -> None

将SimpleVectorStore以JSON格式持久化到指定的文件路径（persist_path）；若其父目录不存在则先创建。

Source code in llama_index/core/vector_stores/simple.py

def persist(
    self,
    persist_path: str = os.path.join(DEFAULT_PERSIST_DIR, DEFAULT_PERSIST_FNAME),
    fs: Optional[fsspec.AbstractFileSystem] = None,
) -> None:
    """Persist the store as JSON at ``persist_path``.

    The parent directory is created first when it does not exist.

    Args:
        persist_path: Path of the file to write.
        fs: Filesystem to write to; the store's own ``_fs`` is used when
            omitted.
    """
    fs = fs or self._fs
    dirpath = os.path.dirname(persist_path)
    # A bare file name has no directory component, so there is nothing to
    # create in that case.
    if dirpath and not fs.exists(dirpath):
        # exist_ok=True tolerates the directory being created by someone
        # else between the exists() check and this call.
        fs.makedirs(dirpath, exist_ok=True)

    with fs.open(persist_path, "w") as f:
        json.dump(self.data.to_dict(), f)

from_persist_path `classmethod` #

from_persist_path(
    persist_path: str,
    fs: Optional[AbstractFileSystem] = None,
) -> SimpleVectorStore

从持久化文件路径创建一个SimpleVectorStore。

Source code in llama_index/core/vector_stores/simple.py

@classmethod
def from_persist_path(
    cls, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> "SimpleVectorStore":
    """Create a SimpleVectorStore from a persisted JSON file.

    Args:
        persist_path: Path of the persisted file.
        fs: Filesystem to read from; the local filesystem is used when
            omitted.

    Raises:
        ValueError: If ``persist_path`` does not exist on the filesystem.
    """
    source_fs = fs or fsspec.filesystem("file")
    if not source_fs.exists(persist_path):
        raise ValueError(
            f"No existing {__name__} found at {persist_path}, skipping load."
        )

    logger.debug(f"Loading {__name__} from {persist_path}.")
    with source_fs.open(persist_path, "rb") as handle:
        loaded = SimpleVectorStoreData.from_dict(json.load(handle))
    # The store is built from the data alone; ``fs`` is not passed on.
    return cls(loaded)

Simple

SimpleVectorStore #

client property #

from_persist_dir classmethod #

from_namespaced_persist_dir classmethod #

class_name classmethod #

get #

get_nodes #

add #

delete #

clear #

query #

persist #

from_persist_path classmethod #

client `property` #

from_persist_dir `classmethod` #

from_namespaced_persist_dir `classmethod` #

class_name `classmethod` #

from_persist_path `classmethod` #