Skip to content

Txtai

txtai向量存储索引。

构建在现有向量存储之上的索引。

TxtaiVectorStore #

Bases: BasePydanticVectorStore

txtai向量存储。

嵌入向量被存储在txtai索引中。

在查询时,索引使用txtai查询前k个嵌入向量,并返回相应的索引。

Parameters:

Name Type Description Default
txtai_index ANN

txtai索引实例

required
Source code in llama_index/vector_stores/txtai.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
class TxtaiVectorStore(BasePydanticVectorStore):
    """txtai vector store.

    Embeddings are stored within a txtai index.

    During query time, the index uses txtai to query for the top
    k embeddings, and returns the corresponding indices.

    Args:
        txtai_index (txtai.ann.ANN): txtai index instance
    """

    # This store only holds embeddings; node text is kept elsewhere.
    stores_text: bool = False

    # Underlying txtai ANN index, assigned in __init__.
    _txtai_index = PrivateAttr()

    def __init__(
        self,
        txtai_index: Any,
    ) -> None:
        """Initialize params.

        Args:
            txtai_index: a txtai.ann.ANN instance to store embeddings in.

        Raises:
            ImportError: if the ``txtai`` package is not installed.
        """
        try:
            import txtai
        except ImportError:
            raise ImportError(IMPORT_ERROR_MSG)

        # Initialize the pydantic model BEFORE touching the private
        # attribute: pydantic only sets up private-attribute storage during
        # model initialization, so assigning `_txtai_index` first raises.
        super().__init__()
        self._txtai_index = cast(txtai.ann.ANN, txtai_index)

    @classmethod
    def from_persist_dir(
        cls,
        persist_dir: str = DEFAULT_PERSIST_DIR,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "TxtaiVectorStore":
        """Create a TxtaiVectorStore from a persistence directory.

        Args:
            persist_dir: directory containing the persisted vector store.
            fs: optional filesystem; only local filesystems are supported.

        Raises:
            NotImplementedError: if a non-local filesystem is passed.
        """
        persist_path = os.path.join(
            persist_dir,
            f"{DEFAULT_VECTOR_STORE}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}",
        )
        # only support local storage for now
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")
        return cls.from_persist_path(persist_path=persist_path, fs=None)

    @classmethod
    def from_persist_path(
        cls,
        persist_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "TxtaiVectorStore":
        """Create a TxtaiVectorStore from a persisted index file.

        Loads the txtai config (JSON or pickle, whichever exists next to the
        index file) and then the index itself.

        Args:
            persist_path: path of the persisted txtai index file.
            fs: optional filesystem; only local filesystems are supported.

        Raises:
            ImportError: if the ``txtai`` package is not installed.
            NotImplementedError: if a non-local filesystem is passed.
            ValueError: if nothing exists at ``persist_path``.
        """
        try:
            import txtai
        except ImportError:
            raise ImportError(IMPORT_ERROR_MSG)

        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")

        if not os.path.exists(persist_path):
            raise ValueError(f"No existing {__name__} found at {persist_path}.")

        logger.info(f"Loading {__name__} config from {persist_path}.")
        parent_directory = Path(persist_path).parent
        config_path = parent_directory / "config.json"
        jsonconfig = config_path.exists()
        # Determine if config is json or pickle
        config_path = config_path if jsonconfig else parent_directory / "config"
        # Load configuration
        with open(config_path, "r" if jsonconfig else "rb") as f:
            config = json.load(f) if jsonconfig else pickle.load(f)

        logger.info(f"Loading {__name__} from {persist_path}.")
        txtai_index = txtai.ann.ANNFactory.create(config)
        txtai_index.load(persist_path)
        return cls(txtai_index=txtai_index)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            List of string ids for the newly added embeddings.
        """
        text_embedding_np = np.array(
            [node.get_embedding() for node in nodes], dtype="float32"
        )

        # Check if the ann index is already created
        # If not create the index with node embeddings
        if self._txtai_index.backend is None:
            self._txtai_index.index(text_embedding_np)
        else:
            self._txtai_index.append(text_embedding_np)

        indx_size = self._txtai_index.count()
        # NOTE(review): these ids are 1-based (count - len + 1 .. count) while
        # txtai search results yield 0-based indices — confirm this offset is
        # intentional before round-tripping ids between add() and query().
        return [str(idx) for idx in range(indx_size - len(nodes) + 1, indx_size + 1)]

    @property
    def client(self) -> Any:
        """Return the txtai index."""
        return self._txtai_index

    def persist(
        self,
        persist_path: str = DEFAULT_PERSIST_PATH,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Save to file.

        This method saves the vector store to disk, writing the txtai config
        (JSON or pickle per the index's "format" setting) beside the index.

        Args:
            persist_path (str): The save path of the file.
            fs: optional filesystem; only local filesystems are supported.

        Raises:
            NotImplementedError: if a non-local filesystem is passed.
        """
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")

        dirpath = Path(persist_path).parent
        dirpath.mkdir(exist_ok=True)

        jsonconfig = self._txtai_index.config.get("format", "pickle") == "json"
        # Determine if config is json or pickle
        config_path = dirpath / "config.json" if jsonconfig else dirpath / "config"

        # Write configuration
        with open(
            config_path,
            "w" if jsonconfig else "wb",
            encoding="utf-8" if jsonconfig else None,
        ) as f:
            if jsonconfig:
                # Write config as JSON
                json.dump(self._txtai_index.config, f, default=str)
            else:
                from txtai.version import __pickle__

                # Write config as pickle format
                pickle.dump(self._txtai_index.config, f, protocol=__pickle__)

        self._txtai_index.save(persist_path)

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
        """
        # ref_doc_id is the stringified integer position within the index.
        self._txtai_index.delete([int(ref_doc_id)])

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): query to search for in the index

        Raises:
            ValueError: if metadata filters are supplied (not supported).
        """
        if query.filters is not None:
            raise ValueError("Metadata filters not implemented for txtai yet.")

        query_embedding = cast(List[float], query.query_embedding)
        query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :]
        search_result = self._txtai_index.search(
            query_embedding_np, query.similarity_top_k
        )[0]
        # if empty, then return an empty response
        if len(search_result) == 0:
            return VectorStoreQueryResult(similarities=[], ids=[])

        # Negative indices indicate missing/invalid slots; skip them.
        filtered_dists = []
        filtered_node_idxs = []
        for dist, idx in search_result:
            if idx < 0:
                continue
            filtered_dists.append(dist)
            filtered_node_idxs.append(str(idx))

        return VectorStoreQueryResult(
            similarities=filtered_dists, ids=filtered_node_idxs
        )

client property #

client: Any

返回txtai索引。

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

将节点添加到索引中。

Parameters:

Name Type Description Default
nodes

List[BaseNode]: 带有嵌入的节点列表

required
Source code in llama_index/vector_stores/txtai.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings
        """
        embeddings = np.array(
            [node.get_embedding() for node in nodes], dtype="float32"
        )

        # Build the index on first use; otherwise append to the existing one.
        if self._txtai_index.backend is None:
            self._txtai_index.index(embeddings)
        else:
            self._txtai_index.append(embeddings)

        total = self._txtai_index.count()
        start = total - len(nodes) + 1
        return [str(position) for position in range(start, total + 1)]

persist #

persist(
    persist_path: str = DEFAULT_PERSIST_PATH,
    fs: Optional[AbstractFileSystem] = None,
) -> None

保存到文件。

这个方法将向磁盘保存向量存储。

Parameters:

Name Type Description Default
persist_path str

文件的保存路径。

DEFAULT_PERSIST_PATH
Source code in llama_index/vector_stores/txtai.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
    def persist(
        self,
        persist_path: str = DEFAULT_PERSIST_PATH,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Save to file.

        This method saves the vector store to disk.

        Args:
            persist_path (str): The save path of the file.
        """
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")

        target_dir = Path(persist_path).parent
        target_dir.mkdir(exist_ok=True)

        # The index's "format" setting decides JSON vs. pickle serialization.
        use_json = self._txtai_index.config.get("format", "pickle") == "json"

        if use_json:
            # Write config as JSON
            with open(target_dir / "config.json", "w", encoding="utf-8") as f:
                json.dump(self._txtai_index.config, f, default=str)
        else:
            from txtai.version import __pickle__

            # Write config as pickle format
            with open(target_dir / "config", "wb") as f:
                pickle.dump(self._txtai_index.config, f, protocol=__pickle__)

        self._txtai_index.save(persist_path)

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

使用ref_doc_id删除节点。

Source code in llama_index/vector_stores/txtai.py
183
184
185
186
187
188
189
    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
        """
        # The id is the stringified integer position within the txtai index.
        doc_index = int(ref_doc_id)
        self._txtai_index.delete([doc_index])

query #

query(
    query: VectorStoreQuery, **kwargs: Any
) -> VectorStoreQueryResult

查询前k个最相似节点的索引。

Parameters:

Name Type Description Default
query VectorStoreQuery

在索引中搜索的查询

required
Source code in llama_index/vector_stores/txtai.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): query to search for in the index
        """
        if query.filters is not None:
            raise ValueError("Metadata filters not implemented for txtai yet.")

        embedding = cast(List[float], query.query_embedding)
        embedding_np = np.array(embedding, dtype="float32")[np.newaxis, :]
        results = self._txtai_index.search(
            embedding_np, query.similarity_top_k
        )[0]

        # Nothing found: hand back an empty result set.
        if len(results) == 0:
            return VectorStoreQueryResult(similarities=[], ids=[])

        # Negative indices mark invalid slots; keep only real hits.
        similarities = [dist for dist, idx in results if idx >= 0]
        node_ids = [str(idx) for _, idx in results if idx >= 0]

        return VectorStoreQueryResult(similarities=similarities, ids=node_ids)