Skip to content

Lancedb

LanceDBVectorStore #

Bases: BasePydanticVectorStore

# The LanceDB vector store.

# Stores text and embeddings in LanceDB. The vector store will open an existing LanceDB dataset or create the dataset if it does not exist.

# Args:
#     uri (str, required): Location where LanceDB will store its files.
#     table_name (str, optional): The table name where the embeddings will be stored. Defaults to "vectors".
#     vector_column_name (str, optional): The vector column name in the table if different from default. Defaults to "vector", in keeping with lancedb convention.
#     nprobes (int, optional): The number of probes used. A higher number makes the search more accurate but also slower. Defaults to 20.
#     refine_factor (int, optional): Refine the results by reading extra elements and re-ranking them in memory. Defaults to None.

# Raises:
#     ImportError: Unable to import `lancedb`.

# Returns:
#     LanceDBVectorStore: Vector store that supports creating LanceDB datasets and querying them.

# Examples:
#     `pip install llama-index-vector-stores-lancedb`

#     ```python
#     from llama_index.vector_stores.lancedb import LanceDBVectorStore

#     vector_store = LanceDBVectorStore(uri="/tmp/lancedb")
#     ```
Source code in llama_index/vector_stores/lancedb/base.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
class LanceDBVectorStore(BasePydanticVectorStore):
    """The LanceDB vector store.

    Stores text and embeddings in LanceDB. The vector store will open an
    existing LanceDB dataset or create the dataset if it does not exist.

    Args:
        uri (str, required): Location where LanceDB will store its files.
        table_name (str, optional): The table name where the embeddings will
            be stored. Defaults to "vectors".
        vector_column_name (str, optional): The vector column name in the
            table if different from default. Defaults to "vector", in keeping
            with lancedb convention.
        nprobes (int, optional): The number of probes used. A higher number
            makes the search more accurate but also slower. Defaults to 20.
        refine_factor (int, optional): Refine the results by reading extra
            elements and re-ranking them in memory. Defaults to None.

    Raises:
        ImportError: Unable to import `lancedb`.

    Returns:
        LanceDBVectorStore: Vector store that supports creating LanceDB
            datasets and querying them.

    Examples:
        `pip install llama-index-vector-stores-lancedb`

        ```python
        from llama_index.vector_stores.lancedb import LanceDBVectorStore

        vector_store = LanceDBVectorStore(uri="/tmp/lancedb")
        ```
    """

    stores_text = True
    flat_metadata: bool = True
    # Live lancedb connection; established in __init__ from `uri`.
    _connection: Any = PrivateAttr()
    uri: Optional[str]
    table_name: Optional[str]
    vector_column_name: Optional[str]
    nprobes: Optional[int]
    refine_factor: Optional[int]
    text_key: Optional[str]
    doc_id_key: Optional[str]

    def __init__(
        self,
        uri: Optional[str],
        table_name: str = "vectors",
        vector_column_name: str = "vector",
        nprobes: int = 20,
        refine_factor: Optional[int] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        doc_id_key: str = DEFAULT_DOC_ID_KEY,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        # Connect eagerly so a bad URI fails at construction time.
        self._connection = lancedb.connect(uri)
        super().__init__(
            uri=uri,
            table_name=table_name,
            vector_column_name=vector_column_name,
            nprobes=nprobes,
            refine_factor=refine_factor,
            text_key=text_key,
            doc_id_key=doc_id_key,
            **kwargs,
        )

    @property
    def client(self) -> Any:
        """Get the lancedb connection client.

        Fix: previously annotated ``-> None`` although it returns the
        connection object.
        """
        return self._connection

    @classmethod
    def from_params(
        cls,
        uri: Optional[str],
        table_name: str = "vectors",
        vector_column_name: str = "vector",
        nprobes: int = 20,
        refine_factor: Optional[int] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        doc_id_key: str = DEFAULT_DOC_ID_KEY,
        **kwargs: Any,
    ) -> "LanceDBVectorStore":
        """Create an instance from parameters.

        Fix: the previous implementation read ``cls._connection`` — at class
        level that is a pydantic ``ModelPrivateAttr`` descriptor, not a live
        connection — and forwarded it as a bogus ``_connection=`` kwarg.
        ``__init__`` establishes the connection from ``uri`` itself, so the
        arguments are simply passed through.
        """
        return cls(
            uri=uri,
            table_name=table_name,
            vector_column_name=vector_column_name,
            nprobes=nprobes,
            refine_factor=refine_factor,
            text_key=text_key,
            doc_id_key=doc_id_key,
            **kwargs,
        )

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to the LanceDB table, creating the table if needed.

        Fix: rows are written under the configured ``doc_id_key``,
        ``vector_column_name`` and ``text_key`` (previously hard-coded
        ``"doc_id"``/``"vector"``/``"text"``, which broke ``query`` and
        ``delete`` whenever non-default keys were configured; the defaults
        are unchanged, so default behavior is identical).

        Args:
            nodes (List[BaseNode]): Nodes with embeddings to store.

        Returns:
            List[str]: The node ids of the added nodes.
        """
        if not nodes:
            _logger.debug("No nodes to add. Skipping the database operation.")
            return []
        data = []
        ids = []
        for node in nodes:
            metadata = node_to_metadata_dict(
                node, remove_text=False, flat_metadata=self.flat_metadata
            )
            append_data = {
                "id": node.node_id,
                self.doc_id_key: node.ref_doc_id,
                self.vector_column_name: node.get_embedding(),
                self.text_key: node.get_content(metadata_mode=MetadataMode.NONE),
                "metadata": metadata,
            }
            data.append(append_data)
            ids.append(node.node_id)

        if self.table_name in self._connection.table_names():
            # Table exists: append rows.
            tbl = self._connection.open_table(self.table_name)
            tbl.add(data)
        else:
            # Table missing: create it with a schema inferred from `data`.
            self._connection.create_table(self.table_name, data)
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using their ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document whose nodes should
                be deleted.
        """
        table = self._connection.open_table(self.table_name)
        # Filter on the configured doc-id column so deletes stay consistent
        # with the keys written by `add` (previously hard-coded "doc_id").
        # NOTE(review): the id is interpolated into an SQL-like predicate; a
        # ref_doc_id containing a double quote would break the filter.
        table.delete(f'{self.doc_id_key} = "{ref_doc_id}"')

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top-k most similar nodes.

        Args:
            query (VectorStoreQuery): Query embedding plus optional metadata
                filters and ``similarity_top_k``.

        Raises:
            ValueError: If a filter is supplied both via ``query.filters``
                and via the ``where`` kwarg.

        Returns:
            VectorStoreQueryResult: Matched nodes, similarities, and ids.
        """
        if query.filters is not None:
            if "where" in kwargs:
                raise ValueError(
                    "Cannot specify filter via both query and kwargs. "
                    "Use kwargs only for lancedb specific items that are "
                    "not supported via the generic query interface."
                )
            where = _to_lance_filter(query.filters)
        else:
            where = kwargs.pop("where", None)

        table = self._connection.open_table(self.table_name)
        lance_query = (
            table.search(
                query=query.query_embedding,
                vector_column_name=self.vector_column_name,
            )
            .limit(query.similarity_top_k)
            .where(where)
            .nprobes(self.nprobes)
        )

        if self.refine_factor is not None:
            # Builder mutates in place; no reassignment needed.
            lance_query.refine_factor(self.refine_factor)

        results = lance_query.to_pandas()
        nodes = []
        for _, item in results.iterrows():
            try:
                node = metadata_dict_to_node(item.metadata)
                node.embedding = list(item[self.vector_column_name])
            except Exception:
                # deprecated legacy logic for backward compatibility with
                # tables written before node metadata was fully serialized
                _logger.debug(
                    "Failed to parse Node metadata, fallback to legacy logic."
                )
                if "metadata" in item:
                    metadata, node_info, _relation = legacy_metadata_dict_to_node(
                        item.metadata, text_key=self.text_key
                    )
                else:
                    metadata, node_info = {}, {}
                node = TextNode(
                    text=item[self.text_key] or "",
                    id_=item.id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships={
                        NodeRelationship.SOURCE: RelatedNodeInfo(
                            node_id=item[self.doc_id_key]
                        ),
                    },
                )

            nodes.append(node)

        return VectorStoreQueryResult(
            nodes=nodes,
            similarities=_to_llama_similarities(results),
            ids=results["id"].tolist(),
        )

client property #

client: None

Get the client.

from_params classmethod #

from_params(
    uri: Optional[str],
    table_name: str = "vectors",
    vector_column_name: str = "vector",
    nprobes: int = 20,
    refine_factor: Optional[int] = None,
    text_key: str = DEFAULT_TEXT_KEY,
    doc_id_key: str = DEFAULT_DOC_ID_KEY,
    **kwargs: Any
) -> LanceDBVectorStore

Create an instance from parameters.

Source code in llama_index/vector_stores/lancedb/base.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
@classmethod
def from_params(
    cls,
    uri: Optional[str],
    table_name: str = "vectors",
    vector_column_name: str = "vector",
    nprobes: int = 20,
    refine_factor: Optional[int] = None,
    text_key: str = DEFAULT_TEXT_KEY,
    doc_id_key: str = DEFAULT_DOC_ID_KEY,
    **kwargs: Any,
) -> "LanceDBVectorStore":
    """Create an instance from parameters.

    Fix: the previous implementation read ``cls._connection`` — at class
    level that is a pydantic ``ModelPrivateAttr`` descriptor, not a live
    connection — and forwarded it as a bogus ``_connection=`` kwarg.
    ``__init__`` establishes the connection from ``uri`` itself, so the
    arguments are simply passed through.
    """
    return cls(
        uri=uri,
        table_name=table_name,
        vector_column_name=vector_column_name,
        nprobes=nprobes,
        refine_factor=refine_factor,
        text_key=text_key,
        doc_id_key=doc_id_key,
        **kwargs,
    )

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

Delete nodes using ref_doc_id.

Source code in llama_index/vector_stores/lancedb/base.py
184
185
186
187
188
189
190
191
    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using their ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document whose nodes
                should be deleted.
        """
        table = self._connection.open_table(self.table_name)
        # Builds an SQL-like predicate by string concatenation; a
        # ref_doc_id containing a double quote would break the filter.
        table.delete('doc_id = "' + ref_doc_id + '"')

query #

query(
    query: VectorStoreQuery, **kwargs: Any
) -> VectorStoreQueryResult

Query the index for the top-k most similar nodes.

Source code in llama_index/vector_stores/lancedb/base.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def query(
    self,
    query: VectorStoreQuery,
    **kwargs: Any,
) -> VectorStoreQueryResult:
    """Query the index for the top-k most similar nodes.

    Args:
        query (VectorStoreQuery): Query embedding plus optional metadata
            filters and ``similarity_top_k``.

    Raises:
        ValueError: If a filter is supplied both via ``query.filters``
            and via the ``where`` kwarg.

    Returns:
        VectorStoreQueryResult: Matched nodes, similarities, and ids.
    """
    # A filter may come from the generic query interface or from a
    # lancedb-specific `where` kwarg — but never both at once.
    if query.filters is not None:
        if "where" in kwargs:
            raise ValueError(
                "Cannot specify filter via both query and kwargs. "
                "Use kwargs only for lancedb specific items that are "
                "not supported via the generic query interface."
            )
        where = _to_lance_filter(query.filters)
    else:
        where = kwargs.pop("where", None)

    table = self._connection.open_table(self.table_name)
    lance_query = (
        table.search(
            query=query.query_embedding,
            vector_column_name=self.vector_column_name,
        )
        .limit(query.similarity_top_k)
        .where(where)
        .nprobes(self.nprobes)
    )

    if self.refine_factor is not None:
        # Builder call without reassignment — presumably mutates the query
        # in place; confirm against the lancedb query-builder API.
        lance_query.refine_factor(self.refine_factor)

    results = lance_query.to_pandas()
    nodes = []
    for _, item in results.iterrows():
        try:
            # Preferred path: metadata contains a fully serialized node.
            node = metadata_dict_to_node(item.metadata)
            node.embedding = list(item[self.vector_column_name])
        except Exception:
            # deprecated legacy logic for backward compatibility
            _logger.debug(
                "Failed to parse Node metadata, fallback to legacy logic."
            )
            if "metadata" in item:
                metadata, node_info, _relation = legacy_metadata_dict_to_node(
                    item.metadata, text_key=self.text_key
                )
            else:
                metadata, node_info = {}, {}
            # Rebuild a TextNode from the raw row columns.
            node = TextNode(
                text=item[self.text_key] or "",
                id_=item.id,
                metadata=metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(
                        node_id=item[self.doc_id_key]
                    ),
                },
            )

        nodes.append(node)

    return VectorStoreQueryResult(
        nodes=nodes,
        similarities=_to_llama_similarities(results),
        ids=results["id"].tolist(),
    )