Skip to content

Lancedb

LanceDBVectorStore #

Bases: BasePydanticVectorStore

# The LanceDB vector store.

# Stores text and embeddings in LanceDB. The vector store will open an existing LanceDB dataset or create the dataset if it does not exist.

# Args:
#     uri (str, required): Location where LanceDB will store its files.
#     table_name (str, optional): The table name where the embeddings will be stored. Defaults to "vectors".
#     vector_column_name (str, optional): The vector column name in the table if different from default. Defaults to "vector", in keeping with lancedb convention.
#     nprobes (int, optional): The number of probes used. A higher number makes the search more accurate but also slower. Defaults to 20.
#     refine_factor (int, optional): Refine the results by reading extra elements and re-ranking them in memory. Defaults to None.

# Raises:
#     ImportError: Unable to import `lancedb`.

# Returns:
#     LanceDBVectorStore: Vector store that supports creating LanceDB datasets and querying them.

# Examples:
#     `pip install llama-index-vector-stores-lancedb`

#     ```python
#     from llama_index.vector_stores.lancedb import LanceDBVectorStore

#     vector_store = LanceDBVectorStore(uri="/tmp/lancedb")
#     ```
Source code in llama_index/vector_stores/lancedb/base.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
class LanceDBVectorStore(BasePydanticVectorStore):
    """The LanceDB vector store.

    Stores text and embeddings in LanceDB. The vector store will open an
    existing LanceDB dataset or create the dataset if it does not exist.

    Args:
        uri (str, required): Location where LanceDB will store its files.
        table_name (str, optional): The table name where the embeddings will
            be stored. Defaults to "vectors".
        vector_column_name (str, optional): The vector column name in the
            table if different from default. Defaults to "vector", in keeping
            with lancedb convention.
        nprobes (int, optional): The number of probes used. A higher number
            makes the search more accurate but also slower. Defaults to 20.
        refine_factor (int, optional): Refine the results by reading extra
            elements and re-ranking them in memory. Defaults to None.

    Raises:
        ImportError: Unable to import `lancedb`.

    Returns:
        LanceDBVectorStore: Vector store that supports creating LanceDB
            datasets and querying them.

    Examples:
        `pip install llama-index-vector-stores-lancedb`

        ```python
        from llama_index.vector_stores.lancedb import LanceDBVectorStore

        vector_store = LanceDBVectorStore(uri="/tmp/lancedb")
        ```
    """

    stores_text = True
    flat_metadata: bool = True
    # Live lancedb connection; established in __init__ from `uri`.
    _connection: Any = PrivateAttr()
    uri: Optional[str]
    table_name: Optional[str]
    vector_column_name: Optional[str]
    nprobes: Optional[int]
    refine_factor: Optional[int]
    text_key: Optional[str]
    doc_id_key: Optional[str]

    def __init__(
        self,
        uri: Optional[str],
        table_name: str = "vectors",
        vector_column_name: str = "vector",
        nprobes: int = 20,
        refine_factor: Optional[int] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        doc_id_key: str = DEFAULT_DOC_ID_KEY,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        # Connect eagerly so a bad URI fails at construction time.
        self._connection = lancedb.connect(uri)
        super().__init__(
            uri=uri,
            table_name=table_name,
            vector_column_name=vector_column_name,
            nprobes=nprobes,
            refine_factor=refine_factor,
            text_key=text_key,
            doc_id_key=doc_id_key,
            **kwargs,
        )

    @property
    def client(self) -> Any:
        """Get the lancedb connection client.

        Fix: previously annotated ``-> None`` although it returns the
        connection object.
        """
        return self._connection

    @classmethod
    def from_params(
        cls,
        uri: Optional[str],
        table_name: str = "vectors",
        vector_column_name: str = "vector",
        nprobes: int = 20,
        refine_factor: Optional[int] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        doc_id_key: str = DEFAULT_DOC_ID_KEY,
        **kwargs: Any,
    ) -> "LanceDBVectorStore":
        """Create an instance from parameters.

        Fix: the previous implementation read ``cls._connection`` — at class
        level that is a pydantic ``ModelPrivateAttr`` descriptor, not a live
        connection — and forwarded it as a bogus ``_connection=`` kwarg.
        ``__init__`` establishes the connection from ``uri`` itself, so the
        arguments are simply passed through.
        """
        return cls(
            uri=uri,
            table_name=table_name,
            vector_column_name=vector_column_name,
            nprobes=nprobes,
            refine_factor=refine_factor,
            text_key=text_key,
            doc_id_key=doc_id_key,
            **kwargs,
        )

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to the LanceDB table, creating the table if needed.

        Fix: rows are written under the configured ``doc_id_key``,
        ``vector_column_name`` and ``text_key`` (previously hard-coded
        ``"doc_id"``/``"vector"``/``"text"``, which broke ``query`` and
        ``delete`` whenever non-default keys were configured; the defaults
        are unchanged, so default behavior is identical).

        Args:
            nodes (List[BaseNode]): Nodes with embeddings to store.

        Returns:
            List[str]: The node ids of the added nodes.
        """
        if not nodes:
            _logger.debug("No nodes to add. Skipping the database operation.")
            return []
        data = []
        ids = []
        for node in nodes:
            metadata = node_to_metadata_dict(
                node, remove_text=False, flat_metadata=self.flat_metadata
            )
            append_data = {
                "id": node.node_id,
                self.doc_id_key: node.ref_doc_id,
                self.vector_column_name: node.get_embedding(),
                self.text_key: node.get_content(metadata_mode=MetadataMode.NONE),
                "metadata": metadata,
            }
            data.append(append_data)
            ids.append(node.node_id)

        if self.table_name in self._connection.table_names():
            # Table exists: append rows.
            tbl = self._connection.open_table(self.table_name)
            tbl.add(data)
        else:
            # Table missing: create it with a schema inferred from `data`.
            self._connection.create_table(self.table_name, data)
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using their ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document whose nodes should
                be deleted.
        """
        table = self._connection.open_table(self.table_name)
        # Filter on the configured doc-id column so deletes stay consistent
        # with the keys written by `add` (previously hard-coded "doc_id").
        # NOTE(review): the id is interpolated into an SQL-like predicate; a
        # ref_doc_id containing a double quote would break the filter.
        table.delete(f'{self.doc_id_key} = "{ref_doc_id}"')

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top-k most similar nodes.

        Args:
            query (VectorStoreQuery): Query embedding plus optional metadata
                filters and ``similarity_top_k``.

        Raises:
            ValueError: If a filter is supplied both via ``query.filters``
                and via the ``where`` kwarg.

        Returns:
            VectorStoreQueryResult: Matched nodes, similarities, and ids.
        """
        if query.filters is not None:
            if "where" in kwargs:
                raise ValueError(
                    "Cannot specify filter via both query and kwargs. "
                    "Use kwargs only for lancedb specific items that are "
                    "not supported via the generic query interface."
                )
            where = _to_lance_filter(query.filters)
        else:
            where = kwargs.pop("where", None)

        table = self._connection.open_table(self.table_name)
        lance_query = (
            table.search(
                query=query.query_embedding,
                vector_column_name=self.vector_column_name,
            )
            .limit(query.similarity_top_k)
            .where(where)
            .nprobes(self.nprobes)
        )

        if self.refine_factor is not None:
            # Builder mutates in place; no reassignment needed.
            lance_query.refine_factor(self.refine_factor)

        results = lance_query.to_pandas()
        nodes = []
        for _, item in results.iterrows():
            try:
                node = metadata_dict_to_node(item.metadata)
                node.embedding = list(item[self.vector_column_name])
            except Exception:
                # deprecated legacy logic for backward compatibility with
                # tables written before node metadata was fully serialized
                _logger.debug(
                    "Failed to parse Node metadata, fallback to legacy logic."
                )
                if "metadata" in item:
                    metadata, node_info, _relation = legacy_metadata_dict_to_node(
                        item.metadata, text_key=self.text_key
                    )
                else:
                    metadata, node_info = {}, {}
                node = TextNode(
                    text=item[self.text_key] or "",
                    id_=item.id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships={
                        NodeRelationship.SOURCE: RelatedNodeInfo(
                            node_id=item[self.doc_id_key]
                        ),
                    },
                )

            nodes.append(node)

        return VectorStoreQueryResult(
            nodes=nodes,
            similarities=_to_llama_similarities(results),
            ids=results["id"].tolist(),
        )

client property #

client: None

Get the client.

from_params classmethod #

from_params(
    uri: Optional[str],
    table_name: str = "vectors",
    vector_column_name: str = "vector",
    nprobes: int = 20,
    refine_factor: Optional[int] = None,
    text_key: str = DEFAULT_TEXT_KEY,
    doc_id_key: str = DEFAULT_DOC_ID_KEY,
    **kwargs: Any
) -> LanceDBVectorStore

Create an instance from parameters.

Source code in llama_index/vector_stores/lancedb/base.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
@classmethod
def from_params(
    cls,
    uri: Optional[str],
    table_name: str = "vectors",
    vector_column_name: str = "vector",
    nprobes: int = 20,
    refine_factor: Optional[int] = None,
    text_key: str = DEFAULT_TEXT_KEY,
    doc_id_key: str = DEFAULT_DOC_ID_KEY,
    **kwargs: Any,
) -> "LanceDBVectorStore":
    """Create an instance from parameters.

    Fix: the previous implementation read ``cls._connection`` — at class
    level that is a pydantic ``ModelPrivateAttr`` descriptor, not a live
    connection — and forwarded it as a bogus ``_connection=`` kwarg.
    ``__init__`` establishes the connection from ``uri`` itself, so the
    arguments are simply passed through.
    """
    return cls(
        uri=uri,
        table_name=table_name,
        vector_column_name=vector_column_name,
        nprobes=nprobes,
        refine_factor=refine_factor,
        text_key=text_key,
        doc_id_key=doc_id_key,
        **kwargs,
    )

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

Delete nodes using ref_doc_id.

Source code in llama_index/vector_stores/lancedb/base.py
184
185
186
187
188
189
190
191
    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using their ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document whose nodes
                should be deleted.
        """
        table = self._connection.open_table(self.table_name)
        # Builds an SQL-like predicate by string concatenation; a
        # ref_doc_id containing a double quote would break the filter.
        table.delete('doc_id = "' + ref_doc_id + '"')

query #

query(
    query: VectorStoreQuery, **kwargs: Any
) -> VectorStoreQueryResult

Query the index for the top-k most similar nodes.

Source code in llama_index/vector_stores/lancedb/base.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def query(
    self,
    query: VectorStoreQuery,
    **kwargs: Any,
) -> VectorStoreQueryResult:
    """Query the index for the top-k most similar nodes.

    Args:
        query (VectorStoreQuery): Query embedding plus optional metadata
            filters and ``similarity_top_k``.

    Raises:
        ValueError: If a filter is supplied both via ``query.filters``
            and via the ``where`` kwarg.

    Returns:
        VectorStoreQueryResult: Matched nodes, similarities, and ids.
    """
    # A filter may come from the generic query interface or from a
    # lancedb-specific `where` kwarg — but never both at once.
    if query.filters is not None:
        if "where" in kwargs:
            raise ValueError(
                "Cannot specify filter via both query and kwargs. "
                "Use kwargs only for lancedb specific items that are "
                "not supported via the generic query interface."
            )
        where = _to_lance_filter(query.filters)
    else:
        where = kwargs.pop("where", None)

    table = self._connection.open_table(self.table_name)
    lance_query = (
        table.search(
            query=query.query_embedding,
            vector_column_name=self.vector_column_name,
        )
        .limit(query.similarity_top_k)
        .where(where)
        .nprobes(self.nprobes)
    )

    if self.refine_factor is not None:
        # Builder call without reassignment — presumably mutates the query
        # in place; confirm against the lancedb query-builder API.
        lance_query.refine_factor(self.refine_factor)

    results = lance_query.to_pandas()
    nodes = []
    for _, item in results.iterrows():
        try:
            # Preferred path: metadata contains a fully serialized node.
            node = metadata_dict_to_node(item.metadata)
            node.embedding = list(item[self.vector_column_name])
        except Exception:
            # deprecated legacy logic for backward compatibility
            _logger.debug(
                "Failed to parse Node metadata, fallback to legacy logic."
            )
            if "metadata" in item:
                metadata, node_info, _relation = legacy_metadata_dict_to_node(
                    item.metadata, text_key=self.text_key
                )
            else:
                metadata, node_info = {}, {}
            # Rebuild a TextNode from the raw row columns.
            node = TextNode(
                text=item[self.text_key] or "",
                id_=item.id,
                metadata=metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(
                        node_id=item[self.doc_id_key]
                    ),
                },
            )

        nodes.append(node)

    return VectorStoreQueryResult(
        nodes=nodes,
        similarities=_to_llama_similarities(results),
        ids=results["id"].tolist(),
    )