from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import guard_import
from langchain_core.vectorstores import VectorStore
from langchain_community.docstore.base import AddableMixin, Docstore
from langchain_community.docstore.in_memory import InMemoryDocstore
[docs]def dependable_usearch_import() -> Any:
"""
导入usearch(如果可用),否则引发错误。
"""
return guard_import("usearch.index")
[docs]class USearch(VectorStore):
"""`USearch` 向量存储。
要使用,您应该安装``usearch`` python包。"""
[docs] def __init__(
self,
embedding: Embeddings,
index: Any,
docstore: Docstore,
ids: List[str],
):
"""使用必要的组件进行初始化。"""
self.embedding = embedding
self.index = index
self.docstore = docstore
self.ids = ids
[docs] def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict]] = None,
ids: Optional[np.ndarray] = None,
**kwargs: Any,
) -> List[str]:
"""运行更多文本通过嵌入并添加到向量存储。
参数:
texts:要添加到向量存储的字符串的可迭代对象。
metadatas:与文本相关的元数据的可选列表。
ids:唯一ID的可选列表。
返回:
将文本添加到向量存储中的ID列表。
"""
if not isinstance(self.docstore, AddableMixin):
raise ValueError(
"If trying to add texts, the underlying docstore should support "
f"adding items, which {self.docstore} does not"
)
embeddings = self.embedding.embed_documents(list(texts))
documents = []
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
last_id = int(self.ids[-1]) + 1
if ids is None:
ids = np.array([str(last_id + id) for id, _ in enumerate(texts)])
self.index.add(np.array(ids), np.array(embeddings))
self.docstore.add(dict(zip(ids, documents)))
self.ids.extend(ids)
return ids.tolist()
[docs] def similarity_search_with_score(
self,
query: str,
k: int = 4,
) -> List[Tuple[Document, float]]:
"""返回与查询最相似的文档。
参数:
query:要查找与之相似文档的文本。
k:要返回的文档数量。默认为4。
返回:
与查询最相似的文档列表,带有距离信息。
"""
query_embedding = self.embedding.embed_query(query)
matches = self.index.search(np.array(query_embedding), k)
docs_with_scores: List[Tuple[Document, float]] = []
for id, score in zip(matches.keys, matches.distances):
doc = self.docstore.search(str(id))
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {id}, got {doc}")
docs_with_scores.append((doc, score))
return docs_with_scores
[docs] def similarity_search(
self,
query: str,
k: int = 4,
**kwargs: Any,
) -> List[Document]:
"""返回与查询最相似的文档。
参数:
query:要查找与之相似的文档的文本。
k:要返回的文档数量。默认为4。
返回:
与查询最相似的文档列表。
"""
query_embedding = self.embedding.embed_query(query)
matches = self.index.search(np.array(query_embedding), k)
docs: List[Document] = []
for id in matches.keys:
doc = self.docstore.search(str(id))
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {id}, got {doc}")
docs.append(doc)
return docs
[docs] @classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[Dict]] = None,
ids: Optional[np.ndarray] = None,
metric: str = "cos",
**kwargs: Any,
) -> USearch:
"""从原始文档构建USearch包装器。
这是一个用户友好的接口,可以:
1. 嵌入文档。
2. 创建一个内存中的文档存储。
3. 初始化USearch数据库。
这旨在是一个快速开始的方式。
示例:
.. code-block:: python
from langchain_community.vectorstores import USearch
from langchain_community.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
usearch = USearch.from_texts(texts, embeddings)
"""
embeddings = embedding.embed_documents(texts)
documents: List[Document] = []
if ids is None:
ids = np.array([str(id) for id, _ in enumerate(texts)])
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
docstore = InMemoryDocstore(dict(zip(ids, documents)))
usearch = guard_import("usearch.index")
index = usearch.Index(ndim=len(embeddings[0]), metric=metric)
index.add(np.array(ids), np.array(embeddings))
return cls(embedding, index, docstore, ids.tolist())