from __future__ import annotations
import itertools
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
from tigrisdb import TigrisClient
from tigrisdb import VectorStore as TigrisVectorStore
from tigrisdb.types.filters import Filter as TigrisFilter
from tigrisdb.types.vector import Document as TigrisDocument
[docs]class Tigris(VectorStore):
"""`Tigris`向量存储。"""
[docs] def __init__(self, client: TigrisClient, embeddings: Embeddings, index_name: str):
"""初始化Tigris向量存储。"""
try:
import tigrisdb # noqa: F401
except ImportError:
raise ImportError(
"Could not import tigrisdb python package. "
"Please install it with `pip install tigrisdb`"
)
self._embed_fn = embeddings
self._vector_store = TigrisVectorStore(client.get_search(), index_name)
@property
def embeddings(self) -> Embeddings:
return self._embed_fn
@property
def search_index(self) -> TigrisVectorStore:
return self._vector_store
[docs] def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""运行更多的文本通过嵌入并添加到向量存储。
参数:
texts:要添加到向量存储的字符串的可迭代对象。
metadatas:与文本相关联的元数据的可选列表。
ids:文档的可选id列表。
如果未提供,将自动生成id。
kwargs:向量存储特定参数
返回:
将文本添加到向量存储中的id列表。
"""
docs = self._prep_docs(texts, metadatas, ids)
result = self.search_index.add_documents(docs)
return [r.id for r in result]
[docs] def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[TigrisFilter] = None,
**kwargs: Any,
) -> List[Document]:
"""返回与查询最相似的文档。"""
docs_with_scores = self.similarity_search_with_score(query, k, filter)
return [doc for doc, _ in docs_with_scores]
[docs] def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[TigrisFilter] = None,
) -> List[Tuple[Document, float]]:
"""运行使用Chroma和距离进行相似性搜索。
参数:
query (str): 要搜索的查询文本。
k (int): 要返回的结果数量。默认为4。
filter (Optional[TigrisFilter]): 按元数据过滤。默认为None。
返回:
List[Tuple[Document, float]]: 与查询文本最相似的文档列表,带有浮点距离。
"""
vector = self._embed_fn.embed_query(query)
result = self.search_index.similarity_search(
vector=vector, k=k, filter_by=filter
)
docs: List[Tuple[Document, float]] = []
for r in result:
docs.append(
(
Document(
page_content=r.doc["text"], metadata=r.doc.get("metadata")
),
r.score,
)
)
return docs
[docs] @classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
client: Optional[TigrisClient] = None,
index_name: Optional[str] = None,
**kwargs: Any,
) -> Tigris:
"""返回从文本和嵌入初始化的VectorStore。"""
if not index_name:
raise ValueError("`index_name` is required")
if not client:
client = TigrisClient()
store = cls(client, embedding, index_name)
store.add_texts(texts=texts, metadatas=metadatas, ids=ids)
return store
def _prep_docs(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]],
ids: Optional[List[str]],
) -> List[TigrisDocument]:
embeddings: List[List[float]] = self._embed_fn.embed_documents(list(texts))
docs: List[TigrisDocument] = []
for t, m, e, _id in itertools.zip_longest(
texts, metadatas or [], embeddings or [], ids or []
):
doc: TigrisDocument = {
"text": t,
"embeddings": e or [],
"metadata": m or {},
}
if _id:
doc["id"] = _id
docs.append(doc)
return docs