from __future__ import annotations
import uuid
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
class PGVecto_rs(VectorStore):
    """VectorStore backed by pgvecto_rs.

    Texts are embedded with the supplied ``Embeddings`` object and the
    resulting records are stored and searched through the ``pgvecto_rs``
    SDK client created in ``__init__``.
    """

    # SDK client; remains ``None`` until ``__init__`` has run.
    _store = None
    _embedding: Embeddings

    def __init__(
        self,
        embedding: Embeddings,
        dimension: int,
        db_url: str,
        collection_name: str,
        new_table: bool = False,
    ) -> None:
        """Initialize a PGVecto_rs vector store.

        Args:
            embedding: Embeddings to use.
            dimension: Dimension of the embeddings.
            db_url: Database URL.
            collection_name: Name of the collection.
            new_table: Whether to create a new table or connect to an
                existing one. If true, the table is dropped if it exists
                and then recreated. Defaults to False.

        Raises:
            ImportError: If ``pgvecto_rs.sdk`` cannot be imported.
        """
        # Imported lazily so the module can be imported without the
        # optional SDK installed.
        try:
            from pgvecto_rs.sdk import PGVectoRs
        except ImportError as e:
            raise ImportError(
                "Unable to import pgvecto_rs.sdk , please install with "
                '`pip install "pgvecto_rs[sdk]"`.'
            ) from e
        self._store = PGVectoRs(
            db_url=db_url,
            collection_name=collection_name,
            dimension=dimension,
            recreate=new_table,
        )
        self._embedding = embedding

    # ================ Create interface =================
    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        db_url: str = "",
        # NOTE(review): this default is evaluated once, when the class body
        # is executed, so every call that omits ``collection_name`` within a
        # process shares the same generated name. Pass an explicit name to
        # get a distinct collection.
        collection_name: str = str(uuid.uuid4().hex),
        **kwargs: Any,
    ) -> PGVecto_rs:
        """Return a VectorStore initialized from texts and optional metadata.

        Args:
            texts: Texts to embed and store.
            embedding: Embeddings to use.
            metadatas: Optional list of metadata dicts, one per text.
            db_url: Database URL. Must be non-empty.
            collection_name: Name of the collection.
            **kwargs: Forwarded to ``add_texts``.

        Returns:
            A new store containing the given texts.

        Raises:
            ValueError: If ``db_url`` is empty or None.
        """
        # Validate before embedding: the default is "" (not None), and there
        # is no point paying for an embedding call that cannot be stored.
        if not db_url:
            raise ValueError("db_url must be provided")
        # The embedding dimension is discovered by embedding a sample query.
        dimension = len(embedding.embed_query("Hello pgvecto_rs!"))
        store: PGVecto_rs = cls(
            embedding=embedding,
            dimension=dimension,
            db_url=db_url,
            collection_name=collection_name,
        )
        store.add_texts(texts, metadatas, **kwargs)
        return store

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        db_url: str = "",
        # NOTE(review): evaluated once at class-definition time; see the
        # same note on ``from_texts``.
        collection_name: str = str(uuid.uuid4().hex),
        **kwargs: Any,
    ) -> PGVecto_rs:
        """Return a VectorStore initialized from documents.

        Args:
            documents: Documents whose ``page_content`` and ``metadata`` are
                stored.
            embedding: Embeddings to use.
            db_url: Database URL.
            collection_name: Name of the collection.
            **kwargs: Forwarded to ``from_texts``.

        Returns:
            A new store containing the given documents.
        """
        texts = [document.page_content for document in documents]
        metadatas = [document.metadata for document in documents]
        return cls.from_texts(
            texts, embedding, metadatas, db_url, collection_name, **kwargs
        )

    @classmethod
    def from_collection_name(
        cls,
        embedding: Embeddings,
        db_url: str,
        collection_name: str,
    ) -> PGVecto_rs:
        """Create a new empty vector store with ``collection_name``, or
        connect to an existing one in the database if it exists.

        The arguments should be the same as when the vector store was
        created.

        Args:
            embedding: Embeddings to use.
            db_url: Database URL.
            collection_name: Name of the collection.

        Returns:
            A store bound to the named collection.
        """
        # The embedding dimension is discovered by embedding a sample query.
        sample_embedding = embedding.embed_query("Hello pgvecto_rs!")
        return cls(
            embedding=embedding,
            dimension=len(sample_embedding),
            db_url=db_url,
            collection_name=collection_name,
        )

    # ================ Insert interface =================
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add them to the store.

        Args:
            texts: Iterable of strings to add to the vector store.
            metadatas: Optional list of metadata dicts associated with the
                texts. When omitted or empty, each text gets an empty dict.
            **kwargs: Vector-store specific parameters (unused here).

        Returns:
            List of ids of the added texts.

        Raises:
            ValueError: If ``metadatas`` is given but its length differs
                from the number of texts.
        """
        from pgvecto_rs.sdk import Record

        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the records.
        text_list = list(texts)
        if not metadatas:
            # Without this, zipping against an empty list would silently
            # produce no records at all.
            metadatas = [{} for _ in text_list]
        elif len(metadatas) != len(text_list):
            raise ValueError(
                f"Got {len(metadatas)} metadatas for {len(text_list)} texts; "
                "the lengths must match."
            )
        embeddings = self._embedding.embed_documents(text_list)
        records = [
            Record.from_text(text, vector, meta)
            for text, vector, meta in zip(text_list, embeddings, metadatas)
        ]
        self._store.insert(records)  # type: ignore[union-attr]
        return [str(record.id) for record in records]

    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
        """Run more documents through the embeddings and add them to the store.

        Args:
            documents: Documents to add to the vector store.
            **kwargs: Forwarded to ``add_texts``.

        Returns:
            List of ids of the added documents.
        """
        return self.add_texts(
            [document.page_content for document in documents],
            [document.metadata for document in documents],
            **kwargs,
        )

    # ================ Query interface =================
    def similarity_search_with_score_by_vector(
        self,
        query_vector: List[float],
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Union[None, Dict[str, Any], Any] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the documents most similar to a query vector, with scores.

        Args:
            query_vector: Embedding to search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function; translated to the
                operator string handed to the SDK search call.
            filter: ``None`` for no filtering, a dict (wrapped with the
                SDK's ``meta_contains``), or any other object, which is
                passed to the SDK search call unchanged.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            List of ``(Document, score)`` tuples, where the score is the
            second element of each SDK search result.

        Raises:
            KeyError: If ``distance_func`` is not one of the known names.
        """
        from pgvecto_rs.sdk.filters import meta_contains

        # Distance-function name -> operator string expected by the SDK.
        distance_func_map = {
            "sqrt_euclid": "<->",
            "neg_dot_prod": "<#>",
            "ned_cos": "<=>",
        }
        if filter is None:
            real_filter = None
        elif isinstance(filter, dict):
            real_filter = meta_contains(filter)
        else:
            real_filter = filter
        results = self._store.search(  # type: ignore[union-attr]
            query_vector,
            distance_func_map[distance_func],
            k,
            filter=real_filter,
        )
        return [
            (
                Document(
                    page_content=res[0].text,
                    metadata=res[0].meta,
                ),
                res[1],
            )
            for res in results
        ]

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents most similar to an embedding vector.

        Args:
            embedding: Embedding to search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function.
            filter: Optional filter; see
                ``similarity_search_with_score_by_vector``.
            **kwargs: Forwarded to
                ``similarity_search_with_score_by_vector``.

        Returns:
            The matching documents, without their scores.
        """
        # ``filter`` must be forwarded explicitly; otherwise it would be
        # accepted here and silently ignored.
        return [
            doc
            for doc, _score in self.similarity_search_with_score_by_vector(
                embedding, k, distance_func, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the documents most similar to a query string, with scores.

        Args:
            query: Text to embed and search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function.
            **kwargs: Forwarded to
                ``similarity_search_with_score_by_vector`` (for example
                ``filter``).

        Returns:
            List of ``(Document, score)`` tuples.
        """
        query_vector = self._embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            query_vector, k, distance_func, **kwargs
        )

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents most similar to a query string.

        Args:
            query: Text to embed and search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function.
            **kwargs: Forwarded to
                ``similarity_search_with_score_by_vector`` (for example
                ``filter``).

        Returns:
            The matching documents, without their scores.
        """
        query_vector = self._embedding.embed_query(query)
        return [
            doc
            for doc, _score in self.similarity_search_with_score_by_vector(
                query_vector, k, distance_func, **kwargs
            )
        ]