Source code for langchain_community.vectorstores.pgvecto_rs

from __future__ import annotations

import uuid
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


class PGVecto_rs(VectorStore):
    """VectorStore backed by pgvecto_rs."""

    # Handle to the pgvecto_rs SDK client; assigned in ``__init__``.
    _store = None
    # Embedding model used for both documents and queries.
    _embedding: Embeddings

    def __init__(
        self,
        embedding: Embeddings,
        dimension: int,
        db_url: str,
        collection_name: str,
        new_table: bool = False,
    ) -> None:
        """Initialize a PGVecto_rs vector store.

        Args:
            embedding: Embeddings to use.
            dimension: Dimension of the embeddings.
            db_url: Database URL.
            collection_name: Name of the collection.
            new_table: Whether to create a new table or connect to an
                existing one. If true, the table is dropped if it exists and
                then recreated. Defaults to False.

        Raises:
            ImportError: If ``pgvecto_rs.sdk`` cannot be imported.
        """
        try:
            from pgvecto_rs.sdk import PGVectoRs
        except ImportError as e:
            raise ImportError(
                "Unable to import pgvecto_rs.sdk , please install with "
                '`pip install "pgvecto_rs[sdk]"`.'
            ) from e
        self._store = PGVectoRs(
            db_url=db_url,
            collection_name=collection_name,
            dimension=dimension,
            recreate=new_table,
        )
        self._embedding = embedding

    # ================ Create interface =================
    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        db_url: str = "",
        collection_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PGVecto_rs:
        """Return a VectorStore initialized from texts and optional metadata.

        Args:
            texts: Texts to embed and insert.
            embedding: Embeddings to use.
            metadatas: Optional list of metadata dicts, one per text.
            db_url: Database URL. Must be non-empty.
            collection_name: Name of the collection. When omitted, a fresh
                random hex name is generated for this call.
            kwargs: Forwarded to ``add_texts``.

        Returns:
            The populated vector store.

        Raises:
            ValueError: If ``db_url`` is empty or None.
        """
        # Validate before doing any embedding work. The default is "", so an
        # ``is None`` check alone would never catch a missing URL.
        if not db_url:
            raise ValueError("db_url must be provided")
        # Generate the name per call: a uuid used as a default argument value
        # is evaluated only once, when the function is defined.
        if collection_name is None:
            collection_name = uuid.uuid4().hex

        # Embed a probe string to find out the embedding dimension.
        sample_embedding = embedding.embed_query("Hello pgvecto_rs!")
        store: PGVecto_rs = cls(
            embedding=embedding,
            dimension=len(sample_embedding),
            db_url=db_url,
            collection_name=collection_name,
        )
        store.add_texts(texts, metadatas, **kwargs)
        return store

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        db_url: str = "",
        collection_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PGVecto_rs:
        """Return a VectorStore initialized from documents.

        Args:
            documents: Documents whose ``page_content`` and ``metadata`` are
                inserted.
            embedding: Embeddings to use.
            db_url: Database URL. Must be non-empty.
            collection_name: Name of the collection. When omitted, a fresh
                random hex name is generated by ``from_texts``.
            kwargs: Forwarded to ``from_texts``.

        Returns:
            The populated vector store.

        Raises:
            ValueError: If ``db_url`` is empty or None.
        """
        texts = [document.page_content for document in documents]
        metadatas = [document.metadata for document in documents]
        return cls.from_texts(
            texts,
            embedding,
            metadatas=metadatas,
            db_url=db_url,
            collection_name=collection_name,
            **kwargs,
        )

    @classmethod
    def from_collection_name(
        cls,
        embedding: Embeddings,
        db_url: str,
        collection_name: str,
    ) -> PGVecto_rs:
        """Create a new empty vector store, or connect to an existing one.

        A new empty store named ``collection_name`` is created, or, if one
        already exists in the database, it is connected to. The arguments
        should be the same as when the vector store was created.

        Args:
            embedding: Embeddings to use.
            db_url: Database URL.
            collection_name: Name of the collection.

        Returns:
            The vector store bound to ``collection_name``.
        """
        # Embed a probe string to find out the embedding dimension.
        sample_embedding = embedding.embed_query("Hello pgvecto_rs!")
        return cls(
            embedding=embedding,
            dimension=len(sample_embedding),
            db_url=db_url,
            collection_name=collection_name,
        )

    # ================ Insert interface =================
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add them to the store.

        Args:
            texts: Iterable of strings to add to the vector store.
            metadatas: Optional list of metadata dicts associated with the
                texts. When omitted, every text gets an empty metadata dict.
            kwargs: Vector-store-specific parameters (currently unused).

        Returns:
            List of ids of the added texts.

        Raises:
            ValueError: If ``metadatas`` is given but its length differs from
                the number of texts.
        """
        from pgvecto_rs.sdk import Record

        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the records.
        text_list = list(texts)
        if not text_list:
            return []

        if metadatas is None:
            # One distinct dict per text so that records never share state.
            metadata_list: List[dict] = [{} for _ in text_list]
        else:
            metadata_list = list(metadatas)
            if len(metadata_list) != len(text_list):
                # zip() would otherwise silently drop the surplus entries.
                raise ValueError(
                    f"Got {len(metadata_list)} metadatas for "
                    f"{len(text_list)} texts; the counts must match."
                )

        embeddings = self._embedding.embed_documents(text_list)
        records = [
            Record.from_text(text, vector, meta)
            for text, vector, meta in zip(text_list, embeddings, metadata_list)
        ]
        self._store.insert(records)  # type: ignore[union-attr]
        return [str(record.id) for record in records]

    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
        """Run more documents through the embeddings and add them to the store.

        Args:
            documents: Documents to add to the vector store.
            kwargs: Forwarded to ``add_texts``.

        Returns:
            List of ids of the added documents.
        """
        return self.add_texts(
            [document.page_content for document in documents],
            [document.metadata for document in documents],
            **kwargs,
        )

    # ================ Query interface =================
    def similarity_search_with_score_by_vector(
        self,
        query_vector: List[float],
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Union[None, Dict[str, Any], Any] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the documents most similar to the query vector, with scores.

        Args:
            query_vector: Embedding to search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function; it is translated to
                the operator string handed to the store's ``search``.
            filter: ``None`` for no filtering, a dict that is wrapped with
                ``meta_contains``, or any other object that is passed to the
                store unchanged.
            kwargs: Accepted for interface compatibility (currently unused).

        Returns:
            List of ``(Document, score)`` tuples, where the score is the
            second element of each result returned by the store.
        """
        from pgvecto_rs.sdk.filters import meta_contains

        distance_func_map = {
            "sqrt_euclid": "<->",
            "neg_dot_prod": "<#>",
            "ned_cos": "<=>",
        }
        # Plain dicts are shorthand for a "metadata contains" filter; any
        # other non-None object is assumed to be a ready-made filter.
        real_filter = meta_contains(filter) if isinstance(filter, dict) else filter
        results = self._store.search(  # type: ignore[union-attr]
            query_vector,
            distance_func_map[distance_func],
            k,
            filter=real_filter,
        )
        return [
            (
                Document(page_content=record.text, metadata=record.meta),
                score,
            )
            for record, score, *_ in results
        ]

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        filter: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents most similar to the given embedding.

        Args:
            embedding: Embedding to search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function to use.
            filter: Optional filter, handled as in
                ``similarity_search_with_score_by_vector``.
            kwargs: Forwarded to ``similarity_search_with_score_by_vector``.

        Returns:
            The matching documents, without their scores.
        """
        # ``filter`` is a named parameter, so it is not part of ``kwargs`` and
        # has to be forwarded explicitly or it would be dropped.
        scored = self.similarity_search_with_score_by_vector(
            embedding, k, distance_func, filter=filter, **kwargs
        )
        return [doc for doc, _score in scored]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        distance_func: Literal[
            "sqrt_euclid", "neg_dot_prod", "ned_cos"
        ] = "sqrt_euclid",
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the documents most similar to the query text, with scores.

        Args:
            query: Text to embed and search with.
            k: Number of results requested from the store.
            distance_func: Name of the distance function to use.
            kwargs: Forwarded to ``similarity_search_with_score_by_vector``
                (for example ``filter``).

        Returns:
            List of ``(Document, score)`` tuples.
        """
        query_vector = self._embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            query_vector, k, distance_func, **kwargs
        )