Source code for langchain_community.vectorstores.hologres

from __future__ import annotations

import logging
import uuid
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from langchain_core.vectorstores import VectorStore

ADA_TOKEN_COUNT = 1536
_LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"


[docs]class Hologres(VectorStore): """`Hologres API` 向量存储。 - `connection_string` 是一个 hologres 连接字符串。 - `embedding_function` 是实现 `langchain.embeddings.base.Embeddings` 接口的任何嵌入函数。 - `ndims` 是嵌入输出的维度数。 - `table_name` 是用于存储嵌入和数据的表的名称。 (默认值: langchain_pg_embedding) - 注意: 表将在初始化存储时创建(如果不存在) 因此,请确保用户具有创建表的权限。 - `pre_delete_table` 如果为 True,则会删除该表(如果存在)。 (默认值: False) - 用于测试。"""
[docs] def __init__( self, connection_string: str, embedding_function: Embeddings, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, pre_delete_table: bool = False, logger: Optional[logging.Logger] = None, ) -> None: self.connection_string = connection_string self.ndims = ndims self.table_name = table_name self.embedding_function = embedding_function self.pre_delete_table = pre_delete_table self.logger = logger or logging.getLogger(__name__) self.__post_init__()
def __post_init__( self, ) -> None: """ 初始化商店。 """ from hologres_vector import HologresVector self.storage = HologresVector( self.connection_string, ndims=self.ndims, table_name=self.table_name, table_schema={"document": "text"}, pre_delete_table=self.pre_delete_table, ) @property def embeddings(self) -> Embeddings: return self.embedding_function @classmethod def __from( cls, texts: List[str], embeddings: List[List[float]], embedding_function: Embeddings, metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, pre_delete_table: bool = False, **kwargs: Any, ) -> Hologres: if ids is None: ids = [str(uuid.uuid4()) for _ in texts] if not metadatas: metadatas = [{} for _ in texts] connection_string = cls.get_connection_string(kwargs) store = cls( connection_string=connection_string, embedding_function=embedding_function, ndims=ndims, table_name=table_name, pre_delete_table=pre_delete_table, ) store.add_embeddings( texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs ) return store
[docs] def add_embeddings( self, texts: Iterable[str], embeddings: List[List[float]], metadatas: List[dict], ids: List[str], **kwargs: Any, ) -> None: """将嵌入添加到向量存储中。 参数: texts:要添加到向量存储中的字符串的可迭代对象。 embeddings:嵌入向量的列表的列表。 metadatas:与文本相关联的元数据列表。 kwargs:向量存储特定参数。 """ try: schema_datas = [{"document": t} for t in texts] self.storage.upsert_vectors(embeddings, ids, metadatas, schema_datas) except Exception as e: self.logger.exception(e)
[docs] def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, **kwargs: Any, ) -> List[str]: """运行更多文本通过嵌入并添加到向量存储。 参数: texts:要添加到向量存储的字符串的可迭代对象。 metadatas:与文本相关联的元数据的可选列表。 kwargs:向量存储特定参数 返回: 将文本添加到向量存储中的ID列表。 """ if ids is None: ids = [str(uuid.uuid4()) for _ in texts] embeddings = self.embedding_function.embed_documents(list(texts)) if not metadatas: metadatas = [{} for _ in texts] self.add_embeddings(texts, embeddings, metadatas, ids, **kwargs) return ids
[docs] def similarity_search_by_vector( self, embedding: List[float], k: int = 4, filter: Optional[dict] = None, **kwargs: Any, ) -> List[Document]: """返回与嵌入向量最相似的文档。 参数: embedding: 要查找与之相似的文档的嵌入。 k: 要返回的文档数量。默认为4。 filter (Optional[Dict[str, str]]): 按元数据筛选。默认为None。 返回: 返回与查询向量最相似的文档列表。 """ docs_and_scores = self.similarity_search_with_score_by_vector( embedding=embedding, k=k, filter=filter ) return [doc for doc, _ in docs_and_scores]
[docs] def similarity_search_with_score( self, query: str, k: int = 4, filter: Optional[dict] = None, ) -> List[Tuple[Document, float]]: """返回与查询最相似的文档。 参数: query:要查找类似文档的文本。 k:要返回的文档数量。默认为4。 filter(可选[Dict[str,str]]):按元数据筛选。默认为无。 返回: 与查询最相似的文档列表,以及每个文档的得分。 """ embedding = self.embedding_function.embed_query(query) docs = self.similarity_search_with_score_by_vector( embedding=embedding, k=k, filter=filter ) return docs
[docs] def similarity_search_with_score_by_vector( self, embedding: List[float], k: int = 4, filter: Optional[dict] = None, ) -> List[Tuple[Document, float]]: results: List[dict[str, Any]] = self.storage.search( embedding, k=k, select_columns=["document"], metadata_filters=filter ) docs = [ ( Document( page_content=result["document"], metadata=result["metadata"], ), result["distance"], ) for result in results ] return docs
[docs] @classmethod def from_texts( cls: Type[Hologres], texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, ids: Optional[List[str]] = None, pre_delete_table: bool = False, **kwargs: Any, ) -> Hologres: """返回从文本和嵌入初始化的VectorStore。 需要Hologres连接字符串。 "可以将其作为参数传递 或设置HOLOGRES_CONNECTION_STRING环境变量。 通过调用HologresVector.connection_string_from_db_params创建连接字符串。 """ embeddings = embedding.embed_documents(list(texts)) return cls.__from( texts, embeddings, embedding, metadatas=metadatas, ids=ids, ndims=ndims, table_name=table_name, pre_delete_table=pre_delete_table, **kwargs, )
[docs] @classmethod def from_embeddings( cls, text_embeddings: List[Tuple[str, List[float]]], embedding: Embeddings, metadatas: Optional[List[dict]] = None, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, ids: Optional[List[str]] = None, pre_delete_table: bool = False, **kwargs: Any, ) -> Hologres: """从原始文档和预生成的嵌入中构建Hologres包装器。 返回从文档和嵌入初始化的VectorStore。 需要Hologres连接字符串 "可以将其作为参数传递 或设置HOLOGRES_CONNECTION_STRING环境变量。 通过调用HologresVector.connection_string_from_db_params创建连接字符串 示例: .. code-block:: python from langchain_community.vectorstores import Hologres from langchain_community.embeddings import OpenAIEmbeddings embeddings = OpenAIEmbeddings() text_embeddings = embeddings.embed_documents(texts) text_embedding_pairs = list(zip(texts, text_embeddings)) faiss = Hologres.from_embeddings(text_embedding_pairs, embeddings) """ texts = [t[0] for t in text_embeddings] embeddings = [t[1] for t in text_embeddings] return cls.__from( texts, embeddings, embedding, metadatas=metadatas, ids=ids, ndims=ndims, table_name=table_name, pre_delete_table=pre_delete_table, **kwargs, )
[docs] @classmethod def from_existing_index( cls: Type[Hologres], embedding: Embeddings, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, pre_delete_table: bool = False, **kwargs: Any, ) -> Hologres: """获取现有Hologres存储的实例。此方法将返回存储的实例,而不会插入任何新的嵌入。 """ connection_string = cls.get_connection_string(kwargs) store = cls( connection_string=connection_string, ndims=ndims, table_name=table_name, embedding_function=embedding, pre_delete_table=pre_delete_table, ) return store
[docs] @classmethod def get_connection_string(cls, kwargs: Dict[str, Any]) -> str: connection_string: str = get_from_dict_or_env( data=kwargs, key="connection_string", env_key="HOLOGRES_CONNECTION_STRING", ) if not connection_string: raise ValueError( "Hologres connection string is required" "Either pass it as a parameter" "or set the HOLOGRES_CONNECTION_STRING environment variable." "Create the connection string by calling" "HologresVector.connection_string_from_db_params" ) return connection_string
[docs] @classmethod def from_documents( cls: Type[Hologres], documents: List[Document], embedding: Embeddings, ndims: int = ADA_TOKEN_COUNT, table_name: str = _LANGCHAIN_DEFAULT_TABLE_NAME, ids: Optional[List[str]] = None, pre_delete_collection: bool = False, **kwargs: Any, ) -> Hologres: """返回从文档和嵌入初始化的VectorStore。 需要Hologres连接字符串 “可以通过参数传递 或设置HOLOGRES_CONNECTION_STRING环境变量。 通过调用创建连接字符串 HologresVector.connection_string_from_db_params """ texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] connection_string = cls.get_connection_string(kwargs) kwargs["connection_string"] = connection_string return cls.from_texts( texts=texts, pre_delete_collection=pre_delete_collection, embedding=embedding, metadatas=metadatas, ids=ids, ndims=ndims, table_name=table_name, **kwargs, )
[docs] @classmethod def connection_string_from_db_params( cls, host: str, port: int, database: str, user: str, password: str, ) -> str: """从数据库参数返回连接字符串。""" return ( f"dbname={database} user={user} password={password} host={host} port={port}" )