class SemaDB(VectorStore):
    """`SemaDB` vector store.

    This vector store is a wrapper around the SemaDB database. All calls are
    plain HTTP requests against ``BASE_URL`` using the RapidAPI headers built
    by :attr:`headers`.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import SemaDB

            db = SemaDB('mycollection', 768, embeddings, DistanceStrategy.COSINE)

    """

    # RapidAPI host name; also sent verbatim in the "X-RapidAPI-Host" header.
    HOST: str = "semadb.p.rapidapi.com"
    # Root URL every endpoint path is appended to.
    BASE_URL = "https://" + HOST

    def __init__(
        self,
        collection_name: str,
        vector_size: int,
        embedding: Embeddings,
        distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE,
        api_key: str = "",
    ):
        """Initialize the SemaDB vector store.

        Args:
            collection_name: Identifier of the collection to operate on.
            vector_size: Expected dimensionality of every embedding.
            embedding: Embedding model used for documents and queries.
            distance_strategy: Distance metric of the collection.
            api_key: RapidAPI key. When empty, the key is looked up through
                ``get_from_env("api_key", "SEMADB_API_KEY")``.
        """
        self.collection_name = collection_name
        self.vector_size = vector_size
        self.api_key = api_key or get_from_env("api_key", "SEMADB_API_KEY")
        self._embedding = embedding
        self.distance_strategy = distance_strategy

    @property
    def headers(self) -> dict:
        """Return the common headers sent with every request."""
        return {
            "content-type": "application/json",
            "X-RapidAPI-Key": self.api_key,
            "X-RapidAPI-Host": SemaDB.HOST,
        }

    def _get_internal_distance_strategy(self) -> str:
        """Return the SemaDB metric name for ``self.distance_strategy``.

        Returns:
            ``"euclidean"``, ``"dot"`` or ``"cosine"``.

        Raises:
            ValueError: If the strategy is unsupported (max inner product,
                Jaccard) or not recognised at all.
        """
        if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            return "euclidean"
        elif self.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            raise ValueError("Max inner product is not supported by SemaDB")
        elif self.distance_strategy == DistanceStrategy.DOT_PRODUCT:
            return "dot"
        elif self.distance_strategy == DistanceStrategy.JACCARD:
            # The message previously (wrongly) named max inner product here.
            raise ValueError("Jaccard is not supported by SemaDB")
        elif self.distance_strategy == DistanceStrategy.COSINE:
            return "cosine"
        else:
            raise ValueError(f"Unknown distance strategy {self.distance_strategy}")

    def create_collection(self) -> bool:
        """Create the corresponding collection in SemaDB.

        Returns:
            True if the server answered with HTTP 200, False otherwise.

        Raises:
            ValueError: If the configured distance strategy is unsupported.
        """
        payload = {
            "id": self.collection_name,
            "vectorSize": self.vector_size,
            "distanceMetric": self._get_internal_distance_strategy(),
        }
        response = requests.post(
            SemaDB.BASE_URL + "/collections",
            json=payload,
            headers=self.headers,
        )
        return response.status_code == 200

    def delete_collection(self) -> bool:
        """Delete the corresponding collection in SemaDB.

        Returns:
            True if the server answered with HTTP 200, False otherwise.
        """
        response = requests.delete(
            SemaDB.BASE_URL + f"/collections/{self.collection_name}",
            headers=self.headers,
        )
        return response.status_code == 200

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add them to the vector store.

        Args:
            texts: Texts to embed and store.
            metadatas: Optional metadata dicts paired positionally with
                ``texts``. The text itself is stored under the ``"text"``
                metadata key and overrides any such key in ``metadatas``.
            batch_size: Maximum number of points sent per HTTP request.
            **kwargs: Unused.

        Returns:
            The generated UUID string ids, in insertion order. An empty list
            is returned (and no request is made) when ``texts`` is empty.

        Raises:
            ValueError: If the embedding size differs from ``vector_size``,
                if the server does not answer with HTTP 200, or if it reports
                failed ranges.
        """
        if not isinstance(texts, list):
            texts = list(texts)
        # Nothing to embed or insert; also avoids indexing embeddings[0] below.
        if not texts:
            return []
        embeddings = self._embedding.embed_documents(texts)
        # Check dimensions
        if len(embeddings[0]) != self.vector_size:
            raise ValueError(
                f"Embedding size mismatch {len(embeddings[0])} != {self.vector_size}"
            )
        # Normalise if needed
        if self.distance_strategy == DistanceStrategy.COSINE:
            embed_matrix = np.array(embeddings)
            embed_matrix = embed_matrix / np.linalg.norm(
                embed_matrix, axis=1, keepdims=True
            )
            embeddings = cast(List[List[float]], embed_matrix.tolist())
        # Create points
        ids: List[str] = []
        points = []
        if metadatas is not None:
            # NOTE(review): zip stops at the shortest input, so a metadatas
            # list shorter than texts silently drops the trailing texts.
            for text, embedding, metadata in zip(texts, embeddings, metadatas):
                new_id = str(uuid4())
                ids.append(new_id)
                points.append(
                    {
                        "id": new_id,
                        "vector": embedding,
                        "metadata": {**metadata, "text": text},
                    }
                )
        else:
            for text, embedding in zip(texts, embeddings):
                new_id = str(uuid4())
                ids.append(new_id)
                points.append(
                    {
                        "id": new_id,
                        "vector": embedding,
                        "metadata": {"text": text},
                    }
                )
        # Insert points in batches
        for i in range(0, len(points), batch_size):
            batch = points[i : i + batch_size]
            response = requests.post(
                SemaDB.BASE_URL + f"/collections/{self.collection_name}/points",
                json={"points": batch},
                headers=self.headers,
            )
            if response.status_code != 200:
                raise ValueError(f"Error adding points: {response.text}")
            failed_ranges = response.json()["failedRanges"]
            if failed_ranges:
                raise ValueError(f"Error adding points: {failed_ranges}")
        # Return ids
        return ids

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding model used by this store."""
        return self._embedding

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete by vector ID or other criteria.

        Args:
            ids: List of ids to delete.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if the server answered with HTTP 200 and
            reported no failed points, False otherwise.
        """
        # NOTE(review): ids=None is forwarded as a JSON null; how the server
        # treats that is not visible from here.
        payload = {
            "ids": ids,
        }
        response = requests.delete(
            SemaDB.BASE_URL + f"/collections/{self.collection_name}/points",
            json=payload,
            headers=self.headers,
        )
        return response.status_code == 200 and len(response.json()["failedPoints"]) == 0

    def _search_points(self, embedding: List[float], k: int = 4) -> List[dict]:
        """Search the collection for the points closest to ``embedding``.

        Args:
            embedding: Query vector.
            k: Maximum number of points requested (sent as ``"limit"``).

        Returns:
            The ``"points"`` list from the JSON response.

        Raises:
            ValueError: If the server does not answer with HTTP 200.
        """
        # Normalise if needed
        if self.distance_strategy == DistanceStrategy.COSINE:
            vec = np.array(embedding)
            vec = vec / np.linalg.norm(vec)
            embedding = cast(List[float], vec.tolist())
        # Perform search request
        payload = {
            "vector": embedding,
            "limit": k,
        }
        response = requests.post(
            SemaDB.BASE_URL + f"/collections/{self.collection_name}/points/search",
            json=payload,
            headers=self.headers,
        )
        if response.status_code != 200:
            raise ValueError(f"Error searching: {response.text}")
        return response.json()["points"]

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to embed and search with.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Unused.

        Returns:
            List of Documents most similar to the query.
        """
        query_embedding = self._embedding.embed_query(query)
        return self.similarity_search_by_vector(query_embedding, k=k)

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search with distance.

        Args:
            query: Text to embed and search with.
            k: Number of results to return. Defaults to 4.
            **kwargs: Unused.

        Returns:
            List of ``(Document, distance)`` tuples, where the distance is the
            ``"distance"`` field of each returned point.
        """
        query_embedding = self._embedding.embed_query(query)
        points = self._search_points(query_embedding, k=k)
        return [
            (
                Document(page_content=p["metadata"]["text"], metadata=p["metadata"]),
                p["distance"],
            )
            for p in points
        ]

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Unused.

        Returns:
            List of Documents most similar to the query vector.
        """
        points = self._search_points(embedding, k=k)
        return [
            Document(page_content=p["metadata"]["text"], metadata=p["metadata"])
            for p in points
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "",
        vector_size: int = 0,
        api_key: str = "",
        distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE,
        **kwargs: Any,
    ) -> "SemaDB":
        """Return VectorStore initialized from texts and embeddings.

        Creates the collection and then inserts ``texts`` into it.

        Args:
            texts: Texts to embed and store.
            embedding: Embedding model used for documents and queries.
            metadatas: Optional metadata dicts paired positionally with texts.
            collection_name: Identifier of the collection to create. Required.
            vector_size: Dimensionality of the embeddings. Required.
            api_key: RapidAPI key. Required.
            distance_strategy: Distance metric of the new collection.
            **kwargs: Unused.

        Returns:
            The populated ``SemaDB`` instance.

        Raises:
            ValueError: If a required argument is missing or falsy, or if the
                collection could not be created.
        """
        if not collection_name:
            raise ValueError("Collection name must be provided")
        if not vector_size:
            raise ValueError("Vector size must be provided")
        if not api_key:
            raise ValueError("API key must be provided")
        semadb = cls(
            collection_name,
            vector_size,
            embedding,
            distance_strategy=distance_strategy,
            api_key=api_key,
        )
        if not semadb.create_collection():
            raise ValueError("Error creating collection")
        semadb.add_texts(texts, metadatas=metadatas)
        return semadb