# Source code for langchain_community.vectorstores.jaguar

from __future__ import annotations

import json
import logging
from typing import Any, List, Optional, Tuple

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger(__name__)


class Jaguar(VectorStore):
    """Vector store backed by a JaguarDB server reached through its HTTP client.

    See http://www.jaguardb.com
    See http://github.com/fserv/jaguar-sdk

    Example:
        .. code-block:: python

            from langchain_community.vectorstores.jaguar import Jaguar

            vectorstore = Jaguar(
                pod = 'vdb',
                store = 'mystore',
                vector_index = 'v',
                vector_type = 'cosine_fraction_float',
                vector_dimension = 1536,
                url='http://192.168.8.88:8080/fwww/',
                embedding=openai_model
            )
    """

    def __init__(
        self,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        url: str,
        embedding: Embeddings,
    ):
        """Remember the store coordinates and create the HTTP client.

        No network login happens here; call :meth:`login` before running
        queries.

        Args:
            pod: Name of the pod; used as the prefix of ``pod.store``.
            store: Name of the vector store inside the pod.
            vector_index: Name of the vector column.
            vector_type: Vector type string placed into the generated
                statements (for example ``cosine_fraction_float``).
            vector_dimension: Dimension used when creating the vector column.
            url: Endpoint handed to ``JaguarHttpClient``.
            embedding: Embedding model used for documents and queries.

        Raises:
            ImportError: If ``jaguardb-http-client`` is not installed.
        """
        self._pod = pod
        self._store = store
        self._vector_index = vector_index
        self._vector_type = vector_type
        self._vector_dimension = vector_dimension
        self._embedding = embedding

        try:
            from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
        except ImportError as err:
            # Chain the original failure so the real import problem stays visible.
            raise ImportError(
                "Could not import jaguardb-http-client python package. "
                "Please install it with `pip install -U jaguardb-http-client`"
            ) from err

        self._jag = JaguarHttpClient(url)
        # An empty token means "not logged in"; run() refuses to send queries.
        self._token = ""

    @staticmethod
    def _escape(value: str) -> str:
        """Backslash-escape single quotes so *value* can sit inside '...'."""
        return value.replace("'", "\\'")

    def login(
        self,
        jaguar_api_key: Optional[str] = "",
    ) -> bool:
        """Log in to the JaguarDB server and store the session token.

        Args:
            jaguar_api_key: API key of the user. When empty or ``None`` the
                key is obtained from the HTTP client's ``getApiKey()``.

        Returns:
            True if a token was obtained, False otherwise.
        """
        if not jaguar_api_key:
            jaguar_api_key = self._jag.getApiKey()
        self._jaguar_api_key = jaguar_api_key
        self._token = self._jag.login(jaguar_api_key)
        if not self._token:
            logger.error("E0001 error init(): invalid jaguar_api_key")
            return False
        return True

    def create(
        self,
        metadata_str: str,
        text_size: int,
    ) -> None:
        """Create the vector store on the backend database.

        The generated statement always contains the vector column, a
        ``source char(256)`` column and a ``v:text char(text_size)`` column.

        Args:
            metadata_str: Extra column definitions (names and types) appended
                to the column list. May be empty when there are none.
            text_size: Size of the ``v:text`` char column.
        """
        podstore = self._pod + "." + self._store
        q = (
            f"create store {podstore}"
            f" ({self._vector_index} vector({self._vector_dimension},"
            f" '{self._vector_type}'),"
            f" source char(256), v:text char({text_size})"
        )
        # Only add the separator when there are extra columns, so an empty
        # metadata_str does not leave a dangling comma before ")".
        if metadata_str.strip():
            q += "," + metadata_str
        q += ")"
        self.run(q)

    def run(self, query: str, withFile: bool = False) -> dict:
        """Run any query statement in JaguarDB.

        Args:
            query: Statement to send to the server.
            withFile: Forwarded to the HTTP client's ``post`` call; set by
                :meth:`add_texts` when a file column is being inserted.

        Returns:
            The JSON-decoded response. An empty dict is returned when there is
            no login token or the response body is not valid JSON.
            NOTE(review): callers in this class treat some responses as
            lists, so the decoded value is not always a dict.
        """
        if self._token == "":
            logger.error("E0005 error run(%s)", query)
            return {}

        resp = self._jag.post(query, self._token, withFile)
        txt = resp.text
        try:
            return json.loads(txt)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string body.
            logger.warning("Jaguar response could not be decoded as JSON")
            return {}

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Embedding model given to the constructor."""
        return self._embedding

    def add_texts(  # type: ignore[override]
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and insert them into the vector store.

        Args:
            texts: Text strings to add to the jaguar vector store.
            metadatas: Optional list of metadata dicts, one per text, e.g.
                [{"m1": "v11", "m2": "v12", "m3": "v13", "filecol": "path_file1.jpg" },
                 {"m1": "v21", "m2": "v22", "m3": "v23", "filecol": "path_file2.jpg" }]
                Keys are used as column names and values as column values.
            kwargs:
                file_column: Name of the metadata key holding a file path.
                    When set (and metadatas is given), that file is uploaded
                    before each insert.
                text_tag: Prefix prepended, followed by a space, to every
                    text before embedding and insertion.

        Returns:
            List of the ``zid`` values returned for the inserted rows. An
            empty list is returned when the text column cannot be looked up
            or a file upload fails.

        Raises:
            ValueError: If metadatas is given with a different length than
                texts.
            KeyError: If an insert response carries no ``zid``.
        """
        vcol = self._vector_index
        filecol = kwargs.get("file_column", "")
        text_tag = kwargs.get("text_tag", "")

        texts = list(texts)
        if metadatas is not None and len(metadatas) != len(texts):
            raise ValueError(
                f"metadatas has {len(metadatas)} entries but texts has {len(texts)}"
            )
        if not texts:
            return []

        podstore = self._pod + "." + self._store

        # Ask the server which column holds the text of this vector index.
        js = self.run("textcol " + podstore + "." + vcol)
        if not isinstance(js, dict) or "data" not in js:
            # run() signals failure with {}, never with "".
            logger.error("Could not look up the text column of %s.%s", podstore, vcol)
            return []
        textcol = js["data"]

        if text_tag != "":
            texts = [text_tag + " " + t for t in texts]

        embeddings = self._embedding.embed_documents(texts)

        # Files are only uploaded and flagged when metadata rows exist.
        with_file = metadatas is not None and filecol != ""

        ids = []
        for i, (text, vec) in enumerate(zip(texts, embeddings)):
            columns: List[str] = []
            values: List[str] = []
            if metadatas is not None:
                names, meta_values, filepath = self._parseMeta(metadatas[i], filecol)
                if filecol != "":
                    uploaded = self._jag.postFile(self._token, filepath, 1)
                    if not uploaded:
                        logger.error("File upload failed for %r", filepath)
                        return []
                columns.extend(names)
                values.extend(meta_values)

            # Layout: metadata columns, then the vector, then the text.
            columns.extend([vcol, textcol])
            values.extend([",".join(str(x) for x in vec), text])

            # Every value is quote-escaped, not only the text, so metadata
            # containing a single quote cannot break out of its literal.
            quoted = ",".join("'" + self._escape(v) + "'" for v in values)
            q = (
                "insert into " + podstore
                + " (" + ",".join(columns) + ") values (" + quoted + ")"
            )
            js = self.run(q, with_file)
            ids.append(js["zid"])

        return ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 3,
        fetch_k: int = -1,
        where: Optional[str] = None,
        args: Optional[str] = None,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the Jaguar documents most similar to the query, with scores.

        Args:
            query: Text to look up similar documents for.
            k: Number of documents to return (sent as ``topk``). Defaults to 3.
            fetch_k: Sent as the ``fetch_k`` option. Defaults to -1.
            where: Where clause appended verbatim to the select statement,
                e.g. "rating > 3.0 and (state = 'NV' or state = 'CA')".
                It is not escaped, so it must come from a trusted source.
            args: Extra comma-separated options appended to the similarity
                option string.
            metadatas: Names of metadata columns to request; each one is
                copied from the result row into the document metadata.
            kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of tuples (doc, similarity_score):
                [(doc, score), (doc, score), ...]
            Each document's metadata contains at least ``zid``. An empty
            list is returned when the server response is not a list.
        """
        vcol = self._vector_index
        vtype = self._vector_type
        query_vector = self._embedding.embed_query(query)
        qv_comma = ",".join(str(f) for f in query_vector)
        podstore = self._pod + "." + self._store

        q = (
            f"select similarity({vcol},'{qv_comma}',"
            f"'topk={k},fetch_k={fetch_k},type={vtype}"
            ",with_score=yes,with_text=yes"
        )
        if args is not None:
            q += "," + args
        if metadatas is not None:
            q += ",metadata=" + "&".join(metadatas)
        q += "') from " + podstore
        if where is not None:
            q += " where " + where

        jarr = self.run(q)
        if not isinstance(jarr, list):
            # run() yields {} on failure; iterating a dict would walk its keys.
            return []

        docs_with_score = []
        for js in jarr:
            md = {"zid": js["zid"]}
            if metadatas is not None:
                for m in metadatas:
                    md[m] = js[m]
            doc = Document(page_content=js["text"], metadata=md)
            docs_with_score.append((doc, js["score"]))

        return docs_with_score

    def similarity_search(
        self,
        query: str,
        k: int = 3,
        where: Optional[str] = None,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the Jaguar documents most similar to the query.

        Thin wrapper around :meth:`similarity_search_with_score` that drops
        the scores. The base class declares this method abstract, so it must
        exist for the store to be instantiable.

        Args:
            query: Text to look up similar documents for.
            k: Number of documents to return. Defaults to 3.
            where: Where clause appended verbatim to the select statement.
            metadatas: Names of metadata columns to request.
            kwargs: Forwarded to :meth:`similarity_search_with_score`
                (for example ``fetch_k`` or ``args``).

        Returns:
            List of documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, where=where, metadatas=metadatas, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    def is_anomalous(
        self,
        query: str,
        **kwargs: Any,
    ) -> bool:
        """Detect whether the given text is anomalous within the dataset.

        Args:
            query: Text to check.
            kwargs: Accepted for interface compatibility; not used here.

        Returns:
            True if the server answers ``"YES"`` for ``anomalous``; False
            otherwise, including when the response is empty or not a list.
        """
        vcol = self._vector_index
        vtype = self._vector_type
        query_vector = self._embedding.embed_query(query)
        qv_comma = ",".join(str(f) for f in query_vector)
        podstore = self._pod + "." + self._store

        q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
        q += " from " + podstore

        js = self.run(q)
        if not isinstance(js, list) or not js:
            # Covers the {} failure value of run() as well as an empty result.
            return False
        # Each element of the response list is itself a JSON document.
        jd = json.loads(js[0])
        return jd["anomalous"] == "YES"

    @classmethod
    def from_texts(  # type: ignore[override]
        cls,
        texts: List[str],
        embedding: Embeddings,
        url: str,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        metadatas: Optional[List[dict]] = None,
        jaguar_api_key: Optional[str] = "",
        **kwargs: Any,
    ) -> Jaguar:
        """Build a store object, log in, and load it with the given texts.

        WARNING: the existing records of ``pod.store`` are truncated via
        :meth:`clear` before the new texts are added.

        Args:
            texts: Text strings to add.
            embedding: Embedding model to use.
            url: Endpoint handed to the HTTP client.
            pod: Name of the pod.
            store: Name of the vector store.
            vector_index: Name of the vector column.
            vector_type: Vector type string.
            vector_dimension: Dimension of the vectors.
            metadatas: Optional metadata dicts, one per text.
            jaguar_api_key: API key; see :meth:`login`.
            kwargs: Forwarded to :meth:`add_texts`.

        Returns:
            The logged-in store object. The result of the login is not
            checked here; failures are reported through the logger.
        """
        jagstore = cls(
            pod, store, vector_index, vector_type, vector_dimension, url, embedding
        )
        jagstore.login(jaguar_api_key)
        jagstore.clear()
        jagstore.add_texts(texts, metadatas, **kwargs)
        return jagstore

    def clear(self) -> None:
        """Delete all records of the store by issuing ``truncate store``."""
        podstore = self._pod + "." + self._store
        q = "truncate store " + podstore
        self.run(q)

    def delete(self, zids: List[str], **kwargs: Any) -> None:  # type: ignore[override]
        """Delete records by their zids, one statement per zid.

        Args:
            zids: List of zid strings identifying the records to delete.
            kwargs: Accepted for interface compatibility; not used here.
        """
        podstore = self._pod + "." + self._store
        for zid in zids:
            # Escape quotes so a zid cannot terminate its literal early.
            q = "delete from " + podstore + " where zid='" + self._escape(zid) + "'"
            self.run(q)

    def count(self) -> int:
        """Count the records of the store.

        Returns:
            Number of records reported by the server, or 0 when the response
            is empty or not a list.
        """
        podstore = self._pod + "." + self._store
        q = "select count() from " + podstore
        js = self.run(q)
        if not isinstance(js, list) or not js:
            # Covers the {} failure value of run() as well as an empty result.
            return 0
        # Each element of the response list is itself a JSON document.
        jd = json.loads(js[0])
        return int(jd["data"])

    def drop(self) -> None:
        """Drop the store by issuing ``drop store``."""
        podstore = self._pod + "." + self._store
        q = "drop store " + podstore
        self.run(q)

    def logout(self) -> None:
        """Log out of the server, passing the current token to the client."""
        self._jag.logout(self._token)

    def prt(self, msg: str) -> None:
        """Append a debug line ``msg=<msg>`` to ``/tmp/debugjaguar.log``."""
        with open("/tmp/debugjaguar.log", "a") as file:
            print(f"msg={msg}", file=file, flush=True)  # noqa: T201

    def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[str], str]:
        """Split one metadata dict into column names, string values, file path.

        Args:
            nvmap: Metadata mapping of column name to value.
            filecol: Key holding a file path, or "" when there is none.

        Returns:
            ``(names, values, filepath)``. Values are converted with
            ``str()``. When filecol is set and present in nvmap it is placed
            first and its raw value is returned as filepath; otherwise
            filepath is "".
        """
        filepath = ""
        if filecol == "":
            nvec = list(nvmap.keys())
            vvec = list(nvmap.values())
        else:
            nvec = []
            vvec = []
            if filecol in nvmap:
                # The file column goes first, ahead of the other columns.
                nvec.append(filecol)
                vvec.append(nvmap[filecol])
                filepath = nvmap[filecol]

            for k, v in nvmap.items():
                if k != filecol:
                    nvec.append(k)
                    vvec.append(v)

        vvec_s = [str(e) for e in vvec]
        return nvec, vvec_s, filepath