from __future__ import annotations
import json
import logging
from typing import Any, List, Optional, Tuple
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
logger = logging.getLogger(__name__)
class Jaguar(VectorStore):
    """`Jaguar API` vector store.

    See http://www.jaguardb.com
    See http://github.com/fserv/jaguar-sdk

    Example:
        .. code-block:: python

            from langchain_community.vectorstores.jaguar import Jaguar

            vectorstore = Jaguar(
                pod = 'vdb',
                store = 'mystore',
                vector_index = 'v',
                vector_type = 'cosine_fraction_float',
                vector_dimension = 1536,
                url='http://192.168.8.88:8080/fwww/',
                embedding=openai_model
            )
    """
def __init__(
    self,
    pod: str,
    store: str,
    vector_index: str,
    vector_type: str,
    vector_dimension: int,
    url: str,
    embedding: Embeddings,
):
    """Record the store coordinates and create the Jaguar HTTP client.

    Args:
        pod: Name of the pod; combined with ``store`` as ``pod.store`` in
            the queries built by this class.
        store: Name of the vector store inside the pod.
        vector_index: Name of the vector column used in queries.
        vector_type: Vector type string placed verbatim into queries,
            e.g. ``'cosine_fraction_float'``.
        vector_dimension: Dimension declared when the store is created.
        url: URL handed to ``JaguarHttpClient``.
        embedding: Embedding model used for texts and queries.

    Raises:
        ImportError: If the ``jaguardb-http-client`` package is missing.
    """
    self._pod = pod
    self._store = store
    self._vector_index = vector_index
    self._vector_type = vector_type
    self._vector_dimension = vector_dimension
    self._embedding = embedding
    try:
        from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "Could not import jaguardb-http-client python package. "
            "Please install it with `pip install -U jaguardb-http-client`"
        ) from err
    self._jag = JaguarHttpClient(url)
    # No session yet; login() fills this in.
    self._token = ""
def login(
    self,
    jaguar_api_key: Optional[str] = "",
) -> bool:
    """Log in to the JaguarDB server and remember the session token.

    Args:
        jaguar_api_key: API key of the user. When it is empty or ``None``
            the key is obtained from the HTTP client's ``getApiKey()``.

    Returns:
        True if the server returned a non-empty token, False otherwise.
    """
    # The signature allows None, so treat it like the empty default instead
    # of forwarding None to the client.
    if not jaguar_api_key:
        jaguar_api_key = self._jag.getApiKey()
    self._jaguar_api_key = jaguar_api_key
    self._token = self._jag.login(jaguar_api_key)
    if self._token == "":
        logger.error("E0001 error init(): invalid jaguar_api_key")
        return False
    return True
def create(
    self,
    metadata_str: str,
    text_size: int,
) -> None:
    """Create the vector store on the backend database.

    The statement always declares the vector column, a ``source char(256)``
    column and a ``v:text char(text_size)`` column, followed by the
    caller-supplied column definitions.

    Args:
        metadata_str: Column names and types, appended verbatim inside the
            column list.
        text_size: Size used for the ``v:text`` char column.

    Returns:
        None. The response of the query is discarded.
    """
    # The source column and the v:text column are required.
    pieces = [
        "create store ",
        self._pod + "." + self._store,
        f" ({self._vector_index} vector({self._vector_dimension},",
        f" '{self._vector_type}'),",
        f" source char(256), v:text char({text_size}),",
        metadata_str + ")",
    ]
    self.run("".join(pieces))
def run(self, query: str, withFile: bool = False) -> dict:
    """Run any query statement in JaguarDB.

    Args:
        query: Query statement to send.
        withFile: Forwarded to the HTTP client's ``post`` call.

    Returns:
        The decoded JSON body of the response. An empty dict is returned
        when there is no session token or when the body cannot be decoded.
    """
    if self._token == "":
        logger.error(f"E0005 error run({query})")
        return {}

    response = self._jag.post(query, self._token, withFile)
    body = response.text
    try:
        decoded = json.loads(body)
    except Exception:
        return {}
    else:
        return decoded
@property
def embeddings(self) -> Optional[Embeddings]:
    """Return the embedding model this store was constructed with."""
    return self._embedding
def add_texts(  # type: ignore[override]
    self,
    texts: List[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed texts and insert them, one row per text, into the vector store.

    Args:
        texts: Strings to add to the jaguar vector store.
        metadatas: Optional list of dicts, one per text, mapping column
            names to values, e.g.
            [{"m1": "v11", "m2": "v12", "m3": "v13", "filecol": "path_file1.jpg"},
             {"m1": "v21", "m2": "v22", "m3": "v23", "filecol": "path_file2.jpg"}]
        kwargs:
            file_column: Metadata key holding the path of a file to upload
                with each row. Only used when ``metadatas`` is given.
            text_tag: Prefix prepended (followed by a space) to every text
                before it is embedded and stored.

    Returns:
        The "zid" values from the insert responses. An empty list is
        returned when the text column lookup fails or a file upload is
        rejected.

    Raises:
        KeyError: If an insert response carries no "zid".
        IndexError: If ``metadatas`` is shorter than ``texts``.
    """
    vcol = self._vector_index
    filecol = kwargs.get("file_column", "")
    text_tag = kwargs.get("text_tag", "")
    podstore = self._pod + "." + self._store

    js = self.run("textcol " + podstore + "." + vcol)
    # run() signals failure with an empty dict (never ""), so look for the
    # payload itself rather than comparing against a string.
    if not isinstance(js, dict) or "data" not in js:
        return []
    textcol = js["data"]

    texts = list(texts)
    if text_tag != "":
        texts = [text_tag + " " + t for t in texts]
    embeddings = self._embedding.embed_documents(texts)

    ids = []
    for i, (text, vec) in enumerate(zip(texts, embeddings)):
        names: List[str] = []
        values: List[str] = []
        with_file = False
        if metadatas is not None:
            names, values, filepath = self._parseMeta(metadatas[i], filecol)
            if filecol != "":
                # The file has to be uploaded before the row that refers to it.
                if not self._jag.postFile(self._token, filepath, 1):
                    return []
                with_file = True

        # Metadata columns first, then the vector column, then the text column.
        columns = list(names) + [vcol, textcol]
        row = list(values) + [",".join(str(x) for x in vec), text]
        # Escape single quotes in every value, not only in the text, so a
        # quote inside a metadata value cannot break the statement.
        quoted = ",".join("'" + v.replace("'", "\\'") + "'" for v in row)
        q = "insert into " + podstore + " (" + ",".join(columns) + ")"
        q += " values (" + quoted + ")"

        js = self.run(q, with_file)
        ids.append(js["zid"])
    return ids
def similarity_search_with_score(
    self,
    query: str,
    k: int = 3,
    fetch_k: int = -1,
    where: Optional[str] = None,
    args: Optional[str] = None,
    metadatas: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the Jaguar documents most similar to the query, with scores.

    Args:
        query: Text to look up similar documents for.
        k: Number of documents to return, sent as ``topk``. Defaults to 3.
        fetch_k: Value sent as the ``fetch_k`` option. Defaults to -1.
        where: Where clause appended to the select statement, e.g.
            "rating > 3.0 and (state = 'NV' or state = 'CA')".
        args: Extra options appended verbatim to the similarity option
            string.
        metadatas: Names of metadata columns to request; each is copied
            from the result row into the document metadata.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of ``(doc, score)`` tuples. Each document's metadata holds the
        row "zid" plus the requested metadata columns. The list is empty
        when the server response is not a list of rows.
    """
    vcol = self._vector_index
    vtype = self._vector_type
    qv_comma = ",".join(str(f) for f in self._embedding.embed_query(query))
    podstore = self._pod + "." + self._store

    options = [
        "topk=" + str(k),
        "fetch_k=" + str(fetch_k),
        "type=" + vtype,
        "with_score=yes",
        "with_text=yes",
    ]
    if args is not None:
        options.append(args)
    if metadatas is not None:
        options.append("metadata=" + "&".join(metadatas))

    q = "select similarity(" + vcol + ",'" + qv_comma + "','"
    q += ",".join(options) + "') from " + podstore
    if where is not None:
        q += " where " + where

    jarr = self.run(q)
    # run() never returns None: failures come back as {} and error payloads
    # as dicts. Only a list can be iterated as result rows.
    if not isinstance(jarr, list):
        return []

    docs_with_score = []
    for js in jarr:
        md = {"zid": js["zid"]}
        if metadatas is not None:
            for m in metadatas:
                md[m] = js[m]
        doc = Document(page_content=js["text"], metadata=md)
        docs_with_score.append((doc, js["score"]))
    return docs_with_score
def similarity_search(
    self,
    query: str,
    k: int = 3,
    where: Optional[str] = None,
    metadatas: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return the Jaguar documents most similar to the query.

    Args:
        query: Text to look up similar documents for.
        k: Number of documents to return. Defaults to 3.
        where: Where clause for the similarity select, e.g.
            "rating > 3.0 and (state = 'NV' or state = 'CA')".
        metadatas: Forwarded to ``similarity_search_with_score``.
        kwargs: Forwarded to ``similarity_search_with_score``.

    Returns:
        The documents from ``similarity_search_with_score``, in the same
        order, with the scores dropped.
    """
    scored = self.similarity_search_with_score(
        query, k=k, where=where, metadatas=metadatas, **kwargs
    )
    documents = []
    for document, _score in scored:
        documents.append(document)
    return documents
def is_anomalous(
    self,
    query: str,
    **kwargs: Any,
) -> bool:
    """Check whether the given text is anomalous with respect to the store.

    Args:
        query: Text to be checked.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        True if the first result row reports ``"anomalous": "YES"``.
        False otherwise, including when the query yields no result rows.
    """
    vcol = self._vector_index
    vtype = self._vector_type
    qv_comma = ",".join(str(f) for f in self._embedding.embed_query(query))
    podstore = self._pod + "." + self._store
    q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
    q += " from " + podstore

    js = self.run(q)
    # run() reports failure as {}, which must not be indexed with [0];
    # only a non-empty list carries a verdict.
    if not isinstance(js, list) or len(js) == 0:
        return False
    # Each row arrives as a JSON-encoded string.
    jd = json.loads(js[0])
    return jd["anomalous"] == "YES"
@classmethod
def from_texts(  # type: ignore[override]
    cls,
    texts: List[str],
    embedding: Embeddings,
    url: str,
    pod: str,
    store: str,
    vector_index: str,
    vector_type: str,
    vector_dimension: int,
    metadatas: Optional[List[dict]] = None,
    jaguar_api_key: Optional[str] = "",
    **kwargs: Any,
) -> Jaguar:
    """Build a store object, log in, empty the store and add the texts.

    Note that any records already in the store are removed (``clear()``)
    before the new texts are inserted.

    Args:
        texts: Strings to add.
        embedding: Embedding model used for the texts.
        url: URL handed to the constructor.
        pod: Name of the pod.
        store: Name of the vector store.
        vector_index: Name of the vector column.
        vector_type: Vector type string.
        vector_dimension: Dimension of the vectors.
        metadatas: Optional per-text metadata dicts, passed to ``add_texts``.
        jaguar_api_key: API key passed to ``login``.
        kwargs: Forwarded to ``add_texts``.

    Returns:
        The logged-in store instance holding the new texts.

    Raises:
        ValueError: If the login is rejected.
    """
    jagstore = cls(
        pod, store, vector_index, vector_type, vector_dimension, url, embedding
    )
    # Without a session every later query fails, so stop here with a clear
    # message instead of failing later inside add_texts.
    if not jagstore.login(jaguar_api_key):
        raise ValueError(
            "Jaguar login failed: invalid or missing jaguar_api_key"
        )
    jagstore.clear()
    jagstore.add_texts(texts, metadatas, **kwargs)
    return jagstore
def clear(self) -> None:
    """Delete all records in the store.

    Sends a ``truncate store`` statement for ``pod.store``; the response
    is discarded.
    """
    target = self._pod + "." + self._store
    self.run("truncate store " + target)
def delete(self, zids: List[str], **kwargs: Any) -> None:  # type: ignore[override]
    """Delete records from the store by their zid values.

    One ``delete`` statement is sent per zid.

    Args:
        zids: The zid of each record to delete, as strings.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    target = self._pod + "." + self._store
    prefix = "delete from " + target + " where zid='"
    for zid in zids:
        self.run(prefix + zid + "'")
def count(self) -> int:
    """Count the records in the store.

    Returns:
        The number of records reported by the server, or 0 when the query
        yields no result rows.
    """
    podstore = self._pod + "." + self._store
    js = self.run("select count() from " + podstore)
    # run() reports failure as {}, which must not be indexed with [0];
    # only a non-empty list carries the count.
    if not isinstance(js, list) or len(js) == 0:
        return 0
    # Each row arrives as a JSON-encoded string.
    jd = json.loads(js[0])
    return int(jd["data"])
def drop(self) -> None:
    """Drop the store from JaguarDB.

    Sends a ``drop store`` statement for ``pod.store``; the response is
    discarded.
    """
    target = self._pod + "." + self._store
    self.run("drop store " + target)
def logout(self) -> None:
    """Log out to clean up resources.

    Passes the current session token to the HTTP client's ``logout``.
    """
    session_token = self._token
    self._jag.logout(session_token)
def prt(self, msg: str, path: str = "/tmp/debugjaguar.log") -> None:
    """Append a debug message to a log file.

    Args:
        msg: Message to record; written as ``msg=<msg>`` on its own line.
        path: File to append to. Defaults to the original fixed location.
    """
    # Explicit UTF-8 so non-ASCII messages do not depend on the locale.
    with open(path, "a", encoding="utf-8") as file:
        print(f"msg={msg}", file=file, flush=True)  # noqa: T201
def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[str], str]:
    """Split a metadata dict into column names, string values and a file path.

    Args:
        nvmap: Mapping of column name to value for one record.
        filecol: Name of the file column, or "" when there is none.

    Returns:
        A ``(names, values, filepath)`` tuple. ``values`` are converted
        with ``str``. When ``filecol`` is non-empty and present in
        ``nvmap`` it is listed first and ``filepath`` is its raw value;
        otherwise ``filepath`` is "".
    """
    if filecol == "":
        ordered = list(nvmap.items())
        filepath = ""
    else:
        has_file = filecol in nvmap
        filepath = nvmap[filecol] if has_file else ""
        # The file column leads, the remaining columns keep their order.
        ordered = [(filecol, nvmap[filecol])] if has_file else []
        ordered += [(key, val) for key, val in nvmap.items() if key != filecol]

    names = [key for key, _ in ordered]
    values = [str(val) for _, val in ordered]
    return names, values, filepath