import importlib
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from langchain_core.vectorstores import VectorStore
class NeuralDBVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB.

    To use, you should have the ``thirdai[neural_db]`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBVectorStore
            from thirdai import neural_db as ndb

            db = ndb.NeuralDB()
            vectorstore = NeuralDBVectorStore(db=db)
    """

    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB instance.

        Args:
            db: The NeuralDB instance every method of this store delegates to.
        """
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    @staticmethod
    def _verify_thirdai_library(thirdai_key: Optional[str] = None):  # type: ignore[no-untyped-def]
        """Check that ``thirdai`` with NeuralDB is importable and activate the license.

        Args:
            thirdai_key: License key; falls back to the ``THIRDAI_KEY``
                environment variable when not given.

        Raises:
            ImportError: If ``thirdai`` or ``thirdai.neural_db`` cannot be found.
        """
        # A bare `import importlib` does not guarantee that the `util`
        # submodule is loaded, so import it explicitly before using it.
        import importlib.util

        try:
            from thirdai import licensing

            # find_spec returns None (rather than raising) when the parent
            # package exists but the submodule does not.
            if importlib.util.find_spec("thirdai.neural_db") is None:
                raise ImportError("thirdai.neural_db could not be located")

            licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))
        except ImportError as err:
            raise ImportError(
                "Could not import thirdai python package and neuraldb dependencies. "
                "Please install it with `pip install thirdai[neural_db]`."
            ) from err

    @classmethod
    def from_scratch(  # type: ignore[no-untyped-def, no-untyped-def]
        cls,
        thirdai_key: Optional[str] = None,
        **model_kwargs,
    ):
        """Create a NeuralDBVectorStore from scratch.

        To use, set the ``THIRDAI_KEY`` environment variable to your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_scratch(
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")

        Args:
            thirdai_key: Optional ThirdAI license key.
            **model_kwargs: Forwarded unchanged to the ``NeuralDB`` constructor.
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB(**model_kwargs))  # type: ignore[call-arg]

    @classmethod
    def from_checkpoint(  # type: ignore[no-untyped-def]
        cls,
        checkpoint: Union[str, Path],
        thirdai_key: Optional[str] = None,
    ):
        """Create a NeuralDBVectorStore with a base model from a saved checkpoint.

        To use, set the ``THIRDAI_KEY`` environment variable to your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_checkpoint(
                    checkpoint="/path/to/checkpoint.ndb",
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")

        Args:
            checkpoint: Path of the saved NeuralDB checkpoint to load.
            thirdai_key: Optional ThirdAI license key.
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB.from_checkpoint(checkpoint))  # type: ignore[call-arg]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "NeuralDBVectorStore":
        """Return a VectorStore initialized from texts.

        Args:
            texts: Texts to add to the new store.
            embedding: Accepted for interface compatibility; this method
                does not use it.
            metadatas: Optional list of metadata dicts, one per text.
            **kwargs: ``thirdai_key`` (if present) is passed to
                ``from_scratch``; everything else is passed to ``add_texts``.
        """
        model_kwargs = {}
        if "thirdai_key" in kwargs:
            model_kwargs["thirdai_key"] = kwargs.pop("thirdai_key")
        vectorstore = cls.from_scratch(**model_kwargs)
        vectorstore.add_texts(texts, metadatas, **kwargs)
        return vectorstore

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add more texts to the vectorstore.

        The texts (plus optional metadata columns) are written to a temporary
        CSV file which is then inserted as a NeuralDB CSV document.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters, forwarded to ``insert``.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        import pandas as pd
        from thirdai import neural_db as ndb

        # Materialize once: `texts` may be a one-shot iterator, and its
        # length is needed below to compute the returned ids.
        texts = list(texts)

        df = pd.DataFrame({"texts": texts})
        if metadatas:
            df = pd.concat([df, pd.DataFrame.from_records(metadatas)], axis=1)

        # `delete_on_close` only exists on Python 3.12+ and is redundant with
        # delete=False, so it is omitted to stay compatible with older
        # interpreters. newline="" follows pandas' guidance for text handles.
        # The file is intentionally left on disk (delete=False) because the
        # CSV document created below refers to it by path.
        temp = tempfile.NamedTemporaryFile("w", delete=False, newline="")
        with temp:
            # Closing the handle flushes the data before it is re-read by path.
            df.to_csv(temp)

        source_id = self.insert([ndb.CSV(temp.name)], **kwargs)[0]
        offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]
        return [str(offset + i) for i in range(len(texts))]

    @root_validator(allow_reuse=True)
    def validate_environments(cls, values: Dict) -> Dict:
        """Validate ThirdAI environment variables.

        Reads ``thirdai_key`` from ``values`` or the ``THIRDAI_KEY``
        environment variable and stores it back as a secret string.
        """
        values["thirdai_key"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "thirdai_key",
                "THIRDAI_KEY",
            )
        )
        return values

    def insert(  # type: ignore[no-untyped-def, no-untyped-def]
        self,
        sources: List[Any],
        train: bool = True,
        fast_mode: bool = True,
        **kwargs,
    ):
        """Insert files / document sources into the vectorstore.

        Args:
            sources: String paths (PDF, DOCX or CSV) and/or NeuralDB document
                objects.
            train: When True this means the underlying model in the NeuralDB
                will undergo unsupervised pretraining on the inserted files.
                Defaults to True.
            fast_mode: Much faster insertion with a slight drop in
                performance. Defaults to True.
            **kwargs: Forwarded to ``self.db.insert``.

        Returns:
            Whatever ``self.db.insert`` returns; ``add_texts`` relies on this
            being indexable, with the first element a source id.
        """
        sources = self._preprocess_sources(sources)
        # Propagate the result: callers such as `add_texts` index into it.
        return self.db.insert(
            sources=sources,
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )

    def _preprocess_sources(self, sources):  # type: ignore[no-untyped-def]
        """Convert string paths in ``sources`` into NeuralDB document objects.

        Args:
            sources: list of either string paths to PDF, DOCX or CSV files, or
                NeuralDB document objects. Non-string items pass through
                unchanged.

        Raises:
            RuntimeError: If a string path has an unsupported extension.
        """
        from thirdai import neural_db as ndb

        if not sources:
            return sources

        preprocessed_sources = []
        for doc in sources:
            if not isinstance(doc, str):
                preprocessed_sources.append(doc)
                continue

            lowered = doc.lower()
            if lowered.endswith(".pdf"):
                preprocessed_sources.append(ndb.PDF(doc))
            elif lowered.endswith(".docx"):
                preprocessed_sources.append(ndb.DOCX(doc))
            elif lowered.endswith(".csv"):
                preprocessed_sources.append(ndb.CSV(doc))
            else:
                raise RuntimeError(
                    f"Could not automatically load {doc}. Only files "
                    "with .pdf, .docx, or .csv extensions can be loaded "
                    "automatically. For other formats, please use the "
                    "appropriate document object from the ThirdAI library."
                )
        return preprocessed_sources

    def upvote(self, query: str, document_id: Union[int, str]):  # type: ignore[no-untyped-def]
        """Upweight the score of a document for a specific query.

        This is useful for fine-tuning the vectorstore to user behavior.

        Args:
            query: text to associate with `document_id`
            document_id: id of the document to associate query with.
        """
        self.db.text_to_result(query, int(document_id))

    def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]):  # type: ignore[no-untyped-def]
        """Upweight document scores for a batch of (query, document id) pairs.

        This is useful for fine-tuning the vectorstore to user behavior.

        Args:
            query_id_pairs: list of (query, document id) pairs. For each pair
                in this list, the model will upweight the document id for the
                query.
        """
        self.db.text_to_result_batch(
            [(query, int(doc_id)) for query, doc_id in query_id_pairs]
        )

    def associate(self, source: str, target: str):  # type: ignore[no-untyped-def]
        """Associate a source phrase with a target phrase.

        When the vectorstore sees the source phrase, it will also consider
        results that are relevant to the target phrase.

        Args:
            source: text to associate to `target`.
            target: text to associate `source` to.
        """
        self.db.associate(source, target)

    def associate_batch(self, text_pairs: List[Tuple[str, str]]):  # type: ignore[no-untyped-def]
        """Associate each source phrase with its target phrase, in batch.

        Args:
            text_pairs: list of (source, target) text pairs. For each pair in
                this list, the source will be associated with the target.
        """
        self.db.associate_batch(text_pairs)

    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve up to ``k`` contexts for a given query.

        Args:
            query: Query to submit to the model
            k: The max number of context results to retrieve. Defaults to 10.
            **kwargs: Forwarded to ``self.db.search``.

        Raises:
            ValueError: If the search or the conversion of results fails.
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)
            return [
                Document(
                    page_content=ref.text,
                    metadata={
                        "id": ref.id,
                        "upvote_ids": ref.upvote_ids,
                        "source": ref.source,
                        "metadata": ref.metadata,
                        "score": ref.score,
                        "context": ref.context(1),
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

    def save(self, path: str):  # type: ignore[no-untyped-def]
        """Save the NeuralDB instance to disk.

        It can be loaded back by calling ``NeuralDB.from_checkpoint(path)``.

        Args:
            path: path on disk to save the NeuralDB instance to.
        """
        self.db.save(path)
class NeuralDBClientVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB Enterprise Python Client.

    To use, you should have the ``thirdai[neural_db]`` python package installed.

    Example:
        ```python
        from langchain_community.vectorstores import NeuralDBClientVectorStore
        from thirdai.neural_db import ModelBazaar, NeuralDBClient

        bazaar = ModelBazaar(base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/")
        bazaar.log_in(email="user@thirdai.com", password="1234")

        ndb_client = NeuralDBClient(
            deployment_identifier="user/model-0:user/deployment-0",
            base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/",
            bazaar=bazaar
        )
        vectorstore = NeuralDBClientVectorStore(db=ndb_client)
        retriever = vectorstore.as_retriever(search_kwargs={'k':5})
        ```
    """

    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB client.

        Args:
            db: The NeuralDB client instance every method delegates to.
        """
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB Client instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve up to ``k`` contexts for a given query.

        Args:
            query: Query to submit to the model
            k: The max number of context results to retrieve. Defaults to 10.
            **kwargs: Forwarded to ``self.db.search``.

        Raises:
            ValueError: If the search or the conversion of results fails.
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)["references"]
            return [
                Document(
                    page_content=ref["text"],
                    metadata={
                        "id": ref["id"],
                        "source": ref["source"],
                        "metadata": ref["metadata"],
                        # Report the reference's own score; this previously
                        # duplicated the "source" field by mistake.
                        "score": ref["score"],
                        "context": ref["context"],
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

    def insert(self, documents: List[Dict[str, Any]]):  # type: ignore[no-untyped-def, no-untyped-def]
        """Insert documents into the VectorStore and return the corresponding Sources.

        Args:
            documents (List[Dict[str, Any]]): A list of dictionaries that
            represent documents to be inserted to the VectorStores.
            The document dictionaries must be in the following format:
            {"document_type": "DOCUMENT_TYPE", **kwargs} where "DOCUMENT_TYPE"
            is one of the following:
            "PDF", "CSV", "DOCX", "URL", "SentenceLevelPDF", "SentenceLevelDOCX",
            "Unstructured", "InMemoryText".
            The kwargs for each document type are shown below:

            class PDF(Document):
                document_type: Literal["PDF"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False
                version: str = "v1"
                chunk_size: int = 100
                stride: int = 40
                emphasize_first_words: int = 0
                ignore_header_footer: bool = True
                ignore_nonstandard_orientation: bool = True

            class CSV(Document):
                document_type: Literal["CSV"]
                path: str
                id_column: Optional[str] = None
                strong_columns: Optional[List[str]] = None
                weak_columns: Optional[List[str]] = None
                reference_columns: Optional[List[str]] = None
                save_extra_info: bool = True
                metadata: Optional[dict[str, Any]] = None
                has_offset: bool = False
                on_disk: bool = False

            class DOCX(Document):
                document_type: Literal["DOCX"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class URL(Document):
                document_type: Literal["URL"]
                url: str
                save_extra_info: bool = True
                title_is_strong: bool = False
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class SentenceLevelPDF(Document):
                document_type: Literal["SentenceLevelPDF"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class SentenceLevelDOCX(Document):
                document_type: Literal["SentenceLevelDOCX"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class Unstructured(Document):
                document_type: Literal["Unstructured"]
                path: str
                save_extra_info: bool = True
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class InMemoryText(Document):
                document_type: Literal["InMemoryText"]
                name: str
                texts: list[str]
                metadatas: Optional[list[dict[str, Any]]] = None
                global_metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            For Document types with the arg "path", ensure that
            the path exists on your local machine.

        Returns:
            The result of ``self.db.insert(documents)``.
        """
        return self.db.insert(documents)

    def remove_documents(self, source_ids: List[str]):  # type: ignore[no-untyped-def]
        """Delete documents from the VectorStore using source ids.

        Args:
            source_ids (List[str]): A list of source ids to delete from the
                VectorStore.
        """
        self.db.delete(source_ids)