Source code for langchain_community.vectorstores.thirdai_neuraldb

import importlib
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from langchain_core.vectorstores import VectorStore


class NeuralDBVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB.

    To use, you should have the ``thirdai[neural_db]`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBVectorStore
            from thirdai import neural_db as ndb

            db = ndb.NeuralDB()
            vectorstore = NeuralDBVectorStore(db=db)
    """

    def __init__(self, db: Any) -> None:
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    @staticmethod
    def _verify_thirdai_library(thirdai_key: Optional[str] = None):  # type: ignore[no-untyped-def]
        # Confirm the thirdai package and its neural_db extras are importable,
        # then activate the license from the argument or the environment.
        try:
            from thirdai import licensing

            importlib.util.find_spec("thirdai.neural_db")

            licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))
        except ImportError:
            raise ImportError(
                "Could not import thirdai python package and neuraldb dependencies. "
                "Please install it with `pip install thirdai[neural_db]`."
            )

    @classmethod
    def from_scratch(  # type: ignore[no-untyped-def, no-untyped-def]
        cls,
        thirdai_key: Optional[str] = None,
        **model_kwargs,
    ):
        """Create a NeuralDBVectorStore from scratch.

        To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_scratch(
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB(**model_kwargs))  # type: ignore[call-arg]

    @classmethod
    def from_checkpoint(  # type: ignore[no-untyped-def]
        cls,
        checkpoint: Union[str, Path],
        thirdai_key: Optional[str] = None,
    ):
        """Create a NeuralDBVectorStore with a base model from a saved checkpoint.

        To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_checkpoint(
                    checkpoint="/path/to/checkpoint.ndb",
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB.from_checkpoint(checkpoint))  # type: ignore[call-arg]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "NeuralDBVectorStore":
        """Return a VectorStore initialized from texts and embeddings."""
        # Route ``thirdai_key`` to ``from_scratch``; the remaining kwargs are
        # forwarded to ``add_texts``.
        model_kwargs = {}
        if "thirdai_key" in kwargs:
            model_kwargs["thirdai_key"] = kwargs["thirdai_key"]
            del kwargs["thirdai_key"]
        vectorstore = cls.from_scratch(**model_kwargs)
        vectorstore.add_texts(texts, metadatas, **kwargs)
        return vectorstore

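    # Illustrative usage sketch (hypothetical values). NeuralDB performs its
    # own retrieval, so the ``embedding`` argument above is accepted only for
    # VectorStore interface compatibility and is never used:
    #
    #     store = NeuralDBVectorStore.from_texts(
    #         texts=["hello world", "goodbye world"],
    #         embedding=None,  # unused by NeuralDB
    #         thirdai_key="your-thirdai-key",
    #     )
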
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        import pandas as pd
        from thirdai import neural_db as ndb

        # NeuralDB ingests files rather than raw strings, so the texts (and
        # any metadata columns) are staged in a temporary CSV and inserted.
        df = pd.DataFrame({"texts": texts})
        if metadatas:
            df = pd.concat([df, pd.DataFrame.from_records(metadatas)], axis=1)
        temp = tempfile.NamedTemporaryFile(
            "w", delete=False, delete_on_close=False
        )  # type: ignore[call-overload]
        df.to_csv(temp)
        source_id = self.insert([ndb.CSV(temp.name)], **kwargs)[0]
        offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]
        return [str(offset + i) for i in range(len(texts))]  # type: ignore[arg-type]

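    # Illustrative sketch (hypothetical text): the returned ids are row
    # offsets into the underlying NeuralDB, so they can be fed back into
    # ``upvote``/``upvote_batch`` below:
    #
    #     ids = vectorstore.add_texts(
    #         ["ThirdAI builds efficient CPU models"],
    #         metadatas=[{"source": "notes"}],
    #     )
    #     vectorstore.upvote("efficient CPU models", ids[0])
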
    @root_validator(allow_reuse=True)
    def validate_environments(cls, values: Dict) -> Dict:
        """Validate ThirdAI environment variables."""
        values["thirdai_key"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "thirdai_key",
                "THIRDAI_KEY",
            )
        )
        return values

    def insert(  # type: ignore[no-untyped-def, no-untyped-def]
        self,
        sources: List[Any],
        train: bool = True,
        fast_mode: bool = True,
        **kwargs,
    ):
        """Insert files / document sources into the vectorstore.

        Args:
            train: When True, the underlying model in the NeuralDB will
                undergo unsupervised pretraining on the inserted files.
                Defaults to True.
            fast_mode: Much faster insertion with a slight drop in
                performance. Defaults to True.
        """
        sources = self._preprocess_sources(sources)
        self.db.insert(
            sources=sources,
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )

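    # Illustrative sketch (hypothetical path): skip unsupervised pretraining
    # on insert with ``train=False``, or trade a little accuracy for speed
    # with ``fast_mode``:
    #
    #     vectorstore.insert(["/path/to/doc.pdf"], train=False, fast_mode=True)
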
    def _preprocess_sources(self, sources):  # type: ignore[no-untyped-def]
        """Checks if the provided sources are string paths. If they are,
        convert them to NeuralDB document objects.

        Args:
            sources: list of either string paths to PDF, DOCX or CSV files, or
                NeuralDB document objects.
        """
        from thirdai import neural_db as ndb

        if not sources:
            return sources
        preprocessed_sources = []
        for doc in sources:
            if not isinstance(doc, str):
                preprocessed_sources.append(doc)
            else:
                if doc.lower().endswith(".pdf"):
                    preprocessed_sources.append(ndb.PDF(doc))
                elif doc.lower().endswith(".docx"):
                    preprocessed_sources.append(ndb.DOCX(doc))
                elif doc.lower().endswith(".csv"):
                    preprocessed_sources.append(ndb.CSV(doc))
                else:
                    raise RuntimeError(
                        f"Could not automatically load {doc}. Only files "
                        "with .pdf, .docx, or .csv extensions can be loaded "
                        "automatically. For other formats, please use the "
                        "appropriate document object from the ThirdAI library."
                    )
        return preprocessed_sources

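    # Illustrative sketch (hypothetical sources): since non-string entries are
    # passed through untouched, string paths and pre-built NeuralDB document
    # objects can be mixed in one ``insert`` call, assuming your thirdai
    # version provides the ``ndb.URL`` document type:
    #
    #     from thirdai import neural_db as ndb
    #     vectorstore.insert(["report.pdf", ndb.URL("https://example.com")])
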
    def upvote(self, query: str, document_id: Union[int, str]):  # type: ignore[no-untyped-def]
        """The vectorstore upweights the score of a document for a specific
        query. This is useful for fine-tuning the vectorstore to user
        behavior.

        Args:
            query: text to associate with `document_id`
            document_id: id of the document to associate query with.
        """
        self.db.text_to_result(query, int(document_id))

    def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]):  # type: ignore[no-untyped-def]
        """Given a batch of (query, document id) pairs, the vectorstore
        upweights the scores of the document for the corresponding queries.
        This is useful for fine-tuning the vectorstore to user behavior.

        Args:
            query_id_pairs: list of (query, document id) pairs. For each pair
                in this list, the model will upweight the document id for the
                query.
        """
        self.db.text_to_result_batch(
            [(query, int(doc_id)) for query, doc_id in query_id_pairs]
        )

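    # Illustrative sketch (hypothetical queries and ids): reinforce a single
    # click-through with ``upvote``, or apply several at once:
    #
    #     vectorstore.upvote("best CPU-only language models", 0)
    #     vectorstore.upvote_batch([
    #         ("efficient sparse training", 1),
    #         ("document retrieval on CPUs", 2),
    #     ])
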
    def associate(self, source: str, target: str):  # type: ignore[no-untyped-def]
        """The vectorstore associates a source phrase with a target phrase.
        When the vectorstore sees the source phrase, it will also consider
        results that are relevant to the target phrase.

        Args:
            source: text to associate to ``target``.
            target: text to associate ``source`` to.
        """
        self.db.associate(source, target)

    def associate_batch(self, text_pairs: List[Tuple[str, str]]):  # type: ignore[no-untyped-def]
        """Given a batch of (source, target) pairs, the vectorstore associates
        each source phrase with the corresponding target phrase.

        Args:
            text_pairs: list of (source, target) text pairs. For each pair in
                this list, the source will be associated with the target.
        """
        self.db.associate_batch(text_pairs)

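    # Illustrative sketch (hypothetical phrases): make an abbreviation
    # retrieve results relevant to its expansion, singly or in a batch:
    #
    #     vectorstore.associate("NDB", "NeuralDB")
    #     vectorstore.associate_batch([
    #         ("LLM", "large language model"),
    #         ("RAG", "retrieval augmented generation"),
    #     ])
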
    def save(self, path: str):  # type: ignore[no-untyped-def]
        """Saves a NeuralDB instance to disk. Can be loaded into memory by
        calling NeuralDB.from_checkpoint(path).

        Args:
            path: path on disk to save the NeuralDB instance to.
        """
        self.db.save(path)

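    # Illustrative round trip (hypothetical path): persist the index, then
    # reload it later via ``NeuralDBVectorStore.from_checkpoint``:
    #
    #     vectorstore.save("/tmp/index.ndb")
    #     restored = NeuralDBVectorStore.from_checkpoint("/tmp/index.ndb")
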
class NeuralDBClientVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB Enterprise Python Client.

    To use, you should have the ``thirdai[neural_db]`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBClientVectorStore
            from thirdai.neural_db import ModelBazaar, NeuralDBClient

            bazaar = ModelBazaar(base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/")
            bazaar.log_in(email="user@thirdai.com", password="1234")

            ndb_client = NeuralDBClient(
                deployment_identifier="user/model-0:user/deployment-0",
                base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/",
                bazaar=bazaar,
            )
            vectorstore = NeuralDBClientVectorStore(db=ndb_client)
            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    """

    def __init__(self, db: Any) -> None:
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB Client instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    def insert(self, documents: List[Dict[str, Any]]):  # type: ignore[no-untyped-def, no-untyped-def]
        """Inserts documents into the VectorStore and returns the corresponding Sources.

        Args:
            documents (List[Dict[str, Any]]): A list of dictionaries that
                represent documents to be inserted into the VectorStores. The
                document dictionaries must be in the following format:
                ``{"document_type": "DOCUMENT_TYPE", **kwargs}`` where
                "DOCUMENT_TYPE" is one of the following:
                "PDF", "CSV", "DOCX", "URL", "SentenceLevelPDF",
                "SentenceLevelDOCX", "Unstructured", "InMemoryText".
                The kwargs for each document type are shown below:

                class PDF(Document):
                    document_type: Literal["PDF"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False
                    version: str = "v1"
                    chunk_size: int = 100
                    stride: int = 40
                    emphasize_first_words: int = 0
                    ignore_header_footer: bool = True
                    ignore_nonstandard_orientation: bool = True

                class CSV(Document):
                    document_type: Literal["CSV"]
                    path: str
                    id_column: Optional[str] = None
                    strong_columns: Optional[List[str]] = None
                    weak_columns: Optional[List[str]] = None
                    reference_columns: Optional[List[str]] = None
                    save_extra_info: bool = True
                    metadata: Optional[dict[str, Any]] = None
                    has_offset: bool = False
                    on_disk: bool = False

                class DOCX(Document):
                    document_type: Literal["DOCX"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class URL(Document):
                    document_type: Literal["URL"]
                    url: str
                    save_extra_info: bool = True
                    title_is_strong: bool = False
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class SentenceLevelPDF(Document):
                    document_type: Literal["SentenceLevelPDF"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class SentenceLevelDOCX(Document):
                    document_type: Literal["SentenceLevelDOCX"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class Unstructured(Document):
                    document_type: Literal["Unstructured"]
                    path: str
                    save_extra_info: bool = True
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class InMemoryText(Document):
                    document_type: Literal["InMemoryText"]
                    name: str
                    texts: list[str]
                    metadatas: Optional[list[dict[str, Any]]] = None
                    global_metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                For Document types with the arg "path", ensure that the path
                exists on your local machine.
        """
        return self.db.insert(documents)

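    # Illustrative sketch (hypothetical path): insert a PDF using the
    # dictionary format described above:
    #
    #     sources = vectorstore.insert([
    #         {"document_type": "PDF", "path": "/path/to/report.pdf"},
    #     ])
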
    def remove_documents(self, source_ids: List[str]):  # type: ignore[no-untyped-def]
        """Deletes documents from the VectorStore using source ids.

        Args:
            source_ids (List[str]): A list of source ids to delete from the
                VectorStore.
        """
        self.db.delete(source_ids)
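
# Illustrative sketch (hypothetical ids): pass source ids obtained from the
# client's ``insert`` call to delete those documents from the deployment:
#
#     vectorstore.remove_documents(["source-id-1", "source-id-2"])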