Source code for langchain_community.retrievers.thirdai_neuraldb

from __future__ import annotations

import importlib
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Extra, SecretStr, root_validator
from langchain_core.retrievers import BaseRetriever
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env


class NeuralDBRetriever(BaseRetriever):
    """Document retriever that uses ThirdAI's NeuralDB."""

    thirdai_key: SecretStr
    """ThirdAI API key"""

    db: Any = None  #: :meta private:
    """NeuralDB instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    @staticmethod
    def _verify_thirdai_library(thirdai_key: Optional[str] = None) -> None:
        try:
            from thirdai import licensing

            importlib.util.find_spec("thirdai.neural_db")

            licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))
        except ImportError:
            raise ImportError(
                "Could not import thirdai python package and neuraldb dependencies. "
                "Please install it with `pip install thirdai[neural_db]`."
            )

    @classmethod
    def from_scratch(
        cls,
        thirdai_key: Optional[str] = None,
        **model_kwargs: dict,
    ) -> NeuralDBRetriever:
        """Create a NeuralDBRetriever from scratch.

        To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.retrievers import NeuralDBRetriever

                retriever = NeuralDBRetriever.from_scratch(
                    thirdai_key="your-thirdai-key",
                )

                retriever.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = retriever.invoke("AI-driven music therapy")
        """
        NeuralDBRetriever._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(thirdai_key=thirdai_key, db=ndb.NeuralDB(**model_kwargs))  # type: ignore[arg-type]

    @classmethod
    def from_checkpoint(
        cls,
        checkpoint: Union[str, Path],
        thirdai_key: Optional[str] = None,
    ) -> NeuralDBRetriever:
        """Create a NeuralDBRetriever with a base model from a saved checkpoint.

        To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Example:
            .. code-block:: python

                from langchain_community.retrievers import NeuralDBRetriever

                retriever = NeuralDBRetriever.from_checkpoint(
                    checkpoint="/path/to/checkpoint.ndb",
                    thirdai_key="your-thirdai-key",
                )

                retriever.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = retriever.invoke("AI-driven music therapy")
        """
        NeuralDBRetriever._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(  # type: ignore[arg-type]
            thirdai_key=thirdai_key, db=ndb.NeuralDB.from_checkpoint(checkpoint)
        )

    @root_validator()
    def validate_environments(cls, values: Dict) -> Dict:
        """Validate ThirdAI environment variables."""
        values["thirdai_key"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "thirdai_key",
                "THIRDAI_KEY",
            )
        )
        return values

    def insert(
        self,
        sources: List[Any],
        train: bool = True,
        fast_mode: bool = True,
        **kwargs: dict,
    ) -> None:
        """Inserts files / document sources into the retriever.

        Args:
            train: When True, the underlying model in the NeuralDB will
                undergo unsupervised pretraining on the inserted files.
                Defaults to True.
            fast_mode: Much faster insertion with a slight drop in
                performance. Defaults to True.
        """
        sources = self._preprocess_sources(sources)
        self.db.insert(
            sources=sources,
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )

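    # Illustrative sketch of ``insert`` (hypothetical file paths; a
    # ``NeuralDBRetriever`` instance named ``retriever`` is assumed):
    #
    #     retriever.insert(
    #         sources=["/path/to/doc.pdf", "/path/to/doc.csv"],
    #         train=True,      # unsupervised pretraining on the inserted files
    #         fast_mode=True,  # faster insertion, slight quality trade-off
    #     )
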
    def _preprocess_sources(self, sources: list) -> list:
        """Checks if the provided sources are string paths. If they are,
        converts them to NeuralDB document objects.

        Args:
            sources: list of either string paths to PDF, DOCX, or CSV files,
                or NeuralDB document objects.
        """
        from thirdai import neural_db as ndb

        if not sources:
            return sources
        preprocessed_sources = []
        for doc in sources:
            if not isinstance(doc, str):
                preprocessed_sources.append(doc)
            else:
                if doc.lower().endswith(".pdf"):
                    preprocessed_sources.append(ndb.PDF(doc))
                elif doc.lower().endswith(".docx"):
                    preprocessed_sources.append(ndb.DOCX(doc))
                elif doc.lower().endswith(".csv"):
                    preprocessed_sources.append(ndb.CSV(doc))
                else:
                    raise RuntimeError(
                        f"Could not automatically load {doc}. Only files "
                        "with .pdf, .docx, or .csv extensions can be loaded "
                        "automatically. For other formats, please use the "
                        "appropriate document object from the ThirdAI library."
                    )
        return preprocessed_sources

    def upvote(self, query: str, document_id: int) -> None:
        """The retriever upweights the score of a document for a specific
        query. This is useful for fine-tuning the retriever to user behavior.

        Args:
            query: text to associate with `document_id`.
            document_id: id of the document to associate the query with.
        """
        self.db.text_to_result(query, document_id)

    def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]) -> None:
        """Given a batch of (query, document id) pairs, the retriever
        upweights the scores of the documents for the corresponding queries.
        This is useful for fine-tuning the retriever to user behavior.

        Args:
            query_id_pairs: list of (query, document id) pairs. For each pair
                in this list, the model will upweight the document id for the
                query.
        """
        self.db.text_to_result_batch(query_id_pairs)

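    # Illustrative sketch of the upvote feedback hooks (the queries and
    # document ids below are hypothetical; ids come from the ``id`` field of
    # retrieved documents' metadata):
    #
    #     retriever.upvote("AI-driven music therapy", document_id=3)
    #     retriever.upvote_batch([
    #         ("AI-driven music therapy", 3),
    #         ("music and mental health", 7),
    #     ])
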
    def associate(self, source: str, target: str) -> None:
        """The retriever associates a source phrase with a target phrase.
        When the retriever sees the source phrase, it will also consider
        results that are relevant to the target phrase.

        Args:
            source: text to associate to `target`.
            target: text to associate `source` to.
        """
        self.db.associate(source, target)

    def associate_batch(self, text_pairs: List[Tuple[str, str]]) -> None:
        """Given a batch of (source, target) pairs, the retriever associates
        each source phrase with the corresponding target phrase.

        Args:
            text_pairs: list of (source, target) text pairs. For each pair in
                this list, the source will be associated with the target.
        """
        self.db.associate_batch(text_pairs)

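    # Illustrative sketch of the associate hooks (hypothetical phrases): after
    # these calls, a query containing the source phrase will also surface
    # results relevant to the target phrase.
    #
    #     retriever.associate("ML", "machine learning")
    #     retriever.associate_batch([
    #         ("AI", "artificial intelligence"),
    #         ("NLP", "natural language processing"),
    #     ])
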
    def _get_relevant_documents(
        self, query: str, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any
    ) -> List[Document]:
        """Retrieves the top_k contexts for a given query.

        Args:
            query: Query to submit to the model.
            top_k: The max number of context results to retrieve.
                Defaults to 10.
        """
        try:
            if "top_k" not in kwargs:
                kwargs["top_k"] = 10
            references = self.db.search(query=query, **kwargs)
            return [
                Document(
                    page_content=ref.text,
                    metadata={
                        "id": ref.id,
                        "upvote_ids": ref.upvote_ids,
                        "source": ref.source,
                        "metadata": ref.metadata,
                        "score": ref.score,
                        "context": ref.context(1),
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

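    # ``_get_relevant_documents`` is the hook behind ``BaseRetriever.invoke``,
    # so retrieval is typically driven as below (hypothetical query; whether
    # extra keyword arguments such as ``top_k`` are forwarded to this hook
    # depends on the installed langchain_core version):
    #
    #     docs = retriever.invoke("AI-driven music therapy")
    #     for doc in docs:
    #         print(doc.metadata["score"], doc.page_content[:80])
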
    def save(self, path: str) -> None:
        """Saves a NeuralDB instance to disk. It can be loaded back into
        memory by calling NeuralDB.from_checkpoint(path).

        Args:
            path: path on disk to save the NeuralDB instance to.
        """
        self.db.save(path)
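

# A minimal end-to-end sketch, assuming a valid ``THIRDAI_KEY`` environment
# variable and that the (hypothetical) example file paths exist. The ``__main__``
# guard keeps the module importable without side effects.
if __name__ == "__main__":
    retriever = NeuralDBRetriever.from_scratch()
    retriever.insert(["/path/to/doc.pdf"])
    retriever.save("/path/to/checkpoint.ndb")

    # Reload the saved checkpoint and query it through the standard
    # LangChain retriever interface.
    restored = NeuralDBRetriever.from_checkpoint("/path/to/checkpoint.ndb")
    documents = restored.invoke("AI-driven music therapy")
    print([doc.metadata["id"] for doc in documents])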