# Source code for langchain_community.vectorstores.thirdai_neuraldb

import importlib
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from langchain_core.vectorstores import VectorStore


class NeuralDBVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB.

    To use, you should have the ``thirdai[neural_db]`` python package
    installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBVectorStore
            from thirdai import neural_db as ndb

            db = ndb.NeuralDB()
            vectorstore = NeuralDBVectorStore(db=db)
    """

    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB instance.

        Args:
            db: The object every method of this class delegates to through
                ``self.db``.
        """
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    @staticmethod
    def _verify_thirdai_library(thirdai_key: Optional[str] = None):  # type: ignore[no-untyped-def]
        """Check that ThirdAI with NeuralDB support is importable, then
        activate the license.

        Args:
            thirdai_key: License key. When falsy, the value of the
                ``THIRDAI_KEY`` environment variable is passed instead.

        Raises:
            ImportError: If ``thirdai`` or ``thirdai.neural_db`` cannot be
                found.
        """
        # ``import importlib`` alone does not guarantee that the ``util``
        # submodule is loaded, so import it explicitly.
        import importlib.util

        try:
            from thirdai import licensing

            # find_spec signals a missing submodule by returning None instead
            # of raising, so the result has to be checked explicitly.
            if importlib.util.find_spec("thirdai.neural_db") is None:
                raise ImportError("No module named 'thirdai.neural_db'")

            licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))
        except ImportError as e:
            raise ImportError(
                "Could not import thirdai python package and neuraldb dependencies. "
                "Please install it with `pip install thirdai[neural_db]`."
            ) from e

    @classmethod
    def from_scratch(  # type: ignore[no-untyped-def, no-untyped-def]
        cls,
        thirdai_key: Optional[str] = None,
        **model_kwargs,
    ):
        """Create a NeuralDBVectorStore from scratch.

        To use, set the ``THIRDAI_KEY`` environment variable to your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Args:
            thirdai_key: Optional ThirdAI license key.
            model_kwargs: Forwarded unchanged to the ``NeuralDB`` constructor.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_scratch(
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB(**model_kwargs))  # type: ignore[call-arg]

    @classmethod
    def from_checkpoint(  # type: ignore[no-untyped-def]
        cls,
        checkpoint: Union[str, Path],
        thirdai_key: Optional[str] = None,
    ):
        """Create a NeuralDBVectorStore with a base model from a saved
        checkpoint.

        To use, set the ``THIRDAI_KEY`` environment variable to your ThirdAI
        API key, or pass ``thirdai_key`` as a named parameter.

        Args:
            checkpoint: Location of the checkpoint, passed unchanged to
                ``NeuralDB.from_checkpoint``.
            thirdai_key: Optional ThirdAI license key.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_checkpoint(
                    checkpoint="/path/to/checkpoint.ndb",
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        from thirdai import neural_db as ndb

        return cls(db=ndb.NeuralDB.from_checkpoint(checkpoint))  # type: ignore[call-arg]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "NeuralDBVectorStore":
        """Return a VectorStore initialized from texts.

        Args:
            texts: Texts to add to the freshly created store.
            embedding: Accepted for interface compatibility; this method does
                not use it.
            metadatas: Optional metadata records, one per text.
            kwargs: ``thirdai_key`` (if present) is consumed and passed to
                ``from_scratch``; everything else goes to ``add_texts``.
        """
        model_kwargs = {}
        if "thirdai_key" in kwargs:
            # Pop so the key is not forwarded to add_texts / insert as well.
            model_kwargs["thirdai_key"] = kwargs.pop("thirdai_key")
        vectorstore = cls.from_scratch(**model_kwargs)
        vectorstore.add_texts(texts, metadatas, **kwargs)
        return vectorstore

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Write the texts to a temporary CSV and insert it into the store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadata records associated with the
                texts; they become additional CSV columns.
            kwargs: Extra arguments forwarded to ``insert``.

        Returns:
            List of ids (as strings) of the added texts, computed as the
            source's offset plus the position of each text.
        """
        import pandas as pd
        from thirdai import neural_db as ndb

        # ``texts`` may be a one-shot iterable, but it is needed twice (to
        # build the frame and to count the rows), so materialize it once.
        text_list: List[str] = list(texts)
        df = pd.DataFrame({"texts": text_list})
        if metadatas:
            df = pd.concat([df, pd.DataFrame.from_records(metadatas)], axis=1)
        # The ``with`` block flushes and closes the file before it is read
        # back by path. ``delete=False`` keeps the file on disk afterwards,
        # as the original code did.
        # NOTE(review): presumably the CSV document may re-read this path
        # later (e.g. when saving) -- confirm before adding any cleanup.
        with tempfile.NamedTemporaryFile("w", delete=False) as temp:
            df.to_csv(temp)
        source_id = self.insert([ndb.CSV(temp.name)], **kwargs)[0]
        offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]
        return [str(offset + i) for i in range(len(text_list))]

    @root_validator(allow_reuse=True)
    def validate_environments(cls, values: Dict) -> Dict:
        """Validate ThirdAI environment variables.

        Looks up ``thirdai_key`` in ``values`` or the ``THIRDAI_KEY``
        environment variable and stores it back as a secret string.
        """
        values["thirdai_key"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "thirdai_key",
                "THIRDAI_KEY",
            )
        )
        return values

    def insert(  # type: ignore[no-untyped-def, no-untyped-def]
        self,
        sources: List[Any],
        train: bool = True,
        fast_mode: bool = True,
        **kwargs,
    ):
        """Insert files / document sources into the vectorstore.

        Args:
            sources: String paths (converted by ``_preprocess_sources``) or
                NeuralDB document objects.
            train: When True, the underlying model in the NeuralDB will
                undergo unsupervised pretraining on the inserted files.
                Defaults to True.
            fast_mode: Much faster insertion with a slight drop in
                performance. Defaults to True.
            kwargs: Extra arguments forwarded to the underlying ``insert``.

        Returns:
            Whatever the underlying ``db.insert`` returns. ``add_texts``
            relies on it being a sequence whose first element is the id of
            the inserted source.
        """
        sources = self._preprocess_sources(sources)
        # The result must be returned: add_texts indexes into it.
        return self.db.insert(
            sources=sources,
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )

    def _preprocess_sources(self, sources):  # type: ignore[no-untyped-def]
        """Convert string paths in ``sources`` to NeuralDB document objects.

        Args:
            sources: List of string paths to PDF, DOCX or CSV files, or
                NeuralDB document objects. Non-string entries are passed
                through untouched.

        Returns:
            ``sources`` itself when it is empty, otherwise a new list with
            every string path replaced by the matching document object.

        Raises:
            RuntimeError: If a string path has an unsupported extension.
        """
        from thirdai import neural_db as ndb

        if not sources:
            return sources
        preprocessed_sources = []
        for doc in sources:
            if not isinstance(doc, str):
                preprocessed_sources.append(doc)
                continue
            # Extension matching is case-insensitive.
            lowered = doc.lower()
            if lowered.endswith(".pdf"):
                preprocessed_sources.append(ndb.PDF(doc))
            elif lowered.endswith(".docx"):
                preprocessed_sources.append(ndb.DOCX(doc))
            elif lowered.endswith(".csv"):
                preprocessed_sources.append(ndb.CSV(doc))
            else:
                raise RuntimeError(
                    f"Could not automatically load {doc}. Only files "
                    "with .pdf, .docx, or .csv extensions can be loaded "
                    "automatically. For other formats, please use the "
                    "appropriate document object from the ThirdAI library."
                )
        return preprocessed_sources

    def upvote(self, query: str, document_id: Union[int, str]):  # type: ignore[no-untyped-def]
        """Upweight the score of a document for a specific query.

        This is useful for fine-tuning the vectorstore to user behavior.

        Args:
            query: Text to associate with ``document_id``.
            document_id: Id of the document to associate the query with;
                converted to ``int`` before being passed on.
        """
        self.db.text_to_result(query, int(document_id))

    def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]):  # type: ignore[no-untyped-def]
        """Upweight the scores of documents for a batch of queries.

        This is useful for fine-tuning the vectorstore to user behavior.

        Args:
            query_id_pairs: List of (query, document id) pairs. Each document
                id is converted to ``int`` before being passed on.
        """
        self.db.text_to_result_batch(
            [(query, int(doc_id)) for query, doc_id in query_id_pairs]
        )

    def associate(self, source: str, target: str):  # type: ignore[no-untyped-def]
        """Associate a source phrase with a target phrase.

        When the vectorstore sees the source phrase, it will also consider
        results that are relevant to the target phrase.

        Args:
            source: Text to associate to ``target``.
            target: Text to associate ``source`` to.
        """
        self.db.associate(source, target)

    def associate_batch(self, text_pairs: List[Tuple[str, str]]):  # type: ignore[no-untyped-def]
        """Associate each source phrase with its target phrase, in batch.

        Args:
            text_pairs: List of (source, target) text pairs. For each pair
                the source is associated with the target.
        """
        self.db.associate_batch(text_pairs)

    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve up to ``k`` contexts for the given query.

        Args:
            query: Query to submit to the model.
            k: The max number of context results to retrieve. Defaults to 10.
            kwargs: Extra arguments forwarded to the underlying ``search``.

        Returns:
            One ``Document`` per reference; the reference's id, upvote ids,
            source, metadata, score and context are stored in its metadata.

        Raises:
            ValueError: If the search or the conversion of its results fails
                for any reason.
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)
            return [
                Document(
                    page_content=ref.text,
                    metadata={
                        "id": ref.id,
                        "upvote_ids": ref.upvote_ids,
                        "source": ref.source,
                        "metadata": ref.metadata,
                        "score": ref.score,
                        "context": ref.context(1),
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

    def save(self, path: str):  # type: ignore[no-untyped-def]
        """Save the NeuralDB instance to disk.

        It can be loaded back by calling
        ``NeuralDBVectorStore.from_checkpoint(path)``.

        Args:
            path: Path on disk to save the NeuralDB instance to.
        """
        self.db.save(path)


class NeuralDBClientVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB Enterprise Python Client.

    To use, you should have the ``thirdai[neural_db]`` python package
    installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBClientVectorStore
            from thirdai.neural_db import ModelBazaar, NeuralDBClient

            bazaar = ModelBazaar(base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/")
            bazaar.log_in(email="user@thirdai.com", password="1234")

            ndb_client = NeuralDBClient(
                deployment_identifier="user/model-0:user/deployment-0",
                base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/",
                bazaar=bazaar
            )
            vectorstore = NeuralDBClientVectorStore(db=ndb_client)
            retriever = vectorstore.as_retriever(search_kwargs={'k':5})
    """

    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB client.

        Args:
            db: The client object every method of this class delegates to
                through ``self.db``.
        """
        self.db = db

    db: Any = None  #: :meta private:
    """NeuralDB Client instance"""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        underscore_attrs_are_private = True

    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve up to ``k`` contexts for the given query.

        Args:
            query: Query to submit to the model.
            k: The max number of context results to retrieve. Defaults to 10.
            kwargs: Extra arguments forwarded to the client's ``search``.

        Returns:
            One ``Document`` per entry of the response's ``"references"``
            list; the entry's id, source, metadata, score and context are
            stored in the document metadata.

        Raises:
            ValueError: If the search or the conversion of its results fails
                for any reason (including a missing key in the response).
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)["references"]
            return [
                Document(
                    page_content=ref["text"],
                    metadata={
                        "id": ref["id"],
                        "source": ref["source"],
                        "metadata": ref["metadata"],
                        # Previously this read ref["source"], duplicating the
                        # source under the "score" key.
                        "score": ref["score"],
                        "context": ref["context"],
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

    def insert(self, documents: List[Dict[str, Any]]):  # type: ignore[no-untyped-def, no-untyped-def]
        """Insert documents into the VectorStore.

        Args:
            documents (List[Dict[str, Any]]): A list of documents to insert.
                Each document must be a dictionary of the form
                ``{"document_type": "DOCUMENT_TYPE", **kwargs}`` where
                "DOCUMENT_TYPE" is one of "PDF", "CSV", "DOCX", "URL",
                "SentenceLevelPDF", "SentenceLevelDOCX", "Unstructured",
                "InMemoryText". The kwargs for each document type are shown
                below:

                class PDF(Document):
                    document_type: Literal["PDF"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False
                    version: str = "v1"
                    chunk_size: int = 100
                    stride: int = 40
                    emphasize_first_words: int = 0
                    ignore_header_footer: bool = True
                    ignore_nonstandard_orientation: bool = True

                class CSV(Document):
                    document_type: Literal["CSV"]
                    path: str
                    id_column: Optional[str] = None
                    strong_columns: Optional[List[str]] = None
                    weak_columns: Optional[List[str]] = None
                    reference_columns: Optional[List[str]] = None
                    save_extra_info: bool = True
                    metadata: Optional[dict[str, Any]] = None
                    has_offset: bool = False
                    on_disk: bool = False

                class DOCX(Document):
                    document_type: Literal["DOCX"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class URL(Document):
                    document_type: Literal["URL"]
                    url: str
                    save_extra_info: bool = True
                    title_is_strong: bool = False
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class SentenceLevelPDF(Document):
                    document_type: Literal["SentenceLevelPDF"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class SentenceLevelDOCX(Document):
                    document_type: Literal["SentenceLevelDOCX"]
                    path: str
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class Unstructured(Document):
                    document_type: Literal["Unstructured"]
                    path: str
                    save_extra_info: bool = True
                    metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                class InMemoryText(Document):
                    document_type: Literal["InMemoryText"]
                    name: str
                    texts: list[str]
                    metadatas: Optional[list[dict[str, Any]]] = None
                    global_metadata: Optional[dict[str, Any]] = None
                    on_disk: bool = False

                For document types with the arg "path", ensure that the path
                exists on your local machine.

        Returns:
            Whatever the client's ``insert`` returns (described upstream as
            the sources corresponding to the inserted documents).
        """
        return self.db.insert(documents)

    def remove_documents(self, source_ids: List[str]):  # type: ignore[no-untyped-def]
        """Delete documents from the VectorStore using source ids.

        Args:
            source_ids (List[str]): A list of source ids to delete from the
                VectorStore.
        """

        self.db.delete(source_ids)