Source code for langchain_community.vectorstores.docarray.base

from abc import ABC
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Field
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import maximal_marginal_relevance

if TYPE_CHECKING:
    from docarray import BaseDoc
    from docarray.index.abstract import BaseDocIndex


def _check_docarray_import() -> None:
    try:
        import docarray
    except ImportError:
        raise ImportError(
            "Could not import docarray python package. "
            'Please install it with `pip install "langchain[docarray]"`.'
        )

    # Perform the version check outside the try/except so that a too-old
    # docarray raises its own, more specific message instead of being
    # masked by the generic import error above.
    da_version = docarray.__version__.split(".")
    if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
        raise ImportError(
            "To use the DocArrayHnswSearch VectorStore the docarray "
            f"version >=0.32.0 is expected, received: {docarray.__version__}. "
            "To upgrade, please run: `pip install -U docarray`."
        )
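
# A minimal usage sketch for the guard above (illustrative only, not part of
# this module): concrete stores call `_check_docarray_import()` before
# importing any docarray symbols, e.g.
#
#     _check_docarray_import()
#     from docarray.index import HnswDocumentIndex
#
# `HnswDocumentIndex` here is an assumption about the docarray >=0.32 API;
# the concrete subclasses in this package perform the equivalent import
# themselves.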


class DocArrayIndex(VectorStore, ABC):
    """Base class for `DocArray` based vector stores."""

    def __init__(
        self,
        doc_index: "BaseDocIndex",
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex."""
        self.doc_index = doc_index
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
        """Get the docarray Document class describing the schema of DocIndex."""
        from docarray import BaseDoc
        from docarray.typing import NdArray

        class DocArrayDoc(BaseDoc):
            text: Optional[str] = Field(default=None, required=False)
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict] = Field(default=None, required=False)

        return DocArrayDoc

    @property
    def doc_cls(self) -> Type["BaseDoc"]:
        if self.doc_index._schema is None:
            raise ValueError("doc_index expected to have non-null _schema attribute.")
        return self.doc_index._schema

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add them to the vector store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        ids: List[str] = []
        embeddings = self.embedding.embed_documents(list(texts))
        for i, (t, e) in enumerate(zip(texts, embeddings)):
            m = metadatas[i] if metadatas else {}
            doc = self.doc_cls(text=t, embedding=e, metadata=m)
            self.doc_index.index([doc])
            ids.append(str(doc.id))

        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to the query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query text, together with
            the cosine distance (float) for each. Lower scores represent
            more similarity.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        docs, scores = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        )

        result = [
            (Document(page_content=doc.text, metadata=doc.metadata), score)
            for doc, score in zip(docs, scores)
        ]
        return result

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.
        """
        raise NotImplementedError()

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to the embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        query_doc = self.doc_cls(embedding=embedding)  # type: ignore
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents

        result = [
            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
        ]
        return result
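

# Usage sketch (requires docarray>=0.32 installed). DocArrayIndex is
# abstract, so it is exercised through a concrete subclass such as
# DocArrayHnswSearch (named in the error message above). FakeEmbeddings,
# the `from_texts` parameters `work_dir` and `n_dim`, and the dimension of
# 32 are assumptions for illustration; verify them against the subclass
# you actually use.
if __name__ == "__main__":
    import tempfile

    from langchain_community.embeddings import FakeEmbeddings
    from langchain_community.vectorstores import DocArrayHnswSearch

    embeddings = FakeEmbeddings(size=32)
    with tempfile.TemporaryDirectory() as work_dir:
        store = DocArrayHnswSearch.from_texts(
            ["foo", "bar"],
            embeddings,
            work_dir=work_dir,
            n_dim=32,
        )
        # add_texts embeds and indexes new documents, returning their ids.
        store.add_texts(["baz"], metadatas=[{"source": "demo"}])
        # similarity_search_with_score returns (Document, distance) pairs;
        # a lower distance means the document is more similar to the query.
        for doc, score in store.similarity_search_with_score("foo", k=2):
            print(doc.page_content, score)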