Source code for langchain_community.vectorstores.vectara

from __future__ import annotations

import json
import logging
import os
from dataclasses import dataclass, field
from hashlib import md5
from typing import Any, Iterable, List, Optional, Tuple, Type

import requests
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Field
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever

logger = logging.getLogger(__name__)


@dataclass
class SummaryConfig:
    """Configuration for summary generation.

    is_enabled: True if summarization is enabled, False otherwise
    max_results: maximum number of results to summarize
    response_lang: requested language of the summary
    prompt_name: name of the prompt to use for summarization
        (see https://docs.vectara.com/docs/learn/grounded-generation/select-a-summarizer)
    """

    is_enabled: bool = False
    max_results: int = 7
    response_lang: str = "eng"
    prompt_name: str = "vectara-summary-ext-v1.2.0"

@dataclass
class MMRConfig:
    """Configuration for Maximal Marginal Relevance (MMR) search.

    is_enabled: True if MMR is enabled, False otherwise
    mmr_k: number of results to fetch for MMR, defaults to 50
    diversity_bias: number between 0 and 1 that determines the degree
        of diversity among the results, with 0 corresponding to minimum
        diversity and 1 to maximum diversity. Defaults to 0.3.
        Note: diversity_bias is equivalent to 1 - lambda_mult, where
        lambda_mult is the value often used in
        max_marginal_relevance_search(). We chose to use this value
        as we believe it is more intuitive for users.
    """

    is_enabled: bool = False
    mmr_k: int = 50
    diversity_bias: float = 0.3

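# A minimal usage sketch of the diversity_bias / lambda_mult equivalence
# noted in the docstring above (the lambda_mult value is hypothetical):
#
#     lambda_mult = 0.7
#     mmr_config = MMRConfig(is_enabled=True, diversity_bias=1 - lambda_mult)
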
@dataclass
class VectaraQueryConfig:
    """Configuration for a Vectara query.

    k: number of documents to return. Defaults to 10.
    lambda_val: lexical matching parameter for hybrid search.
    filter: filter expression over metadata. For example, a filter can be
        "doc.rating > 3.0 and part.lang = 'deu'"; see
        https://docs.vectara.com/docs/search-apis/sql/filter-overview
        for more details.
    score_threshold: minimal score threshold for the results. If defined,
        results with a score below this value are filtered out.
    n_sentence_context: number of sentences to add before/after the matched
        segment, defaults to 2.
    mmr_config: MMRConfig configuration dataclass
    summary_config: SummaryConfig configuration dataclass
    """

    k: int = 10
    lambda_val: float = 0.0
    filter: str = ""
    score_threshold: Optional[float] = None
    n_sentence_context: int = 2
    mmr_config: MMRConfig = field(default_factory=MMRConfig)
    summary_config: SummaryConfig = field(default_factory=SummaryConfig)

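# A minimal usage sketch combining the three config dataclasses above
# (the filter expression and parameter values are illustrative only):
#
#     config = VectaraQueryConfig(
#         k=5,
#         lambda_val=0.025,
#         filter="part.lang = 'eng'",
#         mmr_config=MMRConfig(is_enabled=True, mmr_k=50, diversity_bias=0.3),
#         summary_config=SummaryConfig(is_enabled=True, max_results=5),
#     )

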
class Vectara(VectorStore):
    """`Vectara API` vector store.

    See (https://vectara.com).

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Vectara

            vectorstore = Vectara(
                vectara_customer_id=vectara_customer_id,
                vectara_corpus_id=vectara_corpus_id,
                vectara_api_key=vectara_api_key
            )
    """

    def __init__(
        self,
        vectara_customer_id: Optional[str] = None,
        vectara_corpus_id: Optional[str] = None,
        vectara_api_key: Optional[str] = None,
        vectara_api_timeout: int = 120,
        source: str = "langchain",
    ):
        """Initialize with the Vectara API."""
        self._vectara_customer_id = vectara_customer_id or os.environ.get(
            "VECTARA_CUSTOMER_ID"
        )
        self._vectara_corpus_id = vectara_corpus_id or os.environ.get(
            "VECTARA_CORPUS_ID"
        )
        self._vectara_api_key = vectara_api_key or os.environ.get("VECTARA_API_KEY")
        if (
            self._vectara_customer_id is None
            or self._vectara_corpus_id is None
            or self._vectara_api_key is None
        ):
            logger.warning(
                "Can't find Vectara credentials, customer_id or corpus_id in "
                "environment."
            )
        else:
            logger.debug(f"Using corpus id {self._vectara_corpus_id}")
        self._source = source

        self._session = requests.Session()  # to reuse connections
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        self._session.mount("http://", adapter)
        self.vectara_api_timeout = vectara_api_timeout

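    # A minimal initialization sketch: when the explicit arguments are
    # omitted, the constructor falls back to the VECTARA_CUSTOMER_ID,
    # VECTARA_CORPUS_ID and VECTARA_API_KEY environment variables
    # (placeholder values shown):
    #
    #     os.environ["VECTARA_CUSTOMER_ID"] = "<customer-id>"
    #     os.environ["VECTARA_CORPUS_ID"] = "<corpus-id>"
    #     os.environ["VECTARA_API_KEY"] = "<api-key>"
    #     vectara = Vectara(vectara_api_timeout=60)
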
    @property
    def embeddings(self) -> Optional[Embeddings]:
        return None

    def _get_post_headers(self) -> dict:
        """Return headers that should be attached to each POST request."""
        return {
            "x-api-key": self._vectara_api_key,
            "customer-id": self._vectara_customer_id,
            "Content-Type": "application/json",
            "X-Source": self._source,
        }

    def _delete_doc(self, doc_id: str) -> bool:
        """Delete a document from the Vectara corpus.

        Args:
            doc_id (str): ID of the document to delete.
        Returns:
            bool: True if deletion was successful, False otherwise.
        """
        body = {
            "customer_id": self._vectara_customer_id,
            "corpus_id": self._vectara_corpus_id,
            "document_id": doc_id,
        }
        response = self._session.post(
            "https://api.vectara.io/v1/delete-doc",
            data=json.dumps(body),
            verify=True,
            headers=self._get_post_headers(),
            timeout=self.vectara_api_timeout,
        )
        if response.status_code != 200:
            logger.error(
                f"Delete request failed for doc_id = {doc_id} with status code "
                f"{response.status_code}, reason {response.reason}, text "
                f"{response.text}"
            )
            return False
        return True

    def _index_doc(self, doc: dict, use_core_api: bool = False) -> str:
        """Index a document into the Vectara corpus and return a status string."""
        request: dict[str, Any] = {}
        request["customer_id"] = self._vectara_customer_id
        request["corpus_id"] = self._vectara_corpus_id
        request["document"] = doc

        api_endpoint = (
            "https://api.vectara.io/v1/core/index"
            if use_core_api
            else "https://api.vectara.io/v1/index"
        )
        response = self._session.post(
            headers=self._get_post_headers(),
            url=api_endpoint,
            data=json.dumps(request),
            timeout=self.vectara_api_timeout,
            verify=True,
        )

        status_code = response.status_code
        result = response.json()
        status_str = result["status"]["code"] if "status" in result else None
        if status_code == 409 or (status_str and status_str == "ALREADY_EXISTS"):
            return "E_ALREADY_EXISTS"
        elif status_str and status_str == "FORBIDDEN":
            return "E_NO_PERMISSIONS"
        else:
            return "E_SUCCEEDED"

    def add_files(
        self,
        files_list: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Vectara provides a way to add documents directly via our API, where
        pre-processing and chunking occurs internally in an optimal way.
        This method provides a way to use that API in LangChain.

        Args:
            files_list: Iterable of strings, each representing a local file path.
                Files can be text, HTML, PDF, markdown, doc/docx, ppt/pptx, etc.
                See the API docs for the full list.
            metadatas: optional list of metadatas associated with each file

        Returns:
            List of ids associated with each of the files indexed
        """
        doc_ids = []
        for inx, file in enumerate(files_list):
            if not os.path.exists(file):
                logger.error(f"File {file} does not exist, skipping")
                continue
            md = metadatas[inx] if metadatas else {}
            headers = self._get_post_headers()
            headers.pop("Content-Type")
            # Use a context manager so the file handle is always closed.
            with open(file, "rb") as f:
                files: dict = {
                    "file": (file, f),
                    "doc_metadata": json.dumps(md),
                }
                response = self._session.post(
                    f"https://api.vectara.io/upload?c={self._vectara_customer_id}&o={self._vectara_corpus_id}&d=True",
                    files=files,
                    verify=True,
                    headers=headers,
                    timeout=self.vectara_api_timeout,
                )

            if response.status_code == 409:
                doc_id = response.json()["document"]["documentId"]
                logger.info(
                    f"File {file} already exists on Vectara "
                    f"(doc_id={doc_id}), skipping"
                )
            elif response.status_code == 200:
                doc_id = response.json()["document"]["documentId"]
                doc_ids.append(doc_id)
            else:
                logger.info(f"Error indexing file {file}: {response.json()}")

        return doc_ids

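    # A minimal usage sketch (file names and metadata are hypothetical):
    #
    #     vectara = Vectara()
    #     doc_ids = vectara.add_files(
    #         ["whitepaper.pdf", "notes.md"],
    #         metadatas=[{"team": "research"}, {"team": "eng"}],
    #     )
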
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        doc_metadata: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            doc_metadata: optional metadata for the document

        This function indexes all the input text strings into the Vectara
        corpus as a single Vectara document, where each input text is
        considered a "part" and the metadata are associated with each part.
        If 'doc_metadata' is provided, it is associated with the Vectara
        document.

        Returns:
            document ID of the document added
        """
        # Materialize the iterable once so it can be traversed multiple times
        # (hashing, defaulting metadatas, and building the request body).
        texts = list(texts)
        doc_hash = md5()
        for t in texts:
            doc_hash.update(t.encode())
        doc_id = doc_hash.hexdigest()
        if metadatas is None:
            metadatas = [{} for _ in texts]
        if doc_metadata:
            doc_metadata["source"] = "langchain"
        else:
            doc_metadata = {"source": "langchain"}

        use_core_api = kwargs.get("use_core_api", False)
        section_key = "parts" if use_core_api else "section"
        doc = {
            "document_id": doc_id,
            "metadataJson": json.dumps(doc_metadata),
            section_key: [
                {"text": text, "metadataJson": json.dumps(md)}
                for text, md in zip(texts, metadatas)
            ],
        }

        success_str = self._index_doc(doc, use_core_api=use_core_api)

        if success_str == "E_ALREADY_EXISTS":
            self._delete_doc(doc_id)
            # Re-index against the same endpoint that was used initially.
            self._index_doc(doc, use_core_api=use_core_api)
        elif success_str == "E_NO_PERMISSIONS":
            print(  # noqa: T201
                """No permissions to add document to Vectara.
                Check your corpus ID, customer ID and API key"""
            )
        return [doc_id]

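    # A minimal usage sketch (texts and metadata are illustrative):
    #
    #     vectara = Vectara()
    #     doc_ids = vectara.add_texts(
    #         ["Vectara is a RAG platform.", "It hosts its own embeddings."],
    #         metadatas=[{"topic": "intro"}, {"topic": "embeddings"}],
    #         doc_metadata={"title": "about-vectara"},
    #     )
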
    def vectara_query(
        self,
        query: str,
        config: VectaraQueryConfig,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Run a Vectara query.

        Args:
            query: Text to look up documents similar to.
            config: VectaraQueryConfig object
        Returns:
            A list of k Documents matching the given query.
            If summary is enabled, the last document is the summary text with
            'summary'=True.
        """
        if isinstance(config.mmr_config, dict):
            config.mmr_config = MMRConfig(**config.mmr_config)
        if isinstance(config.summary_config, dict):
            config.summary_config = SummaryConfig(**config.summary_config)

        data = {
            "query": [
                {
                    "query": query,
                    "start": 0,
                    "numResults": (
                        config.mmr_config.mmr_k
                        if config.mmr_config.is_enabled
                        else config.k
                    ),
                    "contextConfig": {
                        "sentencesBefore": config.n_sentence_context,
                        "sentencesAfter": config.n_sentence_context,
                    },
                    "corpusKey": [
                        {
                            "customerId": self._vectara_customer_id,
                            "corpusId": self._vectara_corpus_id,
                            "metadataFilter": config.filter,
                            "lexicalInterpolationConfig": {
                                "lambda": config.lambda_val
                            },
                        }
                    ],
                }
            ]
        }
        if config.mmr_config.is_enabled:
            data["query"][0]["rerankingConfig"] = {
                "rerankerId": 272725718,
                "mmrConfig": {"diversityBias": config.mmr_config.diversity_bias},
            }
        if config.summary_config.is_enabled:
            data["query"][0]["summary"] = [
                {
                    "maxSummarizedResults": config.summary_config.max_results,
                    "responseLang": config.summary_config.response_lang,
                    "summarizerPromptName": config.summary_config.prompt_name,
                }
            ]

        response = self._session.post(
            headers=self._get_post_headers(),
            url="https://api.vectara.io/v1/query",
            data=json.dumps(data),
            timeout=self.vectara_api_timeout,
        )

        if response.status_code != 200:
            logger.error(
                "Query failed %s",
                f"(code {response.status_code}, reason {response.reason}, details "
                f"{response.text})",
            )
            # Return an empty result list, consistent with the return type.
            return []

        result = response.json()

        if config.score_threshold:
            responses = [
                r
                for r in result["responseSet"][0]["response"]
                if r["score"] > config.score_threshold
            ]
        else:
            responses = result["responseSet"][0]["response"]
        documents = result["responseSet"][0]["document"]

        metadatas = []
        for x in responses:
            md = {m["name"]: m["value"] for m in x["metadata"]}
            doc_num = x["documentIndex"]
            doc_md = {m["name"]: m["value"] for m in documents[doc_num]["metadata"]}
            if "source" not in doc_md:
                doc_md["source"] = "vectara"
            md.update(doc_md)
            metadatas.append(md)

        res = [
            (
                Document(
                    page_content=x["text"],
                    metadata=md,
                ),
                x["score"],
            )
            for x, md in zip(responses, metadatas)
        ]

        if config.mmr_config.is_enabled:
            res = res[: config.k]
        if config.summary_config.is_enabled:
            summary = result["responseSet"][0]["summary"][0]["text"]
            res.append(
                (Document(page_content=summary, metadata={"summary": True}), 0.0)
            )
        return res

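    # A minimal usage sketch, reusing the VectaraQueryConfig example shown
    # earlier in this module (the query text is illustrative):
    #
    #     results = vectara.vectara_query("What is RAG?", config)
    #     for doc, score in results:
    #         if doc.metadata.get("summary"):
    #             print("summary:", doc.page_content)
    #         else:
    #             print(score, doc.page_content[:80])
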
    def similarity_search_with_score(
        self,
        query: str,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return Vectara documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 10.

            any other querying variable in VectaraQueryConfig, such as:
            - lambda_val: lexical match parameter for hybrid search.
            - filter: filter string
            - score_threshold: minimal score threshold for the result.
            - n_sentence_context: number of sentences before/after the
              matching segment
            - mmr_config: optional configuration for MMR
              (see MMRConfig dataclass)
            - summary_config: optional configuration for summary
              (see SummaryConfig dataclass)
        Returns:
            List of Documents most similar to the query and score for each.
        """
        config = VectaraQueryConfig(**kwargs)
        docs = self.vectara_query(query, config)
        return docs

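    # A minimal usage sketch (keyword arguments map onto VectaraQueryConfig
    # fields; values shown are illustrative):
    #
    #     docs_and_scores = vectara.similarity_search_with_score(
    #         "What is RAG?", k=5, n_sentence_context=2
    #     )
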
    @classmethod
    def from_texts(
        cls: Type[Vectara],
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> Vectara:
        """Construct a Vectara wrapper from raw documents.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import Vectara

                vectara = Vectara.from_texts(
                    texts,
                    vectara_customer_id=customer_id,
                    vectara_corpus_id=corpus_id,
                    vectara_api_key=api_key,
                )
        """
        # Notes:
        # * Vectara generates its own embeddings, so we ignore the provided
        #   embeddings (required by interface)
        # * when metadatas[] are provided they are associated with each "part"
        #   in Vectara. doc_metadata can be used to provide additional metadata
        #   for the document itself (applies to all "texts" in this call)
        doc_metadata = kwargs.pop("doc_metadata", {})
        vectara = cls(**kwargs)
        vectara.add_texts(texts, metadatas, doc_metadata=doc_metadata, **kwargs)
        return vectara

    @classmethod
    def from_files(
        cls: Type[Vectara],
        files: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> Vectara:
        """Construct a Vectara wrapper from raw files.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import Vectara

                vectara = Vectara.from_files(
                    files_list,
                    vectara_customer_id=customer_id,
                    vectara_corpus_id=corpus_id,
                    vectara_api_key=api_key,
                )
        """
        # Note: Vectara generates its own embeddings, so we ignore the provided
        # embeddings (required by interface)
        vectara = cls(**kwargs)
        vectara.add_files(files, metadatas)
        return vectara


class VectaraRetriever(VectorStoreRetriever):
    """Retriever for `Vectara`."""

    vectorstore: Vectara
    """Vectara vectorstore."""
    search_kwargs: dict = Field(
        default_factory=lambda: {
            "lambda_val": 0.0,
            "k": 5,
            "filter": "",
            "n_sentence_context": 2,
            "summary_config": SummaryConfig(),
        }
    )
    """Search params.

    k: Number of Documents to return. Defaults to 5.
    lambda_val: lexical match parameter for hybrid search.
    filter: filter expression over metadata. For example, a filter can be
        "doc.rating > 3.0 and part.lang = 'deu'"; see
        https://docs.vectara.com/docs/search-apis/sql/filter-overview
        for more details.
    n_sentence_context: number of sentences to add before/after the matched
        segment.
    """

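    # A minimal usage sketch, constructing the retriever directly
    # (the query string and filter are illustrative):
    #
    #     retriever = VectaraRetriever(
    #         vectorstore=Vectara(),
    #         search_kwargs={"k": 3, "filter": "part.lang = 'eng'"},
    #     )
    #     docs = retriever.get_relevant_documents("What is RAG?")
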
    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        doc_metadata: Optional[dict] = None,
    ) -> None:
        """Add texts to the Vectara vectorstore.

        Args:
            texts (List[str]): The texts
            metadatas (List[dict]): metadata dicts, must line up with
                existing store
        """
        self.vectorstore.add_texts(texts, metadatas, doc_metadata or {})