Source code for langchain_community.retrievers.weaviate_hybrid_search

from __future__ import annotations

from typing import Any, Dict, List, Optional, cast
from uuid import uuid4

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever


class WeaviateHybridSearchRetriever(BaseRetriever):
    """`Weaviate hybrid search` retriever.

    See the documentation:
      https://weaviate.io/blog/hybrid-search-explained
    """

    client: Any
    """The Weaviate client; validated to be an instance of ``weaviate.Client``."""
    index_name: str
    """The name of the index (Weaviate class) to use."""
    text_key: str
    """The name of the property that holds the document text."""
    alpha: float = 0.5
    """Default ``alpha`` passed to Weaviate's hybrid search. In Weaviate,
    0 is a pure keyword search and 1 is a pure vector search."""
    k: int = 4
    """The number of results to return."""
    attributes: List[str]
    """The properties to return in the results. ``text_key`` is always
    appended to this list during validation."""
    create_schema_if_missing: bool = True
    """Whether to create the schema if it does not exist."""

    @root_validator(pre=True)
    def validate_client(
        cls,
        values: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate the client and prepare the field values.

        Checks that ``client`` is a ``weaviate.Client``, makes sure
        ``attributes`` ends with ``text_key``, and, unless
        ``create_schema_if_missing`` is false, creates the class for
        ``index_name`` when the client reports that it does not exist.

        Args:
            values: The raw keyword arguments given to the constructor.

        Returns:
            The same mapping with ``attributes`` replaced by a new list.

        Raises:
            ImportError: If the ``weaviate`` package is not installed.
            ValueError: If ``client`` is not a ``weaviate.Client``.
        """
        try:
            import weaviate
        except ImportError as e:
            raise ImportError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            ) from e

        client = values["client"]
        if not isinstance(client, weaviate.Client):
            raise ValueError(
                f"client should be an instance of weaviate.Client, got {type(client)}"
            )

        # This validator runs with pre=True, so ``values`` still holds the
        # caller's own objects. Build a fresh list rather than appending in
        # place, otherwise a list reused by the caller would grow each time a
        # retriever is constructed from it.
        requested = values.get("attributes")
        attributes = [] if requested is None else list(requested)
        attributes.append(values["text_key"])
        values["attributes"] = attributes

        if values.get("create_schema_if_missing", True):
            class_obj = {
                "class": values["index_name"],
                "properties": [{"name": values["text_key"], "dataType": ["text"]}],
                "vectorizer": "text2vec-openai",
            }

            if not client.schema.exists(values["index_name"]):
                client.schema.create_class(class_obj)

        return values

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    # added text_key
    def add_documents(self, docs: List[Document], **kwargs: Any) -> List[str]:
        """Upload documents to Weaviate.

        Each document is stored with its ``page_content`` under ``text_key``
        plus all of its metadata entries; a metadata entry named like
        ``text_key`` takes precedence over the page content.

        Args:
            docs: The documents to upload.
            **kwargs: May contain ``uuids``, a sequence indexed in step with
                ``docs`` giving the id to use for each document. When it is
                absent or ``None`` a random UUID is generated per document.

        Returns:
            The ids used for the documents, in the order of ``docs``.
        """
        from weaviate.util import get_valid_uuid

        # Look the ids up once; an explicit ``uuids=None`` means "not given".
        uuids = kwargs.get("uuids")
        ids: List[str] = []

        with self.client.batch as batch:
            for i, doc in enumerate(docs):
                metadata = doc.metadata or {}
                data_properties = {self.text_key: doc.page_content, **metadata}

                # If the UUID of one of the objects already exists
                # then the existing object will be replaced by the new object.
                _id = get_valid_uuid(uuid4()) if uuids is None else uuids[i]

                batch.add_data_object(data_properties, self.index_name, _id)
                ids.append(_id)
        return ids

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        where_filter: Optional[Dict[str, object]] = None,
        score: bool = False,
        hybrid_search_kwargs: Optional[Dict[str, object]] = None,
    ) -> List[Document]:
        """Look up similar documents in Weaviate using hybrid search.

        Args:
            query: The query to search relevant documents for.
            run_manager: Callback manager for this run; not used here.
            where_filter: A filter to apply to the query.
                https://weaviate.io/developers/weaviate/guides/querying/#filtering
            score: Whether to request the score and the score explanation so
                that they are included in the returned documents' metadata.
            hybrid_search_kwargs: Extra keyword arguments passed to the
                ``.with_hybrid()`` method. An ``alpha`` entry overrides
                ``self.alpha`` for this call. The main uses are:

                1) Search specific properties only - specify which properties
                   are used during the hybrid search portion.
                   Note: this is not the same as the properties to return
                   (``self.attributes``).
                   Example - hybrid_search_kwargs={"properties": ["question", "answer"]}
                   https://weaviate.io/developers/weaviate/search/hybrid#selected-properties-only

                2) Weight boosted searched properties - boost the weight of
                   certain properties during the hybrid search portion.
                   Example - hybrid_search_kwargs={"properties": ["question^2", "answer"]}
                   https://weaviate.io/developers/weaviate/search/hybrid#weight-boost-searched-properties

                3) Search with a custom vector - define a different vector to
                   be used during the hybrid search portion.
                   Example - hybrid_search_kwargs={"vector": [0.1, 0.2, 0.3, ...]}
                   https://weaviate.io/developers/weaviate/search/hybrid#with-a-custom-vector

                4) Use a fusion ranking method.
                   Example - from weaviate.gql.get import HybridFusion
                   hybrid_search_kwargs={"fusion_type": HybridFusion.RELATIVE_SCORE}
                   https://weaviate.io/developers/weaviate/search/hybrid#fusion-ranking-method

        Returns:
            One ``Document`` per result: the value stored under ``text_key``
            becomes ``page_content`` and every other returned field becomes
            metadata.

        Raises:
            ValueError: If the Weaviate response contains ``errors``.
        """
        query_obj = self.client.query.get(self.index_name, self.attributes)
        if where_filter:
            query_obj = query_obj.with_where(where_filter)

        if score:
            query_obj = query_obj.with_additional(["score", "explainScore"])

        # Merge instead of passing ``alpha`` next to ``**hybrid_search_kwargs``:
        # a caller-supplied ``alpha`` would otherwise raise TypeError for a
        # duplicated keyword argument.
        hybrid_kwargs: Dict[str, object] = {
            "alpha": self.alpha,
            **(hybrid_search_kwargs or {}),
        }

        result = (
            query_obj.with_hybrid(query, **hybrid_kwargs).with_limit(self.k).do()
        )
        if "errors" in result:
            raise ValueError(f"Error during query: {result['errors']}")

        docs = []
        for res in result["data"]["Get"][self.index_name]:
            # Remove the text from the result so that only the remaining
            # fields end up in the metadata.
            text = res.pop(self.text_key)
            docs.append(Document(page_content=text, metadata=res))
        return docs