# Source code for langchain_community.retrievers.weaviate_hybrid_search

from __future__ import annotations

from typing import Any, Dict, List, Optional, cast
from uuid import uuid4

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever


class WeaviateHybridSearchRetriever(BaseRetriever):
    """Retriever backed by `Weaviate hybrid search` (keyword + vector).

    See the documentation:
      https://weaviate.io/blog/hybrid-search-explained
    """

    client: Any
    """The `weaviate.Client` instance used for schema, batch and query calls."""
    index_name: str
    """Name of the Weaviate class (index) to read from and write to."""
    text_key: str
    """Name of the property that holds each document's text."""
    alpha: float = 0.5
    """Balance between keyword and vector search, forwarded to `.with_hybrid()`.

    Per the Weaviate documentation, 0 is a pure keyword search and 1 is a pure
    vector search."""
    k: int = 4
    """Number of results to return."""
    attributes: List[str]
    """Properties to return with each result; `text_key` is always included."""
    create_schema_if_missing: bool = True
    """Whether to create the class in the Weaviate schema if it is missing."""

    @root_validator(pre=True)
    def validate_client(
        cls,
        values: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate the client and normalise the raw constructor arguments.

        Runs before field validation, so ``values`` holds exactly what the
        caller passed. Required fields that are absent are left absent here so
        that the regular field validation reports them.

        Args:
            values: Raw keyword arguments given to the constructor.

        Returns:
            ``values`` with ``attributes`` replaced by a fresh list that
            contains ``text_key``.

        Raises:
            ImportError: If the ``weaviate`` package is not installed.
            ValueError: If ``client`` is missing or is not a
                ``weaviate.Client``.
        """
        try:
            import weaviate
        except ImportError as e:
            raise ImportError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            ) from e

        client = values.get("client")
        if not isinstance(client, weaviate.Client):
            raise ValueError(
                f"client should be an instance of weaviate.Client, got {type(client)}"
            )

        index_name = values.get("index_name")
        text_key = values.get("text_key")

        # Build a new list instead of appending in place: the caller's list
        # must not be mutated, and the text key must not be listed twice.
        supplied = values.get("attributes")
        attributes = [] if supplied is None else list(supplied)
        if text_key is not None and text_key not in attributes:
            attributes.append(text_key)
        values["attributes"] = attributes

        wants_schema = values.get("create_schema_if_missing", True)
        if wants_schema and index_name is not None and text_key is not None:
            if not client.schema.exists(index_name):
                client.schema.create_class(
                    {
                        "class": index_name,
                        "properties": [{"name": text_key, "dataType": ["text"]}],
                        "vectorizer": "text2vec-openai",
                    }
                )

        return values

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    # added text_key
    def add_documents(self, docs: List[Document], **kwargs: Any) -> List[str]:
        """Upload documents to Weaviate.

        Each document is stored as one object of class ``index_name`` whose
        properties are the document metadata plus the page content under
        ``text_key``.

        Args:
            docs: Documents to upload.
            **kwargs: May contain ``uuids``, a sequence indexed in step with
                ``docs`` giving the id to use for each document. When it is
                absent or ``None`` a random UUID is generated per document.

        Returns:
            The ids of the uploaded objects, in the same order as ``docs``.
        """
        from weaviate.util import get_valid_uuid

        uuids = kwargs.get("uuids")
        ids: List[str] = []
        with self.client.batch as batch:
            for i, doc in enumerate(docs):
                metadata = doc.metadata or {}
                data_properties = {self.text_key: doc.page_content, **metadata}

                # If the UUID of one of the objects already exists
                # then the existing object will be replaced by the new object.
                if uuids is not None:
                    _id = uuids[i]
                else:
                    _id = get_valid_uuid(uuid4())

                batch.add_data_object(data_properties, self.index_name, _id)
                ids.append(_id)
        return ids

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        where_filter: Optional[Dict[str, object]] = None,
        score: bool = False,
        hybrid_search_kwargs: Optional[Dict[str, object]] = None,
    ) -> List[Document]:
        """Look up similar documents in Weaviate using hybrid search.

        Args:
            query: The query to search relevant documents for.
            run_manager: Callback manager for this retriever run (not used
                by this implementation).
            where_filter: A filter to apply to the query.
                https://weaviate.io/developers/weaviate/guides/querying/#filtering
            score: Whether to request the score and score explanation; they
                are returned as part of each document's metadata.
            hybrid_search_kwargs: Extra arguments passed to the
                ``.with_hybrid()`` method. An ``alpha`` entry overrides
                ``self.alpha`` for this query. The primary use cases are:

                1) Search specific properties only -
                    specify which properties are used during the hybrid
                    search portion.
                    Note: this is not the same as the properties to be
                    returned (``self.attributes``).
                    Example - hybrid_search_kwargs={"properties": ["question", "answer"]}
                https://weaviate.io/developers/weaviate/search/hybrid#selected-properties-only

                2) Weight boost searched properties -
                    boost the weight of certain properties during the
                    hybrid search portion.
                    Example - hybrid_search_kwargs={"properties": ["question^2", "answer"]}
                https://weaviate.io/developers/weaviate/search/hybrid#weight-boost-searched-properties

                3) Search with a custom vector -
                    define a different vector to be used during the hybrid
                    search portion.
                    Example - hybrid_search_kwargs={"vector": [0.1, 0.2, 0.3, ...]}
                https://weaviate.io/developers/weaviate/search/hybrid#with-a-custom-vector

                4) Use a fusion ranking method
                    Example - from weaviate.gql.get import HybridFusion
                    hybrid_search_kwargs={"fusion_type": HybridFusion.RELATIVE_SCORE}
                https://weaviate.io/developers/weaviate/search/hybrid#fusion-ranking-method

        Returns:
            One ``Document`` per hit: the ``text_key`` property becomes the
            page content and every other returned field becomes metadata.

        Raises:
            ValueError: If the Weaviate response contains an ``errors`` entry.
        """
        query_obj = self.client.query.get(self.index_name, self.attributes)
        if where_filter:
            query_obj = query_obj.with_where(where_filter)

        if score:
            query_obj = query_obj.with_additional(["score", "explainScore"])

        # Merge rather than pass `alpha` separately so that a caller-supplied
        # "alpha" overrides the default instead of raising a duplicate-keyword
        # TypeError.
        hybrid_kwargs: Dict[str, object] = {"alpha": self.alpha}
        if hybrid_search_kwargs is not None:
            hybrid_kwargs.update(hybrid_search_kwargs)

        result = query_obj.with_hybrid(query, **hybrid_kwargs).with_limit(self.k).do()
        if "errors" in result:
            raise ValueError(f"Error during query: {result['errors']}")

        docs = []
        for res in result["data"]["Get"][self.index_name]:
            # Whatever remains in `res` after removing the text is metadata.
            text = res.pop(self.text_key)
            docs.append(Document(page_content=text, metadata=res))
        return docs