# Source code for langchain_community.retrievers.weaviate_hybrid_search
from __future__ import annotations
from typing import Any, Dict, List, Optional, cast
from uuid import uuid4
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever
class WeaviateHybridSearchRetriever(BaseRetriever):
    """`Weaviate hybrid search` retriever.

    See the documentation:
      https://weaviate.io/blog/hybrid-search-explained
    """

    client: Any
    """Weaviate client instance; validated to be a ``weaviate.Client``."""
    index_name: str
    """Name of the index (Weaviate class) to use."""
    text_key: str
    """Name of the property that stores the document text."""
    alpha: float = 0.5
    """Default ``alpha`` passed to ``with_hybrid``.

    Per the Weaviate hybrid-search docs, 0 is pure keyword search and 1 is
    pure vector search.
    """
    k: int = 4
    """Number of results to return."""
    attributes: List[str]
    """Properties to return in the results; ``text_key`` is always included."""
    create_schema_if_missing: bool = True
    """Whether to create the schema class if it does not exist yet."""

    @root_validator(pre=True)
    def validate_client(
        cls,
        values: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate the client and prepare ``attributes`` and the schema.

        Runs before field validation, so ``values`` holds the raw constructor
        arguments.

        Args:
            values: Raw keyword arguments passed to the constructor.

        Returns:
            The same mapping, with ``attributes`` replaced by a new list that
            contains ``text_key``.

        Raises:
            ImportError: If the ``weaviate`` package is not installed.
            ValueError: If ``client`` is not a ``weaviate.Client``.
        """
        try:
            import weaviate
        except ImportError:
            raise ImportError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            )
        if not isinstance(values["client"], weaviate.Client):
            client = values["client"]
            raise ValueError(
                f"client should be an instance of weaviate.Client, got {type(client)}"
            )

        text_key = values["text_key"]
        # Work on a copy: appending to the list the caller passed in would
        # mutate their object (and grow it each time it is reused).
        attributes = list(values.get("attributes") or [])
        # The text property must be fetched so results can be turned into
        # Documents; skip it if the caller already listed it.
        if text_key not in attributes:
            attributes.append(text_key)
        values["attributes"] = attributes

        if values.get("create_schema_if_missing", True):
            class_obj = {
                "class": values["index_name"],
                "properties": [{"name": text_key, "dataType": ["text"]}],
                "vectorizer": "text2vec-openai",
            }
            if not values["client"].schema.exists(values["index_name"]):
                values["client"].schema.create_class(class_obj)
        return values

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    def add_documents(self, docs: List[Document], **kwargs: Any) -> List[str]:
        """Upload documents to Weaviate.

        Each document is stored as one object in ``index_name``: its
        ``page_content`` goes under ``text_key`` and its metadata entries are
        merged in as further properties.

        Args:
            docs: Documents to upload.
            **kwargs: May contain ``uuids``, a sequence with one id per
                document (matched by position). When absent or ``None``, a
                random UUID is generated for every document.

        Returns:
            The ids of the uploaded objects, in the order of ``docs``.

        Raises:
            ValueError: If ``uuids`` is given but has fewer entries than
                ``docs``.
        """
        from weaviate.util import get_valid_uuid

        uuids = kwargs.get("uuids")
        # Check up front so a short ``uuids`` fails before anything is queued,
        # rather than with an IndexError halfway through the batch.
        if uuids is not None and len(uuids) < len(docs):
            raise ValueError(
                f"Got {len(uuids)} uuids for {len(docs)} documents; "
                "provide one uuid per document."
            )

        ids = []
        with self.client.batch as batch:
            for i, doc in enumerate(docs):
                metadata = doc.metadata or {}
                data_properties = {self.text_key: doc.page_content, **metadata}

                # If the UUID of one of the objects already exists
                # then the existing object will be replaced by the new object.
                if uuids is not None:
                    _id = uuids[i]
                else:
                    _id = get_valid_uuid(uuid4())

                batch.add_data_object(data_properties, self.index_name, _id)
                ids.append(_id)
        return ids

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        where_filter: Optional[Dict[str, object]] = None,
        score: bool = False,
        hybrid_search_kwargs: Optional[Dict[str, object]] = None,
    ) -> List[Document]:
        """Look up similar documents in Weaviate.

        Args:
            query: The query to find relevant documents for, using Weaviate
                hybrid search.
            run_manager: Callback manager for this retriever run (not used
                by this method).
            where_filter: A filter to apply to the query.
                https://weaviate.io/developers/weaviate/guides/querying/#filtering
            score: Whether to request the score and score explanation so they
                are included in the returned Documents' metadata.
            hybrid_search_kwargs: Extra keyword arguments forwarded to the
                ``.with_hybrid()`` method. An ``"alpha"`` entry overrides the
                retriever's default ``alpha`` for this call.
                The primary uses are:
                1) Search specific properties only -
                    specify which properties to use during the hybrid search.
                    Note: this is different from the ones returned
                    (``self.attributes``).
                    Example - hybrid_search_kwargs={"properties": ["question", "answer"]}
                    https://weaviate.io/developers/weaviate/search/hybrid#selected-properties-only
                2) Weight boosted searched properties -
                    boost the weight of certain properties during the hybrid
                    search.
                    Example - hybrid_search_kwargs={"properties": ["question^2", "answer"]}
                    https://weaviate.io/developers/weaviate/search/hybrid#weight-boost-searched-properties
                3) Search with a custom vector - define a different vector to
                    use during the hybrid search.
                    Example - hybrid_search_kwargs={"vector": [0.1, 0.2, 0.3, ...]}
                    https://weaviate.io/developers/weaviate/search/hybrid#with-a-custom-vector
                4) Use a fusion ranking method
                    Example - from weaviate.gql.get import HybridFusion
                    hybrid_search_kwargs={"fusion_type": HybridFusion.RELATIVE_SCORE}
                    https://weaviate.io/developers/weaviate/search/hybrid#fusion-ranking-method

        Returns:
            One Document per result: the ``text_key`` property becomes
            ``page_content`` and the remaining returned properties become the
            metadata.

        Raises:
            ValueError: If the Weaviate response contains an ``errors`` entry.
        """
        query_obj = self.client.query.get(self.index_name, self.attributes)
        if where_filter:
            query_obj = query_obj.with_where(where_filter)
        if score:
            query_obj = query_obj.with_additional(["score", "explainScore"])

        # Seed with the retriever-level alpha and let the caller's kwargs win;
        # passing alpha both explicitly and via ** would raise TypeError.
        hybrid_kwargs = {"alpha": self.alpha, **(hybrid_search_kwargs or {})}

        result = (
            query_obj.with_hybrid(query, **hybrid_kwargs).with_limit(self.k).do()
        )
        if "errors" in result:
            raise ValueError(f"Error during query: {result['errors']}")

        docs = []
        for res in result["data"]["Get"][self.index_name]:
            # Remove the text from the properties so it is not duplicated in
            # the metadata.
            text = res.pop(self.text_key)
            docs.append(Document(page_content=text, metadata=res))
        return docs