Qdrant Hybrid Search¶
Qdrant supports hybrid search by combining the search results of sparse and dense vectors.
Dense vectors are the ones you have likely already been using -- embedding models from OpenAI, BGE, SentenceTransformers, etc. are typically dense embedding models. They create a numerical representation of a piece of text as a long list of numbers. These dense vectors can capture rich semantics across the entire piece of text.
Sparse vectors are slightly different. They are produced by specialized approaches or models (TF-IDF, BM25, SPLADE, etc.), and the resulting vectors are mostly zeros, which is what makes them sparse. Sparse vectors excel at capturing specific keywords and similar small details.
This notebook walks through how to set up and customize hybrid search with Qdrant and the "prithvida/Splade_PP_en_v1" variants from Huggingface.
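To make the difference concrete, here is a toy sketch of the two representations (illustrative numbers only, not real model output):
# a dense vector: every dimension carries a value, and the length is fixed
# by the embedding model (e.g. 1536 for OpenAI's text embeddings)
dense_vector = [0.012, -0.348, 0.077, 0.291]  # ...one value per dimension
# a sparse vector: almost every dimension is zero, so it is stored as
# (index, value) pairs over a vocabulary-sized space
sparse_indices = [17, 2049, 30522]  # which dimensions are non-zero
sparse_values = [0.84, 0.31, 1.02]  # the weight of each non-zero dimension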
Setup¶
First, we set up our environment and load our data.
%pip install -U llama-index llama-index-vector-stores-qdrant fastembed
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
!mkdir -p 'data/'
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data/").load_data()
Indexing Data¶
Now, we can index our data.
Hybrid search with Qdrant must be enabled from the start -- we can simply set enable_hybrid=True.
This will run sparse vector generation locally using "prithvida/Splade_PP_en_v1" with fastembed, while dense vectors are generated with OpenAI.
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient

# creates a persistent index to disk
client = QdrantClient(host="localhost", port=6333)
aclient = AsyncQdrantClient(host="localhost", port=6333)

# create our vector store with hybrid indexing enabled
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
    "llama2_paper",
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    batch_size=20,
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.chunk_size = 512

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)
Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.
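The message above refers to Qdrant's local modes. If you do not have a Qdrant server running, the clients can also be constructed in local mode; a minimal sketch with the sync client only (the path is just an example):
from qdrant_client import QdrantClient

# on-disk local mode -- no server required; ":memory:" also works, but then
# data is not shared between separate client instances (as noted above)
local_client = QdrantClient(path="./qdrant_local")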
# sparse_top_k controls how many nodes the sparse query retrieves;
# similarity_top_k is the final number of nodes returned after fusion
query_engine = index.as_query_engine(
    similarity_top_k=2, sparse_top_k=12, vector_store_query_mode="hybrid"
)
from IPython.display import display, Markdown
response = query_engine.query(
"How was Llama2 specifically trained differently from Llama1?"
)
display(Markdown(str(response)))
Llama 2 was specifically trained differently from Llama 1 by making changes such as performing more robust data cleaning, updating data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention (GQA) to improve inference scalability for larger models. Additionally, Llama 2 adopted most of the pretraining setting and model architecture from Llama 1 but included architectural enhancements like increased context length and grouped-query attention.
print(len(response.source_nodes))
2
Let's compare against the baseline of not using hybrid search at all!
from IPython.display import display, Markdown
query_engine = index.as_query_engine(
similarity_top_k=2,
# sparse_top_k=10,
# vector_store_query_mode="hybrid"
)
response = query_engine.query(
"How was Llama2 specifically trained differently from Llama1?"
)
display(Markdown(str(response)))
Llama 2 was specifically trained differently from Llama 1 by making changes to improve performance, such as performing more robust data cleaning, updating data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention (GQA) to improve inference scalability for larger models.
Async Support¶
Naturally, Qdrant also supports async queries (note that data from an in-memory Qdrant instance is not shared between the async and sync clients!).
import nest_asyncio
nest_asyncio.apply()
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
# create our vector store with hybrid indexing enabled
vector_store = QdrantVectorStore(
    collection_name="llama2_paper",
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    batch_size=20,
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.chunk_size = 512

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    use_async=True,
)
query_engine = index.as_query_engine(similarity_top_k=2, sparse_top_k=10)
response = await query_engine.aquery(
"What baseline models are measured against in the paper?"
)
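Customizing Sparse Vector Generation¶
You can also supply your own sparse-encoding functions instead of the fastembed default. The code below loads the "naver/efficient-splade-VI-BT-large-doc" and "naver/efficient-splade-VI-BT-large-query" SPLADE models with transformers, defines document- and query-side encoding functions, and passes them to the vector store via sparse_doc_fn and sparse_query_fn.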
from typing import Any, List, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
doc_tokenizer = AutoTokenizer.from_pretrained(
"naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
"naver/efficient-splade-VI-BT-large-doc"
)
query_tokenizer = AutoTokenizer.from_pretrained(
"naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
"naver/efficient-splade-VI-BT-large-query"
)
def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    if torch.cuda.is_available():
        tokens = tokens.to("cuda")

    output = doc_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the non-zero values and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs
def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    # TODO: compute sparse vectors in batches if max length is exceeded
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    if torch.cuda.is_available():
        tokens = tokens.to("cuda")

    output = query_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the non-zero values and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs
vector_store = QdrantVectorStore(
"llama2_paper",
client=client,
enable_hybrid=True,
sparse_doc_fn=sparse_doc_vectors,
sparse_query_fn=sparse_query_vectors,
)
Customizing hybrid_fusion_fn()¶
By default, when running hybrid queries with Qdrant, relative score fusion is used to combine the nodes retrieved from both the sparse and dense queries.
You can customize this function to be any other approach (plain deduplication, reciprocal rank fusion, etc.); a small reciprocal-rank sketch follows after the default code below.
Here is the default code for our relative score fusion approach and how to pass it into the constructor.
from llama_index.core.vector_stores import VectorStoreQueryResult
def relative_score_fusion(
    dense_result: VectorStoreQueryResult,
    sparse_result: VectorStoreQueryResult,
    alpha: float = 0.5,  # passed in from the query engine
    top_k: int = 2,  # passed in from the query engine i.e. similarity_top_k
) -> VectorStoreQueryResult:
    """
    Fuse dense and sparse results using relative score fusion.
    """
    # sanity check
    assert dense_result.nodes is not None
    assert dense_result.similarities is not None
    assert sparse_result.nodes is not None
    assert sparse_result.similarities is not None

    # deconstruct the results
    sparse_result_tuples = list(
        zip(sparse_result.similarities, sparse_result.nodes)
    )
    sparse_result_tuples.sort(key=lambda x: x[0], reverse=True)

    dense_result_tuples = list(
        zip(dense_result.similarities, dense_result.nodes)
    )
    dense_result_tuples.sort(key=lambda x: x[0], reverse=True)

    # track nodes from both results
    all_nodes_dict = {x.node_id: x for x in dense_result.nodes}
    for node in sparse_result.nodes:
        if node.node_id not in all_nodes_dict:
            all_nodes_dict[node.node_id] = node

    # normalize sparse similarities from 0 to 1
    sparse_similarities = [x[0] for x in sparse_result_tuples]
    max_sparse_sim = max(sparse_similarities)
    min_sparse_sim = min(sparse_similarities)
    sparse_similarities = [
        (x - min_sparse_sim) / (max_sparse_sim - min_sparse_sim)
        for x in sparse_similarities
    ]
    sparse_per_node = {
        sparse_result_tuples[i][1].node_id: x
        for i, x in enumerate(sparse_similarities)
    }

    # normalize dense similarities from 0 to 1
    dense_similarities = [x[0] for x in dense_result_tuples]
    max_dense_sim = max(dense_similarities)
    min_dense_sim = min(dense_similarities)
    dense_similarities = [
        (x - min_dense_sim) / (max_dense_sim - min_dense_sim)
        for x in dense_similarities
    ]
    dense_per_node = {
        dense_result_tuples[i][1].node_id: x
        for i, x in enumerate(dense_similarities)
    }

    # fuse the scores
    fused_similarities = []
    for node_id in all_nodes_dict:
        sparse_sim = sparse_per_node.get(node_id, 0)
        dense_sim = dense_per_node.get(node_id, 0)
        fused_sim = alpha * (sparse_sim + dense_sim)
        fused_similarities.append((fused_sim, all_nodes_dict[node_id]))

    fused_similarities.sort(key=lambda x: x[0], reverse=True)
    fused_similarities = fused_similarities[:top_k]

    # create the final response object
    return VectorStoreQueryResult(
        nodes=[x[1] for x in fused_similarities],
        similarities=[x[0] for x in fused_similarities],
        ids=[x[1].node_id for x in fused_similarities],
    )
vector_store = QdrantVectorStore(
"llama2_paper",
client=client,
enable_hybrid=True,
hybrid_fusion_fn=relative_score_fusion,
)
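As mentioned above, reciprocal rank fusion (RRF) is one alternative. Here is a minimal sketch that mirrors the (dense_result, sparse_result, alpha, top_k) signature of relative_score_fusion; it is an illustration, not LlamaIndex's built-in implementation:
from llama_index.core.vector_stores import VectorStoreQueryResult


def reciprocal_rank_fusion(
    dense_result: VectorStoreQueryResult,
    sparse_result: VectorStoreQueryResult,
    alpha: float = 0.5,  # unused here, kept only for signature compatibility
    top_k: int = 2,
) -> VectorStoreQueryResult:
    """Fuse dense and sparse results by rank instead of raw score."""
    k = 60  # standard RRF smoothing constant
    scores = {}
    all_nodes = {}

    for result in (dense_result, sparse_result):
        # sort each result best-first by its own similarity scores
        pairs = sorted(
            zip(result.similarities or [], result.nodes or []),
            key=lambda x: x[0],
            reverse=True,
        )
        for rank, (_, node) in enumerate(pairs):
            all_nodes[node.node_id] = node
            scores[node.node_id] = scores.get(node.node_id, 0.0) + 1.0 / (
                k + rank + 1
            )

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return VectorStoreQueryResult(
        nodes=[all_nodes[node_id] for node_id, _ in ranked],
        similarities=[score for _, score in ranked],
        ids=[node_id for node_id, _ in ranked],
    )


# it can be passed to the constructor just like relative_score_fusion above:
# vector_store = QdrantVectorStore(
#     "llama2_paper",
#     client=client,
#     enable_hybrid=True,
#     hybrid_fusion_fn=reciprocal_rank_fusion,
# )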
You may have noticed the alpha parameter in relative_score_fusion above. It can be set directly in the as_query_engine() call, which sets it on the underlying vector index retriever.
index.as_query_engine(alpha=0.5, similarity_top_k=2)
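Customizing the Qdrant Collection¶
Instead of letting QdrantVectorStore create the collection, you can create it yourself ahead of time with a named dense vector and a named sparse vector (the "text-dense" / "text-sparse" names here match what the vector store looks for by default), and then attach the vector store to it with hybrid mode enabled: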
from qdrant_client import models

client.recreate_collection(
    collection_name="llama2_paper",
    vectors_config={
        "text-dense": models.VectorParams(
            size=1536,  # openai vector size
            distance=models.Distance.COSINE,
        )
    },
    sparse_vectors_config={
        "text-sparse": models.SparseVectorParams(
            index=models.SparseIndexParams()
        )
    },
)

# enable hybrid since we created a sparse collection
vector_store = QdrantVectorStore(
    collection_name="llama2_paper", client=client, enable_hybrid=True
)