DocArray Hnsw Vector Store¶

DocArrayHnswVectorStore 是由DocArray提供的轻量级文档索引实现，完全在本地运行，最适合小到中等大小的数据集。它将向量存储在hnswlib中，并将所有其他数据存储在SQLite中。

如果您在colab上打开这个笔记本，您可能需要安装LlamaIndex 🦙。

In [ ]:

Copied!

%pip install llama-index-vector-stores-docarray
%pip install llama-index-vector-stores-docarray

In [ ]:

Copied!

!pip install llama-index
!pip install llama-index

In [ ]:

Copied!





import os
import sys
import logging
import textwrap

import warnings

warnings.filterwarnings("ignore")

# 停止huggingface的警告
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 取消注释以查看调试日志
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import (
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    Document,
)
from llama_index.vector_stores.docarray import DocArrayHnswVectorStore
from IPython.display import Markdown, display
import os
import sys
import logging
import textwrap

import warnings

warnings.filterwarnings("ignore")

# 停止huggingface的警告
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 取消注释以查看调试日志
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import (
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    Document,
)
from llama_index.vector_stores.docarray import DocArrayHnswVectorStore
from IPython.display import Markdown, display

In [ ]:

Copied!

import os

os.environ["OPENAI_API_KEY"] = "<your openai key>"
import os

os.environ["OPENAI_API_KEY"] = ""

下载数据

In [ ]:

Copied!

!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

In [ ]:

Copied!





# 加载文档
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(
    "文档ID:",
    documents[0].doc_id,
    "文档哈希:",
    documents[0].doc_hash,
)
# 加载文档
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(
    "文档ID:",
    documents[0].doc_id,
    "文档哈希:",
    documents[0].doc_hash,
)

Document ID: 07d9ca27-ded0-46fa-9165-7e621216fd47 Document Hash: 77ae91ab542f3abb308c4d7c77c9bc4c9ad0ccd63144802b7cbe7e1bb3a4094e

初始化和索引¶

In [ ]:

Copied!





from llama_index.core import StorageContext


vector_store = DocArrayHnswVectorStore(work_dir="hnsw_index")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
from llama_index.core import StorageContext


vector_store = DocArrayHnswVectorStore(work_dir="hnsw_index")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

查询¶

In [ ]:

Copied!





# 将日志级别设置为DEBUG，以获得更详细的输出
query_engine = index.as_query_engine()
response = query_engine.query("作者在成长过程中做了什么？")
print(textwrap.fill(str(response), 100))
# 将日志级别设置为DEBUG，以获得更详细的输出
query_engine = index.as_query_engine()
response = query_engine.query("作者在成长过程中做了什么？")
print(textwrap.fill(str(response), 100))

Token indices sequence length is longer than the specified maximum sequence length for this model (1830 > 1024). Running this sequence through the model will result in indexing errors

 Growing up, the author wrote short stories, programmed on an IBM 1401, and nagged his father to buy
him a TRS-80 microcomputer. He wrote simple games, a program to predict how high his model rockets
would fly, and a word processor. He also studied philosophy in college, but switched to AI after
becoming bored with it. He then took art classes at Harvard and applied to art schools, eventually
attending RISD.

In [ ]:

Copied!

response = query_engine.query("What was a hard moment for the author?")
print(textwrap.fill(str(response), 100))
response = query_engine.query("What was a hard moment for the author?")
print(textwrap.fill(str(response), 100))

 A hard moment for the author was when he realized that the AI programs of the time were a hoax and
that there was an unbridgeable gap between what they could do and actually understanding natural
language.

使用过滤器进行查询¶

In [ ]:

Copied!





from llama_index.core.schema import TextNode

nodes = [
    TextNode(
        text="The Shawshank Redemption",
        metadata={
            "author": "Stephen King",
            "theme": "Friendship",
        },
    ),
    TextNode(
        text="The Godfather",
        metadata={
            "director": "Francis Ford Coppola",
            "theme": "Mafia",
        },
    ),
    TextNode(
        text="Inception",
        metadata={
            "director": "Christopher Nolan",
        },
    ),
]
from llama_index.core.schema import TextNode

nodes = [
    TextNode(
        text="The Shawshank Redemption",
        metadata={
            "author": "Stephen King",
            "theme": "Friendship",
        },
    ),
    TextNode(
        text="The Godfather",
        metadata={
            "director": "Francis Ford Coppola",
            "theme": "Mafia",
        },
    ),
    TextNode(
        text="Inception",
        metadata={
            "director": "Christopher Nolan",
        },
    ),
]

In [ ]:

Copied!

from llama_index.core import StorageContext

vector_store = DocArrayHnswVectorStore(work_dir="hnsw_filters")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = GPTVectorStoreIndex(nodes, storage_context=storage_context)
from llama_index.core import StorageContext

vector_store = DocArrayHnswVectorStore(work_dir="hnsw_filters")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = GPTVectorStoreIndex(nodes, storage_context=storage_context)

In [ ]:

Copied!

from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

filters = MetadataFilters(
    filters=[ExactMatchFilter(key="theme", value="Mafia")]
)

retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is inception about?")
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

filters = MetadataFilters(
    filters=[ExactMatchFilter(key="theme", value="Mafia")]
)

retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is inception about?")

Out[ ]:

[NodeWithScore(node=Node(text='director: Francis Ford Coppola\ntheme: Mafia\n\nThe Godfather', doc_id='d96456bf-ef6e-4c1b-bdb8-e90a37d881f3', embedding=None, doc_hash='b770e43e6a94854a22dc01421d3d9ef6a94931c2b8dbbadf4fdb6eb6fbe41010', extra_info=None, node_info=None, relationships={<DocumentRelationship.SOURCE: '1'>: 'None'}), score=0.4634347)]

In [ ]:

Copied!





# 删除已创建的索引
import os, shutil

hnsw_dirs = ["hnsw_filters", "hnsw_index"]
for dir in hnsw_dirs:
    if os.path.exists(dir):
        shutil.rmtree(dir)
# 删除已创建的索引
import os, shutil

hnsw_dirs = ["hnsw_filters", "hnsw_index"]
for dir in hnsw_dirs:
    if os.path.exists(dir):
        shutil.rmtree(dir)