基本示例¶
在这个基本示例中,我们将一篇Paul Graham的文章分成片段,使用开源嵌入模型进行嵌入,将其加载到Bagel中,然后进行查询。
In [ ]:
Copied!
%pip install llama-index-vector-stores-bagel
%pip install llama-index-embeddings-huggingface
%pip install bagelML
%pip install llama-index-vector-stores-bagel
%pip install llama-index-embeddings-huggingface
%pip install bagelML
In [ ]:
Copied!
# Imports for the quickstart below. The exported cell had these statements
# collapsed into a single comment line; reconstructed from the names used later
# (VectorStoreIndex, SimpleDirectoryReader, BagelVectorStore, StorageContext,
# Markdown/display, bagel.Client, bagel Settings).
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.bagel import BagelVectorStore

from IPython.display import Markdown, display

import bagel
from bagel import Settings
In [ ]:
Copied!
# Set up OpenAI: read the API key interactively (never hard-code secrets)
# and expose it both via the environment and the openai module.
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]
下载数据
In [ ]:
Copied!
# Download the sample essay into data/paul_graham/ (cell was duplicated by the
# notebook export; one mkdir + one wget is the complete set).
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
In [ ]:
Copied!
# Create the server settings for the hosted Bagel REST API.
server_settings = Settings(
    bagel_api_impl="rest",
    bagel_server_host="api.bageldb.ai",
)

# Create the client.
client = bagel.Client(server_settings)

# Create (or fetch) the collection. dimension=384 matches the output size of
# the bge-small embedding model selected below — TODO confirm against the model card.
collection = client.get_or_create_cluster(
    "testing_embeddings", embedding_model="custom", dimension=384
)

# Embedding function: a local HuggingFace model (LlamaIndex "local:" shorthand).
embed_model = "local:BAAI/bge-small-en-v1.5"

# Load the documents downloaded above.
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

# Set up the BagelVectorStore and index the documents into it.
vector_store = BagelVectorStore(collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Query the index. The <b>…</b> wrapper is HTML bolding for notebook display.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(f"<b>{response}</b>")
创建 - 添加 - 获取¶
In [ ]:
Copied!
def create_add_get(client):
    """Create a cluster, add two documents, and read them back.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client.
    """
    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add documents to the cluster (the response is not needed, so it is
    # not bound — the original kept an unused `resp` local).
    cluster.add(
        documents=[
            "这是文档1",
            "这是Bidhan",
        ],
        metadatas=[{"来源": "谷歌"}, {"来源": "notion"}],
        ids=[str(uuid.uuid4()), str(uuid.uuid4())],
    )

    # Print the document count.
    print("文档数量:", cluster.count())

    # Fetch the first item to confirm the write landed.
    first_item = cluster.peek(1)
    if first_item:
        print("获取第一项")

    print(">> 创建、添加和获取完成!\n")
创建 - 添加 - 按文本查找¶
In [ ]:
Copied!
def create_add_find(client):
    """Create a cluster, add documents, and query by similar text.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add documents to the cluster.
    cluster.add(
        documents=[
            "这是文档",
            "这是Towhid",
            "这是文本",
        ],
        metadatas=[
            {"source": "notion"},
            {"source": "notion"},
            {"source": "google-doc"},
        ],
        ids=[str(uuid.uuid4()), str(uuid.uuid4()), str(uuid.uuid4())],
    )

    # Query the cluster for similar results, filtered by metadata source
    # and by document content containing "is".
    results = cluster.find(
        query_texts=["这"],
        n_results=5,
        where={"source": "notion"},
        where_document={"$contains": "is"},
    )

    print(results)
    print(">> create_add_find 完成 !\n")
创建 - 添加 - 通过嵌入查找¶
In [ ]:
Copied!
def create_add_find_em(client):
    """Create a cluster, add raw embeddings, and query by embedding.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing_embeddings"

    # Reset the Bagel server so the run starts from a clean state.
    client.reset()

    # Get or create a cluster.
    # BUG FIX: the original called `api.get_or_create_cluster(name)`, but no
    # `api` name is in scope — the parameter is `client` (NameError at runtime).
    cluster = client.get_or_create_cluster(name)

    # Add embeddings plus per-item metadata, documents, and ids.
    cluster.add(
        embeddings=[
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
        ],
        metadatas=[
            {"uri": "img1.png", "style": "style1"},
            {"uri": "img2.png", "style": "style2"},
            {"uri": "img3.png", "style": "style1"},
            {"uri": "img4.png", "style": "style1"},
            {"uri": "img5.png", "style": "style1"},
            {"uri": "img6.png", "style": "style1"},
            {"uri": "img7.png", "style": "style1"},
            {"uri": "img8.png", "style": "style1"},
        ],
        documents=[
            "doc1",
            "doc2",
            "doc3",
            "doc4",
            "doc5",
            "doc6",
            "doc7",
            "doc8",
        ],
        ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
    )

    # Query the cluster by a raw embedding vector.
    results = cluster.find(query_embeddings=[[1.1, 2.3, 3.2]], n_results=5)

    print("查找结果:", results)
    print(">> 创建、添加和查找嵌入完成 !\n")
创建 - 添加 - 修改 - 更新¶
In [ ]:
Copied!
def create_add_modify_update(client):
    """Create a cluster, rename it, add documents, and update metadata.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing"
    new_name = "new_" + name

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Rename the cluster, printing the name before and after.
    print("之前:", cluster.name)
    cluster.modify(name=new_name)
    print("之后:", cluster.name)

    # Add documents to the cluster.
    cluster.add(
        documents=[
            "这是文档1",
            "这是bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )

    # Show the document metadata before the update.
    print("更新前:")
    print(cluster.get(ids=["id1"]))

    # Update the metadata of the first document.
    cluster.update(ids=["id1"], metadatas=[{"source": "google"}])

    # Show the document metadata after the update.
    print("更新后的来源:")
    print(cluster.get(ids=["id1"]))

    print(">> create_add_modify_update 完成!\n")
在数据库中，"upsert" 是指在执行插入操作时，如果记录已经存在则更新该记录，如果不存在则插入新记录。Bagel 的 `cluster.upsert` 提供的正是这种语义（类似于 SQL 中的 "INSERT ... ON CONFLICT DO UPDATE"），下面的示例会更新已有的 id1 并插入新的 id3。
In [ ]:
Copied!
def create_upsert(client):
    """Create a cluster, add documents, then upsert (update id1, insert id3).

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    # Reset the Bagel server so the run starts from a clean state.
    # BUG FIX: the original called `api.reset()`, but no `api` name is in
    # scope — the parameter is `client` (NameError at runtime).
    client.reset()

    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add the initial documents.
    cluster.add(
        documents=[
            "这是文档1",
            "这是Bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )

    # Upsert: "id1" already exists so it is updated; "id3" is new so it is
    # inserted.
    cluster.upsert(
        documents=[
            "这是文档",
            "这是谷歌",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id3"],
    )

    # Print the resulting document count.
    print("文档数量:", cluster.count())
    print(">> 创建并更新完成!\n")