基本示例¶
在这个基本示例中,我们将一篇Paul Graham的文章分成片段,使用开源嵌入模型进行嵌入,将其加载到Bagel中,然后进行查询。
In [ ]:
Copied!
%pip install llama-index-vector-stores-bagel
%pip install llama-index-embeddings-huggingface
%pip install bagelML
%pip install llama-index-vector-stores-bagel
%pip install llama-index-embeddings-huggingface
%pip install bagelML
In [ ]:
Copied!
# Imports for the quickstart below. The exported cell had these statements
# collapsed into a single comment line; reconstructed from the names used later
# (VectorStoreIndex, SimpleDirectoryReader, BagelVectorStore, StorageContext,
# Markdown/display, bagel.Client, bagel Settings).
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.bagel import BagelVectorStore

from IPython.display import Markdown, display

import bagel
from bagel import Settings
In [ ]:
Copied!
# Set up OpenAI: read the API key interactively (never hard-code secrets)
# and expose it both via the environment and the openai module.
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]
下载数据
In [ ]:
Copied!
# Download the sample essay into data/paul_graham/ (cell was duplicated by the
# notebook export; one mkdir + one wget is the complete set).
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
In [ ]:
Copied!
# Create the server settings for the hosted Bagel REST API.
server_settings = Settings(
    bagel_api_impl="rest",
    bagel_server_host="api.bageldb.ai",
)

# Create the client.
client = bagel.Client(server_settings)

# Create (or fetch) the collection. dimension=384 matches the output size of
# the bge-small embedding model selected below — TODO confirm against the model card.
collection = client.get_or_create_cluster(
    "testing_embeddings", embedding_model="custom", dimension=384
)

# Embedding function: a local HuggingFace model (LlamaIndex "local:" shorthand).
embed_model = "local:BAAI/bge-small-en-v1.5"

# Load the documents downloaded above.
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

# Set up the BagelVectorStore and index the documents into it.
vector_store = BagelVectorStore(collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Query the index. The <b>…</b> wrapper is HTML bolding for notebook display.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(f"<b>{response}</b>")
创建 - 添加 - 获取¶
In [ ]:
Copied!
def create_add_get(client):
    """Create a cluster, add two documents, and read them back.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client.
    """
    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add documents to the cluster (the response is not needed, so it is
    # not bound — the original kept an unused `resp` local).
    cluster.add(
        documents=[
            "这是文档1",
            "这是Bidhan",
        ],
        metadatas=[{"来源": "谷歌"}, {"来源": "notion"}],
        ids=[str(uuid.uuid4()), str(uuid.uuid4())],
    )

    # Print the document count.
    print("文档数量:", cluster.count())

    # Fetch the first item to confirm the write landed.
    first_item = cluster.peek(1)
    if first_item:
        print("获取第一项")

    print(">> 创建、添加和获取完成!\n")
创建 - 添加 - 按文本查找¶
In [ ]:
Copied!
def create_add_find(client):
    """Create a cluster, add documents, and query by similar text.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add documents to the cluster.
    cluster.add(
        documents=[
            "这是文档",
            "这是Towhid",
            "这是文本",
        ],
        metadatas=[
            {"source": "notion"},
            {"source": "notion"},
            {"source": "google-doc"},
        ],
        ids=[str(uuid.uuid4()), str(uuid.uuid4()), str(uuid.uuid4())],
    )

    # Query the cluster for similar results, filtered by metadata source
    # and by document content containing "is".
    results = cluster.find(
        query_texts=["这"],
        n_results=5,
        where={"source": "notion"},
        where_document={"$contains": "is"},
    )

    print(results)
    print(">> create_add_find 完成 !\n")
创建 - 添加 - 通过嵌入查找¶
In [ ]:
Copied!
def create_add_find_em(client):
    """Create a cluster, add raw embeddings, and query by embedding.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing_embeddings"

    # Reset the Bagel server so the run starts from a clean state.
    client.reset()

    # Get or create a cluster.
    # BUG FIX: the original called `api.get_or_create_cluster(name)`, but no
    # `api` name is in scope — the parameter is `client` (NameError at runtime).
    cluster = client.get_or_create_cluster(name)

    # Add embeddings plus per-item metadata, documents, and ids.
    cluster.add(
        embeddings=[
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
        ],
        metadatas=[
            {"uri": "img1.png", "style": "style1"},
            {"uri": "img2.png", "style": "style2"},
            {"uri": "img3.png", "style": "style1"},
            {"uri": "img4.png", "style": "style1"},
            {"uri": "img5.png", "style": "style1"},
            {"uri": "img6.png", "style": "style1"},
            {"uri": "img7.png", "style": "style1"},
            {"uri": "img8.png", "style": "style1"},
        ],
        documents=[
            "doc1",
            "doc2",
            "doc3",
            "doc4",
            "doc5",
            "doc6",
            "doc7",
            "doc8",
        ],
        ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
    )

    # Query the cluster by a raw embedding vector.
    results = cluster.find(query_embeddings=[[1.1, 2.3, 3.2]], n_results=5)

    print("查找结果:", results)
    print(">> 创建、添加和查找嵌入完成 !\n")
创建 - 添加 - 修改 - 更新¶
In [ ]:
Copied!
def create_add_modify_update(client):
    """Create a cluster, rename it, add documents, and update metadata.

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    name = "testing"
    new_name = "new_" + name

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Rename the cluster, printing the name before and after.
    print("之前:", cluster.name)
    cluster.modify(name=new_name)
    print("之后:", cluster.name)

    # Add documents to the cluster.
    cluster.add(
        documents=[
            "这是文档1",
            "这是bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )

    # Show the document metadata before the update.
    print("更新前:")
    print(cluster.get(ids=["id1"]))

    # Update the metadata of the first document.
    cluster.update(ids=["id1"], metadatas=[{"source": "google"}])

    # Show the document metadata after the update.
    print("更新后的来源:")
    print(cluster.get(ids=["id1"]))

    print(">> create_add_modify_update 完成!\n")
在数据库中，"upsert" 是指在执行插入操作时，如果记录已经存在则更新该记录，如果不存在则插入新记录。Bagel 的 `cluster.upsert` 提供的正是这种语义（类似于 SQL 中的 "INSERT ... ON CONFLICT DO UPDATE"），下面的示例会更新已有的 id1 并插入新的 id3。
In [ ]:
Copied!
def create_upsert(client):
    """Create a cluster, add documents, then upsert (update id1, insert id3).

    Parameters
    ----------
    client : bagel.Client
        Connected Bagel API client. (The original docstring documented a
        nonexistent ``api`` parameter with placeholder text.)
    """
    # Reset the Bagel server so the run starts from a clean state.
    # BUG FIX: the original called `api.reset()`, but no `api` name is in
    # scope — the parameter is `client` (NameError at runtime).
    client.reset()

    name = "testing"

    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)

    # Add the initial documents.
    cluster.add(
        documents=[
            "这是文档1",
            "这是Bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )

    # Upsert: "id1" already exists so it is updated; "id3" is new so it is
    # inserted.
    cluster.upsert(
        documents=[
            "这是文档",
            "这是谷歌",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id3"],
    )

    # Print the resulting document count.
    print("文档数量:", cluster.count())
    print(">> 创建并更新完成!\n")