多模态 GPT4V Pydantic 程序¶

在这个笔记本中，我们将向您展示如何通过 LlamaIndex 使用新的 OpenAI GPT4V API 生成结构化数据。用户只需要指定一个 Pydantic 对象。

我们还比较了几个用于此任务的大型视觉模型：

GPT4-V
Fuyu-8B
MiniGPT-4
CogVLM
Llava-13B

将图像下载到本地¶

In [ ]:

Copied!

%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate
%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate

In [ ]:

Copied!

import os

OPENAI_API_KEY = "sk-<your-openai-api-token>"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
import os

OPENAI_API_KEY = "sk-"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [ ]:

Copied!

# Replicate credentials: the token is stored in a module-level variable and
# exported through the process environment (presumably where the Replicate
# client looks it up -- confirm against the client docs).
REPLICATE_API_TOKEN = ""  # fill in your Replicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
# Replicate credentials: the token is stored in a module-level variable and
# exported through the process environment (presumably where the Replicate
# client looks it up -- confirm against the client docs).
REPLICATE_API_TOKEN = ""  # fill in your Replicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

In [ ]:

Copied!

from pathlib import Path

# Local directory that receives the downloaded restaurant image.
input_image_path = Path("restaurant_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)
from pathlib import Path

# Local directory that receives the downloaded restaurant image.
input_image_path = Path("restaurant_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)

In [ ]:

Copied!

!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png

初始化用于餐厅的Pydantic类¶

In [ ]:

Copied!





from pydantic import BaseModel


class Restaurant(BaseModel):
    """餐厅的数据模型。"""

    # Target schema for the restaurant image example; this class is handed to
    # PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    restaurant: str
    food: str
    discount: str
    price: str
    rating: str
    review: str

from pydantic import BaseModel


class Restaurant(BaseModel):
    """餐厅的数据模型。"""

    # Target schema for the restaurant image example; this class is handed to
    # PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    restaurant: str
    food: str
    discount: str
    price: str
    rating: str
    review: str

加载OpenAI GPT4V多模态LLM模型¶

In [ ]:

Copied!





# The import statements had been garbled by machine translation
# ("来自 ... 的import ..."), which is a SyntaxError; restored to valid Python.
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader

# Put your local image directory here.
image_documents = SimpleDirectoryReader("./restaurant_images").load_data()

# GPT-4V client reused by every OpenAI program in this notebook.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-vision-preview", api_key=OPENAI_API_KEY, max_new_tokens=1000
)
# The import statements had been garbled by machine translation
# ("来自 ... 的import ..."), which is a SyntaxError; restored to valid Python.
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader

# Put your local image directory here.
image_documents = SimpleDirectoryReader("./restaurant_images").load_data()

# GPT-4V client reused by every OpenAI program in this notebook.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-vision-preview", api_key=OPENAI_API_KEY, max_new_tokens=1000
)

绘图¶

In [ ]:

Copied!





from PIL import Image
import matplotlib.pyplot as plt

imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)
from PIL import Image
import matplotlib.pyplot as plt

imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)

Out[ ]:

<matplotlib.image.AxesImage at 0x2a5cd06d0>

No description has been provided for this image

使用多模态Pydantic程序从GPT4V输出中生成餐厅图片的结构化数据¶

In [ ]:

Copied!





from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser

prompt_template_str = """\
    你能总结一下图片里面的内容
    并以json格式返回答案
"""
openai_program = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Restaurant),
    image_documents=image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser

prompt_template_str = """\
    你能总结一下图片里面的内容
    并以json格式返回答案
"""
openai_program = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Restaurant),
    image_documents=image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)

In [ ]:

Copied!

response = openai_program()
for res in response:
    print(res)
response = openai_program()
for res in response:
    print(res)

('restaurant', 'Not Specified')
('food', '8 Wings or Chicken Poppers')
('discount', 'Black Friday Offer')
('price', '$8.73')
('rating', 'Not Specified')
('review', 'Not Specified')

为MiniGPT-4、Fuyu-8B、LLaVa-13B、CogVLM模型测试Pydantic¶

In [ ]:

Copied!





from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.multi_modal_llms.replicate.base import (
    REPLICATE_MULTI_MODAL_LLM_MODELS,
)

prompt_template_str = """\
    你能总结一下图片中的内容
    并以json格式返回答案
"""


def pydantic_replicate(
    model_name, output_class, image_documents, prompt_template_str
):
    """Run a multi-modal Pydantic program on a Replicate model and print it.

    Args:
        model_name: Key into ``REPLICATE_MULTI_MODAL_LLM_MODELS`` selecting
            the Replicate-hosted model.
        output_class: Class handed to ``PydanticOutputParser`` as the target
            output schema.
        image_documents: Image documents forwarded to the program.
        prompt_template_str: Prompt template forwarded to the program.

    Returns:
        None. The model label and each item of the program's response are
        printed instead.
    """
    # Resolve the model identifier first so an unknown name fails before any
    # client object is built.
    replicate_model_id = REPLICATE_MULTI_MODAL_LLM_MODELS[model_name]
    replicate_llm = ReplicateMultiModal(
        model=replicate_model_id,
        temperature=0.1,
        max_new_tokens=1000,
    )

    parser = PydanticOutputParser(output_class)
    program = MultiModalLLMCompletionProgram.from_defaults(
        output_parser=parser,
        image_documents=image_documents,
        prompt_template_str=prompt_template_str,
        multi_modal_llm=replicate_llm,
        verbose=True,
    )

    parsed = program()
    print(f"模型：{model_name}")
    for item in parsed:
        print(item)
from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.multi_modal_llms.replicate.base import (
    REPLICATE_MULTI_MODAL_LLM_MODELS,
)

prompt_template_str = """\
    你能总结一下图片中的内容
    并以json格式返回答案
"""


def pydantic_replicate(
    model_name, output_class, image_documents, prompt_template_str
):
    """Run a multi-modal Pydantic program on a Replicate model and print it.

    Args:
        model_name: Key into ``REPLICATE_MULTI_MODAL_LLM_MODELS`` selecting
            the Replicate-hosted model.
        output_class: Class handed to ``PydanticOutputParser`` as the target
            output schema.
        image_documents: Image documents forwarded to the program.
        prompt_template_str: Prompt template forwarded to the program.

    Returns:
        None. The model label and each item of the program's response are
        printed instead.
    """
    # Resolve the model identifier first so an unknown name fails before any
    # client object is built.
    replicate_model_id = REPLICATE_MULTI_MODAL_LLM_MODELS[model_name]
    replicate_llm = ReplicateMultiModal(
        model=replicate_model_id,
        temperature=0.1,
        max_new_tokens=1000,
    )

    parser = PydanticOutputParser(output_class)
    program = MultiModalLLMCompletionProgram.from_defaults(
        output_parser=parser,
        image_documents=image_documents,
        prompt_template_str=prompt_template_str,
        multi_modal_llm=replicate_llm,
        verbose=True,
    )

    parsed = program()
    print(f"模型：{model_name}")
    for item in parsed:
        print(item)

使用Fuyu-8B进行Pydantic结构化输出¶

In [ ]:

Copied!

pydantic_replicate("fuyu-8b", Restaurant, image_documents, prompt_template_str)
pydantic_replicate("fuyu-8b", Restaurant, image_documents, prompt_template_str)

使用 LLaVa-13B 生成 Pydantic 结构化输出¶

In [ ]:

Copied!

pydantic_replicate(
    "llava-13b", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate(
    "llava-13b", Restaurant, image_documents, prompt_template_str
)

使用MiniGPT-4生成Pydantic结构化输出¶

In [ ]:

Copied!

pydantic_replicate(
    "minigpt-4", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate(
    "minigpt-4", Restaurant, image_documents, prompt_template_str
)

使用CogVLM生成Pydantic结构化输出¶

In [ ]:

Copied!

pydantic_replicate("cogvlm", Restaurant, image_documents, prompt_template_str)
pydantic_replicate("cogvlm", Restaurant, image_documents, prompt_template_str)

观察：

只有 GPT4-V 在这个图像 pydantic 任务中表现良好
其他视觉模型可能只能输出部分字段

更改为亚马逊产品示例¶

下载亚马逊产品图片截图¶

In [ ]:

Copied!

# Local directory that receives the downloaded Amazon product screenshot.
input_image_path = Path("amazon_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)
# Local directory that receives the downloaded Amazon product screenshot.
input_image_path = Path("amazon_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)

In [ ]:

Copied!

!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png
!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png

初始化亚马逊产品 Pydantic 类¶

In [ ]:

Copied!





from pydantic import BaseModel


class Product(BaseModel):
    """亚马逊产品的数据模型。"""

    # Target schema for the Amazon product screenshot example; this class is
    # handed to PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    title: str
    category: str
    discount: str
    price: str
    rating: str
    review: str
    description: str
    inventory: str

from pydantic import BaseModel


class Product(BaseModel):
    """亚马逊产品的数据模型。"""

    # Target schema for the Amazon product screenshot example; this class is
    # handed to PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    title: str
    category: str
    discount: str
    price: str
    rating: str
    review: str
    description: str
    inventory: str

绘制图像¶

In [ ]:

Copied!

imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)
imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)

Out[ ]:

<matplotlib.image.AxesImage at 0x17b96e010>

使用多模态Pydantic程序从GPT4V输出中生成Amazon产品图片的结构化数据¶

这个项目旨在使用多模态Pydantic程序，从GPT4V生成的文本输出中提取结构化数据，以用于Amazon产品图片。

In [ ]:

Copied!





amazon_image_documents = SimpleDirectoryReader("./amazon_images").load_data()

prompt_template_str = """\
    你能总结一下图片中的内容，并以json格式返回答案\
"""
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Product),
    image_documents=amazon_image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)
amazon_image_documents = SimpleDirectoryReader("./amazon_images").load_data()

prompt_template_str = """\
    你能总结一下图片中的内容，并以json格式返回答案\
"""
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Product),
    image_documents=amazon_image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)

In [ ]:

Copied!

response = openai_program_amazon()
for res in response:
    print(res)
response = openai_program_amazon()
for res in response:
    print(res)

('title', 'Instant Vortex 5.7QT Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '$151.20')
('rating', '4.7 out of 5 stars')
('review', '5086 ratings')
('description', '6-in-1 functionality; air fry, broil, bake, roast, reheat, and dehydrate. EvenCrisp Technology for crispy results. Easy to use touchscreen. Dishwasher safe parts. Cooks food faster and with less oil.')
('inventory', 'In stock')

为MiniGPT-4、Fuyu-8B、LLaVa-13B、CogVLM模型测试Pydantic¶

使用Fuyu-8B进行Pydantic结构化输出¶

In [ ]:

Copied!

pydantic_replicate(
    "fuyu-8b", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
    "fuyu-8b", Product, amazon_image_documents, prompt_template_str
)

使用MiniGPT-4生成Pydantic结构化输出¶

In [ ]:

Copied!

pydantic_replicate(
    "minigpt-4", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
    "minigpt-4", Product, amazon_image_documents, prompt_template_str
)

使用CogVLM生成Pydantic结构化输出¶

在这个示例中，我们将展示如何使用CogVLM为亚马逊产品图片生成Pydantic结构化输出。CogVLM是一个开源的视觉语言模型，而Pydantic是一个用于数据验证和序列化的库。将二者结合，可以把模型对图片的描述解析为符合指定结构的输出数据。

In [ ]:

Copied!

pydantic_replicate(
    "cogvlm", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
    "cogvlm", Product, amazon_image_documents, prompt_template_str
)

Model: cogvlm
('title', 'Instant Vortex 5.7QT Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '151.00')
('rating', '4.5 stars')
('review', "Amazon's Choice")
('description', 'Instant Vortex 5.7QT Air Fryer Oven Combo, From the Makers of Instant Pot, Customizable Smart Cooking Programs, Digital Touchscreen, Nonstick and Dishwasher Safe Basket, App with over 100 Recipes')
('inventory', 'In stock')

使用 LlaVa-13B 生成 Pydantic 结构化输出¶

In [ ]:

Copied!

pydantic_replicate(
    "llava-13b", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
    "llava-13b", Product, amazon_image_documents, prompt_template_str
)

Model: llava-13b
('title', 'Instant Vortex 6.5 Qt Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '$149.99')
('rating', '4.5 out of 5 stars')
('review', '500+ reviews')
('description', 'The Instant Vortex 6.5 Qt Air Fryer Oven Combo is a versatile and customizable small kitchen appliance that can air fry, bake, roast, broil, and dehydrate. It features a digital touchscreen, non-stick safe basket, and dishwasher safe basket, making it easy to use and clean. With over 1200 recipes, cooking programs, and digital touchscreen, this appliance is perfect for anyone looking to simplify their cooking routine.')
('inventory', 'In Stock')

观察：

只有GPT4V、Llava-13B和CogVLM输出了所需的字段
在这3个模型中，GPT4V获得了最准确的结果。Llava-13B和CogVLM得到了错误的价格。

初始化Instagram广告Pydantic类并比较不同的多模态LLM的性能¶

In [ ]:

Copied!

# Local directory that receives the downloaded Instagram ad image.
input_image_path = Path("instagram_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)
# Local directory that receives the downloaded Instagram ad image.
input_image_path = Path("instagram_images")
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(exist_ok=True)

In [ ]:

Copied!

!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png
!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png

In [ ]:

Copied!





from pydantic import BaseModel


class InsAds(BaseModel):
    """Ins广告的数据模型。"""

    # Target schema for the Instagram ad example; this class is handed to
    # PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    account: str
    brand: str
    product: str
    category: str
    discount: str
    price: str
    comments: str
    review: str
    description: str

from pydantic import BaseModel


class InsAds(BaseModel):
    """Ins广告的数据模型。"""

    # Target schema for the Instagram ad example; this class is handed to
    # PydanticOutputParser as the output class.
    # NOTE(review): the docstring above is left untranslated on purpose --
    # pydantic presumably copies it into the model's JSON schema description,
    # which may end up in the prompt. Confirm before changing it.
    # Every field is a required string: no defaults are declared.
    account: str
    brand: str
    product: str
    category: str
    discount: str
    price: str
    comments: str
    review: str
    description: str

In [ ]:

Copied!





from PIL import Image
import matplotlib.pyplot as plt

imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)
from PIL import Image
import matplotlib.pyplot as plt

imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")

plt.figure(figsize=(16, 5))
plt.imshow(image)

Out[ ]:

<matplotlib.image.AxesImage at 0x16a722890>

In [ ]:

Copied!





ins_image_documents = SimpleDirectoryReader("./instagram_images").load_data()

prompt_template_str = """\
    你能总结一下图片中的内容，并以json格式返回答案\
"""
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(InsAds),
    image_documents=ins_image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)


response = openai_program_ins()
for res in response:
    print(res)

ins_image_documents = SimpleDirectoryReader("./instagram_images").load_data()

prompt_template_str = """\
    你能总结一下图片中的内容，并以json格式返回答案\
"""
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(InsAds),
    image_documents=ins_image_documents,
    prompt_template_str=prompt_template_str,
    multi_modal_llm=openai_mm_llm,
    verbose=True,
)


response = openai_program_ins()
for res in response:
    print(res)

('account', 'jordansdaily')
('brand', 'Air Jordan')
('product', 'Air Jordan 2')
('category', 'Footwear')
('discount', 'None')
('price', '$175')
('comments', 'Liked by cemm2k and others')
('review', 'Not available')
('description', "Release date November 18th - Air Jordan 2 'Italy'")

In [ ]:

Copied!

pydantic_replicate("fuyu-8b", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate("fuyu-8b", InsAds, ins_image_documents, prompt_template_str)

In [ ]:

Copied!

pydantic_replicate(
    "llava-13b", InsAds, ins_image_documents, prompt_template_str
)
pydantic_replicate(
    "llava-13b", InsAds, ins_image_documents, prompt_template_str
)

In [ ]:

Copied!

pydantic_replicate("cogvlm", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate("cogvlm", InsAds, ins_image_documents, prompt_template_str)

Model: cogvlm
('account', 'jordansdaily')
('brand', 'AIR JORDAN')
('product', '2')
('category', 'ITALY')
('discount', '')
('price', '$175')
('comments', '')
('review', '')
('description', "AIR JORDAN 2 'ITALY' release NOV 18TH $175")

In [ ]:

Copied!

pydantic_replicate(
    "minigpt-4", InsAds, ins_image_documents, prompt_template_str
)
pydantic_replicate(
    "minigpt-4", InsAds, ins_image_documents, prompt_template_str
)

观察：

只有 GPT4V 和 CogVLM 输出所需字段
在这两个模型中，GPT4V 获得更准确的结果。