跳到主要内容

如何使用带有函数调用的GPT-4 Vision

nbviewer

新的GPT-4 Turbo模型,截至2024年4月可作为gpt-4-turbo-2024-04-09使用,现在支持具有视觉能力的函数调用,具有更好的推理能力和截止日期为2023年12月的知识。将图像与函数调用结合使用,将解锁多模态用例和推理能力,使您能够超越OCR和图像描述。

我们将通过两个示例来演示如何使用带有GPT-4 Turbo Vision的函数调用:

  1. 模拟客户服务助手以支持交付异常
  2. 分析组织结构图以提取员工信息

安装和设置

!pip install pymupdf --quiet
!pip install openai --quiet
!pip install matplotlib --quiet
# Instructor 库让使用函数调用变得简单易行。
!pip install instructor --quiet

import base64
import os
from enum import Enum
from io import BytesIO
from typing import Iterable
from typing import List
from typing import Literal, Optional

import fitz
# Instructor 基于 Pydantic 构建,而 Pydantic 则依赖于类型提示。模式验证和提示功能均通过类型注解来控制。
import instructor
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from PIL import Image
from openai import OpenAI
from pydantic import BaseModel, Field

1. 模拟一个用于处理投递异常支持的客服助手

我们将模拟一个用于处理投递异常支持的客服助手,该助手能够分析包裹的图像。根据图像分析,助手将执行以下操作: - 如果图像中的包裹看起来受损,根据政策自动处理退款。 - 如果包裹看起来潮湿,启动替换流程。 - 如果包裹看起来正常且未受损,升级至客服代理处理。

让我们来看看客服助理将分析的样本包裹图片,以确定适当的操作。我们将把这些图片编码为base64字符串,以便模型处理。

# 将图像编码为base64的函数
def encode_image(image_path: str):
    """Read an image file from disk and return it as a base64 string.

    Args:
        image_path: filesystem path of the image to encode.

    Returns:
        The file contents, base64-encoded and decoded to a UTF-8 str.

    Raises:
        FileNotFoundError: if *image_path* does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')


# Sample images used for testing.
image_dir = "images"

# Encode every image in the directory, keyed by its filename
# without the extension (e.g. "wet_package.jpg" -> "wet_package").
image_data = {}
for image_file in os.listdir(image_dir):
    name, _, _ext = image_file.partition('.')
    image_data[name] = encode_image(os.path.join(image_dir, image_file))
    print(f"Encoded image: {image_file}")


def display_images(image_data: dict):
    """Show each base64-encoded image in *image_data* side by side.

    Args:
        image_data: mapping of title -> base64-encoded image bytes.

    The original hard-coded a 1x3 subplot grid, which broke (IndexError or
    empty axes) whenever the directory did not contain exactly three images.
    The grid now matches the number of images; squeeze=False keeps `axs`
    2-D so indexing works even for a single image.
    """
    n = max(len(image_data), 1)
    fig, axs = plt.subplots(1, n, figsize=(6 * n, 6), squeeze=False)
    for ax, (key, value) in zip(axs[0], image_data.items()):
        img = Image.open(BytesIO(base64.b64decode(value)))
        ax.imshow(img)
        ax.axis("off")
        ax.set_title(key)
    plt.tight_layout()
    plt.show()


display_images(image_data)

Encoded image: wet_package.jpg
Encoded image: damaged_package.jpg
Encoded image: normal_package.jpg

我们已成功将示例图片编码为base64字符串并显示出来。客服助理将分析这些图片,以确定基于包裹状况采取适当的行动。

现在让我们定义用于订单处理的函数/工具,比如将订单升级给代理、退款订单和替换订单。我们将创建占位符函数来模拟基于识别工具的这些操作的处理。我们将使用Pydantic模型来定义订单操作数据的结构。

MODEL = "gpt-4-turbo-2024-04-09"


class Order(BaseModel):
    """An order with its ID, product name, price, status, and delivery date."""

    # The Field descriptions are surfaced to the model through the
    # function-calling schema, so keep them precise.
    order_id: str = Field(..., description="The unique identifier of the order")
    product_name: str = Field(..., description="The name of the product")
    price: float = Field(..., description="The price of the product")
    status: str = Field(..., description="The status of the order")
    delivery_date: str = Field(..., description="The delivery date of the order")


# Placeholder functions for order processing.

def get_order_details(order_id):
    """Placeholder: return stub Order details for the given order ID."""
    return Order(
        order_id=order_id,
        status="Delivered",
        product_name="Product X",
        price=100.0,
        delivery_date="2024-04-10",
    )

def escalate_to_agent(order: Order, message: str):
    """Placeholder: escalate the order to a human agent with *message*."""
    return f"Order {order.order_id} has been escalated to an agent with message: `{message}`"

def refund_order(order: Order):
    """Placeholder: process a refund for the given order."""
    return f"Order {order.order_id} has been refunded successfully."

def replace_order(order: Order):
    """Placeholder: replace the given order with a new one."""
    return f"Order {order.order_id} has been replaced with a new order."

class FunctionCallBase(BaseModel):
    """Common schema for the tool calls the assistant can emit for a package image."""

    rationale: Optional[str] = Field(..., description="The reason for the action.")
    image_description: Optional[str] = Field(
        ..., description="The detailed description of the package image."
    )
    action: Literal["escalate_to_agent", "replace_order", "refund_order"]
    message: Optional[str] = Field(
        ...,
        description="The message to be escalated to the agent if action is escalate_to_agent",
    )

    def __call__(self, order_id):
        """Look up the order for *order_id* and run the selected action on it."""
        order: Order = get_order_details(order_id=order_id)
        # Dispatch on the action the model chose.
        if self.action == "escalate_to_agent":
            return escalate_to_agent(order, self.message)
        if self.action == "replace_order":
            return replace_order(order)
        if self.action == "refund_order":
            return refund_order(order)

class EscalateToAgent(FunctionCallBase):
    """Escalate to a customer service agent for further assistance."""
    pass

class OrderActionBase(FunctionCallBase):
    # Common base for tool calls that act directly on an order.
    pass

class ReplaceOrder(OrderActionBase):
    """Tool call to replace an order."""
    pass

class RefundOrder(OrderActionBase):
    """Tool call to refund an order."""
    pass

模拟用户消息并处理包裹图片

我们将模拟包含包裹图片的用户消息,并使用GPT-4 Turbo with Vision模型处理这些图片。该模型将根据图像分析确定适当的工具调用,并针对受损、潮湿或正常包裹的预定义操作。然后,我们将根据订单ID处理识别出的操作,并显示结果。

# Constants for the simulated support flow below.
ORDER_ID = "12345"  # placeholder order ID used for testing
# System-style instructions sent as the first user message; they tell the
# model which tool to pick for damaged / wet / normal packages.
INSTRUCTION_PROMPT = "You are a customer service assistant for a delivery service, equipped to analyze images of packages. If a package appears damaged in the image, automatically process a refund according to policy. If the package looks wet, initiate a replacement. If the package appears normal and not damaged, escalate to agent. For any other issues or unclear images, escalate to agent. You must always use tools!"

def delivery_exception_support_handler(test_image: str):
    """Run the delivery-exception assistant on one encoded package image.

    Sends INSTRUCTION_PROMPT plus the base64 image from `image_data` to the
    model and lets instructor parse the response into tool-call objects
    (RefundOrder / ReplaceOrder / EscalateToAgent). Every identified tool
    call is executed against the placeholder ORDER_ID.

    Args:
        test_image: key into `image_data` (filename without extension).

    Returns:
        The last tool call object, so callers can assert on its `.action`.

    Raises:
        RuntimeError: if the model produced no tool calls. (The original
            code hit an opaque UnboundLocalError on `tool` in that case.)
    """
    payload = {
        "model": MODEL,
        "response_model": Iterable[RefundOrder | ReplaceOrder | EscalateToAgent],
        "tool_choice": "auto",  # let the model pick the tool from context
        "temperature": 0.0,  # reduce answer variability
        "seed": 123,  # set a seed for reproducibility
    }
    payload["messages"] = [
        {
            "role": "user",
            "content": INSTRUCTION_PROMPT,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_data[test_image]}"
                    }
                },
            ],
        }
    ]
    function_calls = instructor.from_openai(
        OpenAI(), mode=instructor.Mode.PARALLEL_TOOLS
    ).chat.completions.create(**payload)
    tool = None
    for tool in function_calls:
        print(f"- Tool call: {tool.action} for provided img: {test_image}")
        print(f"- Parameters: {tool}")
        print(f">> Action result: {tool(ORDER_ID)}")
    if tool is None:
        # Fail loudly instead of raising UnboundLocalError below.
        raise RuntimeError(f"Model returned no tool calls for image: {test_image}")
    return tool


# Drive the assistant over each sample image and assert the expected action.
print("Processing delivery exception support for different package images...")

print("\n===================== Simulating user message 1 =====================")
# A damaged package should trigger an automatic refund.
assert delivery_exception_support_handler("damaged_package").action == "refund_order"

print("\n===================== Simulating user message 2 =====================")
# A normal-looking package is escalated to a human agent.
assert delivery_exception_support_handler("normal_package").action == "escalate_to_agent"

print("\n===================== Simulating user message 3 =====================")
# A wet package should trigger a replacement.
assert delivery_exception_support_handler("wet_package").action == "replace_order"

Processing delivery exception support for different package images...

===================== Simulating user message 1 =====================
- Tool call: refund_order for provided img: damaged_package
- Parameters: rationale='The package is visibly damaged with significant tears and crushing, indicating potential harm to the contents.' image_description='The package in the image shows extensive damage, including deep creases and tears in the cardboard. The package is also wrapped with extra tape, suggesting prior attempts to secure it after damage.' action='refund_order' message=None
>> Action result: Order 12345 has been refunded successfully.

===================== Simulating user message 2 =====================
- Tool call: escalate_to_agent for provided img: normal_package
- Parameters: rationale='The package appears normal and not damaged, requiring further assistance for any potential issues not visible in the image.' image_description='A cardboard box on a wooden floor, appearing intact and undamaged, with no visible signs of wear, tear, or wetness.' action='escalate_to_agent' message='Please review this package for any issues not visible in the image. The package appears normal and undamaged.'
>> Action result: Order 12345 has been escalated to an agent with message: `Please review this package for any issues not visible in the image. The package appears normal and undamaged.`

===================== Simulating user message 3 =====================
- Tool call: replace_order for provided img: wet_package
- Parameters: rationale='The package appears wet, which may have compromised the contents, especially since it is labeled as fragile.' image_description="The package in the image shows significant wetness on the top surface, indicating potential water damage. The box is labeled 'FRAGILE', which suggests that the contents are delicate and may be more susceptible to damage from moisture." action='replace_order' message=None
>> Action result: Order 12345 has been replaced with a new order.

2. 分析组织结构图以提取员工信息

对于第二个示例,我们将分析一个组织结构图像,提取员工信息,如员工姓名、角色、经理和经理角色等。我们将使用GPT-4 Turbo与Vision来处理组织结构图像,并提取关于组织中员工的结构化数据。事实上,函数调用让我们可以超越OCR,实际上推断和翻译图表中的层次关系。

我们将从一个PDF格式的示例组织结构图开始,我们希望分析并将PDF的第一页转换为JPEG图像进行分析。

# 将单页PDF页面转换为JPEG图像的函数
def convert_pdf_page_to_jpg(pdf_path: str, output_path: str, page_number=0):
    """Render one page of a PDF to an image file.

    Args:
        pdf_path: path of the source PDF.
        output_path: where to write the rendered image (format follows the
            file extension, e.g. .jpg).
        page_number: zero-based page index; 0 is the first page.

    Raises:
        FileNotFoundError: if *pdf_path* does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    pdf_doc = fitz.open(pdf_path)
    pixmap = pdf_doc.load_page(page_number).get_pixmap()
    pixmap.save(output_path)


def display_img_local(image_path: str):
    """Load an image from disk and show it inline in the notebook."""
    display(Image.open(image_path))


pdf_path = 'data/org-chart-sample.pdf'  # sample org chart to analyze
output_path = 'org-chart-sample.jpg'  # rendered first page is written here

# Render the first PDF page to JPEG and preview it inline.
convert_pdf_page_to_jpg(pdf_path, output_path)
display_img_local(output_path)

成功从PDF文件中提取并显示了组织结构图像。现在让我们定义一个函数,使用新的GPT4 Turbo with Vision 来分析组织结构图像。该函数将从图像中提取有关员工、他们的角色和经理的信息。我们将使用函数/工具调用来指定组织结构的输入参数,如员工姓名、角色以及经理的姓名和角色。我们将使用Pydantic模型来定义数据的结构。

# Base64-encode the rendered org chart so it can be sent to the vision model.
base64_img = encode_image(output_path)

class RoleEnum(str, Enum):
    """Closed set of roles an employee can hold within the organization."""

    # Executive roles.
    CEO = "CEO"
    CTO = "CTO"
    CFO = "CFO"
    COO = "COO"
    # Everyone else.
    EMPLOYEE = "Employee"
    MANAGER = "Manager"
    INTERN = "Intern"
    OTHER = "Other"  # fallback when a role cannot be classified

class Employee(BaseModel):
    """One employee extracted from the chart: name, role, optional manager."""

    # Field descriptions feed the extraction schema shown to the model.
    employee_name: str = Field(..., description="The name of the employee")
    role: RoleEnum = Field(..., description="The role of the employee")
    manager_name: Optional[str] = Field(None, description="The manager's name, if applicable")
    manager_role: Optional[RoleEnum] = Field(None, description="The manager's role, if applicable")


class EmployeeList(BaseModel):
    """Container for every employee found in the org chart."""

    employees: List[Employee] = Field(..., description="A list of employees")

def parse_orgchart(base64_img: str) -> EmployeeList:
    """Extract structured employee data from an org-chart image.

    Args:
        base64_img: base64-encoded JPEG of the org chart.

    Returns:
        An EmployeeList parsed and validated by instructor from the model
        response.
    """
    client = instructor.from_openai(OpenAI())
    return client.chat.completions.create(
        # Use the shared MODEL constant. The original hard-coded
        # 'gpt-4-turbo', drifting from the pinned snapshot
        # (gpt-4-turbo-2024-04-09) used by the rest of this notebook.
        model=MODEL,
        response_model=EmployeeList,
        messages=[
            {
                "role": "user",
                "content": 'Analyze the given organizational chart and very carefully extract the information.',
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_img}"
                        }
                    },
                ],
            }
        ],
    )

现在,我们将定义一个函数来解析GPT-4 Turbo with Vision的响应,并提取员工数据。我们将对提取的数据进行制表,以便于查看。请注意,提取的数据的准确性可能会根据输入图像的复杂性和清晰度而有所不同。

# Analyze the org chart and parse the model's response.
result = parse_orgchart(base64_img)

# Tabulate the extracted employees for easy viewing.
rows = []
for employee in result.employees:
    rows.append({
        'employee_name': employee.employee_name,
        'role': employee.role.value,
        'manager_name': employee.manager_name,
        # manager_role is optional; leave None for the top of the chart.
        'manager_role': employee.manager_role.value if employee.manager_role else None,
    })
df = pd.DataFrame(rows)

display(df)

employee_name role manager_name manager_role
0 Juliana Silva CEO None None
1 Kim Chun Hei CFO Juliana Silva CEO
2 Chad Gibbons CTO Juliana Silva CEO
3 Chiaki Sato COO Juliana Silva CEO
4 Cahaya Dewi Manager Kim Chun Hei CFO
5 Shawn Garcia Manager Chad Gibbons CTO
6 Aaron Loeb Manager Chiaki Sato COO
7 Drew Feig Employee Cahaya Dewi Manager
8 Richard Sanchez Employee Cahaya Dewi Manager
9 Sacha Dubois Intern Cahaya Dewi Manager
10 Olivia Wilson Employee Shawn Garcia Manager
11 Matt Zhang Intern Shawn Garcia Manager
12 Avery Davis Employee Aaron Loeb Manager
13 Harper Russo Employee Aaron Loeb Manager
14 Taylor Alonso Intern Aaron Loeb Manager

成功解析了从组织结构图中提取的数据,并在DataFrame中显示。这种方法使我们能够利用GPT-4 Turbo的视觉能力从图像(如组织结构图和示意图)中提取结构化信息,并对数据进行进一步分析处理。通过使用函数调用,我们可以扩展多模态模型的功能,执行特定任务或调用外部函数。