OpenAI Vision API Client
Source code: vllm-project/vllm.
1"""An example showing how to use vLLM to serve VLMs.
2
3Launch the vLLM server with the following command:
4
5(single image inference with Llava)
6vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
7
8(multi-image inference with Phi-3.5-vision-instruct)
9vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
10 --trust-remote-code --limit-mm-per-prompt image=2
11"""
import base64

import requests
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

## Use image url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)

## Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
    """Encode an image retrieved from a remote url to base64 format."""

    with requests.get(image_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output:", result)

# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What are the animals in these images?"},
            {"type": "image_url", "image_url": {"url": image_url_duck}},
            {"type": "image_url", "image_url": {"url": image_url_lion}},
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
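
## (Not part of the original example.) The response object also carries token
## accounting, which is a quick way to check how many prompt tokens the two
## images consumed after multi-modal processing.
usage = chat_completion_from_url.usage
if usage is not None:
    print(f"Prompt tokens: {usage.prompt_tokens}, "
          f"completion tokens: {usage.completion_tokens}")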