OpenAI Vision API Client

Source code: vllm-project/vllm

  1"""An example showing how to use vLLM to serve VLMs.
  2
  3Launch the vLLM server with the following command:
  4
  5(single image inference with Llava)
  6vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
  7
  8(multi-image inference with Phi-3.5-vision-instruct)
  9vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
 10    --trust-remote-code --limit-mm-per-prompt image=2
 11"""
import base64

import requests
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id
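
# A small addition (not in the original example): print every model the
# server is serving, to confirm which one the requests below will use.
for served_model in models.data:
    print("Served model:", served_model.id)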

# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

## Use an image URL in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?",
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)


## Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
    """Encode an image retrieved from a remote URL to base64 format."""

    with requests.get(image_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode("utf-8")

    return result


image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output:", result)

# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What are the animals in these images?",
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url_duck},
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url_lion},
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)