Offline Inference Chat

Source code: vllm-project/vllm

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

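# SamplingParams also accepts an `n` argument, so a single chat call can
# return several candidate replies per prompt; each RequestOutput then
# carries n entries in output.outputs. A minimal sketch, commented out so
# the script's printed output stays unchanged:

# multi_params = SamplingParams(temperature=0.8, n=3)
# outputs = llm.chat(conversation,
#                    sampling_params=multi_params,
#                    use_tqdm=False)
# for candidate in outputs[0].outputs:
#     print(candidate.text)
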
# You can run batch inference with the llm.chat API.
# Here we simply reuse the conversation defined above, replicating it
# ten times to form a batch.
conversations = [conversation for _ in range(10)]

# We turn on the tqdm progress bar to verify it's indeed running batch
# inference.
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

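# The outputs come back in the same order as the input conversations,
# so each reply can be paired with the prompt that produced it (sketch):

# for conv, output in zip(conversations, outputs):
#     print(conv[-1]["content"], "->", output.outputs[0].text[:60])
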
# A chat template can optionally be supplied.
# If it is not, the model will use its default chat template.

# with open("template_falcon_180b.jinja", "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )
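
# Instead of loading a template from disk, an inline Jinja string works
# too, since chat_template is just the template text. A hypothetical
# minimal template for illustration (not the Falcon template above):

# chat_template = (
#     "{% for message in messages %}"
#     "{{ message['role'] }}: {{ message['content'] }}\n"
#     "{% endfor %}"
#     "assistant:"
# )
# outputs = llm.chat(conversations,
#                    sampling_params=sampling_params,
#                    use_tqdm=False,
#                    chat_template=chat_template)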