Offline Inference Chat
Source: vllm-project/vllm.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# You can run batch inference with llm.chat API
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
conversations = [conversation for _ in range(10)]

# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open('template_falcon_180b.jinja', "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )