sitatech committed on
Commit
86e4f57
·
1 Parent(s): 5145e05

[llm] Update vLLM configs and Qwen chat template to enable custom tool parsing

Browse files
Files changed (2) hide show
  1. llm/app.py +9 -15
  2. llm/configs.py +2 -0
llm/app.py CHANGED
@@ -10,6 +10,7 @@ from configs import (
10
  API_KEY,
11
  VLLM_PORT,
12
  flashinfer_cache_vol,
 
13
  )
14
 
15
 
@@ -40,15 +41,6 @@ def serve_llm():
40
  import os
41
  import torch
42
 
43
- # chat_template_path = "/root/chat_template.jinja"
44
- # if not os.path.exists(chat_template_path):
45
- # print("Downloading chat template...")
46
- # url = "https://raw.githubusercontent.com/edwardzjl/chat-templates/refs/heads/main/qwen2_5/chat_template.jinja"
47
- # response = requests.get(url)
48
- # response.raise_for_status()
49
- # with open(chat_template_path, "w") as f:
50
- # f.write(response.text)
51
-
52
  min_pixels = 128 * 28 * 28 # min 128 tokens
53
  max_pixels = 340 * 28 * 28 # max 340 tokens (~512x512 image)
54
 
@@ -67,14 +59,10 @@ def serve_llm():
67
  "--enable-auto-tool-choice",
68
  "--limit-mm-per-prompt",
69
  "image=100",
 
 
70
  "--tensor-parallel-size",
71
  str(N_GPU),
72
- "--host",
73
- "0.0.0.0",
74
- "--port",
75
- str(VLLM_PORT),
76
- "--api-key",
77
- os.environ["API_KEY"],
78
  "--enforce-eager",
79
  # Minimize token usage
80
  "--mm-processor-kwargs",
@@ -84,6 +72,12 @@ def serve_llm():
84
  # '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":32768}',
85
  "--max-model-len",
86
  "32768",
 
 
 
 
 
 
87
  ]
88
 
89
  subprocess.Popen(cmd)
 
10
  API_KEY,
11
  VLLM_PORT,
12
  flashinfer_cache_vol,
13
+ CHAT_TEMPLATE,
14
  )
15
 
16
 
 
41
  import os
42
  import torch
43
 
 
 
 
 
 
 
 
 
 
44
  min_pixels = 128 * 28 * 28 # min 128 tokens
45
  max_pixels = 340 * 28 * 28 # max 340 tokens (~512x512 image)
46
 
 
59
  "--enable-auto-tool-choice",
60
  "--limit-mm-per-prompt",
61
  "image=100",
62
+ "--chat-template",
63
+ CHAT_TEMPLATE,
64
  "--tensor-parallel-size",
65
  str(N_GPU),
 
 
 
 
 
 
66
  "--enforce-eager",
67
  # Minimize token usage
68
  "--mm-processor-kwargs",
 
72
  # '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":32768}',
73
  "--max-model-len",
74
  "32768",
75
+ "--host",
76
+ "0.0.0.0",
77
+ "--port",
78
+ str(VLLM_PORT),
79
+ "--api-key",
80
+ os.environ["API_KEY"],
81
  ]
82
 
83
  subprocess.Popen(cmd)
llm/configs.py CHANGED
@@ -39,3 +39,5 @@ API_KEY = modal.Secret.from_name(
39
  )
40
  MINUTE = 60
41
  VLLM_PORT = 8000
 
 
 
39
  )
40
  MINUTE = 60
41
  VLLM_PORT = 8000
42
+
43
+ CHAT_TEMPLATE = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\nTools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments based on the provided signatures within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"