[llm] Update vLLM configs and Qwen chat template to enable custom tool parsing
- llm/app.py +9 -15
- llm/configs.py +2 -0
llm/app.py
CHANGED
@@ -10,6 +10,7 @@ from configs import (
     API_KEY,
     VLLM_PORT,
     flashinfer_cache_vol,
+    CHAT_TEMPLATE,
 )
 
 
@@ -40,15 +41,6 @@ def serve_llm():
     import os
     import torch
 
-    # chat_template_path = "/root/chat_template.jinja"
-    # if not os.path.exists(chat_template_path):
-    #     print("Downloading chat template...")
-    #     url = "https://raw.githubusercontent.com/edwardzjl/chat-templates/refs/heads/main/qwen2_5/chat_template.jinja"
-    #     response = requests.get(url)
-    #     response.raise_for_status()
-    #     with open(chat_template_path, "w") as f:
-    #         f.write(response.text)
-
     min_pixels = 128 * 28 * 28  # min 128 tokens
     max_pixels = 340 * 28 * 28  # max 340 tokens (~512x512 image)
 
@@ -67,14 +59,10 @@ def serve_llm():
         "--enable-auto-tool-choice",
         "--limit-mm-per-prompt",
         "image=100",
+        "--chat-template",
+        CHAT_TEMPLATE,
         "--tensor-parallel-size",
         str(N_GPU),
-        "--host",
-        "0.0.0.0",
-        "--port",
-        str(VLLM_PORT),
-        "--api-key",
-        os.environ["API_KEY"],
         "--enforce-eager",
         # Minimize token usage
         "--mm-processor-kwargs",
@@ -84,6 +72,12 @@ def serve_llm():
         # '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":32768}',
         "--max-model-len",
         "32768",
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(VLLM_PORT),
+        "--api-key",
+        os.environ["API_KEY"],
     ]
 
     subprocess.Popen(cmd)
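With these flags in place, the relaunched OpenAI-compatible server can be exercised end to end with a tool definition. A minimal client sketch, not part of this commit: the base URL assumes the server is reachable locally on VLLM_PORT (8000), and the model id and get_weather tool are made-up placeholders.

import os

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # assumption: local access; matches --host/--port above
    api_key=os.environ["API_KEY"],        # same env var the server passes to --api-key
)

# Hypothetical tool definition, for illustration only.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # placeholder; use the model id the server actually serves
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)
# With --enable-auto-tool-choice and the custom template, the parsed call
# should land here rather than in message.content.
print(response.choices[0].message.tool_calls)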
llm/configs.py
CHANGED
@@ -39,3 +39,5 @@ API_KEY = modal.Secret.from_name(
 )
 MINUTE = 60
 VLLM_PORT = 8000
+
+CHAT_TEMPLATE = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments based on the provided signatures within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
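The template can be sanity-checked locally before deploying. A minimal sketch, not part of this commit, assuming jinja2 is installed; the tool schema and messages below are invented for illustration.

from jinja2 import Environment

from configs import CHAT_TEMPLATE

template = Environment().from_string(CHAT_TEMPLATE)

# Hypothetical tool and conversation, covering the tool-call and
# tool-response branches of the template.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }
]
messages = [
    {"role": "user", "content": "What's the weather in Paris?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {"function": {"name": "get_weather", "arguments": {"city": "Paris"}}}
        ],
    },
    {"role": "tool", "content": '{"temp_c": 18}'},
]

prompt = template.render(messages=messages, tools=tools, add_generation_prompt=True)
print(prompt)

The assistant turn in the rendered prompt should wrap the call in <tool_call>...</tool_call>, the format Hermes-style tool parsers look for (the parser flag itself is not shown in this diff).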