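ONNX export and quantization commands for a set of small chat-capable models, targeting ARM64 (mobile) deployment.

# Prerequisites -- a minimal setup sketch. The package extras below assume a
# recent Optimum release; pin versions to match your environment.
pip install "optimum[exporters,onnxruntime]" transformers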

Qwen-0.5B Model

# Step 1: Export for text generation with past KV cache (better for chat)
echo "Exporting Qwen-0.5B..."
optimum-cli export onnx --model Qwen/Qwen1.5-0.5B --task text-generation-with-past onnx_models/qwen_onnx/
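
# Optional sanity check before quantizing -- a sketch using Optimum's
# ORTModelForCausalLM (prompt and generation settings are arbitrary); the same
# check works for any of the exports below.
python - <<'PY'
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

path = "onnx_models/qwen_onnx"
tok = AutoTokenizer.from_pretrained(path)
model = ORTModelForCausalLM.from_pretrained(path)  # picks up the with-past export
inputs = tok("Hello, my name is", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tok.decode(out[0], skip_special_tokens=True))
PY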

# Step 2: Quantize for ARM64 (mobile target). Note: the --arm64 preset applies
# dynamic INT8 quantization (no calibration data needed), not static.
echo "Quantizing Qwen-0.5B for ARM64 (dynamic INT8)..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/qwen_onnx/ --arm64 -o onnx_models/qwen_onnx_quantized/

TinyLlama-1.1B Model

# Step 1: Export for text generation with past KV cache (better for chat)
echo "Exporting TinyLlama-1.1B..."
optimum-cli export onnx --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --task text-generation-with-past onnx_models/tinyllama_onnx/

# Step 2: Quantize for ARM64 (dynamic INT8)
# This step can take a long time or fail on low-memory machines; if it does,
# fall back to the Python API sketch below.
echo "Quantizing TinyLlama-1.1B for ARM64 (dynamic INT8)..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/tinyllama_onnx/ --arm64 -o onnx_models/tinyllama_onnx_quantized/
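
# If the CLI run fails, the same dynamic INT8 quantization can be driven from
# the Optimum Python API -- a sketch; the ONNX file name inside the export
# directory may differ across Optimum versions, so check what the export produced.
python - <<'PY'
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Same preset the CLI applies for --arm64: dynamic INT8, no calibration data
qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
quantizer = ORTQuantizer.from_pretrained("onnx_models/tinyllama_onnx", file_name="model.onnx")
quantizer.quantize(save_dir="onnx_models/tinyllama_onnx_quantized", quantization_config=qconfig)
PY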

Phi-1.5 Model

# Step 1: Export for text generation with past KV cache (better for chat)
echo "Exporting Phi-1.5..."
optimum-cli export onnx --model microsoft/phi-1_5 --task text-generation-with-past onnx_models/phi_onnx/

# Step 2: Quantize for ARM64 (dynamic INT8) -- failed on my machine (runs out
# of memory); see the fallback below.
echo "Quantizing Phi-1.5 for ARM64 (dynamic INT8)..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/phi_onnx/ --arm64 -o onnx_models/phi_onnx_quantized/
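
# If quantization dies on memory, one workaround is onnxruntime's own
# quantize_dynamic with external data enabled, which keeps large tensors
# outside the 2 GB protobuf limit that often accompanies these failures
# (it does not guarantee lower peak RAM). A sketch -- the model file name
# is an assumption, check what the export actually produced.
mkdir -p onnx_models/phi_onnx_quantized
python - <<'PY'
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "onnx_models/phi_onnx/model.onnx",
    "onnx_models/phi_onnx_quantized/model_quantized.onnx",
    weight_type=QuantType.QInt8,
    use_external_data_format=True,  # store weights outside the protobuf
)
PY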

Falcon-1B Model

# Export
echo "Exporting Falcon-1B..."
optimum-cli export onnx --model tiiuae/falcon-rw-1b --task text-generation-with-past onnx_models/falcon_onnx/

# Quantize for ARM64 (dynamic INT8) -- failed on my machine (runs out of
# memory); the onnxruntime fallback shown under Phi-1.5 applies here as well.
echo "Quantizing Falcon-1B for ARM64..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/falcon_onnx/ --arm64 -o onnx_models/falcon_onnx_quantized/

GPT-2 Medium Model

# Export GPT2-Medium
echo "Exporting GPT2-Medium..."
optimum-cli export onnx --model gpt2-medium --task text-generation-with-past onnx_models/gpt2_onnx/

# Quantize for ARM64
echo "Quantizing GPT2-Medium for ARM64..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/gpt2_onnx/ --arm64 -o onnx_models/gpt2_onnx_quantized/

OPT-350M Model

# Export OPT-350M
echo "Exporting OPT-350M..."
optimum-cli export onnx --model facebook/opt-350m --task text-generation-with-past onnx_models/opt_onnx/

# Quantize for ARM64
echo "Quantizing OPT-350M for ARM64..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/opt_onnx/ --arm64 -o onnx_models/opt_onnx_quantized/

Bloom-560M Model

# Export Bloom-560M
echo "Exporting Bloom-560M..."
optimum-cli export onnx --model bigscience/bloom-560m --task text-generation-with-past onnx_models/bloom_onnx/

# Quantize for ARM64
echo "Quantizing Bloom-560M for ARM64..."
optimum-cli onnxruntime quantize --onnx_model onnx_models/bloom_onnx/ --arm64 -o onnx_models/bloom_onnx_quantized/
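
# Compare fp32 vs. quantized footprints once everything has run
du -sh onnx_models/*/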