Intel
/

Qwen3-Coder-480B-A35B-Instruct-int4-mixed-AutoRound

@@ -6,18 +6,23 @@ base_model:
 ## Model Details
-This model is a mixed int4 model with group_size 64 and symmetric quantization of [Qwen/Qwen3-Coder-480B-A35B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round) via **RTN** (no algorithm tuning).  Non expert layers are fall back to 8 bits and group_size 128
 Please follow the license of the original model.
 ## How To Use
 **INT4 Inference on CPU/Intel GPU/CUDA**
 ~~~python
 from transformers import AutoModelForCausalLM, AutoTokenizer
-model_name = "Intel/Qwen3-Coder-480B-A35B-Instruct-int4-mixed-AutoRound"
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -63,227 +68,6 @@ for i, prompt in enumerate(prompts):
     print(f"Generated: {decoded_outputs[i]}")
     print("-" * 50)
-"""
-Prompt: Write a quick sort algorithm.
-Generated: Here's a QuickSort implementation in Python with both in-place and simple versions:
-## In-Place QuickSort (More Efficient)
-```python
-def quicksort(arr, low=0, high=None):
-    """
-    Sorts an array using the QuickSort algorithm (in-place).
-    Args:
-        arr: List to be sorted
-        low: Starting index (default: 0)
-        high: Ending index (default: len(arr) - 1)
-    """
-    if high is None:
-        high = len(arr) - 1
-    if low < high:
-        # Partition the array and get the pivot index
-        pivot_index = partition(arr, low, high)
-        # Recursively sort elements before and after partition
-        quicksort(arr, low, pivot_index - 1)
-        quicksort(arr, pivot_index + 1, high)
-def partition(arr, low, high):
-    """
-    Partitions the array around a pivot element.
-    Elements smaller than pivot go to the left, larger to the right.
-    Returns:
-        The final position of the pivot
-    """
-    # Choose the rightmost element as pivot
-    pivot = arr[high]
-    # Index of smaller element (indicates right position of pivot)
-    i = low - 1
-    for j in range(low, high):
-        # If current element is smaller than or equal to pivot
-        if arr[j] <= pivot:
-            i += 1
-            arr[i], arr[j] = arr[j], arr[i]  # Swap elements
-    # Place pivot in correct position
-    arr[i + 1], arr[high] = arr[high], arr[i + 1]
-    return i + 1
-# Example usage
-if __name__ == "__main__":
-    # Test the algorithm
-    test_array = [64, 34, 25, 12, 22, 11, 90]
-    print("Original array:", test_array)
-    quicksort(test_array)
-    print("Sorted array:", test_array)
-```
-## Simple Version (Creates New Arrays)
-```python
-def quicksort_simple(arr):
-    """
-    Simple QuickSort implementation that creates new arrays.
-    Less memory efficient but easier to understand.
-    """
-    if len(arr) <= 1:
-        return arr
-    pivot = arr[len(arr) // 2]  # Choose middle element as pivot
-    left =
---------------------------------------------------
-Prompt: Write a flappy bird.
-Generated: # Flappy Bird in PyGame
-Here's a complete implementation of Flappy Bird using PyGame:
-```python
-import pygame
-import sys
-import random
-import math
-# Initialize pygame
-pygame.init()
-# Game constants
-WIDTH, HEIGHT = 800, 600
-FPS = 60
-GRAVITY = 0.5
-FLAP_STRENGTH = -8
-PIPE_SPEED = 3
-PIPE_GAP = 200
-PIPE_FREQUENCY = 1800  # milliseconds
-GROUND_HEIGHT = 100
-BIRD_RADIUS = 20
-# Colors
-SKY_BLUE = (113, 197, 207)
-GREEN = (111, 196, 69)
-DARK_GREEN = (76, 145, 65)
-BROWN = (160, 120, 40)
-YELLOW = (255, 217, 61)
-RED = (231, 76, 60)
-WHITE = (255, 255, 255)
-BLACK = (0, 0, 0)
-# Set up the display
-screen = pygame.display.set_mode((WIDTH, HEIGHT))
-pygame.display.set_caption("Flappy Bird")
-clock = pygame.time.Clock()
-# Font setup
-font = pygame.font.SysFont(None, 48)
-small_font = pygame.font.SysFont(None, 36)
-class Bird:
-    def __init__(self):
-        self.x = WIDTH // 3
-        self.y = HEIGHT // 2
-        self.velocity = 0
-        self.alive = True
-        self.rotation = 0
-    def flap(self):
-        if self.alive:
-            self.velocity = FLAP_STRENGTH
-    def update(self):
-        # Apply gravity
-        self.velocity += GRAVITY
-        self.y += self.velocity
-        # Rotate bird based on velocity
-        self.rotation = max(-30, min(self.velocity * 2, 90))
-        # Check boundaries
-        if self.y < 0:
-            self.y = 0
-            self.velocity = 0
-        if self.y > HEIGHT - GROUND_HEIGHT - BIRD_RADIUS:
-            self.y = HEIGHT - GROUND_HEIGHT - BIRD_RADIUS
-            self.velocity = 0
-            self.alive = False
---------------------------------------------------
-Prompt: Write a llm quantization algorithm.
-Generated: Here's a comprehensive implementation of LLM quantization algorithms, including post-training quantization and QLoRA-style quantization:
-```python
-import torch
-import torch.nn as nn
-from typing import Dict, Tuple, Optional
-import math
-class Quantizer:
-    """Base class for quantization operations"""
-    @staticmethod
-    def linear_quantize(tensor: torch.Tensor, bits: int, symmetric: bool = True) -> Tuple[torch.Tensor, float, float]:
-        """
-        Linearly quantize a tensor to specified bit-width
-        Args:
-            tensor: Input tensor to quantize
-            bits: Number of bits for quantization (e.g., 4, 8)
-            symmetric: Whether to use symmetric or asymmetric quantization
-        Returns:
-            Tuple of (quantized_tensor, scale, zero_point)
-        """
-        # Calculate range
-        if symmetric:
-            max_val = torch.max(torch.abs(tensor))
-            min_val = -max_val
-        else:
-            max_val = tensor.max()
-            min_val = tensor.min()
-        # Calculate scale and zero point
-        qmin = 0
-        qmax = 2 ** bits - 1
-        scale = (max_val - min_val) / (qmax - qmin)
-        if symmetric:
-            zero_point = 0.0
-        else:
-            zero_point = qmin - min_val / scale
-        # Quantize
-        quantized = torch.round(tensor / scale + zero_point).clamp(qmin, qmax)
-        return quantized.to(torch.uint8), scale.item(), zero_point
-    @staticmethod
-    def linear_dequantize(quantized: torch.Tensor, scale: float, zero_point: float) -> torch.Tensor:
-        """Dequantize tensor back to floating point"""
-        return (quantized.float() - zero_point) * scale
-class PostTrainingQuantizer:
-    """Post-training quantization for transformer models"""
-    def __init__(self, bits: int = 8):
-        self.bits = bits
-        self.quant_params = {}
-    def quantize_model(self, model: nn.Module) -> nn.Module:
-        """Quantize all linear layers in the model"""
-        for name, module in model.named_modules():
-            if isinstance(module, nn.Linear):
-                # Store original weight
-                weight = module.weight.data
-                # Quantize weight
-                q_weight, scale, zero_point = Quantizer.linear_quantize(
-                    weight
-"""
 ~~~
@@ -293,10 +77,10 @@ Here is the sample command to reproduce the model
 ```python
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
-model_name = "Qwen3/Qwen3-Coder-480B-A35B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_name,
                                              device_map="cpu", torch_dtype="auto")
@@ -305,12 +89,30 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 layer_config = {}
 for n, m in model.named_modules():
-    if isinstance(m, torch.nn.Linear) and (not "expert" in n or "shared_experts" in n) and n != "lm_head":
         layer_config[n] = {"bits": 8, "group_size": 128}
 autoround = AutoRound(model, tokenizer, iters=0, group_size=64, layer_config=layer_config)
-autoround.quantize_and_save("./Qwen3-Coder-480B-A35B-Instruct-int4-mixed")
 ```

 ## Model Details
+This model is a mixed int4 model with group_size 64 and symmetric quantization of [Qwen/Qwen3-Coder-480B-A35B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round) via **RTN** (no algorithm tuning).
+Non-expert layers fall back to 8 bits and group_size 128. mlp.gate layers fall back to 16 bits to ensure the model runs successfully on vLLM.
 Please follow the license of the original model.
 ## How To Use
+**vLLM usage**
+~~~bash
+vllm serve Intel/Qwen3-Coder-480B-A35B-Instruct-int4-mixed-ar --tensor-parallel-size 4 --max-model-len 65536
+~~~
 **INT4 Inference on CPU/Intel GPU/CUDA**
 ~~~python
 from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "Intel/Qwen3-Coder-480B-A35B-Instruct-int4-mixed-ar"
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
     print(f"Generated: {decoded_outputs[i]}")
     print("-" * 50)
 ~~~
 ```python
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from auto_round import AutoRound
+model_name = "Qwen/Qwen3-Coder-480B-A35B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_name,
                                              device_map="cpu", torch_dtype="auto")
 layer_config = {}
 for n, m in model.named_modules():
+    if "mlp.gate" in n: ## vllm only support 16 bit for this layer
+        layer_config[n] = {"bits": 16}
+    elif isinstance(m, torch.nn.Linear) and (not "expert" in n or "shared_experts" in n) and n != "lm_head":
         layer_config[n] = {"bits": 8, "group_size": 128}
 autoround = AutoRound(model, tokenizer, iters=0, group_size=64, layer_config=layer_config)
+output_dir = "/dataset/Qwen3-Coder-480B-A35B-Instruct-int4-mixed"
+autoround.quantize_and_save(output_dir)
+## Workaround for the qkv fusing issue; this will be fixed in vLLM later
+import os
+import json
+config_path = os.path.join(output_dir, "config.json")
+with open(config_path, "r") as file:
+    config = json.load(file)
+extra_config = config["quantization_config"]["extra_config"]
+num_hidden_layers = config["num_hidden_layers"]
+for i in range(num_hidden_layers):
+    qkv_name = f"model.layers.{str(i)}.self_attn.qkv_proj"
+    extra_config[qkv_name] = {"bits": 8, "group_size": 128}
+with open(config_path, "w") as file:
+    json.dump(config, file, indent=2)
 ```