kkolomeitsev committed
Commit d872c55 · verified · 1 Parent(s): ca1f8dd

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ grouped_qwen3_checkpoint/epoch_2_best/tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Konstantin Kolomeitsev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md CHANGED
@@ -1,3 +1,192 @@
- ---
- license: mit
- ---

# Context Merging: from Tokens to Entities and Concepts

This repo contains a minimal research pipeline that compresses input context for Qwen3 by grouping dependent subtokens early, then trains a small adapter to consume the grouped embeddings.

- `prepare_dataset.py` builds a local dataset of grouped embeddings from a base Qwen3 with a custom layer 0 that performs token grouping.
- `train_custom_qwen3.py` fine-tunes a customized Qwen3 that adds a small MLP adapter for grouped inputs, while freezing all weights except layer 0.
- `inference_qwen3_merged.py` runs end-to-end inference by first grouping with the base model, then generating with the trained model that understands grouped inputs. It also reports performance metrics and estimated attention-memory savings.

---

## How it works

1. **Layer-0 grouping at prefill**
   A custom decoder layer 0 computes attention over the full token sequence, clusters adjacent tokens using lightweight heuristics plus attention relations, then averages the token vectors of each group (see the sketch after this list). The grouped result is added back to a residual projection and saved as `grouped_hidden_states`.

2. **Dataset building**
   The dataset builder swaps in the custom layer 0, feeds formatted prompts, extracts the stored `grouped_hidden_states`, and serializes them together with the target responses.

3. **Model training**
   The training model wraps Qwen3 with a **GroupedInputMLPAdapter** that processes the grouped embeddings during prefill. Only layer 0 and the adapter are trainable; embeddings, upper layers, the final norm, and the LM head are frozen. Prefill uses `grouped_inputs` as `inputs_embeds`, then generation proceeds with past key values.

4. **Inference**
   The inference runner loads two models: a grouping model with the custom layer 0, and your trained model. It reports token compression, timing, and memory usage. Savings are also estimated with a simple attention-cost proxy that scales with the square of the sequence length.
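
A minimal sketch of the grouping-and-averaging step referenced in item 1 (illustrative only; the repo's `CustomQwen3DecoderLayer` additionally folds in a summed residual and the layer norms):

```python
import torch

def merge_groups(hidden_states: torch.Tensor, groups: list) -> torch.Tensor:
    """Average per-group token vectors: [batch, seq, hidden] -> [batch, num_groups, hidden]."""
    merged = [hidden_states[:, idxs, :].mean(dim=1) for idxs in groups]
    return torch.stack(merged, dim=1)

x = torch.randn(1, 6, 1024)
print(merge_groups(x, [[0], [1, 2], [3, 4, 5]]).shape)  # torch.Size([1, 3, 1024])
```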
24
+
25
+ ---
26
+
27
+ ## Requirements
28
+
29
+ - Python packages: `torch`, `transformers`, `datasets`, `tqdm`, `psutil`. These are imported directly in the scripts.
30
+ - GPU is optional. Scripts detect CUDA and set dtype accordingly.
31
+
32
+ Install:
33
+
34
+ ```bash
35
+ pip install torch transformers datasets tqdm psutil
36
+ ```
37
+
38
+ ---
39
+
40
+ ## Repository layout
41
+
42
+ - `prepare_dataset.py` - dataset builder using custom layer 0 grouping.
43
+ - `train_custom_qwen3.py` - trainer for grouped-input Qwen3 with an MLP adapter, freezing all but layer 0.
44
+ - `inference_qwen3_merged.py` - two-stage inference runner with metrics.
45
+
46
+ ---
47
+
48
+ ## 1 Build the local dataset
49
+
50
+ Run:
51
+
52
+ ```bash
53
+ python prepare_dataset.py
54
+ ```
55
+
56
+ Key defaults inside `DatasetProcessor`:
57
+
58
+ - `model_name="Qwen/Qwen3-0.6B"`
59
+ - `dataset_name="Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1"`
60
+ - `output_dir="./processed_dataset"`
61
+ - `batch_size=1`, `max_samples=None`, `save_frequency=1000`
62
+ Edit these in the constructor if you need to change them.
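
A hedged example of overriding those defaults from Python (the constructor arguments are the documented ones above; the `process_dataset()` call is a hypothetical entry point, check `prepare_dataset.py` for the actual method name):

```python
from prepare_dataset import DatasetProcessor

processor = DatasetProcessor(
    model_name="Qwen/Qwen3-0.6B",
    dataset_name="Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1",
    output_dir="./processed_dataset",
    max_samples=100,          # small cap for a quick smoke test
)
processor.process_dataset()   # hypothetical method name - see prepare_dataset.py
```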

The builder formats inputs using a simple system prompt template.
It tokenizes each example, runs layer 0 once, captures `grouped_hidden_states`, and buffers the results.

**Outputs** under `output_dir`:

- `processed_dataset.pkl` - a list of samples with `inputs_embeds` (grouped), `response`, and metadata.
- Additional metadata and sample previews are written alongside for quick inspection.

---

## 2. Train the grouped-input model

Run:

```bash
python train_custom_qwen3.py --mode train
```

Training config defaults (edit in the script if needed):

- `model_name="Qwen/Qwen3-0.6B"`
- `dataset_path="./processed_qwen3_dataset/processed_dataset.pkl"`
- `output_dir="./grouped_qwen3_checkpoint"`
- `batch_size=4`, `learning_rate=5e-4`, `num_epochs=3`, `warmup_steps=100`
- Logging, eval, and checkpoint cadence are configurable.

What is trained:

- A **GroupedInputMLPAdapter** that takes the grouped embeddings and returns adapted embeddings, normalized with RMSNorm (condensed sketch below).
- Only layer 0 and this adapter are trainable; everything else is frozen.
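
A condensed sketch of the adapter, mirroring `GroupedInputMLPAdapter` as defined in `inference_qwen3_merged.py`:

```python
import torch.nn as nn
from transformers.models.qwen3.modeling_qwen3 import Qwen3RMSNorm

class GroupedInputMLPAdapter(nn.Module):
    """Residual MLP over grouped embeddings, normalized with RMSNorm."""
    def __init__(self, config):
        super().__init__()
        h = config.hidden_size
        self.grouped_processor = nn.Sequential(
            nn.Linear(h, h * 2), nn.SiLU(), nn.Dropout(0.1),
            nn.Linear(h * 2, h), nn.Dropout(0.1),
        )
        self.layer_norm = Qwen3RMSNorm(h, eps=getattr(config, "rms_norm_eps", 1e-6))

    def forward(self, grouped_embeds):
        # Residual connection keeps the adapted embeddings close to the base space.
        return self.layer_norm(grouped_embeds + self.grouped_processor(grouped_embeds))
```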

How targets are computed:

- Prefill: pass `grouped_inputs` via `inputs_embeds` with `is_prefill=True`.
- Continuation: feed the target response tokens while reusing `past_key_values` (see the schematic after this list).
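
A schematic of that two-step forward (the real loop lives in `train_custom_qwen3.py`; this assumes the `grouped_inputs` / `is_prefill` forward signature that `CustomQwen3ForCausalLM` exposes in `inference_qwen3_merged.py`):

```python
import torch
import torch.nn.functional as F

def training_step(model, grouped_inputs, response_ids):
    # 1) Prefill on the grouped embeddings, keeping the KV cache.
    prefill = model(grouped_inputs=grouped_inputs, is_prefill=True,
                    use_cache=True, return_dict=True)
    # 2) Feed the target response tokens on top of that cache (teacher forcing).
    out = model(input_ids=response_ids[:, :-1],
                past_key_values=prefill.past_key_values,
                use_cache=True, return_dict=True)
    # Cross-entropy of each predicted position against the next response token.
    logits = torch.cat([prefill.logits[:, -1:, :], out.logits], dim=1)
    return F.cross_entropy(logits.reshape(-1, logits.size(-1)), response_ids.reshape(-1))
```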

Checkpoints contain the model weights, config, and tokenizer in each epoch folder.

---

## 3. Run inference

### Option A - standalone runner

Quick start:

```bash
python inference_qwen3_merged.py \
  --checkpoint ./grouped_qwen3_checkpoint/epoch_2_best \
  --grouping_model Qwen/Qwen3-0.6B \
  --instruction "Explain attention like I am in 9th grade" \
  --max_length 256 \
  --temperature 0.7 \
  --device cuda
```

CLI options: `--checkpoint`, `--grouping_model`, `--instruction`, `--max_length`, `--temperature`, `--no_sample` for greedy decoding, and `--device` for cuda or cpu.

What it does:

- Loads a grouping model with the custom layer 0 and a trained inference model.
- Phase 1 groups the tokens and reports compression. Phase 2 generates with the trained model.
- Reports compression ratio, memory reduction, total time, and tokens per second.

### Option B - use the training script utilities

The trainer exposes helper functions for loading a trained model and running generation with grouped inputs. See `load_trained_model` and `generate_with_grouped_input` in the training script if you prefer a programmatic flow; an equivalent class-based route through the standalone runner is shown below.
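
The class-based route, grounded in `Qwen3GroupedInference` from `inference_qwen3_merged.py`:

```python
from inference_qwen3_merged import Qwen3GroupedInference

runner = Qwen3GroupedInference(
    checkpoint_path="./grouped_qwen3_checkpoint/epoch_2_best",
    grouping_model_name="Qwen/Qwen3-0.6B",
)
result = runner.inference("Explain attention like I am in 9th grade",
                          max_length=256, temperature=0.7)
print(result["response"])
print(result["metrics"]["grouping"]["compression_ratio"])
```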

---

## Parameters - quick reference

### Dataset builder

- `model_name` - base HF model used for grouping, default `Qwen/Qwen3-0.6B`.
- `dataset_name` - source HF dataset, default `Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1`.
- `output_dir` - where pickles and metadata go.
- `max_samples` - optional cap for quick tests.

### Training

- `dataset_path` - path to `processed_dataset.pkl`.
- `output_dir` - where checkpoints are written.
- `batch_size`, `learning_rate`, `num_epochs`, `warmup_steps` - training hyperparameters.
- Only layer 0 and the adapter are trainable. Verify with the `requires_grad` settings in `_freeze_layers`.

### Inference

- `--checkpoint` - path to the trained checkpoint folder.
- `--grouping_model` - HF model name used for grouping.
- `--instruction` - user prompt, in any language.
- `--max_length`, `--temperature`, `--no_sample`, `--device`.

---

## Notes

- The custom layer 0 is installed by copying the weights from the original layer 0, then replacing the module so it can compute groups and cache the grouped states.
- Grouping relies on simple rules over tokens, such as space and newline boundaries, plus attention relations above a threshold. You can tune the threshold in `CustomQwen3Attention`, as shown below.
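
Adjusting that threshold on the grouping model (the default value 0.1 and the attribute path come from `inference_qwen3_merged.py`):

```python
from inference_qwen3_merged import create_grouping_model

# Layer 0 keeps attention relations whose weight exceeds self.threshold (default 0.1).
# Tokens with no surviving relation start a new group, so a lower threshold tends to
# produce fewer boundaries and larger groups.
model, tokenizer = create_grouping_model("Qwen/Qwen3-0.6B")
model.model.layers[0].self_attn.threshold = 0.05
```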

---

## Troubleshooting

- **CUDA memory spikes**: reduce the batch size during training or use fewer samples. Generation is incremental and reuses past key values.
- **No grouped states found**: ensure the custom layer 0 is actually installed and that `is_initialized` is reset before each prefill (snippet below).
- **Checkpoint not found**: the inference loader expects `pytorch_model.bin` or `model.safetensors` in the checkpoint directory.
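
Resetting the flag mirrors what `get_grouped_embeddings` does in `inference_qwen3_merged.py` before every prefill:

```python
# Reset the grouping layer before reusing the grouping model on a new prompt.
layer0 = grouping_model.model.layers[0]   # grouping_model from create_grouping_model(...)
if hasattr(layer0, "is_initialized"):
    layer0.is_initialized = False
    layer0.grouped_hidden_states = None   # drop any stale cached groups
```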

---

## Why this can save memory

If the sequence shrinks from `N` tokens to `G` groups, attention memory scales roughly with `G^2` instead of `N^2`, so the estimated saving is `1 - G^2 / N^2`. The script prints an estimate based on that relation.
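
For instance, using the same quadratic proxy the script prints:

```python
# Quadratic attention-cost proxy, as in inference_qwen3_merged.py.
N, G = 120, 70                                 # original tokens -> grouped tokens
savings = (1 - (G ** 2) / (N ** 2)) * 100
print(f"~{savings:.1f}% estimated attention-memory savings")  # ~66.0%
```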

---

## Citation

```bibtex
@misc{Kolomeitsev2025ContextMerging,
  title  = {Context Merging: from Tokens to Entities and Concepts},
  author = {Konstantin Kolomeitsev},
  year   = {2025}
}
```

## Contact

If you have any questions, please open an issue or contact me at [[email protected]]([email protected]).
grouped_qwen3_checkpoint/epoch_2_best/added_tokens.json ADDED
@@ -0,0 +1,28 @@
{
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
grouped_qwen3_checkpoint/epoch_2_best/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
grouped_qwen3_checkpoint/epoch_2_best/config.json ADDED
@@ -0,0 +1,60 @@
{
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
grouped_qwen3_checkpoint/epoch_2_best/epoch_metadata.json ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 2,
  "global_step": 27000,
  "model_name": "Qwen/Qwen3-0.6B",
  "learning_rate": 0.0005,
  "batch_size": 1,
  "is_best": true,
  "model_class": "CustomQwen3ForCausalLM"
}
grouped_qwen3_checkpoint/epoch_2_best/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
grouped_qwen3_checkpoint/epoch_2_best/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:db3c76f1ed2b27fe53e699354e4b628d09c75da48a5f1371058542645a525b5a
size 2401122294
grouped_qwen3_checkpoint/epoch_2_best/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
grouped_qwen3_checkpoint/epoch_2_best/tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:352a863cd2761388ccc58f1432467ba6a1037bf12df9069889b142fa246471f6
size 11422752
grouped_qwen3_checkpoint/epoch_2_best/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
grouped_qwen3_checkpoint/epoch_2_best/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
inference_qwen3_merged.py ADDED
@@ -0,0 +1,793 @@
1
+ import os
2
+ import logging
3
+ import time
4
+ import psutil
5
+ from typing import Optional, List, Dict, Any, Tuple
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
13
+ from transformers.models.qwen3.modeling_qwen3 import (
14
+ Qwen3ForCausalLM,
15
+ Qwen3RMSNorm,
16
+ Qwen3DecoderLayer,
17
+ Qwen3Attention,
18
+ Qwen3RotaryEmbedding,
19
+ )
20
+
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format="%(asctime)s - %(levelname)s - %(message)s"
24
+ )
25
+ logger = logging.getLogger("qwen3_grouped_inference")
26
+
27
+ class PerformanceMonitor:
28
+ def __init__(self):
29
+ self.reset()
30
+
31
+ def reset(self):
32
+ """Reset all metrics."""
33
+ self.start_time = None
34
+ self.end_time = None
35
+ self.start_memory = None
36
+ self.peak_memory = None
37
+ self.start_gpu_memory = None
38
+ self.peak_gpu_memory = None
39
+
40
+ def start_monitoring(self):
41
+ self.reset()
42
+ self.start_time = time.time()
43
+
44
+ process = psutil.Process()
45
+ self.start_memory = process.memory_info().rss / 1024 / 1024 # MB
46
+ self.peak_memory = self.start_memory
47
+
48
+ if torch.cuda.is_available():
49
+ torch.cuda.empty_cache()
50
+ self.start_gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024 # MB
51
+ self.peak_gpu_memory = self.start_gpu_memory
52
+
53
+ def update_peak_memory(self):
54
+ process = psutil.Process()
55
+ current_memory = process.memory_info().rss / 1024 / 1024 # MB
56
+ self.peak_memory = max(self.peak_memory, current_memory)
57
+
58
+ if torch.cuda.is_available():
59
+ current_gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024 # MB
60
+ self.peak_gpu_memory = max(self.peak_gpu_memory, current_gpu_memory)
61
+
62
+ def stop_monitoring(self):
63
+ self.end_time = time.time()
64
+ self.update_peak_memory()
65
+
66
+ metrics = {
67
+ "duration_ms": (self.end_time - self.start_time) * 1000,
68
+ "cpu_memory_start_mb": self.start_memory,
69
+ "cpu_memory_peak_mb": self.peak_memory,
70
+ "cpu_memory_used_mb": self.peak_memory - self.start_memory,
71
+ }
72
+
73
+ if torch.cuda.is_available():
74
+ metrics.update({
75
+ "gpu_memory_start_mb": self.start_gpu_memory,
76
+ "gpu_memory_peak_mb": self.peak_gpu_memory,
77
+ "gpu_memory_used_mb": self.peak_gpu_memory - self.start_gpu_memory,
78
+ })
79
+
80
+ return metrics
81
+
82
+ class CustomQwen3Attention(Qwen3Attention):
83
+ def __init__(self, config, layer_idx: int):
84
+ super().__init__(config, layer_idx)
85
+ self.layer_idx = layer_idx
86
+ self.tokenizer = None
87
+ self.current_input_ids = None
88
+ self.threshold = 0.1
89
+
90
+ if not hasattr(self, 'num_key_value_heads'):
91
+ self.num_key_value_heads = config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else config.num_attention_heads
92
+ if not hasattr(self, 'head_dim'):
93
+ self.head_dim = config.hidden_size // config.num_attention_heads
94
+
95
+ def set_tokenizer(self, tokenizer):
96
+ self.tokenizer = tokenizer
97
+
98
+ def set_current_input_ids(self, input_ids):
99
+ self.current_input_ids = input_ids
100
+
101
+ def _is_special_token(self, token: str) -> bool:
102
+ if self.tokenizer is None:
103
+ return False
104
+
105
+ special_tokens = set()
106
+
107
+ if hasattr(self.tokenizer, 'special_tokens_map'):
108
+ for token_type, token_value in self.tokenizer.special_tokens_map.items():
109
+ if isinstance(token_value, str):
110
+ special_tokens.add(token_value)
111
+ elif isinstance(token_value, list):
112
+ special_tokens.update(token_value)
113
+
114
+ if hasattr(self.tokenizer, 'added_tokens_encoder'):
115
+ special_tokens.update(self.tokenizer.added_tokens_encoder.keys())
116
+
117
+ if token in special_tokens:
118
+ return True
119
+
120
+ special_patterns = [
121
+ lambda t: t.startswith('<|') and t.endswith('|>'),
122
+ lambda t: t.startswith('<') and t.endswith('>'),
123
+ lambda t: t.startswith('[') and t.endswith(']'),
124
+ ]
125
+
126
+ return any(pattern(token) for pattern in special_patterns)
127
+
128
+ def _get_token_relations(self, attention_weights: torch.Tensor, tokens: List[str]) -> List[Dict]:
129
+ batch_size, num_heads, query_len, key_len = attention_weights.shape
130
+ attn = attention_weights[0].mean(dim=0)
131
+ relations = []
132
+
133
+ if query_len == 1:
134
+ current_token_pos = len(tokens) - 1
135
+ token_relations = []
136
+ for j in range(len(tokens)):
137
+ if j != current_token_pos:
138
+ weight = attn[0, j].item()
139
+ if weight > self.threshold:
140
+ token_relations.append({
141
+ 'target_pos': j,
142
+ 'weight': round(weight, 3)
143
+ })
144
+
145
+ relations.append({
146
+ 'source_pos': current_token_pos,
147
+ 'relations': token_relations
148
+ })
149
+ else:
150
+ for i in range(min(query_len, len(tokens))):
151
+ token_relations = []
152
+ for j in range(len(tokens)):
153
+ if i != j and j < key_len:
154
+ weight = attn[i, j].item()
155
+ if weight > self.threshold:
156
+ token_relations.append({
157
+ 'target_pos': j,
158
+ 'weight': round(weight, 3)
159
+ })
160
+
161
+ relations.append({
162
+ 'source_pos': i,
163
+ 'relations': token_relations
164
+ })
165
+
166
+ return relations
167
+
168
+ def _get_token_groups(self, attention_weights: torch.Tensor) -> List[List[int]]:
169
+ if self.tokenizer is None or self.current_input_ids is None:
170
+ return []
171
+
172
+ if len(attention_weights.shape) != 4:
173
+ return []
174
+
175
+ batch_size, num_heads, query_len, key_len = attention_weights.shape
176
+
177
+ input_ids = self.current_input_ids
178
+ if input_ids is None or input_ids.shape[1] < key_len:
179
+ return []
180
+
181
+ tokens = [self.tokenizer.decode([token_id]) for token_id in input_ids[0][:key_len]]
182
+ relations = self._get_token_relations(attention_weights, tokens)
183
+
184
+ groups = []
185
+ current_group = []
186
+ current_group_indices = []
187
+
188
+ for i, token in enumerate(tokens):
189
+ is_empty_relations = i < len(relations) and len(relations[i]['relations']) == 0
190
+ starts_with_space = token.startswith(' ') and token != ' '
191
+ is_space = token == ' '
192
+ is_new_line = '\n' in token
193
+
194
+ prev_token_is_special = False
195
+ prev_token_is_new_line = False
196
+ prev_token_is_space = False
197
+ if i > 0:
198
+ prev_token = tokens[i-1]
199
+ prev_token_is_special = self._is_special_token(prev_token)
200
+ prev_token_is_new_line = '\n' in prev_token
201
+ prev_token_is_space = prev_token == ' '
202
+
203
+ prev_newline_current_not = prev_token_is_new_line and not is_new_line
204
+ prev_space_current_not = prev_token_is_space and not is_space
205
+ current_space_prev_not = is_space and not prev_token_is_space
206
+
207
+ if (is_empty_relations or starts_with_space or is_new_line or
208
+ prev_token_is_special or prev_newline_current_not or prev_space_current_not or
209
+ current_space_prev_not) and current_group:
210
+ groups.append(current_group_indices)
211
+ current_group = []
212
+ current_group_indices = []
213
+
214
+ current_group.append(token)
215
+ current_group_indices.append(i)
216
+
217
+ if current_group:
218
+ groups.append(current_group_indices)
219
+
220
+ if groups:
221
+ logger.info("Token grouping details:")
222
+ for group_idx, group_indices in enumerate(groups):
223
+ group_tokens = [tokens[i] for i in group_indices]
224
+ combined_text = ''.join(group_tokens)
225
+ logger.info(f" Group {group_idx + 1}: {group_tokens} → '{combined_text}'")
226
+
227
+ return groups
228
+
229
+ class CustomQwen3DecoderLayer(Qwen3DecoderLayer):
230
+ """Custom Qwen3 decoder layer with grouping functionality."""
231
+
232
+ def __init__(self, config, layer_idx: int):
233
+ super().__init__(config, layer_idx)
234
+ self.layer_idx = layer_idx
235
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
236
+ self.self_attn = CustomQwen3Attention(config, layer_idx)
237
+ self.is_initialized = False
238
+ self.grouped_hidden_states = None
239
+
240
+ def forward(
241
+ self,
242
+ hidden_states: torch.Tensor,
243
+ attention_mask: Optional[torch.Tensor] = None,
244
+ position_ids: Optional[torch.LongTensor] = None,
245
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
246
+ output_attentions: Optional[bool] = False,
247
+ use_cache: Optional[bool] = False,
248
+ cache_position: Optional[torch.LongTensor] = None,
249
+ position_embeddings: Optional[tuple] = None,
250
+ **kwargs,
251
+ ):
252
+ if self.layer_idx != 0:
253
+ return super().forward(
254
+ hidden_states=hidden_states,
255
+ attention_mask=attention_mask,
256
+ position_ids=position_ids,
257
+ past_key_value=past_key_value,
258
+ output_attentions=output_attentions,
259
+ use_cache=use_cache,
260
+ cache_position=cache_position,
261
+ position_embeddings=position_embeddings,
262
+ **kwargs,
263
+ )
264
+
265
+ is_prefill = hidden_states.shape[1] > 1 and not self.is_initialized
266
+ if not is_prefill:
267
+ return super().forward(
268
+ hidden_states=hidden_states,
269
+ attention_mask=attention_mask,
270
+ position_ids=position_ids,
271
+ past_key_value=past_key_value,
272
+ output_attentions=output_attentions,
273
+ use_cache=use_cache,
274
+ cache_position=cache_position,
275
+ position_embeddings=position_embeddings,
276
+ **kwargs,
277
+ )
278
+
279
+ residual = hidden_states
280
+ x = self.input_layernorm(hidden_states)
281
+
282
+ attn_ret = self.self_attn(
283
+ hidden_states=x,
284
+ attention_mask=attention_mask,
285
+ position_ids=position_ids,
286
+ past_key_value=None,
287
+ output_attentions=True,
288
+ use_cache=False,
289
+ cache_position=cache_position,
290
+ position_embeddings=position_embeddings,
291
+ )
292
+
293
+ if isinstance(attn_ret, tuple):
294
+ if len(attn_ret) == 3:
295
+ attn_out, attn_weights, _ = attn_ret
296
+ elif len(attn_ret) == 2:
297
+ attn_out, attn_weights = attn_ret
298
+ else:
299
+ raise RuntimeError(f"Unexpected attention return length: {len(attn_ret)}")
300
+ else:
301
+ raise RuntimeError("Attention did not return weights.")
302
+
303
+ groups = self.self_attn._get_token_groups(attn_weights)
304
+ if not groups:
305
+ self.is_initialized = True
306
+ return super().forward(
307
+ hidden_states=hidden_states,
308
+ attention_mask=attention_mask,
309
+ position_ids=position_ids,
310
+ past_key_value=past_key_value,
311
+ output_attentions=output_attentions,
312
+ use_cache=use_cache,
313
+ cache_position=cache_position,
314
+ position_embeddings=position_embeddings,
315
+ **kwargs,
316
+ )
317
+
318
+ averaged_vectors = []
319
+ group_info = []
320
+
321
+ for gi, idxs in enumerate(groups):
322
+ if len(idxs) == 1:
323
+ averaged_vectors.append(attn_out[:, idxs[0], :])
324
+ group_info.append({"type": "single", "positions": idxs, "new_position": gi})
325
+ else:
326
+ gvecs = attn_out[:, idxs, :]
327
+ ave = gvecs.mean(dim=1)
328
+ averaged_vectors.append(ave)
329
+ group_info.append({"type": "averaged", "positions": idxs, "new_position": gi})
330
+
331
+ new_attn_out = torch.stack(averaged_vectors, dim=1)
332
+
333
+ expanded_residual = torch.stack([
334
+ (
335
+ residual[:, info['positions'], :].sum(dim=1)
336
+ if len(info['positions']) > 1
337
+ else residual[:, info['positions'][0], :]
338
+ )
339
+ for info in group_info
340
+ ], dim=1)
341
+
342
+ hs = expanded_residual + new_attn_out
343
+ grouped_hidden = self.post_attention_layernorm(hs)
344
+
345
+ # Store grouped embeddings
346
+ self.grouped_hidden_states = grouped_hidden
347
+ self.is_initialized = True
348
+ return hs
349
+
350
+ class GroupedInputMLPAdapter(nn.Module):
351
+ def __init__(self, config):
352
+ super().__init__()
353
+ self.config = config
354
+ hidden_size = config.hidden_size
355
+
356
+ self.grouped_processor = nn.Sequential(
357
+ nn.Linear(hidden_size, hidden_size * 2),
358
+ nn.SiLU(),
359
+ nn.Dropout(0.1),
360
+ nn.Linear(hidden_size * 2, hidden_size),
361
+ nn.Dropout(0.1)
362
+ )
363
+
364
+ norm_eps = getattr(config, 'rms_norm_eps', 1e-6)
365
+ self.layer_norm = Qwen3RMSNorm(hidden_size, eps=norm_eps)
366
+
367
+ def forward(self, grouped_embeds: torch.Tensor) -> torch.Tensor:
368
+ processed = self.grouped_processor(grouped_embeds)
369
+ output = self.layer_norm(grouped_embeds + processed)
370
+ return output
371
+
372
+
373
+ class CustomQwen3ForCausalLM(Qwen3ForCausalLM):
374
+ def __init__(self, config):
375
+ super().__init__(config)
376
+ self.grouped_input_mlp = GroupedInputMLPAdapter(config)
377
+ self.is_grouped_input_mode = False
378
+
379
+ def forward(
380
+ self,
381
+ input_ids: Optional[torch.LongTensor] = None,
382
+ attention_mask: Optional[torch.Tensor] = None,
383
+ position_ids: Optional[torch.LongTensor] = None,
384
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
385
+ inputs_embeds: Optional[torch.FloatTensor] = None,
386
+ use_cache: Optional[bool] = None,
387
+ output_attentions: Optional[bool] = None,
388
+ output_hidden_states: Optional[bool] = None,
389
+ return_dict: Optional[bool] = None,
390
+ cache_position: Optional[torch.LongTensor] = None,
391
+ grouped_inputs: Optional[torch.FloatTensor] = None,
392
+ is_prefill: Optional[bool] = None,
393
+ **kwargs
394
+ ):
395
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
396
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
397
+
398
+ if grouped_inputs is not None and is_prefill:
399
+ self.is_grouped_input_mode = True
400
+ processed_grouped_inputs = self.grouped_input_mlp(grouped_inputs)
401
+ inputs_embeds = processed_grouped_inputs
402
+ input_ids = None
403
+
404
+ batch_size, seq_len = inputs_embeds.shape[:2]
405
+ if position_ids is None:
406
+ device = inputs_embeds.device
407
+ position_ids = torch.arange(seq_len, device=device, dtype=torch.long)
408
+ position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
409
+
410
+ if attention_mask is None:
411
+ attention_mask = torch.ones((batch_size, seq_len), device=inputs_embeds.device, dtype=torch.long)
412
+
413
+ return super().forward(
414
+ input_ids=input_ids,
415
+ attention_mask=attention_mask,
416
+ position_ids=position_ids,
417
+ past_key_values=past_key_values,
418
+ inputs_embeds=inputs_embeds,
419
+ use_cache=use_cache,
420
+ output_attentions=output_attentions,
421
+ output_hidden_states=output_hidden_states,
422
+ return_dict=return_dict,
423
+ cache_position=cache_position,
424
+ **kwargs
425
+ )
426
+
427
+ def create_grouping_model(model_name: str = "Qwen/Qwen3-0.6B") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
428
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
429
+
430
+ if torch.cuda.is_available():
431
+ device = torch.device("cuda")
432
+ dtype = torch.float16
433
+ else:
434
+ device = torch.device("cpu")
435
+ dtype = torch.float32
436
+
437
+ model = AutoModelForCausalLM.from_pretrained(
438
+ model_name,
439
+ torch_dtype=dtype,
440
+ attn_implementation="eager"
441
+ ).to(device)
442
+
443
+ orig0 = model.model.layers[0]
444
+ custom0 = CustomQwen3DecoderLayer(model.config, 0)
445
+
446
+ custom0.mlp.load_state_dict(orig0.mlp.state_dict())
447
+ custom0.input_layernorm.load_state_dict(orig0.input_layernorm.state_dict())
448
+ custom0.post_attention_layernorm.load_state_dict(orig0.post_attention_layernorm.state_dict())
449
+ custom0.self_attn.load_state_dict(orig0.self_attn.state_dict())
450
+
451
+ custom0.self_attn.set_tokenizer(tokenizer)
452
+ custom0 = custom0.to(device=device, dtype=dtype)
453
+ model.model.layers[0] = custom0
454
+
455
+ return model, tokenizer
456
+
457
+
458
+ def load_inference_model(checkpoint_path: str) -> Tuple[CustomQwen3ForCausalLM, AutoTokenizer]:
459
+ logger.info(f"Loading inference model from {checkpoint_path}")
460
+
461
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
462
+ config = Qwen3Config.from_pretrained(checkpoint_path)
463
+ model = CustomQwen3ForCausalLM(config)
464
+
465
+ model_path = Path(checkpoint_path) / "pytorch_model.bin"
466
+ if not model_path.exists():
467
+ model_path = Path(checkpoint_path) / "model.safetensors"
468
+
469
+ if not model_path.exists():
470
+ raise FileNotFoundError(f"No model weights found in {checkpoint_path}")
471
+
472
+ state_dict = torch.load(model_path, map_location="cpu")
473
+ model.load_state_dict(state_dict, strict=False)
474
+ model = model.eval().to(torch.float32)
475
+
476
+ return model, tokenizer
477
+
478
+ class Qwen3GroupedInference:
479
+ def __init__(self,
480
+ checkpoint_path: str,
481
+ grouping_model_name: str = "Qwen/Qwen3-0.6B",
482
+ device: Optional[str] = None):
483
+ """Initialize inference system with both models."""
484
+
485
+ if device is None:
486
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
487
+ else:
488
+ self.device = torch.device(device)
489
+
490
+ logger.info(f"Initializing inference on device: {self.device}")
491
+
492
+ self.system_prompt = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
493
+ self.response_start = "<|im_end|>\n<|im_start|>assistant\n"
494
+
495
+ logger.info("Loading grouping model...")
496
+ self.grouping_model, self.grouping_tokenizer = create_grouping_model(grouping_model_name)
497
+ self.grouping_model = self.grouping_model.to(self.device)
498
+
499
+ logger.info("Loading inference model...")
500
+ self.inference_model, self.inference_tokenizer = load_inference_model(checkpoint_path)
501
+ self.inference_model = self.inference_model.to(self.device)
502
+
503
+ logger.info("Both models loaded successfully")
504
+
505
+ def format_input_text(self, instruction: str) -> str:
506
+ return f"{self.system_prompt}{instruction}{self.response_start}"
507
+
508
+ def get_grouped_embeddings(self, text: str) -> Tuple[torch.Tensor, Dict[str, Any]]:
509
+ monitor = PerformanceMonitor()
510
+ monitor.start_monitoring()
511
+
512
+ if hasattr(self.grouping_model.model.layers[0], "is_initialized"):
513
+ self.grouping_model.model.layers[0].is_initialized = False
514
+
515
+ batch = self.grouping_tokenizer(text, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
516
+ input_ids = batch["input_ids"]
517
+ original_token_count = input_ids.shape[1]
518
+
519
+ original_tokens = [self.grouping_tokenizer.decode([token_id]) for token_id in input_ids[0]]
520
+ logger.info(f"Original input tokens ({original_token_count}): {original_tokens}")
521
+
522
+ if hasattr(self.grouping_model.model.layers[0], "self_attn"):
523
+ sat = self.grouping_model.model.layers[0].self_attn
524
+ if hasattr(sat, "set_current_input_ids"):
525
+ sat.set_current_input_ids(input_ids)
526
+
527
+ monitor.update_peak_memory()
528
+
529
+ with torch.no_grad():
530
+ inputs_embeds = self.grouping_model.model.embed_tokens(input_ids)
531
+ seq_len = inputs_embeds.shape[1]
532
+ position_ids = torch.arange(seq_len, device=self.device, dtype=torch.long).unsqueeze(0)
533
+
534
+ if hasattr(self.grouping_model.model, 'rotary_emb'):
535
+ pos_embeds = self.grouping_model.model.rotary_emb(inputs_embeds, position_ids)
536
+ else:
537
+ pos_embeds = None
538
+
539
+ monitor.update_peak_memory()
540
+
541
+ _ = self.grouping_model.model.layers[0](
542
+ hidden_states=inputs_embeds,
543
+ attention_mask=None,
544
+ position_ids=position_ids,
545
+ past_key_value=None,
546
+ output_attentions=False,
547
+ use_cache=False,
548
+ cache_position=None,
549
+ position_embeddings=pos_embeds,
550
+ )
551
+
552
+ monitor.update_peak_memory()
553
+
554
+ if (hasattr(self.grouping_model.model.layers[0], "grouped_hidden_states") and
555
+ self.grouping_model.model.layers[0].grouped_hidden_states is not None):
556
+ grouped_embeds = self.grouping_model.model.layers[0].grouped_hidden_states.clone()
557
+ grouped_token_count = grouped_embeds.shape[1]
558
+ # Clear the stored state
559
+ self.grouping_model.model.layers[0].grouped_hidden_states = None
560
+
561
+ compression_ratio = original_token_count / grouped_token_count if grouped_token_count > 0 else 1.0
562
+ reduction_percent = (1 - grouped_token_count / original_token_count) * 100 if original_token_count > 0 else 0.0
563
+
564
+ logger.info(f"Grouped tokens: {grouped_token_count}")
565
+ logger.info(f"Compression ratio: {compression_ratio:.2f}x ({reduction_percent:.1f}% reduction)")
566
+
567
+ metrics = monitor.stop_monitoring()
568
+ metrics.update({
569
+ "original_tokens": original_token_count,
570
+ "grouped_tokens": grouped_token_count,
571
+ "compression_ratio": compression_ratio,
572
+ "reduction_percent": reduction_percent
573
+ })
574
+
575
+ return grouped_embeds.squeeze(0), metrics
576
+ else:
577
+ logger.warning("Grouping failed, using original embeddings")
578
+
579
+ metrics = monitor.stop_monitoring()
580
+ metrics.update({
581
+ "original_tokens": original_token_count,
582
+ "grouped_tokens": original_token_count,
583
+ "compression_ratio": 1.0,
584
+ "reduction_percent": 0.0
585
+ })
586
+
587
+ return inputs_embeds.squeeze(0), metrics
588
+
589
+ def generate_with_grouped_input(self,
590
+ grouped_input: torch.Tensor,
591
+ max_length: int = 512,
592
+ temperature: float = 0.7,
593
+ do_sample: bool = True) -> Tuple[str, Dict[str, Any]]:
594
+ """Generate text using grouped input embeddings."""
595
+ monitor = PerformanceMonitor()
596
+ monitor.start_monitoring()
597
+
598
+ model_dtype = next(self.inference_model.parameters()).dtype
599
+ grouped_input = grouped_input.to(device=self.device, dtype=model_dtype)
600
+
601
+ if grouped_input.ndim == 2:
602
+ grouped_input = grouped_input.unsqueeze(0)
603
+
604
+ input_seq_len = grouped_input.shape[1]
605
+ logger.info(f"Inference model input sequence length: {input_seq_len}")
606
+
607
+ monitor.update_peak_memory()
608
+
609
+ with torch.no_grad():
610
+ outputs = self.inference_model(
611
+ grouped_inputs=grouped_input,
612
+ is_prefill=True,
613
+ use_cache=True,
614
+ return_dict=True
615
+ )
616
+
617
+ monitor.update_peak_memory()
618
+
619
+ if hasattr(outputs, 'logits') and outputs.logits is not None:
620
+ next_token_logits = outputs.logits[:, -1, :]
621
+ else:
622
+ raise RuntimeError("Could not extract logits from model output")
623
+
624
+ if do_sample:
625
+ next_token_logits = next_token_logits / temperature
626
+ probs = F.softmax(next_token_logits, dim=-1)
627
+ next_token = torch.multinomial(probs, num_samples=1)
628
+ else:
629
+ next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
630
+
631
+ generated_ids = next_token
632
+ past_key_values = getattr(outputs, 'past_key_values', None)
633
+
634
+ generated_tokens = 1
635
+ for step in range(max_length - 1):
636
+ monitor.update_peak_memory()
637
+
638
+ with torch.no_grad():
639
+ outputs = self.inference_model(
640
+ input_ids=next_token,
641
+ past_key_values=past_key_values,
642
+ use_cache=True,
643
+ return_dict=True
644
+ )
645
+
646
+ if not hasattr(outputs, 'logits'):
647
+ break
648
+
649
+ next_token_logits = outputs.logits[:, -1, :]
650
+
651
+ if do_sample:
652
+ next_token_logits = next_token_logits / temperature
653
+ probs = F.softmax(next_token_logits, dim=-1)
654
+ next_token = torch.multinomial(probs, num_samples=1)
655
+ else:
656
+ next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
657
+
658
+ generated_ids = torch.cat([generated_ids, next_token], dim=1)
659
+ past_key_values = getattr(outputs, 'past_key_values', None)
660
+ generated_tokens += 1
661
+
662
+ if next_token.item() == self.inference_tokenizer.eos_token_id:
663
+ break
664
+
665
+ generated_text = self.inference_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
666
+
667
+ metrics = monitor.stop_monitoring()
668
+ metrics.update({
669
+ "input_seq_len": input_seq_len,
670
+ "generated_tokens": generated_tokens,
671
+ "tokens_per_second": generated_tokens / (metrics["duration_ms"] / 1000) if metrics["duration_ms"] > 0 else 0
672
+ })
673
+
674
+ logger.info(f"Generated {generated_tokens} tokens in {metrics['duration_ms']:.1f}ms")
675
+ logger.info(f"Generation speed: {metrics['tokens_per_second']:.1f} tokens/second")
676
+
677
+ return generated_text, metrics
678
+
679
+ def inference(self,
680
+ instruction: str,
681
+ max_length: int = 512,
682
+ temperature: float = 0.7,
683
+ do_sample: bool = True) -> Dict[str, Any]:
684
+ """Run complete inference pipeline from instruction to response."""
685
+ logger.info("=" * 60)
686
+ logger.info("STARTING INFERENCE PIPELINE")
687
+ logger.info("=" * 60)
688
+
689
+ input_text = self.format_input_text(instruction)
690
+
691
+ logger.info("PHASE 1: Token Grouping")
692
+ grouped_embeddings, grouping_metrics = self.get_grouped_embeddings(input_text)
693
+
694
+ logger.info("PHASE 2: Response Generation")
695
+ response, generation_metrics = self.generate_with_grouped_input(
696
+ grouped_input=grouped_embeddings,
697
+ max_length=max_length,
698
+ temperature=temperature,
699
+ do_sample=do_sample
700
+ )
701
+
702
+ total_metrics = {
703
+ "grouping": grouping_metrics,
704
+ "generation": generation_metrics,
705
+ "total_duration_ms": grouping_metrics["duration_ms"] + generation_metrics["duration_ms"],
706
+ }
707
+
708
+ logger.info("=" * 60)
709
+ logger.info("INFERENCE SUMMARY")
710
+ logger.info("=" * 60)
711
+ logger.info(f"Input compression: {grouping_metrics['original_tokens']} β†’ {grouping_metrics['grouped_tokens']} tokens")
712
+ logger.info(f"Compression ratio: {grouping_metrics['compression_ratio']:.2f}x")
713
+ logger.info(f"Memory reduction: {grouping_metrics['reduction_percent']:.1f}%")
714
+ logger.info(f"Total time: {total_metrics['total_duration_ms']:.1f}ms")
715
+ logger.info(f"Generation speed: {generation_metrics['tokens_per_second']:.1f} tokens/sec")
716
+
717
+ if torch.cuda.is_available():
718
+ total_gpu_memory = grouping_metrics.get("gpu_memory_used_mb", 0) + generation_metrics.get("gpu_memory_used_mb", 0)
719
+ logger.info(f"Total GPU memory used: {total_gpu_memory:.1f}MB")
720
+
721
+ total_cpu_memory = grouping_metrics.get("cpu_memory_used_mb", 0) + generation_metrics.get("cpu_memory_used_mb", 0)
722
+ logger.info(f"Total CPU memory used: {total_cpu_memory:.1f}MB")
723
+
724
+ original_seq_len = grouping_metrics['original_tokens']
725
+ grouped_seq_len = grouping_metrics['grouped_tokens']
726
+
727
+ estimated_memory_savings = (1 - (grouped_seq_len ** 2) / (original_seq_len ** 2)) * 100 if original_seq_len > 0 else 0
728
+ logger.info(f"Estimated attention memory savings: {estimated_memory_savings:.1f}%")
729
+
730
+ logger.info("=" * 60)
731
+
732
+ return {
733
+ "instruction": instruction,
734
+ "response": response,
735
+ "metrics": total_metrics
736
+ }
737
+
738
+ def main():
739
+ import argparse
740
+
741
+ parser = argparse.ArgumentParser(description="Qwen3 Grouped Inference")
742
+ parser.add_argument("--checkpoint", type=str,
743
+ default="./grouped_qwen3_checkpoint/epoch_2_best",
744
+ help="Path to trained model checkpoint")
745
+ parser.add_argument("--grouping_model", type=str, default="Qwen/Qwen3-0.6B",
746
+ help="Grouping model name")
747
+ parser.add_argument("--instruction", type=str, default="""
748
+ What are neural networks? Explain them as you would to a 9th-grade student.
749
+ """,
750
+ help="Instruction for inference")
751
+ parser.add_argument("--max_length", type=int, default=512,
752
+ help="Maximum generation length")
753
+ parser.add_argument("--temperature", type=float, default=0.7,
754
+ help="Generation temperature")
755
+ parser.add_argument("--no_sample", action="store_true",
756
+ help="Use greedy decoding")
757
+ parser.add_argument("--device", type=str,
758
+ help="Device to use (cuda/cpu)")
759
+
760
+ args = parser.parse_args()
761
+
762
+ inference_system = Qwen3GroupedInference(
763
+ checkpoint_path=args.checkpoint,
764
+ grouping_model_name=args.grouping_model,
765
+ device=args.device
766
+ )
767
+
768
+ do_sample = not args.no_sample
769
+
770
+ result = inference_system.inference(
771
+ instruction=args.instruction,
772
+ max_length=args.max_length,
773
+ temperature=args.temperature,
774
+ do_sample=do_sample
775
+ )
776
+
777
+ print(f"\nInstruction: {result['instruction']}")
778
+ print(f"Response: {result['response']}")
779
+
780
+ metrics = result.get('metrics', {})
781
+ if metrics:
782
+ print(f"\n--- Performance Metrics ---")
783
+ grouping = metrics.get('grouping', {})
784
+ generation = metrics.get('generation', {})
785
+
786
+ print(f"Token compression: {grouping.get('compression_ratio', 'N/A'):.2f}x")
787
+ print(f"Memory reduction: {grouping.get('reduction_percent', 'N/A'):.1f}%")
788
+ print(f"Total time: {metrics.get('total_duration_ms', 'N/A'):.1f}ms")
789
+ print(f"Generation speed: {generation.get('tokens_per_second', 'N/A'):.1f} tokens/sec")
790
+
791
+
792
+ if __name__ == "__main__":
793
+ main()
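For reference, a minimal usage sketch of the inference pipeline defined above. The module name grouped_inference is illustrative (adjust it to wherever this script lives); the class name, checkpoint path, and keyword arguments come from the script itself.

# Hypothetical usage sketch; only the import path is assumed.
from grouped_inference import Qwen3GroupedInference

system = Qwen3GroupedInference(
    checkpoint_path="./grouped_qwen3_checkpoint/epoch_2_best",
    grouping_model_name="Qwen/Qwen3-0.6B",
    device=None,  # auto-select, matching the CLI default
)
result = system.inference(
    instruction="Explain neural networks to a 9th-grade student.",
    max_length=256,
    temperature=0.7,
    do_sample=True,
)
print(result["response"])
print(result["metrics"]["grouping"]["compression_ratio"])

The same run is available from the command line through the argparse entry point above, e.g. python grouped_inference.py --instruction "..." --max_length 256.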
prepare_dataset.py ADDED
@@ -0,0 +1,652 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import json
5
+ import pickle
6
+ from typing import Optional, Tuple, List, Dict, Any
7
+ from pathlib import Path
8
+ from tqdm import tqdm
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from datasets import load_dataset, Dataset
13
+
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
+ from transformers.models.qwen3.modeling_qwen3 import (
16
+ Qwen3DecoderLayer,
17
+ Qwen3Attention,
18
+ Qwen3RotaryEmbedding,
19
+ )
20
+
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format="%(asctime)s - %(levelname)s - %(message)s",
24
+ handlers=[logging.StreamHandler(sys.stdout)],
25
+ force=True,
26
+ )
27
+ logger = logging.getLogger("qwen3_dataset_processor")
28
+
29
+ class GroupedCache:
30
+ """Cache for grouping metadata."""
31
+ def __init__(self):
32
+ self.grouped_positions = None
33
+ self.position_mapping = None
34
+ self.group_info = None
35
+ self.original_seq_length = None
36
+
37
+ class CustomQwen3Attention(Qwen3Attention):
38
+ def __init__(self, config, layer_idx: int):
39
+ super().__init__(config, layer_idx)
40
+ self.layer_idx = layer_idx
41
+ self.tokenizer = None
42
+ self.current_input_ids = None
43
+ self.threshold = 0.1
44
+ self.grouped_cache = GroupedCache()
45
+
46
+ if not hasattr(self, 'num_key_value_heads'):
47
+ self.num_key_value_heads = config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else config.num_attention_heads
48
+ if not hasattr(self, 'head_dim'):
49
+ self.head_dim = config.hidden_size // config.num_attention_heads
50
+
51
+ def set_tokenizer(self, tokenizer):
52
+ self.tokenizer = tokenizer
53
+
54
+ def set_current_input_ids(self, input_ids):
55
+ self.current_input_ids = input_ids
56
+
57
+ def _is_special_token(self, token: str) -> bool:
58
+ if self.tokenizer is None:
59
+ return False
60
+
61
+ special_tokens = set()
62
+
63
+ if hasattr(self.tokenizer, 'special_tokens_map'):
64
+ for token_type, token_value in self.tokenizer.special_tokens_map.items():
65
+ if isinstance(token_value, str):
66
+ special_tokens.add(token_value)
67
+ elif isinstance(token_value, list):
68
+ special_tokens.update(token_value)
69
+
70
+ if hasattr(self.tokenizer, 'added_tokens_encoder'):
71
+ special_tokens.update(self.tokenizer.added_tokens_encoder.keys())
72
+
73
+ if token in special_tokens:
74
+ return True
75
+
76
+ special_patterns = [
77
+ lambda t: t.startswith('<|') and t.endswith('|>'),
78
+ lambda t: t.startswith('<') and t.endswith('>'),
79
+ lambda t: t.startswith('[') and t.endswith(']'),
80
+ ]
81
+
82
+ return any(pattern(token) for pattern in special_patterns)
83
+
84
+ def _get_token_relations(self, attention_weights: torch.Tensor, tokens: List[str]) -> List[Dict]:
85
+ batch_size, num_heads, query_len, key_len = attention_weights.shape
86
+
87
+ attn = attention_weights[0].mean(dim=0)
88
+
89
+ relations = []
90
+
91
+ if query_len == 1:
92
+ current_token_pos = len(tokens) - 1
93
+
94
+ token_relations = []
95
+ for j in range(len(tokens)):
96
+ if j != current_token_pos:
97
+ weight = attn[0, j].item()
98
+ if weight > self.threshold:
99
+ token_relations.append({
100
+ 'target_pos': j,
101
+ 'weight': round(weight, 3)
102
+ })
103
+
104
+ relations.append({
105
+ 'source_pos': current_token_pos,
106
+ 'relations': token_relations
107
+ })
108
+
109
+ else:
110
+ for i in range(min(query_len, len(tokens))):
111
+ token_relations = []
112
+ for j in range(len(tokens)):
113
+ if i != j and j < key_len:
114
+ weight = attn[i, j].item()
115
+ if weight > self.threshold:
116
+ token_relations.append({
117
+ 'target_pos': j,
118
+ 'weight': round(weight, 3)
119
+ })
120
+
121
+ relations.append({
122
+ 'source_pos': i,
123
+ 'relations': token_relations
124
+ })
125
+
126
+ return relations
127
+
128
+ def _get_token_groups(self, attention_weights: torch.Tensor) -> List[List[int]]:
129
+ if self.tokenizer is None or self.current_input_ids is None:
130
+ return []
131
+
132
+ if len(attention_weights.shape) != 4:
133
+ return []
134
+
135
+ batch_size, num_heads, query_len, key_len = attention_weights.shape
136
+
137
+ input_ids = self.current_input_ids
138
+ if input_ids is None or input_ids.shape[1] < key_len:
139
+ return []
140
+
141
+ tokens = [self.tokenizer.decode([token_id]) for token_id in input_ids[0][:key_len]]
142
+
143
+ relations = self._get_token_relations(attention_weights, tokens)
144
+
145
+ groups = []
146
+ current_group = []
147
+ current_group_indices = []
148
+
149
+ for i, token in enumerate(tokens):
150
+ is_empty_relations = i < len(relations) and len(relations[i]['relations']) == 0
151
+ starts_with_space = token.startswith(' ') and token != ' '
152
+ is_space = token == ' '
153
+ is_new_line = '\n' in token
154
+
155
+ prev_token_is_special = False
156
+ prev_token_is_new_line = False
157
+ prev_token_is_space = False
158
+ if i > 0:
159
+ prev_token = tokens[i-1]
160
+ prev_token_is_special = self._is_special_token(prev_token)
161
+ prev_token_is_new_line = '\n' in prev_token
162
+ prev_token_is_space = prev_token == ' '
163
+
164
+ prev_newline_current_not = prev_token_is_new_line and not is_new_line
165
+ prev_space_current_not = prev_token_is_space and not is_space
166
+ current_space_prev_not = is_space and not prev_token_is_space
167
+
168
+ if (is_empty_relations or starts_with_space or is_new_line or
169
+ prev_token_is_special or prev_newline_current_not or prev_space_current_not or
170
+ current_space_prev_not) and current_group:
171
+ groups.append(current_group_indices)
172
+ current_group = []
173
+ current_group_indices = []
174
+
175
+ current_group.append(token)
176
+ current_group_indices.append(i)
177
+
178
+ if current_group:
179
+ groups.append(current_group_indices)
180
+
181
+ return groups
182
+
183
+ class CustomQwen3DecoderLayer(Qwen3DecoderLayer):
184
+ def __init__(self, config, layer_idx: int):
185
+ super().__init__(config, layer_idx)
186
+ self.layer_idx = layer_idx
187
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
188
+ self.self_attn = CustomQwen3Attention(config, layer_idx)
189
+ self.is_initialized = False
190
+ self.grouped_hidden_states = None
191
+ self.grouped_cache = GroupedCache()
192
+
193
+ def forward(
194
+ self,
195
+ hidden_states: torch.Tensor,
196
+ attention_mask: Optional[torch.Tensor] = None,
197
+ position_ids: Optional[torch.LongTensor] = None,
198
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
199
+ output_attentions: Optional[bool] = False,
200
+ use_cache: Optional[bool] = False,
201
+ cache_position: Optional[torch.LongTensor] = None,
202
+ position_embeddings: Optional[tuple] = None,
203
+ **kwargs,
204
+ ):
205
+ if self.layer_idx != 0:
206
+ return super().forward(
207
+ hidden_states=hidden_states,
208
+ attention_mask=attention_mask,
209
+ position_ids=position_ids,
210
+ past_key_value=past_key_value,
211
+ output_attentions=output_attentions,
212
+ use_cache=use_cache,
213
+ cache_position=cache_position,
214
+ position_embeddings=position_embeddings,
215
+ **kwargs,
216
+ )
217
+
218
+ is_prefill = hidden_states.shape[1] > 1 and not self.is_initialized
219
+ if not is_prefill:
220
+ return super().forward(
221
+ hidden_states=hidden_states,
222
+ attention_mask=attention_mask,
223
+ position_ids=position_ids,
224
+ past_key_value=past_key_value,
225
+ output_attentions=output_attentions,
226
+ use_cache=use_cache,
227
+ cache_position=cache_position,
228
+ position_embeddings=position_embeddings,
229
+ **kwargs,
230
+ )
231
+
232
+ residual = hidden_states
233
+ x = self.input_layernorm(hidden_states)
234
+
235
+ attn_ret = self.self_attn(
236
+ hidden_states=x,
237
+ attention_mask=attention_mask,
238
+ position_ids=position_ids,
239
+ past_key_value=None,
240
+ output_attentions=True,
241
+ use_cache=False,
242
+ cache_position=cache_position,
243
+ position_embeddings=position_embeddings,
244
+ )
245
+
246
+ if isinstance(attn_ret, tuple):
247
+ if len(attn_ret) == 3:
248
+ attn_out, attn_weights, _ = attn_ret
249
+ elif len(attn_ret) == 2:
250
+ attn_out, attn_weights = attn_ret
251
+ else:
252
+ raise RuntimeError(f"Unexpected attention return length: {len(attn_ret)}")
253
+ else:
254
+ raise RuntimeError("Attention did not return weights.")
255
+
256
+ groups = self.self_attn._get_token_groups(attn_weights)
257
+ if not groups:
258
+ self.is_initialized = True
259
+ return super().forward(
260
+ hidden_states=hidden_states,
261
+ attention_mask=attention_mask,
262
+ position_ids=position_ids,
263
+ past_key_value=past_key_value,
264
+ output_attentions=output_attentions,
265
+ use_cache=use_cache,
266
+ cache_position=cache_position,
267
+ position_embeddings=position_embeddings,
268
+ **kwargs,
269
+ )
270
+
271
+ averaged_vectors = []
272
+ group_info = []
273
+ position_mapping = {}
274
+
275
+ for gi, idxs in enumerate(groups):
276
+ if len(idxs) == 1:
277
+ averaged_vectors.append(attn_out[:, idxs[0], :])
278
+ group_info.append({"type": "single", "positions": idxs, "new_position": gi})
279
+ else:
280
+ gvecs = attn_out[:, idxs, :]
281
+ ave = gvecs.mean(dim=1)
282
+ averaged_vectors.append(ave)
283
+ group_info.append({"type": "averaged", "positions": idxs, "new_position": gi})
284
+
285
+ for p in idxs:
286
+ position_mapping[p] = gi
287
+
288
+ new_attn_out = torch.stack(averaged_vectors, dim=1)
289
+
290
+ expanded_residual = torch.stack([
291
+ (
292
+ residual[:, info['positions'], :].sum(dim=1)
293
+ if len(info['positions']) > 1
294
+ else residual[:, info['positions'][0], :]
295
+ )
296
+ for info in group_info
297
+ ], dim=1)
298
+
299
+ hs = expanded_residual + new_attn_out
300
+ grouped_hidden = self.post_attention_layernorm(hs)
301
+
302
+ self.grouped_cache.grouped_positions = len(groups)
303
+ self.grouped_cache.position_mapping = position_mapping
304
+ self.grouped_cache.group_info = group_info
305
+ self.grouped_cache.original_seq_length = hidden_states.shape[1]
306
+ self.grouped_hidden_states = grouped_hidden
307
+
308
+ self.is_initialized = True
309
+ return hs
310
+
311
+ def create_model_with_custom_layer0(model_name: str = "Qwen/Qwen3-0.6B"):
312
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
313
+
314
+ if torch.cuda.is_available():
315
+ device = torch.device("cuda")
316
+ dtype = torch.float16
317
+ else:
318
+ device = torch.device("cpu")
319
+ dtype = torch.float32
320
+
321
+ model = AutoModelForCausalLM.from_pretrained(
322
+ model_name,
323
+ torch_dtype=dtype,
324
+ attn_implementation="eager"
325
+ ).to(device)
326
+
327
+ orig0 = model.model.layers[0]
328
+ custom0 = CustomQwen3DecoderLayer(model.config, 0)
329
+
330
+ custom0.mlp.load_state_dict(orig0.mlp.state_dict())
331
+ custom0.input_layernorm.load_state_dict(orig0.input_layernorm.state_dict())
332
+ custom0.post_attention_layernorm.load_state_dict(orig0.post_attention_layernorm.state_dict())
333
+ custom0.self_attn.load_state_dict(orig0.self_attn.state_dict())
334
+
335
+ custom0.self_attn.set_tokenizer(tokenizer)
336
+ custom0 = custom0.to(device=device, dtype=dtype)
337
+ model.model.layers[0] = custom0
338
+
339
+ return model, tokenizer
340
+
341
+ class DatasetProcessor:
342
+ def __init__(self,
343
+ model_name: str = "Qwen/Qwen3-0.6B",
344
+ dataset_name: str = "Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1",
345
+ output_dir: str = "./processed_dataset",
346
+ batch_size: int = 1,
347
+ max_samples: Optional[int] = None,
348
+ save_frequency: int = 1000):
349
+
350
+ self.model_name = model_name
351
+ self.dataset_name = dataset_name
352
+ self.output_dir = Path(output_dir)
353
+ self.batch_size = batch_size
354
+ self.max_samples = max_samples
355
+ self.save_frequency = save_frequency
356
+
357
+ self.output_dir.mkdir(parents=True, exist_ok=True)
358
+
359
+ # System prompt template for Qwen3
360
+ self.system_prompt = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
361
+ self.response_start = "<|im_end|>\n<|im_start|>assistant\n"
362
+
363
+ self.current_chunk = 0
364
+ self.processed_data_buffer = []
365
+
366
+ def load_dataset(self) -> Dataset:
367
+ logger.info(f"Loading dataset: {self.dataset_name}")
368
+ dataset = load_dataset(self.dataset_name, split="train")
369
+
370
+ if self.max_samples:
371
+ dataset = dataset.select(range(min(self.max_samples, len(dataset))))
372
+
373
+ logger.info(f"Dataset loaded: {len(dataset)} samples")
374
+ return dataset
375
+
376
+ def format_input_text(self, instruction: str) -> str:
377
+ return f"{self.system_prompt}{instruction}{self.response_start}"
378
+
379
+ def process_embeddings_batch(self, model, tokenizer, texts: List[str]) -> List[torch.Tensor]:
380
+ device = model.device
381
+ embeddings_batch = []
382
+
383
+ for text in texts:
384
+ try:
385
+ if hasattr(model.model.layers[0], "is_initialized"):
386
+ model.model.layers[0].is_initialized = False
387
+
388
+ batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048).to(device)
389
+ input_ids = batch["input_ids"]
390
+
391
+ if hasattr(model.model.layers[0], "self_attn"):
392
+ sat = model.model.layers[0].self_attn
393
+ if hasattr(sat, "set_current_input_ids"):
394
+ sat.set_current_input_ids(input_ids)
395
+
396
+ with torch.no_grad():
397
+ inputs_embeds = model.model.embed_tokens(input_ids)
398
+ seq_len = inputs_embeds.shape[1]
399
+ position_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0)
400
+
401
+ if hasattr(model.model, 'rotary_emb'):
402
+ pos_embeds = model.model.rotary_emb(inputs_embeds, position_ids)
403
+ else:
404
+ pos_embeds = None
405
+
406
+ _ = model.model.layers[0](
407
+ hidden_states=inputs_embeds,
408
+ attention_mask=None,
409
+ position_ids=position_ids,
410
+ past_key_value=None,
411
+ output_attentions=False,
412
+ use_cache=False,
413
+ cache_position=None,
414
+ position_embeddings=pos_embeds,
415
+ )
416
+
417
+ if (hasattr(model.model.layers[0], "grouped_hidden_states") and
418
+ model.model.layers[0].grouped_hidden_states is not None):
419
+ grouped_embeds = model.model.layers[0].grouped_hidden_states.clone().cpu()
420
+ embeddings_batch.append(grouped_embeds.squeeze(0))
421
+
422
+ model.model.layers[0].grouped_hidden_states = None
423
+ else:
424
+ embeddings_batch.append(inputs_embeds.squeeze(0).cpu())
425
+
426
+ del inputs_embeds, position_ids
427
+ if pos_embeds is not None:
428
+ del pos_embeds
429
+ if torch.cuda.is_available():
430
+ torch.cuda.empty_cache()
431
+
432
+ except Exception as e:
433
+ logger.warning(f"Error processing sample: {e}")
434
+ embeddings_batch.append(torch.zeros(1, model.config.hidden_size))
435
+
436
+ return embeddings_batch
437
+
438
+ def save_chunk(self, chunk_data: List[Dict[str, Any]], chunk_id: int):
439
+ if not chunk_data:
440
+ return
441
+
442
+ chunk_path = self.output_dir / f"processed_chunk_{chunk_id:04d}.pkl"
443
+ with open(chunk_path, 'wb') as f:
444
+ pickle.dump(chunk_data, f)
445
+
446
+ # Clear memory
447
+ del chunk_data
448
+ import gc
449
+ gc.collect()
450
+
451
+ def merge_chunks(self) -> List[Dict[str, Any]]:
452
+ logger.info("Merging chunks...")
453
+
454
+ chunk_files = sorted(list(self.output_dir.glob("processed_chunk_*.pkl")))
455
+ if not chunk_files:
456
+ return []
457
+
458
+ merged_data = []
459
+ for chunk_file in tqdm(chunk_files, desc="Merging chunks"):
460
+ try:
461
+ with open(chunk_file, 'rb') as f:
462
+ chunk_data = pickle.load(f)
463
+ if isinstance(chunk_data, list):
464
+ merged_data.extend(chunk_data)
465
+ except Exception as e:
466
+ logger.error(f"Error loading chunk {chunk_file}: {e}")
467
+ continue
468
+
469
+ # Clean up chunk files
470
+ self.cleanup_chunks()
471
+
472
+ logger.info(f"Merged {len(chunk_files)} chunks into {len(merged_data)} samples")
473
+ return merged_data
474
+
475
+ def cleanup_chunks(self):
476
+ chunk_files = list(self.output_dir.glob("processed_chunk_*.pkl"))
477
+ for chunk_file in chunk_files:
478
+ try:
479
+ chunk_file.unlink()
480
+ except Exception as e:
481
+ logger.warning(f"Could not delete chunk {chunk_file}: {e}")
482
+
483
+ if chunk_files:
484
+ logger.info(f"Cleaned up {len(chunk_files)} temporary chunk files")
485
+
486
+ def save_final_dataset(self, processed_data: List[Dict[str, Any]], stats: Dict[str, int]):
487
+ pickle_path = self.output_dir / "processed_dataset.pkl"
488
+ with open(pickle_path, 'wb') as f:
489
+ pickle.dump(processed_data, f)
490
+
491
+ error_samples = sum(1 for sample in processed_data if sample.get("error", False))
492
+ successful_samples = len(processed_data) - error_samples
493
+
494
+ metadata = {
495
+ "model_name": self.model_name,
496
+ "dataset_name": self.dataset_name,
497
+ "total_samples": stats["total_samples"],
498
+ "processed_samples": len(processed_data),
499
+ "successful_samples": successful_samples,
500
+ "error_samples": error_samples,
501
+ "batch_size": self.batch_size,
502
+ "max_samples": self.max_samples,
503
+ "success_rate": f"{(successful_samples / len(processed_data) * 100):.2f}%" if processed_data else "0%"
504
+ }
505
+
506
+ with open(self.output_dir / "metadata.json", 'w', encoding='utf-8') as f:
507
+ json.dump(metadata, f, indent=2, ensure_ascii=False)
508
+
509
+ text_samples = []
510
+ count = 0
511
+ for i, sample in enumerate(processed_data):
512
+ if not sample.get("error", False) and count < 10:
513
+ text_samples.append({
514
+ "sample_id": i,
515
+ "input_text": sample["input_text"][:300] + "..." if len(sample["input_text"]) > 300 else sample["input_text"],
516
+ "response": sample["response"][:300] + "..." if len(sample["response"]) > 300 else sample["response"],
517
+ "embedding_shape": sample["embedding_shape"]
518
+ })
519
+ count += 1
520
+
521
+ with open(self.output_dir / "samples.json", 'w', encoding='utf-8') as f:
522
+ json.dump(text_samples, f, indent=2, ensure_ascii=False)
523
+
524
+ logger.info(f"Dataset saved: {len(processed_data)} samples")
525
+ logger.info(f"Success rate: {metadata['success_rate']}")
526
+
527
+ def process_dataset(self):
528
+ dataset = self.load_dataset()
529
+
530
+ logger.info("Loading model...")
531
+ model, tokenizer = create_model_with_custom_layer0(self.model_name)
532
+
533
+ total_samples = len(dataset)
534
+ processed_count = 0
535
+ error_count = 0
536
+
537
+ logger.info(f"Processing {total_samples} samples...")
538
+
539
+ for i in tqdm(range(0, total_samples, self.batch_size), desc="Processing"):
540
+ batch_end = min(i + self.batch_size, total_samples)
541
+ batch_samples = dataset.select(range(i, batch_end))
542
+
543
+ batch_texts = []
544
+ batch_instructions = []
545
+ batch_responses = []
546
+
547
+ try:
548
+ for sample in batch_samples:
549
+ instruction = sample.get("instruction", "")
550
+ response = sample.get("response", "")
551
+
552
+ if not instruction.strip() or not response.strip():
553
+ instruction = "Empty instruction"
554
+ response = "Empty response"
555
+
556
+ input_text = self.format_input_text(instruction)
557
+ batch_texts.append(input_text)
558
+ batch_instructions.append(input_text)
559
+ batch_responses.append(response)
560
+
561
+ embeddings_batch = self.process_embeddings_batch(model, tokenizer, batch_texts)
562
+
563
+ for j, (input_text, embedding, response) in enumerate(zip(batch_instructions, embeddings_batch, batch_responses)):
564
+ processed_sample = {
565
+ "input_text": input_text,
566
+ "inputs_embeds": embedding,
567
+ "response": response,
568
+ "embedding_shape": list(embedding.shape),
569
+ "original_index": i + j
570
+ }
571
+ self.processed_data_buffer.append(processed_sample)
572
+ processed_count += 1
573
+
574
+ if len(self.processed_data_buffer) >= self.save_frequency:
575
+ self.save_chunk(self.processed_data_buffer, self.current_chunk)
576
+ self.processed_data_buffer = []
577
+ self.current_chunk += 1
578
+
579
+ import gc
580
+ gc.collect()
581
+
582
+ except Exception as e:
583
+ logger.error(f"Error processing batch: {e}")
584
+ error_count += len(batch_samples)
585
+
586
+ if self.processed_data_buffer:
587
+ self.save_chunk(self.processed_data_buffer, self.current_chunk)
588
+ self.processed_data_buffer = []
589
+
590
+ merged_data = self.merge_chunks()
591
+
592
+ stats = {
593
+ "total_samples": total_samples,
594
+ "processed_count": processed_count,
595
+ "error_count": error_count
596
+ }
597
+
598
+ self.save_final_dataset(merged_data, stats)
599
+ return merged_data
600
+
601
+
602
+ def load_processed_dataset(dataset_path: str) -> List[Dict[str, Any]]:
603
+ pickle_path = Path(dataset_path) / "processed_dataset.pkl"
604
+ with open(pickle_path, 'rb') as f:
605
+ return pickle.load(f)
606
+
607
+
608
+ def get_dataset_info(dataset_path: str) -> Dict:
609
+ metadata_path = Path(dataset_path) / "metadata.json"
610
+ with open(metadata_path, 'r') as f:
611
+ return json.load(f)
612
+
613
+
614
+ def main():
615
+ model_name = "Qwen/Qwen3-0.6B"
616
+ dataset_name = "Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1"
617
+ output_dir = "./processed_qwen3_dataset"
618
+ batch_size = 1
619
+ max_samples = 10000 # Set to number for testing, None for full dataset
620
+ save_frequency = 1000
621
+
622
+ logger.info("Starting Qwen3 dataset processing...")
623
+ logger.info(f"Model: {model_name}")
624
+ logger.info(f"Dataset: {dataset_name}")
625
+ logger.info(f"Output: {output_dir}")
626
+ logger.info(f"Max samples: {max_samples or 'ALL'}")
627
+
628
+ try:
629
+ processor = DatasetProcessor(
630
+ model_name=model_name,
631
+ dataset_name=dataset_name,
632
+ output_dir=output_dir,
633
+ batch_size=batch_size,
634
+ max_samples=max_samples,
635
+ save_frequency=save_frequency
636
+ )
637
+
638
+ processed_data = processor.process_dataset()
639
+
640
+ logger.info("Processing completed successfully!")
641
+ logger.info(f"Final dataset: {len(processed_data)} samples")
642
+ logger.info(f"Files saved to: {output_dir}")
643
+
644
+ return processed_data
645
+
646
+ except Exception as e:
647
+ logger.error(f"Processing failed: {e}")
648
+ raise
649
+
650
+
651
+ if __name__ == "__main__":
652
+ main()
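A short sketch of consuming the output of prepare_dataset.py through the helper functions defined above; the directory path matches the default used in main().

# Sketch: load the processed dataset and inspect one sample.
from prepare_dataset import get_dataset_info, load_processed_dataset

info = get_dataset_info("./processed_qwen3_dataset")
print(info["processed_samples"], info["success_rate"])

data = load_processed_dataset("./processed_qwen3_dataset")
sample = data[0]
print(sample["input_text"][:80])
print(sample["inputs_embeds"].shape)  # grouped embeddings, e.g. [47, 1024]
print(sample["response"][:80])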
processed_qwen3_dataset/metadata.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-0.6B",
3
+ "dataset_name": "Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1",
4
+ "total_samples": 10000,
5
+ "processed_samples": 10000,
6
+ "successful_samples": 10000,
7
+ "error_samples": 0,
8
+ "batch_size": 1,
9
+ "max_samples": 10000,
10
+ "success_rate": "100.00%"
11
+ }
processed_qwen3_dataset/processed_dataset.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:696e5df891b54ac7ea3f1b1c3a69ae24f8a7e0b8e96600e4105585b8882af2eb
3
+ size 1029880114
processed_qwen3_dataset/samples.json ADDED
@@ -0,0 +1,92 @@
1
+ [
2
+ {
3
+ "sample_id": 0,
4
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nCan you tell me about the latest developments in AI research that could impact the field of healthcare?<|im_end|>\n<|im_start|>assistant\n",
5
+ "response": "Certainly! The intersection of AI and healthcare is a rapidly evolving field with numerous recent developments that hold significant promise. Here are some of the latest advancements:\n\n1. **AI in Diagnostics**:\n - **Imaging Analysis**: AI algorithms, particularly deep learning models, are becoming...",
6
+ "embedding_shape": [
7
+ 47,
8
+ 1024
9
+ ]
10
+ },
11
+ {
12
+ "sample_id": 1,
13
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is the most effective way to learn a new language, especially for someone who is already an adult and has a busy schedule?<|im_end|>\n<|im_start|>assistant\n",
14
+ "response": "Learning a new language as an adult with a busy schedule can be challenging, but it's definitely achievable with the right strategies and consistent effort. Here are some effective methods to help you learn a new language efficiently:\n\n1. **Set Clear Goals**: Define what you want to achieve with the...",
15
+ "embedding_shape": [
16
+ 54,
17
+ 1024
18
+ ]
19
+ },
20
+ {
21
+ "sample_id": 2,
22
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nI need to create a marketing plan for a new line of organic skincare products. What are the key elements I should consider?<|im_end|>\n<|im_start|>assistant\n",
23
+ "response": "Creating a marketing plan for a new line of organic skincare products involves several key elements to ensure the successful launch and sustained growth of your brand. Here’s a comprehensive guide to help you develop an effective marketing plan:\n\n### 1. **Market Research**\n - **Target Audience**: ...",
24
+ "embedding_shape": [
25
+ 53,
26
+ 1024
27
+ ]
28
+ },
29
+ {
30
+ "sample_id": 3,
31
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is the role of a project manager in a software development team?<|im_end|>\n<|im_start|>assistant\n",
32
+ "response": "The role of a project manager in a software development team is multifaceted and crucial for the success of the project. Here are some of the key responsibilities and tasks typically associated with this role:\n\n1. **Project Planning**: The project manager is responsible for defining the scope of the...",
33
+ "embedding_shape": [
34
+ 42,
35
+ 1024
36
+ ]
37
+ },
38
+ {
39
+ "sample_id": 4,
40
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nI need to create a social media campaign for a new line of eco-friendly products. Can you help me with some ideas?<|im_end|>\n<|im_start|>assistant\n",
41
+ "response": "Absolutely, I'd be happy to help you brainstorm some ideas for a social media campaign for your new line of eco-friendly products! Here are a few strategies and ideas to consider:\n\n### 1. **Eco-Challenges**\n- **30-Day Eco-Challenge:** Encourage followers to adopt one eco-friendly habit each day for ...",
42
+ "embedding_shape": [
43
+ 52,
44
+ 1024
45
+ ]
46
+ },
47
+ {
48
+ "sample_id": 5,
49
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nCan you explain the concept of \"dynamic equilibrium\" in chemistry? And could you provide an example to illustrate this concept?<|im_end|>\n<|im_start|>assistant\n",
50
+ "response": "Certainly! The concept of \"dynamic equilibrium\" in chemistry refers to a state in which the concentrations of the reactants and products in a reversible reaction remain constant over time, despite the fact that the forward and reverse reactions are still occurring at the same rate. This means that w...",
51
+ "embedding_shape": [
52
+ 51,
53
+ 1024
54
+ ]
55
+ },
56
+ {
57
+ "sample_id": 6,
58
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is the best way to create a budget for a small business?<|im_end|>\n<|im_start|>assistant\n",
59
+ "response": "Creating a budget for a small business is a crucial step in managing finances effectively and ensuring the long-term success of the business. Here’s a step-by-step guide to help you create a comprehensive and effective budget:\n\n### 1. **Define Your Business Goals**\n - **Short-term and Long-term Go...",
60
+ "embedding_shape": [
61
+ 42,
62
+ 1024
63
+ ]
64
+ },
65
+ {
66
+ "sample_id": 7,
67
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nCan you provide a brief overview of Alibaba Cloud's security measures and how they protect user data?<|im_end|>\n<|im_start|>assistant\n",
68
+ "response": "Certainly! Alibaba Cloud places a strong emphasis on security and has implemented a comprehensive set of measures to protect user data and ensure the integrity, confidentiality, and availability of its services. Here’s a brief overview of some key security measures:\n\n1. **Data Encryption**: Alibaba ...",
69
+ "embedding_shape": [
70
+ 47,
71
+ 1024
72
+ ]
73
+ },
74
+ {
75
+ "sample_id": 8,
76
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nI am interested in learning more about the cultural significance of the Chinese zodiac. Could you tell me about its origin and how it is used in Chinese culture today?<|im_end|>\n<|im_st...",
77
+ "response": "Certainly! The Chinese zodiac, known as \"Sheng Xiao\" (η”Ÿθ‚–) in Chinese, is a classification scheme that assigns an animal and its reputed attributes to each year in a repeating 12-year cycle. The 12 animals of the Chinese zodiac are the Rat, Ox, Tiger, Rabbit, Dragon, Snake, Horse, Goat, Monkey, Roost...",
78
+ "embedding_shape": [
79
+ 60,
80
+ 1024
81
+ ]
82
+ },
83
+ {
84
+ "sample_id": 9,
85
+ "input_text": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow can I use a Virtual Private Cloud (VPC) to enhance the security of my cloud applications?<|im_end|>\n<|im_start|>assistant\n",
86
+ "response": "Using a Virtual Private Cloud (VPC) can significantly enhance the security of your cloud applications by providing a secure and isolated network environment. Here are several ways you can leverage a VPC to improve security:\n\n### 1. **Network Isolation**\n - **Private Network**: A VPC allows you to ...",
87
+ "embedding_shape": [
88
+ 48,
89
+ 1024
90
+ ]
91
+ }
92
+ ]
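The entries above are written by save_final_dataset() in prepare_dataset.py; a quick sketch for checking the grouped sequence lengths recorded in this preview file:

# Sketch: summarize the embedding shapes stored in samples.json.
import json

with open("./processed_qwen3_dataset/samples.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

for s in samples:
    rows, hidden = s["embedding_shape"]
    print(f"sample {s['sample_id']}: {rows} grouped positions x {hidden} dims")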
train_custom_qwen3.py ADDED
@@ -0,0 +1,1292 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import json
5
+ import pickle
6
+ from typing import Optional, Tuple, List, Dict, Any, Union
7
+ from pathlib import Path
8
+ from tqdm import tqdm
9
+ import math
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from torch.utils.data import Dataset, DataLoader
15
+ from torch.optim import AdamW
16
+ from torch.optim.lr_scheduler import CosineAnnealingLR
17
+
18
+ from datasets import load_dataset, Dataset as HFDataset
19
+ from transformers import (
20
+ AutoTokenizer,
21
+ AutoModelForCausalLM,
22
+ get_linear_schedule_with_warmup,
23
+ PreTrainedModel
24
+ )
25
+ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
26
+ from transformers.models.qwen3.modeling_qwen3 import (
27
+ Qwen3Model,
28
+ Qwen3ForCausalLM,
29
+ Qwen3PreTrainedModel,
30
+ Qwen3RMSNorm
31
+ )
32
+
33
+ logging.basicConfig(
34
+ level=logging.INFO,
35
+ format="%(asctime)s - %(levelname)s - %(message)s",
36
+ handlers=[logging.StreamHandler(sys.stdout)],
37
+ force=True,
38
+ )
39
+ logger = logging.getLogger("grouped_qwen3_training")
40
+
41
+
42
+ class GroupedInputMLPAdapter(nn.Module):
43
+ def __init__(self, config):
44
+ super().__init__()
45
+ self.config = config
46
+ hidden_size = config.hidden_size
47
+
48
+ self.grouped_processor = nn.Sequential(
49
+ nn.Linear(hidden_size, hidden_size * 2),
50
+ nn.SiLU(), # Using SiLU activation like Qwen3
51
+ nn.Dropout(0.1),
52
+ nn.Linear(hidden_size * 2, hidden_size),
53
+ nn.Dropout(0.1)
54
+ )
55
+
56
+ norm_eps = getattr(config, 'rms_norm_eps', 1e-6)
57
+ self.layer_norm = Qwen3RMSNorm(hidden_size, eps=norm_eps)
58
+
59
+ def forward(self, grouped_embeds: torch.Tensor) -> torch.Tensor:
60
+ processed = self.grouped_processor(grouped_embeds)
61
+
62
+ output = self.layer_norm(grouped_embeds + processed)
63
+
64
+ return output
65
+
66
+
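# ---------------------------------------------------------------------------
# Editorial sketch (not part of the committed file): a quick shape check of
# GroupedInputMLPAdapter. The SimpleNamespace config is an assumed stand-in
# for Qwen3Config; hidden_size 1024 matches the Qwen3-0.6B grouped embeddings
# used elsewhere in this commit.
import torch
from types import SimpleNamespace

cfg = SimpleNamespace(hidden_size=1024, rms_norm_eps=1e-6)
adapter = GroupedInputMLPAdapter(cfg)
grouped = torch.randn(2, 47, 1024)      # [batch, grouped_seq_len, hidden]
out = adapter(grouped)
assert out.shape == grouped.shape       # residual + RMSNorm keep the shape
# ---------------------------------------------------------------------------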
67
+ class CustomQwen3ForCausalLM(Qwen3ForCausalLM):
68
+ def __init__(self, config):
69
+ super().__init__(config)
70
+
71
+ self.grouped_input_mlp = GroupedInputMLPAdapter(config)
72
+
73
+ self.is_grouped_input_mode = False
74
+ self.grouped_cache_initialized = False
75
+
76
+ self._init_grouped_weights()
77
+
78
+ self._freeze_layers()
79
+
80
+ def _init_grouped_weights(self):
81
+ def _init_weights(module):
82
+ if isinstance(module, nn.Linear):
83
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
84
+ if module.bias is not None:
85
+ torch.nn.init.zeros_(module.bias)
86
+ elif isinstance(module, nn.LayerNorm):
87
+ torch.nn.init.ones_(module.weight)
88
+ torch.nn.init.zeros_(module.bias)
89
+
90
+ self.grouped_input_mlp.apply(_init_weights)
91
+
92
+ def _freeze_layers(self):
93
+ for param in self.model.embed_tokens.parameters():
94
+ param.requires_grad = False
95
+
96
+ for i, layer in enumerate(self.model.layers):
97
+ if i == 0:
98
+ for param in layer.parameters():
99
+ param.requires_grad = True
100
+ else:
101
+ for param in layer.parameters():
102
+ param.requires_grad = False
103
+
104
+ for param in self.model.norm.parameters():
105
+ param.requires_grad = False
106
+
107
+ for param in self.lm_head.parameters():
108
+ param.requires_grad = False
109
+
110
+ for param in self.grouped_input_mlp.parameters():
111
+ param.requires_grad = True
112
+
113
+ trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
114
+ total_params = sum(p.numel() for p in self.parameters())
115
+ logger.info(f"Trainable parameters: {trainable_params:,} / {total_params:,} "
116
+ f"({trainable_params/total_params*100:.2f}%)")
117
+
118
+ def forward(
119
+ self,
120
+ input_ids: Optional[torch.LongTensor] = None,
121
+ attention_mask: Optional[torch.Tensor] = None,
122
+ position_ids: Optional[torch.LongTensor] = None,
123
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
124
+ inputs_embeds: Optional[torch.FloatTensor] = None,
125
+ labels: Optional[torch.LongTensor] = None,
126
+ use_cache: Optional[bool] = None,
127
+ output_attentions: Optional[bool] = None,
128
+ output_hidden_states: Optional[bool] = None,
129
+ return_dict: Optional[bool] = None,
130
+ cache_position: Optional[torch.LongTensor] = None,
131
+ grouped_inputs: Optional[torch.FloatTensor] = None, # New parameter for grouped inputs
132
+ is_prefill: Optional[bool] = None, # Flag to indicate prefill phase
133
+ **kwargs
134
+ ):
135
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
136
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
137
+
138
+ if grouped_inputs is not None and is_prefill:
139
+ self.is_grouped_input_mode = True
140
+
141
+ processed_grouped_inputs = self.grouped_input_mlp(grouped_inputs)
142
+
143
+ inputs_embeds = processed_grouped_inputs
144
+ input_ids = None # Don't use input_ids when we have grouped inputs
145
+
146
+ batch_size, seq_len = inputs_embeds.shape[:2]
147
+ if position_ids is None:
148
+ device = inputs_embeds.device
149
+ position_ids = torch.arange(seq_len, device=device, dtype=torch.long)
150
+ position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
151
+
152
+ if attention_mask is None:
153
+ attention_mask = torch.ones((batch_size, seq_len), device=inputs_embeds.device, dtype=torch.long)
154
+
155
+ self.grouped_cache_initialized = True
156
+
157
+ elif not is_prefill and self.is_grouped_input_mode:
158
+ pass
159
+ else:
160
+ self.is_grouped_input_mode = False
161
+
162
+ # Call parent forward
163
+ outputs = super().forward(
164
+ input_ids=input_ids,
165
+ attention_mask=attention_mask,
166
+ position_ids=position_ids,
167
+ past_key_values=past_key_values,
168
+ inputs_embeds=inputs_embeds,
169
+ labels=labels,
170
+ use_cache=use_cache,
171
+ output_attentions=output_attentions,
172
+ output_hidden_states=output_hidden_states,
173
+ return_dict=return_dict,
174
+ cache_position=cache_position,
175
+ **kwargs
176
+ )
177
+
178
+ return outputs
179
+
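# ---------------------------------------------------------------------------
# Editorial sketch (not part of the committed file): the two-phase call
# pattern this forward() enables, mirroring training_step/validation_step
# further below. The tensors are illustrative placeholders, not real grouped
# embeddings from prepare_dataset.py.
import torch

model = CustomQwen3ForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B", torch_dtype=torch.float32, attn_implementation="eager"
)
grouped = torch.randn(1, 47, model.config.hidden_size)  # stand-in grouped prefix
mask = torch.ones(1, 47, dtype=torch.long)
prefill = model(grouped_inputs=grouped, attention_mask=mask,
                is_prefill=True, use_cache=True, return_dict=True)
dec_ids = torch.tensor([[1, 2, 3]])  # arbitrary token ids, for illustration only
out = model(input_ids=dec_ids, attention_mask=torch.ones_like(dec_ids),
            past_key_values=prefill.past_key_values, use_cache=False, return_dict=True)
print(out.logits.shape)  # e.g. torch.Size([1, 3, vocab_size])
# ---------------------------------------------------------------------------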
180
+ class GroupedDataset(Dataset):
181
+ def __init__(self, dataset_path: str, tokenizer, max_response_length: int = 512,
182
+ validation_split: float = 0.1, is_validation: bool = False,
183
+ chunk_size: int = 1000, max_samples: Optional[int] = None):
184
+ self.dataset_path = dataset_path
185
+ self.tokenizer = tokenizer
186
+ self.max_response_length = max_response_length
187
+ self.validation_split = validation_split
188
+ self.is_validation = is_validation
189
+ self.chunk_size = chunk_size
190
+ self.max_samples = max_samples
191
+
192
+ self._chunk_cache = {}
193
+ self._cache_size_limit = 3 # Keep max 3 chunks in memory
194
+
195
+ self._build_index()
196
+
197
+ def _build_index(self):
198
+ logger.info(f"Building index for {self.dataset_path}")
199
+
200
+ with open(self.dataset_path, 'rb') as f:
201
+ data = pickle.load(f)
202
+
203
+ valid_indices = []
204
+ for i, item in enumerate(data):
205
+ if not item.get("error", False):
206
+ valid_indices.append(i)
207
+
208
+ if self.max_samples and len(valid_indices) >= self.max_samples:
209
+ break
210
+
211
+ total_valid = len(valid_indices)
212
+
213
+ val_size = min(1000, int(self.validation_split * total_valid))
214
+ train_size = total_valid - val_size
215
+
216
+ if self.is_validation:
217
+ self.valid_indices = valid_indices[train_size:train_size + val_size]
218
+ self.total_samples = val_size
219
+ else:
220
+ self.valid_indices = valid_indices[:train_size]
221
+ self.total_samples = train_size
222
+
223
+ self._full_data = data
224
+
225
+ logger.info(f"{'Validation' if self.is_validation else 'Training'} dataset: {self.total_samples} samples")
226
+
227
+ def _get_chunk_id(self, idx):
228
+ return idx // self.chunk_size
229
+
230
+ def _load_chunk(self, chunk_id):
231
+ if chunk_id in self._chunk_cache:
232
+ return self._chunk_cache[chunk_id]
233
+
234
+ start_idx = chunk_id * self.chunk_size
235
+ end_idx = min(start_idx + self.chunk_size, self.total_samples)
236
+
237
+ chunk_data = {}
238
+ for i in range(start_idx, end_idx):
239
+ actual_idx = self.valid_indices[i]
240
+ chunk_data[i] = self._full_data[actual_idx]
241
+
242
+ if len(self._chunk_cache) >= self._cache_size_limit:
243
+ oldest_chunk = min(self._chunk_cache.keys())
244
+ del self._chunk_cache[oldest_chunk]
245
+
246
+ self._chunk_cache[chunk_id] = chunk_data
247
+ return chunk_data
248
+
249
+ def __len__(self):
250
+ return self.total_samples
251
+
252
+ def __getitem__(self, idx):
253
+ if idx >= self.total_samples:
254
+ raise IndexError(f"Index {idx} out of range for dataset of size {self.total_samples}")
255
+
256
+ chunk_id = self._get_chunk_id(idx)
257
+ chunk_data = self._load_chunk(chunk_id)
258
+ item = chunk_data[idx]
259
+
260
+ return self._process_item(item)
261
+
262
+ def _process_item(self, item):
263
+ grouped_embeds = item["inputs_embeds"]
264
+ if isinstance(grouped_embeds, torch.Tensor):
265
+ grouped_embeds = grouped_embeds.clone()
266
+ else:
267
+ grouped_embeds = torch.tensor(grouped_embeds)
268
+
269
+ if grouped_embeds.dtype != torch.float32:
270
+ grouped_embeds = grouped_embeds.float()
271
+
272
+ response = item["response"]
273
+
274
+ response_tokens = self.tokenizer(
275
+ response,
276
+ max_length=self.max_response_length,
277
+ truncation=True,
278
+ padding=False,
279
+ return_tensors="pt"
280
+ )
281
+
282
+ response_input_ids = response_tokens["input_ids"].squeeze(0)
283
+
284
+ return {
285
+ "grouped_inputs": grouped_embeds,
286
+ "response_input_ids": response_input_ids,
287
+ "response_text": response,
288
+ "input_text": item["input_text"],
289
+ }
290
+
291
+ def cleanup(self):
292
+ self._chunk_cache.clear()
293
+ if hasattr(self, '_full_data'):
294
+ del self._full_data
295
+
296
+
297
+ def collate_fn(batch, tokenizer, pad_token_id=None):
298
+ if pad_token_id is None:
299
+ pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
300
+
301
+ grouped_inputs = [item["grouped_inputs"] for item in batch]
302
+ response_input_ids = [item["response_input_ids"] for item in batch]
303
+
304
+ max_grouped_len = max(gi.shape[0] for gi in grouped_inputs)
305
+ batch_size = len(grouped_inputs)
306
+ hidden_size = grouped_inputs[0].shape[-1]
307
+
308
+ padded_grouped_inputs = torch.zeros(batch_size, max_grouped_len, hidden_size)
309
+ grouped_attention_mask = torch.zeros(batch_size, max_grouped_len, dtype=torch.long)
310
+
311
+ for i, gi in enumerate(grouped_inputs):
312
+ seq_len = gi.shape[0]
313
+ padded_grouped_inputs[i, :seq_len] = gi
314
+ grouped_attention_mask[i, :seq_len] = 1
315
+
316
+ max_response_len = max(len(rid) for rid in response_input_ids)
317
+ padded_response_ids = torch.full((batch_size, max_response_len), pad_token_id, dtype=torch.long)
318
+
319
+ for i, rid in enumerate(response_input_ids):
320
+ padded_response_ids[i, :len(rid)] = rid
321
+
322
+ return {
323
+ "grouped_inputs": padded_grouped_inputs,
324
+ "grouped_attention_mask": grouped_attention_mask,
325
+ "response_input_ids": padded_response_ids,
326
+ "response_texts": [item["response_text"] for item in batch],
327
+ "input_texts": [item["input_text"] for item in batch],
328
+ }
329
+
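# ---------------------------------------------------------------------------
# Editorial sketch (not part of the committed file): collate_fn pads
# variable-length grouped embeddings and response ids. Passing pad_token_id
# explicitly means no real tokenizer is needed for this check.
import torch

fake_batch = [
    {"grouped_inputs": torch.randn(5, 8), "response_input_ids": torch.tensor([1, 2, 3]),
     "response_text": "a", "input_text": "x"},
    {"grouped_inputs": torch.randn(3, 8), "response_input_ids": torch.tensor([4, 5]),
     "response_text": "b", "input_text": "y"},
]
out = collate_fn(fake_batch, tokenizer=None, pad_token_id=0)
print(out["grouped_inputs"].shape)    # torch.Size([2, 5, 8])
print(out["grouped_attention_mask"])  # 1s over real positions, 0s over padding
print(out["response_input_ids"])      # second row padded with 0
# ---------------------------------------------------------------------------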
330
+ class TrainingState:
331
+ def __init__(self, output_dir: Path):
332
+ self.output_dir = output_dir
333
+ self.state_file = output_dir / "training_state.json"
334
+
335
+ def save_state(self, epoch: int, global_step: int, best_val_loss: float,
336
+ optimizer_state: Dict, scheduler_state: Dict):
337
+ """Save training state."""
338
+ state = {
339
+ "epoch": epoch,
340
+ "global_step": global_step,
341
+ "best_val_loss": best_val_loss,
342
+ "optimizer_state": optimizer_state,
343
+ "scheduler_state": scheduler_state,
344
+ "completed_epochs": epoch
345
+ }
346
+
347
+ with open(self.state_file, 'w') as f:
348
+ json.dump(state, f, indent=2, default=str) # default=str for handling tensor types
349
+
350
+ logger.info(f"Saved training state at epoch {epoch}, step {global_step}")
351
+
352
+ def load_state(self):
353
+ if not self.state_file.exists():
354
+ return None
355
+
356
+ try:
357
+ with open(self.state_file, 'r') as f:
358
+ state = json.load(f)
359
+ logger.info(f"Loaded training state from epoch {state['epoch']}, step {state['global_step']}")
360
+ return state
361
+ except Exception as e:
362
+ logger.warning(f"Failed to load training state: {e}")
363
+ return None
364
+
365
+ def get_latest_checkpoint(self):
366
+ state = self.load_state()
367
+ if state is None:
368
+ return None
369
+
370
+ epoch = state["completed_epochs"]
371
+ checkpoint_path = self.output_dir / f"epoch_{epoch}"
372
+
373
+ if checkpoint_path.exists():
374
+ return checkpoint_path, state
375
+ else:
376
+ logger.warning(f"Checkpoint for epoch {epoch} not found")
377
+ return None
378
+
379
+ class GroupedTrainer:
380
+ def __init__(
381
+ self,
382
+ model_name: str = "Qwen/Qwen3-0.6B",
383
+ dataset_path: str = "./processed_qwen3_dataset/processed_dataset.pkl",
384
+ output_dir: str = "./grouped_qwen3_checkpoint",
385
+ batch_size: int = 4,
386
+ learning_rate: float = 5e-4,
387
+ num_epochs: int = 3,
388
+ warmup_steps: int = 100,
389
+ max_grad_norm: float = 1.0,
390
+ save_steps: int = 500,
391
+ eval_steps: int = 500,
392
+ logging_steps: int = 50,
393
+ resume_training: bool = True,
394
+ debug: bool = False,
395
+ chunk_size: int = 1000, # Chunk size for streaming
396
+ max_samples: Optional[int] = None, # Limit dataset size for testing
397
+ ):
398
+ self.model_name = model_name
399
+ self.dataset_path = dataset_path
400
+ self.output_dir = Path(output_dir)
401
+ self.batch_size = batch_size
402
+ self.learning_rate = learning_rate
403
+ self.num_epochs = num_epochs
404
+ self.warmup_steps = warmup_steps
405
+ self.max_grad_norm = max_grad_norm
406
+ self.save_steps = save_steps
407
+ self.eval_steps = eval_steps
408
+ self.logging_steps = logging_steps
409
+ self.resume_training = resume_training
410
+ self.debug = debug
411
+ self.chunk_size = chunk_size
412
+ self.max_samples = max_samples
413
+
414
+ if self.debug:
415
+ logger.setLevel(logging.DEBUG)
416
+
417
+ self.output_dir.mkdir(parents=True, exist_ok=True)
418
+
419
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
420
+ self.tokenizer = None
421
+ self.model = None
422
+ self.train_dataset = None
423
+ self.val_dataset = None
424
+
425
+ self.training_state = TrainingState(self.output_dir)
426
+
427
+ def load_model_and_tokenizer(self):
428
+ logger.info(f"Loading tokenizer and model: {self.model_name}")
429
+
430
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
431
+
432
+ if self.tokenizer.pad_token is None:
433
+ self.tokenizer.pad_token = self.tokenizer.eos_token
434
+
435
+ config = Qwen3Config.from_pretrained(self.model_name)
436
+
437
+ self.model = CustomQwen3ForCausalLM.from_pretrained(
438
+ self.model_name,
439
+ config=config,
440
+ torch_dtype=torch.float32, # Use float32 for training
441
+ attn_implementation="eager"
442
+ ).to(self.device)
443
+
444
+ logger.info(f"Model loaded on {self.device}")
445
+
446
+ def load_dataset(self, chunk_size: int = 1000, max_samples: Optional[int] = None):
447
+ logger.info(f"Loading streaming dataset from {self.dataset_path}")
448
+
449
+ # Create streaming datasets
450
+ self.train_dataset = GroupedDataset(
451
+ dataset_path=self.dataset_path,
452
+ tokenizer=self.tokenizer,
453
+ is_validation=False,
454
+ chunk_size=chunk_size,
455
+ max_samples=max_samples
456
+ )
457
+
458
+ self.val_dataset = GroupedDataset(
459
+ dataset_path=self.dataset_path,
460
+ tokenizer=self.tokenizer,
461
+ is_validation=True,
462
+ chunk_size=chunk_size,
463
+ max_samples=max_samples
464
+ )
465
+
466
+ logger.info(f"Train samples: {len(self.train_dataset)}")
467
+ logger.info(f"Val samples: {len(self.val_dataset)}")
468
+
469
+ # Log memory usage
470
+ if torch.cuda.is_available():
471
+ torch.cuda.empty_cache()
472
+ memory_used = torch.cuda.memory_allocated() / 1024**3
473
+ logger.info(f"GPU memory after dataset loading: {memory_used:.2f} GB")
474
+
475
+ def cleanup_datasets(self):
476
+ if hasattr(self.train_dataset, 'cleanup'):
477
+ self.train_dataset.cleanup()
478
+ if hasattr(self.val_dataset, 'cleanup'):
479
+ self.val_dataset.cleanup()
480
+
481
+ import gc
482
+ gc.collect()
483
+
484
+ if torch.cuda.is_available():
485
+ torch.cuda.empty_cache()
486
+
487
+ def load_checkpoint(self, checkpoint_path: Path):
488
+ logger.info(f"Loading checkpoint from {checkpoint_path}")
489
+
490
+ model_path = checkpoint_path / "pytorch_model.bin"
491
+ if not model_path.exists():
492
+ model_path = checkpoint_path / "model.safetensors"
493
+
494
+ if model_path.exists():
495
+ state_dict = torch.load(model_path, map_location=self.device)
496
+ missing_keys, unexpected_keys = self.model.load_state_dict(state_dict, strict=False)
497
+
498
+ if missing_keys:
499
+ logger.warning(f"Missing keys when loading checkpoint: {missing_keys}")
500
+ if unexpected_keys:
501
+ logger.warning(f"Unexpected keys when loading checkpoint: {unexpected_keys}")
502
+
503
+ logger.info("Model checkpoint loaded successfully")
504
+ else:
505
+ raise FileNotFoundError(f"Model checkpoint not found at {checkpoint_path}")
506
+
507
+ def compute_loss(self, batch, outputs):
508
+ logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
509
+
510
+ target_ids = batch["response_input_ids"].to(self.device) # [batch_size, target_len]
511
+
512
+ logger.debug(f"Logits shape: {logits.shape}, Target shape: {target_ids.shape}")
513
+
514
+ batch_size = target_ids.shape[0]
515
+
516
+ if target_ids.shape[1] > 1:
517
+ labels = target_ids.clone()
518
+ pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
519
+ labels[labels == pad_token_id] = -100
520
+
521
+ seq_len = min(logits.shape[1], labels.shape[1])
522
+ logits_truncated = logits[:, :seq_len, :] # [batch_size, seq_len, vocab_size]
523
+ labels_truncated = labels[:, :seq_len] # [batch_size, seq_len]
524
+
525
+ logger.debug(f"After truncation - Logits: {logits_truncated.shape}, Labels: {labels_truncated.shape}")
526
+
527
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
528
+ loss = loss_fct(
529
+ logits_truncated.reshape(-1, logits_truncated.size(-1)),
530
+ labels_truncated.reshape(-1)
531
+ )
532
+ else:
533
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
534
+ loss = loss_fct(logits.view(-1, logits.size(-1)), target_ids.view(-1))
535
+
536
+ return loss
537
+
538
+ def training_step(self, batch, step):
539
+ self.model.train()
540
+
541
+ if step < 5 and torch.cuda.is_available():
542
+ torch.cuda.empty_cache()
543
+ memory_before = torch.cuda.memory_allocated() / 1024**3
544
+
545
+ grouped_inputs = batch["grouped_inputs"].to(self.device)
546
+ grouped_attention_mask = batch["grouped_attention_mask"].to(self.device)
547
+ response_input_ids = batch["response_input_ids"].to(self.device)
548
+
549
+ batch_size = grouped_inputs.shape[0]
550
+ grouped_seq_len = grouped_inputs.shape[1]
551
+ response_seq_len = response_input_ids.shape[1]
552
+
553
+ if self.debug:
554
+ logger.debug(f"Batch sizes - grouped: {grouped_inputs.shape}, response: {response_input_ids.shape}")
555
+
556
+ grouped_outputs = self.model(
557
+ grouped_inputs=grouped_inputs,
558
+ attention_mask=grouped_attention_mask,
559
+ is_prefill=True,
560
+ use_cache=True,
561
+ return_dict=True
562
+ )
563
+
564
+ if response_seq_len > 1:
565
+ response_attention_mask = (response_input_ids != self.tokenizer.pad_token_id).long()
566
+
567
+ response_outputs = self.model(
568
+ input_ids=response_input_ids[:, :-1], # All but last token as input
569
+ attention_mask=response_attention_mask[:, :-1],
570
+ past_key_values=grouped_outputs.past_key_values,
571
+ use_cache=False,
572
+ return_dict=True
573
+ )
574
+
575
+ logits = response_outputs.logits
576
+ labels = response_input_ids[:, 1:] # All but first token as targets
577
+
578
+ labels = labels.clone()
579
+ pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
580
+ labels[labels == pad_token_id] = -100
581
+
582
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
583
+ loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
584
+
585
+ else:
586
+ loss = torch.tensor(0.0, requires_grad=True, device=self.device)
587
+
588
+ if step < 5 and torch.cuda.is_available():
589
+ memory_after = torch.cuda.memory_allocated() / 1024**3
590
+ memory_peak = torch.cuda.max_memory_allocated() / 1024**3
591
+ logger.info(f"Step {step} Memory: {memory_before:.2f}GB β†’ {memory_after:.2f}GB (Peak: {memory_peak:.2f}GB)")
592
+
593
+ if memory_peak > 20.0: # 20GB threshold for L4
594
+ logger.warning("High memory usage detected! Consider reducing batch_size")
595
+
596
+ class MockOutputs:
597
+ def __init__(self, loss, logits):
598
+ self.loss = loss
599
+ self.logits = logits
600
+
601
+ outputs = MockOutputs(loss, response_outputs.logits if 'response_outputs' in locals() else grouped_outputs.logits)
602
+
603
+ return loss, outputs
604
+
605
+ def validation_step(self, batch):
606
+ """Single validation step."""
607
+ self.model.eval()
608
+
609
+ with torch.no_grad():
610
+ grouped_inputs = batch["grouped_inputs"].to(self.device)
611
+ grouped_attention_mask = batch["grouped_attention_mask"].to(self.device)
612
+ response_input_ids = batch["response_input_ids"].to(self.device)
613
+
614
+ grouped_outputs = self.model(
615
+ grouped_inputs=grouped_inputs,
616
+ attention_mask=grouped_attention_mask,
617
+ is_prefill=True,
618
+ use_cache=True,
619
+ return_dict=True
620
+ )
621
+
622
+ if response_input_ids.shape[1] > 1:
623
+ pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+ response_attention_mask = (response_input_ids != pad_token_id).long()
624
+
625
+ response_outputs = self.model(
626
+ input_ids=response_input_ids[:, :-1],
627
+ attention_mask=response_attention_mask[:, :-1],
628
+ past_key_values=grouped_outputs.past_key_values,
629
+ use_cache=False,
630
+ return_dict=True
631
+ )
632
+
633
+ logits = response_outputs.logits
634
+ labels = response_input_ids[:, 1:]
635
+
636
+ labels = labels.clone()
637
+ pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
638
+ labels[labels == pad_token_id] = -100
639
+
640
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
641
+ loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
642
+ else:
643
+ loss = torch.tensor(0.0, device=self.device)
644
+
645
+ return loss.item()
646
+
647
+ def save_epoch_checkpoint(self, epoch: int, global_step: int, is_best: bool = False):
648
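+ """Save model weights, config, tokenizer, and epoch metadata under output_dir/epoch_<n>[_best]."""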
+ checkpoint_name = f"epoch_{epoch}"
649
+ if is_best:
650
+ checkpoint_name += "_best"
651
+
652
+ checkpoint_dir = self.output_dir / checkpoint_name
653
+ checkpoint_dir.mkdir(exist_ok=True)
654
+
655
+ torch.save(self.model.state_dict(), checkpoint_dir / "pytorch_model.bin")
656
+
657
+ self.model.config.save_pretrained(checkpoint_dir)
658
+
659
+ self.tokenizer.save_pretrained(checkpoint_dir)
660
+
661
+ metadata = {
662
+ "epoch": epoch,
663
+ "global_step": global_step,
664
+ "model_name": self.model_name,
665
+ "learning_rate": self.learning_rate,
666
+ "batch_size": self.batch_size,
667
+ "is_best": is_best,
668
+ "model_class": "CustomQwen3ForCausalLM"
669
+ }
670
+
671
+ with open(checkpoint_dir / "epoch_metadata.json", 'w') as f:
672
+ json.dump(metadata, f, indent=2)
673
+
674
+ logger.info(f"Saved epoch checkpoint: {checkpoint_dir}")
675
+ return checkpoint_dir
676
+
677
+ def train(self):
678
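+ """Main training loop: build data loaders, optimizer, and scheduler, optionally resume from a saved state, then train for num_epochs with periodic logging, evaluation, and checkpointing."""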
+ logger.info("Starting training...")
679
+
680
+ train_loader = DataLoader(
681
+ self.train_dataset,
682
+ batch_size=self.batch_size,
683
+ shuffle=True,
684
+ collate_fn=lambda batch: collate_fn(batch, self.tokenizer),
685
+ num_workers=0 # Avoid multiprocessing issues with custom collate_fn
686
+ )
687
+
688
+ val_loader = DataLoader(
689
+ self.val_dataset,
690
+ batch_size=self.batch_size,
691
+ shuffle=False,
692
+ collate_fn=lambda batch: collate_fn(batch, self.tokenizer),
693
+ num_workers=0
694
+ )
695
+
696
+ optimizer = AdamW(
697
+ [p for p in self.model.parameters() if p.requires_grad],
698
+ lr=self.learning_rate,
699
+ weight_decay=0.01
700
+ )
701
+
702
+ total_steps = len(train_loader) * self.num_epochs
703
+ scheduler = get_linear_schedule_with_warmup(
704
+ optimizer,
705
+ num_warmup_steps=self.warmup_steps,
706
+ num_training_steps=total_steps
707
+ )
708
+
709
+ start_epoch = 0
710
+ global_step = 0
711
+ best_val_loss = float('inf')
712
+
713
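+ # Optionally resume from the most recent saved training state
+ # (model weights, optimizer and scheduler states, step counters).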
+ if self.resume_training:
714
+ checkpoint_info = self.training_state.get_latest_checkpoint()
715
+ if checkpoint_info is not None:
716
+ checkpoint_path, state = checkpoint_info
717
+
718
+ self.load_checkpoint(checkpoint_path)
719
+
720
+ start_epoch = state["completed_epochs"]
721
+ global_step = state["global_step"]
722
+ best_val_loss = state["best_val_loss"]
723
+
724
+ if "optimizer_state" in state and state["optimizer_state"]:
725
+ try:
726
+ optimizer.load_state_dict(state["optimizer_state"])
727
+ except Exception as e:
728
+ logger.warning(f"Failed to load optimizer state: {e}")
729
+
730
+ if "scheduler_state" in state and state["scheduler_state"]:
731
+ try:
732
+ scheduler.load_state_dict(state["scheduler_state"])
733
+ except Exception as e:
734
+ logger.warning(f"Failed to load scheduler state: {e}")
735
+
736
+ logger.info(f"Resumed training from epoch {start_epoch + 1}")
737
+
738
+ for epoch in range(start_epoch, self.num_epochs):
739
+ logger.info(f"Epoch {epoch + 1}/{self.num_epochs}")
740
+
741
+ epoch_train_loss = 0
742
+ train_steps = 0
743
+
744
+ progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
745
+
746
+ for batch_idx, batch in enumerate(progress_bar):
747
+ try:
748
+ loss, outputs = self.training_step(batch, global_step)
749
+
750
+ loss.backward()
751
+
752
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
753
+
754
+ optimizer.step()
755
+ scheduler.step()
756
+ optimizer.zero_grad()
757
+
758
+ epoch_train_loss += loss.item()
759
+ train_steps += 1
760
+ global_step += 1
761
+
762
+ progress_bar.set_postfix({
763
+ 'loss': f'{loss.item():.4f}',
764
+ 'lr': f'{scheduler.get_last_lr()[0]:.2e}'
765
+ })
766
+
767
+ if global_step % self.logging_steps == 0:
768
+ avg_loss = epoch_train_loss / train_steps
769
+ logger.info(f"Step {global_step}: train_loss={avg_loss:.4f}, lr={scheduler.get_last_lr()[0]:.2e}")
770
+
771
+ if global_step % self.eval_steps == 0:
772
+ val_loss = self.validate(val_loader)
773
+ logger.info(f"Step {global_step}: val_loss={val_loss:.4f}")
774
+
775
+ if val_loss < best_val_loss:
776
+ best_val_loss = val_loss
777
+ best_checkpoint = self.save_epoch_checkpoint(epoch, global_step, is_best=True)
778
+ logger.info(f"New best validation loss: {val_loss:.4f}")
779
+
780
+ except Exception as e:
781
+ logger.error(f"Error in training step {global_step}: {e}")
782
+ continue
783
+
784
+ val_loss = self.validate(val_loader)
785
+ avg_train_loss = epoch_train_loss / train_steps if train_steps > 0 else 0
786
+
787
+ logger.info(f"Epoch {epoch + 1} completed:")
788
+ logger.info(f" Average train loss: {avg_train_loss:.4f}")
789
+ logger.info(f" Validation loss: {val_loss:.4f}")
790
+
791
+ is_best = val_loss < best_val_loss
792
+ if is_best:
793
+ best_val_loss = val_loss
794
+
795
+ checkpoint_dir = self.save_epoch_checkpoint(epoch, global_step, is_best=is_best)
796
+
797
+ self.training_state.save_state(
798
+ epoch=epoch,
799
+ global_step=global_step,
800
+ best_val_loss=best_val_loss,
801
+ optimizer_state=optimizer.state_dict(),
802
+ scheduler_state=scheduler.state_dict()
803
+ )
804
+
805
+ logger.info(f"Epoch {epoch + 1} checkpoint and state saved")
806
+
807
+ logger.info(f"Training completed! Best validation loss: {best_val_loss:.4f}")
808
+
809
+ final_checkpoint = self.save_epoch_checkpoint(self.num_epochs - 1, global_step, is_best=False)
810
+ logger.info(f"Final checkpoint saved: {final_checkpoint}")
811
+
812
+ def validate(self, val_loader):
813
+ self.model.eval()
814
+ total_loss = 0
815
+ num_batches = 0
816
+
817
+ with torch.no_grad():
818
+ for batch in tqdm(val_loader, desc="Validation"):
819
+ try:
820
+ loss = self.validation_step(batch)
821
+ total_loss += loss
822
+ num_batches += 1
823
+ except Exception as e:
824
+ logger.warning(f"Error in validation step: {e}")
825
+ continue
826
+
827
+ avg_loss = total_loss / num_batches if num_batches > 0 else float('inf')
828
+ self.model.train() # Set back to training mode
829
+ return avg_loss
830
+
831
+ def run(self):
832
+ try:
833
+ self.load_model_and_tokenizer()
834
+
835
+ self.load_dataset(
836
+ chunk_size=self.chunk_size,
837
+ max_samples=self.max_samples
838
+ )
839
+
840
+ self.train()
841
+
842
+ logger.info("Training pipeline completed successfully!")
843
+
844
+ self.cleanup_datasets()
845
+
846
+ except Exception as e:
847
+ logger.error(f"Training pipeline failed: {e}")
848
+ import traceback
849
+ logger.error(traceback.format_exc())
850
+
851
+ try:
852
+ self.cleanup_datasets()
853
+ except Exception:
854
+ pass
855
+
856
+ raise
857
+
858
+ def load_trained_model(checkpoint_path: str, model_name: str = "Qwen/Qwen3-0.6B"):
859
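+ """Rebuild CustomQwen3ForCausalLM from a checkpoint directory (config, tokenizer, weights); model_name is accepted for compatibility but not used here."""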
+ logger.info(f"Loading trained model from {checkpoint_path}")
860
+
861
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
862
+
863
+ config = Qwen3Config.from_pretrained(checkpoint_path)
864
+
865
+ model = CustomQwen3ForCausalLM(config)
866
+
867
+ model_path = Path(checkpoint_path) / "pytorch_model.bin"
868
+ if not model_path.exists():
869
+ model_path = Path(checkpoint_path) / "model.safetensors"
870
+
871
+ if not model_path.exists():
872
+ raise FileNotFoundError(f"No model weights found in {checkpoint_path}")
873
+
874
+ if model_path.suffix == ".safetensors":
+     # Safetensors checkpoints cannot be read with torch.load
+     from safetensors.torch import load_file
+     state_dict = load_file(str(model_path))
+ else:
+     state_dict = torch.load(model_path, map_location="cpu")
875
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
876
+
877
+ if missing_keys:
878
+ logger.warning(f"Missing keys when loading model: {missing_keys}")
879
+ if unexpected_keys:
880
+ logger.warning(f"Unexpected keys when loading model: {unexpected_keys}")
881
+
882
+ model = model.eval().to(torch.float32)
883
+
884
+ return model, tokenizer
885
+
886
+
887
+ def generate_with_grouped_input(
888
+ model,
889
+ tokenizer,
890
+ grouped_input: torch.Tensor,
891
+ max_length: int = 512,
892
+ temperature: float = 0.7,
893
+ do_sample: bool = True
894
+ ):
895
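+ """Autoregressive generation seeded by a prefill pass over precomputed grouped embeddings; supports temperature sampling or greedy decoding."""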
+ device = model.device
896
+ model_dtype = next(model.parameters()).dtype
897
+
898
+ grouped_input = grouped_input.to(device=device, dtype=model_dtype)
899
+
900
+ if grouped_input.ndim == 2:
901
+ grouped_input = grouped_input.unsqueeze(0) # Add batch dimension
902
+
903
+ logger.debug(f"Grouped input shape: {grouped_input.shape}, dtype: {grouped_input.dtype}")
904
+ logger.debug(f"Model dtype: {model_dtype}, device: {device}")
905
+
906
+ with torch.no_grad():
907
+ try:
908
+ outputs = model(
909
+ grouped_inputs=grouped_input,
910
+ is_prefill=True,
911
+ use_cache=True,
912
+ return_dict=True
913
+ )
914
+ except Exception as e:
915
+ logger.error(f"Error in prefill phase: {e}")
916
+ raise
917
+
918
+ if hasattr(outputs, 'logits') and outputs.logits is not None:
919
+ next_token_logits = outputs.logits[:, -1, :]
920
+ elif hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
921
+ last_hidden_state = outputs.hidden_states[-1] if isinstance(outputs.hidden_states, (list, tuple)) else outputs.hidden_states
922
+ next_token_logits = model.lm_head(last_hidden_state[:, -1, :])
923
+ else:
924
+ raise RuntimeError("Could not extract logits from model output")
925
+
926
+ if do_sample:
927
+ next_token_logits = next_token_logits / temperature
928
+ probs = F.softmax(next_token_logits, dim=-1)
929
+ next_token = torch.multinomial(probs, num_samples=1)
930
+ else:
931
+ next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
932
+
933
+ generated_ids = next_token
934
+ past_key_values = getattr(outputs, 'past_key_values', None)
935
+
936
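+ # Autoregressive decoding: feed the last sampled token back in with the cached
+ # keys/values until max_length is reached or an EOS token is produced.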
+ for step in range(max_length - 1):
937
+ with torch.no_grad():
938
+ try:
939
+ outputs = model(
940
+ input_ids=next_token,
941
+ past_key_values=past_key_values,
942
+ use_cache=True,
943
+ return_dict=True
944
+ )
945
+ except Exception as e:
946
+ logger.error(f"Error in generation step {step}: {e}")
947
+ break
948
+
949
+ if hasattr(outputs, 'logits'):
950
+ next_token_logits = outputs.logits[:, -1, :]
951
+ else:
952
+ logger.warning("No logits in generation output, stopping generation")
953
+ break
954
+
955
+ if do_sample:
956
+ next_token_logits = next_token_logits / temperature
957
+ probs = F.softmax(next_token_logits, dim=-1)
958
+ next_token = torch.multinomial(probs, num_samples=1)
959
+ else:
960
+ next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
961
+
962
+ generated_ids = torch.cat([generated_ids, next_token], dim=1)
963
+ past_key_values = getattr(outputs, 'past_key_values', None)
964
+
965
+ if next_token.item() == tokenizer.eos_token_id:
966
+ break
967
+
968
+ generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
969
+ return generated_text
970
+
971
+ def main():
972
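+ """Entry point: run the full training pipeline with the streaming configuration below."""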
+ config = {
973
+ "model_name": "Qwen/Qwen3-0.6B",
974
+ "dataset_path": "./processed_qwen3_dataset/processed_dataset.pkl",
975
+ "output_dir": "./grouped_qwen3_checkpoint",
976
+ "batch_size": 12, # Optimized for L4 24GB VRAM
977
+ "learning_rate": 5e-4,
978
+ "num_epochs": 3,
979
+ "warmup_steps": 500, # Increased for larger batch
980
+ "max_grad_norm": 1.0,
981
+ "save_steps": 1000, # Less frequent saves due to larger batches
982
+ "eval_steps": 1000, # Less frequent evaluation
983
+ "logging_steps": 100,
984
+ "resume_training": True,
985
+ "debug": False, # Disable debug for performance
986
+ # Streaming parameters
987
+ "chunk_size": 2000, # Load 2000 samples per chunk
988
+ "max_samples": None, # Use full dataset (set to smaller number for testing)
989
+ }
990
+
991
+ logger.info("="*60)
992
+ logger.info("GROUPED QWEN3 TRAINING CONFIGURATION (STREAMING)")
993
+ logger.info("="*60)
994
+ for key, value in config.items():
995
+ logger.info(f"{key}: {value}")
996
+ logger.info("="*60)
997
+
998
+ if torch.cuda.is_available():
999
+ logger.info(f"GPU: {torch.cuda.get_device_name()}")
1000
+ logger.info(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
1001
+
1002
+ import psutil
1003
+ ram_usage = psutil.virtual_memory()
1004
+ logger.info(f"System RAM: {ram_usage.used / 1024**3:.1f} GB / {ram_usage.total / 1024**3:.1f} GB ({ram_usage.percent:.1f}%)")
1005
+
1006
+ trainer = GroupedTrainer(**config)
1007
+ trainer.run()
1008
+
1009
+
1010
+ def inference_by_id(sample_id: int, checkpoint_path: str = "./grouped_qwen3_checkpoint/epoch_2_best",
1011
+ dataset_path: str = "./processed_qwen3_dataset/processed_dataset.pkl",
1012
+ max_length: int = 512, temperature: float = 0.7, do_sample: bool = True):
1013
+ """Run inference on a specific sample ID from the dataset."""
1014
+ logger.info(f"Running inference on sample ID: {sample_id}")
1015
+
1016
+ try:
1017
+ model, tokenizer = load_trained_model(checkpoint_path)
1018
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1019
+ model = model.to(device)
1020
+ logger.info(f"Model loaded from {checkpoint_path}")
1021
+ except Exception as e:
1022
+ logger.error(f"Failed to load model: {e}")
1023
+ return None
1024
+
1025
+ try:
1026
+ logger.info(f"Loading sample {sample_id} from dataset...")
1027
+ with open(dataset_path, 'rb') as f:
1028
+ processed_data = pickle.load(f)
1029
+
1030
+ if sample_id >= len(processed_data):
1031
+ logger.error(f"Sample ID {sample_id} is out of range. Dataset has {len(processed_data)} samples.")
1032
+ return None
1033
+
1034
+ sample = processed_data[sample_id]
1035
+
1036
+ if sample.get("error", False):
1037
+ logger.error(f"Sample {sample_id} has errors and cannot be used for inference.")
1038
+ return None
1039
+
1040
+ except Exception as e:
1041
+ logger.error(f"Failed to load dataset: {e}")
1042
+ return None
1043
+
1044
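+ # The processed dataset stores precomputed grouped embeddings under "inputs_embeds";
+ # convert them to a float32 tensor for the prefill pass.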
+ grouped_embeds_raw = sample["inputs_embeds"]
1045
+ if isinstance(grouped_embeds_raw, torch.Tensor):
1046
+ grouped_input = grouped_embeds_raw.detach().clone().float()
1047
+ else:
1048
+ grouped_input = torch.tensor(grouped_embeds_raw, dtype=torch.float32)
1049
+
1050
+ original_input = sample["input_text"]
1051
+ expected_response = sample["response"]
1052
+
1053
+ print("\n" + "="*80)
1054
+ print(f"INFERENCE ON SAMPLE ID: {sample_id}")
1055
+ print("="*80)
1056
+ print(f"πŸ“ ORIGINAL REQUEST:")
1057
+ print(f"{original_input}")
1058
+ print("\n" + "-"*80)
1059
+ print(f"🎯 EXPECTED RESPONSE:")
1060
+ print(f"{expected_response}")
1061
+ print("\n" + "-"*80)
1062
+ print(f"πŸ€– MODEL GENERATED RESPONSE:")
1063
+
1064
+ try:
1065
+ generated_text = generate_with_grouped_input(
1066
+ model=model,
1067
+ tokenizer=tokenizer,
1068
+ grouped_input=grouped_input,
1069
+ max_length=max_length,
1070
+ temperature=temperature,
1071
+ do_sample=do_sample
1072
+ )
1073
+
1074
+ print(f"{generated_text}")
1075
+ print("\n" + "="*80)
1076
+
1077
+ expected_words = expected_response.split()
1078
+ generated_words = generated_text.split()
1079
+
1080
+ print(f"πŸ“Š METRICS:")
1081
+ print(f"Expected length: {len(expected_words)} words")
1082
+ print(f"Generated length: {len(generated_words)} words")
1083
+ print(f"Temperature: {temperature}")
1084
+ print(f"Max length: {max_length}")
1085
+ print("="*80)
1086
+
1087
+ return {
1088
+ "sample_id": sample_id,
1089
+ "original_input": original_input,
1090
+ "expected_response": expected_response,
1091
+ "generated_response": generated_text,
1092
+ "expected_length": len(expected_words),
1093
+ "generated_length": len(generated_words)
1094
+ }
1095
+
1096
+ except Exception as e:
1097
+ logger.error(f"Failed to generate response: {e}")
1098
+ print(f"❌ GENERATION FAILED: {e}")
1099
+ print("="*80)
1100
+ return None
1101
+
1102
+
1103
+ def batch_inference(sample_ids: List[int], checkpoint_path: str = "./grouped_qwen3_checkpoint/epoch_2_best",
1104
+ dataset_path: str = "./processed_qwen3_dataset/processed_dataset.pkl",
1105
+ max_length: int = 512, temperature: float = 0.7, do_sample: bool = True):
1106
+ """Run inference on multiple sample IDs."""
1107
+ logger.info(f"Running batch inference on {len(sample_ids)} samples")
1108
+
1109
+ results = []
1110
+ for sample_id in sample_ids:
1111
+ result = inference_by_id(
1112
+ sample_id=sample_id,
1113
+ checkpoint_path=checkpoint_path,
1114
+ dataset_path=dataset_path,
1115
+ max_length=max_length,
1116
+ temperature=temperature,
1117
+ do_sample=do_sample
1118
+ )
1119
+ if result:
1120
+ results.append(result)
1121
+
1122
+ print("\n" + "πŸ”„ " + "-"*78 + " πŸ”„\n") # Separator between samples
1123
+
1124
+ print("\n" + "="*80)
1125
+ print(f"πŸ“‹ BATCH INFERENCE SUMMARY")
1126
+ print("="*80)
1127
+ print(f"Total samples processed: {len(results)}")
1128
+ if results:
1129
+ avg_expected_len = sum(r["expected_length"] for r in results) / len(results)
1130
+ avg_generated_len = sum(r["generated_length"] for r in results) / len(results)
1131
+ print(f"Average expected length: {avg_expected_len:.1f} words")
1132
+ print(f"Average generated length: {avg_generated_len:.1f} words")
1133
+ print("="*80)
1134
+
1135
+ return results
1136
+
1137
+
1138
+ def random_inference(num_samples: int = 3, checkpoint_path: str = "./grouped_qwen3_checkpoint/epoch_2_best",
1139
+ dataset_path: str = "./processed_qwen3_dataset/processed_dataset.pkl",
1140
+ max_length: int = 512, temperature: float = 0.7, do_sample: bool = True):
1141
+ """Run inference on random samples from the dataset."""
1142
+ import random
1143
+
1144
+ try:
1145
+ with open(dataset_path, 'rb') as f:
1146
+ processed_data = pickle.load(f)
1147
+
1148
+ # Find valid samples
1149
+ valid_indices = [i for i, item in enumerate(processed_data) if not item.get("error", False)]
1150
+
1151
+ if len(valid_indices) < num_samples:
1152
+ logger.warning(f"Only {len(valid_indices)} valid samples available, using all of them")
1153
+ num_samples = len(valid_indices)
1154
+
1155
+ # Select random samples
1156
+ random_ids = random.sample(valid_indices, num_samples)
1157
+
1158
+ logger.info(f"Selected random sample IDs: {random_ids}")
1159
+
1160
+ # Run batch inference
1161
+ return batch_inference(
1162
+ sample_ids=random_ids,
1163
+ checkpoint_path=checkpoint_path,
1164
+ dataset_path=dataset_path,
1165
+ max_length=max_length,
1166
+ temperature=temperature,
1167
+ do_sample=do_sample
1168
+ )
1169
+
1170
+ except Exception as e:
1171
+ logger.error(f"Failed to load dataset for random sampling: {e}")
1172
+ return None
1173
+
1174
+
1175
+ def interactive_inference(checkpoint_path: str = "./grouped_qwen3_checkpoint/epoch_2_best",
1176
+ dataset_path: str = "./processed_qwen3_dataset/processed_dataset.pkl"):
1177
+ """Interactive inference mode where user can input sample IDs."""
1178
+ print("\n" + "="*80)
1179
+ print("πŸ€– INTERACTIVE INFERENCE MODE")
1180
+ print("="*80)
1181
+ print("Commands:")
1182
+ print(" <number> - Run inference on sample ID")
1183
+ print(" random <n> - Run inference on n random samples (default: 3)")
1184
+ print(" batch <ids> - Run inference on multiple IDs (e.g., 'batch 1,5,10')")
1185
+ print(" quit - Exit")
1186
+ print("="*80)
1187
+
1188
+ while True:
1189
+ try:
1190
+ user_input = input("\n🔍 Enter command: ").strip().lower()
1191
+
1192
+ if user_input in ['quit', 'exit', 'q']:
1193
+ print("πŸ‘‹ Goodbye!")
1194
+ break
1195
+ elif user_input.startswith('random'):
1196
+ parts = user_input.split()
1197
+ num_samples = int(parts[1]) if len(parts) > 1 else 3
1198
+ random_inference(num_samples=num_samples, checkpoint_path=checkpoint_path, dataset_path=dataset_path)
1199
+ elif user_input.startswith('batch'):
1200
+ parts = user_input.split(maxsplit=1)
1201
+ if len(parts) > 1:
1202
+ ids_str = parts[1]
1203
+ sample_ids = [int(x.strip()) for x in ids_str.split(',')]
1204
+ batch_inference(sample_ids=sample_ids, checkpoint_path=checkpoint_path, dataset_path=dataset_path)
1205
+ else:
1206
+ print("❌ Please provide sample IDs: batch 1,5,10")
1207
+ elif user_input.isdigit():
1208
+ sample_id = int(user_input)
1209
+ inference_by_id(sample_id=sample_id, checkpoint_path=checkpoint_path, dataset_path=dataset_path)
1210
+ else:
1211
+ print("❌ Invalid command. Try a number, 'random', 'batch', or 'quit'")
1212
+
1213
+ except ValueError:
1214
+ print("❌ Invalid input. Please enter a valid number or command.")
1215
+ except KeyboardInterrupt:
1216
+ print("\nπŸ‘‹ Goodbye!")
1217
+ break
1218
+ except Exception as e:
1219
+ print(f"❌ Error: {e}")
1220
+
1221
+
1222
+ def test_inference():
1223
+ logger.info("Running inference tests...")
1224
+
1225
+ test_ids = [0, 1, 2, 100, 500] # Mix of early and later samples
1226
+
1227
+ print("\nπŸ§ͺ TESTING INFERENCE ON PREDEFINED SAMPLES")
1228
+ results = batch_inference(
1229
+ sample_ids=test_ids,
1230
+ max_length=300,
1231
+ temperature=0.7,
1232
+ do_sample=True
1233
+ )
1234
+
1235
+ return results
1236
+
1237
+
1238
+ if __name__ == "__main__":
1239
+ import argparse
1240
+
1241
+ parser = argparse.ArgumentParser(description="Grouped Qwen3 Training and Inference")
1242
+ parser.add_argument("--mode", choices=["train", "test", "inference", "interactive", "random"],
1243
+ default="train", help="Mode to run")
1244
+ parser.add_argument("--sample_id", type=int, help="Sample ID for inference mode")
1245
+ parser.add_argument("--sample_ids", type=str, help="Comma-separated sample IDs for batch inference")
1246
+ parser.add_argument("--num_samples", type=int, default=3, help="Number of random samples for random mode")
1247
+ parser.add_argument("--checkpoint", type=str, default="./grouped_qwen3_checkpoint/epoch_2_best",
1248
+ help="Path to model checkpoint")
1249
+ parser.add_argument("--dataset", type=str, default="./processed_qwen3_dataset/processed_dataset.pkl",
1250
+ help="Path to dataset")
1251
+ parser.add_argument("--max_length", type=int, default=512, help="Maximum generation length")
1252
+ parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature")
1253
+
1254
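+ # Example invocations (script name illustrative):
+ #   python grouped_qwen3.py --mode train
+ #   python grouped_qwen3.py --mode inference --sample_id 42
+ #   python grouped_qwen3.py --mode random --num_samples 5 --temperature 0.9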
+ args = parser.parse_args()
1255
+
1256
+ if args.mode == "train":
1257
+ main()
1258
+ elif args.mode == "test":
1259
+ test_inference()
1260
+ elif args.mode == "inference":
1261
+ if args.sample_id is not None:
1262
+ inference_by_id(
1263
+ sample_id=args.sample_id,
1264
+ checkpoint_path=args.checkpoint,
1265
+ dataset_path=args.dataset,
1266
+ max_length=args.max_length,
1267
+ temperature=args.temperature
1268
+ )
1269
+ elif args.sample_ids is not None:
1270
+ sample_ids = [int(x.strip()) for x in args.sample_ids.split(',')]
1271
+ batch_inference(
1272
+ sample_ids=sample_ids,
1273
+ checkpoint_path=args.checkpoint,
1274
+ dataset_path=args.dataset,
1275
+ max_length=args.max_length,
1276
+ temperature=args.temperature
1277
+ )
1278
+ else:
1279
+ print("❌ Please provide --sample_id or --sample_ids for inference mode")
1280
+ elif args.mode == "interactive":
1281
+ interactive_inference(
1282
+ checkpoint_path=args.checkpoint,
1283
+ dataset_path=args.dataset
1284
+ )
1285
+ elif args.mode == "random":
1286
+ random_inference(
1287
+ num_samples=args.num_samples,
1288
+ checkpoint_path=args.checkpoint,
1289
+ dataset_path=args.dataset,
1290
+ max_length=args.max_length,
1291
+ temperature=args.temperature
1292
+ )