ovedrive committed on
Commit 863a8ef · verified · 1 Parent(s): 48b12db

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,164 @@
---
license: apache-2.0
language:
- en
- zh
library_name: diffusers
pipeline_tag: image-to-image
---
<p align="center">
    <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" width="400"/>
</p>
<p align="center">
💜 <a href="https://chat.qwen.ai/"><b>Qwen Chat</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;🤗 <a href="https://huggingface.co/Qwen/Qwen-Image-Edit-2509">Hugging Face</a>&nbsp;&nbsp;|&nbsp;&nbsp;🤖 <a href="https://modelscope.cn/models/Qwen/Qwen-Image-Edit-2509">ModelScope</a>&nbsp;&nbsp;|&nbsp;&nbsp;📑 <a href="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/Qwen_Image.pdf">Tech Report</a>&nbsp;&nbsp;|&nbsp;&nbsp;📑 <a href="https://qwenlm.github.io/blog/qwen-image-edit/">Blog</a>
<br>
🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen-Image-Edit">Demo</a>&nbsp;&nbsp;|&nbsp;&nbsp;💬 <a href="https://github.com/QwenLM/Qwen-Image/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp;&nbsp;|&nbsp;&nbsp;🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://github.com/QwenLM/Qwen-Image">GitHub</a>
</p>

<p align="center">
    <img src="https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen-Image/edit2509/edit2509_top.jpg" width="1600"/>
</p>

# Introduction
This September, we are pleased to introduce Qwen-Image-Edit-2509, the monthly iteration of Qwen-Image-Edit. To experience the latest model, please visit [Qwen Chat](https://qwen.ai) and select the "Image Editing" feature.

Compared with the Qwen-Image-Edit released in August, the main improvements in Qwen-Image-Edit-2509 are:

* **Multi-image Editing Support**: For multi-image inputs, Qwen-Image-Edit-2509 builds upon the Qwen-Image-Edit architecture and is further trained via image concatenation to enable multi-image editing. It supports various combinations such as "person + person," "person + product," and "person + scene." Optimal performance is currently achieved with 1 to 3 input images.
* **Enhanced Single-image Consistency**: For single-image inputs, Qwen-Image-Edit-2509 significantly improves editing consistency, specifically in the following areas:
  - **Improved Person Editing Consistency**: Better preservation of facial identity, supporting various portrait styles and pose transformations;
  - **Improved Product Editing Consistency**: Better preservation of product identity, supporting product poster editing;
  - **Improved Text Editing Consistency**: In addition to modifying text content, it also supports editing text fonts, colors, and materials.
* **Native Support for ControlNet**: Including depth maps, edge maps, keypoint maps, and more.

## Quick Start

Install the latest version of diffusers:
```
pip install git+https://github.com/huggingface/diffusers
```

The following code snippet illustrates how to use `Qwen-Image-Edit-2509`:

```python
import os

import torch
from PIL import Image
from diffusers import QwenImageEditPlusPipeline

# Load the pipeline in bfloat16 and move it to the GPU.
pipeline = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16
)
print("pipeline loaded")
pipeline.to("cuda")
pipeline.set_progress_bar_config(disable=None)

# Multi-image editing: pass a list of input images together with a prompt
# describing how they should be combined.
image1 = Image.open("input1.png")
image2 = Image.open("input2.png")
prompt = "The magician bear is on the left, the alchemist bear is on the right, facing each other in the central park square."
inputs = {
    "image": [image1, image2],
    "prompt": prompt,
    "generator": torch.manual_seed(0),  # fixed seed for reproducibility
    "true_cfg_scale": 4.0,              # strength of true classifier-free guidance
    "negative_prompt": " ",             # effectively empty negative prompt
    "num_inference_steps": 40,
    "guidance_scale": 1.0,
    "num_images_per_prompt": 1,
}
with torch.inference_mode():
    output = pipeline(**inputs)
    output_image = output.images[0]
    output_image.save("output_image_edit_plus.png")
    print("image saved at", os.path.abspath("output_image_edit_plus.png"))
```
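
If GPU memory is limited, `pipeline.to("cuda")` can optionally be swapped for diffusers' standard offloading hook; a minimal sketch (slower inference, but far lighter on VRAM):

```python
# Move each sub-model to the GPU only while it is running; idle components
# (text encoder, transformer, VAE) wait on the CPU in the meantime.
pipeline.enable_model_cpu_offload()
```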

## Showcase

**The primary update in Qwen-Image-Edit-2509 is support for multi-image inputs.**

Let’s first look at a "person + person" example:
![Person + Person Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8719.JPG#center)

Here is a "person + scene" example:
![Person + Scene Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8720.JPG#center)

Below is a "person + object" example:
![Person + Object Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8721.JPG#center)

In fact, multi-image input also supports commonly used ControlNet keypoint maps, for example, changing a person’s pose:
![ControlNet Keypoint Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8722.JPG#center)

Similarly, the following examples demonstrate results using three input images:
![Three Images Example 1](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8723.JPG#center)
![Three Images Example 2](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8724.JPG#center)
![Three Images Example 3](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8725.JPG#center)

---

**Another major update in Qwen-Image-Edit-2509 is enhanced consistency.**

First, regarding person consistency, Qwen-Image-Edit-2509 shows significant improvement over Qwen-Image-Edit. Below are examples generating various portrait styles:
![Portrait Styles Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%871.JPG#center)

For instance, changing a person’s pose while maintaining excellent identity consistency:
![Pose Change with Identity Consistency](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%872.JPG#center)

Leveraging this improvement along with Qwen-Image’s unique text rendering capability, we find that Qwen-Image-Edit-2509 excels at creating meme images:
![Meme Image Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%873.JPG#center)

Of course, even with longer text, Qwen-Image-Edit-2509 can still render it while preserving the person’s identity:
![Long Text with Identity Preservation](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%874.JPG#center)

Person consistency is also evident in old photo restoration. Below are two examples:
![Old Photo Restoration 1](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8717.JPG#center)
![Old Photo Restoration 2](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8718.JPG#center)

Naturally, besides real people, generating cartoon characters and cultural creations is also possible:
![Cartoon & Cultural Creation](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8715.JPG#center)

Second, Qwen-Image-Edit-2509 specifically enhances product consistency. We find that the model can naturally generate product posters from plain-background product images:
![Product Poster Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%875.JPG#center)

Or even simple logos:
![Logo Generation Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8716.JPG#center)

Third, Qwen-Image-Edit-2509 specifically enhances text consistency and supports editing font type, font color, and font material:
![Text Font Type](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8710.JPG#center)
![Text Font Color](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8711.JPG#center)
![Text Font Material](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8712.JPG#center)

Moreover, the ability for precise text editing has been significantly enhanced:
![Precise Text Editing 1](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8713.JPG#center)
![Precise Text Editing 2](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%8714.JPG#center)

It is worth noting that text editing can often be seamlessly integrated with image editing, for example, in this poster editing case:
![Integrated Text & Image Editing](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%876.JPG#center)

---

**The final update in Qwen-Image-Edit-2509 is native support for commonly used ControlNet image conditions, such as keypoint control and sketches:**
![Keypoint Control Example](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%877.JPG#center)
![Sketch Control Example 1](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%878.JPG#center)
![Sketch Control Example 2](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/edit2509/%E5%B9%BB%E7%81%AF%E7%89%879.JPG#center)

## License Agreement

Qwen-Image is licensed under Apache 2.0.

## Citation

We kindly encourage citation of our work if you find it useful.

```bibtex
@misc{wu2025qwenimagetechnicalreport,
      title={Qwen-Image Technical Report},
      author={Chenfei Wu and Jiahao Li and Jingren Zhou and Junyang Lin and Kaiyuan Gao and Kun Yan and Sheng-ming Yin and Shuai Bai and Xiao Xu and Yilei Chen and Yuxiang Chen and Zecheng Tang and Zekai Zhang and Zhengyi Wang and An Yang and Bowen Yu and Chen Cheng and Dayiheng Liu and Deqing Li and Hang Zhang and Hao Meng and Hu Wei and Jingyuan Ni and Kai Chen and Kuan Cao and Liang Peng and Lin Qu and Minggang Wu and Peng Wang and Shuting Yu and Tingkun Wen and Wensen Feng and Xiaoxiao Xu and Yi Wang and Yichang Zhang and Yongqiang Zhu and Yujia Wu and Yuxuan Cai and Zenan Liu},
      year={2025},
      eprint={2508.02324},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2508.02324},
}
```
model_index.json ADDED
@@ -0,0 +1,29 @@
{
  "_class_name": "QwenImageEditPlusPipeline",
  "_diffusers_version": "0.36.0.dev0",
  "_name_or_path": "./tmp_model",
  "processor": [
    "transformers",
    "Qwen2VLProcessor"
  ],
  "scheduler": [
    "diffusers",
    "FlowMatchEulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "Qwen2_5_VLForConditionalGeneration"
  ],
  "tokenizer": [
    "transformers",
    "Qwen2Tokenizer"
  ],
  "transformer": [
    "diffusers",
    "QwenImageTransformer2DModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLQwenImage"
  ]
}
processor/added_tokens.json ADDED
@@ -0,0 +1,24 @@
{
  "</tool_call>": 151658,
  "<tool_call>": 151657,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
processor/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endif %}<|im_start|>{{ message['role'] }}
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
{% endif %}
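
For reference, here is a minimal, hypothetical sketch of exercising this template directly; in normal use the pipeline calls the processor internally, so nothing below is required for inference:

```python
from transformers import AutoProcessor

# Assumption: loading from the base repo id; this checkpoint ships the same
# files under its processor/ subfolder.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509", subfolder="processor"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Change the background to a beach."},
        ],
    }
]
# The template inserts <|vision_start|><|image_pad|><|vision_end|> where the
# image appears and wraps each turn in <|im_start|>/<|im_end|>.
print(processor.apply_chat_template(messages, add_generation_prompt=True))
```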
processor/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
processor/preprocessor_config.json ADDED
@@ -0,0 +1,37 @@
{
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": true,
  "device": null,
  "disable_grouping": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessorFast",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "input_data_format": null,
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_pixels": 3136,
  "patch_size": 14,
  "processor_class": "Qwen2VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "return_tensors": null,
  "size": {
    "longest_edge": 12845056,
    "shortest_edge": 3136
  },
  "temporal_patch_size": 2
}
processor/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
processor/tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
size 11421896
processor/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
{
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 131072,
  "pad_token": "<|endoftext|>",
  "processor_class": "Qwen2VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
processor/video_preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
{
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": true,
  "device": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": null,
  "do_rescale": true,
  "do_resize": true,
  "do_sample_frames": false,
  "fps": null,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "input_data_format": null,
  "max_frames": 768,
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_frames": 4,
  "min_pixels": 3136,
  "num_frames": null,
  "patch_size": 14,
  "processor_class": "Qwen2VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "return_metadata": false,
  "size": {
    "longest_edge": 12845056,
    "shortest_edge": 3136
  },
  "size_divisor": null,
  "temporal_patch_size": 2,
  "video_metadata": null,
  "video_processor_type": "Qwen2VLVideoProcessor"
}
processor/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
quantization_info.json ADDED
@@ -0,0 +1,6 @@
{
  "quantization_method": "mixed_precision_nf4",
  "description": "First and last transformer blocks kept at bfloat16, middle layers quantized to NF4",
  "high_precision_layers_count": 30,
  "note": "Based on city96/Qwen-Image-gguf approach for better quality"
}
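
The recipe above keeps the first and last of the 60 transformer blocks (plus the final norm and output projection) in bfloat16 and quantizes the middle blocks to 4-bit NF4. As a hedged illustration only (not the exact script used to build this checkpoint), such a mixed-precision load can be expressed with diffusers' bitsandbytes integration; the module list mirrors the `llm_int8_skip_modules` entries in `transformer/config.json` below:

```python
import torch
from diffusers import BitsAndBytesConfig, QwenImageTransformer2DModel

# The 30 modules kept at bfloat16 ("high_precision_layers_count": 30):
# every quantizable sub-module of blocks 0 and 59, plus norm_out/proj_out.
block_submodules = [
    "img_mod.1", "attn.to_q", "attn.to_k", "attn.to_v",
    "attn.add_k_proj", "attn.add_v_proj", "attn.add_q_proj",
    "attn.to_out.0", "attn.to_add_out",
    "img_mlp.net.0.proj", "img_mlp.net.2",
    "txt_mod.1", "txt_mlp.net.0.proj", "txt_mlp.net.2",
]
skip = [f"transformer_blocks.{i}.{m}" for i in (0, 59) for m in block_submodules]
skip += ["norm_out.linear", "proj_out"]

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # 4-bit NormalFloat weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # matmuls computed in bf16
    bnb_4bit_use_double_quant=True,         # also quantize the quant constants
    llm_int8_skip_modules=skip,             # modules left at full precision
)

# Assumption: "Qwen/Qwen-Image-Edit-2509" stands in for the source weights;
# this repo already stores the quantized result, so no re-quantization is
# needed to use it.
transformer = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509",
    subfolder="transformer",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)
```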
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,18 @@
{
  "_class_name": "FlowMatchEulerDiscreteScheduler",
  "_diffusers_version": "0.36.0.dev0",
  "base_image_seq_len": 256,
  "base_shift": 0.5,
  "invert_sigmas": false,
  "max_image_seq_len": 8192,
  "max_shift": 0.9,
  "num_train_timesteps": 1000,
  "shift": 1.0,
  "shift_terminal": 0.02,
  "stochastic_sampling": false,
  "time_shift_type": "exponential",
  "use_beta_sigmas": false,
  "use_dynamic_shifting": true,
  "use_exponential_sigmas": false,
  "use_karras_sigmas": false
}
text_encoder/config.json ADDED
@@ -0,0 +1,181 @@
{
  "architectures": [
    "Qwen2_5_VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 128000,
  "max_window_layers": 28,
  "model_type": "qwen2_5_vl",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "transformer_blocks.0.img_mod.1",
      "transformer_blocks.0.attn.to_q",
      "transformer_blocks.0.attn.to_k",
      "transformer_blocks.0.attn.to_v",
      "transformer_blocks.0.attn.add_k_proj",
      "transformer_blocks.0.attn.add_v_proj",
      "transformer_blocks.0.attn.add_q_proj",
      "transformer_blocks.0.attn.to_out.0",
      "transformer_blocks.0.attn.to_add_out",
      "transformer_blocks.0.img_mlp.net.0.proj",
      "transformer_blocks.0.img_mlp.net.2",
      "transformer_blocks.0.txt_mod.1",
      "transformer_blocks.0.txt_mlp.net.0.proj",
      "transformer_blocks.0.txt_mlp.net.2",
      "transformer_blocks.59.img_mod.1",
      "transformer_blocks.59.attn.to_q",
      "transformer_blocks.59.attn.to_k",
      "transformer_blocks.59.attn.to_v",
      "transformer_blocks.59.attn.add_k_proj",
      "transformer_blocks.59.attn.add_v_proj",
      "transformer_blocks.59.attn.add_q_proj",
      "transformer_blocks.59.attn.to_out.0",
      "transformer_blocks.59.attn.to_add_out",
      "transformer_blocks.59.img_mlp.net.0.proj",
      "transformer_blocks.59.img_mlp.net.2",
      "transformer_blocks.59.txt_mod.1",
      "transformer_blocks.59.txt_mlp.net.0.proj",
      "transformer_blocks.59.txt_mlp.net.2",
      "norm_out.linear",
      "proj_out"
    ],
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "text_config": {
    "architectures": [
      "Qwen2_5_VLForConditionalGeneration"
    ],
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "dtype": "bfloat16",
    "eos_token_id": 151645,
    "hidden_act": "silu",
    "hidden_size": 3584,
    "image_token_id": null,
    "initializer_range": 0.02,
    "intermediate_size": 18944,
    "layer_types": [
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention"
    ],
    "max_position_embeddings": 128000,
    "max_window_layers": 28,
    "model_type": "qwen2_5_vl_text",
    "num_attention_heads": 28,
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "rms_norm_eps": 1e-06,
    "rope_scaling": {
      "mrope_section": [
        16,
        24,
        24
      ],
      "rope_type": "default",
      "type": "default"
    },
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "use_cache": true,
    "use_sliding_window": false,
    "video_token_id": null,
    "vision_end_token_id": 151653,
    "vision_start_token_id": 151652,
    "vision_token_id": 151654,
    "vocab_size": 152064
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": true,
  "use_sliding_window": false,
  "video_token_id": 151656,
  "vision_config": {
    "depth": 32,
    "dtype": "bfloat16",
    "fullatt_block_indexes": [
      7,
      15,
      23,
      31
    ],
    "hidden_act": "silu",
    "hidden_size": 1280,
    "in_channels": 3,
    "in_chans": 3,
    "initializer_range": 0.02,
    "intermediate_size": 3420,
    "model_type": "qwen2_5_vl",
    "num_heads": 16,
    "out_hidden_size": 3584,
    "patch_size": 14,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "temporal_patch_size": 2,
    "tokens_per_second": 2,
    "window_size": 112
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 152064
}
text_encoder/generation_config.json ADDED
@@ -0,0 +1,14 @@
{
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "repetition_penalty": 1.05,
  "temperature": 0.1,
  "top_k": 1,
  "top_p": 0.001,
  "transformers_version": "4.56.2"
}
text_encoder/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1905aae08b7a148c6f2a69939fe196c314f7a75652934e7b85fe222976a28663
size 4809612598
text_encoder/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2b727105e5dbd38583aee6a600cad89d2c911ed6592b728702fd4bb60f108704
size 281149198
text_encoder/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/added_tokens.json ADDED
@@ -0,0 +1,24 @@
{
  "</tool_call>": 151658,
  "<tool_call>": 151657,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\n<tool_call>\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
{
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 131072,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,64 @@
{
  "_class_name": "QwenImageTransformer2DModel",
  "_diffusers_version": "0.36.0.dev0",
  "_name_or_path": "./tmp_model/transformer",
  "attention_head_dim": 128,
  "axes_dims_rope": [
    16,
    56,
    56
  ],
  "guidance_embeds": false,
  "in_channels": 64,
  "joint_attention_dim": 3584,
  "num_attention_heads": 24,
  "num_layers": 60,
  "out_channels": 16,
  "patch_size": 2,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "transformer_blocks.0.img_mod.1",
      "transformer_blocks.0.attn.to_q",
      "transformer_blocks.0.attn.to_k",
      "transformer_blocks.0.attn.to_v",
      "transformer_blocks.0.attn.add_k_proj",
      "transformer_blocks.0.attn.add_v_proj",
      "transformer_blocks.0.attn.add_q_proj",
      "transformer_blocks.0.attn.to_out.0",
      "transformer_blocks.0.attn.to_add_out",
      "transformer_blocks.0.img_mlp.net.0.proj",
      "transformer_blocks.0.img_mlp.net.2",
      "transformer_blocks.0.txt_mod.1",
      "transformer_blocks.0.txt_mlp.net.0.proj",
      "transformer_blocks.0.txt_mlp.net.2",
      "transformer_blocks.59.img_mod.1",
      "transformer_blocks.59.attn.to_q",
      "transformer_blocks.59.attn.to_k",
      "transformer_blocks.59.attn.to_v",
      "transformer_blocks.59.attn.add_k_proj",
      "transformer_blocks.59.attn.add_v_proj",
      "transformer_blocks.59.attn.add_q_proj",
      "transformer_blocks.59.attn.to_out.0",
      "transformer_blocks.59.attn.to_add_out",
      "transformer_blocks.59.img_mlp.net.0.proj",
      "transformer_blocks.59.img_mlp.net.2",
      "transformer_blocks.59.txt_mod.1",
      "transformer_blocks.59.txt_mlp.net.0.proj",
      "transformer_blocks.59.txt_mlp.net.2",
      "norm_out.linear",
      "proj_out"
    ],
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  }
}
transformer/diffusion_pytorch_model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc1691d8d848942ca98a1eee1ad193b281f9dce624b077d76f3b7c2f2a0e1c57
size 9990996271
transformer/diffusion_pytorch_model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9793f01eb5402b65695195c15328f21b273bfe849051a73e074554a87e7c476e
size 1595201030
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
vae/config.json ADDED
@@ -0,0 +1,103 @@
{
  "_class_name": "AutoencoderKLQwenImage",
  "_diffusers_version": "0.36.0.dev0",
  "_name_or_path": "./tmp_model/vae",
  "attn_scales": [],
  "base_dim": 96,
  "dim_mult": [
    1,
    2,
    4,
    4
  ],
  "dropout": 0.0,
  "latents_mean": [
    -0.7571,
    -0.7089,
    -0.9113,
    0.1075,
    -0.1745,
    0.9653,
    -0.1517,
    1.5508,
    0.4134,
    -0.0715,
    0.5517,
    -0.3632,
    -0.1922,
    -0.9497,
    0.2503,
    -0.2921
  ],
  "latents_std": [
    2.8184,
    1.4541,
    2.3275,
    2.6558,
    1.2196,
    1.7708,
    2.6052,
    2.0743,
    3.2687,
    2.1526,
    2.8652,
    1.5579,
    1.6382,
    1.1253,
    2.8251,
    1.916
  ],
  "num_res_blocks": 2,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "transformer_blocks.0.img_mod.1",
      "transformer_blocks.0.attn.to_q",
      "transformer_blocks.0.attn.to_k",
      "transformer_blocks.0.attn.to_v",
      "transformer_blocks.0.attn.add_k_proj",
      "transformer_blocks.0.attn.add_v_proj",
      "transformer_blocks.0.attn.add_q_proj",
      "transformer_blocks.0.attn.to_out.0",
      "transformer_blocks.0.attn.to_add_out",
      "transformer_blocks.0.img_mlp.net.0.proj",
      "transformer_blocks.0.img_mlp.net.2",
      "transformer_blocks.0.txt_mod.1",
      "transformer_blocks.0.txt_mlp.net.0.proj",
      "transformer_blocks.0.txt_mlp.net.2",
      "transformer_blocks.59.img_mod.1",
      "transformer_blocks.59.attn.to_q",
      "transformer_blocks.59.attn.to_k",
      "transformer_blocks.59.attn.to_v",
      "transformer_blocks.59.attn.add_k_proj",
      "transformer_blocks.59.attn.add_v_proj",
      "transformer_blocks.59.attn.add_q_proj",
      "transformer_blocks.59.attn.to_out.0",
      "transformer_blocks.59.attn.to_add_out",
      "transformer_blocks.59.img_mlp.net.0.proj",
      "transformer_blocks.59.img_mlp.net.2",
      "transformer_blocks.59.txt_mod.1",
      "transformer_blocks.59.txt_mlp.net.0.proj",
      "transformer_blocks.59.txt_mlp.net.2",
      "norm_out.linear",
      "proj_out"
    ],
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "temperal_downsample": [
    false,
    true,
    true
  ],
  "z_dim": 16
}
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c8bc8b758c649abef9ea407b95408389a3b2f610d0d10fcb054fe171d0a8344
size 253806966