Fix GPU abort: Add memory optimizations (half precision, better monitoring)

- Convert model to bfloat16/float16 after loading to reduce memory by ~50% (sketched below)
- Change autocast from float32 to bfloat16/float16 for memory efficiency
- Add GPU memory monitoring before and during generation
- Improve error handling for GPU abort scenarios
- Move half precision conversion after parallelization for safety
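For context, a minimal, self-contained sketch of the dtype selection and weight conversion pattern these bullets describe (the helper names are illustrative, not functions from this repo):

import torch

def pick_half_dtype() -> torch.dtype:
    # Prefer bfloat16 when the GPU supports it, fall back to float16 on CUDA,
    # keep float32 on CPU. (Illustrative helper, not part of app.py/sample.py.)
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    if torch.cuda.is_available():
        return torch.float16
    return torch.float32

def to_half_precision(model: torch.nn.Module) -> torch.nn.Module:
    # Casting weights to a 16-bit dtype roughly halves parameter memory.
    dtype = pick_half_dtype()
    if dtype != torch.float32:
        model = model.to(dtype)
        torch.cuda.empty_cache()
    return model

Running generation inside torch.autocast with the same dtype keeps activations in 16 bits as well, which is what the sample.py change below does.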
app.py
CHANGED
@@ -236,8 +236,18 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
     # Verify GPU is available before starting
     if torch.cuda.is_available():
-
-
+        gpu_name = torch.cuda.get_device_name(0)
+        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        free_memory = total_memory - reserved
+        status_msg += f"✅ GPU available: {gpu_name}\n"
+        status_msg += f" Total Memory: {total_memory:.1f} GB\n"
+        status_msg += f" Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free_memory:.2f} GB\n"
+
+        # Warn if memory is low
+        if free_memory < 2.0:
+            status_msg += f"⚠️ Warning: Low GPU memory ({free_memory:.2f} GB free). Model may not fit.\n"
     else:
         status_msg += "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
 

@@ -274,17 +284,20 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
         # Check for GPU abort or CUDA errors
         error_output = (result.stderr + result.stdout).lower()
-        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output:
+        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output or "killed" in error_output:
             error_msg += "⚠️ GPU ERROR DETECTED\n\n"
             error_msg += "Possible causes:\n"
             error_msg += "1. GPU timeout (ZeroGPU may have a 5-10 min limit)\n"
             error_msg += "2. CUDA out of memory (model too large for GPU)\n"
-            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n\n"
+            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n"
+            error_msg += "4. Process killed due to memory limit\n\n"
             error_msg += "Solutions:\n"
+            error_msg += "- Model is now using bfloat16/float16 for memory efficiency\n"
             error_msg += "- Try again (GPU may have been released)\n"
             error_msg += "- Check Space logs for detailed error\n"
             error_msg += "- Ensure @spaces.GPU decorator is applied\n"
-            error_msg += "- Consider using paid GPU tier for longer runs\n\n"
+            error_msg += "- Consider using paid GPU tier for longer runs\n"
+            error_msg += "- If issue persists, model may be too large for available GPU\n\n"
 
         error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
         error_msg += f"=== STDOUT ===\n{result.stdout}\n\n"
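The monitoring added above can be read as one small helper; this is only a sketch (the 2 GB threshold and message strings mirror the diff, while the function name and return-a-string design are assumptions, not the Space's actual code):

import torch

def gpu_memory_report(min_free_gb: float = 2.0) -> str:
    # Summarize device memory before launching generation (sketch only).
    if not torch.cuda.is_available():
        return "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
    props = torch.cuda.get_device_properties(0)
    total = props.total_memory / 1024**3
    reserved = torch.cuda.memory_reserved(0) / 1024**3
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    free = total - reserved
    msg = f"✅ GPU available: {props.name}\n"
    msg += f" Total: {total:.1f} GB, Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free:.2f} GB\n"
    if free < min_free_gb:
        msg += f"⚠️ Warning: Low GPU memory ({free:.2f} GB free). Model may not fit.\n"
    return msg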
sample.py
CHANGED
@@ -127,8 +127,28 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
         p.requires_grad = False
     model.eval()
 
-    # Parallelize model for multi-GPU sampling
+    # Parallelize model for multi-GPU sampling (do this before half precision conversion)
     _, model = utils.parallelize_model(args, model, dist, device)
+    torch.cuda.empty_cache()
+
+    # Convert model to half precision for memory efficiency (if CUDA available)
+    # Do this AFTER parallelization to avoid issues
+    if torch.cuda.is_available():
+        # Use bfloat16 if supported, otherwise float16
+        if torch.cuda.is_bf16_supported():
+            model = model.to(torch.bfloat16)
+            print("✅ Converted model to bfloat16 for memory efficiency")
+        else:
+            model = model.to(torch.float16)
+            print("✅ Converted model to float16 for memory efficiency")
+        torch.cuda.empty_cache()
+
+        # Print memory usage
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        print(f"📊 GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved, {total:.2f} GB total")
+
     torch.cuda.empty_cache()  # Final cache clear
 
     return model, vae, (tokenizer, text_encoder, dist, device)

@@ -278,7 +298,15 @@ def main(args: argparse.Namespace) -> None:
 
     with torch.no_grad():
         device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
-        with torch.autocast(device_type=device_type, dtype=torch.float32):
+        # Use bfloat16 for CUDA (memory efficient), float32 for CPU
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            autocast_dtype = torch.bfloat16
+        elif torch.cuda.is_available():
+            autocast_dtype = torch.float16
+        else:
+            autocast_dtype = torch.float32
+
+        with torch.autocast(device_type=device_type, dtype=autocast_dtype):
             for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
                 # Determine aspect ratio and image shape
                 x_aspect = args.aspect_ratio if args.mix_aspect else None
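Taken together, the setup and sampling changes follow this pattern. The snippet below is a sketch with a dummy nn.Linear standing in for the diffusion model; it omits dist, utils.parallelize_model, and the real sampling loop:

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torch.nn.Linear(16, 16).to(device)  # stand-in for the real model

# Pick the 16-bit dtype once; keep float32 on CPU.
if device == 'cuda' and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
elif device == 'cuda':
    dtype = torch.float16
else:
    dtype = torch.float32

# Weight conversion happens after any parallel wrapping, as in setup_model_and_components.
if dtype != torch.float32:
    model = model.to(dtype)
    torch.cuda.empty_cache()

x = torch.randn(4, 16, device=device)
with torch.no_grad(), torch.autocast(device_type=device, dtype=dtype, enabled=(dtype != torch.float32)):
    y = model(x)
print(y.dtype)  # bfloat16/float16 on GPU, float32 on CPU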