leoeric committed
Commit fa69135 · 1 Parent(s): 34395b9

Fix GPU abort: Add memory optimizations (half precision, better monitoring)


- Convert model to bfloat16/float16 after loading to reduce memory by ~50%
- Change autocast from float32 to bfloat16/float16 for memory efficiency
- Add GPU memory monitoring before and during generation (see the sketch after this list)
- Improve error handling for GPU abort scenarios
- Move half precision conversion after parallelization for safety
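
The memory-monitoring pattern added in both files below boils down to a few torch.cuda calls. A minimal standalone sketch of that pattern; the helper name report_gpu_memory is illustrative and not part of the commit:

import torch

def report_gpu_memory(device: int = 0) -> str:
    """Summarize memory usage on one CUDA device (illustrative helper, not from the commit)."""
    if not torch.cuda.is_available():
        return "CUDA not available"
    total = torch.cuda.get_device_properties(device).total_memory / 1024**3
    allocated = torch.cuda.memory_allocated(device) / 1024**3
    reserved = torch.cuda.memory_reserved(device) / 1024**3
    free = total - reserved
    return (f"{torch.cuda.get_device_name(device)}: {allocated:.2f} GB allocated, "
            f"{reserved:.2f} GB reserved, {free:.2f} GB free of {total:.2f} GB")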

Files changed (2)
  1. app.py +18 -5
  2. sample.py +30 -2
app.py CHANGED
@@ -236,8 +236,18 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
     # Verify GPU is available before starting
     if torch.cuda.is_available():
-        status_msg += f"✅ GPU available: {torch.cuda.get_device_name(0)}\n"
-        status_msg += f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\n"
+        gpu_name = torch.cuda.get_device_name(0)
+        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        free_memory = total_memory - reserved
+        status_msg += f"✅ GPU available: {gpu_name}\n"
+        status_msg += f" Total Memory: {total_memory:.1f} GB\n"
+        status_msg += f" Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free_memory:.2f} GB\n"
+
+        # Warn if memory is low
+        if free_memory < 2.0:
+            status_msg += f"⚠️ Warning: Low GPU memory ({free_memory:.2f} GB free). Model may not fit.\n"
     else:
         status_msg += "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
 
@@ -274,17 +284,20 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
     # Check for GPU abort or CUDA errors
     error_output = (result.stderr + result.stdout).lower()
-    if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output:
+    if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output or "killed" in error_output:
         error_msg += "⚠️ GPU ERROR DETECTED\n\n"
         error_msg += "Possible causes:\n"
         error_msg += "1. GPU timeout (ZeroGPU may have a 5-10 min limit)\n"
         error_msg += "2. CUDA out of memory (model too large for GPU)\n"
-        error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n\n"
+        error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n"
+        error_msg += "4. Process killed due to memory limit\n\n"
         error_msg += "Solutions:\n"
+        error_msg += "- Model is now using bfloat16/float16 for memory efficiency\n"
         error_msg += "- Try again (GPU may have been released)\n"
         error_msg += "- Check Space logs for detailed error\n"
        error_msg += "- Ensure @spaces.GPU decorator is applied\n"
-        error_msg += "- Consider using paid GPU tier for longer runs\n\n"
+        error_msg += "- Consider using paid GPU tier for longer runs\n"
+        error_msg += "- If issue persists, model may be too large for available GPU\n\n"
 
        error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
        error_msg += f"=== STDOUT ===\n{result.stdout}\n\n"
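
For context on the hunk above: result.stderr and result.stdout imply the generation runs in a child process, but the surrounding call is not part of this diff. The sketch below is an assumption about that surrounding code; the script name and arguments are hypothetical:

import subprocess
import sys

# Hypothetical launch of the sampling script; capture_output=True and text=True
# provide the stderr/stdout strings that the error check above scans.
result = subprocess.run(
    [sys.executable, "sample.py", "--prompt", "a red fox"],  # illustrative arguments
    capture_output=True,
    text=True,
)
error_output = (result.stderr + result.stdout).lower()
if "out of memory" in error_output:
    print("GPU error detected")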
sample.py CHANGED
@@ -127,8 +127,28 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
         p.requires_grad = False
     model.eval()
 
-    # Parallelize model for multi-GPU sampling
+    # Parallelize model for multi-GPU sampling (do this before half precision conversion)
     _, model = utils.parallelize_model(args, model, dist, device)
+    torch.cuda.empty_cache()
+
+    # Convert model to half precision for memory efficiency (if CUDA available)
+    # Do this AFTER parallelization to avoid issues
+    if torch.cuda.is_available():
+        # Use bfloat16 if supported, otherwise float16
+        if torch.cuda.is_bf16_supported():
+            model = model.to(torch.bfloat16)
+            print("✅ Converted model to bfloat16 for memory efficiency")
+        else:
+            model = model.to(torch.float16)
+            print("✅ Converted model to float16 for memory efficiency")
+        torch.cuda.empty_cache()
+
+        # Print memory usage
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        print(f"📊 GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved, {total:.2f} GB total")
+
     torch.cuda.empty_cache()  # Final cache clear
 
     return model, vae, (tokenizer, text_encoder, dist, device)
@@ -278,7 +298,15 @@ def main(args: argparse.Namespace) -> None:
 
     with torch.no_grad():
         device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
-        with torch.autocast(device_type=device_type, dtype=torch.float32):
+        # Use bfloat16 for CUDA (memory efficient), float32 for CPU
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            autocast_dtype = torch.bfloat16
+        elif torch.cuda.is_available():
+            autocast_dtype = torch.float16
+        else:
+            autocast_dtype = torch.float32
+
+        with torch.autocast(device_type=device_type, dtype=autocast_dtype):
             for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
                 # Determine aspect ratio and image shape
                 x_aspect = args.aspect_ratio if args.mix_aspect else None
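
A note on the bfloat16-first choice above: bfloat16 keeps float32's exponent range (at reduced precision), so activations that would overflow float16 still fit, which is why float16 is only the fallback when bfloat16 is unsupported. The ranges can be checked directly:

import torch

# bfloat16 shares float32's exponent width, so its maximum matches float32's (~3.4e38);
# float16 saturates near 65504 and overflows far more easily.
print(torch.finfo(torch.bfloat16).max)  # ~3.39e+38
print(torch.finfo(torch.float16).max)   # 65504.0
print(torch.finfo(torch.float32).max)   # ~3.40e+38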