Fix GPU abort: Add memory optimizations (half precision, better monitoring)

- Convert model to bfloat16/float16 after loading to reduce memory by ~50% (sketched below)
- Change autocast from float32 to bfloat16/float16 for memory efficiency
- Add GPU memory monitoring before and during generation
- Improve error handling for GPU abort scenarios
- Move half precision conversion after parallelization for safety
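For context, a minimal, self-contained sketch of the dtype selection and weight conversion pattern these bullets describe (the helper names are illustrative, not functions from this repo):

import torch

def pick_half_dtype() -> torch.dtype:
    # Prefer bfloat16 when the GPU supports it, fall back to float16 on CUDA,
    # keep float32 on CPU. (Illustrative helper, not part of app.py/sample.py.)
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    if torch.cuda.is_available():
        return torch.float16
    return torch.float32

def to_half_precision(model: torch.nn.Module) -> torch.nn.Module:
    # Casting weights to a 16-bit dtype roughly halves parameter memory.
    dtype = pick_half_dtype()
    if dtype != torch.float32:
        model = model.to(dtype)
        torch.cuda.empty_cache()
    return model

Running generation inside torch.autocast with the same dtype keeps activations in 16 bits as well, which is what the sample.py change below does.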
app.py
CHANGED
@@ -236,8 +236,18 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
     # Verify GPU is available before starting
     if torch.cuda.is_available():
-
-
+        gpu_name = torch.cuda.get_device_name(0)
+        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        free_memory = total_memory - reserved
+        status_msg += f"✅ GPU available: {gpu_name}\n"
+        status_msg += f" Total Memory: {total_memory:.1f} GB\n"
+        status_msg += f" Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free_memory:.2f} GB\n"
+
+        # Warn if memory is low
+        if free_memory < 2.0:
+            status_msg += f"⚠️ Warning: Low GPU memory ({free_memory:.2f} GB free). Model may not fit.\n"
     else:
         status_msg += "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
 

@@ -274,17 +284,20 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
 
         # Check for GPU abort or CUDA errors
         error_output = (result.stderr + result.stdout).lower()
-        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output:
+        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output or "killed" in error_output:
             error_msg += "⚠️ GPU ERROR DETECTED\n\n"
             error_msg += "Possible causes:\n"
             error_msg += "1. GPU timeout (ZeroGPU may have a 5-10 min limit)\n"
             error_msg += "2. CUDA out of memory (model too large for GPU)\n"
-            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n\n"
+            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n"
+            error_msg += "4. Process killed due to memory limit\n\n"
             error_msg += "Solutions:\n"
+            error_msg += "- Model is now using bfloat16/float16 for memory efficiency\n"
             error_msg += "- Try again (GPU may have been released)\n"
             error_msg += "- Check Space logs for detailed error\n"
             error_msg += "- Ensure @spaces.GPU decorator is applied\n"
-            error_msg += "- Consider using paid GPU tier for longer runs\n\n"
+            error_msg += "- Consider using paid GPU tier for longer runs\n"
+            error_msg += "- If issue persists, model may be too large for available GPU\n\n"
 
         error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
         error_msg += f"=== STDOUT ===\n{result.stdout}\n\n"
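The monitoring added above can be read as one small helper; this is only a sketch (the 2 GB threshold and message strings mirror the diff, while the function name and return-a-string design are assumptions, not the Space's actual code):

import torch

def gpu_memory_report(min_free_gb: float = 2.0) -> str:
    # Summarize device memory before launching generation (sketch only).
    if not torch.cuda.is_available():
        return "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
    props = torch.cuda.get_device_properties(0)
    total = props.total_memory / 1024**3
    reserved = torch.cuda.memory_reserved(0) / 1024**3
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    free = total - reserved
    msg = f"✅ GPU available: {props.name}\n"
    msg += f" Total: {total:.1f} GB, Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free:.2f} GB\n"
    if free < min_free_gb:
        msg += f"⚠️ Warning: Low GPU memory ({free:.2f} GB free). Model may not fit.\n"
    return msg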
sample.py
CHANGED
@@ -127,8 +127,28 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
         p.requires_grad = False
     model.eval()
 
-    # Parallelize model for multi-GPU sampling
+    # Parallelize model for multi-GPU sampling (do this before half precision conversion)
     _, model = utils.parallelize_model(args, model, dist, device)
+    torch.cuda.empty_cache()
+
+    # Convert model to half precision for memory efficiency (if CUDA available)
+    # Do this AFTER parallelization to avoid issues
+    if torch.cuda.is_available():
+        # Use bfloat16 if supported, otherwise float16
+        if torch.cuda.is_bf16_supported():
+            model = model.to(torch.bfloat16)
+            print("✅ Converted model to bfloat16 for memory efficiency")
+        else:
+            model = model.to(torch.float16)
+            print("✅ Converted model to float16 for memory efficiency")
+        torch.cuda.empty_cache()
+
+        # Print memory usage
+        allocated = torch.cuda.memory_allocated(0) / 1024**3
+        reserved = torch.cuda.memory_reserved(0) / 1024**3
+        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        print(f"📊 GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved, {total:.2f} GB total")
+
     torch.cuda.empty_cache()  # Final cache clear
 
     return model, vae, (tokenizer, text_encoder, dist, device)

@@ -278,7 +298,15 @@ def main(args: argparse.Namespace) -> None:
 
     with torch.no_grad():
         device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
-        with torch.autocast(device_type=device_type, dtype=torch.float32):
+        # Use bfloat16 for CUDA (memory efficient), float32 for CPU
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            autocast_dtype = torch.bfloat16
+        elif torch.cuda.is_available():
+            autocast_dtype = torch.float16
+        else:
+            autocast_dtype = torch.float32
+
+        with torch.autocast(device_type=device_type, dtype=autocast_dtype):
             for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
                 # Determine aspect ratio and image shape
                 x_aspect = args.aspect_ratio if args.mix_aspect else None
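Taken together, the setup and sampling changes follow this pattern. The snippet below is a sketch with a dummy nn.Linear standing in for the diffusion model; it omits dist, utils.parallelize_model, and the real sampling loop:

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torch.nn.Linear(16, 16).to(device)  # stand-in for the real model

# Pick the 16-bit dtype once; keep float32 on CPU.
if device == 'cuda' and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
elif device == 'cuda':
    dtype = torch.float16
else:
    dtype = torch.float32

# Weight conversion happens after any parallel wrapping, as in setup_model_and_components.
if dtype != torch.float32:
    model = model.to(dtype)
    torch.cuda.empty_cache()

x = torch.randn(4, 16, device=device)
with torch.no_grad(), torch.autocast(device_type=device, dtype=dtype, enabled=(dtype != torch.float32)):
    y = model(x)
print(y.dtype)  # bfloat16/float16 on GPU, float32 on CPU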