Julian Bilcke
commited on
Commit
·
f1803ce
1
Parent(s):
998c45a
up
Browse files- README.md +34 -1
- client/client.js +130 -22
- client/index.html +145 -54
- server.py +294 -18
README.md
CHANGED
|
@@ -75,10 +75,43 @@ pip install -r requirements.txt
|
|
| 75 |
# install apex and FlashAttention-3
|
| 76 |
# Our project also depends on [apex](https://github.com/NVIDIA/apex) and [FlashAttention-3](https://github.com/Dao-AILab/flash-attention)
|
| 77 |
|
| 78 |
-
# inference
|
| 79 |
bash run_inference.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
```
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
## 🔧 Hardware Requirements
|
| 84 |
- **GPU**:
|
|
|
|
| 75 |
# install apex and FlashAttention-3
|
| 76 |
# Our project also depends on [apex](https://github.com/NVIDIA/apex) and [FlashAttention-3](https://github.com/Dao-AILab/flash-attention)
|
| 77 |
|
| 78 |
+
# Run batch inference to generate videos
|
| 79 |
bash run_inference.sh
|
| 80 |
+
|
| 81 |
+
# Run interactive websocket server
|
| 82 |
+
python server.py --model_root ./models/matrixgame
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Interactive WebSocket Server
|
| 86 |
+
|
| 87 |
+
We've implemented a real-time interactive WebSocket server that uses the Matrix-Game model to generate game frames based on keyboard and mouse inputs:
|
| 88 |
+
|
| 89 |
+
### Features:
|
| 90 |
+
- **Real-time Generation**: Frames are generated on-the-fly based on user inputs
|
| 91 |
+
- **Keyboard & Mouse Control**: Move through the virtual world using WASD keys and mouse movements
|
| 92 |
+
- **Multiple Scenes**: Choose from different environments (forest, desert, beach, hills, etc.)
|
| 93 |
+
- **Fallback Mode**: Automatically falls back to demo mode when GPU resources are unavailable
|
| 94 |
+
|
| 95 |
+
### Usage:
|
| 96 |
+
```bash
|
| 97 |
+
# Basic startup
|
| 98 |
+
python server.py
|
| 99 |
+
|
| 100 |
+
# With custom model paths
|
| 101 |
+
python server.py --model_root ./models/matrixgame --port 8080
|
| 102 |
+
|
| 103 |
+
# With individual model component paths
|
| 104 |
+
python server.py --dit_path ./custom/dit --vae_path ./custom/vae --textenc_path ./custom/textenc
|
| 105 |
```
|
| 106 |
|
| 107 |
+
### Connection:
|
| 108 |
+
- WebSocket endpoint: ws://localhost:8080/ws
|
| 109 |
+
- Web client: http://localhost:8080/
|
| 110 |
+
|
| 111 |
+
### System Requirements:
|
| 112 |
+
- NVIDIA GPU with CUDA support
|
| 113 |
+
- 24GB+ VRAM recommended for smooth frame generation
|
| 114 |
+
|
| 115 |
|
| 116 |
## 🔧 Hardware Requirements
|
| 117 |
- **GPU**:
|
client/client.js
CHANGED
|
@@ -19,6 +19,11 @@ const mousePosition = document.getElementById('mouse-position');
|
|
| 19 |
const fpsCounter = document.getElementById('fps-counter');
|
| 20 |
const mouseTrackingArea = document.getElementById('mouse-tracking-area');
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
// Keyboard DOM elements
|
| 23 |
const keyElements = {
|
| 24 |
'w': document.getElementById('key-w'),
|
|
@@ -32,9 +37,13 @@ const keyElements = {
|
|
| 32 |
// Key mapping to action names
|
| 33 |
const keyToAction = {
|
| 34 |
'w': 'forward',
|
|
|
|
| 35 |
'a': 'left',
|
|
|
|
| 36 |
's': 'back',
|
|
|
|
| 37 |
'd': 'right',
|
|
|
|
| 38 |
' ': 'jump',
|
| 39 |
'shift': 'attack'
|
| 40 |
};
|
|
@@ -52,7 +61,8 @@ const keyState = {
|
|
| 52 |
// Mouse state
|
| 53 |
const mouseState = {
|
| 54 |
x: 0,
|
| 55 |
-
y: 0
|
|
|
|
| 56 |
};
|
| 57 |
|
| 58 |
// Test server connectivity before establishing WebSocket
|
|
@@ -454,27 +464,101 @@ document.addEventListener('keyup', (event) => {
|
|
| 454 |
}
|
| 455 |
});
|
| 456 |
|
| 457 |
-
// Mouse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
mouseTrackingArea.addEventListener('mousemove', (event) => {
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
|
|
|
|
|
|
| 478 |
});
|
| 479 |
|
| 480 |
// Throttle mouse movement to avoid flooding the server
|
|
@@ -491,5 +575,29 @@ const throttledSendMouseInput = (() => {
|
|
| 491 |
};
|
| 492 |
})();
|
| 493 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
// Initialize the UI
|
| 495 |
-
resetUI();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
const fpsCounter = document.getElementById('fps-counter');
|
| 20 |
const mouseTrackingArea = document.getElementById('mouse-tracking-area');
|
| 21 |
|
| 22 |
+
// Pointer Lock API support check
|
| 23 |
+
const pointerLockSupported = 'pointerLockElement' in document ||
|
| 24 |
+
'mozPointerLockElement' in document ||
|
| 25 |
+
'webkitPointerLockElement' in document;
|
| 26 |
+
|
| 27 |
// Keyboard DOM elements
|
| 28 |
const keyElements = {
|
| 29 |
'w': document.getElementById('key-w'),
|
|
|
|
| 37 |
// Key mapping to action names
|
| 38 |
const keyToAction = {
|
| 39 |
'w': 'forward',
|
| 40 |
+
'arrowup': 'forward',
|
| 41 |
'a': 'left',
|
| 42 |
+
'arrowleft': 'left',
|
| 43 |
's': 'back',
|
| 44 |
+
'arrowdown': 'back',
|
| 45 |
'd': 'right',
|
| 46 |
+
'arrowright': 'right',
|
| 47 |
' ': 'jump',
|
| 48 |
'shift': 'attack'
|
| 49 |
};
|
|
|
|
| 61 |
// Mouse state
|
| 62 |
const mouseState = {
|
| 63 |
x: 0,
|
| 64 |
+
y: 0,
|
| 65 |
+
captured: false
|
| 66 |
};
|
| 67 |
|
| 68 |
// Test server connectivity before establishing WebSocket
|
|
|
|
| 464 |
}
|
| 465 |
});
|
| 466 |
|
| 467 |
+
// Mouse capture functions
|
| 468 |
+
function requestPointerLock() {
|
| 469 |
+
if (!mouseState.captured && pointerLockSupported) {
|
| 470 |
+
mouseTrackingArea.requestPointerLock = mouseTrackingArea.requestPointerLock ||
|
| 471 |
+
mouseTrackingArea.mozRequestPointerLock ||
|
| 472 |
+
mouseTrackingArea.webkitRequestPointerLock;
|
| 473 |
+
mouseTrackingArea.requestPointerLock();
|
| 474 |
+
logMessage('Mouse captured. Press ESC to release.');
|
| 475 |
+
}
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
function exitPointerLock() {
|
| 479 |
+
if (mouseState.captured) {
|
| 480 |
+
document.exitPointerLock = document.exitPointerLock ||
|
| 481 |
+
document.mozExitPointerLock ||
|
| 482 |
+
document.webkitExitPointerLock;
|
| 483 |
+
document.exitPointerLock();
|
| 484 |
+
logMessage('Mouse released.');
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
// Handle pointer lock change events
|
| 489 |
+
document.addEventListener('pointerlockchange', pointerLockChangeHandler);
|
| 490 |
+
document.addEventListener('mozpointerlockchange', pointerLockChangeHandler);
|
| 491 |
+
document.addEventListener('webkitpointerlockchange', pointerLockChangeHandler);
|
| 492 |
+
|
| 493 |
+
function pointerLockChangeHandler() {
|
| 494 |
+
if (document.pointerLockElement === mouseTrackingArea ||
|
| 495 |
+
document.mozPointerLockElement === mouseTrackingArea ||
|
| 496 |
+
document.webkitPointerLockElement === mouseTrackingArea) {
|
| 497 |
+
// Pointer is locked, enable mouse movement tracking
|
| 498 |
+
mouseState.captured = true;
|
| 499 |
+
document.addEventListener('mousemove', handleMouseMovement);
|
| 500 |
+
} else {
|
| 501 |
+
// Pointer is unlocked, disable mouse movement tracking
|
| 502 |
+
mouseState.captured = false;
|
| 503 |
+
document.removeEventListener('mousemove', handleMouseMovement);
|
| 504 |
+
// Reset mouse state
|
| 505 |
+
mouseState.x = 0;
|
| 506 |
+
mouseState.y = 0;
|
| 507 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 508 |
+
throttledSendMouseInput();
|
| 509 |
+
}
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
// Mouse tracking with pointer lock
|
| 513 |
+
function handleMouseMovement(event) {
|
| 514 |
+
if (mouseState.captured) {
|
| 515 |
+
// Use movement for mouse look when captured
|
| 516 |
+
const sensitivity = 0.005; // Adjust sensitivity
|
| 517 |
+
mouseState.x += event.movementX * sensitivity;
|
| 518 |
+
mouseState.y -= event.movementY * sensitivity; // Invert Y for intuitive camera control
|
| 519 |
+
|
| 520 |
+
// Clamp values
|
| 521 |
+
mouseState.x = Math.max(-1, Math.min(1, mouseState.x));
|
| 522 |
+
mouseState.y = Math.max(-1, Math.min(1, mouseState.y));
|
| 523 |
+
|
| 524 |
+
// Update display
|
| 525 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 526 |
+
|
| 527 |
+
// Send to server (throttled)
|
| 528 |
+
throttledSendMouseInput();
|
| 529 |
+
}
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
// Mouse click to capture
|
| 533 |
+
mouseTrackingArea.addEventListener('click', () => {
|
| 534 |
+
if (!mouseState.captured && isStreaming) {
|
| 535 |
+
requestPointerLock();
|
| 536 |
+
}
|
| 537 |
+
});
|
| 538 |
+
|
| 539 |
+
// Standard mouse tracking for when pointer is not locked
|
| 540 |
mouseTrackingArea.addEventListener('mousemove', (event) => {
|
| 541 |
+
if (!mouseState.captured) {
|
| 542 |
+
// Calculate normalized coordinates relative to the center of the tracking area
|
| 543 |
+
const rect = mouseTrackingArea.getBoundingClientRect();
|
| 544 |
+
const centerX = rect.width / 2;
|
| 545 |
+
const centerY = rect.height / 2;
|
| 546 |
+
|
| 547 |
+
// Calculate relative position from center (-1 to 1)
|
| 548 |
+
const relX = (event.clientX - rect.left - centerX) / centerX;
|
| 549 |
+
const relY = (event.clientY - rect.top - centerY) / centerY;
|
| 550 |
+
|
| 551 |
+
// Scale down for smoother movement (similar to conditions.py)
|
| 552 |
+
const scaleFactor = 0.05;
|
| 553 |
+
mouseState.x = relX * scaleFactor;
|
| 554 |
+
mouseState.y = -relY * scaleFactor; // Invert Y for intuitive camera control
|
| 555 |
+
|
| 556 |
+
// Update display
|
| 557 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 558 |
+
|
| 559 |
+
// Send to server (throttled)
|
| 560 |
+
throttledSendMouseInput();
|
| 561 |
+
}
|
| 562 |
});
|
| 563 |
|
| 564 |
// Throttle mouse movement to avoid flooding the server
|
|
|
|
| 575 |
};
|
| 576 |
})();
|
| 577 |
|
| 578 |
+
// Toggle panel collapse/expand
|
| 579 |
+
function togglePanel(panelId) {
|
| 580 |
+
const panel = document.getElementById(panelId);
|
| 581 |
+
const button = panel.querySelector('.toggle-button');
|
| 582 |
+
|
| 583 |
+
if (panel.classList.contains('collapsed')) {
|
| 584 |
+
// Expand the panel
|
| 585 |
+
panel.classList.remove('collapsed');
|
| 586 |
+
button.textContent = '−'; // Minus sign
|
| 587 |
+
} else {
|
| 588 |
+
// Collapse the panel
|
| 589 |
+
panel.classList.add('collapsed');
|
| 590 |
+
button.textContent = '+'; // Plus sign
|
| 591 |
+
}
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
// Initialize the UI
|
| 595 |
+
resetUI();
|
| 596 |
+
|
| 597 |
+
// Make panel headers clickable
|
| 598 |
+
document.querySelectorAll('.panel-header').forEach(header => {
|
| 599 |
+
header.addEventListener('click', () => {
|
| 600 |
+
const panelId = header.parentElement.id;
|
| 601 |
+
togglePanel(panelId);
|
| 602 |
+
});
|
| 603 |
+
});
|
client/index.html
CHANGED
|
@@ -14,47 +14,124 @@
|
|
| 14 |
display: flex;
|
| 15 |
flex-direction: column;
|
| 16 |
align-items: center;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
.container {
|
| 20 |
-
width:
|
| 21 |
-
max-width:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
h1 {
|
| 26 |
-
color: #4CAF50;
|
| 27 |
-
text-align: center;
|
| 28 |
}
|
| 29 |
|
| 30 |
.game-area {
|
| 31 |
display: flex;
|
| 32 |
flex-direction: column;
|
| 33 |
align-items: center;
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
#game-canvas {
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
height: 360px;
|
| 42 |
object-fit: contain;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
.controls {
|
| 46 |
display: flex;
|
| 47 |
justify-content: space-between;
|
| 48 |
-
width:
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
-
.
|
|
|
|
| 53 |
background-color: #1E1E1E;
|
| 54 |
-
padding: 15px;
|
| 55 |
border-radius: 5px;
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
}
|
| 59 |
|
| 60 |
button {
|
|
@@ -130,14 +207,9 @@
|
|
| 130 |
}
|
| 131 |
|
| 132 |
.connection-info {
|
| 133 |
-
margin-top: 20px;
|
| 134 |
-
padding: 10px;
|
| 135 |
-
background-color: #1E1E1E;
|
| 136 |
-
border-radius: 5px;
|
| 137 |
font-family: monospace;
|
| 138 |
-
height:
|
| 139 |
overflow-y: auto;
|
| 140 |
-
width: 640px;
|
| 141 |
}
|
| 142 |
|
| 143 |
.log-entry {
|
|
@@ -155,13 +227,9 @@
|
|
| 155 |
padding: 5px;
|
| 156 |
border-radius: 3px;
|
| 157 |
font-family: monospace;
|
|
|
|
| 158 |
}
|
| 159 |
|
| 160 |
-
#mouse-tracking-area {
|
| 161 |
-
position: relative;
|
| 162 |
-
width: 640px;
|
| 163 |
-
height: 360px;
|
| 164 |
-
}
|
| 165 |
|
| 166 |
#mouse-position {
|
| 167 |
position: absolute;
|
|
@@ -172,13 +240,18 @@
|
|
| 172 |
padding: 5px;
|
| 173 |
border-radius: 3px;
|
| 174 |
font-family: monospace;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
}
|
| 176 |
</style>
|
| 177 |
</head>
|
| 178 |
<body>
|
| 179 |
<div class="container">
|
| 180 |
-
<h1>MatrixGame WebSocket Client</h1>
|
| 181 |
-
|
| 182 |
<div class="game-area">
|
| 183 |
<div id="mouse-tracking-area">
|
| 184 |
<img id="game-canvas" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" alt="Game Frame">
|
|
@@ -203,33 +276,51 @@
|
|
| 203 |
</div>
|
| 204 |
</div>
|
| 205 |
|
| 206 |
-
<div class="
|
| 207 |
-
|
| 208 |
-
<div class="
|
| 209 |
-
<div class="
|
| 210 |
-
<div
|
|
|
|
| 211 |
</div>
|
| 212 |
-
<div class="
|
| 213 |
-
<div
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
</div>
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
</div>
|
| 220 |
-
<div class="
|
| 221 |
-
<div
|
|
|
|
|
|
|
| 222 |
</div>
|
| 223 |
</div>
|
| 224 |
-
<p class="status">
|
| 225 |
-
W = Forward, S = Back, A = Left, D = Right<br>
|
| 226 |
-
Space = Jump, Shift = Attack<br>
|
| 227 |
-
Mouse = Look around
|
| 228 |
-
</p>
|
| 229 |
-
</div>
|
| 230 |
-
|
| 231 |
-
<div class="connection-info" id="connection-log">
|
| 232 |
-
<div class="log-entry">Waiting to connect...</div>
|
| 233 |
</div>
|
| 234 |
</div>
|
| 235 |
|
|
|
|
| 14 |
display: flex;
|
| 15 |
flex-direction: column;
|
| 16 |
align-items: center;
|
| 17 |
+
user-select: none; /* Disable text selection */
|
| 18 |
+
-webkit-user-select: none;
|
| 19 |
+
-moz-user-select: none;
|
| 20 |
+
-ms-user-select: none;
|
| 21 |
+
overflow-x: hidden;
|
| 22 |
}
|
| 23 |
|
| 24 |
.container {
|
| 25 |
+
width: 100%;
|
| 26 |
+
max-width: 100%;
|
| 27 |
+
display: flex;
|
| 28 |
+
flex-direction: column;
|
| 29 |
+
align-items: center;
|
|
|
|
|
|
|
|
|
|
| 30 |
}
|
| 31 |
|
| 32 |
.game-area {
|
| 33 |
display: flex;
|
| 34 |
flex-direction: column;
|
| 35 |
align-items: center;
|
| 36 |
+
width: 100%;
|
| 37 |
+
max-height: 85vh;
|
| 38 |
+
margin: 0;
|
| 39 |
+
position: relative;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
#mouse-tracking-area {
|
| 43 |
+
position: relative;
|
| 44 |
+
width: 100%;
|
| 45 |
+
height: auto;
|
| 46 |
+
cursor: pointer; /* Show cursor as pointer to encourage clicks */
|
| 47 |
+
display: flex;
|
| 48 |
+
justify-content: center;
|
| 49 |
+
align-items: center;
|
| 50 |
+
max-height: 85vh;
|
| 51 |
}
|
| 52 |
|
| 53 |
#game-canvas {
|
| 54 |
+
width: 100%;
|
| 55 |
+
height: auto;
|
| 56 |
+
max-height: 85vh;
|
|
|
|
| 57 |
object-fit: contain;
|
| 58 |
+
background-color: #000;
|
| 59 |
+
pointer-events: none; /* Prevent drag on the image */
|
| 60 |
+
-webkit-user-drag: none;
|
| 61 |
+
-khtml-user-drag: none;
|
| 62 |
+
-moz-user-drag: none;
|
| 63 |
+
-o-user-drag: none;
|
| 64 |
+
user-drag: none;
|
| 65 |
}
|
| 66 |
|
| 67 |
.controls {
|
| 68 |
display: flex;
|
| 69 |
justify-content: space-between;
|
| 70 |
+
width: 100%;
|
| 71 |
+
max-width: 1200px;
|
| 72 |
+
padding: 10px;
|
| 73 |
+
background-color: rgba(0, 0, 0, 0.5);
|
| 74 |
+
position: absolute;
|
| 75 |
+
bottom: 0;
|
| 76 |
+
z-index: 10;
|
| 77 |
+
box-sizing: border-box;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.panels-container {
|
| 81 |
+
display: flex;
|
| 82 |
+
width: 100%;
|
| 83 |
+
max-width: 1200px;
|
| 84 |
+
margin: 10px auto;
|
| 85 |
+
gap: 10px;
|
| 86 |
}
|
| 87 |
|
| 88 |
+
.panel {
|
| 89 |
+
flex: 1;
|
| 90 |
background-color: #1E1E1E;
|
|
|
|
| 91 |
border-radius: 5px;
|
| 92 |
+
overflow: hidden;
|
| 93 |
+
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
| 94 |
+
transition: height 0.3s ease;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.panel-header {
|
| 98 |
+
background-color: #272727;
|
| 99 |
+
padding: 10px 15px;
|
| 100 |
+
display: flex;
|
| 101 |
+
justify-content: space-between;
|
| 102 |
+
align-items: center;
|
| 103 |
+
cursor: pointer;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.panel-title {
|
| 107 |
+
font-weight: bold;
|
| 108 |
+
color: #4CAF50;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
.toggle-button {
|
| 112 |
+
background: none;
|
| 113 |
+
border: none;
|
| 114 |
+
color: #e0e0e0;
|
| 115 |
+
font-size: 18px;
|
| 116 |
+
cursor: pointer;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.toggle-button:focus {
|
| 120 |
+
outline: none;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.panel-content {
|
| 124 |
+
padding: 15px;
|
| 125 |
+
max-height: 300px;
|
| 126 |
+
overflow-y: auto;
|
| 127 |
+
transition: all 0.3s ease;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.collapsed .panel-content {
|
| 131 |
+
max-height: 0;
|
| 132 |
+
padding-top: 0;
|
| 133 |
+
padding-bottom: 0;
|
| 134 |
+
overflow: hidden;
|
| 135 |
}
|
| 136 |
|
| 137 |
button {
|
|
|
|
| 207 |
}
|
| 208 |
|
| 209 |
.connection-info {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
font-family: monospace;
|
| 211 |
+
height: 100%;
|
| 212 |
overflow-y: auto;
|
|
|
|
| 213 |
}
|
| 214 |
|
| 215 |
.log-entry {
|
|
|
|
| 227 |
padding: 5px;
|
| 228 |
border-radius: 3px;
|
| 229 |
font-family: monospace;
|
| 230 |
+
z-index: 20;
|
| 231 |
}
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
#mouse-position {
|
| 235 |
position: absolute;
|
|
|
|
| 240 |
padding: 5px;
|
| 241 |
border-radius: 3px;
|
| 242 |
font-family: monospace;
|
| 243 |
+
z-index: 20;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
@media (max-width: 768px) {
|
| 247 |
+
.panels-container {
|
| 248 |
+
flex-direction: column;
|
| 249 |
+
}
|
| 250 |
}
|
| 251 |
</style>
|
| 252 |
</head>
|
| 253 |
<body>
|
| 254 |
<div class="container">
|
|
|
|
|
|
|
| 255 |
<div class="game-area">
|
| 256 |
<div id="mouse-tracking-area">
|
| 257 |
<img id="game-canvas" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" alt="Game Frame">
|
|
|
|
| 276 |
</div>
|
| 277 |
</div>
|
| 278 |
|
| 279 |
+
<div class="panels-container">
|
| 280 |
+
<!-- Controls Panel -->
|
| 281 |
+
<div class="panel" id="controls-panel">
|
| 282 |
+
<div class="panel-header" onclick="togglePanel('controls-panel')">
|
| 283 |
+
<div class="panel-title">Keyboard Controls</div>
|
| 284 |
+
<button class="toggle-button">−</button>
|
| 285 |
</div>
|
| 286 |
+
<div class="panel-content">
|
| 287 |
+
<div class="key-indicators">
|
| 288 |
+
<div class="key-row">
|
| 289 |
+
<div id="key-w" class="key">W</div>
|
| 290 |
+
</div>
|
| 291 |
+
<div class="key-row">
|
| 292 |
+
<div id="key-a" class="key">A</div>
|
| 293 |
+
<div id="key-s" class="key">S</div>
|
| 294 |
+
<div id="key-d" class="key">D</div>
|
| 295 |
+
</div>
|
| 296 |
+
<div class="key-row">
|
| 297 |
+
<div id="key-space" class="key spacebar">SPACE</div>
|
| 298 |
+
</div>
|
| 299 |
+
<div class="key-row">
|
| 300 |
+
<div id="key-shift" class="key">SHIFT</div>
|
| 301 |
+
</div>
|
| 302 |
+
</div>
|
| 303 |
+
<p class="status">
|
| 304 |
+
W or ↑ = Forward, S or ↓ = Back, A or ← = Left, D or → = Right<br>
|
| 305 |
+
Space = Jump, Shift = Attack<br>
|
| 306 |
+
Click on game view to capture mouse (ESC to release)<br>
|
| 307 |
+
Mouse = Look around
|
| 308 |
+
</p>
|
| 309 |
</div>
|
| 310 |
+
</div>
|
| 311 |
+
|
| 312 |
+
<!-- Connection Log Panel -->
|
| 313 |
+
<div class="panel" id="log-panel">
|
| 314 |
+
<div class="panel-header" onclick="togglePanel('log-panel')">
|
| 315 |
+
<div class="panel-title">Connection Log</div>
|
| 316 |
+
<button class="toggle-button">−</button>
|
| 317 |
</div>
|
| 318 |
+
<div class="panel-content">
|
| 319 |
+
<div class="connection-info" id="connection-log">
|
| 320 |
+
<div class="log-entry">Waiting to connect...</div>
|
| 321 |
+
</div>
|
| 322 |
</div>
|
| 323 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
</div>
|
| 325 |
</div>
|
| 326 |
|
server.py
CHANGED
|
@@ -25,6 +25,16 @@ from PIL import Image
|
|
| 25 |
import cv2
|
| 26 |
from aiohttp import web, WSMsgType
|
| 27 |
from condtions import Bench_actions_76
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Configure logging
|
| 30 |
logging.basicConfig(
|
|
@@ -35,14 +45,23 @@ logger = logging.getLogger(__name__)
|
|
| 35 |
|
| 36 |
class FrameGenerator:
|
| 37 |
"""
|
| 38 |
-
|
| 39 |
-
|
| 40 |
"""
|
| 41 |
def __init__(self):
|
| 42 |
self.frame_width = 640
|
| 43 |
self.frame_height = 360
|
| 44 |
self.fps = 16
|
| 45 |
self.frame_count = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
self.scenes = {
|
| 47 |
'forest': self._load_scene_frames('forest'),
|
| 48 |
'desert': self._load_scene_frames('desert'),
|
|
@@ -54,6 +73,87 @@ class FrameGenerator:
|
|
| 54 |
'plain': self._load_scene_frames('plain')
|
| 55 |
}
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def _load_scene_frames(self, scene_name):
|
| 58 |
"""Load initial frames for a scene from asset directory"""
|
| 59 |
frames = []
|
|
@@ -72,7 +172,7 @@ class FrameGenerator:
|
|
| 72 |
|
| 73 |
# If no frames were loaded, create a default colored frame with text
|
| 74 |
if not frames:
|
| 75 |
-
frame = np.ones((self.frame_height, self.
|
| 76 |
# Add scene name as text
|
| 77 |
cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
|
| 78 |
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
|
|
@@ -80,9 +180,29 @@ class FrameGenerator:
|
|
| 80 |
|
| 81 |
return frames
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def get_next_frame(self, scene_name, keyboard_condition=None, mouse_condition=None):
|
| 84 |
"""
|
| 85 |
-
Generate the next frame based on current conditions.
|
| 86 |
|
| 87 |
Args:
|
| 88 |
scene_name: Name of the current scene
|
|
@@ -92,16 +212,134 @@ class FrameGenerator:
|
|
| 92 |
Returns:
|
| 93 |
JPEG bytes of the frame
|
| 94 |
"""
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
|
| 99 |
|
| 100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
frame_idx = self.frame_count % len(scene_frames)
|
| 102 |
frame = scene_frames[frame_idx].copy()
|
| 103 |
self.frame_count += 1
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
# If we have keyboard/mouse conditions, visualize them on the frame
|
| 106 |
if keyboard_condition:
|
| 107 |
# Visualize keyboard inputs (simple example)
|
|
@@ -122,15 +360,7 @@ class FrameGenerator:
|
|
| 122 |
cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
|
| 123 |
(self.frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
|
| 124 |
|
| 125 |
-
|
| 126 |
-
success, buffer = cv2.imencode('.jpg', frame)
|
| 127 |
-
if not success:
|
| 128 |
-
logger.error("Failed to encode frame as JPEG")
|
| 129 |
-
# Return a blank frame
|
| 130 |
-
blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
|
| 131 |
-
success, buffer = cv2.imencode('.jpg', blank)
|
| 132 |
-
|
| 133 |
-
return buffer.tobytes()
|
| 134 |
|
| 135 |
class GameSession:
|
| 136 |
"""
|
|
@@ -721,10 +951,56 @@ def parse_args() -> argparse.Namespace:
|
|
| 721 |
parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
|
| 722 |
parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
|
| 723 |
parser.add_argument("--path", type=str, default="", help="Base path for the server (for proxy setups)")
|
| 724 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
args = parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
loop = asyncio.get_event_loop()
|
| 729 |
app = loop.run_until_complete(init_app(base_path=args.path))
|
|
|
|
|
|
|
|
|
|
| 730 |
web.run_app(app, host=args.host, port=args.port)
|
|
|
|
| 25 |
import cv2
|
| 26 |
from aiohttp import web, WSMsgType
|
| 27 |
from condtions import Bench_actions_76
|
| 28 |
+
from einops import rearrange
|
| 29 |
+
from diffusers.utils import load_image
|
| 30 |
+
from diffusers.video_processor import VideoProcessor
|
| 31 |
+
from matrixgame.sample.pipeline_matrixgame import MatrixGameVideoPipeline
|
| 32 |
+
from matrixgame.model_variants import get_dit
|
| 33 |
+
from matrixgame.vae_variants import get_vae
|
| 34 |
+
from matrixgame.encoder_variants import get_text_enc
|
| 35 |
+
from matrixgame.model_variants.matrixgame_dit_src import MGVideoDiffusionTransformerI2V
|
| 36 |
+
from matrixgame.sample.flow_matching_scheduler_matrixgame import FlowMatchDiscreteScheduler
|
| 37 |
+
from teacache_forward import teacache_forward
|
| 38 |
|
| 39 |
# Configure logging
|
| 40 |
logging.basicConfig(
|
|
|
|
| 45 |
|
| 46 |
class FrameGenerator:
|
| 47 |
"""
|
| 48 |
+
Game frame generator using the MatrixGame model.
|
| 49 |
+
Generates frames based on keyboard and mouse inputs.
|
| 50 |
"""
|
| 51 |
def __init__(self):
|
| 52 |
self.frame_width = 640
|
| 53 |
self.frame_height = 360
|
| 54 |
self.fps = 16
|
| 55 |
self.frame_count = 0
|
| 56 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 57 |
+
self.weight_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 58 |
+
|
| 59 |
+
# Model paths (can be made configurable through arguments)
|
| 60 |
+
self.vae_path = os.environ.get("VAE_PATH", "./models/matrixgame/vae/")
|
| 61 |
+
self.dit_path = os.environ.get("DIT_PATH", "./models/matrixgame/dit/")
|
| 62 |
+
self.textenc_path = os.environ.get("TEXTENC_PATH", "./models/matrixgame")
|
| 63 |
+
|
| 64 |
+
# Cache scene initial frames (used as conditioning)
|
| 65 |
self.scenes = {
|
| 66 |
'forest': self._load_scene_frames('forest'),
|
| 67 |
'desert': self._load_scene_frames('desert'),
|
|
|
|
| 73 |
'plain': self._load_scene_frames('plain')
|
| 74 |
}
|
| 75 |
|
| 76 |
+
# Cache initial images for model input
|
| 77 |
+
self.scene_initial_images = {}
|
| 78 |
+
for scene_name, frames in self.scenes.items():
|
| 79 |
+
if frames:
|
| 80 |
+
# Use first frame as initial image
|
| 81 |
+
self.scene_initial_images[scene_name] = self._preprocess_image(frames[0])
|
| 82 |
+
|
| 83 |
+
# Initialize MatrixGame pipeline if CUDA is available
|
| 84 |
+
self.model_loaded = False
|
| 85 |
+
if torch.cuda.is_available():
|
| 86 |
+
try:
|
| 87 |
+
self._init_models()
|
| 88 |
+
self.model_loaded = True
|
| 89 |
+
logger.info("MatrixGame models loaded successfully")
|
| 90 |
+
except Exception as e:
|
| 91 |
+
logger.error(f"Failed to initialize MatrixGame models: {str(e)}")
|
| 92 |
+
logger.info("Falling back to frame cycling mode")
|
| 93 |
+
else:
|
| 94 |
+
logger.warning("CUDA not available. Using frame cycling mode only.")
|
| 95 |
+
|
| 96 |
+
def _init_models(self):
|
| 97 |
+
"""Initialize MatrixGame models (VAE, text encoder, transformer)"""
|
| 98 |
+
# Initialize flow matching scheduler
|
| 99 |
+
self.scheduler = FlowMatchDiscreteScheduler(
|
| 100 |
+
shift=15.0,
|
| 101 |
+
reverse=True,
|
| 102 |
+
solver="euler"
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
# Initialize VAE
|
| 106 |
+
try:
|
| 107 |
+
self.vae = get_vae("matrixgame", self.vae_path, self.weight_dtype)
|
| 108 |
+
self.vae.requires_grad_(False)
|
| 109 |
+
self.vae.eval()
|
| 110 |
+
self.vae.enable_tiling()
|
| 111 |
+
logger.info("VAE model loaded successfully")
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logger.error(f"Error loading VAE model: {str(e)}")
|
| 114 |
+
raise
|
| 115 |
+
|
| 116 |
+
# Initialize DIT (Transformer)
|
| 117 |
+
try:
|
| 118 |
+
dit = MGVideoDiffusionTransformerI2V.from_pretrained(self.dit_path)
|
| 119 |
+
dit.requires_grad_(False)
|
| 120 |
+
dit.eval()
|
| 121 |
+
logger.info("DIT model loaded successfully")
|
| 122 |
+
except Exception as e:
|
| 123 |
+
logger.error(f"Error loading DIT model: {str(e)}")
|
| 124 |
+
raise
|
| 125 |
+
|
| 126 |
+
# Initialize text encoder
|
| 127 |
+
try:
|
| 128 |
+
self.text_enc = get_text_enc('matrixgame', self.textenc_path, weight_dtype=self.weight_dtype, i2v_type='refiner')
|
| 129 |
+
logger.info("Text encoder loaded successfully")
|
| 130 |
+
except Exception as e:
|
| 131 |
+
logger.error(f"Error loading text encoder: {str(e)}")
|
| 132 |
+
raise
|
| 133 |
+
|
| 134 |
+
# Initialize pipeline
|
| 135 |
+
try:
|
| 136 |
+
self.pipeline = MatrixGameVideoPipeline(
|
| 137 |
+
vae=self.vae.vae,
|
| 138 |
+
text_encoder=self.text_enc,
|
| 139 |
+
transformer=dit,
|
| 140 |
+
scheduler=self.scheduler,
|
| 141 |
+
).to(self.weight_dtype).to(self.device)
|
| 142 |
+
logger.info("Pipeline initialized successfully")
|
| 143 |
+
except Exception as e:
|
| 144 |
+
logger.error(f"Error initializing pipeline: {str(e)}")
|
| 145 |
+
raise
|
| 146 |
+
|
| 147 |
+
# Configure teacache for the transformer
|
| 148 |
+
self.pipeline.transformer.__class__.enable_teacache = True
|
| 149 |
+
self.pipeline.transformer.__class__.cnt = 0
|
| 150 |
+
self.pipeline.transformer.__class__.num_steps = 20 # Reduced inference steps for real-time performance
|
| 151 |
+
self.pipeline.transformer.__class__.accumulated_rel_l1_distance = 0
|
| 152 |
+
self.pipeline.transformer.__class__.rel_l1_thresh = 0.075
|
| 153 |
+
self.pipeline.transformer.__class__.previous_modulated_input = None
|
| 154 |
+
self.pipeline.transformer.__class__.previous_residual = None
|
| 155 |
+
self.pipeline.transformer.__class__.forward = teacache_forward
|
| 156 |
+
|
| 157 |
def _load_scene_frames(self, scene_name):
|
| 158 |
"""Load initial frames for a scene from asset directory"""
|
| 159 |
frames = []
|
|
|
|
| 172 |
|
| 173 |
# If no frames were loaded, create a default colored frame with text
|
| 174 |
if not frames:
|
| 175 |
+
frame = np.ones((self.frame_height, self.frame_height, 3), dtype=np.uint8) * 100
|
| 176 |
# Add scene name as text
|
| 177 |
cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
|
| 178 |
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
|
|
|
|
| 180 |
|
| 181 |
return frames
|
| 182 |
|
| 183 |
+
def _preprocess_image(self, image_array):
    """Turn a raw frame into the stacked conditioning tensor the pipeline expects.

    Accepts either a numpy array (as produced by the scene loader) or a PIL
    image, resizes/normalizes it to the configured frame size with diffusers'
    VideoProcessor, then appends repeated copies of the same frame as
    synthetic "past" frames for temporal stability.

    Args:
        image_array: numpy ndarray or PIL Image holding the initial frame.

    Returns:
        torch.Tensor of shape (1 + num_pre_frames, C, H, W).
    """
    # Normalize the input to a PIL image.
    image = Image.fromarray(image_array) if isinstance(image_array, np.ndarray) else image_array

    # Derive the VAE scale factor from the loaded VAE when one is attached;
    # otherwise fall back to the conventional factor of 8.
    if hasattr(self, 'vae'):
        vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    else:
        vae_scale_factor = 8
    processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
    first_frame = processor.preprocess(image, height=self.frame_height, width=self.frame_width)

    # Repeat the same frame as history; kept small (3 frames) so interactive
    # generation stays responsive.
    num_pre_frames = 3
    history = first_frame.repeat(num_pre_frames, 1, 1, 1)
    return torch.cat([first_frame, history], dim=0)
def get_next_frame(self, scene_name, keyboard_condition=None, mouse_condition=None):
    """
    Generate the next frame for a scene and return it as encoded JPEG bytes.

    Two paths exist: when the MatrixGame model is loaded and CUDA is
    available, a short video clip is generated from the current keyboard
    and mouse state and its first frame is used; otherwise the method
    cycles through pre-loaded demo frames. In both paths the active
    controls are drawn onto the frame before JPEG encoding.

    Args:
        scene_name: Name of the current scene (key into self.scenes);
            unknown names fall back to the 'forest' scene.
        keyboard_condition: List of one 6-element list of key states
            (defaults to all zeros when None).
        mouse_condition: List of one [x, y] mouse-movement pair
            (defaults to [0, 0] when None).

    Returns:
        bytes: JPEG-encoded frame.
    """
    # Demo path: no model or no GPU — cycle through the scene's static frames.
    if not self.model_loaded or not torch.cuda.is_available():
        scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
        # NOTE(review): if scene_frames is an empty list this modulo raises
        # ZeroDivisionError — confirm scenes are always loaded with >= 1 frame.
        frame_idx = self.frame_count % len(scene_frames)
        # .copy() so the overlay text below does not mutate the cached frame.
        frame = scene_frames[frame_idx].copy()
        self.frame_count += 1

        # Overlay a demo-mode counter in the bottom-right corner.
        cv2.putText(frame, f"Demo mode: {self.frame_count}",
                    (self.frame_width - 200, self.frame_height - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (200, 200, 200), 1)
    else:
        # Model path: run the MatrixGame pipeline; on any failure fall back
        # to the demo frames rather than propagating the error to the client.
        try:
            # Initial image conditioning: this scene's, else forest's,
            # else give up and serve a fallback frame.
            initial_image = self.scene_initial_images.get(scene_name)
            if initial_image is None:
                initial_image = self.scene_initial_images.get('forest')
                if initial_image is None:
                    logger.error(f"No initial image available for scene {scene_name}")
                    return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

            # Default controls: 6 key states and a 2D mouse delta, all zero.
            if keyboard_condition is None:
                keyboard_condition = [[0, 0, 0, 0, 0, 0]]
            if mouse_condition is None:
                mouse_condition = [[0, 0]]

            # Build condition tensors on CPU first...
            keyboard_tensor = torch.tensor(keyboard_condition, dtype=torch.float32)
            mouse_tensor = torch.tensor(mouse_condition, dtype=torch.float32)

            # ...then cast to the model dtype and move to the model device.
            keyboard_tensor = keyboard_tensor.to(self.weight_dtype).to(self.device)
            mouse_tensor = mouse_tensor.to(self.weight_dtype).to(self.device)

            # Semantic conditioning image comes from the scene's stored frames.
            scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
            if not scene_frames:
                return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

            semantic_image = Image.fromarray(scene_frames[0])

            # Prefer the first frame that is actually a numpy array.
            # NOTE(review): when scene_frames[0] is an ndarray this loop just
            # recreates the same image — the two steps could be merged.
            for scene_frame in scene_frames:
                if isinstance(scene_frame, np.ndarray):
                    semantic_image = Image.fromarray(scene_frame)
                    break

            # Generate a very short clip (3 frames, 20 steps) for latency;
            # only the first frame is served. Fixed seed keeps output stable
            # across calls.
            with torch.no_grad():
                video = self.pipeline(
                    height=self.frame_height,
                    width=self.frame_width,
                    video_length=3,  # short clip for real-time speed
                    mouse_condition=mouse_tensor,
                    keyboard_condition=keyboard_tensor,
                    initial_image=initial_image,
                    num_inference_steps=20,  # reduced for real-time performance
                    guidance_scale=6.0,
                    embedded_guidance_scale=None,
                    data_type="video",
                    vae_ver='884-16c-hy',
                    enable_tiling=True,
                    generator=torch.Generator(device=self.device).manual_seed(42),
                    i2v_type='refiner',
                    semantic_images=semantic_image
                ).videos[0]

            # Take the first frame and convert to HWC uint8.
            # NOTE(review): assumes the video tensor is (T, C, H, W) with
            # values in [0, 1] — confirm against the pipeline's output format.
            video_frame = video[0].permute(1, 2, 0).cpu().numpy()
            video_frame = (video_frame * 255).astype(np.uint8)
            frame = video_frame

            self.frame_count += 1

        except Exception as e:
            logger.error(f"Error generating frame with MatrixGame model: {str(e)}")
            # Any model failure degrades gracefully to demo frames.
            return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

    # Draw the current control state on the frame (both paths).
    frame = self._visualize_controls(frame, keyboard_condition, mouse_condition)

    # Encode to JPEG; on failure serve a plain gray frame instead.
    success, buffer = cv2.imencode('.jpg', frame)
    if not success:
        logger.error("Failed to encode frame as JPEG")
        blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
        success, buffer = cv2.imencode('.jpg', blank)

    return buffer.tobytes()
def _fallback_frame(self, scene_name, keyboard_condition, mouse_condition):
    """Produce a JPEG frame by cycling stored scene frames when model
    generation is unavailable or has failed.

    Args:
        scene_name: Name of the current scene; unknown names fall back to
            the 'forest' scene.
        keyboard_condition: Current keyboard state (may be None), passed
            through to the control overlay.
        mouse_condition: Current mouse state (may be None), passed through
            to the control overlay.

    Returns:
        bytes: JPEG-encoded fallback frame.
    """
    scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
    if scene_frames:
        frame_idx = self.frame_count % len(scene_frames)
        # .copy() so overlay text does not mutate the cached frame.
        frame = scene_frames[frame_idx].copy()
    else:
        # Fix: the original computed `frame_count % len(scene_frames)`
        # unconditionally, raising ZeroDivisionError for a scene with no
        # frames. Synthesize a plain gray frame instead.
        frame = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
    self.frame_count += 1

    # Mark the frame so clients can tell fallback output from model output.
    cv2.putText(frame, "Fallback mode",
                (10, self.frame_height - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # Draw the current control state on the frame.
    frame = self._visualize_controls(frame, keyboard_condition, mouse_condition)

    # Encode to JPEG; on failure serve a plain gray frame instead.
    success, buffer = cv2.imencode('.jpg', frame)
    if not success:
        logger.error("Failed to encode fallback frame as JPEG")
        blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
        success, buffer = cv2.imencode('.jpg', blank)

    return buffer.tobytes()
def _visualize_controls(self, frame, keyboard_condition, mouse_condition):
|
| 342 |
+
"""Visualize keyboard and mouse controls on the frame"""
|
| 343 |
# If we have keyboard/mouse conditions, visualize them on the frame
|
| 344 |
if keyboard_condition:
|
| 345 |
# Visualize keyboard inputs (simple example)
|
|
|
|
| 360 |
cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
|
| 361 |
(self.frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
|
| 362 |
|
| 363 |
+
return frame
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
class GameSession:
|
| 366 |
"""
|
|
|
|
| 951 |
parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
|
| 952 |
parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
|
| 953 |
parser.add_argument("--path", type=str, default="", help="Base path for the server (for proxy setups)")
|
| 954 |
+
|
| 955 |
+
# Model paths
|
| 956 |
+
parser.add_argument("--model_root", type=str, default="./models/matrixgame",
|
| 957 |
+
help="Root directory for model files")
|
| 958 |
+
parser.add_argument("--dit_path", type=str, default=None,
|
| 959 |
+
help="Path to DIT model. If not provided, will use MODEL_ROOT/dit/")
|
| 960 |
+
parser.add_argument("--vae_path", type=str, default=None,
|
| 961 |
+
help="Path to VAE model. If not provided, will use MODEL_ROOT/vae/")
|
| 962 |
+
parser.add_argument("--textenc_path", type=str, default=None,
|
| 963 |
+
help="Path to text encoder model. If not provided, will use MODEL_ROOT")
|
| 964 |
+
|
| 965 |
+
args = parser.parse_args()
|
| 966 |
+
|
| 967 |
+
# Set environment variables for model paths if provided
|
| 968 |
+
if args.model_root:
|
| 969 |
+
os.environ.setdefault("MODEL_ROOT", args.model_root)
|
| 970 |
+
if args.dit_path:
|
| 971 |
+
os.environ.setdefault("DIT_PATH", args.dit_path)
|
| 972 |
+
else:
|
| 973 |
+
os.environ.setdefault("DIT_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "dit/"))
|
| 974 |
+
if args.vae_path:
|
| 975 |
+
os.environ.setdefault("VAE_PATH", args.vae_path)
|
| 976 |
+
else:
|
| 977 |
+
os.environ.setdefault("VAE_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "vae/"))
|
| 978 |
+
if args.textenc_path:
|
| 979 |
+
os.environ.setdefault("TEXTENC_PATH", args.textenc_path)
|
| 980 |
+
else:
|
| 981 |
+
os.environ.setdefault("TEXTENC_PATH", os.environ.get("MODEL_ROOT", "./models/matrixgame"))
|
| 982 |
+
|
| 983 |
+
return args
|
| 984 |
|
| 985 |
if __name__ == '__main__':
    # Allow CUDA to grow allocations instead of reserving fixed segments,
    # reducing fragmentation for the large video models. setdefault keeps
    # any value the operator exported themselves.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    # Parse command line arguments (also seeds MODEL_ROOT/DIT/VAE/TEXTENC
    # environment variables as a side effect).
    args = parse_args()

    # Log GPU availability so operators can tell model mode from demo mode.
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
        logger.info(f"CUDA is available. Found {gpu_count} GPU(s). Using: {gpu_name}")
    else:
        logger.warning("CUDA is not available. Running in CPU-only mode. Model generation disabled.")

    # Build the aiohttp application. Fix: asyncio.get_event_loop() is
    # deprecated when no loop is running (Python 3.10+) and unreliable on
    # 3.12+; create a fresh loop explicitly and register it so aiohttp
    # reuses it.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    app = loop.run_until_complete(init_app(base_path=args.path))

    # Start the WebSocket server (blocks until shutdown).
    logger.info(f"Starting MatrixGame WebSocket Server at {args.host}:{args.port}")
    web.run_app(app, host=args.host, port=args.port)