Julian Bilcke
commited on
Commit
·
f1803ce
1
Parent(s):
998c45a
up
Browse files- README.md +34 -1
- client/client.js +130 -22
- client/index.html +145 -54
- server.py +294 -18
README.md
CHANGED
|
@@ -75,10 +75,43 @@ pip install -r requirements.txt
|
|
| 75 |
# install apex and FlashAttention-3
|
| 76 |
# Our project also depends on [apex](https://github.com/NVIDIA/apex) and [FlashAttention-3](https://github.com/Dao-AILab/flash-attention)
|
| 77 |
|
| 78 |
-
# inference
|
| 79 |
bash run_inference.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
```
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
## 🔧 Hardware Requirements
|
| 84 |
- **GPU**:
|
|
|
|
| 75 |
# install apex and FlashAttention-3
|
| 76 |
# Our project also depends on [apex](https://github.com/NVIDIA/apex) and [FlashAttention-3](https://github.com/Dao-AILab/flash-attention)
|
| 77 |
|
| 78 |
+
# Run batch inference to generate videos
|
| 79 |
bash run_inference.sh
|
| 80 |
+
|
| 81 |
+
# Run interactive websocket server
|
| 82 |
+
python server.py --model_root ./models/matrixgame
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Interactive WebSocket Server
|
| 86 |
+
|
| 87 |
+
We've implemented a real-time interactive WebSocket server that uses the Matrix-Game model to generate game frames based on keyboard and mouse inputs:
|
| 88 |
+
|
| 89 |
+
### Features:
|
| 90 |
+
- **Real-time Generation**: Frames are generated on-the-fly based on user inputs
|
| 91 |
+
- **Keyboard & Mouse Control**: Move through the virtual world using WASD keys and mouse movements
|
| 92 |
+
- **Multiple Scenes**: Choose from different environments (forest, desert, beach, hills, etc.)
|
| 93 |
+
- **Fallback Mode**: Automatically falls back to demo mode when GPU resources are unavailable
|
| 94 |
+
|
| 95 |
+
### Usage:
|
| 96 |
+
```bash
|
| 97 |
+
# Basic startup
|
| 98 |
+
python server.py
|
| 99 |
+
|
| 100 |
+
# With custom model paths
|
| 101 |
+
python server.py --model_root ./models/matrixgame --port 8080
|
| 102 |
+
|
| 103 |
+
# With individual model component paths
|
| 104 |
+
python server.py --dit_path ./custom/dit --vae_path ./custom/vae --textenc_path ./custom/textenc
|
| 105 |
```
|
| 106 |
|
| 107 |
+
### Connection:
|
| 108 |
+
- WebSocket endpoint: ws://localhost:8080/ws
|
| 109 |
+
- Web client: http://localhost:8080/
|
| 110 |
+
|
| 111 |
+
### System Requirements:
|
| 112 |
+
- NVIDIA GPU with CUDA support
|
| 113 |
+
- 24GB+ VRAM recommended for smooth frame generation
|
| 114 |
+
|
| 115 |
|
| 116 |
## 🔧 Hardware Requirements
|
| 117 |
- **GPU**:
|
client/client.js
CHANGED
|
@@ -19,6 +19,11 @@ const mousePosition = document.getElementById('mouse-position');
|
|
| 19 |
const fpsCounter = document.getElementById('fps-counter');
|
| 20 |
const mouseTrackingArea = document.getElementById('mouse-tracking-area');
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
// Keyboard DOM elements
|
| 23 |
const keyElements = {
|
| 24 |
'w': document.getElementById('key-w'),
|
|
@@ -32,9 +37,13 @@ const keyElements = {
|
|
| 32 |
// Key mapping to action names
|
| 33 |
const keyToAction = {
|
| 34 |
'w': 'forward',
|
|
|
|
| 35 |
'a': 'left',
|
|
|
|
| 36 |
's': 'back',
|
|
|
|
| 37 |
'd': 'right',
|
|
|
|
| 38 |
' ': 'jump',
|
| 39 |
'shift': 'attack'
|
| 40 |
};
|
|
@@ -52,7 +61,8 @@ const keyState = {
|
|
| 52 |
// Mouse state
|
| 53 |
const mouseState = {
|
| 54 |
x: 0,
|
| 55 |
-
y: 0
|
|
|
|
| 56 |
};
|
| 57 |
|
| 58 |
// Test server connectivity before establishing WebSocket
|
|
@@ -454,27 +464,101 @@ document.addEventListener('keyup', (event) => {
|
|
| 454 |
}
|
| 455 |
});
|
| 456 |
|
| 457 |
-
// Mouse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
mouseTrackingArea.addEventListener('mousemove', (event) => {
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
|
|
|
|
|
|
| 478 |
});
|
| 479 |
|
| 480 |
// Throttle mouse movement to avoid flooding the server
|
|
@@ -491,5 +575,29 @@ const throttledSendMouseInput = (() => {
|
|
| 491 |
};
|
| 492 |
})();
|
| 493 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
// Initialize the UI
|
| 495 |
-
resetUI();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
const fpsCounter = document.getElementById('fps-counter');
|
| 20 |
const mouseTrackingArea = document.getElementById('mouse-tracking-area');
|
| 21 |
|
| 22 |
+
// Pointer Lock API support check
|
| 23 |
+
const pointerLockSupported = 'pointerLockElement' in document ||
|
| 24 |
+
'mozPointerLockElement' in document ||
|
| 25 |
+
'webkitPointerLockElement' in document;
|
| 26 |
+
|
| 27 |
// Keyboard DOM elements
|
| 28 |
const keyElements = {
|
| 29 |
'w': document.getElementById('key-w'),
|
|
|
|
| 37 |
// Key mapping to action names
|
| 38 |
const keyToAction = {
|
| 39 |
'w': 'forward',
|
| 40 |
+
'arrowup': 'forward',
|
| 41 |
'a': 'left',
|
| 42 |
+
'arrowleft': 'left',
|
| 43 |
's': 'back',
|
| 44 |
+
'arrowdown': 'back',
|
| 45 |
'd': 'right',
|
| 46 |
+
'arrowright': 'right',
|
| 47 |
' ': 'jump',
|
| 48 |
'shift': 'attack'
|
| 49 |
};
|
|
|
|
| 61 |
// Mouse state
|
| 62 |
const mouseState = {
|
| 63 |
x: 0,
|
| 64 |
+
y: 0,
|
| 65 |
+
captured: false
|
| 66 |
};
|
| 67 |
|
| 68 |
// Test server connectivity before establishing WebSocket
|
|
|
|
| 464 |
}
|
| 465 |
});
|
| 466 |
|
| 467 |
+
// Mouse capture functions
|
| 468 |
+
function requestPointerLock() {
|
| 469 |
+
if (!mouseState.captured && pointerLockSupported) {
|
| 470 |
+
mouseTrackingArea.requestPointerLock = mouseTrackingArea.requestPointerLock ||
|
| 471 |
+
mouseTrackingArea.mozRequestPointerLock ||
|
| 472 |
+
mouseTrackingArea.webkitRequestPointerLock;
|
| 473 |
+
mouseTrackingArea.requestPointerLock();
|
| 474 |
+
logMessage('Mouse captured. Press ESC to release.');
|
| 475 |
+
}
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
function exitPointerLock() {
|
| 479 |
+
if (mouseState.captured) {
|
| 480 |
+
document.exitPointerLock = document.exitPointerLock ||
|
| 481 |
+
document.mozExitPointerLock ||
|
| 482 |
+
document.webkitExitPointerLock;
|
| 483 |
+
document.exitPointerLock();
|
| 484 |
+
logMessage('Mouse released.');
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
// Handle pointer lock change events
|
| 489 |
+
document.addEventListener('pointerlockchange', pointerLockChangeHandler);
|
| 490 |
+
document.addEventListener('mozpointerlockchange', pointerLockChangeHandler);
|
| 491 |
+
document.addEventListener('webkitpointerlockchange', pointerLockChangeHandler);
|
| 492 |
+
|
| 493 |
+
function pointerLockChangeHandler() {
|
| 494 |
+
if (document.pointerLockElement === mouseTrackingArea ||
|
| 495 |
+
document.mozPointerLockElement === mouseTrackingArea ||
|
| 496 |
+
document.webkitPointerLockElement === mouseTrackingArea) {
|
| 497 |
+
// Pointer is locked, enable mouse movement tracking
|
| 498 |
+
mouseState.captured = true;
|
| 499 |
+
document.addEventListener('mousemove', handleMouseMovement);
|
| 500 |
+
} else {
|
| 501 |
+
// Pointer is unlocked, disable mouse movement tracking
|
| 502 |
+
mouseState.captured = false;
|
| 503 |
+
document.removeEventListener('mousemove', handleMouseMovement);
|
| 504 |
+
// Reset mouse state
|
| 505 |
+
mouseState.x = 0;
|
| 506 |
+
mouseState.y = 0;
|
| 507 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 508 |
+
throttledSendMouseInput();
|
| 509 |
+
}
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
// Mouse tracking with pointer lock
|
| 513 |
+
function handleMouseMovement(event) {
|
| 514 |
+
if (mouseState.captured) {
|
| 515 |
+
// Use movement for mouse look when captured
|
| 516 |
+
const sensitivity = 0.005; // Adjust sensitivity
|
| 517 |
+
mouseState.x += event.movementX * sensitivity;
|
| 518 |
+
mouseState.y -= event.movementY * sensitivity; // Invert Y for intuitive camera control
|
| 519 |
+
|
| 520 |
+
// Clamp values
|
| 521 |
+
mouseState.x = Math.max(-1, Math.min(1, mouseState.x));
|
| 522 |
+
mouseState.y = Math.max(-1, Math.min(1, mouseState.y));
|
| 523 |
+
|
| 524 |
+
// Update display
|
| 525 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 526 |
+
|
| 527 |
+
// Send to server (throttled)
|
| 528 |
+
throttledSendMouseInput();
|
| 529 |
+
}
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
// Mouse click to capture
|
| 533 |
+
mouseTrackingArea.addEventListener('click', () => {
|
| 534 |
+
if (!mouseState.captured && isStreaming) {
|
| 535 |
+
requestPointerLock();
|
| 536 |
+
}
|
| 537 |
+
});
|
| 538 |
+
|
| 539 |
+
// Standard mouse tracking for when pointer is not locked
|
| 540 |
mouseTrackingArea.addEventListener('mousemove', (event) => {
|
| 541 |
+
if (!mouseState.captured) {
|
| 542 |
+
// Calculate normalized coordinates relative to the center of the tracking area
|
| 543 |
+
const rect = mouseTrackingArea.getBoundingClientRect();
|
| 544 |
+
const centerX = rect.width / 2;
|
| 545 |
+
const centerY = rect.height / 2;
|
| 546 |
+
|
| 547 |
+
// Calculate relative position from center (-1 to 1)
|
| 548 |
+
const relX = (event.clientX - rect.left - centerX) / centerX;
|
| 549 |
+
const relY = (event.clientY - rect.top - centerY) / centerY;
|
| 550 |
+
|
| 551 |
+
// Scale down for smoother movement (similar to conditions.py)
|
| 552 |
+
const scaleFactor = 0.05;
|
| 553 |
+
mouseState.x = relX * scaleFactor;
|
| 554 |
+
mouseState.y = -relY * scaleFactor; // Invert Y for intuitive camera control
|
| 555 |
+
|
| 556 |
+
// Update display
|
| 557 |
+
mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
|
| 558 |
+
|
| 559 |
+
// Send to server (throttled)
|
| 560 |
+
throttledSendMouseInput();
|
| 561 |
+
}
|
| 562 |
});
|
| 563 |
|
| 564 |
// Throttle mouse movement to avoid flooding the server
|
|
|
|
| 575 |
};
|
| 576 |
})();
|
| 577 |
|
| 578 |
+
// Toggle panel collapse/expand
|
| 579 |
+
function togglePanel(panelId) {
|
| 580 |
+
const panel = document.getElementById(panelId);
|
| 581 |
+
const button = panel.querySelector('.toggle-button');
|
| 582 |
+
|
| 583 |
+
if (panel.classList.contains('collapsed')) {
|
| 584 |
+
// Expand the panel
|
| 585 |
+
panel.classList.remove('collapsed');
|
| 586 |
+
button.textContent = '−'; // Minus sign
|
| 587 |
+
} else {
|
| 588 |
+
// Collapse the panel
|
| 589 |
+
panel.classList.add('collapsed');
|
| 590 |
+
button.textContent = '+'; // Plus sign
|
| 591 |
+
}
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
// Initialize the UI
|
| 595 |
+
resetUI();
|
| 596 |
+
|
| 597 |
+
// Make panel headers clickable
|
| 598 |
+
document.querySelectorAll('.panel-header').forEach(header => {
|
| 599 |
+
header.addEventListener('click', () => {
|
| 600 |
+
const panelId = header.parentElement.id;
|
| 601 |
+
togglePanel(panelId);
|
| 602 |
+
});
|
| 603 |
+
});
|
client/index.html
CHANGED
|
@@ -14,47 +14,124 @@
|
|
| 14 |
display: flex;
|
| 15 |
flex-direction: column;
|
| 16 |
align-items: center;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
.container {
|
| 20 |
-
width:
|
| 21 |
-
max-width:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
h1 {
|
| 26 |
-
color: #4CAF50;
|
| 27 |
-
text-align: center;
|
| 28 |
}
|
| 29 |
|
| 30 |
.game-area {
|
| 31 |
display: flex;
|
| 32 |
flex-direction: column;
|
| 33 |
align-items: center;
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
#game-canvas {
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
height: 360px;
|
| 42 |
object-fit: contain;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
.controls {
|
| 46 |
display: flex;
|
| 47 |
justify-content: space-between;
|
| 48 |
-
width:
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
-
.
|
|
|
|
| 53 |
background-color: #1E1E1E;
|
| 54 |
-
padding: 15px;
|
| 55 |
border-radius: 5px;
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
}
|
| 59 |
|
| 60 |
button {
|
|
@@ -130,14 +207,9 @@
|
|
| 130 |
}
|
| 131 |
|
| 132 |
.connection-info {
|
| 133 |
-
margin-top: 20px;
|
| 134 |
-
padding: 10px;
|
| 135 |
-
background-color: #1E1E1E;
|
| 136 |
-
border-radius: 5px;
|
| 137 |
font-family: monospace;
|
| 138 |
-
height:
|
| 139 |
overflow-y: auto;
|
| 140 |
-
width: 640px;
|
| 141 |
}
|
| 142 |
|
| 143 |
.log-entry {
|
|
@@ -155,13 +227,9 @@
|
|
| 155 |
padding: 5px;
|
| 156 |
border-radius: 3px;
|
| 157 |
font-family: monospace;
|
|
|
|
| 158 |
}
|
| 159 |
|
| 160 |
-
#mouse-tracking-area {
|
| 161 |
-
position: relative;
|
| 162 |
-
width: 640px;
|
| 163 |
-
height: 360px;
|
| 164 |
-
}
|
| 165 |
|
| 166 |
#mouse-position {
|
| 167 |
position: absolute;
|
|
@@ -172,13 +240,18 @@
|
|
| 172 |
padding: 5px;
|
| 173 |
border-radius: 3px;
|
| 174 |
font-family: monospace;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
}
|
| 176 |
</style>
|
| 177 |
</head>
|
| 178 |
<body>
|
| 179 |
<div class="container">
|
| 180 |
-
<h1>MatrixGame WebSocket Client</h1>
|
| 181 |
-
|
| 182 |
<div class="game-area">
|
| 183 |
<div id="mouse-tracking-area">
|
| 184 |
<img id="game-canvas" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" alt="Game Frame">
|
|
@@ -203,33 +276,51 @@
|
|
| 203 |
</div>
|
| 204 |
</div>
|
| 205 |
|
| 206 |
-
<div class="
|
| 207 |
-
|
| 208 |
-
<div class="
|
| 209 |
-
<div class="
|
| 210 |
-
<div
|
|
|
|
| 211 |
</div>
|
| 212 |
-
<div class="
|
| 213 |
-
<div
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
</div>
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
</div>
|
| 220 |
-
<div class="
|
| 221 |
-
<div
|
|
|
|
|
|
|
| 222 |
</div>
|
| 223 |
</div>
|
| 224 |
-
<p class="status">
|
| 225 |
-
W = Forward, S = Back, A = Left, D = Right<br>
|
| 226 |
-
Space = Jump, Shift = Attack<br>
|
| 227 |
-
Mouse = Look around
|
| 228 |
-
</p>
|
| 229 |
-
</div>
|
| 230 |
-
|
| 231 |
-
<div class="connection-info" id="connection-log">
|
| 232 |
-
<div class="log-entry">Waiting to connect...</div>
|
| 233 |
</div>
|
| 234 |
</div>
|
| 235 |
|
|
|
|
| 14 |
display: flex;
|
| 15 |
flex-direction: column;
|
| 16 |
align-items: center;
|
| 17 |
+
user-select: none; /* Disable text selection */
|
| 18 |
+
-webkit-user-select: none;
|
| 19 |
+
-moz-user-select: none;
|
| 20 |
+
-ms-user-select: none;
|
| 21 |
+
overflow-x: hidden;
|
| 22 |
}
|
| 23 |
|
| 24 |
.container {
|
| 25 |
+
width: 100%;
|
| 26 |
+
max-width: 100%;
|
| 27 |
+
display: flex;
|
| 28 |
+
flex-direction: column;
|
| 29 |
+
align-items: center;
|
|
|
|
|
|
|
|
|
|
| 30 |
}
|
| 31 |
|
| 32 |
.game-area {
|
| 33 |
display: flex;
|
| 34 |
flex-direction: column;
|
| 35 |
align-items: center;
|
| 36 |
+
width: 100%;
|
| 37 |
+
max-height: 85vh;
|
| 38 |
+
margin: 0;
|
| 39 |
+
position: relative;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
#mouse-tracking-area {
|
| 43 |
+
position: relative;
|
| 44 |
+
width: 100%;
|
| 45 |
+
height: auto;
|
| 46 |
+
cursor: pointer; /* Show cursor as pointer to encourage clicks */
|
| 47 |
+
display: flex;
|
| 48 |
+
justify-content: center;
|
| 49 |
+
align-items: center;
|
| 50 |
+
max-height: 85vh;
|
| 51 |
}
|
| 52 |
|
| 53 |
#game-canvas {
|
| 54 |
+
width: 100%;
|
| 55 |
+
height: auto;
|
| 56 |
+
max-height: 85vh;
|
|
|
|
| 57 |
object-fit: contain;
|
| 58 |
+
background-color: #000;
|
| 59 |
+
pointer-events: none; /* Prevent drag on the image */
|
| 60 |
+
-webkit-user-drag: none;
|
| 61 |
+
-khtml-user-drag: none;
|
| 62 |
+
-moz-user-drag: none;
|
| 63 |
+
-o-user-drag: none;
|
| 64 |
+
user-drag: none;
|
| 65 |
}
|
| 66 |
|
| 67 |
.controls {
|
| 68 |
display: flex;
|
| 69 |
justify-content: space-between;
|
| 70 |
+
width: 100%;
|
| 71 |
+
max-width: 1200px;
|
| 72 |
+
padding: 10px;
|
| 73 |
+
background-color: rgba(0, 0, 0, 0.5);
|
| 74 |
+
position: absolute;
|
| 75 |
+
bottom: 0;
|
| 76 |
+
z-index: 10;
|
| 77 |
+
box-sizing: border-box;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.panels-container {
|
| 81 |
+
display: flex;
|
| 82 |
+
width: 100%;
|
| 83 |
+
max-width: 1200px;
|
| 84 |
+
margin: 10px auto;
|
| 85 |
+
gap: 10px;
|
| 86 |
}
|
| 87 |
|
| 88 |
+
.panel {
|
| 89 |
+
flex: 1;
|
| 90 |
background-color: #1E1E1E;
|
|
|
|
| 91 |
border-radius: 5px;
|
| 92 |
+
overflow: hidden;
|
| 93 |
+
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
| 94 |
+
transition: height 0.3s ease;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.panel-header {
|
| 98 |
+
background-color: #272727;
|
| 99 |
+
padding: 10px 15px;
|
| 100 |
+
display: flex;
|
| 101 |
+
justify-content: space-between;
|
| 102 |
+
align-items: center;
|
| 103 |
+
cursor: pointer;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.panel-title {
|
| 107 |
+
font-weight: bold;
|
| 108 |
+
color: #4CAF50;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
.toggle-button {
|
| 112 |
+
background: none;
|
| 113 |
+
border: none;
|
| 114 |
+
color: #e0e0e0;
|
| 115 |
+
font-size: 18px;
|
| 116 |
+
cursor: pointer;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.toggle-button:focus {
|
| 120 |
+
outline: none;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.panel-content {
|
| 124 |
+
padding: 15px;
|
| 125 |
+
max-height: 300px;
|
| 126 |
+
overflow-y: auto;
|
| 127 |
+
transition: all 0.3s ease;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.collapsed .panel-content {
|
| 131 |
+
max-height: 0;
|
| 132 |
+
padding-top: 0;
|
| 133 |
+
padding-bottom: 0;
|
| 134 |
+
overflow: hidden;
|
| 135 |
}
|
| 136 |
|
| 137 |
button {
|
|
|
|
| 207 |
}
|
| 208 |
|
| 209 |
.connection-info {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
font-family: monospace;
|
| 211 |
+
height: 100%;
|
| 212 |
overflow-y: auto;
|
|
|
|
| 213 |
}
|
| 214 |
|
| 215 |
.log-entry {
|
|
|
|
| 227 |
padding: 5px;
|
| 228 |
border-radius: 3px;
|
| 229 |
font-family: monospace;
|
| 230 |
+
z-index: 20;
|
| 231 |
}
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
#mouse-position {
|
| 235 |
position: absolute;
|
|
|
|
| 240 |
padding: 5px;
|
| 241 |
border-radius: 3px;
|
| 242 |
font-family: monospace;
|
| 243 |
+
z-index: 20;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
@media (max-width: 768px) {
|
| 247 |
+
.panels-container {
|
| 248 |
+
flex-direction: column;
|
| 249 |
+
}
|
| 250 |
}
|
| 251 |
</style>
|
| 252 |
</head>
|
| 253 |
<body>
|
| 254 |
<div class="container">
|
|
|
|
|
|
|
| 255 |
<div class="game-area">
|
| 256 |
<div id="mouse-tracking-area">
|
| 257 |
<img id="game-canvas" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" alt="Game Frame">
|
|
|
|
| 276 |
</div>
|
| 277 |
</div>
|
| 278 |
|
| 279 |
+
<div class="panels-container">
|
| 280 |
+
<!-- Controls Panel -->
|
| 281 |
+
<div class="panel" id="controls-panel">
|
| 282 |
+
<div class="panel-header" onclick="togglePanel('controls-panel')">
|
| 283 |
+
<div class="panel-title">Keyboard Controls</div>
|
| 284 |
+
<button class="toggle-button">−</button>
|
| 285 |
</div>
|
| 286 |
+
<div class="panel-content">
|
| 287 |
+
<div class="key-indicators">
|
| 288 |
+
<div class="key-row">
|
| 289 |
+
<div id="key-w" class="key">W</div>
|
| 290 |
+
</div>
|
| 291 |
+
<div class="key-row">
|
| 292 |
+
<div id="key-a" class="key">A</div>
|
| 293 |
+
<div id="key-s" class="key">S</div>
|
| 294 |
+
<div id="key-d" class="key">D</div>
|
| 295 |
+
</div>
|
| 296 |
+
<div class="key-row">
|
| 297 |
+
<div id="key-space" class="key spacebar">SPACE</div>
|
| 298 |
+
</div>
|
| 299 |
+
<div class="key-row">
|
| 300 |
+
<div id="key-shift" class="key">SHIFT</div>
|
| 301 |
+
</div>
|
| 302 |
+
</div>
|
| 303 |
+
<p class="status">
|
| 304 |
+
W or ↑ = Forward, S or ↓ = Back, A or ← = Left, D or → = Right<br>
|
| 305 |
+
Space = Jump, Shift = Attack<br>
|
| 306 |
+
Click on game view to capture mouse (ESC to release)<br>
|
| 307 |
+
Mouse = Look around
|
| 308 |
+
</p>
|
| 309 |
</div>
|
| 310 |
+
</div>
|
| 311 |
+
|
| 312 |
+
<!-- Connection Log Panel -->
|
| 313 |
+
<div class="panel" id="log-panel">
|
| 314 |
+
<div class="panel-header" onclick="togglePanel('log-panel')">
|
| 315 |
+
<div class="panel-title">Connection Log</div>
|
| 316 |
+
<button class="toggle-button">−</button>
|
| 317 |
</div>
|
| 318 |
+
<div class="panel-content">
|
| 319 |
+
<div class="connection-info" id="connection-log">
|
| 320 |
+
<div class="log-entry">Waiting to connect...</div>
|
| 321 |
+
</div>
|
| 322 |
</div>
|
| 323 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
</div>
|
| 325 |
</div>
|
| 326 |
|
server.py
CHANGED
|
@@ -25,6 +25,16 @@ from PIL import Image
|
|
| 25 |
import cv2
|
| 26 |
from aiohttp import web, WSMsgType
|
| 27 |
from condtions import Bench_actions_76
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Configure logging
|
| 30 |
logging.basicConfig(
|
|
@@ -35,14 +45,23 @@ logger = logging.getLogger(__name__)
|
|
| 35 |
|
| 36 |
class FrameGenerator:
|
| 37 |
"""
|
| 38 |
-
|
| 39 |
-
|
| 40 |
"""
|
| 41 |
def __init__(self):
|
| 42 |
self.frame_width = 640
|
| 43 |
self.frame_height = 360
|
| 44 |
self.fps = 16
|
| 45 |
self.frame_count = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
self.scenes = {
|
| 47 |
'forest': self._load_scene_frames('forest'),
|
| 48 |
'desert': self._load_scene_frames('desert'),
|
|
@@ -54,6 +73,87 @@ class FrameGenerator:
|
|
| 54 |
'plain': self._load_scene_frames('plain')
|
| 55 |
}
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def _load_scene_frames(self, scene_name):
|
| 58 |
"""Load initial frames for a scene from asset directory"""
|
| 59 |
frames = []
|
|
@@ -72,7 +172,7 @@ class FrameGenerator:
|
|
| 72 |
|
| 73 |
# If no frames were loaded, create a default colored frame with text
|
| 74 |
if not frames:
|
| 75 |
-
frame = np.ones((self.frame_height, self.
|
| 76 |
# Add scene name as text
|
| 77 |
cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
|
| 78 |
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
|
|
@@ -80,9 +180,29 @@ class FrameGenerator:
|
|
| 80 |
|
| 81 |
return frames
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def get_next_frame(self, scene_name, keyboard_condition=None, mouse_condition=None):
|
| 84 |
"""
|
| 85 |
-
Generate the next frame based on current conditions.
|
| 86 |
|
| 87 |
Args:
|
| 88 |
scene_name: Name of the current scene
|
|
@@ -92,16 +212,134 @@ class FrameGenerator:
|
|
| 92 |
Returns:
|
| 93 |
JPEG bytes of the frame
|
| 94 |
"""
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
|
| 99 |
|
| 100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
frame_idx = self.frame_count % len(scene_frames)
|
| 102 |
frame = scene_frames[frame_idx].copy()
|
| 103 |
self.frame_count += 1
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
# If we have keyboard/mouse conditions, visualize them on the frame
|
| 106 |
if keyboard_condition:
|
| 107 |
# Visualize keyboard inputs (simple example)
|
|
@@ -122,15 +360,7 @@ class FrameGenerator:
|
|
| 122 |
cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
|
| 123 |
(self.frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
|
| 124 |
|
| 125 |
-
|
| 126 |
-
success, buffer = cv2.imencode('.jpg', frame)
|
| 127 |
-
if not success:
|
| 128 |
-
logger.error("Failed to encode frame as JPEG")
|
| 129 |
-
# Return a blank frame
|
| 130 |
-
blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
|
| 131 |
-
success, buffer = cv2.imencode('.jpg', blank)
|
| 132 |
-
|
| 133 |
-
return buffer.tobytes()
|
| 134 |
|
| 135 |
class GameSession:
|
| 136 |
"""
|
|
@@ -721,10 +951,56 @@ def parse_args() -> argparse.Namespace:
|
|
| 721 |
parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
|
| 722 |
parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
|
| 723 |
parser.add_argument("--path", type=str, default="", help="Base path for the server (for proxy setups)")
|
| 724 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
args = parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
loop = asyncio.get_event_loop()
|
| 729 |
app = loop.run_until_complete(init_app(base_path=args.path))
|
|
|
|
|
|
|
|
|
|
| 730 |
web.run_app(app, host=args.host, port=args.port)
|
|
|
|
| 25 |
import cv2
|
| 26 |
from aiohttp import web, WSMsgType
|
| 27 |
from condtions import Bench_actions_76
|
| 28 |
+
from einops import rearrange
|
| 29 |
+
from diffusers.utils import load_image
|
| 30 |
+
from diffusers.video_processor import VideoProcessor
|
| 31 |
+
from matrixgame.sample.pipeline_matrixgame import MatrixGameVideoPipeline
|
| 32 |
+
from matrixgame.model_variants import get_dit
|
| 33 |
+
from matrixgame.vae_variants import get_vae
|
| 34 |
+
from matrixgame.encoder_variants import get_text_enc
|
| 35 |
+
from matrixgame.model_variants.matrixgame_dit_src import MGVideoDiffusionTransformerI2V
|
| 36 |
+
from matrixgame.sample.flow_matching_scheduler_matrixgame import FlowMatchDiscreteScheduler
|
| 37 |
+
from teacache_forward import teacache_forward
|
| 38 |
|
| 39 |
# Configure logging
|
| 40 |
logging.basicConfig(
|
|
|
|
| 45 |
|
| 46 |
class FrameGenerator:
|
| 47 |
"""
|
| 48 |
+
Game frame generator using the MatrixGame model.
|
| 49 |
+
Generates frames based on keyboard and mouse inputs.
|
| 50 |
"""
|
| 51 |
def __init__(self):
|
| 52 |
self.frame_width = 640
|
| 53 |
self.frame_height = 360
|
| 54 |
self.fps = 16
|
| 55 |
self.frame_count = 0
|
| 56 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 57 |
+
self.weight_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 58 |
+
|
| 59 |
+
# Model paths (can be made configurable through arguments)
|
| 60 |
+
self.vae_path = os.environ.get("VAE_PATH", "./models/matrixgame/vae/")
|
| 61 |
+
self.dit_path = os.environ.get("DIT_PATH", "./models/matrixgame/dit/")
|
| 62 |
+
self.textenc_path = os.environ.get("TEXTENC_PATH", "./models/matrixgame")
|
| 63 |
+
|
| 64 |
+
# Cache scene initial frames (used as conditioning)
|
| 65 |
self.scenes = {
|
| 66 |
'forest': self._load_scene_frames('forest'),
|
| 67 |
'desert': self._load_scene_frames('desert'),
|
|
|
|
| 73 |
'plain': self._load_scene_frames('plain')
|
| 74 |
}
|
| 75 |
|
| 76 |
+
# Cache initial images for model input
|
| 77 |
+
self.scene_initial_images = {}
|
| 78 |
+
for scene_name, frames in self.scenes.items():
|
| 79 |
+
if frames:
|
| 80 |
+
# Use first frame as initial image
|
| 81 |
+
self.scene_initial_images[scene_name] = self._preprocess_image(frames[0])
|
| 82 |
+
|
| 83 |
+
# Initialize MatrixGame pipeline if CUDA is available
|
| 84 |
+
self.model_loaded = False
|
| 85 |
+
if torch.cuda.is_available():
|
| 86 |
+
try:
|
| 87 |
+
self._init_models()
|
| 88 |
+
self.model_loaded = True
|
| 89 |
+
logger.info("MatrixGame models loaded successfully")
|
| 90 |
+
except Exception as e:
|
| 91 |
+
logger.error(f"Failed to initialize MatrixGame models: {str(e)}")
|
| 92 |
+
logger.info("Falling back to frame cycling mode")
|
| 93 |
+
else:
|
| 94 |
+
logger.warning("CUDA not available. Using frame cycling mode only.")
|
| 95 |
+
|
| 96 |
+
def _init_models(self):
|
| 97 |
+
"""Initialize MatrixGame models (VAE, text encoder, transformer)"""
|
| 98 |
+
# Initialize flow matching scheduler
|
| 99 |
+
self.scheduler = FlowMatchDiscreteScheduler(
|
| 100 |
+
shift=15.0,
|
| 101 |
+
reverse=True,
|
| 102 |
+
solver="euler"
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
# Initialize VAE
|
| 106 |
+
try:
|
| 107 |
+
self.vae = get_vae("matrixgame", self.vae_path, self.weight_dtype)
|
| 108 |
+
self.vae.requires_grad_(False)
|
| 109 |
+
self.vae.eval()
|
| 110 |
+
self.vae.enable_tiling()
|
| 111 |
+
logger.info("VAE model loaded successfully")
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logger.error(f"Error loading VAE model: {str(e)}")
|
| 114 |
+
raise
|
| 115 |
+
|
| 116 |
+
# Initialize DIT (Transformer)
|
| 117 |
+
try:
|
| 118 |
+
dit = MGVideoDiffusionTransformerI2V.from_pretrained(self.dit_path)
|
| 119 |
+
dit.requires_grad_(False)
|
| 120 |
+
dit.eval()
|
| 121 |
+
logger.info("DIT model loaded successfully")
|
| 122 |
+
except Exception as e:
|
| 123 |
+
logger.error(f"Error loading DIT model: {str(e)}")
|
| 124 |
+
raise
|
| 125 |
+
|
| 126 |
+
# Initialize text encoder
|
| 127 |
+
try:
|
| 128 |
+
self.text_enc = get_text_enc('matrixgame', self.textenc_path, weight_dtype=self.weight_dtype, i2v_type='refiner')
|
| 129 |
+
logger.info("Text encoder loaded successfully")
|
| 130 |
+
except Exception as e:
|
| 131 |
+
logger.error(f"Error loading text encoder: {str(e)}")
|
| 132 |
+
raise
|
| 133 |
+
|
| 134 |
+
# Initialize pipeline
|
| 135 |
+
try:
|
| 136 |
+
self.pipeline = MatrixGameVideoPipeline(
|
| 137 |
+
vae=self.vae.vae,
|
| 138 |
+
text_encoder=self.text_enc,
|
| 139 |
+
transformer=dit,
|
| 140 |
+
scheduler=self.scheduler,
|
| 141 |
+
).to(self.weight_dtype).to(self.device)
|
| 142 |
+
logger.info("Pipeline initialized successfully")
|
| 143 |
+
except Exception as e:
|
| 144 |
+
logger.error(f"Error initializing pipeline: {str(e)}")
|
| 145 |
+
raise
|
| 146 |
+
|
| 147 |
+
# Configure teacache for the transformer
|
| 148 |
+
self.pipeline.transformer.__class__.enable_teacache = True
|
| 149 |
+
self.pipeline.transformer.__class__.cnt = 0
|
| 150 |
+
self.pipeline.transformer.__class__.num_steps = 20 # Reduced inference steps for real-time performance
|
| 151 |
+
self.pipeline.transformer.__class__.accumulated_rel_l1_distance = 0
|
| 152 |
+
self.pipeline.transformer.__class__.rel_l1_thresh = 0.075
|
| 153 |
+
self.pipeline.transformer.__class__.previous_modulated_input = None
|
| 154 |
+
self.pipeline.transformer.__class__.previous_residual = None
|
| 155 |
+
self.pipeline.transformer.__class__.forward = teacache_forward
|
| 156 |
+
|
| 157 |
def _load_scene_frames(self, scene_name):
|
| 158 |
"""Load initial frames for a scene from asset directory"""
|
| 159 |
frames = []
|
|
|
|
| 172 |
|
| 173 |
# If no frames were loaded, create a default colored frame with text
|
| 174 |
if not frames:
|
| 175 |
+
frame = np.ones((self.frame_height, self.frame_height, 3), dtype=np.uint8) * 100
|
| 176 |
# Add scene name as text
|
| 177 |
cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
|
| 178 |
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
|
|
|
|
| 180 |
|
| 181 |
return frames
|
| 182 |
|
| 183 |
+
def _preprocess_image(self, image_array):
    """Turn a raw frame into the stacked conditioning tensor the pipeline expects.

    Accepts either a numpy array (as produced by the scene loader) or a PIL
    image, resizes/normalizes it to the configured frame size with diffusers'
    VideoProcessor, then appends repeated copies of the same frame as
    synthetic "past" frames for temporal stability.

    Args:
        image_array: numpy ndarray or PIL Image holding the initial frame.

    Returns:
        torch.Tensor of shape (1 + num_pre_frames, C, H, W).
    """
    # Normalize the input to a PIL image.
    image = Image.fromarray(image_array) if isinstance(image_array, np.ndarray) else image_array

    # Derive the VAE scale factor from the loaded VAE when one is attached;
    # otherwise fall back to the conventional factor of 8.
    if hasattr(self, 'vae'):
        vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    else:
        vae_scale_factor = 8
    processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
    first_frame = processor.preprocess(image, height=self.frame_height, width=self.frame_width)

    # Repeat the same frame as history; kept small (3 frames) so interactive
    # generation stays responsive.
    num_pre_frames = 3
    history = first_frame.repeat(num_pre_frames, 1, 1, 1)
    return torch.cat([first_frame, history], dim=0)
def get_next_frame(self, scene_name, keyboard_condition=None, mouse_condition=None):
    """
    Generate the next frame for a scene and return it as encoded JPEG bytes.

    Two paths exist: when the MatrixGame model is loaded and CUDA is
    available, a short video clip is generated from the current keyboard
    and mouse state and its first frame is used; otherwise the method
    cycles through pre-loaded demo frames. In both paths the active
    controls are drawn onto the frame before JPEG encoding.

    Args:
        scene_name: Name of the current scene (key into self.scenes);
            unknown names fall back to the 'forest' scene.
        keyboard_condition: List of one 6-element list of key states
            (defaults to all zeros when None).
        mouse_condition: List of one [x, y] mouse-movement pair
            (defaults to [0, 0] when None).

    Returns:
        bytes: JPEG-encoded frame.
    """
    # Demo path: no model or no GPU — cycle through the scene's static frames.
    if not self.model_loaded or not torch.cuda.is_available():
        scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
        # NOTE(review): if scene_frames is an empty list this modulo raises
        # ZeroDivisionError — confirm scenes are always loaded with >= 1 frame.
        frame_idx = self.frame_count % len(scene_frames)
        # .copy() so the overlay text below does not mutate the cached frame.
        frame = scene_frames[frame_idx].copy()
        self.frame_count += 1

        # Overlay a demo-mode counter in the bottom-right corner.
        cv2.putText(frame, f"Demo mode: {self.frame_count}",
                    (self.frame_width - 200, self.frame_height - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (200, 200, 200), 1)
    else:
        # Model path: run the MatrixGame pipeline; on any failure fall back
        # to the demo frames rather than propagating the error to the client.
        try:
            # Initial image conditioning: this scene's, else forest's,
            # else give up and serve a fallback frame.
            initial_image = self.scene_initial_images.get(scene_name)
            if initial_image is None:
                initial_image = self.scene_initial_images.get('forest')
                if initial_image is None:
                    logger.error(f"No initial image available for scene {scene_name}")
                    return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

            # Default controls: 6 key states and a 2D mouse delta, all zero.
            if keyboard_condition is None:
                keyboard_condition = [[0, 0, 0, 0, 0, 0]]
            if mouse_condition is None:
                mouse_condition = [[0, 0]]

            # Build condition tensors on CPU first...
            keyboard_tensor = torch.tensor(keyboard_condition, dtype=torch.float32)
            mouse_tensor = torch.tensor(mouse_condition, dtype=torch.float32)

            # ...then cast to the model dtype and move to the model device.
            keyboard_tensor = keyboard_tensor.to(self.weight_dtype).to(self.device)
            mouse_tensor = mouse_tensor.to(self.weight_dtype).to(self.device)

            # Semantic conditioning image comes from the scene's stored frames.
            scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
            if not scene_frames:
                return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

            semantic_image = Image.fromarray(scene_frames[0])

            # Prefer the first frame that is actually a numpy array.
            # NOTE(review): when scene_frames[0] is an ndarray this loop just
            # recreates the same image — the two steps could be merged.
            for scene_frame in scene_frames:
                if isinstance(scene_frame, np.ndarray):
                    semantic_image = Image.fromarray(scene_frame)
                    break

            # Generate a very short clip (3 frames, 20 steps) for latency;
            # only the first frame is served. Fixed seed keeps output stable
            # across calls.
            with torch.no_grad():
                video = self.pipeline(
                    height=self.frame_height,
                    width=self.frame_width,
                    video_length=3,  # short clip for real-time speed
                    mouse_condition=mouse_tensor,
                    keyboard_condition=keyboard_tensor,
                    initial_image=initial_image,
                    num_inference_steps=20,  # reduced for real-time performance
                    guidance_scale=6.0,
                    embedded_guidance_scale=None,
                    data_type="video",
                    vae_ver='884-16c-hy',
                    enable_tiling=True,
                    generator=torch.Generator(device=self.device).manual_seed(42),
                    i2v_type='refiner',
                    semantic_images=semantic_image
                ).videos[0]

            # Take the first frame and convert to HWC uint8.
            # NOTE(review): assumes the video tensor is (T, C, H, W) with
            # values in [0, 1] — confirm against the pipeline's output format.
            video_frame = video[0].permute(1, 2, 0).cpu().numpy()
            video_frame = (video_frame * 255).astype(np.uint8)
            frame = video_frame

            self.frame_count += 1

        except Exception as e:
            logger.error(f"Error generating frame with MatrixGame model: {str(e)}")
            # Any model failure degrades gracefully to demo frames.
            return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)

    # Draw the current control state on the frame (both paths).
    frame = self._visualize_controls(frame, keyboard_condition, mouse_condition)

    # Encode to JPEG; on failure serve a plain gray frame instead.
    success, buffer = cv2.imencode('.jpg', frame)
    if not success:
        logger.error("Failed to encode frame as JPEG")
        blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
        success, buffer = cv2.imencode('.jpg', blank)

    return buffer.tobytes()
def _fallback_frame(self, scene_name, keyboard_condition, mouse_condition):
    """Produce a JPEG frame by cycling stored scene frames when model
    generation is unavailable or has failed.

    Args:
        scene_name: Name of the current scene; unknown names fall back to
            the 'forest' scene.
        keyboard_condition: Current keyboard state (may be None), passed
            through to the control overlay.
        mouse_condition: Current mouse state (may be None), passed through
            to the control overlay.

    Returns:
        bytes: JPEG-encoded fallback frame.
    """
    scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
    if scene_frames:
        frame_idx = self.frame_count % len(scene_frames)
        # .copy() so overlay text does not mutate the cached frame.
        frame = scene_frames[frame_idx].copy()
    else:
        # Fix: the original computed `frame_count % len(scene_frames)`
        # unconditionally, raising ZeroDivisionError for a scene with no
        # frames. Synthesize a plain gray frame instead.
        frame = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
    self.frame_count += 1

    # Mark the frame so clients can tell fallback output from model output.
    cv2.putText(frame, "Fallback mode",
                (10, self.frame_height - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # Draw the current control state on the frame.
    frame = self._visualize_controls(frame, keyboard_condition, mouse_condition)

    # Encode to JPEG; on failure serve a plain gray frame instead.
    success, buffer = cv2.imencode('.jpg', frame)
    if not success:
        logger.error("Failed to encode fallback frame as JPEG")
        blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
        success, buffer = cv2.imencode('.jpg', blank)

    return buffer.tobytes()
def _visualize_controls(self, frame, keyboard_condition, mouse_condition):
|
| 342 |
+
"""Visualize keyboard and mouse controls on the frame"""
|
| 343 |
# If we have keyboard/mouse conditions, visualize them on the frame
|
| 344 |
if keyboard_condition:
|
| 345 |
# Visualize keyboard inputs (simple example)
|
|
|
|
| 360 |
cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
|
| 361 |
(self.frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
|
| 362 |
|
| 363 |
+
return frame
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
class GameSession:
|
| 366 |
"""
|
|
|
|
| 951 |
parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
|
| 952 |
parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
|
| 953 |
parser.add_argument("--path", type=str, default="", help="Base path for the server (for proxy setups)")
|
| 954 |
+
|
| 955 |
+
# Model paths
|
| 956 |
+
parser.add_argument("--model_root", type=str, default="./models/matrixgame",
|
| 957 |
+
help="Root directory for model files")
|
| 958 |
+
parser.add_argument("--dit_path", type=str, default=None,
|
| 959 |
+
help="Path to DIT model. If not provided, will use MODEL_ROOT/dit/")
|
| 960 |
+
parser.add_argument("--vae_path", type=str, default=None,
|
| 961 |
+
help="Path to VAE model. If not provided, will use MODEL_ROOT/vae/")
|
| 962 |
+
parser.add_argument("--textenc_path", type=str, default=None,
|
| 963 |
+
help="Path to text encoder model. If not provided, will use MODEL_ROOT")
|
| 964 |
+
|
| 965 |
+
args = parser.parse_args()
|
| 966 |
+
|
| 967 |
+
# Set environment variables for model paths if provided
|
| 968 |
+
if args.model_root:
|
| 969 |
+
os.environ.setdefault("MODEL_ROOT", args.model_root)
|
| 970 |
+
if args.dit_path:
|
| 971 |
+
os.environ.setdefault("DIT_PATH", args.dit_path)
|
| 972 |
+
else:
|
| 973 |
+
os.environ.setdefault("DIT_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "dit/"))
|
| 974 |
+
if args.vae_path:
|
| 975 |
+
os.environ.setdefault("VAE_PATH", args.vae_path)
|
| 976 |
+
else:
|
| 977 |
+
os.environ.setdefault("VAE_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "vae/"))
|
| 978 |
+
if args.textenc_path:
|
| 979 |
+
os.environ.setdefault("TEXTENC_PATH", args.textenc_path)
|
| 980 |
+
else:
|
| 981 |
+
os.environ.setdefault("TEXTENC_PATH", os.environ.get("MODEL_ROOT", "./models/matrixgame"))
|
| 982 |
+
|
| 983 |
+
return args
|
| 984 |
|
| 985 |
if __name__ == '__main__':
    # Allow CUDA to grow allocations instead of reserving fixed segments,
    # reducing fragmentation for the large video models. setdefault keeps
    # any value the operator exported themselves.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    # Parse command line arguments (also seeds MODEL_ROOT/DIT/VAE/TEXTENC
    # environment variables as a side effect).
    args = parse_args()

    # Log GPU availability so operators can tell model mode from demo mode.
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
        logger.info(f"CUDA is available. Found {gpu_count} GPU(s). Using: {gpu_name}")
    else:
        logger.warning("CUDA is not available. Running in CPU-only mode. Model generation disabled.")

    # Build the aiohttp application. Fix: asyncio.get_event_loop() is
    # deprecated when no loop is running (Python 3.10+) and unreliable on
    # 3.12+; create a fresh loop explicitly and register it so aiohttp
    # reuses it.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    app = loop.run_until_complete(init_app(base_path=args.path))

    # Start the WebSocket server (blocks until shutdown).
    logger.info(f"Starting MatrixGame WebSocket Server at {args.host}:{args.port}")
    web.run_app(app, host=args.host, port=args.port)