""" Reachy Mini Controller A centralized server that listens for Robot connections and hosts a Gradio control interface. """ import asyncio import threading import time import queue from dataclasses import dataclass from typing import Optional, Tuple import cv2 import gradio as gr import numpy as np from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.responses import StreamingResponse import uvicorn from fastrtc import WebRTC, StreamHandler, get_cloudflare_turn_credentials from huggingface_hub import get_token from reachy_mini.utils import create_head_pose token = get_token() if not token: raise ValueError("No token found. Please set the HF_TOKEN environment variable or login to Hugging Face.") # ------------------------------------------------------------------- # 1. Configuration # ------------------------------------------------------------------- AUDIO_SAMPLE_RATE = 16000 # respeaker samplerate # Audio queue configuration MAX_AUDIO_QUEUE_SIZE = 2 # Movement step sizes NUDGE_ANGLE = 5.0 # degrees for head roll / yaw NUDGE_BODY = 0.3 # degrees for body_yaw NUDGE_PITCH = 5.0 # degrees for pitch # Video loop timing FRAME_SLEEP_S = 0.04 # 25 fps # TURN config TURN_TTL_SERVER_MS = 360_000 USE_VIDEO_WEBRTC = False USE_AUDIO_WEBRTC = False turn_credentials = None server_turn_credentials = None while turn_credentials is None or server_turn_credentials is None: try: if turn_credentials is None: turn_credentials = get_cloudflare_turn_credentials(hf_token=token) if server_turn_credentials is None: server_turn_credentials = get_cloudflare_turn_credentials(ttl=TURN_TTL_SERVER_MS, hf_token=token) except Exception as e: print(f"[Video] Error getting turn credentials: {e!r}") time.sleep(1) # ------------------------------------------------------------------- # 2. Data Models # ------------------------------------------------------------------- @dataclass class Movement: name: str x: float = 0 y: float = 0 z: float = 0 roll: float = 0 pitch: float = 0 yaw: float = 0 body_yaw: float = 0 left_antenna: Optional[float] = None right_antenna: Optional[float] = None duration: float = 1.0 # ------------------------------------------------------------------- # 3. Global State # ------------------------------------------------------------------- class GlobalState: """ Singleton-style class to manage shared state between FastAPI (WebSockets) and Gradio (UI). 
""" def __init__(self): # Connection handles self.robot_ws: Optional[WebSocket] = None self.robot_loop: Optional[asyncio.AbstractEventLoop] = None # Video Stream Data self.frame_lock = threading.Lock() self.black_frame = np.zeros((640, 640, 3), dtype=np.uint8) _, buffer = cv2.imencode(".jpg", self.black_frame) self.latest_frame_bytes = buffer.tobytes() self.latest_frame_ts = time.time() # Latency tracking for video self.video_latencies = [] self.video_latency_window = 100 # Keep last 100 measurements # Audio from robot -> browser # Queue of (sample_rate: int, audio_bytes: bytes, timestamp: float) self.audio_queue: "queue.Queue[Tuple[int, bytes, float]]" = queue.Queue() # Audio from operator -> robot self.audio_to_robot_queue: "queue.Queue[bytes]" = queue.Queue() # Latency tracking for audio self.audio_latencies = [] self.audio_latency_window = 100 # Keep last 100 measurements # Live pose state self.pose_lock = threading.Lock() self.current_pose = Movement( name="Current", x=0, y=0, z=0, roll=0, pitch=0, yaw=0, body_yaw=0, left_antenna=0, right_antenna=0, duration=0.2, ) # --- Connection management --- def set_robot_connection(self, ws: WebSocket, loop: asyncio.AbstractEventLoop) -> None: self.robot_ws = ws self.robot_loop = loop def clear_robot_connection(self) -> None: self.robot_ws = None self.robot_loop = None # --- Video --- def update_frame(self, frame_bytes: bytes, robot_timestamp: Optional[float] = None) -> None: """ Update the latest video frame. Args: frame_bytes: JPEG encoded frame data robot_timestamp: Optional timestamp from when the frame was captured on the robot """ receive_time = time.time() with self.frame_lock: self.latest_frame_bytes = frame_bytes self.latest_frame_ts = receive_time # Calculate latency if timestamp provided if robot_timestamp is not None: latency_ms = (receive_time - robot_timestamp) * 1000 self.video_latencies.append(latency_ms) # Keep only recent measurements if len(self.video_latencies) > self.video_latency_window: self.video_latencies.pop(0) def get_video_latency_stats(self) -> dict: """Get video latency statistics in milliseconds.""" if not self.video_latencies: return {"min": 0, "max": 0, "avg": 0, "latest": 0, "count": 0} return { "min": min(self.video_latencies), "max": max(self.video_latencies), "avg": sum(self.video_latencies) / len(self.video_latencies), "latest": self.video_latencies[-1], "count": len(self.video_latencies) } # --- Audio queues --- @staticmethod def _push_bounded(q: queue.Queue, item, max_size: int, description: str) -> None: while q.qsize() >= max_size: try: dropped = q.get_nowait() del dropped except queue.Empty: break q.put(item) def push_audio_from_robot(self, audio_bytes: bytes, robot_timestamp: Optional[float] = None) -> None: """ Push audio data from robot to the queue for browser playback. 
Args: audio_bytes: Audio data bytes robot_timestamp: Optional timestamp from when audio was captured on robot """ self._push_bounded( self.audio_queue, (AUDIO_SAMPLE_RATE, audio_bytes, robot_timestamp if robot_timestamp is not None else time.time()), MAX_AUDIO_QUEUE_SIZE, "FROM robot", ) def push_audio_to_robot(self, audio_bytes: bytes) -> None: self._push_bounded( self.audio_to_robot_queue, audio_bytes, MAX_AUDIO_QUEUE_SIZE, "TO robot", ) def get_audio_to_robot_blocking(self) -> bytes: try: return self.audio_to_robot_queue.get(timeout=0.2) except queue.Empty: return None def track_audio_latency(self, robot_timestamp: float) -> None: """Track audio latency when audio is about to be played.""" playback_time = time.time() latency_ms = (playback_time - robot_timestamp) * 1000 self.audio_latencies.append(latency_ms) # Keep only recent measurements if len(self.audio_latencies) > self.audio_latency_window: self.audio_latencies.pop(0) def get_audio_latency_stats(self) -> dict: """Get audio latency statistics in milliseconds.""" if not self.audio_latencies: return {"min": 0, "max": 0, "avg": 0, "latest": 0, "count": 0} return { "min": min(self.audio_latencies), "max": max(self.audio_latencies), "avg": sum(self.audio_latencies) / len(self.audio_latencies), "latest": self.audio_latencies[-1], "count": len(self.audio_latencies) } # --- Status --- def get_connection_status(self) -> str: return "✅ Robot Connected" if self.robot_ws else "🔴 Waiting for Robot..." def get_latency_display(self) -> str: """Get formatted latency statistics for display.""" video_stats = self.get_video_latency_stats() audio_stats = self.get_audio_latency_stats() lines = [] if video_stats["count"] > 0: lines.append( f"📹 Video: {video_stats['latest']:.0f}ms " f"(avg: {video_stats['avg']:.0f}ms, max: {video_stats['max']:.0f}ms)" ) if audio_stats["count"] > 0: lines.append( f"đŸŽĩ Audio: {audio_stats['latest']:.0f}ms " f"(avg: {audio_stats['avg']:.0f}ms, max: {audio_stats['max']:.0f}ms)" ) if not lines: return "âąī¸ Latency: Waiting for data..." return "\n".join(lines) # --- Pose management --- def update_pose( self, dx: float = 0, dy: float = 0, dz: float = 0, droll: float = 0, dpitch: float = 0, dyaw: float = 0, dbody_yaw: float = 0, ) -> Movement: with self.pose_lock: p = self.current_pose new = Movement( name="Current", x=p.x + dx, y=p.y + dy, z=p.z + dz, roll=p.roll + droll, pitch=p.pitch + dpitch, yaw=p.yaw + dyaw, body_yaw=p.body_yaw + dbody_yaw, left_antenna=p.left_antenna, right_antenna=p.right_antenna, duration=0.1, ) # Clamp posed values new.pitch = float(np.clip(new.pitch, -30, 30)) new.yaw = float(np.clip(new.yaw, -180, 180)) new.roll = float(np.clip(new.roll, -40, 40)) new.body_yaw = float(np.clip(new.body_yaw, -3, 3)) new.z = float(np.clip(new.z, -20, 50)) new.x = float(np.clip(new.x, -50, 50)) new.y = float(np.clip(new.y, -50, 50)) self.current_pose = new return new def reset_pose(self) -> Movement: with self.pose_lock: self.current_pose = Movement( name="Current", x=0, y=0, z=0, roll=0, pitch=0, yaw=0, body_yaw=0, left_antenna=0, right_antenna=0, duration=0.3, ) return self.current_pose def get_pose_text(self) -> str: with self.pose_lock: p = self.current_pose return ( "Head position:\n" f" x={p.x:.1f}, y={p.y:.1f}, z={p.z:.1f}\n" f" roll={p.roll:.1f}, pitch={p.pitch:.1f}, yaw={p.yaw:.1f}\n" "Body:\n" f" body_yaw={p.body_yaw:.1f}" ) state = GlobalState() # ------------------------------------------------------------------- # 4. 
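
# Illustrative sketch (not called anywhere in the app): demonstrates the
# drop-oldest behaviour of the bounded audio queues above. With
# MAX_AUDIO_QUEUE_SIZE = 2, pushing faster than the consumer drains keeps only
# the newest chunks, which is what keeps playback close to real time.
def _demo_bounded_queue_behavior() -> None:
    demo_state = GlobalState()  # throwaway instance, separate from `state`
    for i in range(4):
        # Each push becomes a (sample_rate, bytes, timestamp) tuple internally.
        demo_state.push_audio_from_robot(bytes([i]) * 32, robot_timestamp=time.time())
    # Only the newest MAX_AUDIO_QUEUE_SIZE chunks remain; older ones were dropped.
    assert demo_state.audio_queue.qsize() == MAX_AUDIO_QUEUE_SIZE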
# -------------------------------------------------------------------
# 4. Robot commands
# -------------------------------------------------------------------
def send_pose_to_robot(mov: Movement, msg: str = "Move sent"):
    if not (state.robot_ws and state.robot_loop):
        return state.get_pose_text(), "⚠️ Robot not connected"

    pose = create_head_pose(
        x=mov.x,
        y=mov.y,
        z=mov.z,
        roll=mov.roll,
        pitch=mov.pitch,
        yaw=mov.yaw,
        degrees=True,
        mm=True,
    )
    payload = {
        "type": "movement",
        "movement": {
            "head": pose.tolist(),
            "body_yaw": mov.body_yaw,
            "duration": mov.duration,
        },
    }
    if mov.left_antenna is not None and mov.right_antenna is not None:
        payload["movement"]["antennas"] = [
            np.deg2rad(mov.right_antenna),
            np.deg2rad(mov.left_antenna),
        ]

    asyncio.run_coroutine_threadsafe(
        state.robot_ws.send_json(payload),
        state.robot_loop,
    )
    return state.get_pose_text(), f"✅ {msg}"

# -------------------------------------------------------------------
# 5. Video streaming helpers
# -------------------------------------------------------------------
def generate_mjpeg_stream():
    last_timestamp = 0.0
    frame_count = 0
    start_time = time.time()
    while True:
        with state.frame_lock:
            current_bytes = state.latest_frame_bytes
            current_timestamp = state.latest_frame_ts

        if current_timestamp > last_timestamp and current_bytes is not None:
            last_timestamp = current_timestamp
            frame_count += 1
            elapsed = time.time() - start_time
            if elapsed > 1.0:
                fps = frame_count / elapsed
                print(f"[generate_mjpeg_stream] Current FPS: {fps:.2f}")
                frame_count = 0
                start_time = time.time()
            yield (
                b"--frame\r\n"
                b"Content-Type: image/jpeg\r\n\r\n" + current_bytes + b"\r\n"
            )
        else:
            time.sleep(FRAME_SLEEP_S)
            continue
        time.sleep(FRAME_SLEEP_S)

def webrtc_video_generator():
    """
    Generator for FastRTC WebRTC (mode='receive', modality='video').
    """
    last_ts = 0.0
    frame = state.black_frame.copy()
    frame_count = 0
    start_time = time.time()
    while True:
        with state.frame_lock:
            ts = state.latest_frame_ts
            frame_bytes = state.latest_frame_bytes

        if ts > last_ts and frame_bytes:
            last_ts = ts
            np_bytes = np.frombuffer(frame_bytes, dtype=np.uint8)
            decoded = cv2.imdecode(np_bytes, cv2.IMREAD_COLOR)
            if decoded is not None:
                frame = decoded
            else:
                frame = state.black_frame.copy()

            frame_count += 1
            elapsed = time.time() - start_time
            if elapsed > 1.0:
                fps = frame_count / elapsed
                print(f"[webrtc_video_generator] Current FPS: {fps:.2f}")
                frame_count = 0
                start_time = time.time()

        yield frame
        time.sleep(FRAME_SLEEP_S)

# -------------------------------------------------------------------
# 6. FastAPI endpoints
# -------------------------------------------------------------------
app = FastAPI()

@app.websocket("/robot")
async def robot_endpoint(ws: WebSocket):
    """Endpoint for the Robot to connect to (control channel)."""
    await ws.accept()
    state.set_robot_connection(ws, asyncio.get_running_loop())
    print("[System] Robot Connected")
    try:
        while True:
            msg = await ws.receive()
            if msg.get("type") == "websocket.disconnect":
                break
    except (WebSocketDisconnect, Exception):
        print("[System] Robot Disconnected")
    finally:
        state.clear_robot_connection()

@app.get("/video_feed")
def video_feed():
    return StreamingResponse(
        generate_mjpeg_stream(),
        media_type="multipart/x-mixed-replace; boundary=frame",
    )
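
# Illustrative sketch (not part of the server): one way to sanity-check the
# /video_feed MJPEG endpoint from another machine with OpenCV. The URL is an
# assumption -- substitute the host this server actually runs on -- and it
# requires an OpenCV build with FFmpeg/GStreamer support for HTTP streams.
def _example_view_video_feed(url: str = "http://localhost:7860/video_feed") -> None:
    cap = cv2.VideoCapture(url)
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        cv2.imshow("reachy", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    cap.release()
    cv2.destroyAllWindows()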
""" # Clear old data to start fresh with state.audio_queue.mutex: state.audio_queue.queue.clear() # OPTIMIZATION: Lower latency target (e.g., 512 samples = ~32ms) TARGET_SAMPLES = 512 byte_buffer = bytearray() while True: try: # Wait for data sample_rate, chunk_bytes, robot_timestamp = state.audio_queue.get(timeout=1.0) # Track latency if robot_timestamp is not None: state.track_audio_latency(robot_timestamp) if chunk_bytes: byte_buffer.extend(chunk_bytes) except queue.Empty: # Send a tiny silence frame to keep the connection alive if needed # or just continue. continue # Yield chunks suitable for the network # 1024 bytes (512 samples) is a good balance for TCP packets chunk_size = TARGET_SAMPLES * 2 while len(byte_buffer) >= chunk_size: out_bytes = byte_buffer[:chunk_size] byte_buffer = byte_buffer[chunk_size:] yield bytes(out_bytes) @app.get("/audio_feed") def audio_feed(): """Endpoint for streaming Raw PCM.""" return StreamingResponse( generate_audio_stream(), media_type="application/octet-stream", # Changed from audio/wav headers={ "Cache-Control": "no-cache", "X-Content-Type-Options": "nosniff", } ) @app.websocket("/video_stream") async def stream_endpoint(ws: WebSocket): """ Endpoint for Robot/Sim to send video frames. Expected message formats: 1. Binary only (legacy): Just the JPEG frame bytes 2. JSON with timestamp: {"timestamp": , "frame": } """ await ws.accept() frame_count = 0 start_time = time.time() latency_report_interval = 5.0 # Report latency stats every 5 seconds last_latency_report = time.time() try: while True: msg = await ws.receive() # Handle binary-only messages (legacy mode, no timestamp) data = msg.get("bytes") if data: state.update_frame(data, robot_timestamp=None) frame_count += 1 # Handle text/JSON messages with timestamp text_data = msg.get("text") if text_data: import base64 import json try: json_data = json.loads(text_data) timestamp = json_data.get("timestamp") frame_b64 = json_data.get("frame") if frame_b64: frame_bytes = base64.b64decode(frame_b64) state.update_frame(frame_bytes, robot_timestamp=timestamp) frame_count += 1 except (json.JSONDecodeError, KeyError) as e: print(f"[Video] Error parsing JSON: {e}") # FPS reporting elapsed = time.time() - start_time if elapsed > 1.0: fps = frame_count / elapsed print(f"[Video] Receiving FPS: {fps:.2f}") frame_count = 0 start_time = time.time() # Latency reporting if time.time() - last_latency_report > latency_report_interval: stats = state.get_video_latency_stats() if stats["count"] > 0: print( f"[Video Latency] " f"Latest: {stats['latest']:.1f}ms, " f"Avg: {stats['avg']:.1f}ms, " f"Min: {stats['min']:.1f}ms, " f"Max: {stats['max']:.1f}ms " f"(over {stats['count']} frames)" ) last_latency_report = time.time() except WebSocketDisconnect as e: print(f"[Video] WebSocketDisconnect: code={e.code}, reason={e.reason}") except asyncio.CancelledError: print("[Video] stream_endpoint cancelled") except Exception as e: print(f"[Video] stream_endpoint closed with error: {e!r}") finally: print("[Video] stream_endpoint closed (finally)") @app.websocket("/audio_stream") async def audio_endpoint(ws: WebSocket): """ Full duplex audio channel between Robot/Sim and server. Expected message formats from robot: 1. Binary only (legacy): Just the audio bytes 2. 
@app.websocket("/audio_stream")
async def audio_endpoint(ws: WebSocket):
    """
    Full duplex audio channel between Robot/Sim and server.

    Expected message formats from robot:
    1. Binary only (legacy): Just the audio bytes
    2. JSON with timestamp: {"timestamp": <unix time>, "audio": <base64 PCM>}
    """
    await ws.accept()
    print("[Audio] Stream Connected")

    latency_report_interval = 5.0  # Report latency stats every 5 seconds
    last_latency_report = time.time()

    async def robot_to_server():
        nonlocal last_latency_report
        try:
            while True:
                data = await ws.receive()
                t = data.get("type")
                if t == "websocket.disconnect":
                    print("[Audio] Disconnected (recv)")
                    break
                if t == "websocket.receive":
                    # Handle binary-only messages (legacy mode, no timestamp)
                    if data.get("bytes"):
                        state.push_audio_from_robot(data["bytes"], robot_timestamp=None)
                    # Handle JSON messages with timestamp
                    elif data.get("text"):
                        text_data = data.get("text")
                        if text_data == "ping":
                            print("[Audio] Received ping")
                        else:
                            import json
                            import base64
                            try:
                                json_data = json.loads(text_data)
                                timestamp = json_data.get("timestamp")
                                audio_b64 = json_data.get("audio")
                                if audio_b64:
                                    audio_bytes = base64.b64decode(audio_b64)
                                    state.push_audio_from_robot(audio_bytes, robot_timestamp=timestamp)
                            except (json.JSONDecodeError, KeyError):
                                pass

                # Latency reporting
                if time.time() - last_latency_report > latency_report_interval:
                    stats = state.get_audio_latency_stats()
                    if stats["count"] > 0:
                        print(
                            f"[Audio Latency] "
                            f"Latest: {stats['latest']:.1f}ms, "
                            f"Avg: {stats['avg']:.1f}ms, "
                            f"Min: {stats['min']:.1f}ms, "
                            f"Max: {stats['max']:.1f}ms "
                            f"(over {stats['count']} chunks)"
                        )
                    last_latency_report = time.time()
        except asyncio.CancelledError:
            print("[Audio] robot_to_server cancelled")
        except Exception as e:
            print(f"[Audio] robot_to_server error: {e}")

    async def server_to_robot():
        loop = asyncio.get_running_loop()
        try:
            while True:
                chunk: bytes = await loop.run_in_executor(
                    None, state.get_audio_to_robot_blocking
                )
                if chunk is not None:
                    await ws.send_bytes(chunk)
        except asyncio.CancelledError:
            print("[Audio] server_to_robot cancelled")
        except Exception as e:
            print(f"[Audio] server_to_robot error: {e}")

    try:
        await asyncio.gather(robot_to_server(), server_to_robot())
    except asyncio.CancelledError:
        print("[Audio] audio_endpoint cancelled")
    finally:
        print("[Audio] Stream Closed")

@app.websocket("/browser_stream")
async def browser_stream_endpoint(ws: WebSocket):
    """
    Bi-directional connection for the Browser.
    - Sends Microphone data (Browser -> Robot)
    - Receives Speaker data (Robot -> Browser)
    """
    await ws.accept()
    print("[Browser] WebSocket Connected")

    # Task: Send Audio FROM Robot TO Browser
    async def send_to_browser():
        while True:
            # Get audio from the robot queue (non-blocking check)
            if not state.audio_queue.empty():
                _, chunk_bytes, robot_timestamp = state.audio_queue.get(timeout=0.5)

                # Track latency if we have a timestamp
                if robot_timestamp is not None:
                    state.track_audio_latency(robot_timestamp)

                try:
                    # Send as binary message
                    await ws.send_bytes(chunk_bytes)
                except Exception:
                    break
            else:
                await asyncio.sleep(0.005)  # Tiny sleep to prevent CPU burn

    # Task: Receive Audio FROM Browser TO Robot
    async def receive_from_browser():
        try:
            while True:
                data = await ws.receive_bytes()
                # Push directly to the robot's input queue
                state.push_audio_to_robot(data)
        except Exception as e:
            print(f"[Browser] Input stream ended: {e}")

    try:
        # Run both tasks concurrently
        await asyncio.gather(send_to_browser(), receive_from_browser())
    except Exception as e:
        print(f"[Browser] WebSocket Closed: {e}")
    finally:
        print("[Browser] Disconnected")
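
# Illustrative sketch (robot/sim side, not executed here): a full-duplex client
# for the /audio_stream endpoint above. Uplink uses the JSON-with-timestamp
# format; downlink binary frames are raw Int16 PCM for the robot speaker. The
# `websockets` package and the host "localhost:7860" are assumptions, and a
# block of silence stands in for real microphone capture.
async def _example_audio_stream_client(host: str = "localhost:7860") -> None:
    import base64
    import json

    import websockets  # assumed to be installed on the robot side

    silence = np.zeros(512, dtype=np.int16).tobytes()  # stand-in for a mic chunk

    async with websockets.connect(f"ws://{host}/audio_stream") as ws:

        async def uplink() -> None:
            while True:
                msg = {
                    "timestamp": time.time(),
                    "audio": base64.b64encode(silence).decode("ascii"),
                }
                await ws.send(json.dumps(msg))
                await asyncio.sleep(512 / AUDIO_SAMPLE_RATE)  # pace at real time

        async def downlink() -> None:
            while True:
                data = await ws.recv()
                if isinstance(data, bytes):
                    # Raw Int16 PCM from the operator's browser; a real robot
                    # would hand this buffer to its speaker here.
                    _ = np.frombuffer(data, dtype=np.int16)

        await asyncio.gather(uplink(), downlink())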
# -------------------------------------------------------------------
# 7. FastRTC audio handler
# -------------------------------------------------------------------
class RobotAudioHandler(StreamHandler):
    """
    FastRTC handler that connects browser WebRTC audio to the robot.
    - receive(): browser mic -> state.audio_to_robot_queue -> /audio_stream -> robot
    - emit(): state.audio_queue (robot) -> browser playback
    """

    def __init__(self) -> None:
        super().__init__(
            input_sample_rate=AUDIO_SAMPLE_RATE,
            output_sample_rate=AUDIO_SAMPLE_RATE,
        )

    def receive(self, frame: Tuple[int, np.ndarray]) -> None:
        if frame is None:
            return
        sample_rate, array = frame
        if array is None:
            return

        arr = np.asarray(array)
        # Ensure mono
        if arr.ndim > 1:
            arr = arr[0]
        if arr.dtype != np.int16:
            if np.issubdtype(arr.dtype, np.floating):
                arr = np.clip(arr, -1.0, 1.0)
                arr = (arr * 32767.0).astype(np.int16)
            else:
                arr = arr.astype(np.int16)

        state.push_audio_to_robot(arr.tobytes())

    def emit(self):
        try:
            sample_rate, frame_bytes, robot_timestamp = state.audio_queue.get(timeout=0.5)

            # Track latency if we have a timestamp
            if robot_timestamp is not None:
                state.track_audio_latency(robot_timestamp)

            audio = np.frombuffer(frame_bytes, dtype=np.int16).reshape(1, -1)
            return sample_rate, audio
        except queue.Empty:
            return None

    def copy(self) -> "RobotAudioHandler":
        return RobotAudioHandler()

    def shutdown(self) -> None:
        pass

    def start_up(self) -> None:
        pass

# -------------------------------------------------------------------
# 8. Movement UI helpers
# -------------------------------------------------------------------
def get_pose_string():
    """Returns pose in format JS can parse: pitch:X,yaw:Y,roll:Z,body:B"""
    with state.pose_lock:
        p = state.current_pose
        return f"pitch:{p.pitch:.1f},yaw:{p.yaw:.1f},roll:{p.roll:.1f},body:{p.body_yaw:.1f}"

def nudge_pose(dpitch=0, dyaw=0, droll=0, dbody_yaw=0, label="Move"):
    """Modified to return pose string instead of tuple."""
    mov = state.update_pose(
        dpitch=dpitch,
        dyaw=dyaw,
        droll=droll,
        dbody_yaw=dbody_yaw,
    )
    send_pose_to_robot(mov, label)
    return get_pose_string()

def center_pose():
    """Modified to return pose string."""
    mov = state.reset_pose()
    send_pose_to_robot(mov, "Reset pose")
    return get_pose_string()

# -------------------------------------------------------------------
# 9.
Gradio UI # ------------------------------------------------------------------- CUSTOM_CSS = """ /* Dark theme overrides */ .gradio-container { background: linear-gradient(135deg, #0a0a0f 0%, #121218 100%) !important; min-height: 100vh; } .dark { --background-fill-primary: #12121a !important; --background-fill-secondary: #1a1a24 !important; --border-color-primary: #2a2a3a !important; --text-color-subdued: #888 !important; } /* Header styling */ #header-row { background: transparent !important; border: none !important; margin-bottom: 1rem; display: flex !important; justify-content: space-between !important; align-items: flex-start !important; } #app-title { font-size: 1.5rem !important; font-weight: 600 !important; background: linear-gradient(90deg, #fff, #888) !important; -webkit-background-clip: text !important; -webkit-text-fill-color: transparent !important; border: none !important; padding: 0 !important; margin: 0 !important; } /* Status badge */ #status-box { flex-shrink: 0 !important; width: auto !important; max-width: 200px !important; min-width: 160px !important; background: rgba(16, 185, 129, 0.15) !important; border: 1px solid rgba(16, 185, 129, 0.4) !important; border-radius: 9999px !important; padding: 0.4rem 1rem !important; font-size: 0.875rem !important; } #status-box textarea { background: transparent !important; border: none !important; color: #10b981 !important; text-align: center !important; font-weight: 500 !important; padding: 0 !important; min-height: unset !important; height: auto !important; line-height: 1.4 !important; } /* Latency display */ #latency-box { flex-shrink: 0 !important; width: 100% !important; max-width: 100% !important; min-width: 300px !important; background: rgba(139, 92, 246, 0.15) !important; border: 1px solid rgba(139, 92, 246, 0.4) !important; border-radius: 0.75rem !important; padding: 0.5rem 1rem !important; font-size: 0.85rem !important; margin-top: 0.5rem !important; } #latency-box textarea { background: transparent !important; border: none !important; color: #a78bfa !important; text-align: left !important; font-weight: 500 !important; font-family: monospace !important; padding: 0 !important; min-height: 2.5rem !important; height: auto !important; max-height: 4rem !important; line-height: 1.4 !important; white-space: pre-wrap !important; overflow-y: auto !important; resize: none !important; } #latency-box textarea::-webkit-scrollbar { display: none !important; } #latency-box .scroll-hide { scrollbar-width: none !important; } #latency-box::-webkit-scrollbar { display: none !important; } /* Video panel */ #video-column { background: #0f0f14 !important; border-radius: 1rem !important; border: 1px solid #2a2a3a !important; overflow: hidden !important; min-height: 500px !important; } #robot-video { border-radius: 0.75rem !important; overflow: hidden !important; } /* Control panel cards */ .control-card { background: rgba(26, 26, 36, 0.8) !important; border: 1px solid #2a2a3a !important; border-radius: 0.75rem !important; padding: 1rem !important; } /* Audio section */ #audio-section { background: rgba(26, 26, 36, 0.8) !important; border: 1px solid #2a2a3a !important; border-radius: 0.75rem !important; } #listen-btn { background: rgba(139, 92, 246, 0.2) !important; border: 1px solid rgba(139, 92, 246, 0.3) !important; color: #a78bfa !important; border-radius: 0.5rem !important; transition: all 0.2s !important; } #listen-btn:hover { background: rgba(139, 92, 246, 0.3) !important; } /* Hide the default keyboard buttons */ #keyboard-buttons { 
display: none !important; } /* Quick action buttons */ .quick-btn { background: #1f1f2e !important; border: 1px solid #2a2a3a !important; border-radius: 0.5rem !important; padding: 0.5rem !important; font-size: 0.75rem !important; transition: all 0.2s !important; } .quick-btn:hover { background: #2a2a3a !important; } /* Keyboard visualization container */ #keyboard-viz { position: fixed; bottom: 3.5rem; right: 2rem; z-index: 1000; pointer-events: none; } /* Gauges container */ #gauges-viz { position: fixed; bottom: 3.5rem; left: 2rem; z-index: 1000; pointer-events: none; } /* Hide Gradio footer or make room for it */ footer { opacity: 0.5; } /* Hidden pose state (keep in DOM for JS) */ #pose-state { position: absolute !important; opacity: 0 !important; pointer-events: none !important; height: 0 !important; overflow: hidden !important; } """ KEYBOARD_VIZ_HTML = """
Q
W
E
A
S
D
J
H
L
""" GAUGES_HTML = """
Pitch
0.0°
Yaw
0.0°
Roll
0.0°
Body
0.0°
""" KEYBOARD_JS = """ () => { const keyMap = { 'w': 'w', 's': 's', 'a': 'a', 'd': 'd', 'q': 'q', 'e': 'e', 'h': 'h', 'j': 'j', 'l': 'l', }; const btnMap = { 'w': 'btn-forward', 's': 'btn-back', 'a': 'btn-left', 'd': 'btn-right', 'q': 'btn-tilt-up', 'e': 'btn-tilt-down', 'h': 'btn-center', 'j': 'btn-body-left', 'l': 'btn-body-right', }; let lastPressed = {}; const REPEAT_MS = 120; document.addEventListener('keydown', (ev) => { const key = ev.key.toLowerCase(); if (!keyMap[key]) return; // Visual feedback const keyEl = document.querySelector(`.key[data-key="${key}"]`); if (keyEl) keyEl.classList.add('active'); // Rate limit and trigger button const now = Date.now(); if (lastPressed[key] && now - lastPressed[key] < REPEAT_MS) return; lastPressed[key] = now; ev.preventDefault(); const btn = document.getElementById(btnMap[key]); if (btn) btn.click(); }); document.addEventListener('keyup', (ev) => { const key = ev.key.toLowerCase(); const keyEl = document.querySelector(`.key[data-key="${key}"]`); if (keyEl) keyEl.classList.remove('active'); }); // Watch pose-state textbox for changes and update gauges const updateGaugesFromState = () => { const poseEl = document.querySelector('#pose-state textarea'); if (!poseEl) return; const text = poseEl.value; // Parse: "pitch:0.0,yaw:0.0,roll:0.0,body:0.0" const match = text.match(/pitch:([\\d.-]+),yaw:([\\d.-]+),roll:([\\d.-]+),body:([\\d.-]+)/); if (!match) return; const pitch = parseFloat(match[1]); const yaw = parseFloat(match[2]); const roll = parseFloat(match[3]); const body = parseFloat(match[4]); const gauges = { pitch: [-30, 30], yaw: [-180, 180], roll: [-40, 40], body: [-3, 3] }; const values = { pitch, yaw, roll, body }; Object.entries(gauges).forEach(([name, [min, max]]) => { const value = values[name]; const normalized = (value - min) / (max - min); const angle = (normalized - 0.5) * 180; const needle = document.querySelector(`.gauge-needle[data-gauge="${name}"]`); if (needle) needle.setAttribute('transform', `rotate(${angle}, 36, 42)`); const display = document.querySelector(`.gauge-value[data-gauge="${name}"]`); if (display) display.textContent = value.toFixed(1) + '°'; }); }; // Poll for pose updates every 100ms setInterval(updateGaugesFromState, 100); // Update status box styling based on connection state const updateStatusStyle = () => { const statusBox = document.querySelector('#status-box'); if (!statusBox) return; const textarea = statusBox.querySelector('textarea'); if (!textarea) return; const isConnected = textarea.value.includes('Connected'); if (isConnected) { statusBox.style.background = 'rgba(16, 185, 129, 0.15)'; statusBox.style.borderColor = 'rgba(16, 185, 129, 0.4)'; textarea.style.color = '#10b981'; } else { statusBox.style.background = 'rgba(239, 68, 68, 0.15)'; statusBox.style.borderColor = 'rgba(239, 68, 68, 0.4)'; textarea.style.color = '#ef4444'; } }; setInterval(updateStatusStyle, 500); console.log('🎮 Keyboard controls ready'); } """ APP_JS = """ () => { // ========================================== // 1. 
BI-DIRECTIONAL AUDIO WITH MIC SELECTION // ========================================== // Global handles to manage hot-swapping window.currentStream = null; window.wsHandle = null; // --- Helper: Populate Mic List --- window.refreshMicList = async function() { const select = document.getElementById('mic-select'); try { const devices = await navigator.mediaDevices.enumerateDevices(); const audioInputs = devices.filter(device => device.kind === 'audioinput'); const currentVal = select.value; select.innerHTML = ''; // Clear existing // Add Default option const defaultOpt = document.createElement('option'); defaultOpt.value = ""; defaultOpt.text = "Default Microphone"; select.appendChild(defaultOpt); audioInputs.forEach(device => { const option = document.createElement('option'); option.value = device.deviceId; // If label is empty, permission isn't granted yet option.text = device.label || `Microphone ${device.deviceId.slice(0,5)}...`; select.appendChild(option); }); // Restore selection if it still exists if (currentVal) select.value = currentVal; } catch (e) { console.error("Error listing devices", e); } }; window.startAudioPlayer = async function() { const btn = document.getElementById('start-stream-btn'); const status = document.getElementById('audio-status'); const micSelect = document.getElementById('mic-select'); console.log("[Audio] Starting Bi-Directional Stream..."); try { // --- A. Setup Audio Context --- const AudioContext = window.AudioContext || window.webkitAudioContext; if (!window.audioCtx) { window.audioCtx = new AudioContext({ sampleRate: 16000 }); } const ctx = window.audioCtx; if (ctx.state === 'suspended') await ctx.resume(); status.innerText = "Status: Requesting Mic..."; btn.disabled = true; // --- B. Get Microphone (Input) --- // Check dropdown for specific device ID const selectedMicId = micSelect.value; const constraints = { audio: { deviceId: selectedMicId ? { exact: selectedMicId } : undefined, channelCount: 1, sampleRate: 16000, echoCancellation: true, noiseSuppression: true, autoGainControl: true } }; const stream = await navigator.mediaDevices.getUserMedia(constraints); window.currentStream = stream; // Save global ref // **Refresh list now that we have permission (to show labels)** await window.refreshMicList(); if (selectedMicId) micSelect.value = selectedMicId; status.innerText = "Status: Connecting WS..."; // --- C. Setup WebSocket --- // If we are restarting, reuse WS if open, or create new let ws = window.wsHandle; if (!ws || ws.readyState !== WebSocket.OPEN) { const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; const wsUrl = `${protocol}//${window.location.host}/browser_stream`; ws = new WebSocket(wsUrl); ws.binaryType = 'arraybuffer'; window.wsHandle = ws; } // --- D. Setup Input Processor (Mic -> WS) --- const source = ctx.createMediaStreamSource(stream); // BufferSize 1024 provides a good balance of latency vs stability const processor = ctx.createScriptProcessor(1024, 1, 1); processor.onaudioprocess = (e) => { if (ws.readyState !== WebSocket.OPEN) return; const inputData = e.inputBuffer.getChannelData(0); // Convert Float32 (-1.0 to 1.0) -> Int16 (-32768 to 32767) const int16Buffer = new Int16Array(inputData.length); for (let i = 0; i < inputData.length; i++) { let s = Math.max(-1, Math.min(1, inputData[i])); int16Buffer[i] = s < 0 ? s * 0x8000 : s * 0x7FFF; } ws.send(int16Buffer.buffer); }; source.connect(processor); processor.connect(ctx.destination); // --- E. 
Setup Output (WS -> Speaker) --- // Only attach listener if it's a new WS connection if (!ws.onmessage) { let nextTime = 0; ws.onopen = () => { console.log("[Audio] WebSocket Open"); status.innerText = "Status: đŸŸĸ Connected"; btn.innerText = "Microphone Active"; }; ws.onmessage = (event) => { const int16Data = new Int16Array(event.data); const floatBuffer = ctx.createBuffer(1, int16Data.length, 16000); const channelData = floatBuffer.getChannelData(0); for (let i = 0; i < int16Data.length; i++) { channelData[i] = (int16Data[i] / 32768.0); } const src = ctx.createBufferSource(); src.buffer = floatBuffer; src.connect(ctx.destination); const now = ctx.currentTime; if (nextTime < now) nextTime = now; if (nextTime > now + 0.08) { console.log("Catching up audio latency..."); nextTime = now; } src.start(nextTime); nextTime += floatBuffer.duration; }; ws.onerror = (e) => { console.error("WS Error", e); status.innerText = "Status: WebSocket Error"; btn.disabled = false; }; ws.onclose = () => { status.innerText = "Status: Disconnected"; btn.disabled = false; btn.innerText = "â–ļī¸ Reconnect"; if (window.currentStream) { window.currentStream.getTracks().forEach(track => track.stop()); } processor.disconnect(); source.disconnect(); }; } else { // If WS was already open, just update UI status.innerText = "Status: đŸŸĸ Connected (Mic Switched)"; btn.innerText = "Microphone Active"; } // Handle Mic Switching micSelect.onchange = async () => { console.log("Switching microphone..."); status.innerText = "Status: Switching Mic..."; // Stop current mic tracks if (window.currentStream) { window.currentStream.getTracks().forEach(t => t.stop()); } processor.disconnect(); source.disconnect(); // Restart player (will pick up new value from dropdown) await window.startAudioPlayer(); }; } catch (err) { console.error("[Audio] Setup Error:", err); status.innerText = "Error: " + err.message; btn.disabled = false; } }; // Attempt to list mics on load (will likely have empty labels until permission) setTimeout(window.refreshMicList, 1000); // ========================================== // 2. 
KEYBOARD & GAUGE LOGIC (Unchanged) // ========================================== const keyMap = {'w':'w','s':'s','a':'a','d':'d','q':'q','e':'e','h':'h','j':'j','l':'l'}; const btnMap = {'w':'btn-forward','s':'btn-back','a':'btn-left','d':'btn-right','q':'btn-tilt-up','e':'btn-tilt-down','h':'btn-center','j':'btn-body-left','l':'btn-body-right'}; let lastPressed = {}; const REPEAT_MS = 120; document.addEventListener('keydown', (ev) => { const key = ev.key.toLowerCase(); if (!keyMap[key]) return; const keyEl = document.querySelector(`.key[data-key="${key}"]`); if (keyEl) keyEl.classList.add('active'); const now = Date.now(); if (lastPressed[key] && now - lastPressed[key] < REPEAT_MS) return; lastPressed[key] = now; ev.preventDefault(); const btn = document.getElementById(btnMap[key]); if (btn) btn.click(); }); document.addEventListener('keyup', (ev) => { const key = ev.key.toLowerCase(); const keyEl = document.querySelector(`.key[data-key="${key}"]`); if (keyEl) keyEl.classList.remove('active'); }); const updateGaugesFromState = () => { const poseEl = document.querySelector('#pose-state textarea'); if (!poseEl) return; const text = poseEl.value; const match = text.match(/pitch:([\\d.-]+),yaw:([\\d.-]+),roll:([\\d.-]+),body:([\\d.-]+)/); if (!match) return; const values = { pitch: parseFloat(match[1]), yaw: parseFloat(match[2]), roll: parseFloat(match[3]), body: parseFloat(match[4]) }; const gauges = { pitch: [-30, 30], yaw: [-180, 180], roll: [-40, 40], body: [-3, 3] }; Object.entries(gauges).forEach(([name, [min, max]]) => { const normalized = (values[name] - min) / (max - min); const angle = (normalized - 0.5) * 180; const needle = document.querySelector(`.gauge-needle[data-gauge="${name}"]`); if (needle) needle.setAttribute('transform', `rotate(${angle}, 36, 42)`); const display = document.querySelector(`.gauge-value[data-gauge="${name}"]`); if (display) display.textContent = values[name].toFixed(1) + '°'; }); }; setInterval(updateGaugesFromState, 100); const updateStatusStyle = () => { const statusBox = document.querySelector('#status-box'); if (!statusBox || !statusBox.querySelector('textarea')) return; const textarea = statusBox.querySelector('textarea'); const isConnected = textarea.value.includes('Connected'); statusBox.style.background = isConnected ? 'rgba(16, 185, 129, 0.15)' : 'rgba(239, 68, 68, 0.15)'; statusBox.style.borderColor = isConnected ? 'rgba(16, 185, 129, 0.4)' : 'rgba(239, 68, 68, 0.4)'; textarea.style.color = isConnected ? 
'#10b981' : '#ef4444'; }; setInterval(updateStatusStyle, 500); console.log('🎮 Controls & Mic Select Ready'); } """ # ------------------------------------------------------------------- # Gradio UI with new styling # ------------------------------------------------------------------- with gr.Blocks( title="Reachy Controller", theme=gr.themes.Base( primary_hue="violet", neutral_hue="slate", ), css=CUSTOM_CSS, ) as demo: # Header with gr.Row(elem_id="header-row"): gr.Markdown("## 🤖 Reachy Mini", elem_id="app-title") with gr.Column(scale=1): status_box = gr.Textbox( value=state.get_connection_status, every=2, show_label=False, container=False, elem_id="status-box", ) latency_box = gr.Textbox( value=state.get_latency_display, every=1, show_label=False, container=False, elem_id="latency-box", ) with gr.Row(): # Left column - Controls with gr.Column(scale=1): # Hidden pose state textbox - polls pose for JS gauges pose_state = gr.Textbox( value=get_pose_string, every=0.2, show_label=False, container=False, elem_id="pose-state", ) # Audio section with gr.Group(elem_id="audio-section"): gr.Markdown("### 🎧 Audio") if USE_AUDIO_WEBRTC: listen_btn = gr.Button("🎤 Start Listening", elem_id="listen-btn") robot_audio = WebRTC( label="", modality="audio", mode="send-receive", rtc_configuration=turn_credentials, server_rtc_configuration=server_turn_credentials, full_screen=False, ) robot_audio.stream( fn=RobotAudioHandler(), inputs=[robot_audio], outputs=[robot_audio], trigger=listen_btn.click, ) else: # HTML with Microphone Select Dropdown audio_player_html = """
Status: Stopped
""" gr.HTML(value=audio_player_html, label="đŸŽĩ Robot Audio") # Quick actions with gr.Group(elem_classes="control-card"): gr.Markdown("### ⚡ Quick Actions") with gr.Row(): btn_center_quick = gr.Button("🏠 Center", elem_classes="quick-btn") btn_look_up = gr.Button("👀 Look Up", elem_classes="quick-btn") with gr.Row(): btn_curious = gr.Button("🎭 Curious", elem_classes="quick-btn") btn_excited = gr.Button("🎉 Excited", elem_classes="quick-btn") # Hidden keyboard buttons (still needed for JS clicks) with gr.Group(elem_id="keyboard-buttons"): btn_forward = gr.Button("W", elem_id="btn-forward") btn_back = gr.Button("S", elem_id="btn-back") btn_left = gr.Button("A", elem_id="btn-left") btn_right = gr.Button("D", elem_id="btn-right") btn_tilt_up = gr.Button("Q", elem_id="btn-tilt-up") btn_tilt_down = gr.Button("E", elem_id="btn-tilt-down") btn_body_left = gr.Button("J", elem_id="btn-body-left") btn_body_right = gr.Button("L", elem_id="btn-body-right") btn_center = gr.Button("H", elem_id="btn-center") # Wire up hidden buttons - outputs required for Gradio to execute! btn_forward.click( lambda: nudge_pose(dpitch=-NUDGE_PITCH, label="W"), outputs=[pose_state], ) btn_back.click( lambda: nudge_pose(dpitch=NUDGE_PITCH, label="S"), outputs=[pose_state], ) btn_left.click( lambda: nudge_pose(dyaw=NUDGE_ANGLE * 2, label="A"), outputs=[pose_state], ) btn_right.click( lambda: nudge_pose(dyaw=-NUDGE_ANGLE * 2, label="D"), outputs=[pose_state], ) btn_tilt_up.click( lambda: nudge_pose(droll=-NUDGE_ANGLE, label="Q"), outputs=[pose_state], ) btn_tilt_down.click( lambda: nudge_pose(droll=NUDGE_ANGLE, label="E"), outputs=[pose_state], ) btn_body_left.click( lambda: nudge_pose(dbody_yaw=NUDGE_BODY, label="J"), outputs=[pose_state], ) btn_body_right.click( lambda: nudge_pose(dbody_yaw=-NUDGE_BODY, label="L"), outputs=[pose_state], ) btn_center.click(center_pose, outputs=[pose_state]) # Wire up quick action buttons btn_center_quick.click(center_pose, outputs=[pose_state]) btn_look_up.click( lambda: nudge_pose(dpitch=-15, label="Look Up"), outputs=[pose_state], ) btn_curious.click( lambda: nudge_pose(dpitch=-10, droll=15, label="Curious"), outputs=[pose_state], ) btn_excited.click( lambda: nudge_pose(dpitch=-5, droll=-10, label="Excited"), outputs=[pose_state], ) # Right column - Video with gr.Column(scale=2, elem_id="video-column"): if USE_VIDEO_WEBRTC: robot_video = WebRTC( label="", modality="video", mode="receive", rtc_configuration=turn_credentials, server_rtc_configuration=server_turn_credentials, elem_id="robot-video", ) robot_video.stream( fn=webrtc_video_generator, inputs=[], outputs=[robot_video], trigger=listen_btn.click, ) else: html_code = """ """ gr.HTML(value=html_code, label="đŸŽŦ Robot Video") # Floating keyboard visualization gr.HTML(KEYBOARD_VIZ_HTML) gr.HTML(GAUGES_HTML) # Load keyboard handler demo.load(None, None, None, js=KEYBOARD_JS if USE_AUDIO_WEBRTC else APP_JS) # ------------------------------------------------------------------- # 10. Mount & run # ------------------------------------------------------------------- app = gr.mount_gradio_app(app, demo, path="/") if __name__ == "__main__": print("🚀 Server starting on http://0.0.0.0:7860") print("â„šī¸ Point your Robot/Sim to: ws://:7860/robot") uvicorn.run(app, host="0.0.0.0", port=7860, proxy_headers=True, forwarded_allow_ips="*", log_level="warning")