diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py index 080aff5..5bb9967 100644 --- a/voicebot/bots/whisper.py +++ b/voicebot/bots/whisper.py @@ -656,13 +656,22 @@ _whisper_model: Optional[OpenVINOWhisperModel] = None _audio_processors: Dict[str, "OptimizedAudioProcessor"] = {} _send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None +# Model loading status for video display +_model_loading_status: str = "Not loaded" +_model_loading_progress: float = 0.0 + def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel: """Ensure the global model is loaded.""" - global _whisper_model + global _whisper_model, _model_loading_status, _model_loading_progress if _whisper_model is None: setup_intel_arc_environment() logger.info(f"Loading OpenVINO Whisper model: {_model_id}") + _model_loading_status = "Loading model..." + _model_loading_progress = 0.1 + _whisper_model = OpenVINOWhisperModel(model_id=_model_id, config=_ov_config, device=device) + _model_loading_status = "Model loaded successfully" + _model_loading_progress = 1.0 logger.info("OpenVINO Whisper model loaded successfully") return _whisper_model @@ -1012,7 +1021,7 @@ class WaveformVideoTrack(MediaStreamTrack): kind = "video" - def __init__(self, session_name: str, width: int = 640, height: int = 240, fps: int = 15) -> None: + def __init__(self, session_name: str, width: int = 640, height: int = 480, fps: int = 15) -> None: super().__init__() self.session_name = session_name self.width = int(width) @@ -1039,6 +1048,22 @@ class WaveformVideoTrack(MediaStreamTrack): frame_array: npt.NDArray[np.uint8] = np.zeros((self.height, self.width, 3), dtype=np.uint8) + # Display model loading status prominently + status_text = _model_loading_status + progress = _model_loading_progress + + # Draw status background + cv2.rectangle(frame_array, (0, 0), (self.width, 60), (0, 0, 0), -1) + + # Draw progress bar if loading + if progress < 1.0 and "Ready" not in status_text: + bar_width = int(progress * (self.width - 40)) + cv2.rectangle(frame_array, (20, 40), (20 + bar_width, 50), (0, 255, 0), -1) + cv2.rectangle(frame_array, (20, 40), (self.width - 20, 50), (255, 255, 255), 1) + + # Draw status text + cv2.putText(frame_array, f"Status: {status_text}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2) + # Select the most active processor (highest RMS) and draw its waveform best_proc = None best_rms = 0.0 @@ -1088,16 +1113,16 @@ class WaveformVideoTrack(MediaStreamTrack): points: list[tuple[int, int]] = [] for x in range(self.width): v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0 - y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 1)) - points.append((x, max(0, min(self.height - 1, y)))) + y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 70)) + 60 # Offset below status bar + points.append((x, max(60, min(self.height - 1, y)))) if len(points) > 1: pts_np = np.array(points, dtype=np.int32) cv2.polylines(frame_array, [pts_np], isClosed=False, color=(0, 200, 80), thickness=2) - cv2.putText(frame_array, f"Waveform: {pname}", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + cv2.putText(frame_array, f"Waveform: {pname}", (10, self.height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) else: - cv2.putText(frame_array, "No audio", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (200, 200, 200), 1) + cv2.putText(frame_array, "No audio", (10, self.height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (200, 200, 200), 1) frame = VideoFrame.from_ndarray(frame_array, format="bgr24") frame.pts = pts @@ -1112,20 +1137,85 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None: logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}") return - # Create audio processor if peer.peer_name not in _audio_processors: if _send_chat_func is None: logger.error(f"Cannot create processor for {peer.peer_name}: no send_chat_func") return - logger.info(f"Creating OptimizedAudioProcessor for {peer.peer_name}") - _audio_processors[peer.peer_name] = OptimizedAudioProcessor( - peer_name=peer.peer_name, - send_chat_func=_send_chat_func - ) + # Start background task to load model and create processor + async def init_processor(): + global _model_loading_status, _model_loading_progress + # Load model asynchronously to avoid blocking frame reading + _model_loading_status = "Initializing model loading..." + _model_loading_progress = 0.0 + + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, _ensure_model_loaded) + + _model_loading_status = "Model loaded, creating processor..." + _model_loading_progress = 0.8 + + logger.info(f"Creating OptimizedAudioProcessor for {peer.peer_name}") + if _send_chat_func is None: + logger.error(f"No send_chat_func available for {peer.peer_name}") + _model_loading_status = "Error: No send function available" + return + _audio_processors[peer.peer_name] = OptimizedAudioProcessor( + peer_name=peer.peer_name, + send_chat_func=_send_chat_func + ) + + audio_processor = _audio_processors[peer.peer_name] + _model_loading_status = "Ready for transcription" + _model_loading_progress = 1.0 + logger.info(f"Starting OpenVINO audio processing for {peer.peer_name}") + + # Now start processing frames + frame_count = 0 + while True: + try: + frame = await track.recv() + frame_count += 1 + + if frame_count % 100 == 0: + logger.debug(f"Received {frame_count} frames from {peer.peer_name}") + + except MediaStreamError as e: + logger.info(f"Audio stream ended for {peer.peer_name}: {e}") + break + except Exception as e: + logger.error(f"Error receiving frame from {peer.peer_name}: {e}") + break + + if isinstance(frame, AudioFrame): + try: + # Convert frame to numpy array + audio_data = frame.to_ndarray() + + # Handle audio format conversion + audio_data = _process_audio_frame(audio_data, frame) + + # Resample if needed + if frame.sample_rate != SAMPLE_RATE: + audio_data = _resample_audio(audio_data, frame.sample_rate, SAMPLE_RATE) + + # Convert to float32 + audio_data_float32 = audio_data.astype(np.float32) + audio_data = normalize_audio(audio_data) + + # Process with optimized processor + audio_processor.add_audio_data(audio_data_float32) + + except Exception as e: + logger.error(f"Error processing audio frame for {peer.peer_name}: {e}") + continue + + asyncio.create_task(init_processor()) + return # Exit early, processing is handled in background + # If processor already exists, just continue processing audio_processor = _audio_processors[peer.peer_name] - logger.info(f"Starting OpenVINO audio processing for {peer.peer_name}") + logger.info(f"Continuing OpenVINO audio processing for {peer.peer_name}") try: frame_count = 0 @@ -1239,7 +1329,7 @@ def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: await asyncio.sleep(1 / self.fps) return frame try: - video_track = WaveformVideoTrack(session_name=session_name, width=640, height=240, fps=15) + video_track = WaveformVideoTrack(session_name=session_name, width=640, height=480, fps=15) audio_track = SilentAudioTrack() return {"video": video_track, "audio": audio_track} except Exception as e: @@ -1304,5 +1394,7 @@ def get_model_info() -> Dict[str, Any]: "quantization_enabled": _ov_config.enable_quantization, "is_quantized": ov_model.is_quantized, "sample_rate": SAMPLE_RATE, - "chunk_duration_ms": CHUNK_DURATION_MS + "chunk_duration_ms": CHUNK_DURATION_MS, + "loading_status": _model_loading_status, + "loading_progress": _model_loading_progress } \ No newline at end of file