diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py
index 4b9d6de..7a0b388 100644
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -724,6 +724,9 @@ _send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
 _model_loading_status: str = "Not loaded"
 _model_loading_progress: float = 0.0
 
+# Raw audio buffer for immediate graphing
+_raw_audio_buffer: Dict[str, npt.NDArray[np.float32]] = {}
+
 
 def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel:
     """Ensure the global model is loaded."""
@@ -1434,13 +1437,29 @@ class WaveformVideoTrack(MediaStreamTrack):
                 3,
             )
 
-            # Select the most active processor (highest RMS) and draw its waveform
+            # Draw clock in lower right corner, right justified
+            current_time = time.strftime("%H:%M:%S")
+            (text_width, text_height), _ = cv2.getTextSize(
+                current_time, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2
+            )
+            clock_x = self.width - text_width - 10  # 10px margin from right edge
+            clock_y = self.height - 10  # 10px margin from bottom
+            cv2.putText(
+                frame_array,
+                current_time,
+                (clock_x, clock_y),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                1.0,
+                (255, 255, 255),
+                2,
+            )
+
+            # Select the most active audio buffer (highest RMS) and draw its waveform
             best_proc = None
             best_rms = 0.0
             try:
-                for pname, proc in _audio_processors.items():
+                for pname, arr in _raw_audio_buffer.items():
                     try:
-                        arr = getattr(proc, "current_phrase_audio", None)
                         if arr is None or len(arr) == 0:
                             continue
                         rms = float(np.sqrt(np.mean(arr**2)))
@@ -1455,13 +1474,15 @@ class WaveformVideoTrack(MediaStreamTrack):
 
            if best_proc is not None:
                pname, arr = best_proc
-                # Use the entire current phrase audio (from the start of the ongoing recording)
-                # This ensures the waveform shows audio from when recording began until it is processed.
+                # Use the last 2 seconds of audio data, padded with zeros if shorter
+                samples_needed = SAMPLE_RATE * 2  # 2 seconds
                 if len(arr) <= 0:
-                    arr_segment = np.zeros(1, dtype=np.float32)
+                    arr_segment = np.zeros(samples_needed, dtype=np.float32)
+                elif len(arr) >= samples_needed:
+                    arr_segment = arr[-samples_needed:].copy()
                 else:
-                    # Copy the buffer so downstream operations (resizing/bucketing) are safe
-                    arr_segment = arr.copy()
+                    # Pad with zeros at the beginning
+                    arr_segment = np.concatenate([np.zeros(samples_needed - len(arr), dtype=np.float32), arr])
 
                 # Assume arr_segment is already in [-1, 1]
                 norm = arr_segment
@@ -1527,12 +1548,16 @@ class WaveformVideoTrack(MediaStreamTrack):
 
 async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
     """Handle incoming audio tracks from WebRTC peers."""
-    global _audio_processors, _send_chat_func
+    global _audio_processors, _send_chat_func, _raw_audio_buffer
 
     if track.kind != "audio":
         logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
         return
 
+    # Initialize raw audio buffer for immediate graphing
+    if peer.peer_name not in _raw_audio_buffer:
+        _raw_audio_buffer[peer.peer_name] = np.array([], dtype=np.float32)
+
     if peer.peer_name not in _audio_processors:
         if _send_chat_func is None:
             logger.error(
@@ -1615,7 +1640,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
                 continue
 
         asyncio.create_task(init_processor())
-        return  # Exit early, processing is handled in background
+        return  # Exit early; processor initialization is handled in the background
 
     # If processor already exists, just continue processing
     audio_processor = _audio_processors[peer.peer_name]
@@ -1623,13 +1648,17 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
 
     try:
         frame_count = 0
+        logger.info(f"Entering frame processing loop for {peer.peer_name}")
         while True:
             try:
+                logger.debug(f"Waiting for frame from {peer.peer_name}")
                 frame = await track.recv()
                 frame_count += 1
 
-                if frame_count % 100 == 0:
-                    logger.debug(f"Received {frame_count} frames from {peer.peer_name}")
+                if frame_count == 1:
+                    logger.info(f"Received first frame from {peer.peer_name}")
+                elif frame_count % 50 == 0:
+                    logger.info(f"Received {frame_count} frames from {peer.peer_name}")
 
             except MediaStreamError as e:
                 logger.info(f"Audio stream ended for {peer.peer_name}: {e}")
@@ -1638,7 +1667,6 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
                 logger.error(f"Error receiving frame from {peer.peer_name}: {e}")
                 break
 
-            logger.info(f"Processing frame {frame_count} from {peer.peer_name}")
             if isinstance(frame, AudioFrame):
                 try:
                     # Convert frame to numpy array
@@ -1656,7 +1684,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
 
                     # Convert to float32
                     audio_data_float32 = audio_data.astype(np.float32)
 
-                    # Process with optimized processor
+                    logger.debug(f"Processed audio frame {frame_count} from {peer.peer_name}: {len(audio_data_float32)} samples")
+
+                    # Update raw buffer for graphing
+                    _raw_audio_buffer[peer.peer_name] = np.concatenate([_raw_audio_buffer[peer.peer_name], audio_data_float32])
+                    # Limit buffer size to last 10 seconds
+                    max_samples = SAMPLE_RATE * 10
+                    if len(_raw_audio_buffer[peer.peer_name]) > max_samples:
+                        _raw_audio_buffer[peer.peer_name] = _raw_audio_buffer[peer.peer_name][-max_samples:]
+
+                    # Process with optimized processor if available
                     audio_processor.add_audio_data(audio_data_float32)
                 except Exception as e:
@@ -1797,7 +1834,7 @@ def bind_send_chat_function(send_chat_func: Callable[[str], Awaitable[None]]) ->
 
 def cleanup_peer_processor(peer_name: str) -> None:
     """Clean up processor for disconnected peer."""
-    global _audio_processors
+    global _audio_processors, _raw_audio_buffer
 
     if peer_name in _audio_processors:
         logger.info(f"Cleaning up processor for {peer_name}")
@@ -1806,6 +1843,9 @@ def cleanup_peer_processor(peer_name: str) -> None:
         del _audio_processors[peer_name]
         logger.info(f"Processor cleanup complete for {peer_name}")
 
+    if peer_name in _raw_audio_buffer:
+        del _raw_audio_buffer[peer_name]
+
 
 def get_active_processors() -> Dict[str, OptimizedAudioProcessor]:
     """Get active processors for debugging."""
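
Note (not part of the patch): the hunks above implement two sliding-window policies on the per-peer buffer. handle_track_received keeps at most the last 10 seconds of raw samples, while the waveform renderer extracts exactly the last 2 seconds, zero-padding at the front when less audio has arrived. A minimal standalone sketch of both policies follows; SAMPLE_RATE = 16000 is an assumption here (whisper.py defines its own constant), and the helper names are illustrative, not the module's API.

import numpy as np
import numpy.typing as npt

SAMPLE_RATE = 16000  # assumed sample rate; the real constant lives in whisper.py


def append_and_trim(
    buffer: npt.NDArray[np.float32],
    chunk: npt.NDArray[np.float32],
    max_seconds: int = 10,
) -> npt.NDArray[np.float32]:
    """Append new samples, keeping only the most recent max_seconds of audio."""
    buffer = np.concatenate([buffer, chunk])
    max_samples = SAMPLE_RATE * max_seconds
    return buffer[-max_samples:] if len(buffer) > max_samples else buffer


def last_window(
    buffer: npt.NDArray[np.float32], seconds: int = 2
) -> npt.NDArray[np.float32]:
    """Return exactly `seconds` of trailing audio, zero-padded at the front."""
    needed = SAMPLE_RATE * seconds
    if len(buffer) >= needed:
        return buffer[-needed:].copy()
    return np.concatenate([np.zeros(needed - len(buffer), dtype=np.float32), buffer])

Front-padding keeps the newest samples aligned to the right edge of the plot, so the rendered waveform scrolls leftward as new audio arrives.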
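The renderer's peer selection likewise reduces to picking the buffer with the highest RMS energy. A sketch under the same assumptions (most_active is a hypothetical helper, not from the module):

import numpy as np
import numpy.typing as npt
from typing import Dict, Optional, Tuple


def most_active(
    buffers: Dict[str, npt.NDArray[np.float32]],
) -> Optional[Tuple[str, npt.NDArray[np.float32]]]:
    """Return the (peer_name, buffer) pair with the highest RMS, or None if all are empty."""
    best: Optional[Tuple[str, npt.NDArray[np.float32]]] = None
    best_rms = 0.0
    for name, arr in buffers.items():
        if arr is None or len(arr) == 0:
            continue
        rms = float(np.sqrt(np.mean(arr**2)))  # root-mean-square amplitude
        if rms > best_rms:
            best_rms = rms
            best = (name, arr)
    return best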