From 51b1ef7bc29e07c3a10bacfcaad9938139d8dd50 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Sat, 13 Sep 2025 21:40:21 -0700
Subject: [PATCH] voicebot: move raw audio buffer into WaveformVideoTrack

Replace the module-level _raw_audio_buffer with a class-level
WaveformVideoTrack.buffer so the audio processor and the waveform
renderer share the same per-peer data, capped at the last 10 seconds.
When no audio is available, draw a test sine wave and report buffer
status on the frame to verify that rendering works.
---
 voicebot/bots/whisper.py | 89 ++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 27 deletions(-)

diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py
index 7a0b388..e5a6add 100644
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -724,8 +724,7 @@
 _send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
 _model_loading_status: str = "Not loaded"
 _model_loading_progress: float = 0.0
-# Raw audio buffer for immediate graphing
-_raw_audio_buffer: Dict[str, npt.NDArray[np.float32]] = {}
+# Raw audio buffer for immediate graphing (now handled by WaveformVideoTrack.buffer)
 
 
 def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel:
@@ -837,6 +836,10 @@ class OptimizedAudioProcessor:
         self.send_chat_func = send_chat_func
         self.sample_rate = SAMPLE_RATE
 
+        # Initialize visualization buffer if not already done
+        if self.peer_name not in WaveformVideoTrack.buffer:
+            WaveformVideoTrack.buffer[self.peer_name] = np.array([], dtype=np.float32)
+
         # Optimized buffering parameters
         self.chunk_size = int(self.sample_rate * CHUNK_DURATION_MS / 1000)
         self.buffer_size = self.chunk_size * 50
@@ -952,6 +955,12 @@
             self.silence_frames = 0
             self.last_activity_time = time.time()
             self._add_to_circular_buffer(audio_data)
+            # Update visualization buffer
+            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
+            # Limit buffer size to last 10 seconds
+            max_samples = SAMPLE_RATE * 10
+            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
+                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
         elif (
             len(self.current_phrase_audio) > 0
             and self.silence_frames < self.max_trailing_silence_frames
@@ -959,6 +968,12 @@
             logger.info(f"Trailing silence accepted for {self.peer_name}")
             self.silence_frames += 1
             self._add_to_circular_buffer(audio_data)
+            # Update visualization buffer
+            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
+            # Limit buffer size to last 10 seconds
+            max_samples = SAMPLE_RATE * 10
+            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
+                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
         else:
             if (self.silence_frames % 10 == 0) and (self.silence_frames > 0):
                 logger.info(
@@ -1380,6 +1395,9 @@
 
     kind = "video"
 
+    # Shared buffer for audio data
+    buffer: Dict[str, npt.NDArray[np.float32]] = {}
+
     def __init__(
         self, session_name: str, width: int = 640, height: int = 480, fps: int = 15
     ) -> None:
@@ -1443,7 +1461,7 @@
             current_time, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2
         )
         clock_x = self.width - text_width - 10  # 10px margin from right edge
-        clock_y = self.height - 10  # 10px margin from bottom
+        clock_y = self.height - 30  # 30px margin from bottom
         cv2.putText(
             frame_array,
             current_time,
@@ -1458,11 +1476,13 @@
         best_proc = None
         best_rms = 0.0
         try:
-            for pname, arr in _raw_audio_buffer.items():
+            for pname, arr in self.__class__.buffer.items():
                 try:
-                    if arr is None or len(arr) == 0:
-                        continue
-                    rms = float(np.sqrt(np.mean(arr**2)))
+                    # Treat empty buffers as silence (rms 0.0) rather than skipping them
+                    if len(arr) == 0:
+                        rms = 0.0
+                    else:
+                        rms = float(np.sqrt(np.mean(arr**2)))
                     if rms > best_rms:
                         best_rms = rms
                         best_proc = (pname, arr.copy())
@@ -1507,7 +1527,7 @@
             points: list[tuple[int, int]] = []
             for x in range(self.width):
                 v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0
-                y = int((1.0 - ((v + 1.0) / 2.0)) * self.height)
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
                 points.append((x, y))
 
             if len(points) > 1:
@@ -1516,8 +1536,8 @@
                     frame_array,
                     [pts_np],
                     isClosed=False,
-                    color=(0, 200, 80),
-                    thickness=2,
+                    color=(255, 255, 255),
+                    thickness=4,
                 )
 
             cv2.putText(
@@ -1530,13 +1550,35 @@
                 2,
             )
         else:
+            # Draw a test sine wave to verify drawing works
+            import math
+            test_points: list[tuple[int, int]] = []
+            for x in range(self.width):
+                # Create a sine wave with some amplitude
+                v = 0.5 * math.sin(2 * math.pi * x / self.width * 4)  # 4 cycles across width
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
+                test_points.append((x, y))
+
+            if len(test_points) > 1:
+                pts_np = np.array(test_points, dtype=np.int32)
+                cv2.polylines(
+                    frame_array,
+                    [pts_np],
+                    isClosed=False,
+                    color=(255, 255, 255),
+                    thickness=4,
+                )
+
+            # Show buffer status
+            buffer_keys = len(self.__class__.buffer)
+            total_samples = sum(len(arr) for arr in self.__class__.buffer.values())
             cv2.putText(
                 frame_array,
-                "No audio",
-                (10, self.height - 15),
+                f"Test waveform - Buffer: {buffer_keys} keys, {total_samples} samples",
+                (10, 400),
                 cv2.FONT_HERSHEY_SIMPLEX,
-                1.2,
-                (200, 200, 200),
+                0.8,
+                (255, 255, 255),
                 2,
             )
 
@@ -1548,15 +1590,15 @@
 
 async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
     """Handle incoming audio tracks from WebRTC peers."""
-    global _audio_processors, _send_chat_func, _raw_audio_buffer
+    global _audio_processors, _send_chat_func
 
     if track.kind != "audio":
         logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
         return
 
     # Initialize raw audio buffer for immediate graphing
-    if peer.peer_name not in _raw_audio_buffer:
-        _raw_audio_buffer[peer.peer_name] = np.array([], dtype=np.float32)
+    if peer.peer_name not in WaveformVideoTrack.buffer:
+        WaveformVideoTrack.buffer[peer.peer_name] = np.array([], dtype=np.float32)
 
     if peer.peer_name not in _audio_processors:
         if _send_chat_func is None:
@@ -1686,13 +1728,6 @@
 
             logger.debug(f"Processed audio frame {frame_count} from {peer.peer_name}: {len(audio_data_float32)} samples")
 
-            # Update raw buffer for graphing
-            _raw_audio_buffer[peer.peer_name] = np.concatenate([_raw_audio_buffer[peer.peer_name], audio_data_float32])
-            # Limit buffer size to last 10 seconds
-            max_samples = SAMPLE_RATE * 10
-            if len(_raw_audio_buffer[peer.peer_name]) > max_samples:
-                _raw_audio_buffer[peer.peer_name] = _raw_audio_buffer[peer.peer_name][-max_samples:]
-
             # Process with optimized processor if available
             audio_processor.add_audio_data(audio_data_float32)
 
@@ -1834,7 +1869,7 @@ def bind_send_chat_function(send_chat_func: Callable[[str], Awaitable[None]]) -
 
 def cleanup_peer_processor(peer_name: str) -> None:
     """Clean up processor for disconnected peer."""
-    global _audio_processors, _raw_audio_buffer
+    global _audio_processors
 
     if peer_name in _audio_processors:
         logger.info(f"Cleaning up processor for {peer_name}")
@@ -1843,8 +1878,8 @@ def cleanup_peer_processor(peer_name: str) -> None:
         del _audio_processors[peer_name]
         logger.info(f"Processor cleanup complete for {peer_name}")
 
-    if peer_name in _raw_audio_buffer:
-        del _raw_audio_buffer[peer_name]
+    if peer_name in WaveformVideoTrack.buffer:
+        del WaveformVideoTrack.buffer[peer_name]
 
 
 def get_active_processors() -> Dict[str, OptimizedAudioProcessor]:
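
The append-and-cap sequence added above is duplicated verbatim in the voice
and trailing-silence branches. A minimal standalone sketch of that logic
factored into one helper; the helper name, the 16 kHz sample rate, and the
peer name are illustrative assumptions, not part of this patch:

import numpy as np
import numpy.typing as npt
from typing import Dict

SAMPLE_RATE = 16000  # assumed 16 kHz, the rate Whisper expects; not stated in this excerpt
MAX_SECONDS = 10     # the patch caps each buffer at the last 10 seconds

# Per-peer visualization buffers keyed by peer name, like WaveformVideoTrack.buffer
buffer: Dict[str, npt.NDArray[np.float32]] = {}

def append_samples(peer_name: str, samples: npt.NDArray[np.float32]) -> None:
    """Append samples for a peer, keeping only the trailing MAX_SECONDS."""
    existing = buffer.get(peer_name, np.array([], dtype=np.float32))
    combined = np.concatenate([existing, samples])
    # Negative slicing keeps the most recent samples; shorter arrays pass through whole
    buffer[peer_name] = combined[-SAMPLE_RATE * MAX_SECONDS:]

# Two 6-second appends leave exactly 10 seconds in the buffer
append_samples("alice", np.zeros(SAMPLE_RATE * 6, dtype=np.float32))
append_samples("alice", np.zeros(SAMPLE_RATE * 6, dtype=np.float32))
assert len(buffer["alice"]) == SAMPLE_RATE * MAX_SECONDS

Factoring the duplicated block into a helper like this would keep the
10-second cap in a single place if the limit ever changes.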
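
The new vertical mapping in the drawing code reserves the top 80 pixels of
the frame for the status text and clock, scaling each sample v in [-1.0, 1.0]
into the remaining band. A quick endpoint check for the default 480-pixel
height (the helper name here is hypothetical):

# Mapping used by the patch: v in [-1.0, 1.0] -> y in [80, height]
def sample_to_y(v: float, height: int = 480, header: int = 80) -> int:
    return int((1.0 - ((v + 1.0) / 2.0)) * (height - header)) + header

assert sample_to_y(1.0) == 80    # full positive swing touches the header band
assert sample_to_y(-1.0) == 480  # full negative swing reaches the bottom edge
assert sample_to_y(0.0) == 280   # silence sits midway through the drawing band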