From b2169da4cd58ec29341efdbb3a07c98018a1aa31 Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Mon, 15 Sep 2025 14:30:16 -0700 Subject: [PATCH] Fixing transcode --- voicebot/bot_orchestrator.py | 11 +- voicebot/bots/whisper.py | 460 ++++++++++++++++++++++++++++++++--- 2 files changed, 439 insertions(+), 32 deletions(-) diff --git a/voicebot/bot_orchestrator.py b/voicebot/bot_orchestrator.py index 95b4309..188b240 100644 --- a/voicebot/bot_orchestrator.py +++ b/voicebot/bot_orchestrator.py @@ -8,6 +8,7 @@ import asyncio import threading import uuid import importlib +import inspect import pkgutil import sys import os @@ -346,7 +347,10 @@ async def bot_join(bot_name: str, req: JoinRequest): if req.config_values and config_handler: try: logger.info(f"Applying existing configuration to bot {bot_name}") - success = await config_handler(req.lobby_id, req.config_values) + if inspect.iscoroutinefunction(config_handler): + success = await config_handler(req.lobby_id, req.config_values) + else: + success = config_handler(req.lobby_id, req.config_values) if success: logger.info(f"Successfully applied existing configuration to bot {bot_name}") else: @@ -451,7 +455,10 @@ async def update_bot_config(bot_name: str, config_data: dict[str, Any]) -> dict[ raise HTTPException(status_code=400, detail="lobby_id is required") # Call the bot's configuration handler - success = await config_handler(lobby_id, config_values) + if inspect.iscoroutinefunction(config_handler): + success = await config_handler(lobby_id, config_values) + else: + success = config_handler(lobby_id, config_values) if success: return {"success": True, "message": "Configuration updated successfully"} diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py index 84398b3..a4a1172 100644 --- a/voicebot/bots/whisper.py +++ b/voicebot/bots/whisper.py @@ -1115,6 +1115,7 @@ class OptimizedAudioProcessor: self.transcription_history: List[TranscriptionHistoryItem] = [] self.last_activity_time = time.time() self.last_audio_time = time.time() # Track when any audio chunk is received + self.final_transcription_pending = False # Flag to prevent accumulating audio during final transcription # Current transcription message for refinements self.current_message: Optional[ChatMessageModel] = None @@ -1172,6 +1173,7 @@ class OptimizedAudioProcessor: if is_speech: self.silence_frames = 0 self.last_activity_time = time.time() + self.final_transcription_pending = False # Reset flag when new speech is detected self._add_to_circular_buffer(audio_data) elif (len(self.current_phrase_audio) > 0 and self.silence_frames < self.max_trailing_silence_frames): @@ -1262,6 +1264,7 @@ class OptimizedAudioProcessor: ): # At least 0.5 seconds if self.main_loop: logger.info(f"Queueing final transcription for {self.peer_name}") + self.final_transcription_pending = True asyncio.create_task( self._transcribe_and_send( self.current_phrase_audio.copy(), is_final=True @@ -1282,24 +1285,25 @@ class OptimizedAudioProcessor: ) # Add to current phrase - self.current_phrase_audio = np.concatenate( - [self.current_phrase_audio, audio_item.audio] - ) - - # Check if we should transcribe - phrase_duration = len(self.current_phrase_audio) / self.sample_rate - - if phrase_duration >= 1.0: # Transcribe every 1 second - logger.info(f"Transcribing for {self.peer_name} (1s interval)") - await self._transcribe_and_send( - self.current_phrase_audio.copy(), is_final=False + if not self.final_transcription_pending: + self.current_phrase_audio = np.concatenate( + [self.current_phrase_audio, audio_item.audio] ) + # Check if we should transcribe + phrase_duration = len(self.current_phrase_audio) / self.sample_rate + + if phrase_duration >= 1.0: # Transcribe every 1 second + logger.info(f"Transcribing for {self.peer_name} (1s interval)") + await self._transcribe_and_send( + self.current_phrase_audio.copy(), is_final=False + ) + except asyncio.TimeoutError: # Check for final transcription on timeout if ( len(self.current_phrase_audio) > 0 - and time.time() - self.last_audio_time > 2.0 + and time.time() - self.last_activity_time > 2.0 ): logger.info( f"Final transcription timeout for {self.peer_name} (asyncio.TimeoutError)" @@ -1308,6 +1312,7 @@ class OptimizedAudioProcessor: self.current_phrase_audio.copy(), is_final=True ) self.current_phrase_audio = np.array([], dtype=np.float32) + self.final_transcription_pending = False # Reset the flag except Exception as e: logger.error( f"Error in async processing loop for {self.peer_name}: {e}" @@ -1325,30 +1330,31 @@ class OptimizedAudioProcessor: audio_item = self._threading_queue.get(timeout=1.0) # Add to current phrase - self.current_phrase_audio = np.concatenate( - [self.current_phrase_audio, audio_item.audio] - ) + if not self.final_transcription_pending: + self.current_phrase_audio = np.concatenate( + [self.current_phrase_audio, audio_item.audio] + ) - # Check if we should transcribe - phrase_duration = len(self.current_phrase_audio) / self.sample_rate + # Check if we should transcribe + phrase_duration = len(self.current_phrase_audio) / self.sample_rate - if phrase_duration >= 1.0: - if self.main_loop: - logger.info( - f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)" - ) - asyncio.run_coroutine_threadsafe( - self._transcribe_and_send( - self.current_phrase_audio.copy(), is_final=False - ), - self.main_loop, - ) + if phrase_duration >= 1.0: + if self.main_loop: + logger.info( + f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)" + ) + asyncio.run_coroutine_threadsafe( + self._transcribe_and_send( + self.current_phrase_audio.copy(), is_final=False + ), + self.main_loop, + ) except Empty: # Check for final transcription if ( len(self.current_phrase_audio) > 0 - and time.time() - self.last_audio_time > 2.0 + and time.time() - self.last_activity_time > 2.0 ): if self.main_loop: logger.info( @@ -1361,6 +1367,7 @@ class OptimizedAudioProcessor: self.main_loop, ) self.current_phrase_audio = np.array([], dtype=np.float32) + self.final_transcription_pending = False # Reset the flag except Exception as e: logger.error( f"Error in thread processing loop for {self.peer_name}: {e}" @@ -1380,6 +1387,10 @@ class OptimizedAudioProcessor: if audio_array.ndim != 1: raise ValueError("Expected mono audio as a 1D numpy array.") + # Reset the flag for final transcriptions to allow new processing + if is_final: + self.final_transcription_pending = False + transcription_start = time.time() transcription_type = "final" if is_final else "streaming" @@ -2047,7 +2058,396 @@ def _resample_audio( # Public API functions def agent_info() -> Dict[str, str]: - return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true"} + return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true", "configurable": "true"} + + +def get_config_schema() -> Dict[str, Any]: + """Get the configuration schema for the Whisper bot""" + return { + "bot_name": AGENT_NAME, + "version": "1.0", + "parameters": [ + { + "name": "model_id", + "type": "select", + "label": "Whisper Model", + "description": "The Whisper model to use for transcription", + "default_value": _model_id, + "required": True, + "options": [ + {"value": "distil-whisper/distil-large-v2", "label": "Distil-Whisper Large v2 (Fast)"}, + {"value": "distil-whisper/distil-medium.en", "label": "Distil-Whisper Medium EN"}, + {"value": "distil-whisper/distil-small.en", "label": "Distil-Whisper Small EN"}, + {"value": "openai/whisper-large-v3", "label": "OpenAI Whisper Large v3"}, + {"value": "openai/whisper-large-v2", "label": "OpenAI Whisper Large v2"}, + {"value": "openai/whisper-large", "label": "OpenAI Whisper Large"}, + {"value": "openai/whisper-medium", "label": "OpenAI Whisper Medium"}, + {"value": "openai/whisper-small", "label": "OpenAI Whisper Small"}, + {"value": "openai/whisper-base", "label": "OpenAI Whisper Base"}, + {"value": "openai/whisper-tiny", "label": "OpenAI Whisper Tiny"} + ] + }, + { + "name": "device", + "type": "select", + "label": "Inference Device", + "description": "The device to run inference on", + "default_value": _device, + "required": True, + "options": [ + {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"}, + {"value": "GPU", "label": "GPU"}, + {"value": "CPU", "label": "CPU"} + ] + }, + { + "name": "enable_quantization", + "type": "boolean", + "label": "Enable Quantization", + "description": "Enable INT8 quantization for faster inference", + "default_value": _ov_config.enable_quantization, + "required": False + }, + { + "name": "throughput_streams", + "type": "range", + "label": "Throughput Streams", + "description": "Number of parallel inference streams", + "default_value": _ov_config.throughput_streams, + "required": False, + "min_value": 1, + "max_value": 8, + "step": 1 + }, + { + "name": "max_threads", + "type": "range", + "label": "Max CPU Threads", + "description": "Maximum number of CPU threads for inference", + "default_value": _ov_config.max_threads, + "required": False, + "min_value": 1, + "max_value": 16, + "step": 1 + }, + { + "name": "sample_rate", + "type": "number", + "label": "Sample Rate", + "description": "Audio sample rate in Hz", + "default_value": SAMPLE_RATE, + "required": True, + "min_value": 8000, + "max_value": 48000 + }, + { + "name": "chunk_duration_ms", + "type": "range", + "label": "Chunk Duration (ms)", + "description": "Duration of audio chunks in milliseconds", + "default_value": CHUNK_DURATION_MS, + "required": False, + "min_value": 50, + "max_value": 500, + "step": 10 + }, + { + "name": "vad_threshold", + "type": "range", + "label": "VAD Threshold", + "description": "Voice activity detection threshold", + "default_value": VAD_THRESHOLD, + "required": False, + "min_value": 0.001, + "max_value": 0.1, + "step": 0.001 + }, + { + "name": "max_silence_frames", + "type": "range", + "label": "Max Silence Frames", + "description": "Maximum frames of silence before stopping", + "default_value": MAX_SILENCE_FRAMES, + "required": False, + "min_value": 10, + "max_value": 100, + "step": 5 + }, + { + "name": "max_trailing_silence_frames", + "type": "range", + "label": "Max Trailing Silence Frames", + "description": "Maximum trailing silence frames to include", + "default_value": MAX_TRAILING_SILENCE_FRAMES, + "required": False, + "min_value": 1, + "max_value": 20, + "step": 1 + }, + { + "name": "vad_energy_threshold", + "type": "range", + "label": "VAD Energy Threshold", + "description": "Energy threshold for voice activity detection", + "default_value": 0.005, + "required": False, + "min_value": 0.001, + "max_value": 0.05, + "step": 0.001 + }, + { + "name": "vad_zcr_min", + "type": "range", + "label": "VAD ZCR Min", + "description": "Minimum zero-crossing rate for speech", + "default_value": 0.02, + "required": False, + "min_value": 0.01, + "max_value": 0.5, + "step": 0.01 + }, + { + "name": "vad_zcr_max", + "type": "range", + "label": "VAD ZCR Max", + "description": "Maximum zero-crossing rate for speech", + "default_value": 0.8, + "required": False, + "min_value": 0.1, + "max_value": 1.0, + "step": 0.05 + }, + { + "name": "vad_spectral_centroid_min", + "type": "range", + "label": "VAD Spectral Centroid Min", + "description": "Minimum spectral centroid for speech", + "default_value": 200, + "required": False, + "min_value": 50, + "max_value": 1000, + "step": 50 + }, + { + "name": "vad_spectral_centroid_max", + "type": "range", + "label": "VAD Spectral Centroid Max", + "description": "Maximum spectral centroid for speech", + "default_value": 4000, + "required": False, + "min_value": 1000, + "max_value": 8000, + "step": 500 + }, + { + "name": "vad_spectral_rolloff_threshold", + "type": "range", + "label": "VAD Spectral Rolloff Threshold", + "description": "Spectral rolloff threshold for speech detection", + "default_value": 3000, + "required": False, + "min_value": 1000, + "max_value": 10000, + "step": 500 + }, + { + "name": "vad_minimum_duration", + "type": "range", + "label": "VAD Minimum Duration", + "description": "Minimum duration for speech segments", + "default_value": 0.2, + "required": False, + "min_value": 0.1, + "max_value": 1.0, + "step": 0.1 + }, + { + "name": "vad_max_history", + "type": "range", + "label": "VAD Max History", + "description": "Maximum history frames for temporal consistency", + "default_value": 8, + "required": False, + "min_value": 4, + "max_value": 20, + "step": 1 + }, + { + "name": "vad_noise_floor_energy", + "type": "range", + "label": "VAD Noise Floor Energy", + "description": "Initial noise floor energy level", + "default_value": 0.001, + "required": False, + "min_value": 0.0001, + "max_value": 0.01, + "step": 0.0001 + }, + { + "name": "vad_adaptation_rate", + "type": "range", + "label": "VAD Adaptation Rate", + "description": "Rate of noise floor adaptation", + "default_value": 0.05, + "required": False, + "min_value": 0.01, + "max_value": 0.2, + "step": 0.01 + }, + { + "name": "vad_harmonic_threshold", + "type": "range", + "label": "VAD Harmonic Threshold", + "description": "Threshold for harmonic content detection", + "default_value": 0.15, + "required": False, + "min_value": 0.05, + "max_value": 0.5, + "step": 0.05 + } + ], + "categories": [ + {"Model Settings": ["model_id", "device", "enable_quantization"]}, + {"Performance Settings": ["throughput_streams", "max_threads"]}, + {"Audio Settings": ["sample_rate", "chunk_duration_ms"]}, + {"Voice Activity Detection": ["vad_threshold", "max_silence_frames", "max_trailing_silence_frames", "vad_energy_threshold", "vad_zcr_min", "vad_zcr_max", "vad_spectral_centroid_min", "vad_spectral_centroid_max", "vad_spectral_rolloff_threshold", "vad_minimum_duration", "vad_max_history", "vad_noise_floor_energy", "vad_adaptation_rate", "vad_harmonic_threshold"]} + ] + } + + +def handle_config_update(lobby_id: str, config_values: Dict[str, Any]) -> bool: + """Handle configuration update for a specific lobby""" + global _model_id, _device, _ov_config, SAMPLE_RATE, CHUNK_DURATION_MS, VAD_THRESHOLD + global MAX_SILENCE_FRAMES, MAX_TRAILING_SILENCE_FRAMES + + try: + logger.info(f"Updating Whisper config for lobby {lobby_id}: {config_values}") + + config_applied = False + + # Update model configuration + if "model_id" in config_values: + new_model_id = config_values["model_id"] + if new_model_id in [model for models in model_ids.values() for model in models]: + _model_id = new_model_id + config_applied = True + logger.info(f"Updated model_id to: {_model_id}") + else: + logger.warning(f"Invalid model_id: {new_model_id}") + + # Update device configuration + if "device" in config_values: + new_device = config_values["device"] + available_devices = [d["name"] for d in get_available_devices()] + if new_device in available_devices or new_device in ["CPU", "GPU", "GPU.1"]: + _device = new_device + _ov_config.device = new_device + config_applied = True + logger.info(f"Updated device to: {_device}") + else: + logger.warning(f"Invalid device: {new_device}, available: {available_devices}") + + # Update OpenVINO configuration + if "enable_quantization" in config_values: + _ov_config.enable_quantization = bool(config_values["enable_quantization"]) + config_applied = True + logger.info(f"Updated quantization to: {_ov_config.enable_quantization}") + + if "throughput_streams" in config_values: + streams = int(config_values["throughput_streams"]) + if 1 <= streams <= 8: + _ov_config.throughput_streams = streams + config_applied = True + logger.info(f"Updated throughput_streams to: {_ov_config.throughput_streams}") + + if "max_threads" in config_values: + threads = int(config_values["max_threads"]) + if 1 <= threads <= 16: + _ov_config.max_threads = threads + config_applied = True + logger.info(f"Updated max_threads to: {_ov_config.max_threads}") + + # Update audio processing parameters + if "sample_rate" in config_values: + rate = int(config_values["sample_rate"]) + if 8000 <= rate <= 48000: + SAMPLE_RATE = rate + config_applied = True + logger.info(f"Updated sample_rate to: {SAMPLE_RATE}") + + if "chunk_duration_ms" in config_values: + duration = int(config_values["chunk_duration_ms"]) + if 50 <= duration <= 500: + CHUNK_DURATION_MS = duration + config_applied = True + logger.info(f"Updated chunk_duration_ms to: {CHUNK_DURATION_MS}") + + if "vad_threshold" in config_values: + threshold = float(config_values["vad_threshold"]) + if 0.001 <= threshold <= 0.1: + VAD_THRESHOLD = threshold + config_applied = True + logger.info(f"Updated vad_threshold to: {VAD_THRESHOLD}") + + if "max_silence_frames" in config_values: + frames = int(config_values["max_silence_frames"]) + if 10 <= frames <= 100: + MAX_SILENCE_FRAMES = frames + config_applied = True + logger.info(f"Updated max_silence_frames to: {MAX_SILENCE_FRAMES}") + + if "max_trailing_silence_frames" in config_values: + frames = int(config_values["max_trailing_silence_frames"]) + if 1 <= frames <= 20: + MAX_TRAILING_SILENCE_FRAMES = frames + config_applied = True + logger.info(f"Updated max_trailing_silence_frames to: {MAX_TRAILING_SILENCE_FRAMES}") + + # Update VAD configuration (this would require updating existing processors) + vad_updates = {} + if "vad_energy_threshold" in config_values: + vad_updates["energy_threshold"] = float(config_values["vad_energy_threshold"]) + if "vad_zcr_min" in config_values: + vad_updates["zcr_min"] = float(config_values["vad_zcr_min"]) + if "vad_zcr_max" in config_values: + vad_updates["zcr_max"] = float(config_values["vad_zcr_max"]) + if "vad_spectral_centroid_min" in config_values: + vad_updates["spectral_centroid_min"] = float(config_values["vad_spectral_centroid_min"]) + if "vad_spectral_centroid_max" in config_values: + vad_updates["spectral_centroid_max"] = float(config_values["vad_spectral_centroid_max"]) + if "vad_spectral_rolloff_threshold" in config_values: + vad_updates["spectral_rolloff_threshold"] = float(config_values["vad_spectral_rolloff_threshold"]) + if "vad_minimum_duration" in config_values: + vad_updates["minimum_duration"] = float(config_values["vad_minimum_duration"]) + if "vad_max_history" in config_values: + vad_updates["max_history"] = int(config_values["vad_max_history"]) + if "vad_noise_floor_energy" in config_values: + vad_updates["noise_floor_energy"] = float(config_values["vad_noise_floor_energy"]) + if "vad_adaptation_rate" in config_values: + vad_updates["adaptation_rate"] = float(config_values["vad_adaptation_rate"]) + if "vad_harmonic_threshold" in config_values: + vad_updates["harmonic_threshold"] = float(config_values["vad_harmonic_threshold"]) + + if vad_updates: + # Update VAD_CONFIG global + VAD_CONFIG.update(vad_updates) + config_applied = True + logger.info(f"Updated VAD config: {vad_updates}") + + # Note: Existing processors would need to be recreated to pick up VAD changes + # For now, we'll log that a restart may be needed + logger.info("VAD configuration updated - existing processors may need restart to take effect") + + if config_applied: + logger.info(f"Configuration update completed for lobby {lobby_id}") + else: + logger.warning(f"No valid configuration changes applied for lobby {lobby_id}") + + return config_applied + + except Exception as e: + logger.error(f"Failed to apply Whisper config update: {e}") + return False def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: