Fixing transcode

2025-09-15 14:30:16 -07:00 · 2025-09-15 14:30:16 -07:00 · b2169da4cd
commit b2169da4cd
parent 825544002e
2 changed files with 439 additions and 32 deletions
--- a/voicebot/bot_orchestrator.py
+++ b/voicebot/bot_orchestrator.py
@ -8,6 +8,7 @@ import asyncio
 import threading
 import uuid
 import importlib
+import inspect
 import pkgutil
 import sys
 import os
@ -346,7 +347,10 @@ async def bot_join(bot_name: str, req: JoinRequest):
    if req.config_values and config_handler:
        try:
            logger.info(f"Applying existing configuration to bot {bot_name}")
-            success = await config_handler(req.lobby_id, req.config_values)
+            if inspect.iscoroutinefunction(config_handler):
+                success = await config_handler(req.lobby_id, req.config_values)
+            else:
+                success = config_handler(req.lobby_id, req.config_values)
            if success:
                logger.info(f"Successfully applied existing configuration to bot {bot_name}")
            else:
@ -451,7 +455,10 @@ async def update_bot_config(bot_name: str, config_data: dict[str, Any]) -> dict[
            raise HTTPException(status_code=400, detail="lobby_id is required")
        
        # Call the bot's configuration handler
-        success = await config_handler(lobby_id, config_values)
+        if inspect.iscoroutinefunction(config_handler):
+            success = await config_handler(lobby_id, config_values)
+        else:
+            success = config_handler(lobby_id, config_values)
        
        if success:
            return {"success": True, "message": "Configuration updated successfully"}
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@ -1115,6 +1115,7 @@ class OptimizedAudioProcessor:
        self.transcription_history: List[TranscriptionHistoryItem] = []
        self.last_activity_time = time.time()
        self.last_audio_time = time.time()  # Track when any audio chunk is received
+        self.final_transcription_pending = False  # Flag to prevent accumulating audio during final transcription

        # Current transcription message for refinements
        self.current_message: Optional[ChatMessageModel] = None
@ -1172,6 +1173,7 @@ class OptimizedAudioProcessor:
        if is_speech:
            self.silence_frames = 0
            self.last_activity_time = time.time()
+            self.final_transcription_pending = False  # Reset flag when new speech is detected
            self._add_to_circular_buffer(audio_data)
        elif (len(self.current_phrase_audio) > 0 and 
              self.silence_frames < self.max_trailing_silence_frames):
@ -1262,6 +1264,7 @@ class OptimizedAudioProcessor:
        ):  # At least 0.5 seconds
            if self.main_loop:
                logger.info(f"Queueing final transcription for {self.peer_name}")
+                self.final_transcription_pending = True
                asyncio.create_task(
                    self._transcribe_and_send(
                        self.current_phrase_audio.copy(), is_final=True
@ -1282,24 +1285,25 @@ class OptimizedAudioProcessor:
                )

                # Add to current phrase
-                self.current_phrase_audio = np.concatenate(
-                    [self.current_phrase_audio, audio_item.audio]
-                )
-
-                # Check if we should transcribe
-                phrase_duration = len(self.current_phrase_audio) / self.sample_rate
-
-                if phrase_duration >= 1.0:  # Transcribe every 1 second
-                    logger.info(f"Transcribing for {self.peer_name} (1s interval)")
-                    await self._transcribe_and_send(
-                        self.current_phrase_audio.copy(), is_final=False
+                if not self.final_transcription_pending:
+                    self.current_phrase_audio = np.concatenate(
+                        [self.current_phrase_audio, audio_item.audio]
                    )

+                    # Check if we should transcribe
+                    phrase_duration = len(self.current_phrase_audio) / self.sample_rate
+
+                    if phrase_duration >= 1.0:  # Transcribe every 1 second
+                        logger.info(f"Transcribing for {self.peer_name} (1s interval)")
+                        await self._transcribe_and_send(
+                            self.current_phrase_audio.copy(), is_final=False
+                        )
+
            except asyncio.TimeoutError:
                # Check for final transcription on timeout
                if (
                    len(self.current_phrase_audio) > 0
-                    and time.time() - self.last_audio_time > 2.0
+                    and time.time() - self.last_activity_time > 2.0
                ):
                    logger.info(
                        f"Final transcription timeout for {self.peer_name} (asyncio.TimeoutError)"
@ -1308,6 +1312,7 @@ class OptimizedAudioProcessor:
                        self.current_phrase_audio.copy(), is_final=True
                    )
                    self.current_phrase_audio = np.array([], dtype=np.float32)
+                    self.final_transcription_pending = False  # Reset the flag
            except Exception as e:
                logger.error(
                    f"Error in async processing loop for {self.peer_name}: {e}"
@ -1325,30 +1330,31 @@ class OptimizedAudioProcessor:
                audio_item = self._threading_queue.get(timeout=1.0)

                # Add to current phrase
-                self.current_phrase_audio = np.concatenate(
-                    [self.current_phrase_audio, audio_item.audio]
-                )
+                if not self.final_transcription_pending:
+                    self.current_phrase_audio = np.concatenate(
+                        [self.current_phrase_audio, audio_item.audio]
+                    )

-                # Check if we should transcribe
-                phrase_duration = len(self.current_phrase_audio) / self.sample_rate
+                    # Check if we should transcribe
+                    phrase_duration = len(self.current_phrase_audio) / self.sample_rate

-                if phrase_duration >= 1.0:
-                    if self.main_loop:
-                        logger.info(
-                            f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)"
-                        )
-                        asyncio.run_coroutine_threadsafe(
-                            self._transcribe_and_send(
-                                self.current_phrase_audio.copy(), is_final=False
-                            ),
-                            self.main_loop,
-                        )
+                    if phrase_duration >= 1.0:
+                        if self.main_loop:
+                            logger.info(
+                                f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)"
+                            )
+                            asyncio.run_coroutine_threadsafe(
+                                self._transcribe_and_send(
+                                    self.current_phrase_audio.copy(), is_final=False
+                                ),
+                                self.main_loop,
+                            )

            except Empty:
                # Check for final transcription
                if (
                    len(self.current_phrase_audio) > 0
-                    and time.time() - self.last_audio_time > 2.0
+                    and time.time() - self.last_activity_time > 2.0
                ):
                    if self.main_loop:
                        logger.info(
@ -1361,6 +1367,7 @@ class OptimizedAudioProcessor:
                            self.main_loop,
                        )
                    self.current_phrase_audio = np.array([], dtype=np.float32)
+                    self.final_transcription_pending = False  # Reset the flag
            except Exception as e:
                logger.error(
                    f"Error in thread processing loop for {self.peer_name}: {e}"
@ -1380,6 +1387,10 @@ class OptimizedAudioProcessor:
        if audio_array.ndim != 1:
            raise ValueError("Expected mono audio as a 1D numpy array.")

+        # Reset the flag for final transcriptions to allow new processing
+        if is_final:
+            self.final_transcription_pending = False
+
        transcription_start = time.time()
        transcription_type = "final" if is_final else "streaming"

@ -2047,7 +2058,396 @@ def _resample_audio(

 # Public API functions
 def agent_info() -> Dict[str, str]:
-    return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true"}
+    return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true", "configurable": "true"}
+
+
+def get_config_schema() -> Dict[str, Any]:
+    """Get the configuration schema for the Whisper bot"""
+    return {
+        "bot_name": AGENT_NAME,
+        "version": "1.0",
+        "parameters": [
+            {
+                "name": "model_id",
+                "type": "select",
+                "label": "Whisper Model",
+                "description": "The Whisper model to use for transcription",
+                "default_value": _model_id,
+                "required": True,
+                "options": [
+                    {"value": "distil-whisper/distil-large-v2", "label": "Distil-Whisper Large v2 (Fast)"},
+                    {"value": "distil-whisper/distil-medium.en", "label": "Distil-Whisper Medium EN"},
+                    {"value": "distil-whisper/distil-small.en", "label": "Distil-Whisper Small EN"},
+                    {"value": "openai/whisper-large-v3", "label": "OpenAI Whisper Large v3"},
+                    {"value": "openai/whisper-large-v2", "label": "OpenAI Whisper Large v2"},
+                    {"value": "openai/whisper-large", "label": "OpenAI Whisper Large"},
+                    {"value": "openai/whisper-medium", "label": "OpenAI Whisper Medium"},
+                    {"value": "openai/whisper-small", "label": "OpenAI Whisper Small"},
+                    {"value": "openai/whisper-base", "label": "OpenAI Whisper Base"},
+                    {"value": "openai/whisper-tiny", "label": "OpenAI Whisper Tiny"}
+                ]
+            },
+            {
+                "name": "device",
+                "type": "select",
+                "label": "Inference Device",
+                "description": "The device to run inference on",
+                "default_value": _device,
+                "required": True,
+                "options": [
+                    {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"},
+                    {"value": "GPU", "label": "GPU"},
+                    {"value": "CPU", "label": "CPU"}
+                ]
+            },
+            {
+                "name": "enable_quantization",
+                "type": "boolean",
+                "label": "Enable Quantization",
+                "description": "Enable INT8 quantization for faster inference",
+                "default_value": _ov_config.enable_quantization,
+                "required": False
+            },
+            {
+                "name": "throughput_streams",
+                "type": "range",
+                "label": "Throughput Streams",
+                "description": "Number of parallel inference streams",
+                "default_value": _ov_config.throughput_streams,
+                "required": False,
+                "min_value": 1,
+                "max_value": 8,
+                "step": 1
+            },
+            {
+                "name": "max_threads",
+                "type": "range",
+                "label": "Max CPU Threads",
+                "description": "Maximum number of CPU threads for inference",
+                "default_value": _ov_config.max_threads,
+                "required": False,
+                "min_value": 1,
+                "max_value": 16,
+                "step": 1
+            },
+            {
+                "name": "sample_rate",
+                "type": "number",
+                "label": "Sample Rate",
+                "description": "Audio sample rate in Hz",
+                "default_value": SAMPLE_RATE,
+                "required": True,
+                "min_value": 8000,
+                "max_value": 48000
+            },
+            {
+                "name": "chunk_duration_ms",
+                "type": "range",
+                "label": "Chunk Duration (ms)",
+                "description": "Duration of audio chunks in milliseconds",
+                "default_value": CHUNK_DURATION_MS,
+                "required": False,
+                "min_value": 50,
+                "max_value": 500,
+                "step": 10
+            },
+            {
+                "name": "vad_threshold",
+                "type": "range",
+                "label": "VAD Threshold",
+                "description": "Voice activity detection threshold",
+                "default_value": VAD_THRESHOLD,
+                "required": False,
+                "min_value": 0.001,
+                "max_value": 0.1,
+                "step": 0.001
+            },
+            {
+                "name": "max_silence_frames",
+                "type": "range",
+                "label": "Max Silence Frames",
+                "description": "Maximum frames of silence before stopping",
+                "default_value": MAX_SILENCE_FRAMES,
+                "required": False,
+                "min_value": 10,
+                "max_value": 100,
+                "step": 5
+            },
+            {
+                "name": "max_trailing_silence_frames",
+                "type": "range",
+                "label": "Max Trailing Silence Frames",
+                "description": "Maximum trailing silence frames to include",
+                "default_value": MAX_TRAILING_SILENCE_FRAMES,
+                "required": False,
+                "min_value": 1,
+                "max_value": 20,
+                "step": 1
+            },
+            {
+                "name": "vad_energy_threshold",
+                "type": "range",
+                "label": "VAD Energy Threshold",
+                "description": "Energy threshold for voice activity detection",
+                "default_value": 0.005,
+                "required": False,
+                "min_value": 0.001,
+                "max_value": 0.05,
+                "step": 0.001
+            },
+            {
+                "name": "vad_zcr_min",
+                "type": "range",
+                "label": "VAD ZCR Min",
+                "description": "Minimum zero-crossing rate for speech",
+                "default_value": 0.02,
+                "required": False,
+                "min_value": 0.01,
+                "max_value": 0.5,
+                "step": 0.01
+            },
+            {
+                "name": "vad_zcr_max",
+                "type": "range",
+                "label": "VAD ZCR Max",
+                "description": "Maximum zero-crossing rate for speech",
+                "default_value": 0.8,
+                "required": False,
+                "min_value": 0.1,
+                "max_value": 1.0,
+                "step": 0.05
+            },
+            {
+                "name": "vad_spectral_centroid_min",
+                "type": "range",
+                "label": "VAD Spectral Centroid Min",
+                "description": "Minimum spectral centroid for speech",
+                "default_value": 200,
+                "required": False,
+                "min_value": 50,
+                "max_value": 1000,
+                "step": 50
+            },
+            {
+                "name": "vad_spectral_centroid_max",
+                "type": "range",
+                "label": "VAD Spectral Centroid Max",
+                "description": "Maximum spectral centroid for speech",
+                "default_value": 4000,
+                "required": False,
+                "min_value": 1000,
+                "max_value": 8000,
+                "step": 500
+            },
+            {
+                "name": "vad_spectral_rolloff_threshold",
+                "type": "range",
+                "label": "VAD Spectral Rolloff Threshold",
+                "description": "Spectral rolloff threshold for speech detection",
+                "default_value": 3000,
+                "required": False,
+                "min_value": 1000,
+                "max_value": 10000,
+                "step": 500
+            },
+            {
+                "name": "vad_minimum_duration",
+                "type": "range",
+                "label": "VAD Minimum Duration",
+                "description": "Minimum duration for speech segments",
+                "default_value": 0.2,
+                "required": False,
+                "min_value": 0.1,
+                "max_value": 1.0,
+                "step": 0.1
+            },
+            {
+                "name": "vad_max_history",
+                "type": "range",
+                "label": "VAD Max History",
+                "description": "Maximum history frames for temporal consistency",
+                "default_value": 8,
+                "required": False,
+                "min_value": 4,
+                "max_value": 20,
+                "step": 1
+            },
+            {
+                "name": "vad_noise_floor_energy",
+                "type": "range",
+                "label": "VAD Noise Floor Energy",
+                "description": "Initial noise floor energy level",
+                "default_value": 0.001,
+                "required": False,
+                "min_value": 0.0001,
+                "max_value": 0.01,
+                "step": 0.0001
+            },
+            {
+                "name": "vad_adaptation_rate",
+                "type": "range",
+                "label": "VAD Adaptation Rate",
+                "description": "Rate of noise floor adaptation",
+                "default_value": 0.05,
+                "required": False,
+                "min_value": 0.01,
+                "max_value": 0.2,
+                "step": 0.01
+            },
+            {
+                "name": "vad_harmonic_threshold",
+                "type": "range",
+                "label": "VAD Harmonic Threshold",
+                "description": "Threshold for harmonic content detection",
+                "default_value": 0.15,
+                "required": False,
+                "min_value": 0.05,
+                "max_value": 0.5,
+                "step": 0.05
+            }
+        ],
+        "categories": [
+            {"Model Settings": ["model_id", "device", "enable_quantization"]},
+            {"Performance Settings": ["throughput_streams", "max_threads"]},
+            {"Audio Settings": ["sample_rate", "chunk_duration_ms"]},
+            {"Voice Activity Detection": ["vad_threshold", "max_silence_frames", "max_trailing_silence_frames", "vad_energy_threshold", "vad_zcr_min", "vad_zcr_max", "vad_spectral_centroid_min", "vad_spectral_centroid_max", "vad_spectral_rolloff_threshold", "vad_minimum_duration", "vad_max_history", "vad_noise_floor_energy", "vad_adaptation_rate", "vad_harmonic_threshold"]}
+        ]
+    }
+
+
+def handle_config_update(lobby_id: str, config_values: Dict[str, Any]) -> bool:
+    """Handle configuration update for a specific lobby"""
+    global _model_id, _device, _ov_config, SAMPLE_RATE, CHUNK_DURATION_MS, VAD_THRESHOLD
+    global MAX_SILENCE_FRAMES, MAX_TRAILING_SILENCE_FRAMES
+    
+    try:
+        logger.info(f"Updating Whisper config for lobby {lobby_id}: {config_values}")
+        
+        config_applied = False
+        
+        # Update model configuration
+        if "model_id" in config_values:
+            new_model_id = config_values["model_id"]
+            if new_model_id in [model for models in model_ids.values() for model in models]:
+                _model_id = new_model_id
+                config_applied = True
+                logger.info(f"Updated model_id to: {_model_id}")
+            else:
+                logger.warning(f"Invalid model_id: {new_model_id}")
+        
+        # Update device configuration
+        if "device" in config_values:
+            new_device = config_values["device"]
+            available_devices = [d["name"] for d in get_available_devices()]
+            if new_device in available_devices or new_device in ["CPU", "GPU", "GPU.1"]:
+                _device = new_device
+                _ov_config.device = new_device
+                config_applied = True
+                logger.info(f"Updated device to: {_device}")
+            else:
+                logger.warning(f"Invalid device: {new_device}, available: {available_devices}")
+        
+        # Update OpenVINO configuration
+        if "enable_quantization" in config_values:
+            _ov_config.enable_quantization = bool(config_values["enable_quantization"])
+            config_applied = True
+            logger.info(f"Updated quantization to: {_ov_config.enable_quantization}")
+        
+        if "throughput_streams" in config_values:
+            streams = int(config_values["throughput_streams"])
+            if 1 <= streams <= 8:
+                _ov_config.throughput_streams = streams
+                config_applied = True
+                logger.info(f"Updated throughput_streams to: {_ov_config.throughput_streams}")
+        
+        if "max_threads" in config_values:
+            threads = int(config_values["max_threads"])
+            if 1 <= threads <= 16:
+                _ov_config.max_threads = threads
+                config_applied = True
+                logger.info(f"Updated max_threads to: {_ov_config.max_threads}")
+        
+        # Update audio processing parameters
+        if "sample_rate" in config_values:
+            rate = int(config_values["sample_rate"])
+            if 8000 <= rate <= 48000:
+                SAMPLE_RATE = rate
+                config_applied = True
+                logger.info(f"Updated sample_rate to: {SAMPLE_RATE}")
+        
+        if "chunk_duration_ms" in config_values:
+            duration = int(config_values["chunk_duration_ms"])
+            if 50 <= duration <= 500:
+                CHUNK_DURATION_MS = duration
+                config_applied = True
+                logger.info(f"Updated chunk_duration_ms to: {CHUNK_DURATION_MS}")
+        
+        if "vad_threshold" in config_values:
+            threshold = float(config_values["vad_threshold"])
+            if 0.001 <= threshold <= 0.1:
+                VAD_THRESHOLD = threshold
+                config_applied = True
+                logger.info(f"Updated vad_threshold to: {VAD_THRESHOLD}")
+        
+        if "max_silence_frames" in config_values:
+            frames = int(config_values["max_silence_frames"])
+            if 10 <= frames <= 100:
+                MAX_SILENCE_FRAMES = frames
+                config_applied = True
+                logger.info(f"Updated max_silence_frames to: {MAX_SILENCE_FRAMES}")
+        
+        if "max_trailing_silence_frames" in config_values:
+            frames = int(config_values["max_trailing_silence_frames"])
+            if 1 <= frames <= 20:
+                MAX_TRAILING_SILENCE_FRAMES = frames
+                config_applied = True
+                logger.info(f"Updated max_trailing_silence_frames to: {MAX_TRAILING_SILENCE_FRAMES}")
+        
+        # Update VAD configuration (this would require updating existing processors)
+        vad_updates = {}
+        if "vad_energy_threshold" in config_values:
+            vad_updates["energy_threshold"] = float(config_values["vad_energy_threshold"])
+        if "vad_zcr_min" in config_values:
+            vad_updates["zcr_min"] = float(config_values["vad_zcr_min"])
+        if "vad_zcr_max" in config_values:
+            vad_updates["zcr_max"] = float(config_values["vad_zcr_max"])
+        if "vad_spectral_centroid_min" in config_values:
+            vad_updates["spectral_centroid_min"] = float(config_values["vad_spectral_centroid_min"])
+        if "vad_spectral_centroid_max" in config_values:
+            vad_updates["spectral_centroid_max"] = float(config_values["vad_spectral_centroid_max"])
+        if "vad_spectral_rolloff_threshold" in config_values:
+            vad_updates["spectral_rolloff_threshold"] = float(config_values["vad_spectral_rolloff_threshold"])
+        if "vad_minimum_duration" in config_values:
+            vad_updates["minimum_duration"] = float(config_values["vad_minimum_duration"])
+        if "vad_max_history" in config_values:
+            vad_updates["max_history"] = int(config_values["vad_max_history"])
+        if "vad_noise_floor_energy" in config_values:
+            vad_updates["noise_floor_energy"] = float(config_values["vad_noise_floor_energy"])
+        if "vad_adaptation_rate" in config_values:
+            vad_updates["adaptation_rate"] = float(config_values["vad_adaptation_rate"])
+        if "vad_harmonic_threshold" in config_values:
+            vad_updates["harmonic_threshold"] = float(config_values["vad_harmonic_threshold"])
+        
+        if vad_updates:
+            # Update VAD_CONFIG global
+            VAD_CONFIG.update(vad_updates)
+            config_applied = True
+            logger.info(f"Updated VAD config: {vad_updates}")
+            
+            # Note: Existing processors would need to be recreated to pick up VAD changes
+            # For now, we'll log that a restart may be needed
+            logger.info("VAD configuration updated - existing processors may need restart to take effect")
+        
+        if config_applied:
+            logger.info(f"Configuration update completed for lobby {lobby_id}")
+        else:
+            logger.warning(f"No valid configuration changes applied for lobby {lobby_id}")
+        
+        return config_applied
+        
+    except Exception as e:
+        logger.error(f"Failed to apply Whisper config update: {e}")
+        return False


 def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: