Fixing transcode

This commit is contained in:
James Ketr 2025-09-15 14:30:16 -07:00
parent 825544002e
commit b2169da4cd
2 changed files with 439 additions and 32 deletions

View File

@ -8,6 +8,7 @@ import asyncio
import threading
import uuid
import importlib
import inspect
import pkgutil
import sys
import os
@ -346,7 +347,10 @@ async def bot_join(bot_name: str, req: JoinRequest):
if req.config_values and config_handler:
try:
logger.info(f"Applying existing configuration to bot {bot_name}")
success = await config_handler(req.lobby_id, req.config_values)
if inspect.iscoroutinefunction(config_handler):
success = await config_handler(req.lobby_id, req.config_values)
else:
success = config_handler(req.lobby_id, req.config_values)
if success:
logger.info(f"Successfully applied existing configuration to bot {bot_name}")
else:
@ -451,7 +455,10 @@ async def update_bot_config(bot_name: str, config_data: dict[str, Any]) -> dict[
raise HTTPException(status_code=400, detail="lobby_id is required")
# Call the bot's configuration handler
success = await config_handler(lobby_id, config_values)
if inspect.iscoroutinefunction(config_handler):
success = await config_handler(lobby_id, config_values)
else:
success = config_handler(lobby_id, config_values)
if success:
return {"success": True, "message": "Configuration updated successfully"}

View File

@ -1115,6 +1115,7 @@ class OptimizedAudioProcessor:
self.transcription_history: List[TranscriptionHistoryItem] = []
self.last_activity_time = time.time()
self.last_audio_time = time.time() # Track when any audio chunk is received
self.final_transcription_pending = False # Flag to prevent accumulating audio during final transcription
# Current transcription message for refinements
self.current_message: Optional[ChatMessageModel] = None
@ -1172,6 +1173,7 @@ class OptimizedAudioProcessor:
if is_speech:
self.silence_frames = 0
self.last_activity_time = time.time()
self.final_transcription_pending = False # Reset flag when new speech is detected
self._add_to_circular_buffer(audio_data)
elif (len(self.current_phrase_audio) > 0 and
self.silence_frames < self.max_trailing_silence_frames):
@ -1262,6 +1264,7 @@ class OptimizedAudioProcessor:
): # At least 0.5 seconds
if self.main_loop:
logger.info(f"Queueing final transcription for {self.peer_name}")
self.final_transcription_pending = True
asyncio.create_task(
self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=True
@ -1282,24 +1285,25 @@ class OptimizedAudioProcessor:
)
# Add to current phrase
self.current_phrase_audio = np.concatenate(
[self.current_phrase_audio, audio_item.audio]
)
# Check if we should transcribe
phrase_duration = len(self.current_phrase_audio) / self.sample_rate
if phrase_duration >= 1.0: # Transcribe every 1 second
logger.info(f"Transcribing for {self.peer_name} (1s interval)")
await self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=False
if not self.final_transcription_pending:
self.current_phrase_audio = np.concatenate(
[self.current_phrase_audio, audio_item.audio]
)
# Check if we should transcribe
phrase_duration = len(self.current_phrase_audio) / self.sample_rate
if phrase_duration >= 1.0: # Transcribe every 1 second
logger.info(f"Transcribing for {self.peer_name} (1s interval)")
await self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=False
)
except asyncio.TimeoutError:
# Check for final transcription on timeout
if (
len(self.current_phrase_audio) > 0
and time.time() - self.last_audio_time > 2.0
and time.time() - self.last_activity_time > 2.0
):
logger.info(
f"Final transcription timeout for {self.peer_name} (asyncio.TimeoutError)"
@ -1308,6 +1312,7 @@ class OptimizedAudioProcessor:
self.current_phrase_audio.copy(), is_final=True
)
self.current_phrase_audio = np.array([], dtype=np.float32)
self.final_transcription_pending = False # Reset the flag
except Exception as e:
logger.error(
f"Error in async processing loop for {self.peer_name}: {e}"
@ -1325,30 +1330,31 @@ class OptimizedAudioProcessor:
audio_item = self._threading_queue.get(timeout=1.0)
# Add to current phrase
self.current_phrase_audio = np.concatenate(
[self.current_phrase_audio, audio_item.audio]
)
if not self.final_transcription_pending:
self.current_phrase_audio = np.concatenate(
[self.current_phrase_audio, audio_item.audio]
)
# Check if we should transcribe
phrase_duration = len(self.current_phrase_audio) / self.sample_rate
# Check if we should transcribe
phrase_duration = len(self.current_phrase_audio) / self.sample_rate
if phrase_duration >= 1.0:
if self.main_loop:
logger.info(
f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)"
)
asyncio.run_coroutine_threadsafe(
self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=False
),
self.main_loop,
)
if phrase_duration >= 1.0:
if self.main_loop:
logger.info(
f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)"
)
asyncio.run_coroutine_threadsafe(
self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=False
),
self.main_loop,
)
except Empty:
# Check for final transcription
if (
len(self.current_phrase_audio) > 0
and time.time() - self.last_audio_time > 2.0
and time.time() - self.last_activity_time > 2.0
):
if self.main_loop:
logger.info(
@ -1361,6 +1367,7 @@ class OptimizedAudioProcessor:
self.main_loop,
)
self.current_phrase_audio = np.array([], dtype=np.float32)
self.final_transcription_pending = False # Reset the flag
except Exception as e:
logger.error(
f"Error in thread processing loop for {self.peer_name}: {e}"
@ -1380,6 +1387,10 @@ class OptimizedAudioProcessor:
if audio_array.ndim != 1:
raise ValueError("Expected mono audio as a 1D numpy array.")
# Reset the flag for final transcriptions to allow new processing
if is_final:
self.final_transcription_pending = False
transcription_start = time.time()
transcription_type = "final" if is_final else "streaming"
@ -2047,7 +2058,396 @@ def _resample_audio(
# Public API functions
def agent_info() -> Dict[str, str]:
return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true"}
return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true", "configurable": "true"}
def get_config_schema() -> Dict[str, Any]:
"""Get the configuration schema for the Whisper bot"""
return {
"bot_name": AGENT_NAME,
"version": "1.0",
"parameters": [
{
"name": "model_id",
"type": "select",
"label": "Whisper Model",
"description": "The Whisper model to use for transcription",
"default_value": _model_id,
"required": True,
"options": [
{"value": "distil-whisper/distil-large-v2", "label": "Distil-Whisper Large v2 (Fast)"},
{"value": "distil-whisper/distil-medium.en", "label": "Distil-Whisper Medium EN"},
{"value": "distil-whisper/distil-small.en", "label": "Distil-Whisper Small EN"},
{"value": "openai/whisper-large-v3", "label": "OpenAI Whisper Large v3"},
{"value": "openai/whisper-large-v2", "label": "OpenAI Whisper Large v2"},
{"value": "openai/whisper-large", "label": "OpenAI Whisper Large"},
{"value": "openai/whisper-medium", "label": "OpenAI Whisper Medium"},
{"value": "openai/whisper-small", "label": "OpenAI Whisper Small"},
{"value": "openai/whisper-base", "label": "OpenAI Whisper Base"},
{"value": "openai/whisper-tiny", "label": "OpenAI Whisper Tiny"}
]
},
{
"name": "device",
"type": "select",
"label": "Inference Device",
"description": "The device to run inference on",
"default_value": _device,
"required": True,
"options": [
{"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"},
{"value": "GPU", "label": "GPU"},
{"value": "CPU", "label": "CPU"}
]
},
{
"name": "enable_quantization",
"type": "boolean",
"label": "Enable Quantization",
"description": "Enable INT8 quantization for faster inference",
"default_value": _ov_config.enable_quantization,
"required": False
},
{
"name": "throughput_streams",
"type": "range",
"label": "Throughput Streams",
"description": "Number of parallel inference streams",
"default_value": _ov_config.throughput_streams,
"required": False,
"min_value": 1,
"max_value": 8,
"step": 1
},
{
"name": "max_threads",
"type": "range",
"label": "Max CPU Threads",
"description": "Maximum number of CPU threads for inference",
"default_value": _ov_config.max_threads,
"required": False,
"min_value": 1,
"max_value": 16,
"step": 1
},
{
"name": "sample_rate",
"type": "number",
"label": "Sample Rate",
"description": "Audio sample rate in Hz",
"default_value": SAMPLE_RATE,
"required": True,
"min_value": 8000,
"max_value": 48000
},
{
"name": "chunk_duration_ms",
"type": "range",
"label": "Chunk Duration (ms)",
"description": "Duration of audio chunks in milliseconds",
"default_value": CHUNK_DURATION_MS,
"required": False,
"min_value": 50,
"max_value": 500,
"step": 10
},
{
"name": "vad_threshold",
"type": "range",
"label": "VAD Threshold",
"description": "Voice activity detection threshold",
"default_value": VAD_THRESHOLD,
"required": False,
"min_value": 0.001,
"max_value": 0.1,
"step": 0.001
},
{
"name": "max_silence_frames",
"type": "range",
"label": "Max Silence Frames",
"description": "Maximum frames of silence before stopping",
"default_value": MAX_SILENCE_FRAMES,
"required": False,
"min_value": 10,
"max_value": 100,
"step": 5
},
{
"name": "max_trailing_silence_frames",
"type": "range",
"label": "Max Trailing Silence Frames",
"description": "Maximum trailing silence frames to include",
"default_value": MAX_TRAILING_SILENCE_FRAMES,
"required": False,
"min_value": 1,
"max_value": 20,
"step": 1
},
{
"name": "vad_energy_threshold",
"type": "range",
"label": "VAD Energy Threshold",
"description": "Energy threshold for voice activity detection",
"default_value": 0.005,
"required": False,
"min_value": 0.001,
"max_value": 0.05,
"step": 0.001
},
{
"name": "vad_zcr_min",
"type": "range",
"label": "VAD ZCR Min",
"description": "Minimum zero-crossing rate for speech",
"default_value": 0.02,
"required": False,
"min_value": 0.01,
"max_value": 0.5,
"step": 0.01
},
{
"name": "vad_zcr_max",
"type": "range",
"label": "VAD ZCR Max",
"description": "Maximum zero-crossing rate for speech",
"default_value": 0.8,
"required": False,
"min_value": 0.1,
"max_value": 1.0,
"step": 0.05
},
{
"name": "vad_spectral_centroid_min",
"type": "range",
"label": "VAD Spectral Centroid Min",
"description": "Minimum spectral centroid for speech",
"default_value": 200,
"required": False,
"min_value": 50,
"max_value": 1000,
"step": 50
},
{
"name": "vad_spectral_centroid_max",
"type": "range",
"label": "VAD Spectral Centroid Max",
"description": "Maximum spectral centroid for speech",
"default_value": 4000,
"required": False,
"min_value": 1000,
"max_value": 8000,
"step": 500
},
{
"name": "vad_spectral_rolloff_threshold",
"type": "range",
"label": "VAD Spectral Rolloff Threshold",
"description": "Spectral rolloff threshold for speech detection",
"default_value": 3000,
"required": False,
"min_value": 1000,
"max_value": 10000,
"step": 500
},
{
"name": "vad_minimum_duration",
"type": "range",
"label": "VAD Minimum Duration",
"description": "Minimum duration for speech segments",
"default_value": 0.2,
"required": False,
"min_value": 0.1,
"max_value": 1.0,
"step": 0.1
},
{
"name": "vad_max_history",
"type": "range",
"label": "VAD Max History",
"description": "Maximum history frames for temporal consistency",
"default_value": 8,
"required": False,
"min_value": 4,
"max_value": 20,
"step": 1
},
{
"name": "vad_noise_floor_energy",
"type": "range",
"label": "VAD Noise Floor Energy",
"description": "Initial noise floor energy level",
"default_value": 0.001,
"required": False,
"min_value": 0.0001,
"max_value": 0.01,
"step": 0.0001
},
{
"name": "vad_adaptation_rate",
"type": "range",
"label": "VAD Adaptation Rate",
"description": "Rate of noise floor adaptation",
"default_value": 0.05,
"required": False,
"min_value": 0.01,
"max_value": 0.2,
"step": 0.01
},
{
"name": "vad_harmonic_threshold",
"type": "range",
"label": "VAD Harmonic Threshold",
"description": "Threshold for harmonic content detection",
"default_value": 0.15,
"required": False,
"min_value": 0.05,
"max_value": 0.5,
"step": 0.05
}
],
"categories": [
{"Model Settings": ["model_id", "device", "enable_quantization"]},
{"Performance Settings": ["throughput_streams", "max_threads"]},
{"Audio Settings": ["sample_rate", "chunk_duration_ms"]},
{"Voice Activity Detection": ["vad_threshold", "max_silence_frames", "max_trailing_silence_frames", "vad_energy_threshold", "vad_zcr_min", "vad_zcr_max", "vad_spectral_centroid_min", "vad_spectral_centroid_max", "vad_spectral_rolloff_threshold", "vad_minimum_duration", "vad_max_history", "vad_noise_floor_energy", "vad_adaptation_rate", "vad_harmonic_threshold"]}
]
}
def handle_config_update(lobby_id: str, config_values: Dict[str, Any]) -> bool:
"""Handle configuration update for a specific lobby"""
global _model_id, _device, _ov_config, SAMPLE_RATE, CHUNK_DURATION_MS, VAD_THRESHOLD
global MAX_SILENCE_FRAMES, MAX_TRAILING_SILENCE_FRAMES
try:
logger.info(f"Updating Whisper config for lobby {lobby_id}: {config_values}")
config_applied = False
# Update model configuration
if "model_id" in config_values:
new_model_id = config_values["model_id"]
if new_model_id in [model for models in model_ids.values() for model in models]:
_model_id = new_model_id
config_applied = True
logger.info(f"Updated model_id to: {_model_id}")
else:
logger.warning(f"Invalid model_id: {new_model_id}")
# Update device configuration
if "device" in config_values:
new_device = config_values["device"]
available_devices = [d["name"] for d in get_available_devices()]
if new_device in available_devices or new_device in ["CPU", "GPU", "GPU.1"]:
_device = new_device
_ov_config.device = new_device
config_applied = True
logger.info(f"Updated device to: {_device}")
else:
logger.warning(f"Invalid device: {new_device}, available: {available_devices}")
# Update OpenVINO configuration
if "enable_quantization" in config_values:
_ov_config.enable_quantization = bool(config_values["enable_quantization"])
config_applied = True
logger.info(f"Updated quantization to: {_ov_config.enable_quantization}")
if "throughput_streams" in config_values:
streams = int(config_values["throughput_streams"])
if 1 <= streams <= 8:
_ov_config.throughput_streams = streams
config_applied = True
logger.info(f"Updated throughput_streams to: {_ov_config.throughput_streams}")
if "max_threads" in config_values:
threads = int(config_values["max_threads"])
if 1 <= threads <= 16:
_ov_config.max_threads = threads
config_applied = True
logger.info(f"Updated max_threads to: {_ov_config.max_threads}")
# Update audio processing parameters
if "sample_rate" in config_values:
rate = int(config_values["sample_rate"])
if 8000 <= rate <= 48000:
SAMPLE_RATE = rate
config_applied = True
logger.info(f"Updated sample_rate to: {SAMPLE_RATE}")
if "chunk_duration_ms" in config_values:
duration = int(config_values["chunk_duration_ms"])
if 50 <= duration <= 500:
CHUNK_DURATION_MS = duration
config_applied = True
logger.info(f"Updated chunk_duration_ms to: {CHUNK_DURATION_MS}")
if "vad_threshold" in config_values:
threshold = float(config_values["vad_threshold"])
if 0.001 <= threshold <= 0.1:
VAD_THRESHOLD = threshold
config_applied = True
logger.info(f"Updated vad_threshold to: {VAD_THRESHOLD}")
if "max_silence_frames" in config_values:
frames = int(config_values["max_silence_frames"])
if 10 <= frames <= 100:
MAX_SILENCE_FRAMES = frames
config_applied = True
logger.info(f"Updated max_silence_frames to: {MAX_SILENCE_FRAMES}")
if "max_trailing_silence_frames" in config_values:
frames = int(config_values["max_trailing_silence_frames"])
if 1 <= frames <= 20:
MAX_TRAILING_SILENCE_FRAMES = frames
config_applied = True
logger.info(f"Updated max_trailing_silence_frames to: {MAX_TRAILING_SILENCE_FRAMES}")
# Update VAD configuration (this would require updating existing processors)
vad_updates = {}
if "vad_energy_threshold" in config_values:
vad_updates["energy_threshold"] = float(config_values["vad_energy_threshold"])
if "vad_zcr_min" in config_values:
vad_updates["zcr_min"] = float(config_values["vad_zcr_min"])
if "vad_zcr_max" in config_values:
vad_updates["zcr_max"] = float(config_values["vad_zcr_max"])
if "vad_spectral_centroid_min" in config_values:
vad_updates["spectral_centroid_min"] = float(config_values["vad_spectral_centroid_min"])
if "vad_spectral_centroid_max" in config_values:
vad_updates["spectral_centroid_max"] = float(config_values["vad_spectral_centroid_max"])
if "vad_spectral_rolloff_threshold" in config_values:
vad_updates["spectral_rolloff_threshold"] = float(config_values["vad_spectral_rolloff_threshold"])
if "vad_minimum_duration" in config_values:
vad_updates["minimum_duration"] = float(config_values["vad_minimum_duration"])
if "vad_max_history" in config_values:
vad_updates["max_history"] = int(config_values["vad_max_history"])
if "vad_noise_floor_energy" in config_values:
vad_updates["noise_floor_energy"] = float(config_values["vad_noise_floor_energy"])
if "vad_adaptation_rate" in config_values:
vad_updates["adaptation_rate"] = float(config_values["vad_adaptation_rate"])
if "vad_harmonic_threshold" in config_values:
vad_updates["harmonic_threshold"] = float(config_values["vad_harmonic_threshold"])
if vad_updates:
# Update VAD_CONFIG global
VAD_CONFIG.update(vad_updates)
config_applied = True
logger.info(f"Updated VAD config: {vad_updates}")
# Note: Existing processors would need to be recreated to pick up VAD changes
# For now, we'll log that a restart may be needed
logger.info("VAD configuration updated - existing processors may need restart to take effect")
if config_applied:
logger.info(f"Configuration update completed for lobby {lobby_id}")
else:
logger.warning(f"No valid configuration changes applied for lobby {lobby_id}")
return config_applied
except Exception as e:
logger.error(f"Failed to apply Whisper config update: {e}")
return False
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: