#!/usr/bin/env python3 """ VibeVoice Text-to-Speech Bot for Voicebot Framework Integrates Microsoft's VibeVoice TTS with the voicebot framework. Watches for chat messages and converts them to speech with text display. """ import threading import queue import time import numpy as np import cv2 import math from typing import Dict, Optional, Any, Tuple, Callable, Awaitable, Union from av.audio.frame import AudioFrame from av import VideoFrame from aiortc import MediaStreamTrack import fractions import torch import librosa import os from shared.logger import logger from shared.models import ChatMessageModel # Implement a local WaveformVideoTrack-like helper to hold shared waveform buffers # and lightweight speech status per session. This avoids depending on bots.whisper class WaveformVideoTrack: """Lightweight shared storage for waveform visualization and speech status. This class is not itself a MediaStreamTrack; it's used as a shared in-memory store that video tracks in this file will read from to render waveforms and status overlays. """ # session_name -> np.ndarray(float32) containing recent audio samples (mono) buffer: Dict[str, np.ndarray] = {} # session_name -> dict with status flags (is_speech, energy, is_processing, is_playing, etc.) speech_status: Dict[str, Dict[str, Any]] = {} # session_name -> sample_rate used for that buffer sample_rates: Dict[str, int] = {} # Proxy wrapper for AudioStreamer to log put() calls and basic stats without # modifying upstream VibeVoice internals. We'll wrap any created AudioStreamer # with this to capture whether model.generate() actually calls put(). class ProxyAudioStreamer: def __init__(self, real_streamer, session_name: Optional[str] = None): self._real = real_streamer self.session_name = session_name or "unknown" self.put_calls = 0 self.total_samples = 0 def put(self, audio_chunk, *args, **kwargs): # Try to measure number of samples in the chunk for diagnostics try: if torch.is_tensor(audio_chunk): length = int(audio_chunk.numel()) else: arr = np.array(audio_chunk) length = int(arr.size) except Exception: length = -1 try: # Inspect possible sample_indices positional argument for diagnostics si_info = None if len(args) >= 1: try: si = args[0] if torch.is_tensor(si): si_info = f"tensor(shape={tuple(si.shape)}, min={int(torch.min(si).item())}, max={int(torch.max(si).item())}, unique={int(len(torch.unique(si)))} )" else: arrsi = np.array(si) si_info = f"array(shape={arrsi.shape}, min={int(arrsi.min()) if arrsi.size>0 else -1}, max={int(arrsi.max()) if arrsi.size>0 else -1}, unique={int(len(np.unique(arrsi))) if arrsi.size>0 else 0})" except Exception: si_info = str(type(args[0])) logger.info(f"VibeVoice audio: ProxyAudioStreamer.put called for session {self.session_name} - samples={length} sample_indices={si_info}") except Exception: pass self.put_calls += 1 if length > 0: self.total_samples += length return getattr(self._real, 'put')(audio_chunk, *args, **kwargs) def get_stream(self, *args, **kwargs): return getattr(self._real, 'get_stream')(*args, **kwargs) def end(self, *args, **kwargs): return getattr(self._real, 'end')(*args, **kwargs) def __getattr__(self, name): return getattr(self._real, name) # Import VibeVoice components try: from vibevoice import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor from vibevoice.modular.streamer import AudioStreamer except Exception as e: logger.warning("VibeVoice not available. Install with: git clone https://github.com/microsoft/VibeVoice.git && cd VibeVoice && pip install -e .") raise e class MediaClock: """Shared clock for media synchronization.""" def __init__(self): self.t0 = time.perf_counter() def now(self) -> float: return time.perf_counter() - self.t0 class VibeVoiceTTS: """Minimal VibeVoice Text-to-Speech wrapper.""" def __init__(self, device: str = "cpu", inference_steps: int = 10, config: Optional[Dict[str, Any]] = None): self.device = device self.inference_steps = inference_steps self.config = config or {} self.model = None self.processor = None self.sample_rate = 24000 # VibeVoice uses 24kHz self.is_initialized = False self.voice_presets = {} self.available_voices = {} try: self._initialize_model() self._setup_voice_presets() except Exception as e: logger.error(f"Failed to initialize VibeVoice: {e}") def _initialize_model(self): """Initialize the VibeVoice model with robust device handling.""" try: logger.info("Loading VibeVoice model...") # Normalize potential 'mpx' if self.device.lower() == "mpx": logger.info("Note: device 'mpx' detected, treating it as 'mps'.") self.device = "mps" if self.device == "mps" and not torch.backends.mps.is_available(): logger.warning("Warning: MPS not available. Falling back to CPU.") self.device = "cpu" logger.info(f"Using device: {self.device}") # Load processor self.processor = VibeVoiceProcessor.from_pretrained("vibevoice/VibeVoice-1.5B") # Decide dtype & attention if self.device == "mps": load_dtype = torch.float32 attn_impl_primary = "sdpa" elif self.device == "cuda": load_dtype = torch.bfloat16 attn_impl_primary = "flash_attention_2" else: load_dtype = torch.float32 attn_impl_primary = "sdpa" logger.info(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}") # Load model try: if self.device == "mps": self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( "vibevoice/VibeVoice-1.5B", torch_dtype=load_dtype, attn_implementation=attn_impl_primary, device_map=None, ) self.model.to("mps") elif self.device == "cuda": self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( "vibevoice/VibeVoice-1.5B", torch_dtype=load_dtype, device_map="cuda", attn_implementation=attn_impl_primary, ) else: self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( "vibevoice/VibeVoice-1.5B", torch_dtype=load_dtype, device_map="cpu", attn_implementation=attn_impl_primary, ) except Exception as e: if attn_impl_primary == 'flash_attention_2': logger.warning(f"Error with flash_attention_2: {e}") logger.info("Falling back to attention implementation: sdpa") fallback_attn = "sdpa" self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( "vibevoice/VibeVoice-1.5B", torch_dtype=load_dtype, device_map=(self.device if self.device in ("cuda", "cpu") else None), attn_implementation=fallback_attn, ) if self.device == "mps": self.model.to("mps") else: raise e self.model.eval() # Use SDE solver by default self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config( self.model.model.noise_scheduler.config, algorithm_type='sde-dpmsolver++', beta_schedule='squaredcos_cap_v2' ) self.model.set_ddpm_inference_steps(num_steps=self.inference_steps) if hasattr(self.model.model, 'language_model'): logger.info(f"Language model attention: {self.model.model.language_model.config._attn_implementation}") self.is_initialized = True logger.info("VibeVoice model loaded successfully!") except Exception as e: logger.error(f"Error loading VibeVoice model: {e}") raise def _setup_voice_presets(self): """Setup voice presets by scanning the voices directory.""" # Look for voices directory in multiple possible locations possible_voice_dirs = [ os.path.join(os.path.dirname(__file__), "voices"), # /voicebot/bots/voices/ os.path.join(os.path.dirname(__file__), "..", "VibeVoice", "demo", "voices"), # /voicebot/VibeVoice/demo/voices/ "/voicebot/VibeVoice/demo/voices", # Absolute path ] voices_dir = None for possible_dir in possible_voice_dirs: if os.path.exists(possible_dir): voices_dir = possible_dir break # Check if voices directory exists if not voices_dir: logger.warning(f"Warning: Voices directory not found in any of: {possible_voice_dirs}") self.voice_presets = {} self.available_voices = {} self.speaker_mapping = {} return # Scan for all WAV files in the voices directory self.voice_presets = {} # Get all supported audio files audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac') audio_files = [f for f in os.listdir(voices_dir) if f.lower().endswith(audio_extensions) and os.path.isfile(os.path.join(voices_dir, f))] # Create dictionary with filename (without extension) as key for audio_file in audio_files: # Remove extension to get the name name = os.path.splitext(audio_file)[0] # Create full path full_path = os.path.join(voices_dir, audio_file) self.voice_presets[name] = full_path # Sort the voice presets alphabetically by name for better UI self.voice_presets = dict(sorted(self.voice_presets.items())) # Filter out voices that don't exist (this is now redundant but kept for safety) self.available_voices = { name: path for name, path in self.voice_presets.items() if os.path.exists(path) } # Map speaker numbers (1, 2, 3, 4) to available voice files self.speaker_mapping = {} available_voice_names = list(self.available_voices.keys()) for i in range(1, 5): # Support speakers 1-4 if i <= len(available_voice_names): voice_name = available_voice_names[i-1] # 0-indexed self.speaker_mapping[str(i)] = voice_name logger.info(f"Mapped Speaker {i} to voice '{voice_name}'") else: logger.warning(f"No voice file available for Speaker {i}") if not self.available_voices: logger.warning("No voice presets found. Please add audio files to the voices directory.") else: logger.info(f"Found {len(self.available_voices)} voice files in {voices_dir}") logger.info(f"Available voices: {', '.join(self.available_voices.keys())}") logger.info(f"Speaker mapping: {self.speaker_mapping}") def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray: """Read and preprocess audio file.""" try: import soundfile as sf wav, sr = sf.read(audio_path) if len(wav.shape) > 1: wav = np.mean(wav, axis=1) if sr != target_sr: wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr) return wav except Exception as e: logger.error(f"Error reading audio {audio_path}: {e}") return np.array([]) def generate_speech(self, text: str, speaker: str = "1", cfg_scale: float = 1.3) -> Optional[np.ndarray]: """Generate speech using the AudioStreamer and return a single concatenated numpy array. This removes the old synchronous model.generate path and uses the streamer-based generation even for blocking calls. Returns None if generation isn't possible. """ # Must have model initialized and streamer available if not self.is_initialized: logger.error("VibeVoice TTS: Model not initialized - cannot generate speech synchronously") return None try: # Prepare formatted text and voice samples (same as demo) formatted_text = f"Speaker {speaker}: {text}" voice_samples = [] if speaker in self.speaker_mapping: voice_name = self.speaker_mapping[speaker] if voice_name in self.available_voices: audio_path = self.available_voices[voice_name] audio_data = self.read_audio(audio_path) if len(audio_data) > 0: voice_samples.append(audio_data) else: voice_samples.append([]) else: voice_samples.append([]) else: voice_samples.append([]) inputs = self.processor( # type: ignore text=[formatted_text], voice_samples=[voice_samples], padding=True, return_tensors="pt" ) # Move tensors to device target_device = self.device if self.device in ("cuda", "mps") else "cpu" for k, v in inputs.items(): if torch.is_tensor(v): inputs[k] = v.to(target_device) # Create streamer and run generation real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) with torch.no_grad(): try: self.model.generate( # type: ignore **inputs, max_new_tokens=None, cfg_scale=cfg_scale, tokenizer=self.processor.tokenizer, # type: ignore generation_config={'do_sample': False}, verbose=False, streamer=audio_streamer, ) finally: # ensure streamer end if model.generate returns try: audio_streamer.end() except Exception: pass # Collect streamed chunks collected = [] for audio_chunk in audio_streamer.get_stream(0): try: if torch.is_tensor(audio_chunk): if audio_chunk.dtype == torch.bfloat16: audio_chunk = audio_chunk.float() audio_np = audio_chunk.cpu().numpy().astype(np.float32) else: audio_np = np.array(audio_chunk, dtype=np.float32) if audio_np.ndim > 1: audio_np = audio_np.squeeze() collected.append(audio_np) except Exception as e: logger.error(f"VibeVoice TTS: Error collecting chunk: {e}") if not collected: logger.error("VibeVoice TTS: No audio chunks received from streamer") return None audio = np.concatenate(collected) # Mix with background noise if enabled noise_type = self.config.get('background_noise_type', 'none') noise_volume = self.config.get('background_noise_volume', 0.0) audio = self.mix_audio_with_background_noise(audio, noise_type, noise_volume) # Resample to 16kHz for compatibility with existing audio pipeline audio_16k = librosa.resample(audio, orig_sr=24000, target_sr=16000) return audio_16k.astype(np.float32) except Exception as e: logger.error(f"VibeVoice TTS: Error generating speech via streamer: {e}") return None def generate_background_noise(self, duration_seconds: float, noise_type: str = "white", volume: float = 0.01, sample_rate: Optional[int] = None) -> np.ndarray: """Generate background noise of specified type and duration.""" if sample_rate is None: sample_rate = self.sample_rate if noise_type == "none": return np.zeros(int(duration_seconds * sample_rate), dtype=np.float32) num_samples = int(duration_seconds * sample_rate) if noise_type == "white": # White noise - equal power across all frequencies noise = np.random.normal(0, 1, num_samples).astype(np.float32) elif noise_type == "pink": # Pink noise - 1/f frequency response (approximated) white = np.random.normal(0, 1, num_samples).astype(np.float32) # Simple pink noise approximation using IIR filter b = [0.049922035, -0.095993537, 0.050612699, -0.004408786] a = [1, -2.494956002, 2.017265875, -0.522189400] noise = np.zeros_like(white) for i in range(len(b), len(white)): noise[i] = b[0] * white[i] + b[1] * white[i-1] + b[2] * white[i-2] + b[3] * white[i-3] - a[1] * noise[i-1] - a[2] * noise[i-2] - a[3] * noise[i-3] elif noise_type == "brown": # Brown noise - 1/f² frequency response (integrated white noise) white = np.random.normal(0, 1, num_samples).astype(np.float32) noise = np.cumsum(white) # Normalize to prevent drift noise = (noise - np.mean(noise)) / np.std(noise) else: # Default to white noise noise = np.random.normal(0, 1, num_samples).astype(np.float32) # Apply volume noise *= volume return noise def mix_audio_with_background_noise(self, audio: np.ndarray, noise_type: str = "white", volume: float = 0.01) -> np.ndarray: """Mix generated audio with background noise.""" # Default to disabled when not present in config to avoid unexpected noise if not self.config.get('background_noise_enabled', False): return audio # Generate background noise for the duration of the audio using the TTS sample rate duration_seconds = len(audio) / self.sample_rate background_noise = self.generate_background_noise(duration_seconds, noise_type, volume, self.sample_rate) # Mix audio with background noise mixed_audio = audio + background_noise # Normalize to prevent clipping max_val = np.max(np.abs(mixed_audio)) if max_val > 1.0: mixed_audio /= max_val return mixed_audio class VibeVoiceVideoTrack(MediaStreamTrack): """Video track that displays text being spoken.""" kind = "video" def __init__(self, clock, config: Dict[str, Any], session_name: Optional[str] = None): super().__init__() self.clock = clock self.config = config # Keep session_name for looking up waveform buffers and status self.session_name = session_name or config.get('session_name') or f"VibeVoice:{int(time.time())}" self.width = config.get('width', 640) self.height = config.get('height', 480) self.fps = config.get('fps', 15) # Text display state self.current_text = "" self.text_queue = queue.Queue() self.display_start_time = 0 self.display_duration = 3.0 # seconds to display each text self.frame_count = 0 # Font settings self.font = cv2.FONT_HERSHEY_SIMPLEX self.font_scale = min(self.width, self.height) / 800 self.font_thickness = max(1, int(self.font_scale * 2)) def update_text(self, text: str): """Update the text to display.""" self.text_queue.put(text) logger.info(f"VibeVoice video: Queued text '{text}'") def update_config(self, config_updates: Dict[str, Any]) -> bool: """Update video configuration.""" try: self.config.update(config_updates) if 'width' in config_updates: self.width = config_updates['width'] if 'height' in config_updates: self.height = config_updates['height'] if 'fps' in config_updates: self.fps = config_updates['fps'] return True except Exception as e: logger.error(f"Error updating video config: {e}") return False async def next_timestamp(self) -> Tuple[int, float]: """Get next timestamp for video frame.""" pts = int(self.frame_count * (90000 / self.fps)) time_base = 1 / 90000 return pts, time_base async def recv(self) -> VideoFrame: """Generate video frame with current text.""" # Update current text if needed current_time = time.time() if (not self.current_text or current_time - self.display_start_time > self.display_duration): try: self.current_text = self.text_queue.get_nowait() self.display_start_time = current_time logger.info(f"VibeVoice video: Displaying '{self.current_text}'") except queue.Empty: self.current_text = "" # Create frame frame = np.zeros((self.height, self.width, 3), dtype=np.uint8) if self.current_text: # Add background cv2.rectangle(frame, (0, 0), (self.width, self.height), (0, 0, 0), -1) # Split text into lines if too long words = self.current_text.split() lines = [] current_line = "" max_chars_per_line = int(self.width / (self.font_scale * 20)) for word in words: if len(current_line + " " + word) <= max_chars_per_line: current_line += " " + word if current_line else word else: if current_line: lines.append(current_line) current_line = word if current_line: lines.append(current_line) # Draw text lines line_height = int(self.font_scale * 40) total_text_height = len(lines) * line_height start_y = (self.height - total_text_height) // 2 + line_height for i, line in enumerate(lines): text_size = cv2.getTextSize(line, self.font, self.font_scale, self.font_thickness)[0] text_x = (self.width - text_size[0]) // 2 text_y = start_y + i * line_height # Add text shadow cv2.putText(frame, line, (text_x + 2, text_y + 2), self.font, self.font_scale, (0, 0, 0), self.font_thickness + 1) # Add main text cv2.putText(frame, line, (text_x, text_y), self.font, self.font_scale, (255, 255, 255), self.font_thickness) else: # Default background when no text cv2.putText(frame, "VibeVoice TTS", (50, self.height // 2), self.font, self.font_scale * 2, (255, 255, 255), self.font_thickness) # Draw waveform and status overlays from shared WaveformVideoTrack buffers try: pname = self.session_name buf = WaveformVideoTrack.buffer.get(pname, None) status = WaveformVideoTrack.speech_status.get(pname, {}) # Draw small status box in top-left status_text = "Idle" if status.get('is_processing'): status_text = "Processing..." elif status.get('is_speech'): status_text = "Speaking" elif buf is not None and len(buf) > 0: # buffered seconds approx sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) buffered_sec = len(buf) / float(sr) if sr > 0 else 0.0 status_text = f"Buffered: {buffered_sec:.1f}s" box_w = int(self.width * 0.28) box_h = int(self.height * 0.12) cv2.rectangle(frame, (10, 10), (10 + box_w, 10 + box_h), (50, 50, 50), -1) cv2.putText(frame, status_text, (20, 10 + int(box_h/2)), self.font, self.font_scale, (200, 200, 200), self.font_thickness) # Draw small energy meter energy = status.get('energy', 0.0) meter_h = int(box_h * 0.4) meter_w = int(box_w * 0.6) mx = 20 my = 10 + box_h - 5 filled = int(min(1.0, energy * 50.0) * meter_w) cv2.rectangle(frame, (mx, my - meter_h), (mx + meter_w, my), (80, 80, 80), -1) cv2.rectangle(frame, (mx, my - meter_h), (mx + filled, my), (0, 200, 0), -1) # Draw waveform at bottom area if buf is not None and buf.size > 4: sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) # Use last N samples corresponding to width pixels samples_to_show = min(buf.size, max(1, int(sr * 5))) # show up to last 5s slice_buf = buf[-samples_to_show:] # Downsample to width points idx = (np.linspace(0, samples_to_show - 1, num=self.width)).astype(np.int32) waveform = slice_buf[idx] # Normalize waveform to -1..1 maxv = np.max(np.abs(waveform)) if waveform.size > 0 else 1.0 if maxv <= 0: maxv = 1.0 waveform = waveform / maxv # Map to pixel coordinates in bottom strip wf_h = int(self.height * 0.22) wf_y0 = self.height - wf_h - 10 pts = [] for i, v in enumerate(waveform): px = int(i * (self.width / len(waveform))) py = int(wf_y0 + (wf_h / 2) * (1 - v)) pts.append((px, py)) if len(pts) >= 2: cv2.polylines(frame, [np.array(pts, dtype=np.int32)], False, (100, 200, 255), 1) # Fill under curve for nicer look fill_pts = pts + [(self.width - 1, wf_y0 + wf_h), (0, wf_y0 + wf_h)] cv2.fillPoly(frame, [np.array(fill_pts, dtype=np.int32)], (30, 60, 80)) except Exception: # Non-critical rendering failure shouldn't break video pass self.frame_count += 1 return VideoFrame.from_ndarray(frame, format="bgr24") class VibeVoiceAudioTrack(MediaStreamTrack): """Audio track that plays TTS speech.""" kind = "audio" def __init__(self, clock, config: Dict[str, Any], tts_engine: VibeVoiceTTS, session_name: Optional[str] = None): super().__init__() self.clock = clock self.config = config self.tts = tts_engine self.sample_rate = config.get('sample_rate', 16000) self.samples_per_frame = config.get('samples_per_frame', 960) # 60ms at 16kHz # Audio playback state self.audio_queue = queue.Queue() self.current_audio = None self.audio_position = 0 self.is_speaking = False self.speaker = config.get('speaker', 'Alice') # Audio buffer for mixing multiple TTS segments self.audio_buffer = np.array([], dtype=np.float32) self.buffer_lock = threading.Lock() # Optional looping and debug options self.loop = config.get('loop', True) self.debug_save_wav = config.get('debug_save_wav', True) # Keep the last fully-generated audio to enable looping self.last_generated_audio = np.array([], dtype=np.float32) # Protect last_generated_audio updates self._last_gen_lock = threading.Lock() # Track total samples generated for proper PTS calculation self._samples_generated = 0 # Optional session name used to publish waveform data for visualization self.session_name = session_name or f"VibeVoice:{int(time.time())}" def update_config(self, config_updates: Dict[str, Any]) -> bool: """Update audio configuration.""" try: self.config.update(config_updates) if 'sample_rate' in config_updates: self.sample_rate = config_updates['sample_rate'] if 'samples_per_frame' in config_updates: self.samples_per_frame = config_updates['samples_per_frame'] if 'speaker' in config_updates: self.speaker = config_updates['speaker'] if 'loop' in config_updates: self.loop = bool(config_updates['loop']) logger.info(f"🔁 Looping {'enabled' if self.loop else 'disabled'} for session {self.session_name}") if 'debug_save_wav' in config_updates: self.debug_save_wav = bool(config_updates['debug_save_wav']) logger.info(f"🐞 Debug save wav {'enabled' if self.debug_save_wav else 'disabled'} for session {self.session_name}") # Log background noise configuration updates background_noise_updated = False if 'background_noise_enabled' in config_updates: logger.info(f"🎵 Background noise enabled: {config_updates['background_noise_enabled']}") background_noise_updated = True if 'background_noise_type' in config_updates: logger.info(f"🎵 Background noise type: {config_updates['background_noise_type']}") background_noise_updated = True if 'background_noise_volume' in config_updates: logger.info(f"🎵 Background noise volume: {config_updates['background_noise_volume']}") background_noise_updated = True if background_noise_updated: logger.info("🎵 Background noise configuration updated - changes will take effect on next audio frame") return True except Exception as e: logger.error(f"Error updating audio config: {e}") return False def speak_text(self, text: str, cfg_scale: Optional[float] = None): """Queue text for speech synthesis.""" if cfg_scale is None: cfg_scale = 1.3 # Default value logger.info(f"VibeVoice audio: Starting background TTS generation for '{text}' with cfg_scale={cfg_scale}") # Start TTS generation in a background thread import threading thread = threading.Thread( target=self._generate_tts_background, args=(text, self.speaker, cfg_scale), daemon=True ) thread.start() def _generate_tts_background(self, text: str, speaker: str, cfg_scale: float): """Generate TTS in background thread and add to audio buffer.""" try: logger.info(f"VibeVoice audio: Background TTS generation started for '{text}'") # Log some diagnostic info about the TTS engine state try: logger.info(f"VibeVoice audio: TTS engine initialized={getattr(self.tts, 'is_initialized', False)}, device={getattr(self.tts, 'device', None)}, tts_sample_rate={getattr(self.tts, 'sample_rate', None)}") # available_voices and speaker_mapping may be large; log summaries try: avv = getattr(self.tts, 'available_voices', {}) smap = getattr(self.tts, 'speaker_mapping', {}) logger.info(f"VibeVoice audio: available_voices={list(avv.keys())[:5]} (count={len(avv)}), speaker_mapping_count={len(smap)}") except Exception: pass except Exception: pass # Mark processing state for video overlay try: WaveformVideoTrack.speech_status[self.session_name] = WaveformVideoTrack.speech_status.get(self.session_name, {}) WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = True except Exception: pass # Require model and streamer to be available for streaming generation if not self.tts.is_initialized: logger.error("VibeVoice audio: Model or AudioStreamer not available - background generation disabled") return # Prepare formatted text and inputs (same expectations as generate_speech) formatted_text = f"Speaker {speaker}: {text}" voice_samples = [] if speaker in self.tts.speaker_mapping: voice_name = self.tts.speaker_mapping[speaker] if voice_name in self.tts.available_voices: audio_path = self.tts.available_voices[voice_name] audio_data = self.tts.read_audio(audio_path) if len(audio_data) > 0: voice_samples.append(audio_data) else: voice_samples.append([]) else: voice_samples.append([]) else: voice_samples.append([]) inputs = self.tts.processor( # type: ignore text=[formatted_text], voice_samples=[voice_samples], padding=True, return_tensors="pt" ) # Move tensors to device target_device = self.tts.device if self.tts.device in ("cuda", "mps") else "cpu" for k, v in inputs.items(): if torch.is_tensor(v): inputs[k] = v.to(target_device) # Log a summary of inputs for diagnostic purposes try: inp_summary = {} for k, v in inputs.items(): if torch.is_tensor(v): inp_summary[k] = f"tensor(shape={tuple(v.shape)}, dtype={v.dtype})" else: try: inp_summary[k] = f"{type(v).__name__}(len={len(v)})" except Exception: inp_summary[k] = type(v).__name__ logger.info(f"VibeVoice audio: Input summary for generation: {inp_summary}") except Exception: pass # Create audio streamer and start model.generate in a separate thread real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) def _run_generate(): try: logger.info(f"VibeVoice audio: model.generate starting for session {self.session_name}") with torch.no_grad(): self.tts.model.generate( # type: ignore **inputs, max_new_tokens=None, cfg_scale=cfg_scale, tokenizer=self.tts.processor.tokenizer, # type: ignore generation_config={'do_sample': False}, verbose=False, streamer=audio_streamer, ) except Exception as e: logger.error(f"VibeVoice audio: Error during model.generate: {e}") finally: # Ensure streamer is ended try: audio_streamer.end() except Exception: pass logger.info(f"VibeVoice audio: model.generate finished for session {self.session_name}") gen_thread = threading.Thread(target=_run_generate, daemon=True) gen_thread.start() # Consume chunks from streamer and append to audio buffer as they arrive generated_chunks = [] chunk_count = 0 total_samples_streamed = 0 logger.info(f"VibeVoice audio: Audio streamer started for session {self.session_name}") try: logger.info(f"VibeVoice audio: audio_streamer repr: {repr(audio_streamer)[:400]}") gs = None try: gs = audio_streamer.get_stream(0) logger.info(f"VibeVoice audio: get_stream returned object type: {type(gs)}") except Exception as _e: logger.error(f"VibeVoice audio: calling audio_streamer.get_stream raised: {_e}") gs = None except Exception: gs = None if gs is None: logger.warning(f"VibeVoice audio: audio_streamer.get_stream did not return a stream for session {self.session_name}") iterator = [] else: iterator = gs for audio_chunk in iterator: try: # Convert tensor to numpy if needed if torch.is_tensor(audio_chunk): if audio_chunk.dtype == torch.bfloat16: audio_chunk = audio_chunk.float() audio_np = audio_chunk.cpu().numpy().astype(np.float32) else: audio_np = np.array(audio_chunk, dtype=np.float32) # Squeeze to 1D if needed if audio_np.ndim > 1: audio_np = audio_np.squeeze() # Resample from model sampling rate (usually 24000) to track sample rate if hasattr(self.tts, 'sample_rate') and self.tts.sample_rate != self.sample_rate: try: audio_np = librosa.resample(audio_np, orig_sr=self.tts.sample_rate, target_sr=self.sample_rate) except Exception: # If resample fails, keep original chunk pass # Append to internal buffer with self.buffer_lock: if len(self.audio_buffer) == 0: self.audio_buffer = audio_np else: self.audio_buffer = np.concatenate([self.audio_buffer, audio_np]) # Also collect into generated_chunks for possible looping/debug save try: generated_chunks.append(audio_np.astype(np.float32)) except Exception: pass total_samples_streamed += len(audio_np) chunk_count += 1 # Log every few chunks to avoid log spam if chunk_count % 5 == 0: logger.info(f"VibeVoice audio: Streamed {total_samples_streamed} samples so far for session {self.session_name} (chunks={chunk_count})") else: logger.debug(f"VibeVoice audio: Streamed {len(audio_np)} samples to buffer (total buffer: {len(self.audio_buffer)})") # Also publish into the global waveform buffer used by WaveformVideoTrack try: if WaveformVideoTrack is not None: pname = self.session_name # Ensure buffer key exists if pname not in WaveformVideoTrack.buffer: WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) # Append to shared waveform buffer WaveformVideoTrack.buffer[pname] = np.concatenate([ WaveformVideoTrack.buffer[pname], audio_np.astype(np.float32) ]) # Ensure sample rate is set for this session WaveformVideoTrack.sample_rates[pname] = self.sample_rate # Limit buffer to last 10 seconds for this track max_samples = int(self.sample_rate * 10) if len(WaveformVideoTrack.buffer[pname]) > max_samples: WaveformVideoTrack.buffer[pname] = WaveformVideoTrack.buffer[pname][-max_samples:] # Update a lightweight speech_status for display energy = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2))) if audio_np.size > 0 else 0.0 # Approximate zero-crossing rate try: if audio_np.size > 1: zcr = float(np.mean(np.abs(np.diff(np.sign(audio_np)) ) > 0)) else: zcr = 0.0 except Exception: zcr = 0.0 is_speech = energy > 0.005 WaveformVideoTrack.speech_status[pname] = { 'is_speech': bool(is_speech), 'energy': float(energy), 'zcr': float(zcr), 'centroid': 0.0, 'rolloff': 0.0, 'flux': 0.0, 'harmonicity': 0.0, 'noise_floor_energy': 0.0, 'adaptive_threshold': 0.0, 'energy_check': bool(energy > 0.002), 'zcr_check': bool(zcr > 0.01), 'spectral_check': False, 'harmonic_check': False, 'temporal_consistency': True, 'is_processing': True, 'is_playing': False, } except Exception: # Non-critical - don't break TTS on visualization failures pass except Exception as e: logger.error(f"VibeVoice audio: Error processing audio chunk from streamer: {e}") # Ensure generation thread finishes gen_thread.join(timeout=5.0) # If generation thread is still alive after join, log a warning if gen_thread.is_alive(): logger.warning(f"VibeVoice audio: generation thread still alive after join for session {self.session_name}") # When generation completes, store last_generated_audio for looping and optionally save debug WAV logger.info(f"VibeVoice audio: Generation completed for session {self.session_name}. total_samples_streamed={total_samples_streamed}, chunks={chunk_count}") # If no chunks were received, emit a diagnostic warning with some state to help debugging if chunk_count == 0: try: # Provide more diagnostic info: inputs summary and streamer introspection try: sdi = { 'repr': repr(audio_streamer)[:400], 'dir': [n for n in dir(audio_streamer) if not n.startswith('_')][:40] } except Exception: sdi = {'repr': 'unavailable', 'dir': []} try: logger.warning( f"VibeVoice audio: No audio chunks were streamed for session {self.session_name}. " f"is_initialized={getattr(self.tts, 'is_initialized', False)}, model_present={hasattr(self.tts, 'model')} ; " f"audio_streamer={sdi}" ) except Exception: logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (diagnostics failed)") except Exception: logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (additional diagnostics unavailable)") # Fallback: attempt a synchronous generation that returns a full numpy audio array try: logger.info(f"VibeVoice audio: Attempting synchronous fallback generation for session {self.session_name}") fallback_audio = None try: fallback_audio = self.tts.generate_speech(text, speaker, cfg_scale=cfg_scale) except Exception as e: logger.error(f"VibeVoice audio: synchronous fallback generation raised: {e}") if fallback_audio is not None and getattr(fallback_audio, 'size', 0) > 0: try: fa = fallback_audio.astype(np.float32) except Exception: fa = np.array(fallback_audio, dtype=np.float32) # Resample if needed try: tts_sr = getattr(self.tts, 'sample_rate', 24000) if tts_sr != self.sample_rate: fa = librosa.resample(fa, orig_sr=tts_sr, target_sr=self.sample_rate) except Exception: pass # Append into internal buffer and last_generated_audio with self.buffer_lock: if len(self.audio_buffer) == 0: self.audio_buffer = fa else: self.audio_buffer = np.concatenate([self.audio_buffer, fa]) with self._last_gen_lock: self.last_generated_audio = fa.copy() # Publish to waveform buffer try: pname = self.session_name if pname not in WaveformVideoTrack.buffer: WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) WaveformVideoTrack.buffer[pname] = np.concatenate([WaveformVideoTrack.buffer[pname], fa.astype(np.float32)]) WaveformVideoTrack.sample_rates[pname] = self.sample_rate except Exception: pass # Optionally save debug wav if self.debug_save_wav: try: try: import soundfile as sf fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" sf.write(fname, fa, samplerate=self.sample_rate) logger.info(f"🐞 Saved fallback generated wav to {fname} (soundfile)") except Exception: try: from scipy.io import wavfile fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" wavfile.write(fname, self.sample_rate, (fa * 32767).astype('int16')) logger.info(f"🐞 Saved fallback generated wav to {fname} (scipy)") except Exception: try: import wave fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" with wave.open(fname, 'wb') as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(self.sample_rate) int_data = (fa * 32767).astype('int16') wf.writeframes(int_data.tobytes()) logger.info(f"🐞 Saved fallback generated wav to {fname} (wave)") except Exception as e: logger.error(f"Error saving fallback debug wav (all methods failed): {e}") except Exception as e: logger.error(f"Error saving fallback debug wav: {e}") logger.info(f"VibeVoice audio: Fallback synchronous generation successful for session {self.session_name} (samples={len(fa)})") else: logger.warning(f"VibeVoice audio: Fallback synchronous generation produced no audio for session {self.session_name}") except Exception as e: logger.error(f"VibeVoice audio: Exception during synchronous fallback generation: {e}") try: if len(generated_chunks) > 0: try: all_gen = np.concatenate(generated_chunks).astype(np.float32) except Exception: all_gen = np.array([], dtype=np.float32) with self._last_gen_lock: self.last_generated_audio = all_gen.copy() # Optionally save to disk for debugging if self.debug_save_wav: try: try: import soundfile as sf fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" sf.write(fname, all_gen, samplerate=self.sample_rate) logger.info(f"🐞 Saved generated wav to {fname} (soundfile)") except Exception: # Try scipy fallback try: from scipy.io import wavfile fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" # scipy expects int16 wavfile.write(fname, self.sample_rate, (all_gen * 32767).astype('int16')) logger.info(f"🐞 Saved generated wav to {fname} (scipy)") except Exception: # Ultimate fallback: write raw wave via wave module try: import wave fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" with wave.open(fname, 'wb') as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(self.sample_rate) int_data = (all_gen * 32767).astype('int16') wf.writeframes(int_data.tobytes()) logger.info(f"🐞 Saved generated wav to {fname} (wave)") except Exception as e: logger.error(f"Error saving debug wav (all methods failed): {e}") except Exception as e: logger.error(f"Error saving debug wav: {e}") except Exception: pass # Clear processing flag when generation completes try: if self.session_name in WaveformVideoTrack.speech_status: WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False except Exception: pass except Exception as e: logger.error(f"VibeVoice audio: Error in background TTS generation: {e}") def _get_samples_from_buffer(self, num_samples: int) -> np.ndarray: """Get samples from audio buffer, removing them from buffer.""" # Try to refill from last_generated_audio if looping is enabled with self._last_gen_lock: last_gen = self.last_generated_audio.copy() if getattr(self, 'last_generated_audio', None) is not None else np.array([], dtype=np.float32) with self.buffer_lock: if len(self.audio_buffer) == 0: # If we're configured to loop and have a generated sample, refill the buffer if getattr(self, 'loop', False) and last_gen.size > 0: try: # Repeat last_gen as needed to reach at least num_samples repeats = int(math.ceil(float(num_samples) / float(len(last_gen)))) if len(last_gen) > 0 else 1 refill = np.tile(last_gen, repeats) self.audio_buffer = refill.astype(np.float32) logger.debug(f"VibeVoice audio: Refilled audio_buffer from last_generated_audio (len={len(last_gen)}) repeats={repeats}") except Exception: # Fallback to silence on any failure self.audio_buffer = np.zeros(num_samples, dtype=np.float32) else: return np.zeros(num_samples, dtype=np.float32) if len(self.audio_buffer) >= num_samples: samples = self.audio_buffer[:num_samples] self.audio_buffer = self.audio_buffer[num_samples:] return samples else: # Return remaining samples and pad with zeros samples = self.audio_buffer padding = np.zeros(num_samples - len(self.audio_buffer), dtype=np.float32) self.audio_buffer = np.array([], dtype=np.float32) return np.concatenate([samples, padding]) async def next_timestamp(self) -> Tuple[int, float]: """Get next timestamp for audio frame.""" pts = self._samples_generated time_base = 1 / self.sample_rate return pts, time_base async def recv(self) -> AudioFrame: """Generate audio frame with TTS speech from buffer.""" # Get samples from buffer samples = self._get_samples_from_buffer(self.samples_per_frame) # If no TTS audio available, generate background noise if np.all(samples == 0): # Default to disabled when not present in config to avoid unexpected noise if self.config.get('background_noise_enabled', False): noise_type = self.config.get('background_noise_type', 'white') noise_volume = self.config.get('background_noise_volume', 0.01) # Generate noise for this frame duration frame_duration = self.samples_per_frame / self.sample_rate logger.debug(f"🎵 Generating background noise: type={noise_type}, volume={noise_volume}, duration={frame_duration:.3f}s") background_noise = self.tts.generate_background_noise(frame_duration, noise_type, noise_volume, self.sample_rate) logger.debug(f"🎵 Generated background noise: {len(background_noise)} samples") samples = background_noise else: # Generate silence if background noise is disabled logger.debug("🎵 Background noise disabled - generating silence") samples = np.zeros(self.samples_per_frame, dtype=np.float32) # Convert to 16-bit PCM # Update shared speech_status for visualization: energy + playing flag try: energy = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2))) if samples.size > 0 else 0.0 pname = self.session_name st = WaveformVideoTrack.speech_status.get(pname, {}) st['energy'] = float(energy) # Consider playing when energy above small threshold st['is_playing'] = bool(energy > 0.001) st['is_speech'] = bool(energy > 0.003) WaveformVideoTrack.speech_status[pname] = st except Exception: pass samples_int16 = (samples * 32767).astype(np.int16) # Create stereo audio (duplicate mono channel) left = samples_int16 right = samples_int16.copy() stereo = np.empty(self.samples_per_frame * 2, dtype=np.int16) stereo[0::2] = left stereo[1::2] = right # Create audio frame frame = AudioFrame.from_ndarray(stereo.reshape(1, -1), format="s16", layout="stereo") frame.sample_rate = self.sample_rate frame.pts = self._samples_generated frame.time_base = fractions.Fraction(1, self.sample_rate) # Increment sample counter self._samples_generated += self.samples_per_frame return frame class VibeVoiceTTSBot: """VibeVoice Text-to-Speech Bot for voicebot framework.""" def __init__(self, session_name: str, config: Optional[Dict[str, Any]] = None): self.session_name = session_name self.config = config or {} # Initialize TTS engine with enhanced parameters device = self.config.get('device', 'cpu') inference_steps = self.config.get('inference_steps', 10) self.tts_engine = VibeVoiceTTS(device=device, inference_steps=inference_steps, config=self.config) # Store generation parameters self.cfg_scale = self.config.get('cfg_scale', 1.3) self.speaker = self.config.get('speaker', '1') # Initialize media components self.media_clock = MediaClock() # Pass session name into video track so it can show per-session waveform/status self.video_track = VibeVoiceVideoTrack(self.media_clock, self.config, session_name=session_name) self.audio_track = VibeVoiceAudioTrack(self.media_clock, self.config, self.tts_engine, session_name=session_name) # Initialize shared waveform store sample rate and empty buffer/status try: WaveformVideoTrack.sample_rates[session_name] = self.config.get('sample_rate', 16000) if session_name not in WaveformVideoTrack.buffer: WaveformVideoTrack.buffer[session_name] = np.array([], dtype=np.float32) if session_name not in WaveformVideoTrack.speech_status: WaveformVideoTrack.speech_status[session_name] = {'is_speech': False, 'energy': 0.0, 'is_processing': False, 'is_playing': False} except Exception: pass # Apply initial configuration values to ensure defaults from schema/config provider try: self.update_config(self.config) except Exception: # Don't let config application stop initialization pass logger.info(f"VibeVoice bot initialized for session {session_name} with cfg_scale={self.cfg_scale}, speaker={self.speaker}") def get_tracks(self) -> Dict[str, MediaStreamTrack]: """Get video and audio tracks.""" return { "video": self.video_track, "audio": self.audio_track } def handle_chat_message(self, message: ChatMessageModel): """Handle incoming chat messages by converting them to speech.""" try: text = message.message.strip() if text: logger.info(f"VibeVoice bot received chat: '{text}' from {message.sender_name}") # Queue text for both video display and audio speech self.video_track.update_text(text) self.audio_track.speak_text(text, self.cfg_scale) except Exception as e: logger.error(f"Error handling chat message in VibeVoice bot: {e}") def update_config(self, config_updates: Dict[str, Any]) -> bool: """Update bot configuration.""" try: self.config.update(config_updates) # Update TTS-specific parameters if 'cfg_scale' in config_updates: self.cfg_scale = config_updates['cfg_scale'] if 'speaker' in config_updates: self.speaker = config_updates['speaker'] # Update tracks video_success = self.video_track.update_config(config_updates) audio_success = self.audio_track.update_config(config_updates) if video_success and audio_success: logger.info(f"VibeVoice bot configuration updated: {config_updates}") return True else: logger.warning("Partial configuration update failure in VibeVoice bot") return False except Exception as e: logger.error(f"Error updating VibeVoice bot configuration: {e}") return False # Global bot instance registry _vibevoice_bots: Dict[str, VibeVoiceTTSBot] = {} def create_vibevoice_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]: """ Create VibeVoice TTS bot tracks. Args: session_name: Name for the session config: Configuration dictionary with options: - width: video width (default 640) - height: video height (default 480) - fps: frames per second (default 15) - sample_rate: audio sample rate (default 16000) - samples_per_frame: audio samples per frame (default 960) - speaker: TTS speaker name (default '1') - device: device for TTS ('cpu', 'cuda', 'mps') - cfg_scale: CFG scale for generation (default 1.3) - inference_steps: Number of inference steps (default 10) Returns: Dictionary containing 'video' and 'audio' tracks """ if config is None: config = {} # Set defaults default_config = { 'width': 640, 'height': 480, 'fps': 15, 'sample_rate': 16000, 'samples_per_frame': 960, 'speaker': '1', 'device': 'cpu', 'cfg_scale': 1.3, 'inference_steps': 10, # Explicit background noise defaults - disabled by default 'background_noise_enabled': False, 'background_noise_type': 'none', 'background_noise_volume': 0.0, } default_config.update(config) # Create bot instance bot = VibeVoiceTTSBot(session_name, default_config) _vibevoice_bots[session_name] = bot logger.info(f"Created VibeVoice bot tracks for {session_name}") return bot.get_tracks() def handle_config_update(session_name: str, config_values: Dict[str, Any]) -> bool: """ Handle runtime configuration updates for VibeVoice bot. Args: session_name: Name of the session/bot instance config_values: Dictionary of configuration values to update Returns: bool: True if update was successful, False otherwise """ try: if session_name in _vibevoice_bots: return _vibevoice_bots[session_name].update_config(config_values) else: logger.warning(f"No VibeVoice bot found for session {session_name}") return False except Exception as e: logger.error(f"Error updating VibeVoice bot configuration: {e}") return False async def handle_chat_message( chat_message: ChatMessageModel, send_message_func: Callable[[Union[str, ChatMessageModel]], Awaitable[None]] ) -> Optional[str]: """ Handle incoming chat messages and convert them to speech. Args: chat_message: The chat message to process send_message_func: Function to send chat responses (not used by TTS bot) """ try: # Find the bot instance - we need to get session name from somewhere # For now, we'll use the first available bot instance if _vibevoice_bots: session_name = list(_vibevoice_bots.keys())[0] _vibevoice_bots[session_name].handle_chat_message(chat_message) logger.info(f"VibeVoice bot processed chat message from {chat_message.sender_name}: '{chat_message.message}'") else: logger.warning("No VibeVoice bot instances available to handle chat message") except Exception as e: logger.error(f"Error handling chat message in VibeVoice bot: {e}") # TTS bot doesn't send chat responses, so return None return None # Agent descriptor exported for dynamic discovery by the FastAPI service AGENT_NAME = "VibeVoice TTS Bot" AGENT_DESCRIPTION = "Microsoft VibeVoice text-to-speech bot with visual text display" def agent_info() -> Dict[str, str]: """Return agent metadata for discovery.""" return { "name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true", "configurable": "true", "chat_enabled": "true" } def get_config_schema() -> Dict[str, Any]: """Get the configuration schema for the VibeVoice Bot.""" return { "bot_name": AGENT_NAME, "version": "1.0", "parameters": [ { "name": "width", "type": "number", "label": "Video Width", "description": "Width of the video frame in pixels", "default_value": 640, "required": False, "min_value": 320, "max_value": 1920, "step": 1 }, { "name": "height", "type": "number", "label": "Video Height", "description": "Height of the video frame in pixels", "default_value": 480, "required": False, "min_value": 240, "max_value": 1080, "step": 1 }, { "name": "fps", "type": "number", "label": "Frames Per Second", "description": "Video frame rate", "default_value": 15, "required": False, "min_value": 1, "max_value": 60, "step": 1 }, { "name": "speaker", "type": "select", "label": "TTS Speaker", "description": "Voice to use for text-to-speech", "default_value": "1", "required": True, "options": [ {"value": "1", "label": "Speaker 1 (en-Alice_woman)"}, {"value": "2", "label": "Speaker 2 (en-Carter_man)"}, {"value": "3", "label": "Speaker 3 (en-Frank_man)"}, {"value": "4", "label": "Speaker 4 (en-Mary_woman_bgm)"} ] }, { "name": "background_noise_enabled", "type": "boolean", "label": "Enable Background Noise", "description": "Add background noise to ensure continuous audio streaming", "default_value": False, "required": False }, { "name": "background_noise_type", "type": "select", "label": "Background Noise Type", "description": "Type of background noise to generate", # 'none' indicates no noise - matches default disabled behavior "default_value": "none", "required": False, "options": [ {"value": "white", "label": "White Noise"}, {"value": "pink", "label": "Pink Noise"}, {"value": "brown", "label": "Brown Noise"}, {"value": "none", "label": "None"} ] }, { "name": "background_noise_volume", "type": "number", "label": "Background Noise Volume", "description": "Volume level of background noise (0.0 to 1.0)", "default_value": 0.01, "required": False, "min_value": 0.0, "max_value": 1.0, "step": 0.001 }, { "name": "device", "type": "select", "label": "Processing Device", "description": "Device to use for TTS processing", "default_value": "cpu", "required": True, "options": [ {"value": "cpu", "label": "CPU"}, {"value": "cuda", "label": "CUDA (GPU)"}, {"value": "mps", "label": "MPS (Apple Silicon)"} ] }, { "name": "cfg_scale", "type": "number", "label": "CFG Scale", "description": "Classifier-free guidance scale for controlling generation quality", "default_value": 1.3, "required": False, "min_value": 1.0, "max_value": 2.0, "step": 0.05 }, { "name": "inference_steps", "type": "number", "label": "Inference Steps", "description": "Number of denoising steps for audio generation", "default_value": 10, "required": False, "min_value": 5, "max_value": 50, "step": 1 } ], "categories": [ { "Video Settings": ["width", "height", "fps"] }, { "TTS Settings": ["speaker", "device", "cfg_scale", "inference_steps"] }, { "Background Noise": ["background_noise_enabled", "background_noise_type", "background_noise_volume"] } ] } def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: """Factory wrapper used by the FastAPI service to instantiate tracks for an agent.""" return create_vibevoice_bot_tracks(session_name)