From 6620c0ac74f5e40b1acd259b42a4adf51b5b27ee Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Wed, 17 Sep 2025 14:06:18 -0700 Subject: [PATCH] Before claude rewrite --- .dockerignore | 1 + .github/copilot-instructions.md | 2 +- client/src/App.css | 225 ----- client/src/App.tsx | 6 +- client/src/LobbyChat.css | 6 +- client/src/index.css | 23 +- voicebot/bots/vibevoice.py | 1574 +++++++++++++++++++++++++++++++ voicebot/bots/whisper.py | 279 +++++- voicebot/requirements.txt | 354 +++---- 9 files changed, 2021 insertions(+), 449 deletions(-) create mode 100644 voicebot/bots/vibevoice.py diff --git a/.dockerignore b/.dockerignore index b045329..bc548ad 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,3 +12,4 @@ **/*.key **/package-lock.json **/*.pyc +**/VibeVoice diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 3be8dbb..94fe719 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -15,7 +15,7 @@ - Always run tests inside the appropriate Docker containers using `docker compose exec` - Use `uv run` for Python commands in voicebot and server containers - Tests should be placed in the `tests/` directory (bind mounted to `/tests` in containers) -- Use proper PYTHONPATH when running Python code: `PYTHONPATH=/shared:/voicebot` for voicebot, `PYTHONPATH=/shared:/server` for server +- Use proper PYTHONPATH when running Python code: `PYTHONPATH=/:/voicebot` for voicebot, `PYTHONPATH=/:/server` for server - Check container logs with `docker compose logs --since 10m SERVICE_NAME` for debugging ### Voicebot Testing (Python with uv) diff --git a/client/src/App.css b/client/src/App.css index de25b97..628975a 100755 --- a/client/src/App.css +++ b/client/src/App.css @@ -2,228 +2,3 @@ body { font-family: 'Droid Sans', 'Arial Narrow', Arial, sans-serif; overflow: hidden; } - -#root { - width: 100vw; -/* height: 100vh; breaks on mobile -- not needed */ -} - -.Table { - display: flex; - position: absolute; - top: 0; - left: 0; - width: 100%; - bottom: 0; - flex-direction: row; - /* background-image: url("./assets/tabletop.png"); */ -} - -.Table .Dialogs { - z-index: 10000; - display: flex; - justify-content: space-around; - align-items: center; - position: absolute; - top: 0; - left: 0; - bottom: 0; - right: 0; -} - -.Table .Dialogs .Dialog { - display: flex; - position: absolute; - flex-shrink: 1; - flex-direction: column; - padding: 0.25rem; - left: 0; - right: 0; - top: 0; - bottom: 0; - justify-content: space-around; - align-items: center; - z-index: 60000; -} - -.Table .Dialogs .Dialog > div { - display: flex; - padding: 1rem; - flex-direction: column; -} - -.Table .Dialogs .Dialog > div > div:first-child { - padding: 1rem; -} - -.Table .Dialogs .TurnNoticeDialog { - background-color: #7a680060; -} - -.Table .Dialogs .ErrorDialog { - background-color: #40000060; -} - -.Table .Dialogs .WarningDialog { - background-color: #00000060; -} - -.Table .Game { - position: relative; - display: flex; - flex-direction: column; - flex-grow: 1; -} - -.Table .Board { - display: flex; - position: relative; - flex-grow: 1; - z-index: 500; -} - -.Table .PlayersStatus { - z-index: 500; /* Under Hand */ -} - -.Table .PlayersStatus.ActivePlayer { - z-index: 1500; /* On top of Hand */ -} - -.Table .Hand { - display: flex; - position: relative; - height: 11rem; - z-index: 10000; -} - -.Table .Sidebar { - display: flex; - flex-direction: column; - justify-content: space-between; - width: 25rem; - max-width: 25rem; - overflow: hidden; - z-index: 5000; -} - 
-.Table .Sidebar .Chat { - display: flex; - position: relative; - flex-grow: 1; -} - -.Table .Trade { - display: flex; - position: relative; - z-index: 25000; - align-self: center; -} - -.Table .Dialogs { - position: absolute; - display: flex; - top: 0; - bottom: 0; - right: 0; - left: 0; - justify-content: space-around; - align-items: center; - z-index: 20000; - pointer-events: none; -} - -.Table .Dialogs > * { - pointer-events: all; -} - -.Table .ViewCard { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table .Winner { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - - -.Table .HouseRules { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table .ChooseCard { - display: flex; - position: relative; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table button { - margin: 0.25rem; - background-color: white; - border: 1px solid black; /* why !important */ -} - -.Table .MuiButton-text { - padding: 0.25rem 0.55rem; -} - -.Table button:disabled { - opacity: 0.5; - border: 1px solid #ccc; /* why !important */ -} - -.Table .ActivitiesBox { - display: flex; - flex-direction: column; - position: absolute; - left: 1em; - top: 1em; -} - -.Table .DiceRoll { - display: flex; - flex-direction: column; - position: relative; - /* - left: 1rem; - top: 5rem;*/ - flex-wrap: wrap; - justify-content: left; - align-items: left; - z-index: 1000; -} - -.Table .DiceRoll div:not(:last-child) { - border: 1px solid black; - background-color: white; - padding: 0.25rem 0.5rem; - border-radius: 0.25rem; -} -.Table .DiceRoll div:last-child { - display: flex; - flex-direction: row; -} - -.Table .DiceRoll .Dice { - margin: 0.25rem; - width: 2.75rem; - height: 2.75rem; - border-radius: 0.5rem; -} \ No newline at end of file diff --git a/client/src/App.tsx b/client/src/App.tsx index 56dfdac..533ab86 100644 --- a/client/src/App.tsx +++ b/client/src/App.tsx @@ -191,8 +191,8 @@ const LobbyView: React.FC = (props: LobbyProps) => { sx={{ p: { xs: 1, sm: 2 }, m: { xs: 0, sm: 2 }, - width: { xs: "100%", sm: "fit-content" }, - maxWidth: { xs: "100%", sm: 600 }, + // width: { xs: "100%", sm: "fit-content" }, + // maxWidth: { xs: "100%", sm: 600 }, }} > {readyState !== ReadyState.OPEN || !session ? ( @@ -299,7 +299,7 @@ const App = () => { np.ndarray(float32) containing recent audio samples (mono) + buffer: Dict[str, np.ndarray] = {} + + # session_name -> dict with status flags (is_speech, energy, is_processing, is_playing, etc.) + speech_status: Dict[str, Dict[str, Any]] = {} + + # session_name -> sample_rate used for that buffer + sample_rates: Dict[str, int] = {} + + +# Proxy wrapper for AudioStreamer to log put() calls and basic stats without +# modifying upstream VibeVoice internals. We'll wrap any created AudioStreamer +# with this to capture whether model.generate() actually calls put(). 
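A minimal sketch of how the wrapper defined just below can be exercised without loading the model at all, since it is plain delegation plus counters; _FakeStreamer here is a hypothetical stand-in for VibeVoice's AudioStreamer, used purely for illustration:

    import numpy as np

    class _FakeStreamer:
        """Hypothetical stand-in for vibevoice's AudioStreamer (illustration only)."""
        def __init__(self):
            self.chunks = []
        def put(self, audio_chunk, *args, **kwargs):
            self.chunks.append(np.asarray(audio_chunk, dtype=np.float32))
        def get_stream(self, idx=0):
            return iter(self.chunks)
        def end(self):
            pass

    proxy = ProxyAudioStreamer(_FakeStreamer(), session_name="demo")
    proxy.put(np.zeros(2400, dtype=np.float32))  # one fake 0.1 s chunk at 24 kHz
    assert proxy.put_calls == 1 and proxy.total_samples == 2400
    # After a real model.generate(..., streamer=proxy) run, put_calls == 0 is the
    # signal that the model never streamed audio, which is what this wrapper exposes.
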
+class ProxyAudioStreamer: + def __init__(self, real_streamer, session_name: Optional[str] = None): + self._real = real_streamer + self.session_name = session_name or "unknown" + self.put_calls = 0 + self.total_samples = 0 + + def put(self, audio_chunk, *args, **kwargs): + # Try to measure number of samples in the chunk for diagnostics + try: + if torch.is_tensor(audio_chunk): + length = int(audio_chunk.numel()) + else: + arr = np.array(audio_chunk) + length = int(arr.size) + except Exception: + length = -1 + + try: + # Inspect possible sample_indices positional argument for diagnostics + si_info = None + if len(args) >= 1: + try: + si = args[0] + if torch.is_tensor(si): + si_info = f"tensor(shape={tuple(si.shape)}, min={int(torch.min(si).item())}, max={int(torch.max(si).item())}, unique={int(len(torch.unique(si)))} )" + else: + arrsi = np.array(si) + si_info = f"array(shape={arrsi.shape}, min={int(arrsi.min()) if arrsi.size>0 else -1}, max={int(arrsi.max()) if arrsi.size>0 else -1}, unique={int(len(np.unique(arrsi))) if arrsi.size>0 else 0})" + except Exception: + si_info = str(type(args[0])) + + logger.info(f"VibeVoice audio: ProxyAudioStreamer.put called for session {self.session_name} - samples={length} sample_indices={si_info}") + except Exception: + pass + + self.put_calls += 1 + if length > 0: + self.total_samples += length + + return getattr(self._real, 'put')(audio_chunk, *args, **kwargs) + + def get_stream(self, *args, **kwargs): + return getattr(self._real, 'get_stream')(*args, **kwargs) + + def end(self, *args, **kwargs): + return getattr(self._real, 'end')(*args, **kwargs) + + def __getattr__(self, name): + return getattr(self._real, name) + + +# Import VibeVoice components +try: + from vibevoice import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor + from vibevoice.modular.streamer import AudioStreamer +except Exception as e: + logger.warning("VibeVoice not available. Install with: git clone https://github.com/microsoft/VibeVoice.git && cd VibeVoice && pip install -e .") + raise e + + + +class MediaClock: + """Shared clock for media synchronization.""" + + def __init__(self): + self.t0 = time.perf_counter() + + def now(self) -> float: + return time.perf_counter() - self.t0 + + +class VibeVoiceTTS: + """Minimal VibeVoice Text-to-Speech wrapper.""" + + def __init__(self, device: str = "cpu", inference_steps: int = 10, config: Optional[Dict[str, Any]] = None): + self.device = device + self.inference_steps = inference_steps + self.config = config or {} + self.model = None + self.processor = None + self.sample_rate = 24000 # VibeVoice uses 24kHz + self.is_initialized = False + self.voice_presets = {} + self.available_voices = {} + + try: + self._initialize_model() + self._setup_voice_presets() + except Exception as e: + logger.error(f"Failed to initialize VibeVoice: {e}") + + def _initialize_model(self): + """Initialize the VibeVoice model with robust device handling.""" + try: + logger.info("Loading VibeVoice model...") + + # Normalize potential 'mpx' + if self.device.lower() == "mpx": + logger.info("Note: device 'mpx' detected, treating it as 'mps'.") + self.device = "mps" + if self.device == "mps" and not torch.backends.mps.is_available(): + logger.warning("Warning: MPS not available. 
Falling back to CPU.") + self.device = "cpu" + + logger.info(f"Using device: {self.device}") + + # Load processor + self.processor = VibeVoiceProcessor.from_pretrained("vibevoice/VibeVoice-1.5B") + + # Decide dtype & attention + if self.device == "mps": + load_dtype = torch.float32 + attn_impl_primary = "sdpa" + elif self.device == "cuda": + load_dtype = torch.bfloat16 + attn_impl_primary = "flash_attention_2" + else: + load_dtype = torch.float32 + attn_impl_primary = "sdpa" + + logger.info(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}") + + # Load model + try: + if self.device == "mps": + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + attn_implementation=attn_impl_primary, + device_map=None, + ) + self.model.to("mps") + elif self.device == "cuda": + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map="cuda", + attn_implementation=attn_impl_primary, + ) + else: + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map="cpu", + attn_implementation=attn_impl_primary, + ) + except Exception as e: + if attn_impl_primary == 'flash_attention_2': + logger.warning(f"Error with flash_attention_2: {e}") + logger.info("Falling back to attention implementation: sdpa") + fallback_attn = "sdpa" + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map=(self.device if self.device in ("cuda", "cpu") else None), + attn_implementation=fallback_attn, + ) + if self.device == "mps": + self.model.to("mps") + else: + raise e + + self.model.eval() + + # Use SDE solver by default + self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config( + self.model.model.noise_scheduler.config, + algorithm_type='sde-dpmsolver++', + beta_schedule='squaredcos_cap_v2' + ) + self.model.set_ddpm_inference_steps(num_steps=self.inference_steps) + + if hasattr(self.model.model, 'language_model'): + logger.info(f"Language model attention: {self.model.model.language_model.config._attn_implementation}") + + self.is_initialized = True + logger.info("VibeVoice model loaded successfully!") + + except Exception as e: + logger.error(f"Error loading VibeVoice model: {e}") + raise + + def _setup_voice_presets(self): + """Setup voice presets by scanning the voices directory.""" + # Look for voices directory in multiple possible locations + possible_voice_dirs = [ + os.path.join(os.path.dirname(__file__), "voices"), # /voicebot/bots/voices/ + os.path.join(os.path.dirname(__file__), "..", "VibeVoice", "demo", "voices"), # /voicebot/VibeVoice/demo/voices/ + "/voicebot/VibeVoice/demo/voices", # Absolute path + ] + + voices_dir = None + for possible_dir in possible_voice_dirs: + if os.path.exists(possible_dir): + voices_dir = possible_dir + break + + # Check if voices directory exists + if not voices_dir: + logger.warning(f"Warning: Voices directory not found in any of: {possible_voice_dirs}") + self.voice_presets = {} + self.available_voices = {} + self.speaker_mapping = {} + return + + # Scan for all WAV files in the voices directory + self.voice_presets = {} + + # Get all supported audio files + audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac') + audio_files = [f for f in os.listdir(voices_dir) + if 
f.lower().endswith(audio_extensions) and os.path.isfile(os.path.join(voices_dir, f))] + + # Create dictionary with filename (without extension) as key + for audio_file in audio_files: + # Remove extension to get the name + name = os.path.splitext(audio_file)[0] + # Create full path + full_path = os.path.join(voices_dir, audio_file) + self.voice_presets[name] = full_path + + # Sort the voice presets alphabetically by name for better UI + self.voice_presets = dict(sorted(self.voice_presets.items())) + + # Filter out voices that don't exist (this is now redundant but kept for safety) + self.available_voices = { + name: path for name, path in self.voice_presets.items() + if os.path.exists(path) + } + + # Map speaker numbers (1, 2, 3, 4) to available voice files + self.speaker_mapping = {} + available_voice_names = list(self.available_voices.keys()) + for i in range(1, 5): # Support speakers 1-4 + if i <= len(available_voice_names): + voice_name = available_voice_names[i-1] # 0-indexed + self.speaker_mapping[str(i)] = voice_name + logger.info(f"Mapped Speaker {i} to voice '{voice_name}'") + else: + logger.warning(f"No voice file available for Speaker {i}") + + if not self.available_voices: + logger.warning("No voice presets found. Please add audio files to the voices directory.") + else: + logger.info(f"Found {len(self.available_voices)} voice files in {voices_dir}") + logger.info(f"Available voices: {', '.join(self.available_voices.keys())}") + logger.info(f"Speaker mapping: {self.speaker_mapping}") + + def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray: + """Read and preprocess audio file.""" + try: + import soundfile as sf + wav, sr = sf.read(audio_path) + if len(wav.shape) > 1: + wav = np.mean(wav, axis=1) + if sr != target_sr: + wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr) + return wav + except Exception as e: + logger.error(f"Error reading audio {audio_path}: {e}") + return np.array([]) + + def generate_speech(self, text: str, speaker: str = "1", cfg_scale: float = 1.3) -> Optional[np.ndarray]: + """Generate speech using the AudioStreamer and return a single concatenated numpy array. + + This removes the old synchronous model.generate path and uses the streamer-based + generation even for blocking calls. Returns None if generation isn't possible. 
+ """ + # Must have model initialized and streamer available + if not self.is_initialized: + logger.error("VibeVoice TTS: Model not initialized - cannot generate speech synchronously") + return None + + try: + # Prepare formatted text and voice samples (same as demo) + formatted_text = f"Speaker {speaker}: {text}" + voice_samples = [] + if speaker in self.speaker_mapping: + voice_name = self.speaker_mapping[speaker] + if voice_name in self.available_voices: + audio_path = self.available_voices[voice_name] + audio_data = self.read_audio(audio_path) + if len(audio_data) > 0: + voice_samples.append(audio_data) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + + inputs = self.processor( # type: ignore + text=[formatted_text], + voice_samples=[voice_samples], + padding=True, + return_tensors="pt" + ) + + # Move tensors to device + target_device = self.device if self.device in ("cuda", "mps") else "cpu" + for k, v in inputs.items(): + if torch.is_tensor(v): + inputs[k] = v.to(target_device) + + # Create streamer and run generation + real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) + audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) + + with torch.no_grad(): + try: + self.model.generate( # type: ignore + **inputs, + max_new_tokens=None, + cfg_scale=cfg_scale, + tokenizer=self.processor.tokenizer, # type: ignore + generation_config={'do_sample': False}, + verbose=False, + streamer=audio_streamer, + ) + finally: + # ensure streamer end if model.generate returns + try: + audio_streamer.end() + except Exception: + pass + + # Collect streamed chunks + collected = [] + for audio_chunk in audio_streamer.get_stream(0): + try: + if torch.is_tensor(audio_chunk): + if audio_chunk.dtype == torch.bfloat16: + audio_chunk = audio_chunk.float() + audio_np = audio_chunk.cpu().numpy().astype(np.float32) + else: + audio_np = np.array(audio_chunk, dtype=np.float32) + + if audio_np.ndim > 1: + audio_np = audio_np.squeeze() + + collected.append(audio_np) + except Exception as e: + logger.error(f"VibeVoice TTS: Error collecting chunk: {e}") + + if not collected: + logger.error("VibeVoice TTS: No audio chunks received from streamer") + return None + + audio = np.concatenate(collected) + + # Mix with background noise if enabled + noise_type = self.config.get('background_noise_type', 'none') + noise_volume = self.config.get('background_noise_volume', 0.0) + audio = self.mix_audio_with_background_noise(audio, noise_type, noise_volume) + + # Resample to 16kHz for compatibility with existing audio pipeline + audio_16k = librosa.resample(audio, orig_sr=24000, target_sr=16000) + return audio_16k.astype(np.float32) + + except Exception as e: + logger.error(f"VibeVoice TTS: Error generating speech via streamer: {e}") + return None + + def generate_background_noise(self, duration_seconds: float, noise_type: str = "white", volume: float = 0.01, sample_rate: Optional[int] = None) -> np.ndarray: + """Generate background noise of specified type and duration.""" + if sample_rate is None: + sample_rate = self.sample_rate + + if noise_type == "none": + return np.zeros(int(duration_seconds * sample_rate), dtype=np.float32) + + num_samples = int(duration_seconds * sample_rate) + + if noise_type == "white": + # White noise - equal power across all frequencies + noise = np.random.normal(0, 1, num_samples).astype(np.float32) + elif noise_type == "pink": + # Pink noise - 1/f frequency response (approximated) + white = 
np.random.normal(0, 1, num_samples).astype(np.float32) + # Simple pink noise approximation using IIR filter + b = [0.049922035, -0.095993537, 0.050612699, -0.004408786] + a = [1, -2.494956002, 2.017265875, -0.522189400] + noise = np.zeros_like(white) + for i in range(len(b), len(white)): + noise[i] = b[0] * white[i] + b[1] * white[i-1] + b[2] * white[i-2] + b[3] * white[i-3] - a[1] * noise[i-1] - a[2] * noise[i-2] - a[3] * noise[i-3] + elif noise_type == "brown": + # Brown noise - 1/f² frequency response (integrated white noise) + white = np.random.normal(0, 1, num_samples).astype(np.float32) + noise = np.cumsum(white) + # Normalize to prevent drift + noise = (noise - np.mean(noise)) / np.std(noise) + else: + # Default to white noise + noise = np.random.normal(0, 1, num_samples).astype(np.float32) + + # Apply volume + noise *= volume + return noise + + def mix_audio_with_background_noise(self, audio: np.ndarray, noise_type: str = "white", volume: float = 0.01) -> np.ndarray: + """Mix generated audio with background noise.""" + # Default to disabled when not present in config to avoid unexpected noise + if not self.config.get('background_noise_enabled', False): + return audio + + # Generate background noise for the duration of the audio using the TTS sample rate + duration_seconds = len(audio) / self.sample_rate + background_noise = self.generate_background_noise(duration_seconds, noise_type, volume, self.sample_rate) + + # Mix audio with background noise + mixed_audio = audio + background_noise + + # Normalize to prevent clipping + max_val = np.max(np.abs(mixed_audio)) + if max_val > 1.0: + mixed_audio /= max_val + + return mixed_audio + + +class VibeVoiceVideoTrack(MediaStreamTrack): + """Video track that displays text being spoken.""" + + kind = "video" + + def __init__(self, clock, config: Dict[str, Any], session_name: Optional[str] = None): + super().__init__() + self.clock = clock + self.config = config + # Keep session_name for looking up waveform buffers and status + self.session_name = session_name or config.get('session_name') or f"VibeVoice:{int(time.time())}" + self.width = config.get('width', 640) + self.height = config.get('height', 480) + self.fps = config.get('fps', 15) + + # Text display state + self.current_text = "" + self.text_queue = queue.Queue() + self.display_start_time = 0 + self.display_duration = 3.0 # seconds to display each text + self.frame_count = 0 + + # Font settings + self.font = cv2.FONT_HERSHEY_SIMPLEX + self.font_scale = min(self.width, self.height) / 800 + self.font_thickness = max(1, int(self.font_scale * 2)) + + def update_text(self, text: str): + """Update the text to display.""" + self.text_queue.put(text) + logger.info(f"VibeVoice video: Queued text '{text}'") + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update video configuration.""" + try: + self.config.update(config_updates) + if 'width' in config_updates: + self.width = config_updates['width'] + if 'height' in config_updates: + self.height = config_updates['height'] + if 'fps' in config_updates: + self.fps = config_updates['fps'] + return True + except Exception as e: + logger.error(f"Error updating video config: {e}") + return False + + async def next_timestamp(self) -> Tuple[int, float]: + """Get next timestamp for video frame.""" + pts = int(self.frame_count * (90000 / self.fps)) + time_base = 1 / 90000 + return pts, time_base + + async def recv(self) -> VideoFrame: + """Generate video frame with current text.""" + # Update current text if needed + current_time = 
time.time() + if (not self.current_text or + current_time - self.display_start_time > self.display_duration): + try: + self.current_text = self.text_queue.get_nowait() + self.display_start_time = current_time + logger.info(f"VibeVoice video: Displaying '{self.current_text}'") + except queue.Empty: + self.current_text = "" + # Create frame + frame = np.zeros((self.height, self.width, 3), dtype=np.uint8) + + if self.current_text: + # Add background + cv2.rectangle(frame, (0, 0), (self.width, self.height), (0, 0, 0), -1) + + # Split text into lines if too long + words = self.current_text.split() + lines = [] + current_line = "" + max_chars_per_line = int(self.width / (self.font_scale * 20)) + + for word in words: + if len(current_line + " " + word) <= max_chars_per_line: + current_line += " " + word if current_line else word + else: + if current_line: + lines.append(current_line) + current_line = word + if current_line: + lines.append(current_line) + + # Draw text lines + line_height = int(self.font_scale * 40) + total_text_height = len(lines) * line_height + start_y = (self.height - total_text_height) // 2 + line_height + + for i, line in enumerate(lines): + text_size = cv2.getTextSize(line, self.font, self.font_scale, self.font_thickness)[0] + text_x = (self.width - text_size[0]) // 2 + text_y = start_y + i * line_height + + # Add text shadow + cv2.putText(frame, line, (text_x + 2, text_y + 2), + self.font, self.font_scale, (0, 0, 0), self.font_thickness + 1) + # Add main text + cv2.putText(frame, line, (text_x, text_y), + self.font, self.font_scale, (255, 255, 255), self.font_thickness) + else: + # Default background when no text + cv2.putText(frame, "VibeVoice TTS", (50, self.height // 2), + self.font, self.font_scale * 2, (255, 255, 255), self.font_thickness) + + # Draw waveform and status overlays from shared WaveformVideoTrack buffers + try: + pname = self.session_name + buf = WaveformVideoTrack.buffer.get(pname, None) + status = WaveformVideoTrack.speech_status.get(pname, {}) + + # Draw small status box in top-left + status_text = "Idle" + if status.get('is_processing'): + status_text = "Processing..." 
+ elif status.get('is_speech'): + status_text = "Speaking" + elif buf is not None and len(buf) > 0: + # buffered seconds approx + sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) + buffered_sec = len(buf) / float(sr) if sr > 0 else 0.0 + status_text = f"Buffered: {buffered_sec:.1f}s" + + box_w = int(self.width * 0.28) + box_h = int(self.height * 0.12) + cv2.rectangle(frame, (10, 10), (10 + box_w, 10 + box_h), (50, 50, 50), -1) + cv2.putText(frame, status_text, (20, 10 + int(box_h/2)), self.font, self.font_scale, (200, 200, 200), self.font_thickness) + + # Draw small energy meter + energy = status.get('energy', 0.0) + meter_h = int(box_h * 0.4) + meter_w = int(box_w * 0.6) + mx = 20 + my = 10 + box_h - 5 + filled = int(min(1.0, energy * 50.0) * meter_w) + cv2.rectangle(frame, (mx, my - meter_h), (mx + meter_w, my), (80, 80, 80), -1) + cv2.rectangle(frame, (mx, my - meter_h), (mx + filled, my), (0, 200, 0), -1) + + # Draw waveform at bottom area + if buf is not None and buf.size > 4: + sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) + # Use last N samples corresponding to width pixels + samples_to_show = min(buf.size, max(1, int(sr * 5))) # show up to last 5s + slice_buf = buf[-samples_to_show:] + + # Downsample to width points + idx = (np.linspace(0, samples_to_show - 1, num=self.width)).astype(np.int32) + waveform = slice_buf[idx] + # Normalize waveform to -1..1 + maxv = np.max(np.abs(waveform)) if waveform.size > 0 else 1.0 + if maxv <= 0: + maxv = 1.0 + waveform = waveform / maxv + + # Map to pixel coordinates in bottom strip + wf_h = int(self.height * 0.22) + wf_y0 = self.height - wf_h - 10 + pts = [] + for i, v in enumerate(waveform): + px = int(i * (self.width / len(waveform))) + py = int(wf_y0 + (wf_h / 2) * (1 - v)) + pts.append((px, py)) + + if len(pts) >= 2: + cv2.polylines(frame, [np.array(pts, dtype=np.int32)], False, (100, 200, 255), 1) + # Fill under curve for nicer look + fill_pts = pts + [(self.width - 1, wf_y0 + wf_h), (0, wf_y0 + wf_h)] + cv2.fillPoly(frame, [np.array(fill_pts, dtype=np.int32)], (30, 60, 80)) + except Exception: + # Non-critical rendering failure shouldn't break video + pass + + self.frame_count += 1 + return VideoFrame.from_ndarray(frame, format="bgr24") + + +class VibeVoiceAudioTrack(MediaStreamTrack): + """Audio track that plays TTS speech.""" + + kind = "audio" + + def __init__(self, clock, config: Dict[str, Any], tts_engine: VibeVoiceTTS, session_name: Optional[str] = None): + super().__init__() + self.clock = clock + self.config = config + self.tts = tts_engine + self.sample_rate = config.get('sample_rate', 16000) + self.samples_per_frame = config.get('samples_per_frame', 960) # 60ms at 16kHz + + # Audio playback state + self.audio_queue = queue.Queue() + self.current_audio = None + self.audio_position = 0 + self.is_speaking = False + self.speaker = config.get('speaker', 'Alice') + + # Audio buffer for mixing multiple TTS segments + self.audio_buffer = np.array([], dtype=np.float32) + self.buffer_lock = threading.Lock() + + # Optional looping and debug options + self.loop = config.get('loop', True) + self.debug_save_wav = config.get('debug_save_wav', True) + # Keep the last fully-generated audio to enable looping + self.last_generated_audio = np.array([], dtype=np.float32) + # Protect last_generated_audio updates + self._last_gen_lock = threading.Lock() + + # Track total samples generated for proper PTS calculation + self._samples_generated = 0 + # Optional session name used 
to publish waveform data for visualization + self.session_name = session_name or f"VibeVoice:{int(time.time())}" + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update audio configuration.""" + try: + self.config.update(config_updates) + if 'sample_rate' in config_updates: + self.sample_rate = config_updates['sample_rate'] + if 'samples_per_frame' in config_updates: + self.samples_per_frame = config_updates['samples_per_frame'] + if 'speaker' in config_updates: + self.speaker = config_updates['speaker'] + if 'loop' in config_updates: + self.loop = bool(config_updates['loop']) + logger.info(f"🔁 Looping {'enabled' if self.loop else 'disabled'} for session {self.session_name}") + if 'debug_save_wav' in config_updates: + self.debug_save_wav = bool(config_updates['debug_save_wav']) + logger.info(f"🐞 Debug save wav {'enabled' if self.debug_save_wav else 'disabled'} for session {self.session_name}") + + # Log background noise configuration updates + background_noise_updated = False + if 'background_noise_enabled' in config_updates: + logger.info(f"🎵 Background noise enabled: {config_updates['background_noise_enabled']}") + background_noise_updated = True + if 'background_noise_type' in config_updates: + logger.info(f"🎵 Background noise type: {config_updates['background_noise_type']}") + background_noise_updated = True + if 'background_noise_volume' in config_updates: + logger.info(f"🎵 Background noise volume: {config_updates['background_noise_volume']}") + background_noise_updated = True + + if background_noise_updated: + logger.info("🎵 Background noise configuration updated - changes will take effect on next audio frame") + + return True + except Exception as e: + logger.error(f"Error updating audio config: {e}") + return False + + def speak_text(self, text: str, cfg_scale: Optional[float] = None): + """Queue text for speech synthesis.""" + if cfg_scale is None: + cfg_scale = 1.3 # Default value + + logger.info(f"VibeVoice audio: Starting background TTS generation for '{text}' with cfg_scale={cfg_scale}") + + # Start TTS generation in a background thread + import threading + thread = threading.Thread( + target=self._generate_tts_background, + args=(text, self.speaker, cfg_scale), + daemon=True + ) + thread.start() + + def _generate_tts_background(self, text: str, speaker: str, cfg_scale: float): + """Generate TTS in background thread and add to audio buffer.""" + try: + logger.info(f"VibeVoice audio: Background TTS generation started for '{text}'") + + # Log some diagnostic info about the TTS engine state + try: + logger.info(f"VibeVoice audio: TTS engine initialized={getattr(self.tts, 'is_initialized', False)}, device={getattr(self.tts, 'device', None)}, tts_sample_rate={getattr(self.tts, 'sample_rate', None)}") + # available_voices and speaker_mapping may be large; log summaries + try: + avv = getattr(self.tts, 'available_voices', {}) + smap = getattr(self.tts, 'speaker_mapping', {}) + logger.info(f"VibeVoice audio: available_voices={list(avv.keys())[:5]} (count={len(avv)}), speaker_mapping_count={len(smap)}") + except Exception: + pass + except Exception: + pass + + # Mark processing state for video overlay + try: + WaveformVideoTrack.speech_status[self.session_name] = WaveformVideoTrack.speech_status.get(self.session_name, {}) + WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = True + except Exception: + pass + + # Require model and streamer to be available for streaming generation + if not self.tts.is_initialized: + logger.error("VibeVoice 
audio: Model or AudioStreamer not available - background generation disabled") + return + + # Prepare formatted text and inputs (same expectations as generate_speech) + formatted_text = f"Speaker {speaker}: {text}" + voice_samples = [] + if speaker in self.tts.speaker_mapping: + voice_name = self.tts.speaker_mapping[speaker] + if voice_name in self.tts.available_voices: + audio_path = self.tts.available_voices[voice_name] + audio_data = self.tts.read_audio(audio_path) + if len(audio_data) > 0: + voice_samples.append(audio_data) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + + inputs = self.tts.processor( # type: ignore + text=[formatted_text], + voice_samples=[voice_samples], + padding=True, + return_tensors="pt" + ) + + # Move tensors to device + target_device = self.tts.device if self.tts.device in ("cuda", "mps") else "cpu" + for k, v in inputs.items(): + if torch.is_tensor(v): + inputs[k] = v.to(target_device) + + # Log a summary of inputs for diagnostic purposes + try: + inp_summary = {} + for k, v in inputs.items(): + if torch.is_tensor(v): + inp_summary[k] = f"tensor(shape={tuple(v.shape)}, dtype={v.dtype})" + else: + try: + inp_summary[k] = f"{type(v).__name__}(len={len(v)})" + except Exception: + inp_summary[k] = type(v).__name__ + logger.info(f"VibeVoice audio: Input summary for generation: {inp_summary}") + except Exception: + pass + + # Create audio streamer and start model.generate in a separate thread + real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) + audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) + + def _run_generate(): + try: + logger.info(f"VibeVoice audio: model.generate starting for session {self.session_name}") + with torch.no_grad(): + self.tts.model.generate( # type: ignore + **inputs, + max_new_tokens=None, + cfg_scale=cfg_scale, + tokenizer=self.tts.processor.tokenizer, # type: ignore + generation_config={'do_sample': False}, + verbose=False, + streamer=audio_streamer, + ) + except Exception as e: + logger.error(f"VibeVoice audio: Error during model.generate: {e}") + finally: + # Ensure streamer is ended + try: + audio_streamer.end() + except Exception: + pass + logger.info(f"VibeVoice audio: model.generate finished for session {self.session_name}") + + gen_thread = threading.Thread(target=_run_generate, daemon=True) + gen_thread.start() + + # Consume chunks from streamer and append to audio buffer as they arrive + generated_chunks = [] + chunk_count = 0 + total_samples_streamed = 0 + logger.info(f"VibeVoice audio: Audio streamer started for session {self.session_name}") + try: + logger.info(f"VibeVoice audio: audio_streamer repr: {repr(audio_streamer)[:400]}") + gs = None + try: + gs = audio_streamer.get_stream(0) + logger.info(f"VibeVoice audio: get_stream returned object type: {type(gs)}") + except Exception as _e: + logger.error(f"VibeVoice audio: calling audio_streamer.get_stream raised: {_e}") + gs = None + except Exception: + gs = None + + if gs is None: + logger.warning(f"VibeVoice audio: audio_streamer.get_stream did not return a stream for session {self.session_name}") + iterator = [] + else: + iterator = gs + + for audio_chunk in iterator: + try: + # Convert tensor to numpy if needed + if torch.is_tensor(audio_chunk): + if audio_chunk.dtype == torch.bfloat16: + audio_chunk = audio_chunk.float() + audio_np = audio_chunk.cpu().numpy().astype(np.float32) + else: + audio_np = np.array(audio_chunk, dtype=np.float32) + + # Squeeze to 1D 
if needed + if audio_np.ndim > 1: + audio_np = audio_np.squeeze() + + # Resample from model sampling rate (usually 24000) to track sample rate + if hasattr(self.tts, 'sample_rate') and self.tts.sample_rate != self.sample_rate: + try: + audio_np = librosa.resample(audio_np, orig_sr=self.tts.sample_rate, target_sr=self.sample_rate) + except Exception: + # If resample fails, keep original chunk + pass + + # Append to internal buffer + with self.buffer_lock: + if len(self.audio_buffer) == 0: + self.audio_buffer = audio_np + else: + self.audio_buffer = np.concatenate([self.audio_buffer, audio_np]) + + # Also collect into generated_chunks for possible looping/debug save + try: + generated_chunks.append(audio_np.astype(np.float32)) + except Exception: + pass + + total_samples_streamed += len(audio_np) + chunk_count += 1 + # Log every few chunks to avoid log spam + if chunk_count % 5 == 0: + logger.info(f"VibeVoice audio: Streamed {total_samples_streamed} samples so far for session {self.session_name} (chunks={chunk_count})") + else: + logger.debug(f"VibeVoice audio: Streamed {len(audio_np)} samples to buffer (total buffer: {len(self.audio_buffer)})") + + # Also publish into the global waveform buffer used by WaveformVideoTrack + try: + if WaveformVideoTrack is not None: + pname = self.session_name + # Ensure buffer key exists + if pname not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) + + # Append to shared waveform buffer + WaveformVideoTrack.buffer[pname] = np.concatenate([ + WaveformVideoTrack.buffer[pname], audio_np.astype(np.float32) + ]) + + # Ensure sample rate is set for this session + WaveformVideoTrack.sample_rates[pname] = self.sample_rate + + # Limit buffer to last 10 seconds for this track + max_samples = int(self.sample_rate * 10) + if len(WaveformVideoTrack.buffer[pname]) > max_samples: + WaveformVideoTrack.buffer[pname] = WaveformVideoTrack.buffer[pname][-max_samples:] + + # Update a lightweight speech_status for display + energy = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2))) if audio_np.size > 0 else 0.0 + # Approximate zero-crossing rate + try: + if audio_np.size > 1: + zcr = float(np.mean(np.abs(np.diff(np.sign(audio_np)) ) > 0)) + else: + zcr = 0.0 + except Exception: + zcr = 0.0 + + is_speech = energy > 0.005 + + WaveformVideoTrack.speech_status[pname] = { + 'is_speech': bool(is_speech), + 'energy': float(energy), + 'zcr': float(zcr), + 'centroid': 0.0, + 'rolloff': 0.0, + 'flux': 0.0, + 'harmonicity': 0.0, + 'noise_floor_energy': 0.0, + 'adaptive_threshold': 0.0, + 'energy_check': bool(energy > 0.002), + 'zcr_check': bool(zcr > 0.01), + 'spectral_check': False, + 'harmonic_check': False, + 'temporal_consistency': True, + 'is_processing': True, + 'is_playing': False, + } + except Exception: + # Non-critical - don't break TTS on visualization failures + pass + except Exception as e: + logger.error(f"VibeVoice audio: Error processing audio chunk from streamer: {e}") + + # Ensure generation thread finishes + gen_thread.join(timeout=5.0) + + # If generation thread is still alive after join, log a warning + if gen_thread.is_alive(): + logger.warning(f"VibeVoice audio: generation thread still alive after join for session {self.session_name}") + + # When generation completes, store last_generated_audio for looping and optionally save debug WAV + logger.info(f"VibeVoice audio: Generation completed for session {self.session_name}. 
total_samples_streamed={total_samples_streamed}, chunks={chunk_count}") + + # If no chunks were received, emit a diagnostic warning with some state to help debugging + if chunk_count == 0: + try: + # Provide more diagnostic info: inputs summary and streamer introspection + try: + sdi = { + 'repr': repr(audio_streamer)[:400], + 'dir': [n for n in dir(audio_streamer) if not n.startswith('_')][:40] + } + except Exception: + sdi = {'repr': 'unavailable', 'dir': []} + + try: + logger.warning( + f"VibeVoice audio: No audio chunks were streamed for session {self.session_name}. " + f"is_initialized={getattr(self.tts, 'is_initialized', False)}, model_present={hasattr(self.tts, 'model')} ; " + f"audio_streamer={sdi}" + ) + except Exception: + logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (diagnostics failed)") + except Exception: + logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (additional diagnostics unavailable)") + # Fallback: attempt a synchronous generation that returns a full numpy audio array + try: + logger.info(f"VibeVoice audio: Attempting synchronous fallback generation for session {self.session_name}") + fallback_audio = None + try: + fallback_audio = self.tts.generate_speech(text, speaker, cfg_scale=cfg_scale) + except Exception as e: + logger.error(f"VibeVoice audio: synchronous fallback generation raised: {e}") + + if fallback_audio is not None and getattr(fallback_audio, 'size', 0) > 0: + try: + fa = fallback_audio.astype(np.float32) + except Exception: + fa = np.array(fallback_audio, dtype=np.float32) + + # Resample if needed + try: + tts_sr = getattr(self.tts, 'sample_rate', 24000) + if tts_sr != self.sample_rate: + fa = librosa.resample(fa, orig_sr=tts_sr, target_sr=self.sample_rate) + except Exception: + pass + + # Append into internal buffer and last_generated_audio + with self.buffer_lock: + if len(self.audio_buffer) == 0: + self.audio_buffer = fa + else: + self.audio_buffer = np.concatenate([self.audio_buffer, fa]) + with self._last_gen_lock: + self.last_generated_audio = fa.copy() + + # Publish to waveform buffer + try: + pname = self.session_name + if pname not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) + WaveformVideoTrack.buffer[pname] = np.concatenate([WaveformVideoTrack.buffer[pname], fa.astype(np.float32)]) + WaveformVideoTrack.sample_rates[pname] = self.sample_rate + except Exception: + pass + + # Optionally save debug wav + if self.debug_save_wav: + try: + try: + import soundfile as sf + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + sf.write(fname, fa, samplerate=self.sample_rate) + logger.info(f"🐞 Saved fallback generated wav to {fname} (soundfile)") + except Exception: + try: + from scipy.io import wavfile + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + wavfile.write(fname, self.sample_rate, (fa * 32767).astype('int16')) + logger.info(f"🐞 Saved fallback generated wav to {fname} (scipy)") + except Exception: + try: + import wave + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + with wave.open(fname, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(self.sample_rate) + int_data = (fa * 32767).astype('int16') + wf.writeframes(int_data.tobytes()) + logger.info(f"🐞 Saved fallback generated wav to {fname} (wave)") + except Exception as e: + logger.error(f"Error saving fallback debug wav (all methods 
failed): {e}") + except Exception as e: + logger.error(f"Error saving fallback debug wav: {e}") + + logger.info(f"VibeVoice audio: Fallback synchronous generation successful for session {self.session_name} (samples={len(fa)})") + else: + logger.warning(f"VibeVoice audio: Fallback synchronous generation produced no audio for session {self.session_name}") + except Exception as e: + logger.error(f"VibeVoice audio: Exception during synchronous fallback generation: {e}") + try: + if len(generated_chunks) > 0: + try: + all_gen = np.concatenate(generated_chunks).astype(np.float32) + except Exception: + all_gen = np.array([], dtype=np.float32) + with self._last_gen_lock: + self.last_generated_audio = all_gen.copy() + + # Optionally save to disk for debugging + if self.debug_save_wav: + try: + try: + import soundfile as sf + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + sf.write(fname, all_gen, samplerate=self.sample_rate) + logger.info(f"🐞 Saved generated wav to {fname} (soundfile)") + except Exception: + # Try scipy fallback + try: + from scipy.io import wavfile + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + # scipy expects int16 + wavfile.write(fname, self.sample_rate, (all_gen * 32767).astype('int16')) + logger.info(f"🐞 Saved generated wav to {fname} (scipy)") + except Exception: + # Ultimate fallback: write raw wave via wave module + try: + import wave + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + with wave.open(fname, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(self.sample_rate) + int_data = (all_gen * 32767).astype('int16') + wf.writeframes(int_data.tobytes()) + logger.info(f"🐞 Saved generated wav to {fname} (wave)") + except Exception as e: + logger.error(f"Error saving debug wav (all methods failed): {e}") + except Exception as e: + logger.error(f"Error saving debug wav: {e}") + + except Exception: + pass + + # Clear processing flag when generation completes + try: + if self.session_name in WaveformVideoTrack.speech_status: + WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False + except Exception: + pass + + except Exception as e: + logger.error(f"VibeVoice audio: Error in background TTS generation: {e}") + + def _get_samples_from_buffer(self, num_samples: int) -> np.ndarray: + """Get samples from audio buffer, removing them from buffer.""" + # Try to refill from last_generated_audio if looping is enabled + with self._last_gen_lock: + last_gen = self.last_generated_audio.copy() if getattr(self, 'last_generated_audio', None) is not None else np.array([], dtype=np.float32) + + with self.buffer_lock: + if len(self.audio_buffer) == 0: + # If we're configured to loop and have a generated sample, refill the buffer + if getattr(self, 'loop', False) and last_gen.size > 0: + try: + # Repeat last_gen as needed to reach at least num_samples + repeats = int(math.ceil(float(num_samples) / float(len(last_gen)))) if len(last_gen) > 0 else 1 + refill = np.tile(last_gen, repeats) + self.audio_buffer = refill.astype(np.float32) + logger.debug(f"VibeVoice audio: Refilled audio_buffer from last_generated_audio (len={len(last_gen)}) repeats={repeats}") + except Exception: + # Fallback to silence on any failure + self.audio_buffer = np.zeros(num_samples, dtype=np.float32) + else: + return np.zeros(num_samples, dtype=np.float32) + + if len(self.audio_buffer) >= num_samples: + samples = self.audio_buffer[:num_samples] + self.audio_buffer = self.audio_buffer[num_samples:] + return 
samples + else: + # Return remaining samples and pad with zeros + samples = self.audio_buffer + padding = np.zeros(num_samples - len(self.audio_buffer), dtype=np.float32) + self.audio_buffer = np.array([], dtype=np.float32) + return np.concatenate([samples, padding]) + + async def next_timestamp(self) -> Tuple[int, float]: + """Get next timestamp for audio frame.""" + pts = self._samples_generated + time_base = 1 / self.sample_rate + return pts, time_base + + async def recv(self) -> AudioFrame: + """Generate audio frame with TTS speech from buffer.""" + # Get samples from buffer + samples = self._get_samples_from_buffer(self.samples_per_frame) + + # If no TTS audio available, generate background noise + if np.all(samples == 0): + # Default to disabled when not present in config to avoid unexpected noise + if self.config.get('background_noise_enabled', False): + noise_type = self.config.get('background_noise_type', 'white') + noise_volume = self.config.get('background_noise_volume', 0.01) + # Generate noise for this frame duration + frame_duration = self.samples_per_frame / self.sample_rate + logger.debug(f"🎵 Generating background noise: type={noise_type}, volume={noise_volume}, duration={frame_duration:.3f}s") + background_noise = self.tts.generate_background_noise(frame_duration, noise_type, noise_volume, self.sample_rate) + logger.debug(f"🎵 Generated background noise: {len(background_noise)} samples") + samples = background_noise + else: + # Generate silence if background noise is disabled + logger.debug("🎵 Background noise disabled - generating silence") + samples = np.zeros(self.samples_per_frame, dtype=np.float32) + + # Convert to 16-bit PCM + # Update shared speech_status for visualization: energy + playing flag + try: + energy = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2))) if samples.size > 0 else 0.0 + pname = self.session_name + st = WaveformVideoTrack.speech_status.get(pname, {}) + st['energy'] = float(energy) + # Consider playing when energy above small threshold + st['is_playing'] = bool(energy > 0.001) + st['is_speech'] = bool(energy > 0.003) + WaveformVideoTrack.speech_status[pname] = st + except Exception: + pass + + samples_int16 = (samples * 32767).astype(np.int16) + + # Create stereo audio (duplicate mono channel) + left = samples_int16 + right = samples_int16.copy() + stereo = np.empty(self.samples_per_frame * 2, dtype=np.int16) + stereo[0::2] = left + stereo[1::2] = right + + # Create audio frame + frame = AudioFrame.from_ndarray(stereo.reshape(1, -1), format="s16", layout="stereo") + frame.sample_rate = self.sample_rate + frame.pts = self._samples_generated + frame.time_base = fractions.Fraction(1, self.sample_rate) + + # Increment sample counter + self._samples_generated += self.samples_per_frame + + return frame + + +class VibeVoiceTTSBot: + """VibeVoice Text-to-Speech Bot for voicebot framework.""" + + def __init__(self, session_name: str, config: Optional[Dict[str, Any]] = None): + self.session_name = session_name + self.config = config or {} + + # Initialize TTS engine with enhanced parameters + device = self.config.get('device', 'cpu') + inference_steps = self.config.get('inference_steps', 10) + self.tts_engine = VibeVoiceTTS(device=device, inference_steps=inference_steps, config=self.config) + + # Store generation parameters + self.cfg_scale = self.config.get('cfg_scale', 1.3) + self.speaker = self.config.get('speaker', '1') + + # Initialize media components + self.media_clock = MediaClock() + # Pass session name into video track so it can show 
per-session waveform/status + self.video_track = VibeVoiceVideoTrack(self.media_clock, self.config, session_name=session_name) + self.audio_track = VibeVoiceAudioTrack(self.media_clock, self.config, self.tts_engine, session_name=session_name) + + # Initialize shared waveform store sample rate and empty buffer/status + try: + WaveformVideoTrack.sample_rates[session_name] = self.config.get('sample_rate', 16000) + if session_name not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[session_name] = np.array([], dtype=np.float32) + if session_name not in WaveformVideoTrack.speech_status: + WaveformVideoTrack.speech_status[session_name] = {'is_speech': False, 'energy': 0.0, 'is_processing': False, 'is_playing': False} + except Exception: + pass + + # Apply initial configuration values to ensure defaults from schema/config provider + try: + self.update_config(self.config) + except Exception: + # Don't let config application stop initialization + pass + + logger.info(f"VibeVoice bot initialized for session {session_name} with cfg_scale={self.cfg_scale}, speaker={self.speaker}") + + def get_tracks(self) -> Dict[str, MediaStreamTrack]: + """Get video and audio tracks.""" + return { + "video": self.video_track, + "audio": self.audio_track + } + + def handle_chat_message(self, message: ChatMessageModel): + """Handle incoming chat messages by converting them to speech.""" + try: + text = message.message.strip() + if text: + logger.info(f"VibeVoice bot received chat: '{text}' from {message.sender_name}") + + # Queue text for both video display and audio speech + self.video_track.update_text(text) + self.audio_track.speak_text(text, self.cfg_scale) + + except Exception as e: + logger.error(f"Error handling chat message in VibeVoice bot: {e}") + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update bot configuration.""" + try: + self.config.update(config_updates) + + # Update TTS-specific parameters + if 'cfg_scale' in config_updates: + self.cfg_scale = config_updates['cfg_scale'] + if 'speaker' in config_updates: + self.speaker = config_updates['speaker'] + + # Update tracks + video_success = self.video_track.update_config(config_updates) + audio_success = self.audio_track.update_config(config_updates) + + if video_success and audio_success: + logger.info(f"VibeVoice bot configuration updated: {config_updates}") + return True + else: + logger.warning("Partial configuration update failure in VibeVoice bot") + return False + + except Exception as e: + logger.error(f"Error updating VibeVoice bot configuration: {e}") + return False + + +# Global bot instance registry +_vibevoice_bots: Dict[str, VibeVoiceTTSBot] = {} + + +def create_vibevoice_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]: + """ + Create VibeVoice TTS bot tracks. 
+ + Args: + session_name: Name for the session + config: Configuration dictionary with options: + - width: video width (default 640) + - height: video height (default 480) + - fps: frames per second (default 15) + - sample_rate: audio sample rate (default 16000) + - samples_per_frame: audio samples per frame (default 960) + - speaker: TTS speaker name (default '1') + - device: device for TTS ('cpu', 'cuda', 'mps') + - cfg_scale: CFG scale for generation (default 1.3) + - inference_steps: Number of inference steps (default 10) + + Returns: + Dictionary containing 'video' and 'audio' tracks + """ + if config is None: + config = {} + + # Set defaults + default_config = { + 'width': 640, + 'height': 480, + 'fps': 15, + 'sample_rate': 16000, + 'samples_per_frame': 960, + 'speaker': '1', + 'device': 'cpu', + 'cfg_scale': 1.3, + 'inference_steps': 10, + # Explicit background noise defaults - disabled by default + 'background_noise_enabled': False, + 'background_noise_type': 'none', + 'background_noise_volume': 0.0, + } + default_config.update(config) + + # Create bot instance + bot = VibeVoiceTTSBot(session_name, default_config) + _vibevoice_bots[session_name] = bot + + logger.info(f"Created VibeVoice bot tracks for {session_name}") + return bot.get_tracks() + + +def handle_config_update(session_name: str, config_values: Dict[str, Any]) -> bool: + """ + Handle runtime configuration updates for VibeVoice bot. + + Args: + session_name: Name of the session/bot instance + config_values: Dictionary of configuration values to update + + Returns: + bool: True if update was successful, False otherwise + """ + try: + if session_name in _vibevoice_bots: + return _vibevoice_bots[session_name].update_config(config_values) + else: + logger.warning(f"No VibeVoice bot found for session {session_name}") + return False + except Exception as e: + logger.error(f"Error updating VibeVoice bot configuration: {e}") + return False + + +async def handle_chat_message( + chat_message: ChatMessageModel, + send_message_func: Callable[[Union[str, ChatMessageModel]], Awaitable[None]] +) -> Optional[str]: + """ + Handle incoming chat messages and convert them to speech. 
+ + Args: + chat_message: The chat message to process + send_message_func: Function to send chat responses (not used by TTS bot) + """ + try: + # Find the bot instance - we need to get session name from somewhere + # For now, we'll use the first available bot instance + if _vibevoice_bots: + session_name = list(_vibevoice_bots.keys())[0] + _vibevoice_bots[session_name].handle_chat_message(chat_message) + logger.info(f"VibeVoice bot processed chat message from {chat_message.sender_name}: '{chat_message.message}'") + else: + logger.warning("No VibeVoice bot instances available to handle chat message") + except Exception as e: + logger.error(f"Error handling chat message in VibeVoice bot: {e}") + + # TTS bot doesn't send chat responses, so return None + return None + + +# Agent descriptor exported for dynamic discovery by the FastAPI service +AGENT_NAME = "VibeVoice TTS Bot" +AGENT_DESCRIPTION = "Microsoft VibeVoice text-to-speech bot with visual text display" + +def agent_info() -> Dict[str, str]: + """Return agent metadata for discovery.""" + return { + "name": AGENT_NAME, + "description": AGENT_DESCRIPTION, + "has_media": "true", + "configurable": "true", + "chat_enabled": "true" + } + + +def get_config_schema() -> Dict[str, Any]: + """Get the configuration schema for the VibeVoice Bot.""" + return { + "bot_name": AGENT_NAME, + "version": "1.0", + "parameters": [ + { + "name": "width", + "type": "number", + "label": "Video Width", + "description": "Width of the video frame in pixels", + "default_value": 640, + "required": False, + "min_value": 320, + "max_value": 1920, + "step": 1 + }, + { + "name": "height", + "type": "number", + "label": "Video Height", + "description": "Height of the video frame in pixels", + "default_value": 480, + "required": False, + "min_value": 240, + "max_value": 1080, + "step": 1 + }, + { + "name": "fps", + "type": "number", + "label": "Frames Per Second", + "description": "Video frame rate", + "default_value": 15, + "required": False, + "min_value": 1, + "max_value": 60, + "step": 1 + }, + { + "name": "speaker", + "type": "select", + "label": "TTS Speaker", + "description": "Voice to use for text-to-speech", + "default_value": "1", + "required": True, + "options": [ + {"value": "1", "label": "Speaker 1 (en-Alice_woman)"}, + {"value": "2", "label": "Speaker 2 (en-Carter_man)"}, + {"value": "3", "label": "Speaker 3 (en-Frank_man)"}, + {"value": "4", "label": "Speaker 4 (en-Mary_woman_bgm)"} + ] + }, + { + "name": "background_noise_enabled", + "type": "boolean", + "label": "Enable Background Noise", + "description": "Add background noise to ensure continuous audio streaming", + "default_value": False, + "required": False + }, + { + "name": "background_noise_type", + "type": "select", + "label": "Background Noise Type", + "description": "Type of background noise to generate", + # 'none' indicates no noise - matches default disabled behavior + "default_value": "none", + "required": False, + "options": [ + {"value": "white", "label": "White Noise"}, + {"value": "pink", "label": "Pink Noise"}, + {"value": "brown", "label": "Brown Noise"}, + {"value": "none", "label": "None"} + ] + }, + { + "name": "background_noise_volume", + "type": "number", + "label": "Background Noise Volume", + "description": "Volume level of background noise (0.0 to 1.0)", + "default_value": 0.01, + "required": False, + "min_value": 0.0, + "max_value": 1.0, + "step": 0.001 + }, + { + "name": "device", + "type": "select", + "label": "Processing Device", + "description": "Device to use for TTS 
processing", + "default_value": "cpu", + "required": True, + "options": [ + {"value": "cpu", "label": "CPU"}, + {"value": "cuda", "label": "CUDA (GPU)"}, + {"value": "mps", "label": "MPS (Apple Silicon)"} + ] + }, + { + "name": "cfg_scale", + "type": "number", + "label": "CFG Scale", + "description": "Classifier-free guidance scale for controlling generation quality", + "default_value": 1.3, + "required": False, + "min_value": 1.0, + "max_value": 2.0, + "step": 0.05 + }, + { + "name": "inference_steps", + "type": "number", + "label": "Inference Steps", + "description": "Number of denoising steps for audio generation", + "default_value": 10, + "required": False, + "min_value": 5, + "max_value": 50, + "step": 1 + } + ], + "categories": [ + { + "Video Settings": ["width", "height", "fps"] + }, + { + "TTS Settings": ["speaker", "device", "cfg_scale", "inference_steps"] + }, + { + "Background Noise": ["background_noise_enabled", "background_noise_type", "background_noise_volume"] + } + ] + } + + +def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: + """Factory wrapper used by the FastAPI service to instantiate tracks for an agent.""" + return create_vibevoice_bot_tracks(session_name) \ No newline at end of file diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py index 3d69e62..7ae120f 100644 --- a/voicebot/bots/whisper.py +++ b/voicebot/bots/whisper.py @@ -58,13 +58,153 @@ AudioArray = npt.NDArray[np.float32] ModelConfig = Dict[str, Union[str, int, bool]] CalibrationData = List[Dict[str, Any]] -_device = "GPU.1" # Default to Intel Arc B580 GPU - # Global lock to serialize calls into the OpenVINO model.generate/decode # since some backends are not safe for concurrent generate calls. _generate_global_lock = threading.Lock() +def _do_generate_once(model, *args, **kwargs): + """Submit a single generate call to the serialized worker and return result. + + Raises any exception raised by the underlying generate call. + """ + return _submit_generate_to_worker(model.generate, *args, **kwargs) + + +def _safe_generate_with_retries(model, *args, max_retries: int = 20, initial_delay: float = 0.05, **kwargs): + """Call model.generate while handling OpenVINO 'Infer Request is busy' by retrying. + + This helper retries on RuntimeError containing 'Infer Request is busy' with + exponential backoff. It raises the last exception if retries are exhausted. + """ + delay = initial_delay + last_exc = None + for attempt in range(1, max_retries + 1): + try: + # Submit the actual blocking generate to the serialized worker + return _do_generate_once(model, *args, **kwargs) + except RuntimeError as e: + last_exc = e + msg = str(e) + # Match the specific OpenVINO busy error message + if "Infer Request is busy" in msg: + logger.warning( + f"OpenVINO infer busy (attempt {attempt}/{max_retries}), retrying after {delay:.3f}s..." + ) + time.sleep(delay) + delay = min(delay * 2.0, 1.0) + continue + # Not the busy error - re-raise immediately + raise + except Exception: + raise + # Retries exhausted + logger.error(f"OpenVINO generate retries exhausted ({max_retries}) - raising last error: {last_exc}") + raise last_exc + + +# Global serialized generate worker to ensure OpenVINO infer requests are not +# called concurrently across threads. Some OpenVINO backends will error with +# "Infer Request is busy" if multiple infer calls overlap on the same +# compiled model; queueing here serializes calls at the process level. 
+_generate_queue = Queue()
+_generate_worker_started = False
+# Lock guarding worker startup so concurrent first callers spawn exactly one worker.
+_generate_worker_lock = threading.Lock()
+
+
+def _generate_worker() -> None:
+    while True:
+        fn, args, kwargs, ev, out = _generate_queue.get()
+        try:
+            # Perform internal retries if OpenVINO reports the request as busy.
+            delay = 0.02
+            max_inner_retries = 20
+            last_exc = None
+            for attempt in range(1, max_inner_retries + 1):
+                try:
+                    res = fn(*args, **kwargs)
+                    out['result'] = res
+                    out['exc'] = None
+                    break
+                except RuntimeError as e:
+                    last_exc = e
+                    msg = str(e)
+                    if "Infer Request is busy" in msg:
+                        # log at debug to avoid noise but keep visibility
+                        logger.debug(f"Worker: infer busy (attempt {attempt}/{max_inner_retries}), sleeping {delay:.3f}s")
+                        time.sleep(delay)
+                        delay = min(delay * 2.0, 1.0)
+                        continue
+                    # not a busy error - surface immediately
+                    out['result'] = None
+                    out['exc'] = e
+                    break
+                except Exception as e:
+                    out['result'] = None
+                    out['exc'] = e
+                    break
+            else:
+                # exhausted retries
+                out['result'] = None
+                out['exc'] = last_exc
+        finally:
+            try:
+                ev.set()
+            except Exception:
+                pass
+
+
+def _ensure_generate_worker() -> None:
+    global _generate_worker_started
+    if _generate_worker_started:
+        return
+    with _generate_worker_lock:
+        # Re-check under the lock; another thread may have started the worker already.
+        if _generate_worker_started:
+            return
+        t = threading.Thread(target=_generate_worker, daemon=True)
+        t.start()
+        _generate_worker_started = True
+
+
+def _submit_generate_to_worker(fn, *args, **kwargs):
+    """Submit a blocking generate fn to the serialized worker and wait for result."""
+    _ensure_generate_worker()
+    ev = threading.Event()
+    out: Dict[str, Any] = {}
+    _generate_queue.put((fn, args, kwargs, ev, out))
+    ev.wait()
+    if out.get('exc'):
+        raise out['exc']
+    return out.get('result')
+
+
+async def _safe_generate_with_retries_async(model, *args, max_retries: int = 20, initial_delay: float = 0.05, **kwargs):
+    """Async variant of the generate retry helper that uses asyncio.sleep.
+
+    Should be awaited from asynchronous contexts to avoid blocking the event loop.
+    """
+    delay = initial_delay
+    last_exc = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Delegate to the serialized worker in an executor so the event loop
+            # isn't blocked waiting on the worker event.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, lambda: _do_generate_once(model, *args, **kwargs))
+        except RuntimeError as e:
+            last_exc = e
+            msg = str(e)
+            if "Infer Request is busy" in msg:
+                logger.warning(
+                    f"OpenVINO infer busy (async attempt {attempt}/{max_retries}), retrying after {delay:.3f}s..."
+ ) + await asyncio.sleep(delay) + delay = min(delay * 2.0, 1.0) + continue + raise + except Exception: + raise + logger.error(f"OpenVINO async generate retries exhausted ({max_retries}) - raising last error: {last_exc}") + raise last_exc + + def get_available_devices() -> list[dict[str, Any]]: """List available OpenVINO devices with their properties.""" try: @@ -125,9 +265,27 @@ def print_available_devices(device: str | None = None): logger.info(f" Type: {d.get('type')}") +def find_best_device(preferred_type: str = "DISCRETE") -> str: + """Find the best available OpenVINO device, preferring the specified type (e.g., 'DISCRETE', 'INTEGRATED', 'CPU', 'GPU').""" + devices = get_available_devices() + if not devices: + logger.warning("No OpenVINO devices found, defaulting to CPU") + return "CPU" + for d in devices: + device_type = str(d.get("type", "")).upper() + if device_type == preferred_type.upper(): + logger.info(f"Using preferred device: {preferred_type}") + return d.get("name", "CPU") + logger.info("Preferred device not found, using first available device") + return devices[0].get("name", "CPU") + +_device = find_best_device(preferred_type="Type.DISCRETE") + print_available_devices(_device) + + class AudioQueueItem(BaseModel): """Audio data with timestamp for processing queue.""" @@ -536,11 +694,39 @@ class OpenVINOWhisperModel: logger.info("Whisper processor loaded successfully") # Export the model to OpenVINO IR if not already converted - self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore - self.model_id, export=True, device=self.device - ) # type: ignore + try: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device + ) # type: ignore + logger.info("Whisper model exported as OpenVINO IR") + except Exception as export_e: + logger.warning(f"Initial OpenVINO export failed: {export_e}") + # Retry using processor-derived example_inputs if possible + try: + if self.processor is None: + self.processor = WhisperProcessor.from_pretrained(self.model_id, use_fast=True) # type: ignore + dummy_audio = np.random.randn(16000).astype(np.float32) + try: + example_inputs = self.processor(# type: ignore + dummy_audio, sampling_rate=16000, return_tensors="pt" + ).input_features # type: ignore + except Exception as ex_inputs: + logger.warning(f"Failed to generate example_inputs for export retry: {ex_inputs}") + example_inputs = None - logger.info("Whisper model exported as OpenVINO IR") + if example_inputs is not None: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device, example_inputs=example_inputs + ) + else: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device + ) + + logger.info("Whisper model exported as OpenVINO IR (retry with example_inputs)") + except Exception as retry_export_e: + logger.error(f"Export retry failed: {retry_export_e}") + raise # # Try to load quantized model first if it exists # if self.config.enable_quantization and self.quantized_model_path.exists(): @@ -599,6 +785,60 @@ class OpenVINOWhisperModel: except Exception as e: logger.error(f"Model conversion failed: {e}") + # If conversion failed due to example_input / tracing mismatch + # try converting again by providing a correctly-shaped example + # input derived from the Whisper processor. This can resolve + # mismatches between the default example and model signatures. 
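+            # Note: support for passing example_inputs to from_pretrained varies
+            # across optimum-intel versions; the TypeError handler below falls
+            # back to a plain conversion when the keyword is not accepted.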
+ try: + logger.info("Retrying conversion with processor-derived example_inputs...") + if self.processor is None: + # Ensure processor is available + self.processor = WhisperProcessor.from_pretrained(self.model_id, use_fast=True) # type: ignore + + # Create a short dummy audio (1s) to produce input_features + try: + dummy_audio = np.random.randn(16000).astype(np.float32) + example_inputs = self.processor(# type: ignore + dummy_audio, sampling_rate=16000, return_tensors="pt" + ).input_features # type: ignore + except Exception as ex_inputs: + logger.warning(f"Failed to generate example_inputs from processor: {ex_inputs}") + example_inputs = None + + # Attempt conversion again, supplying example_inputs if available + if example_inputs is not None: + ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, + ov_config=self.config.to_ov_config(), + export=True, + compile=False, + example_inputs=example_inputs, + load_in_8bit=False, + ) + else: + ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, + ov_config=self.config.to_ov_config(), + export=True, + compile=False, + load_in_8bit=False, + ) + + if hasattr(ov_model, 'half'): + ov_model.half() # type: ignore + ov_model.save_pretrained(self.model_path) # type: ignore + logger.info("Model converted and saved in FP16 format (retry with example_inputs)") + self.ov_model = ov_model # type: ignore + self._compile_model() + return + except TypeError as te: + # from_pretrained may not accept example_inputs in some versions + logger.warning(f"Conversion retry with example_inputs not supported: {te}") + except Exception as retry_e: + logger.warning(f"Retry conversion with example_inputs failed: {retry_e}") + + # If all conversion attempts fail, propagate to fallback path + logger.warning("Falling back to basic conversion without advanced export options") raise def _convert_model_basic(self) -> None: @@ -816,8 +1056,8 @@ class OpenVINOWhisperModel: ) # type: ignore # Run inference to collect calibration data - _ = self.ov_model.generate( # type: ignore - inputs.input_features, max_new_tokens=10 # type: ignore + _ = _safe_generate_with_retries( # type: ignore + self.ov_model, inputs.input_features, max_new_tokens=10 ) if i % 5 == 0: @@ -957,7 +1197,7 @@ class OpenVINOWhisperModel: # Run warmup iterations for i in range(3): - _ = self.ov_model.generate(dummy_features, max_new_tokens=10)# type: ignore + _ = _safe_generate_with_retries(self.ov_model, dummy_features, max_new_tokens=10) # type: ignore if i == 0: logger.debug("First warmup iteration completed") except Exception as e: @@ -1482,9 +1722,7 @@ class OptimizedAudioProcessor: # Serialize access to the underlying OpenVINO generation call # to avoid concurrency problems with the OpenVINO runtime. 
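+            # _safe_generate_with_retries already routes the call through the
+            # single serialized worker thread; the lock is kept as an extra guard.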
with _generate_global_lock: - gen_out = ov_model.ov_model.generate(# type: ignore - input_features, generation_config=gen_cfg# type: ignore - ) + gen_out = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=gen_cfg) # type: ignore # Try to extract sequences if present if hasattr(gen_out, "sequences"): # type: ignore @@ -1886,9 +2124,8 @@ class OptimizedAudioProcessor: logger.info(f"{self.peer_name}: calling model.generate (async lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (async lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + # Use async-safe retry wrapper to avoid blocking event loop + generation_output = await _safe_generate_with_retries_async(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore finally: self._generate_lock.release() elif hasattr(self, "_generate_lock") and isinstance(self._generate_lock, threading.Lock): @@ -1897,17 +2134,13 @@ class OptimizedAudioProcessor: logger.info(f"{self.peer_name}: calling model.generate (thread lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (thread lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + generation_output = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore else: if is_final: logger.info(f"{self.peer_name}: calling model.generate (no lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (no lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + generation_output = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore if is_final: logger.info(f"{self.peer_name}: model.generate complete (final) (type={type(generation_output)})") @@ -2686,7 +2919,7 @@ def get_config_schema() -> Dict[str, Any]: "default_value": _device, "required": True, "options": [ - {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"}, + # {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"}, {"value": "GPU", "label": "GPU"}, {"value": "CPU", "label": "CPU"} ] @@ -2959,7 +3192,7 @@ def handle_config_update(lobby_id: str, config_values: Dict[str, Any]) -> bool: if "device" in config_values: new_device = config_values["device"] # type: ignore available_devices = [d["name"] for d in get_available_devices()] - if new_device in available_devices or new_device in ["CPU", "GPU", "GPU.1"]: + if new_device in available_devices or new_device in ["CPU", "GPU"]:#, "GPU.1"]: _device = new_device _ov_config.device = new_device config_applied = True diff --git a/voicebot/requirements.txt b/voicebot/requirements.txt index 0fdd756..a3b3d60 100644 --- a/voicebot/requirements.txt +++ b/voicebot/requirements.txt @@ -1,175 +1,183 @@ -about-time -aiofiles -aiohappyeyeballs -aiohttp -aioice -aiortc -aiosignal -alive-progress -annotated-types -anthropic -anyio -attrs -audioread -autograd -av -brotli -certifi -cffi -charset-normalizer -click -cma -contourpy -cryptography -cycler -datasets -decorator -deprecated -dill -distro -dnspython -fastapi -ffmpy -filelock -fonttools -frozenlist -fsspec -google-crc32c -gradio -gradio-client -grapheme -graphemeu -groovy -h11 -hf-xet -httpcore -httpx -huggingface-hub -idna -ifaddr -iniconfig -jinja2 -jiter -jiwer -joblib -jsonschema 
-jsonschema-specifications -kiwisolver -lazy-loader -librosa -llvmlite -markdown-it-py -markupsafe -matplotlib -mdurl -ml-dtypes -more-itertools -mpmath -msgpack -multidict -multiprocess -natsort -networkx -ninja -nncf -numba -numpy -nvidia-cublas-cu12 -nvidia-cuda-cupti-cu12 -nvidia-cuda-nvrtc-cu12 -nvidia-cuda-runtime-cu12 -nvidia-cudnn-cu12 -nvidia-cufft-cu12 -nvidia-cufile-cu12 -nvidia-curand-cu12 -nvidia-cusolver-cu12 -nvidia-cusparse-cu12 -nvidia-cusparselt-cu12 -nvidia-nccl-cu12 -nvidia-nvjitlink-cu12 -nvidia-nvtx-cu12 -onnx -openai +about-time==4.2.1 +absl-py==2.3.1 +accelerate==1.6.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aioice==0.10.1 +aiortc==1.13.0 +aiosignal==1.4.0 +alive-progress==3.2.0 +annotated-types==0.7.0 +anthropic==0.67.0 +anyio==4.10.0 +attrs==25.3.0 +audioread==3.0.1 +autograd==1.8.0 +av==14.4.0 +brotli==1.1.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.2.1 +cma==4.3.0 +contourpy==1.3.3 +cryptography==45.0.7 +cycler==0.12.1 +datasets==4.1.0 +decorator==5.2.1 +deprecated==1.2.18 +diffusers==0.35.1 +dill==0.4.0 +distro==1.9.0 +dnspython==2.8.0 +fastapi==0.116.1 +ffmpy==0.6.1 +filelock==3.19.1 +fonttools==4.59.2 +frozenlist==1.7.0 +fsspec==2025.9.0 +google-crc32c==1.7.1 +gradio==5.45.0 +gradio-client==1.13.0 +grapheme==0.6.0 +graphemeu==0.8.0 +groovy==0.1.2 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httpx==0.28.1 +huggingface-hub==0.34.5 +idna==3.10 +ifaddr==0.2.0 +importlib-metadata==8.7.0 +iniconfig==2.1.0 +jinja2==3.1.6 +jiter==0.11.0 +jiwer==4.0.0 +joblib==1.5.2 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +kiwisolver==1.4.9 +lazy-loader==0.4 +librosa==0.11.0 +llvmlite==0.44.0 +markdown-it-py==4.0.0 +markupsafe==3.0.2 +matplotlib==3.10.6 +mdurl==0.1.2 +ml-collections==1.1.0 +ml-dtypes==0.5.3 +more-itertools==10.8.0 +mpmath==1.3.0 +msgpack==1.1.1 +multidict==6.6.4 +multiprocess==0.70.16 +natsort==8.4.0 +networkx==3.4.2 +ninja==1.13.0 +nncf==2.18.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +onnx==1.19.0 +openai==1.107.2 openai-whisper @ git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2 -opencv-python -openvino -openvino-genai -openvino-telemetry -openvino-tokenizers -optimum +opencv-python==4.12.0.88 +openvino==2025.3.0 +openvino-genai==2025.3.0.0 +openvino-telemetry==2025.2.0 +openvino-tokenizers==2025.3.0.0 +optimum==1.27.0 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@b9c151fec6b414d9ca78be8643d08e267b133bfc -orjson -packaging -pandas -pillow -platformdirs -pluggy -pooch -propcache -protobuf -psutil -pyarrow -pycparser -pydantic -pydantic-core -pydot -pydub -pyee -pygments -pylibsrtp -pymoo -pyopencl -pyopenssl -pyparsing -pytest -pytest-asyncio -python-dateutil -python-ffmpeg -python-multipart -pytools -pytz -pyyaml -rapidfuzz -referencing -regex -requests -resampy -rich -rpds-py -ruff -safehttpx -safetensors -scikit-learn -scipy -semantic-version -setuptools -shellingham -siphash24 -six -sniffio -soundfile -soxr -speechrecognition -starlette -sympy -tabulate -threadpoolctl -tiktoken -tokenizers -tomlkit -torch -torchvision -tqdm 
-transformers -triton -typer -typing-extensions -typing-inspection -tzdata -urllib3 -uvicorn -watchdog -websockets -wrapt -xxhash -yarl +orjson==3.11.3 +packaging==25.0 +pandas==2.3.2 +peft==0.17.1 +pillow==11.3.0 +platformdirs==4.4.0 +pluggy==1.6.0 +pooch==1.8.2 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.0.0 +pyarrow==21.0.0 +pycparser==2.23 +pydantic==2.11.9 +pydantic-core==2.33.2 +pydot==3.0.4 +pydub==0.25.1 +pyee==13.0.0 +pygments==2.19.2 +pylibsrtp==0.12.0 +pymoo==0.6.1.5 +pyopencl==2025.2.6 +pyopenssl==25.2.0 +pyparsing==3.2.4 +pytest==8.4.2 +pytest-asyncio==1.2.0 +python-dateutil==2.9.0.post0 +python-ffmpeg==2.0.12 +python-multipart==0.0.20 +pytools==2025.2.4 +pytz==2025.2 +pyyaml==6.0.2 +rapidfuzz==3.14.1 +referencing==0.36.2 +regex==2025.9.1 +requests==2.32.5 +resampy==0.4.3 +rich==14.1.0 +rpds-py==0.27.1 +ruff==0.13.0 +safehttpx==0.1.6 +safetensors==0.6.2 +scikit-learn==1.7.2 +scipy==1.16.2 +semantic-version==2.10.0 +setuptools==80.9.0 +shellingham==1.5.4 +siphash24==1.8 +six==1.17.0 +sniffio==1.3.1 +soundfile==0.13.1 +soxr==1.0.0 +speechrecognition==3.14.3 +starlette==0.47.3 +sympy==1.14.0 +tabulate==0.9.0 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +tokenizers==0.21.4 +tomlkit==0.13.3 +torch==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.53.3 +triton==3.4.0 +typer==0.17.4 +typing-extensions==4.15.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 +uvicorn==0.35.0 +-e file:///voicebot/VibeVoice +watchdog==6.0.0 +websockets==15.0.1 +wrapt==1.17.3 +xxhash==3.5.0 +yarl==1.20.1 +zipp==3.23.0