From 6620c0ac74f5e40b1acd259b42a4adf51b5b27ee Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Wed, 17 Sep 2025 14:06:18 -0700 Subject: [PATCH] Before claude rewrite --- .dockerignore | 1 + .github/copilot-instructions.md | 2 +- client/src/App.css | 225 ----- client/src/App.tsx | 6 +- client/src/LobbyChat.css | 6 +- client/src/index.css | 23 +- voicebot/bots/vibevoice.py | 1574 +++++++++++++++++++++++++++++++ voicebot/bots/whisper.py | 279 +++++- voicebot/requirements.txt | 354 +++---- 9 files changed, 2021 insertions(+), 449 deletions(-) create mode 100644 voicebot/bots/vibevoice.py diff --git a/.dockerignore b/.dockerignore index b045329..bc548ad 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,3 +12,4 @@ **/*.key **/package-lock.json **/*.pyc +**/VibeVoice diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 3be8dbb..94fe719 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -15,7 +15,7 @@ - Always run tests inside the appropriate Docker containers using `docker compose exec` - Use `uv run` for Python commands in voicebot and server containers - Tests should be placed in the `tests/` directory (bind mounted to `/tests` in containers) -- Use proper PYTHONPATH when running Python code: `PYTHONPATH=/shared:/voicebot` for voicebot, `PYTHONPATH=/shared:/server` for server +- Use proper PYTHONPATH when running Python code: `PYTHONPATH=/:/voicebot` for voicebot, `PYTHONPATH=/:/server` for server - Check container logs with `docker compose logs --since 10m SERVICE_NAME` for debugging ### Voicebot Testing (Python with uv) diff --git a/client/src/App.css b/client/src/App.css index de25b97..628975a 100755 --- a/client/src/App.css +++ b/client/src/App.css @@ -2,228 +2,3 @@ body { font-family: 'Droid Sans', 'Arial Narrow', Arial, sans-serif; overflow: hidden; } - -#root { - width: 100vw; -/* height: 100vh; breaks on mobile -- not needed */ -} - -.Table { - display: flex; - position: absolute; - top: 0; - left: 0; - width: 100%; - bottom: 0; - flex-direction: row; - /* background-image: url("./assets/tabletop.png"); */ -} - -.Table .Dialogs { - z-index: 10000; - display: flex; - justify-content: space-around; - align-items: center; - position: absolute; - top: 0; - left: 0; - bottom: 0; - right: 0; -} - -.Table .Dialogs .Dialog { - display: flex; - position: absolute; - flex-shrink: 1; - flex-direction: column; - padding: 0.25rem; - left: 0; - right: 0; - top: 0; - bottom: 0; - justify-content: space-around; - align-items: center; - z-index: 60000; -} - -.Table .Dialogs .Dialog > div { - display: flex; - padding: 1rem; - flex-direction: column; -} - -.Table .Dialogs .Dialog > div > div:first-child { - padding: 1rem; -} - -.Table .Dialogs .TurnNoticeDialog { - background-color: #7a680060; -} - -.Table .Dialogs .ErrorDialog { - background-color: #40000060; -} - -.Table .Dialogs .WarningDialog { - background-color: #00000060; -} - -.Table .Game { - position: relative; - display: flex; - flex-direction: column; - flex-grow: 1; -} - -.Table .Board { - display: flex; - position: relative; - flex-grow: 1; - z-index: 500; -} - -.Table .PlayersStatus { - z-index: 500; /* Under Hand */ -} - -.Table .PlayersStatus.ActivePlayer { - z-index: 1500; /* On top of Hand */ -} - -.Table .Hand { - display: flex; - position: relative; - height: 11rem; - z-index: 10000; -} - -.Table .Sidebar { - display: flex; - flex-direction: column; - justify-content: space-between; - width: 25rem; - max-width: 25rem; - overflow: hidden; - z-index: 5000; -} - 
-.Table .Sidebar .Chat { - display: flex; - position: relative; - flex-grow: 1; -} - -.Table .Trade { - display: flex; - position: relative; - z-index: 25000; - align-self: center; -} - -.Table .Dialogs { - position: absolute; - display: flex; - top: 0; - bottom: 0; - right: 0; - left: 0; - justify-content: space-around; - align-items: center; - z-index: 20000; - pointer-events: none; -} - -.Table .Dialogs > * { - pointer-events: all; -} - -.Table .ViewCard { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table .Winner { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - - -.Table .HouseRules { - display: flex; - position: absolute; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table .ChooseCard { - display: flex; - position: relative; - top: 0; - left: 0; - right: 0; - bottom: 0; -} - -.Table button { - margin: 0.25rem; - background-color: white; - border: 1px solid black; /* why !important */ -} - -.Table .MuiButton-text { - padding: 0.25rem 0.55rem; -} - -.Table button:disabled { - opacity: 0.5; - border: 1px solid #ccc; /* why !important */ -} - -.Table .ActivitiesBox { - display: flex; - flex-direction: column; - position: absolute; - left: 1em; - top: 1em; -} - -.Table .DiceRoll { - display: flex; - flex-direction: column; - position: relative; - /* - left: 1rem; - top: 5rem;*/ - flex-wrap: wrap; - justify-content: left; - align-items: left; - z-index: 1000; -} - -.Table .DiceRoll div:not(:last-child) { - border: 1px solid black; - background-color: white; - padding: 0.25rem 0.5rem; - border-radius: 0.25rem; -} -.Table .DiceRoll div:last-child { - display: flex; - flex-direction: row; -} - -.Table .DiceRoll .Dice { - margin: 0.25rem; - width: 2.75rem; - height: 2.75rem; - border-radius: 0.5rem; -} \ No newline at end of file diff --git a/client/src/App.tsx b/client/src/App.tsx index 56dfdac..533ab86 100644 --- a/client/src/App.tsx +++ b/client/src/App.tsx @@ -191,8 +191,8 @@ const LobbyView: React.FC = (props: LobbyProps) => { sx={{ p: { xs: 1, sm: 2 }, m: { xs: 0, sm: 2 }, - width: { xs: "100%", sm: "fit-content" }, - maxWidth: { xs: "100%", sm: 600 }, + // width: { xs: "100%", sm: "fit-content" }, + // maxWidth: { xs: "100%", sm: 600 }, }} > {readyState !== ReadyState.OPEN || !session ? ( @@ -299,7 +299,7 @@ const App = () => { np.ndarray(float32) containing recent audio samples (mono) + buffer: Dict[str, np.ndarray] = {} + + # session_name -> dict with status flags (is_speech, energy, is_processing, is_playing, etc.) + speech_status: Dict[str, Dict[str, Any]] = {} + + # session_name -> sample_rate used for that buffer + sample_rates: Dict[str, int] = {} + + +# Proxy wrapper for AudioStreamer to log put() calls and basic stats without +# modifying upstream VibeVoice internals. We'll wrap any created AudioStreamer +# with this to capture whether model.generate() actually calls put(). 
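A minimal sketch of how the wrapper defined just below can be exercised without loading the model at all, since it is plain delegation plus counters; _FakeStreamer here is a hypothetical stand-in for VibeVoice's AudioStreamer, used purely for illustration:

    import numpy as np

    class _FakeStreamer:
        """Hypothetical stand-in for vibevoice's AudioStreamer (illustration only)."""
        def __init__(self):
            self.chunks = []
        def put(self, audio_chunk, *args, **kwargs):
            self.chunks.append(np.asarray(audio_chunk, dtype=np.float32))
        def get_stream(self, idx=0):
            return iter(self.chunks)
        def end(self):
            pass

    proxy = ProxyAudioStreamer(_FakeStreamer(), session_name="demo")
    proxy.put(np.zeros(2400, dtype=np.float32))  # one fake 0.1 s chunk at 24 kHz
    assert proxy.put_calls == 1 and proxy.total_samples == 2400
    # After a real model.generate(..., streamer=proxy) run, put_calls == 0 is the
    # signal that the model never streamed audio, which is what this wrapper exposes.
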
+class ProxyAudioStreamer: + def __init__(self, real_streamer, session_name: Optional[str] = None): + self._real = real_streamer + self.session_name = session_name or "unknown" + self.put_calls = 0 + self.total_samples = 0 + + def put(self, audio_chunk, *args, **kwargs): + # Try to measure number of samples in the chunk for diagnostics + try: + if torch.is_tensor(audio_chunk): + length = int(audio_chunk.numel()) + else: + arr = np.array(audio_chunk) + length = int(arr.size) + except Exception: + length = -1 + + try: + # Inspect possible sample_indices positional argument for diagnostics + si_info = None + if len(args) >= 1: + try: + si = args[0] + if torch.is_tensor(si): + si_info = f"tensor(shape={tuple(si.shape)}, min={int(torch.min(si).item())}, max={int(torch.max(si).item())}, unique={int(len(torch.unique(si)))} )" + else: + arrsi = np.array(si) + si_info = f"array(shape={arrsi.shape}, min={int(arrsi.min()) if arrsi.size>0 else -1}, max={int(arrsi.max()) if arrsi.size>0 else -1}, unique={int(len(np.unique(arrsi))) if arrsi.size>0 else 0})" + except Exception: + si_info = str(type(args[0])) + + logger.info(f"VibeVoice audio: ProxyAudioStreamer.put called for session {self.session_name} - samples={length} sample_indices={si_info}") + except Exception: + pass + + self.put_calls += 1 + if length > 0: + self.total_samples += length + + return getattr(self._real, 'put')(audio_chunk, *args, **kwargs) + + def get_stream(self, *args, **kwargs): + return getattr(self._real, 'get_stream')(*args, **kwargs) + + def end(self, *args, **kwargs): + return getattr(self._real, 'end')(*args, **kwargs) + + def __getattr__(self, name): + return getattr(self._real, name) + + +# Import VibeVoice components +try: + from vibevoice import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor + from vibevoice.modular.streamer import AudioStreamer +except Exception as e: + logger.warning("VibeVoice not available. Install with: git clone https://github.com/microsoft/VibeVoice.git && cd VibeVoice && pip install -e .") + raise e + + + +class MediaClock: + """Shared clock for media synchronization.""" + + def __init__(self): + self.t0 = time.perf_counter() + + def now(self) -> float: + return time.perf_counter() - self.t0 + + +class VibeVoiceTTS: + """Minimal VibeVoice Text-to-Speech wrapper.""" + + def __init__(self, device: str = "cpu", inference_steps: int = 10, config: Optional[Dict[str, Any]] = None): + self.device = device + self.inference_steps = inference_steps + self.config = config or {} + self.model = None + self.processor = None + self.sample_rate = 24000 # VibeVoice uses 24kHz + self.is_initialized = False + self.voice_presets = {} + self.available_voices = {} + + try: + self._initialize_model() + self._setup_voice_presets() + except Exception as e: + logger.error(f"Failed to initialize VibeVoice: {e}") + + def _initialize_model(self): + """Initialize the VibeVoice model with robust device handling.""" + try: + logger.info("Loading VibeVoice model...") + + # Normalize potential 'mpx' + if self.device.lower() == "mpx": + logger.info("Note: device 'mpx' detected, treating it as 'mps'.") + self.device = "mps" + if self.device == "mps" and not torch.backends.mps.is_available(): + logger.warning("Warning: MPS not available. 
Falling back to CPU.") + self.device = "cpu" + + logger.info(f"Using device: {self.device}") + + # Load processor + self.processor = VibeVoiceProcessor.from_pretrained("vibevoice/VibeVoice-1.5B") + + # Decide dtype & attention + if self.device == "mps": + load_dtype = torch.float32 + attn_impl_primary = "sdpa" + elif self.device == "cuda": + load_dtype = torch.bfloat16 + attn_impl_primary = "flash_attention_2" + else: + load_dtype = torch.float32 + attn_impl_primary = "sdpa" + + logger.info(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}") + + # Load model + try: + if self.device == "mps": + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + attn_implementation=attn_impl_primary, + device_map=None, + ) + self.model.to("mps") + elif self.device == "cuda": + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map="cuda", + attn_implementation=attn_impl_primary, + ) + else: + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map="cpu", + attn_implementation=attn_impl_primary, + ) + except Exception as e: + if attn_impl_primary == 'flash_attention_2': + logger.warning(f"Error with flash_attention_2: {e}") + logger.info("Falling back to attention implementation: sdpa") + fallback_attn = "sdpa" + self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( + "vibevoice/VibeVoice-1.5B", + torch_dtype=load_dtype, + device_map=(self.device if self.device in ("cuda", "cpu") else None), + attn_implementation=fallback_attn, + ) + if self.device == "mps": + self.model.to("mps") + else: + raise e + + self.model.eval() + + # Use SDE solver by default + self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config( + self.model.model.noise_scheduler.config, + algorithm_type='sde-dpmsolver++', + beta_schedule='squaredcos_cap_v2' + ) + self.model.set_ddpm_inference_steps(num_steps=self.inference_steps) + + if hasattr(self.model.model, 'language_model'): + logger.info(f"Language model attention: {self.model.model.language_model.config._attn_implementation}") + + self.is_initialized = True + logger.info("VibeVoice model loaded successfully!") + + except Exception as e: + logger.error(f"Error loading VibeVoice model: {e}") + raise + + def _setup_voice_presets(self): + """Setup voice presets by scanning the voices directory.""" + # Look for voices directory in multiple possible locations + possible_voice_dirs = [ + os.path.join(os.path.dirname(__file__), "voices"), # /voicebot/bots/voices/ + os.path.join(os.path.dirname(__file__), "..", "VibeVoice", "demo", "voices"), # /voicebot/VibeVoice/demo/voices/ + "/voicebot/VibeVoice/demo/voices", # Absolute path + ] + + voices_dir = None + for possible_dir in possible_voice_dirs: + if os.path.exists(possible_dir): + voices_dir = possible_dir + break + + # Check if voices directory exists + if not voices_dir: + logger.warning(f"Warning: Voices directory not found in any of: {possible_voice_dirs}") + self.voice_presets = {} + self.available_voices = {} + self.speaker_mapping = {} + return + + # Scan for all WAV files in the voices directory + self.voice_presets = {} + + # Get all supported audio files + audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac') + audio_files = [f for f in os.listdir(voices_dir) + if 
f.lower().endswith(audio_extensions) and os.path.isfile(os.path.join(voices_dir, f))] + + # Create dictionary with filename (without extension) as key + for audio_file in audio_files: + # Remove extension to get the name + name = os.path.splitext(audio_file)[0] + # Create full path + full_path = os.path.join(voices_dir, audio_file) + self.voice_presets[name] = full_path + + # Sort the voice presets alphabetically by name for better UI + self.voice_presets = dict(sorted(self.voice_presets.items())) + + # Filter out voices that don't exist (this is now redundant but kept for safety) + self.available_voices = { + name: path for name, path in self.voice_presets.items() + if os.path.exists(path) + } + + # Map speaker numbers (1, 2, 3, 4) to available voice files + self.speaker_mapping = {} + available_voice_names = list(self.available_voices.keys()) + for i in range(1, 5): # Support speakers 1-4 + if i <= len(available_voice_names): + voice_name = available_voice_names[i-1] # 0-indexed + self.speaker_mapping[str(i)] = voice_name + logger.info(f"Mapped Speaker {i} to voice '{voice_name}'") + else: + logger.warning(f"No voice file available for Speaker {i}") + + if not self.available_voices: + logger.warning("No voice presets found. Please add audio files to the voices directory.") + else: + logger.info(f"Found {len(self.available_voices)} voice files in {voices_dir}") + logger.info(f"Available voices: {', '.join(self.available_voices.keys())}") + logger.info(f"Speaker mapping: {self.speaker_mapping}") + + def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray: + """Read and preprocess audio file.""" + try: + import soundfile as sf + wav, sr = sf.read(audio_path) + if len(wav.shape) > 1: + wav = np.mean(wav, axis=1) + if sr != target_sr: + wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr) + return wav + except Exception as e: + logger.error(f"Error reading audio {audio_path}: {e}") + return np.array([]) + + def generate_speech(self, text: str, speaker: str = "1", cfg_scale: float = 1.3) -> Optional[np.ndarray]: + """Generate speech using the AudioStreamer and return a single concatenated numpy array. + + This removes the old synchronous model.generate path and uses the streamer-based + generation even for blocking calls. Returns None if generation isn't possible. 
+ """ + # Must have model initialized and streamer available + if not self.is_initialized: + logger.error("VibeVoice TTS: Model not initialized - cannot generate speech synchronously") + return None + + try: + # Prepare formatted text and voice samples (same as demo) + formatted_text = f"Speaker {speaker}: {text}" + voice_samples = [] + if speaker in self.speaker_mapping: + voice_name = self.speaker_mapping[speaker] + if voice_name in self.available_voices: + audio_path = self.available_voices[voice_name] + audio_data = self.read_audio(audio_path) + if len(audio_data) > 0: + voice_samples.append(audio_data) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + + inputs = self.processor( # type: ignore + text=[formatted_text], + voice_samples=[voice_samples], + padding=True, + return_tensors="pt" + ) + + # Move tensors to device + target_device = self.device if self.device in ("cuda", "mps") else "cpu" + for k, v in inputs.items(): + if torch.is_tensor(v): + inputs[k] = v.to(target_device) + + # Create streamer and run generation + real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) + audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) + + with torch.no_grad(): + try: + self.model.generate( # type: ignore + **inputs, + max_new_tokens=None, + cfg_scale=cfg_scale, + tokenizer=self.processor.tokenizer, # type: ignore + generation_config={'do_sample': False}, + verbose=False, + streamer=audio_streamer, + ) + finally: + # ensure streamer end if model.generate returns + try: + audio_streamer.end() + except Exception: + pass + + # Collect streamed chunks + collected = [] + for audio_chunk in audio_streamer.get_stream(0): + try: + if torch.is_tensor(audio_chunk): + if audio_chunk.dtype == torch.bfloat16: + audio_chunk = audio_chunk.float() + audio_np = audio_chunk.cpu().numpy().astype(np.float32) + else: + audio_np = np.array(audio_chunk, dtype=np.float32) + + if audio_np.ndim > 1: + audio_np = audio_np.squeeze() + + collected.append(audio_np) + except Exception as e: + logger.error(f"VibeVoice TTS: Error collecting chunk: {e}") + + if not collected: + logger.error("VibeVoice TTS: No audio chunks received from streamer") + return None + + audio = np.concatenate(collected) + + # Mix with background noise if enabled + noise_type = self.config.get('background_noise_type', 'none') + noise_volume = self.config.get('background_noise_volume', 0.0) + audio = self.mix_audio_with_background_noise(audio, noise_type, noise_volume) + + # Resample to 16kHz for compatibility with existing audio pipeline + audio_16k = librosa.resample(audio, orig_sr=24000, target_sr=16000) + return audio_16k.astype(np.float32) + + except Exception as e: + logger.error(f"VibeVoice TTS: Error generating speech via streamer: {e}") + return None + + def generate_background_noise(self, duration_seconds: float, noise_type: str = "white", volume: float = 0.01, sample_rate: Optional[int] = None) -> np.ndarray: + """Generate background noise of specified type and duration.""" + if sample_rate is None: + sample_rate = self.sample_rate + + if noise_type == "none": + return np.zeros(int(duration_seconds * sample_rate), dtype=np.float32) + + num_samples = int(duration_seconds * sample_rate) + + if noise_type == "white": + # White noise - equal power across all frequencies + noise = np.random.normal(0, 1, num_samples).astype(np.float32) + elif noise_type == "pink": + # Pink noise - 1/f frequency response (approximated) + white = 
np.random.normal(0, 1, num_samples).astype(np.float32) + # Simple pink noise approximation using IIR filter + b = [0.049922035, -0.095993537, 0.050612699, -0.004408786] + a = [1, -2.494956002, 2.017265875, -0.522189400] + noise = np.zeros_like(white) + for i in range(len(b), len(white)): + noise[i] = b[0] * white[i] + b[1] * white[i-1] + b[2] * white[i-2] + b[3] * white[i-3] - a[1] * noise[i-1] - a[2] * noise[i-2] - a[3] * noise[i-3] + elif noise_type == "brown": + # Brown noise - 1/f² frequency response (integrated white noise) + white = np.random.normal(0, 1, num_samples).astype(np.float32) + noise = np.cumsum(white) + # Normalize to prevent drift + noise = (noise - np.mean(noise)) / np.std(noise) + else: + # Default to white noise + noise = np.random.normal(0, 1, num_samples).astype(np.float32) + + # Apply volume + noise *= volume + return noise + + def mix_audio_with_background_noise(self, audio: np.ndarray, noise_type: str = "white", volume: float = 0.01) -> np.ndarray: + """Mix generated audio with background noise.""" + # Default to disabled when not present in config to avoid unexpected noise + if not self.config.get('background_noise_enabled', False): + return audio + + # Generate background noise for the duration of the audio using the TTS sample rate + duration_seconds = len(audio) / self.sample_rate + background_noise = self.generate_background_noise(duration_seconds, noise_type, volume, self.sample_rate) + + # Mix audio with background noise + mixed_audio = audio + background_noise + + # Normalize to prevent clipping + max_val = np.max(np.abs(mixed_audio)) + if max_val > 1.0: + mixed_audio /= max_val + + return mixed_audio + + +class VibeVoiceVideoTrack(MediaStreamTrack): + """Video track that displays text being spoken.""" + + kind = "video" + + def __init__(self, clock, config: Dict[str, Any], session_name: Optional[str] = None): + super().__init__() + self.clock = clock + self.config = config + # Keep session_name for looking up waveform buffers and status + self.session_name = session_name or config.get('session_name') or f"VibeVoice:{int(time.time())}" + self.width = config.get('width', 640) + self.height = config.get('height', 480) + self.fps = config.get('fps', 15) + + # Text display state + self.current_text = "" + self.text_queue = queue.Queue() + self.display_start_time = 0 + self.display_duration = 3.0 # seconds to display each text + self.frame_count = 0 + + # Font settings + self.font = cv2.FONT_HERSHEY_SIMPLEX + self.font_scale = min(self.width, self.height) / 800 + self.font_thickness = max(1, int(self.font_scale * 2)) + + def update_text(self, text: str): + """Update the text to display.""" + self.text_queue.put(text) + logger.info(f"VibeVoice video: Queued text '{text}'") + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update video configuration.""" + try: + self.config.update(config_updates) + if 'width' in config_updates: + self.width = config_updates['width'] + if 'height' in config_updates: + self.height = config_updates['height'] + if 'fps' in config_updates: + self.fps = config_updates['fps'] + return True + except Exception as e: + logger.error(f"Error updating video config: {e}") + return False + + async def next_timestamp(self) -> Tuple[int, float]: + """Get next timestamp for video frame.""" + pts = int(self.frame_count * (90000 / self.fps)) + time_base = 1 / 90000 + return pts, time_base + + async def recv(self) -> VideoFrame: + """Generate video frame with current text.""" + # Update current text if needed + current_time = 
time.time() + if (not self.current_text or + current_time - self.display_start_time > self.display_duration): + try: + self.current_text = self.text_queue.get_nowait() + self.display_start_time = current_time + logger.info(f"VibeVoice video: Displaying '{self.current_text}'") + except queue.Empty: + self.current_text = "" + # Create frame + frame = np.zeros((self.height, self.width, 3), dtype=np.uint8) + + if self.current_text: + # Add background + cv2.rectangle(frame, (0, 0), (self.width, self.height), (0, 0, 0), -1) + + # Split text into lines if too long + words = self.current_text.split() + lines = [] + current_line = "" + max_chars_per_line = int(self.width / (self.font_scale * 20)) + + for word in words: + if len(current_line + " " + word) <= max_chars_per_line: + current_line += " " + word if current_line else word + else: + if current_line: + lines.append(current_line) + current_line = word + if current_line: + lines.append(current_line) + + # Draw text lines + line_height = int(self.font_scale * 40) + total_text_height = len(lines) * line_height + start_y = (self.height - total_text_height) // 2 + line_height + + for i, line in enumerate(lines): + text_size = cv2.getTextSize(line, self.font, self.font_scale, self.font_thickness)[0] + text_x = (self.width - text_size[0]) // 2 + text_y = start_y + i * line_height + + # Add text shadow + cv2.putText(frame, line, (text_x + 2, text_y + 2), + self.font, self.font_scale, (0, 0, 0), self.font_thickness + 1) + # Add main text + cv2.putText(frame, line, (text_x, text_y), + self.font, self.font_scale, (255, 255, 255), self.font_thickness) + else: + # Default background when no text + cv2.putText(frame, "VibeVoice TTS", (50, self.height // 2), + self.font, self.font_scale * 2, (255, 255, 255), self.font_thickness) + + # Draw waveform and status overlays from shared WaveformVideoTrack buffers + try: + pname = self.session_name + buf = WaveformVideoTrack.buffer.get(pname, None) + status = WaveformVideoTrack.speech_status.get(pname, {}) + + # Draw small status box in top-left + status_text = "Idle" + if status.get('is_processing'): + status_text = "Processing..." 
+ elif status.get('is_speech'): + status_text = "Speaking" + elif buf is not None and len(buf) > 0: + # buffered seconds approx + sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) + buffered_sec = len(buf) / float(sr) if sr > 0 else 0.0 + status_text = f"Buffered: {buffered_sec:.1f}s" + + box_w = int(self.width * 0.28) + box_h = int(self.height * 0.12) + cv2.rectangle(frame, (10, 10), (10 + box_w, 10 + box_h), (50, 50, 50), -1) + cv2.putText(frame, status_text, (20, 10 + int(box_h/2)), self.font, self.font_scale, (200, 200, 200), self.font_thickness) + + # Draw small energy meter + energy = status.get('energy', 0.0) + meter_h = int(box_h * 0.4) + meter_w = int(box_w * 0.6) + mx = 20 + my = 10 + box_h - 5 + filled = int(min(1.0, energy * 50.0) * meter_w) + cv2.rectangle(frame, (mx, my - meter_h), (mx + meter_w, my), (80, 80, 80), -1) + cv2.rectangle(frame, (mx, my - meter_h), (mx + filled, my), (0, 200, 0), -1) + + # Draw waveform at bottom area + if buf is not None and buf.size > 4: + sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000)) + # Use last N samples corresponding to width pixels + samples_to_show = min(buf.size, max(1, int(sr * 5))) # show up to last 5s + slice_buf = buf[-samples_to_show:] + + # Downsample to width points + idx = (np.linspace(0, samples_to_show - 1, num=self.width)).astype(np.int32) + waveform = slice_buf[idx] + # Normalize waveform to -1..1 + maxv = np.max(np.abs(waveform)) if waveform.size > 0 else 1.0 + if maxv <= 0: + maxv = 1.0 + waveform = waveform / maxv + + # Map to pixel coordinates in bottom strip + wf_h = int(self.height * 0.22) + wf_y0 = self.height - wf_h - 10 + pts = [] + for i, v in enumerate(waveform): + px = int(i * (self.width / len(waveform))) + py = int(wf_y0 + (wf_h / 2) * (1 - v)) + pts.append((px, py)) + + if len(pts) >= 2: + cv2.polylines(frame, [np.array(pts, dtype=np.int32)], False, (100, 200, 255), 1) + # Fill under curve for nicer look + fill_pts = pts + [(self.width - 1, wf_y0 + wf_h), (0, wf_y0 + wf_h)] + cv2.fillPoly(frame, [np.array(fill_pts, dtype=np.int32)], (30, 60, 80)) + except Exception: + # Non-critical rendering failure shouldn't break video + pass + + self.frame_count += 1 + return VideoFrame.from_ndarray(frame, format="bgr24") + + +class VibeVoiceAudioTrack(MediaStreamTrack): + """Audio track that plays TTS speech.""" + + kind = "audio" + + def __init__(self, clock, config: Dict[str, Any], tts_engine: VibeVoiceTTS, session_name: Optional[str] = None): + super().__init__() + self.clock = clock + self.config = config + self.tts = tts_engine + self.sample_rate = config.get('sample_rate', 16000) + self.samples_per_frame = config.get('samples_per_frame', 960) # 60ms at 16kHz + + # Audio playback state + self.audio_queue = queue.Queue() + self.current_audio = None + self.audio_position = 0 + self.is_speaking = False + self.speaker = config.get('speaker', 'Alice') + + # Audio buffer for mixing multiple TTS segments + self.audio_buffer = np.array([], dtype=np.float32) + self.buffer_lock = threading.Lock() + + # Optional looping and debug options + self.loop = config.get('loop', True) + self.debug_save_wav = config.get('debug_save_wav', True) + # Keep the last fully-generated audio to enable looping + self.last_generated_audio = np.array([], dtype=np.float32) + # Protect last_generated_audio updates + self._last_gen_lock = threading.Lock() + + # Track total samples generated for proper PTS calculation + self._samples_generated = 0 + # Optional session name used 
to publish waveform data for visualization + self.session_name = session_name or f"VibeVoice:{int(time.time())}" + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update audio configuration.""" + try: + self.config.update(config_updates) + if 'sample_rate' in config_updates: + self.sample_rate = config_updates['sample_rate'] + if 'samples_per_frame' in config_updates: + self.samples_per_frame = config_updates['samples_per_frame'] + if 'speaker' in config_updates: + self.speaker = config_updates['speaker'] + if 'loop' in config_updates: + self.loop = bool(config_updates['loop']) + logger.info(f"🔁 Looping {'enabled' if self.loop else 'disabled'} for session {self.session_name}") + if 'debug_save_wav' in config_updates: + self.debug_save_wav = bool(config_updates['debug_save_wav']) + logger.info(f"🐞 Debug save wav {'enabled' if self.debug_save_wav else 'disabled'} for session {self.session_name}") + + # Log background noise configuration updates + background_noise_updated = False + if 'background_noise_enabled' in config_updates: + logger.info(f"🎵 Background noise enabled: {config_updates['background_noise_enabled']}") + background_noise_updated = True + if 'background_noise_type' in config_updates: + logger.info(f"🎵 Background noise type: {config_updates['background_noise_type']}") + background_noise_updated = True + if 'background_noise_volume' in config_updates: + logger.info(f"🎵 Background noise volume: {config_updates['background_noise_volume']}") + background_noise_updated = True + + if background_noise_updated: + logger.info("🎵 Background noise configuration updated - changes will take effect on next audio frame") + + return True + except Exception as e: + logger.error(f"Error updating audio config: {e}") + return False + + def speak_text(self, text: str, cfg_scale: Optional[float] = None): + """Queue text for speech synthesis.""" + if cfg_scale is None: + cfg_scale = 1.3 # Default value + + logger.info(f"VibeVoice audio: Starting background TTS generation for '{text}' with cfg_scale={cfg_scale}") + + # Start TTS generation in a background thread + import threading + thread = threading.Thread( + target=self._generate_tts_background, + args=(text, self.speaker, cfg_scale), + daemon=True + ) + thread.start() + + def _generate_tts_background(self, text: str, speaker: str, cfg_scale: float): + """Generate TTS in background thread and add to audio buffer.""" + try: + logger.info(f"VibeVoice audio: Background TTS generation started for '{text}'") + + # Log some diagnostic info about the TTS engine state + try: + logger.info(f"VibeVoice audio: TTS engine initialized={getattr(self.tts, 'is_initialized', False)}, device={getattr(self.tts, 'device', None)}, tts_sample_rate={getattr(self.tts, 'sample_rate', None)}") + # available_voices and speaker_mapping may be large; log summaries + try: + avv = getattr(self.tts, 'available_voices', {}) + smap = getattr(self.tts, 'speaker_mapping', {}) + logger.info(f"VibeVoice audio: available_voices={list(avv.keys())[:5]} (count={len(avv)}), speaker_mapping_count={len(smap)}") + except Exception: + pass + except Exception: + pass + + # Mark processing state for video overlay + try: + WaveformVideoTrack.speech_status[self.session_name] = WaveformVideoTrack.speech_status.get(self.session_name, {}) + WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = True + except Exception: + pass + + # Require model and streamer to be available for streaming generation + if not self.tts.is_initialized: + logger.error("VibeVoice 
audio: Model or AudioStreamer not available - background generation disabled") + return + + # Prepare formatted text and inputs (same expectations as generate_speech) + formatted_text = f"Speaker {speaker}: {text}" + voice_samples = [] + if speaker in self.tts.speaker_mapping: + voice_name = self.tts.speaker_mapping[speaker] + if voice_name in self.tts.available_voices: + audio_path = self.tts.available_voices[voice_name] + audio_data = self.tts.read_audio(audio_path) + if len(audio_data) > 0: + voice_samples.append(audio_data) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + else: + voice_samples.append([]) + + inputs = self.tts.processor( # type: ignore + text=[formatted_text], + voice_samples=[voice_samples], + padding=True, + return_tensors="pt" + ) + + # Move tensors to device + target_device = self.tts.device if self.tts.device in ("cuda", "mps") else "cpu" + for k, v in inputs.items(): + if torch.is_tensor(v): + inputs[k] = v.to(target_device) + + # Log a summary of inputs for diagnostic purposes + try: + inp_summary = {} + for k, v in inputs.items(): + if torch.is_tensor(v): + inp_summary[k] = f"tensor(shape={tuple(v.shape)}, dtype={v.dtype})" + else: + try: + inp_summary[k] = f"{type(v).__name__}(len={len(v)})" + except Exception: + inp_summary[k] = type(v).__name__ + logger.info(f"VibeVoice audio: Input summary for generation: {inp_summary}") + except Exception: + pass + + # Create audio streamer and start model.generate in a separate thread + real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None) + audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name) + + def _run_generate(): + try: + logger.info(f"VibeVoice audio: model.generate starting for session {self.session_name}") + with torch.no_grad(): + self.tts.model.generate( # type: ignore + **inputs, + max_new_tokens=None, + cfg_scale=cfg_scale, + tokenizer=self.tts.processor.tokenizer, # type: ignore + generation_config={'do_sample': False}, + verbose=False, + streamer=audio_streamer, + ) + except Exception as e: + logger.error(f"VibeVoice audio: Error during model.generate: {e}") + finally: + # Ensure streamer is ended + try: + audio_streamer.end() + except Exception: + pass + logger.info(f"VibeVoice audio: model.generate finished for session {self.session_name}") + + gen_thread = threading.Thread(target=_run_generate, daemon=True) + gen_thread.start() + + # Consume chunks from streamer and append to audio buffer as they arrive + generated_chunks = [] + chunk_count = 0 + total_samples_streamed = 0 + logger.info(f"VibeVoice audio: Audio streamer started for session {self.session_name}") + try: + logger.info(f"VibeVoice audio: audio_streamer repr: {repr(audio_streamer)[:400]}") + gs = None + try: + gs = audio_streamer.get_stream(0) + logger.info(f"VibeVoice audio: get_stream returned object type: {type(gs)}") + except Exception as _e: + logger.error(f"VibeVoice audio: calling audio_streamer.get_stream raised: {_e}") + gs = None + except Exception: + gs = None + + if gs is None: + logger.warning(f"VibeVoice audio: audio_streamer.get_stream did not return a stream for session {self.session_name}") + iterator = [] + else: + iterator = gs + + for audio_chunk in iterator: + try: + # Convert tensor to numpy if needed + if torch.is_tensor(audio_chunk): + if audio_chunk.dtype == torch.bfloat16: + audio_chunk = audio_chunk.float() + audio_np = audio_chunk.cpu().numpy().astype(np.float32) + else: + audio_np = np.array(audio_chunk, dtype=np.float32) + + # Squeeze to 1D 
if needed + if audio_np.ndim > 1: + audio_np = audio_np.squeeze() + + # Resample from model sampling rate (usually 24000) to track sample rate + if hasattr(self.tts, 'sample_rate') and self.tts.sample_rate != self.sample_rate: + try: + audio_np = librosa.resample(audio_np, orig_sr=self.tts.sample_rate, target_sr=self.sample_rate) + except Exception: + # If resample fails, keep original chunk + pass + + # Append to internal buffer + with self.buffer_lock: + if len(self.audio_buffer) == 0: + self.audio_buffer = audio_np + else: + self.audio_buffer = np.concatenate([self.audio_buffer, audio_np]) + + # Also collect into generated_chunks for possible looping/debug save + try: + generated_chunks.append(audio_np.astype(np.float32)) + except Exception: + pass + + total_samples_streamed += len(audio_np) + chunk_count += 1 + # Log every few chunks to avoid log spam + if chunk_count % 5 == 0: + logger.info(f"VibeVoice audio: Streamed {total_samples_streamed} samples so far for session {self.session_name} (chunks={chunk_count})") + else: + logger.debug(f"VibeVoice audio: Streamed {len(audio_np)} samples to buffer (total buffer: {len(self.audio_buffer)})") + + # Also publish into the global waveform buffer used by WaveformVideoTrack + try: + if WaveformVideoTrack is not None: + pname = self.session_name + # Ensure buffer key exists + if pname not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) + + # Append to shared waveform buffer + WaveformVideoTrack.buffer[pname] = np.concatenate([ + WaveformVideoTrack.buffer[pname], audio_np.astype(np.float32) + ]) + + # Ensure sample rate is set for this session + WaveformVideoTrack.sample_rates[pname] = self.sample_rate + + # Limit buffer to last 10 seconds for this track + max_samples = int(self.sample_rate * 10) + if len(WaveformVideoTrack.buffer[pname]) > max_samples: + WaveformVideoTrack.buffer[pname] = WaveformVideoTrack.buffer[pname][-max_samples:] + + # Update a lightweight speech_status for display + energy = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2))) if audio_np.size > 0 else 0.0 + # Approximate zero-crossing rate + try: + if audio_np.size > 1: + zcr = float(np.mean(np.abs(np.diff(np.sign(audio_np)) ) > 0)) + else: + zcr = 0.0 + except Exception: + zcr = 0.0 + + is_speech = energy > 0.005 + + WaveformVideoTrack.speech_status[pname] = { + 'is_speech': bool(is_speech), + 'energy': float(energy), + 'zcr': float(zcr), + 'centroid': 0.0, + 'rolloff': 0.0, + 'flux': 0.0, + 'harmonicity': 0.0, + 'noise_floor_energy': 0.0, + 'adaptive_threshold': 0.0, + 'energy_check': bool(energy > 0.002), + 'zcr_check': bool(zcr > 0.01), + 'spectral_check': False, + 'harmonic_check': False, + 'temporal_consistency': True, + 'is_processing': True, + 'is_playing': False, + } + except Exception: + # Non-critical - don't break TTS on visualization failures + pass + except Exception as e: + logger.error(f"VibeVoice audio: Error processing audio chunk from streamer: {e}") + + # Ensure generation thread finishes + gen_thread.join(timeout=5.0) + + # If generation thread is still alive after join, log a warning + if gen_thread.is_alive(): + logger.warning(f"VibeVoice audio: generation thread still alive after join for session {self.session_name}") + + # When generation completes, store last_generated_audio for looping and optionally save debug WAV + logger.info(f"VibeVoice audio: Generation completed for session {self.session_name}. 
total_samples_streamed={total_samples_streamed}, chunks={chunk_count}") + + # If no chunks were received, emit a diagnostic warning with some state to help debugging + if chunk_count == 0: + try: + # Provide more diagnostic info: inputs summary and streamer introspection + try: + sdi = { + 'repr': repr(audio_streamer)[:400], + 'dir': [n for n in dir(audio_streamer) if not n.startswith('_')][:40] + } + except Exception: + sdi = {'repr': 'unavailable', 'dir': []} + + try: + logger.warning( + f"VibeVoice audio: No audio chunks were streamed for session {self.session_name}. " + f"is_initialized={getattr(self.tts, 'is_initialized', False)}, model_present={hasattr(self.tts, 'model')} ; " + f"audio_streamer={sdi}" + ) + except Exception: + logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (diagnostics failed)") + except Exception: + logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (additional diagnostics unavailable)") + # Fallback: attempt a synchronous generation that returns a full numpy audio array + try: + logger.info(f"VibeVoice audio: Attempting synchronous fallback generation for session {self.session_name}") + fallback_audio = None + try: + fallback_audio = self.tts.generate_speech(text, speaker, cfg_scale=cfg_scale) + except Exception as e: + logger.error(f"VibeVoice audio: synchronous fallback generation raised: {e}") + + if fallback_audio is not None and getattr(fallback_audio, 'size', 0) > 0: + try: + fa = fallback_audio.astype(np.float32) + except Exception: + fa = np.array(fallback_audio, dtype=np.float32) + + # Resample if needed + try: + tts_sr = getattr(self.tts, 'sample_rate', 24000) + if tts_sr != self.sample_rate: + fa = librosa.resample(fa, orig_sr=tts_sr, target_sr=self.sample_rate) + except Exception: + pass + + # Append into internal buffer and last_generated_audio + with self.buffer_lock: + if len(self.audio_buffer) == 0: + self.audio_buffer = fa + else: + self.audio_buffer = np.concatenate([self.audio_buffer, fa]) + with self._last_gen_lock: + self.last_generated_audio = fa.copy() + + # Publish to waveform buffer + try: + pname = self.session_name + if pname not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32) + WaveformVideoTrack.buffer[pname] = np.concatenate([WaveformVideoTrack.buffer[pname], fa.astype(np.float32)]) + WaveformVideoTrack.sample_rates[pname] = self.sample_rate + except Exception: + pass + + # Optionally save debug wav + if self.debug_save_wav: + try: + try: + import soundfile as sf + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + sf.write(fname, fa, samplerate=self.sample_rate) + logger.info(f"🐞 Saved fallback generated wav to {fname} (soundfile)") + except Exception: + try: + from scipy.io import wavfile + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + wavfile.write(fname, self.sample_rate, (fa * 32767).astype('int16')) + logger.info(f"🐞 Saved fallback generated wav to {fname} (scipy)") + except Exception: + try: + import wave + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav" + with wave.open(fname, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(self.sample_rate) + int_data = (fa * 32767).astype('int16') + wf.writeframes(int_data.tobytes()) + logger.info(f"🐞 Saved fallback generated wav to {fname} (wave)") + except Exception as e: + logger.error(f"Error saving fallback debug wav (all methods 
failed): {e}") + except Exception as e: + logger.error(f"Error saving fallback debug wav: {e}") + + logger.info(f"VibeVoice audio: Fallback synchronous generation successful for session {self.session_name} (samples={len(fa)})") + else: + logger.warning(f"VibeVoice audio: Fallback synchronous generation produced no audio for session {self.session_name}") + except Exception as e: + logger.error(f"VibeVoice audio: Exception during synchronous fallback generation: {e}") + try: + if len(generated_chunks) > 0: + try: + all_gen = np.concatenate(generated_chunks).astype(np.float32) + except Exception: + all_gen = np.array([], dtype=np.float32) + with self._last_gen_lock: + self.last_generated_audio = all_gen.copy() + + # Optionally save to disk for debugging + if self.debug_save_wav: + try: + try: + import soundfile as sf + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + sf.write(fname, all_gen, samplerate=self.sample_rate) + logger.info(f"🐞 Saved generated wav to {fname} (soundfile)") + except Exception: + # Try scipy fallback + try: + from scipy.io import wavfile + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + # scipy expects int16 + wavfile.write(fname, self.sample_rate, (all_gen * 32767).astype('int16')) + logger.info(f"🐞 Saved generated wav to {fname} (scipy)") + except Exception: + # Ultimate fallback: write raw wave via wave module + try: + import wave + fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav" + with wave.open(fname, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(self.sample_rate) + int_data = (all_gen * 32767).astype('int16') + wf.writeframes(int_data.tobytes()) + logger.info(f"🐞 Saved generated wav to {fname} (wave)") + except Exception as e: + logger.error(f"Error saving debug wav (all methods failed): {e}") + except Exception as e: + logger.error(f"Error saving debug wav: {e}") + + except Exception: + pass + + # Clear processing flag when generation completes + try: + if self.session_name in WaveformVideoTrack.speech_status: + WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False + except Exception: + pass + + except Exception as e: + logger.error(f"VibeVoice audio: Error in background TTS generation: {e}") + + def _get_samples_from_buffer(self, num_samples: int) -> np.ndarray: + """Get samples from audio buffer, removing them from buffer.""" + # Try to refill from last_generated_audio if looping is enabled + with self._last_gen_lock: + last_gen = self.last_generated_audio.copy() if getattr(self, 'last_generated_audio', None) is not None else np.array([], dtype=np.float32) + + with self.buffer_lock: + if len(self.audio_buffer) == 0: + # If we're configured to loop and have a generated sample, refill the buffer + if getattr(self, 'loop', False) and last_gen.size > 0: + try: + # Repeat last_gen as needed to reach at least num_samples + repeats = int(math.ceil(float(num_samples) / float(len(last_gen)))) if len(last_gen) > 0 else 1 + refill = np.tile(last_gen, repeats) + self.audio_buffer = refill.astype(np.float32) + logger.debug(f"VibeVoice audio: Refilled audio_buffer from last_generated_audio (len={len(last_gen)}) repeats={repeats}") + except Exception: + # Fallback to silence on any failure + self.audio_buffer = np.zeros(num_samples, dtype=np.float32) + else: + return np.zeros(num_samples, dtype=np.float32) + + if len(self.audio_buffer) >= num_samples: + samples = self.audio_buffer[:num_samples] + self.audio_buffer = self.audio_buffer[num_samples:] + return 
samples + else: + # Return remaining samples and pad with zeros + samples = self.audio_buffer + padding = np.zeros(num_samples - len(self.audio_buffer), dtype=np.float32) + self.audio_buffer = np.array([], dtype=np.float32) + return np.concatenate([samples, padding]) + + async def next_timestamp(self) -> Tuple[int, float]: + """Get next timestamp for audio frame.""" + pts = self._samples_generated + time_base = 1 / self.sample_rate + return pts, time_base + + async def recv(self) -> AudioFrame: + """Generate audio frame with TTS speech from buffer.""" + # Get samples from buffer + samples = self._get_samples_from_buffer(self.samples_per_frame) + + # If no TTS audio available, generate background noise + if np.all(samples == 0): + # Default to disabled when not present in config to avoid unexpected noise + if self.config.get('background_noise_enabled', False): + noise_type = self.config.get('background_noise_type', 'white') + noise_volume = self.config.get('background_noise_volume', 0.01) + # Generate noise for this frame duration + frame_duration = self.samples_per_frame / self.sample_rate + logger.debug(f"🎵 Generating background noise: type={noise_type}, volume={noise_volume}, duration={frame_duration:.3f}s") + background_noise = self.tts.generate_background_noise(frame_duration, noise_type, noise_volume, self.sample_rate) + logger.debug(f"🎵 Generated background noise: {len(background_noise)} samples") + samples = background_noise + else: + # Generate silence if background noise is disabled + logger.debug("🎵 Background noise disabled - generating silence") + samples = np.zeros(self.samples_per_frame, dtype=np.float32) + + # Convert to 16-bit PCM + # Update shared speech_status for visualization: energy + playing flag + try: + energy = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2))) if samples.size > 0 else 0.0 + pname = self.session_name + st = WaveformVideoTrack.speech_status.get(pname, {}) + st['energy'] = float(energy) + # Consider playing when energy above small threshold + st['is_playing'] = bool(energy > 0.001) + st['is_speech'] = bool(energy > 0.003) + WaveformVideoTrack.speech_status[pname] = st + except Exception: + pass + + samples_int16 = (samples * 32767).astype(np.int16) + + # Create stereo audio (duplicate mono channel) + left = samples_int16 + right = samples_int16.copy() + stereo = np.empty(self.samples_per_frame * 2, dtype=np.int16) + stereo[0::2] = left + stereo[1::2] = right + + # Create audio frame + frame = AudioFrame.from_ndarray(stereo.reshape(1, -1), format="s16", layout="stereo") + frame.sample_rate = self.sample_rate + frame.pts = self._samples_generated + frame.time_base = fractions.Fraction(1, self.sample_rate) + + # Increment sample counter + self._samples_generated += self.samples_per_frame + + return frame + + +class VibeVoiceTTSBot: + """VibeVoice Text-to-Speech Bot for voicebot framework.""" + + def __init__(self, session_name: str, config: Optional[Dict[str, Any]] = None): + self.session_name = session_name + self.config = config or {} + + # Initialize TTS engine with enhanced parameters + device = self.config.get('device', 'cpu') + inference_steps = self.config.get('inference_steps', 10) + self.tts_engine = VibeVoiceTTS(device=device, inference_steps=inference_steps, config=self.config) + + # Store generation parameters + self.cfg_scale = self.config.get('cfg_scale', 1.3) + self.speaker = self.config.get('speaker', '1') + + # Initialize media components + self.media_clock = MediaClock() + # Pass session name into video track so it can show 
per-session waveform/status + self.video_track = VibeVoiceVideoTrack(self.media_clock, self.config, session_name=session_name) + self.audio_track = VibeVoiceAudioTrack(self.media_clock, self.config, self.tts_engine, session_name=session_name) + + # Initialize shared waveform store sample rate and empty buffer/status + try: + WaveformVideoTrack.sample_rates[session_name] = self.config.get('sample_rate', 16000) + if session_name not in WaveformVideoTrack.buffer: + WaveformVideoTrack.buffer[session_name] = np.array([], dtype=np.float32) + if session_name not in WaveformVideoTrack.speech_status: + WaveformVideoTrack.speech_status[session_name] = {'is_speech': False, 'energy': 0.0, 'is_processing': False, 'is_playing': False} + except Exception: + pass + + # Apply initial configuration values to ensure defaults from schema/config provider + try: + self.update_config(self.config) + except Exception: + # Don't let config application stop initialization + pass + + logger.info(f"VibeVoice bot initialized for session {session_name} with cfg_scale={self.cfg_scale}, speaker={self.speaker}") + + def get_tracks(self) -> Dict[str, MediaStreamTrack]: + """Get video and audio tracks.""" + return { + "video": self.video_track, + "audio": self.audio_track + } + + def handle_chat_message(self, message: ChatMessageModel): + """Handle incoming chat messages by converting them to speech.""" + try: + text = message.message.strip() + if text: + logger.info(f"VibeVoice bot received chat: '{text}' from {message.sender_name}") + + # Queue text for both video display and audio speech + self.video_track.update_text(text) + self.audio_track.speak_text(text, self.cfg_scale) + + except Exception as e: + logger.error(f"Error handling chat message in VibeVoice bot: {e}") + + def update_config(self, config_updates: Dict[str, Any]) -> bool: + """Update bot configuration.""" + try: + self.config.update(config_updates) + + # Update TTS-specific parameters + if 'cfg_scale' in config_updates: + self.cfg_scale = config_updates['cfg_scale'] + if 'speaker' in config_updates: + self.speaker = config_updates['speaker'] + + # Update tracks + video_success = self.video_track.update_config(config_updates) + audio_success = self.audio_track.update_config(config_updates) + + if video_success and audio_success: + logger.info(f"VibeVoice bot configuration updated: {config_updates}") + return True + else: + logger.warning("Partial configuration update failure in VibeVoice bot") + return False + + except Exception as e: + logger.error(f"Error updating VibeVoice bot configuration: {e}") + return False + + +# Global bot instance registry +_vibevoice_bots: Dict[str, VibeVoiceTTSBot] = {} + + +def create_vibevoice_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]: + """ + Create VibeVoice TTS bot tracks. 
+ + Args: + session_name: Name for the session + config: Configuration dictionary with options: + - width: video width (default 640) + - height: video height (default 480) + - fps: frames per second (default 15) + - sample_rate: audio sample rate (default 16000) + - samples_per_frame: audio samples per frame (default 960) + - speaker: TTS speaker name (default '1') + - device: device for TTS ('cpu', 'cuda', 'mps') + - cfg_scale: CFG scale for generation (default 1.3) + - inference_steps: Number of inference steps (default 10) + + Returns: + Dictionary containing 'video' and 'audio' tracks + """ + if config is None: + config = {} + + # Set defaults + default_config = { + 'width': 640, + 'height': 480, + 'fps': 15, + 'sample_rate': 16000, + 'samples_per_frame': 960, + 'speaker': '1', + 'device': 'cpu', + 'cfg_scale': 1.3, + 'inference_steps': 10, + # Explicit background noise defaults - disabled by default + 'background_noise_enabled': False, + 'background_noise_type': 'none', + 'background_noise_volume': 0.0, + } + default_config.update(config) + + # Create bot instance + bot = VibeVoiceTTSBot(session_name, default_config) + _vibevoice_bots[session_name] = bot + + logger.info(f"Created VibeVoice bot tracks for {session_name}") + return bot.get_tracks() + + +def handle_config_update(session_name: str, config_values: Dict[str, Any]) -> bool: + """ + Handle runtime configuration updates for VibeVoice bot. + + Args: + session_name: Name of the session/bot instance + config_values: Dictionary of configuration values to update + + Returns: + bool: True if update was successful, False otherwise + """ + try: + if session_name in _vibevoice_bots: + return _vibevoice_bots[session_name].update_config(config_values) + else: + logger.warning(f"No VibeVoice bot found for session {session_name}") + return False + except Exception as e: + logger.error(f"Error updating VibeVoice bot configuration: {e}") + return False + + +async def handle_chat_message( + chat_message: ChatMessageModel, + send_message_func: Callable[[Union[str, ChatMessageModel]], Awaitable[None]] +) -> Optional[str]: + """ + Handle incoming chat messages and convert them to speech. 
+ + Args: + chat_message: The chat message to process + send_message_func: Function to send chat responses (not used by TTS bot) + """ + try: + # Find the bot instance - we need to get session name from somewhere + # For now, we'll use the first available bot instance + if _vibevoice_bots: + session_name = list(_vibevoice_bots.keys())[0] + _vibevoice_bots[session_name].handle_chat_message(chat_message) + logger.info(f"VibeVoice bot processed chat message from {chat_message.sender_name}: '{chat_message.message}'") + else: + logger.warning("No VibeVoice bot instances available to handle chat message") + except Exception as e: + logger.error(f"Error handling chat message in VibeVoice bot: {e}") + + # TTS bot doesn't send chat responses, so return None + return None + + +# Agent descriptor exported for dynamic discovery by the FastAPI service +AGENT_NAME = "VibeVoice TTS Bot" +AGENT_DESCRIPTION = "Microsoft VibeVoice text-to-speech bot with visual text display" + +def agent_info() -> Dict[str, str]: + """Return agent metadata for discovery.""" + return { + "name": AGENT_NAME, + "description": AGENT_DESCRIPTION, + "has_media": "true", + "configurable": "true", + "chat_enabled": "true" + } + + +def get_config_schema() -> Dict[str, Any]: + """Get the configuration schema for the VibeVoice Bot.""" + return { + "bot_name": AGENT_NAME, + "version": "1.0", + "parameters": [ + { + "name": "width", + "type": "number", + "label": "Video Width", + "description": "Width of the video frame in pixels", + "default_value": 640, + "required": False, + "min_value": 320, + "max_value": 1920, + "step": 1 + }, + { + "name": "height", + "type": "number", + "label": "Video Height", + "description": "Height of the video frame in pixels", + "default_value": 480, + "required": False, + "min_value": 240, + "max_value": 1080, + "step": 1 + }, + { + "name": "fps", + "type": "number", + "label": "Frames Per Second", + "description": "Video frame rate", + "default_value": 15, + "required": False, + "min_value": 1, + "max_value": 60, + "step": 1 + }, + { + "name": "speaker", + "type": "select", + "label": "TTS Speaker", + "description": "Voice to use for text-to-speech", + "default_value": "1", + "required": True, + "options": [ + {"value": "1", "label": "Speaker 1 (en-Alice_woman)"}, + {"value": "2", "label": "Speaker 2 (en-Carter_man)"}, + {"value": "3", "label": "Speaker 3 (en-Frank_man)"}, + {"value": "4", "label": "Speaker 4 (en-Mary_woman_bgm)"} + ] + }, + { + "name": "background_noise_enabled", + "type": "boolean", + "label": "Enable Background Noise", + "description": "Add background noise to ensure continuous audio streaming", + "default_value": False, + "required": False + }, + { + "name": "background_noise_type", + "type": "select", + "label": "Background Noise Type", + "description": "Type of background noise to generate", + # 'none' indicates no noise - matches default disabled behavior + "default_value": "none", + "required": False, + "options": [ + {"value": "white", "label": "White Noise"}, + {"value": "pink", "label": "Pink Noise"}, + {"value": "brown", "label": "Brown Noise"}, + {"value": "none", "label": "None"} + ] + }, + { + "name": "background_noise_volume", + "type": "number", + "label": "Background Noise Volume", + "description": "Volume level of background noise (0.0 to 1.0)", + "default_value": 0.01, + "required": False, + "min_value": 0.0, + "max_value": 1.0, + "step": 0.001 + }, + { + "name": "device", + "type": "select", + "label": "Processing Device", + "description": "Device to use for TTS 
processing", + "default_value": "cpu", + "required": True, + "options": [ + {"value": "cpu", "label": "CPU"}, + {"value": "cuda", "label": "CUDA (GPU)"}, + {"value": "mps", "label": "MPS (Apple Silicon)"} + ] + }, + { + "name": "cfg_scale", + "type": "number", + "label": "CFG Scale", + "description": "Classifier-free guidance scale for controlling generation quality", + "default_value": 1.3, + "required": False, + "min_value": 1.0, + "max_value": 2.0, + "step": 0.05 + }, + { + "name": "inference_steps", + "type": "number", + "label": "Inference Steps", + "description": "Number of denoising steps for audio generation", + "default_value": 10, + "required": False, + "min_value": 5, + "max_value": 50, + "step": 1 + } + ], + "categories": [ + { + "Video Settings": ["width", "height", "fps"] + }, + { + "TTS Settings": ["speaker", "device", "cfg_scale", "inference_steps"] + }, + { + "Background Noise": ["background_noise_enabled", "background_noise_type", "background_noise_volume"] + } + ] + } + + +def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]: + """Factory wrapper used by the FastAPI service to instantiate tracks for an agent.""" + return create_vibevoice_bot_tracks(session_name) \ No newline at end of file diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py index 3d69e62..7ae120f 100644 --- a/voicebot/bots/whisper.py +++ b/voicebot/bots/whisper.py @@ -58,13 +58,153 @@ AudioArray = npt.NDArray[np.float32] ModelConfig = Dict[str, Union[str, int, bool]] CalibrationData = List[Dict[str, Any]] -_device = "GPU.1" # Default to Intel Arc B580 GPU - # Global lock to serialize calls into the OpenVINO model.generate/decode # since some backends are not safe for concurrent generate calls. _generate_global_lock = threading.Lock() +def _do_generate_once(model, *args, **kwargs): + """Submit a single generate call to the serialized worker and return result. + + Raises any exception raised by the underlying generate call. + """ + return _submit_generate_to_worker(model.generate, *args, **kwargs) + + +def _safe_generate_with_retries(model, *args, max_retries: int = 20, initial_delay: float = 0.05, **kwargs): + """Call model.generate while handling OpenVINO 'Infer Request is busy' by retrying. + + This helper retries on RuntimeError containing 'Infer Request is busy' with + exponential backoff. It raises the last exception if retries are exhausted. + """ + delay = initial_delay + last_exc = None + for attempt in range(1, max_retries + 1): + try: + # Submit the actual blocking generate to the serialized worker + return _do_generate_once(model, *args, **kwargs) + except RuntimeError as e: + last_exc = e + msg = str(e) + # Match the specific OpenVINO busy error message + if "Infer Request is busy" in msg: + logger.warning( + f"OpenVINO infer busy (attempt {attempt}/{max_retries}), retrying after {delay:.3f}s..." + ) + time.sleep(delay) + delay = min(delay * 2.0, 1.0) + continue + # Not the busy error - re-raise immediately + raise + except Exception: + raise + # Retries exhausted + logger.error(f"OpenVINO generate retries exhausted ({max_retries}) - raising last error: {last_exc}") + raise last_exc + + +# Global serialized generate worker to ensure OpenVINO infer requests are not +# called concurrently across threads. Some OpenVINO backends will error with +# "Infer Request is busy" if multiple infer calls overlap on the same +# compiled model; queueing here serializes calls at the process level. 
+_generate_queue = Queue()
+_generate_worker_started = False
+# Lock guarding worker startup so concurrent first callers spawn exactly one worker.
+_generate_worker_lock = threading.Lock()
+
+
+def _generate_worker() -> None:
+    while True:
+        fn, args, kwargs, ev, out = _generate_queue.get()
+        try:
+            # Perform internal retries if OpenVINO reports the request as busy.
+            delay = 0.02
+            max_inner_retries = 20
+            last_exc = None
+            for attempt in range(1, max_inner_retries + 1):
+                try:
+                    res = fn(*args, **kwargs)
+                    out['result'] = res
+                    out['exc'] = None
+                    break
+                except RuntimeError as e:
+                    last_exc = e
+                    msg = str(e)
+                    if "Infer Request is busy" in msg:
+                        # log at debug to avoid noise but keep visibility
+                        logger.debug(f"Worker: infer busy (attempt {attempt}/{max_inner_retries}), sleeping {delay:.3f}s")
+                        time.sleep(delay)
+                        delay = min(delay * 2.0, 1.0)
+                        continue
+                    # not a busy error - surface immediately
+                    out['result'] = None
+                    out['exc'] = e
+                    break
+                except Exception as e:
+                    out['result'] = None
+                    out['exc'] = e
+                    break
+            else:
+                # exhausted retries
+                out['result'] = None
+                out['exc'] = last_exc
+        finally:
+            try:
+                ev.set()
+            except Exception:
+                pass
+
+
+def _ensure_generate_worker() -> None:
+    global _generate_worker_started
+    if _generate_worker_started:
+        return
+    with _generate_worker_lock:
+        # Re-check under the lock; another thread may have started the worker already.
+        if _generate_worker_started:
+            return
+        t = threading.Thread(target=_generate_worker, daemon=True)
+        t.start()
+        _generate_worker_started = True
+
+
+def _submit_generate_to_worker(fn, *args, **kwargs):
+    """Submit a blocking generate fn to the serialized worker and wait for result."""
+    _ensure_generate_worker()
+    ev = threading.Event()
+    out: Dict[str, Any] = {}
+    _generate_queue.put((fn, args, kwargs, ev, out))
+    ev.wait()
+    if out.get('exc'):
+        raise out['exc']
+    return out.get('result')
+
+
+async def _safe_generate_with_retries_async(model, *args, max_retries: int = 20, initial_delay: float = 0.05, **kwargs):
+    """Async variant of the generate retry helper that uses asyncio.sleep.
+
+    Should be awaited from asynchronous contexts to avoid blocking the event loop.
+    """
+    delay = initial_delay
+    last_exc = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Delegate to the serialized worker in an executor so the event loop
+            # isn't blocked waiting on the worker event.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, lambda: _do_generate_once(model, *args, **kwargs))
+        except RuntimeError as e:
+            last_exc = e
+            msg = str(e)
+            if "Infer Request is busy" in msg:
+                logger.warning(
+                    f"OpenVINO infer busy (async attempt {attempt}/{max_retries}), retrying after {delay:.3f}s..."
+ ) + await asyncio.sleep(delay) + delay = min(delay * 2.0, 1.0) + continue + raise + except Exception: + raise + logger.error(f"OpenVINO async generate retries exhausted ({max_retries}) - raising last error: {last_exc}") + raise last_exc + + def get_available_devices() -> list[dict[str, Any]]: """List available OpenVINO devices with their properties.""" try: @@ -125,9 +265,27 @@ def print_available_devices(device: str | None = None): logger.info(f" Type: {d.get('type')}") +def find_best_device(preferred_type: str = "DISCRETE") -> str: + """Find the best available OpenVINO device, preferring the specified type (e.g., 'DISCRETE', 'INTEGRATED', 'CPU', 'GPU').""" + devices = get_available_devices() + if not devices: + logger.warning("No OpenVINO devices found, defaulting to CPU") + return "CPU" + for d in devices: + device_type = str(d.get("type", "")).upper() + if device_type == preferred_type.upper(): + logger.info(f"Using preferred device: {preferred_type}") + return d.get("name", "CPU") + logger.info("Preferred device not found, using first available device") + return devices[0].get("name", "CPU") + +_device = find_best_device(preferred_type="Type.DISCRETE") + print_available_devices(_device) + + class AudioQueueItem(BaseModel): """Audio data with timestamp for processing queue.""" @@ -536,11 +694,39 @@ class OpenVINOWhisperModel: logger.info("Whisper processor loaded successfully") # Export the model to OpenVINO IR if not already converted - self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore - self.model_id, export=True, device=self.device - ) # type: ignore + try: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device + ) # type: ignore + logger.info("Whisper model exported as OpenVINO IR") + except Exception as export_e: + logger.warning(f"Initial OpenVINO export failed: {export_e}") + # Retry using processor-derived example_inputs if possible + try: + if self.processor is None: + self.processor = WhisperProcessor.from_pretrained(self.model_id, use_fast=True) # type: ignore + dummy_audio = np.random.randn(16000).astype(np.float32) + try: + example_inputs = self.processor(# type: ignore + dummy_audio, sampling_rate=16000, return_tensors="pt" + ).input_features # type: ignore + except Exception as ex_inputs: + logger.warning(f"Failed to generate example_inputs for export retry: {ex_inputs}") + example_inputs = None - logger.info("Whisper model exported as OpenVINO IR") + if example_inputs is not None: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device, example_inputs=example_inputs + ) + else: + self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, export=True, device=self.device + ) + + logger.info("Whisper model exported as OpenVINO IR (retry with example_inputs)") + except Exception as retry_export_e: + logger.error(f"Export retry failed: {retry_export_e}") + raise # # Try to load quantized model first if it exists # if self.config.enable_quantization and self.quantized_model_path.exists(): @@ -599,6 +785,60 @@ class OpenVINOWhisperModel: except Exception as e: logger.error(f"Model conversion failed: {e}") + # If conversion failed due to example_input / tracing mismatch + # try converting again by providing a correctly-shaped example + # input derived from the Whisper processor. This can resolve + # mismatches between the default example and model signatures. 
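+            # Note: support for passing example_inputs to from_pretrained varies
+            # across optimum-intel versions; the TypeError handler below falls
+            # back to a plain conversion when the keyword is not accepted.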
+ try: + logger.info("Retrying conversion with processor-derived example_inputs...") + if self.processor is None: + # Ensure processor is available + self.processor = WhisperProcessor.from_pretrained(self.model_id, use_fast=True) # type: ignore + + # Create a short dummy audio (1s) to produce input_features + try: + dummy_audio = np.random.randn(16000).astype(np.float32) + example_inputs = self.processor(# type: ignore + dummy_audio, sampling_rate=16000, return_tensors="pt" + ).input_features # type: ignore + except Exception as ex_inputs: + logger.warning(f"Failed to generate example_inputs from processor: {ex_inputs}") + example_inputs = None + + # Attempt conversion again, supplying example_inputs if available + if example_inputs is not None: + ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, + ov_config=self.config.to_ov_config(), + export=True, + compile=False, + example_inputs=example_inputs, + load_in_8bit=False, + ) + else: + ov_model = OVModelForSpeechSeq2Seq.from_pretrained( # type: ignore + self.model_id, + ov_config=self.config.to_ov_config(), + export=True, + compile=False, + load_in_8bit=False, + ) + + if hasattr(ov_model, 'half'): + ov_model.half() # type: ignore + ov_model.save_pretrained(self.model_path) # type: ignore + logger.info("Model converted and saved in FP16 format (retry with example_inputs)") + self.ov_model = ov_model # type: ignore + self._compile_model() + return + except TypeError as te: + # from_pretrained may not accept example_inputs in some versions + logger.warning(f"Conversion retry with example_inputs not supported: {te}") + except Exception as retry_e: + logger.warning(f"Retry conversion with example_inputs failed: {retry_e}") + + # If all conversion attempts fail, propagate to fallback path + logger.warning("Falling back to basic conversion without advanced export options") raise def _convert_model_basic(self) -> None: @@ -816,8 +1056,8 @@ class OpenVINOWhisperModel: ) # type: ignore # Run inference to collect calibration data - _ = self.ov_model.generate( # type: ignore - inputs.input_features, max_new_tokens=10 # type: ignore + _ = _safe_generate_with_retries( # type: ignore + self.ov_model, inputs.input_features, max_new_tokens=10 ) if i % 5 == 0: @@ -957,7 +1197,7 @@ class OpenVINOWhisperModel: # Run warmup iterations for i in range(3): - _ = self.ov_model.generate(dummy_features, max_new_tokens=10)# type: ignore + _ = _safe_generate_with_retries(self.ov_model, dummy_features, max_new_tokens=10) # type: ignore if i == 0: logger.debug("First warmup iteration completed") except Exception as e: @@ -1482,9 +1722,7 @@ class OptimizedAudioProcessor: # Serialize access to the underlying OpenVINO generation call # to avoid concurrency problems with the OpenVINO runtime. 
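+            # _safe_generate_with_retries already routes the call through the
+            # single serialized worker thread; the lock is kept as an extra guard.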
with _generate_global_lock: - gen_out = ov_model.ov_model.generate(# type: ignore - input_features, generation_config=gen_cfg# type: ignore - ) + gen_out = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=gen_cfg) # type: ignore # Try to extract sequences if present if hasattr(gen_out, "sequences"): # type: ignore @@ -1886,9 +2124,8 @@ class OptimizedAudioProcessor: logger.info(f"{self.peer_name}: calling model.generate (async lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (async lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + # Use async-safe retry wrapper to avoid blocking event loop + generation_output = await _safe_generate_with_retries_async(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore finally: self._generate_lock.release() elif hasattr(self, "_generate_lock") and isinstance(self._generate_lock, threading.Lock): @@ -1897,17 +2134,13 @@ class OptimizedAudioProcessor: logger.info(f"{self.peer_name}: calling model.generate (thread lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (thread lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + generation_output = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore else: if is_final: logger.info(f"{self.peer_name}: calling model.generate (no lock) (final)") else: logger.debug(f"{self.peer_name}: calling model.generate (no lock)") - generation_output = ov_model.ov_model.generate( # type: ignore - input_features, generation_config=generation_config - ) + generation_output = _safe_generate_with_retries(ov_model.ov_model, input_features, generation_config=generation_config) # type: ignore if is_final: logger.info(f"{self.peer_name}: model.generate complete (final) (type={type(generation_output)})") @@ -2686,7 +2919,7 @@ def get_config_schema() -> Dict[str, Any]: "default_value": _device, "required": True, "options": [ - {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"}, + # {"value": "GPU.1", "label": "Intel Arc GPU (GPU.1)"}, {"value": "GPU", "label": "GPU"}, {"value": "CPU", "label": "CPU"} ] @@ -2959,7 +3192,7 @@ def handle_config_update(lobby_id: str, config_values: Dict[str, Any]) -> bool: if "device" in config_values: new_device = config_values["device"] # type: ignore available_devices = [d["name"] for d in get_available_devices()] - if new_device in available_devices or new_device in ["CPU", "GPU", "GPU.1"]: + if new_device in available_devices or new_device in ["CPU", "GPU"]:#, "GPU.1"]: _device = new_device _ov_config.device = new_device config_applied = True diff --git a/voicebot/requirements.txt b/voicebot/requirements.txt index 0fdd756..a3b3d60 100644 --- a/voicebot/requirements.txt +++ b/voicebot/requirements.txt @@ -1,175 +1,183 @@ -about-time -aiofiles -aiohappyeyeballs -aiohttp -aioice -aiortc -aiosignal -alive-progress -annotated-types -anthropic -anyio -attrs -audioread -autograd -av -brotli -certifi -cffi -charset-normalizer -click -cma -contourpy -cryptography -cycler -datasets -decorator -deprecated -dill -distro -dnspython -fastapi -ffmpy -filelock -fonttools -frozenlist -fsspec -google-crc32c -gradio -gradio-client -grapheme -graphemeu -groovy -h11 -hf-xet -httpcore -httpx -huggingface-hub -idna -ifaddr -iniconfig -jinja2 -jiter -jiwer -joblib -jsonschema 
-jsonschema-specifications -kiwisolver -lazy-loader -librosa -llvmlite -markdown-it-py -markupsafe -matplotlib -mdurl -ml-dtypes -more-itertools -mpmath -msgpack -multidict -multiprocess -natsort -networkx -ninja -nncf -numba -numpy -nvidia-cublas-cu12 -nvidia-cuda-cupti-cu12 -nvidia-cuda-nvrtc-cu12 -nvidia-cuda-runtime-cu12 -nvidia-cudnn-cu12 -nvidia-cufft-cu12 -nvidia-cufile-cu12 -nvidia-curand-cu12 -nvidia-cusolver-cu12 -nvidia-cusparse-cu12 -nvidia-cusparselt-cu12 -nvidia-nccl-cu12 -nvidia-nvjitlink-cu12 -nvidia-nvtx-cu12 -onnx -openai +about-time==4.2.1 +absl-py==2.3.1 +accelerate==1.6.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aioice==0.10.1 +aiortc==1.13.0 +aiosignal==1.4.0 +alive-progress==3.2.0 +annotated-types==0.7.0 +anthropic==0.67.0 +anyio==4.10.0 +attrs==25.3.0 +audioread==3.0.1 +autograd==1.8.0 +av==14.4.0 +brotli==1.1.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.2.1 +cma==4.3.0 +contourpy==1.3.3 +cryptography==45.0.7 +cycler==0.12.1 +datasets==4.1.0 +decorator==5.2.1 +deprecated==1.2.18 +diffusers==0.35.1 +dill==0.4.0 +distro==1.9.0 +dnspython==2.8.0 +fastapi==0.116.1 +ffmpy==0.6.1 +filelock==3.19.1 +fonttools==4.59.2 +frozenlist==1.7.0 +fsspec==2025.9.0 +google-crc32c==1.7.1 +gradio==5.45.0 +gradio-client==1.13.0 +grapheme==0.6.0 +graphemeu==0.8.0 +groovy==0.1.2 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httpx==0.28.1 +huggingface-hub==0.34.5 +idna==3.10 +ifaddr==0.2.0 +importlib-metadata==8.7.0 +iniconfig==2.1.0 +jinja2==3.1.6 +jiter==0.11.0 +jiwer==4.0.0 +joblib==1.5.2 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +kiwisolver==1.4.9 +lazy-loader==0.4 +librosa==0.11.0 +llvmlite==0.44.0 +markdown-it-py==4.0.0 +markupsafe==3.0.2 +matplotlib==3.10.6 +mdurl==0.1.2 +ml-collections==1.1.0 +ml-dtypes==0.5.3 +more-itertools==10.8.0 +mpmath==1.3.0 +msgpack==1.1.1 +multidict==6.6.4 +multiprocess==0.70.16 +natsort==8.4.0 +networkx==3.4.2 +ninja==1.13.0 +nncf==2.18.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +onnx==1.19.0 +openai==1.107.2 openai-whisper @ git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2 -opencv-python -openvino -openvino-genai -openvino-telemetry -openvino-tokenizers -optimum +opencv-python==4.12.0.88 +openvino==2025.3.0 +openvino-genai==2025.3.0.0 +openvino-telemetry==2025.2.0 +openvino-tokenizers==2025.3.0.0 +optimum==1.27.0 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@b9c151fec6b414d9ca78be8643d08e267b133bfc -orjson -packaging -pandas -pillow -platformdirs -pluggy -pooch -propcache -protobuf -psutil -pyarrow -pycparser -pydantic -pydantic-core -pydot -pydub -pyee -pygments -pylibsrtp -pymoo -pyopencl -pyopenssl -pyparsing -pytest -pytest-asyncio -python-dateutil -python-ffmpeg -python-multipart -pytools -pytz -pyyaml -rapidfuzz -referencing -regex -requests -resampy -rich -rpds-py -ruff -safehttpx -safetensors -scikit-learn -scipy -semantic-version -setuptools -shellingham -siphash24 -six -sniffio -soundfile -soxr -speechrecognition -starlette -sympy -tabulate -threadpoolctl -tiktoken -tokenizers -tomlkit -torch -torchvision -tqdm 
-transformers -triton -typer -typing-extensions -typing-inspection -tzdata -urllib3 -uvicorn -watchdog -websockets -wrapt -xxhash -yarl +orjson==3.11.3 +packaging==25.0 +pandas==2.3.2 +peft==0.17.1 +pillow==11.3.0 +platformdirs==4.4.0 +pluggy==1.6.0 +pooch==1.8.2 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.0.0 +pyarrow==21.0.0 +pycparser==2.23 +pydantic==2.11.9 +pydantic-core==2.33.2 +pydot==3.0.4 +pydub==0.25.1 +pyee==13.0.0 +pygments==2.19.2 +pylibsrtp==0.12.0 +pymoo==0.6.1.5 +pyopencl==2025.2.6 +pyopenssl==25.2.0 +pyparsing==3.2.4 +pytest==8.4.2 +pytest-asyncio==1.2.0 +python-dateutil==2.9.0.post0 +python-ffmpeg==2.0.12 +python-multipart==0.0.20 +pytools==2025.2.4 +pytz==2025.2 +pyyaml==6.0.2 +rapidfuzz==3.14.1 +referencing==0.36.2 +regex==2025.9.1 +requests==2.32.5 +resampy==0.4.3 +rich==14.1.0 +rpds-py==0.27.1 +ruff==0.13.0 +safehttpx==0.1.6 +safetensors==0.6.2 +scikit-learn==1.7.2 +scipy==1.16.2 +semantic-version==2.10.0 +setuptools==80.9.0 +shellingham==1.5.4 +siphash24==1.8 +six==1.17.0 +sniffio==1.3.1 +soundfile==0.13.1 +soxr==1.0.0 +speechrecognition==3.14.3 +starlette==0.47.3 +sympy==1.14.0 +tabulate==0.9.0 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +tokenizers==0.21.4 +tomlkit==0.13.3 +torch==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.53.3 +triton==3.4.0 +typer==0.17.4 +typing-extensions==4.15.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 +uvicorn==0.35.0 +-e file:///voicebot/VibeVoice +watchdog==6.0.0 +websockets==15.0.1 +wrapt==1.17.3 +xxhash==3.5.0 +yarl==1.20.1 +zipp==3.23.0