#!/usr/bin/env python3
"""
VibeVoice Text-to-Speech Bot for Voicebot Framework
Integrates Microsoft's VibeVoice TTS with the voicebot framework.
Watches for chat messages and converts them to speech with text display.
"""
import threading
import queue
import time
import numpy as np
import cv2
import math
from typing import Dict, Optional, Any, Tuple, Callable, Awaitable, Union
from av.audio.frame import AudioFrame
from av import VideoFrame
from aiortc import MediaStreamTrack
import fractions
import torch
import librosa
import os
from shared.logger import logger
from shared.models import ChatMessageModel
# Implement a local WaveformVideoTrack-like helper to hold shared waveform buffers
# and lightweight speech status per session. This avoids depending on bots.whisper
class WaveformVideoTrack:
"""Lightweight shared storage for waveform visualization and speech status.
This class is not itself a MediaStreamTrack; it's used as a shared in-memory
store that video tracks in this file will read from to render waveforms and
status overlays.
"""
# session_name -> np.ndarray(float32) containing recent audio samples (mono)
buffer: Dict[str, np.ndarray] = {}
# session_name -> dict with status flags (is_speech, energy, is_processing, is_playing, etc.)
speech_status: Dict[str, Dict[str, Any]] = {}
# session_name -> sample_rate used for that buffer
sample_rates: Dict[str, int] = {}
# Proxy wrapper for AudioStreamer to log put() calls and basic stats without
# modifying upstream VibeVoice internals. We'll wrap any created AudioStreamer
# with this to capture whether model.generate() actually calls put().
class ProxyAudioStreamer:
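"""Pass-through wrapper around VibeVoice's AudioStreamer.
Counts put() calls and streamed samples for diagnostics while delegating
everything else to the wrapped streamer via __getattr__.
"""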
def __init__(self, real_streamer, session_name: Optional[str] = None):
self._real = real_streamer
self.session_name = session_name or "unknown"
self.put_calls = 0
self.total_samples = 0
def put(self, audio_chunk, *args, **kwargs):
# Try to measure number of samples in the chunk for diagnostics
try:
if torch.is_tensor(audio_chunk):
length = int(audio_chunk.numel())
else:
arr = np.array(audio_chunk)
length = int(arr.size)
except Exception:
length = -1
try:
# Inspect possible sample_indices positional argument for diagnostics
si_info = None
if len(args) >= 1:
try:
si = args[0]
if torch.is_tensor(si):
si_info = f"tensor(shape={tuple(si.shape)}, min={int(torch.min(si).item())}, max={int(torch.max(si).item())}, unique={int(len(torch.unique(si)))} )"
else:
arrsi = np.array(si)
si_info = f"array(shape={arrsi.shape}, min={int(arrsi.min()) if arrsi.size>0 else -1}, max={int(arrsi.max()) if arrsi.size>0 else -1}, unique={int(len(np.unique(arrsi))) if arrsi.size>0 else 0})"
except Exception:
si_info = str(type(args[0]))
logger.info(f"VibeVoice audio: ProxyAudioStreamer.put called for session {self.session_name} - samples={length} sample_indices={si_info}")
except Exception:
pass
self.put_calls += 1
if length > 0:
self.total_samples += length
return getattr(self._real, 'put')(audio_chunk, *args, **kwargs)
def get_stream(self, *args, **kwargs):
return getattr(self._real, 'get_stream')(*args, **kwargs)
def end(self, *args, **kwargs):
return getattr(self._real, 'end')(*args, **kwargs)
def __getattr__(self, name):
return getattr(self._real, name)
# Import VibeVoice components
try:
from vibevoice import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
from vibevoice.modular.streamer import AudioStreamer
except Exception:
logger.warning("VibeVoice not available. Install with: git clone https://github.com/microsoft/VibeVoice.git && cd VibeVoice && pip install -e .")
raise
class MediaClock:
"""Shared clock for media synchronization."""
def __init__(self):
self.t0 = time.perf_counter()
def now(self) -> float:
return time.perf_counter() - self.t0
class VibeVoiceTTS:
"""Minimal VibeVoice Text-to-Speech wrapper."""
def __init__(self, device: str = "cpu", inference_steps: int = 10, config: Optional[Dict[str, Any]] = None):
self.device = device
self.inference_steps = inference_steps
self.config = config or {}
self.model = None
self.processor = None
self.sample_rate = 24000 # VibeVoice uses 24kHz
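# Audio is generated at 24 kHz and resampled to the 16 kHz pipeline rate downstream (see generate_speech and the audio track)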
self.is_initialized = False
self.voice_presets = {}
self.available_voices = {}
try:
self._initialize_model()
self._setup_voice_presets()
except Exception as e:
logger.error(f"Failed to initialize VibeVoice: {e}")
def _initialize_model(self):
"""Initialize the VibeVoice model with robust device handling."""
try:
logger.info("Loading VibeVoice model...")
# Normalize potential 'mpx'
if self.device.lower() == "mpx":
logger.info("Note: device 'mpx' detected, treating it as 'mps'.")
self.device = "mps"
if self.device == "mps" and not torch.backends.mps.is_available():
logger.warning("Warning: MPS not available. Falling back to CPU.")
self.device = "cpu"
logger.info(f"Using device: {self.device}")
# Load processor
self.processor = VibeVoiceProcessor.from_pretrained("vibevoice/VibeVoice-1.5B")
# Decide dtype & attention
if self.device == "mps":
load_dtype = torch.float32
attn_impl_primary = "sdpa"
elif self.device == "cuda":
load_dtype = torch.bfloat16
attn_impl_primary = "flash_attention_2"
else:
load_dtype = torch.float32
attn_impl_primary = "sdpa"
logger.info(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}")
# Load model
try:
if self.device == "mps":
self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
"vibevoice/VibeVoice-1.5B",
torch_dtype=load_dtype,
attn_implementation=attn_impl_primary,
device_map=None,
)
self.model.to("mps")
elif self.device == "cuda":
self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
"vibevoice/VibeVoice-1.5B",
torch_dtype=load_dtype,
device_map="cuda",
attn_implementation=attn_impl_primary,
)
else:
self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
"vibevoice/VibeVoice-1.5B",
torch_dtype=load_dtype,
device_map="cpu",
attn_implementation=attn_impl_primary,
)
except Exception as e:
if attn_impl_primary == 'flash_attention_2':
logger.warning(f"Error with flash_attention_2: {e}")
logger.info("Falling back to attention implementation: sdpa")
fallback_attn = "sdpa"
self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
"vibevoice/VibeVoice-1.5B",
torch_dtype=load_dtype,
device_map=(self.device if self.device in ("cuda", "cpu") else None),
attn_implementation=fallback_attn,
)
if self.device == "mps":
self.model.to("mps")
else:
raise e
self.model.eval()
# Use SDE solver by default
self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
self.model.model.noise_scheduler.config,
algorithm_type='sde-dpmsolver++',
beta_schedule='squaredcos_cap_v2'
)
self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
if hasattr(self.model.model, 'language_model'):
logger.info(f"Language model attention: {self.model.model.language_model.config._attn_implementation}")
self.is_initialized = True
logger.info("VibeVoice model loaded successfully!")
except Exception as e:
logger.error(f"Error loading VibeVoice model: {e}")
raise
def _setup_voice_presets(self):
"""Setup voice presets by scanning the voices directory."""
# Look for voices directory in multiple possible locations
possible_voice_dirs = [
os.path.join(os.path.dirname(__file__), "voices"), # /voicebot/bots/voices/
os.path.join(os.path.dirname(__file__), "..", "VibeVoice", "demo", "voices"), # /voicebot/VibeVoice/demo/voices/
"/voicebot/VibeVoice/demo/voices", # Absolute path
]
voices_dir = None
for possible_dir in possible_voice_dirs:
if os.path.exists(possible_dir):
voices_dir = possible_dir
break
# Check if voices directory exists
if not voices_dir:
logger.warning(f"Warning: Voices directory not found in any of: {possible_voice_dirs}")
self.voice_presets = {}
self.available_voices = {}
self.speaker_mapping = {}
return
# Scan the voices directory for supported audio files
self.voice_presets = {}
# Get all supported audio files
audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
audio_files = [f for f in os.listdir(voices_dir)
if f.lower().endswith(audio_extensions) and os.path.isfile(os.path.join(voices_dir, f))]
# Create dictionary with filename (without extension) as key
for audio_file in audio_files:
# Remove extension to get the name
name = os.path.splitext(audio_file)[0]
# Create full path
full_path = os.path.join(voices_dir, audio_file)
self.voice_presets[name] = full_path
# Sort the voice presets alphabetically by name for better UI
self.voice_presets = dict(sorted(self.voice_presets.items()))
# Filter out voices that don't exist (this is now redundant but kept for safety)
self.available_voices = {
name: path for name, path in self.voice_presets.items()
if os.path.exists(path)
}
# Map speaker numbers (1, 2, 3, 4) to available voice files
self.speaker_mapping = {}
available_voice_names = list(self.available_voices.keys())
for i in range(1, 5): # Support speakers 1-4
if i <= len(available_voice_names):
voice_name = available_voice_names[i-1] # 0-indexed
self.speaker_mapping[str(i)] = voice_name
logger.info(f"Mapped Speaker {i} to voice '{voice_name}'")
else:
logger.warning(f"No voice file available for Speaker {i}")
if not self.available_voices:
logger.warning("No voice presets found. Please add audio files to the voices directory.")
else:
logger.info(f"Found {len(self.available_voices)} voice files in {voices_dir}")
logger.info(f"Available voices: {', '.join(self.available_voices.keys())}")
logger.info(f"Speaker mapping: {self.speaker_mapping}")
def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
"""Read and preprocess audio file."""
try:
import soundfile as sf
wav, sr = sf.read(audio_path)
if len(wav.shape) > 1:
wav = np.mean(wav, axis=1)
if sr != target_sr:
wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
return wav
except Exception as e:
logger.error(f"Error reading audio {audio_path}: {e}")
return np.array([])
def generate_speech(self, text: str, speaker: str = "1", cfg_scale: float = 1.3) -> Optional[np.ndarray]:
"""Generate speech using the AudioStreamer and return a single concatenated numpy array.
This removes the old synchronous model.generate path and uses the streamer-based
generation even for blocking calls. Returns None if generation isn't possible.
"""
# Must have model initialized and streamer available
if not self.is_initialized:
logger.error("VibeVoice TTS: Model not initialized - cannot generate speech synchronously")
return None
try:
# Prepare formatted text and voice samples (same as demo)
formatted_text = f"Speaker {speaker}: {text}"
voice_samples = []
if speaker in self.speaker_mapping:
voice_name = self.speaker_mapping[speaker]
if voice_name in self.available_voices:
audio_path = self.available_voices[voice_name]
audio_data = self.read_audio(audio_path)
if len(audio_data) > 0:
voice_samples.append(audio_data)
else:
voice_samples.append([])
else:
voice_samples.append([])
else:
voice_samples.append([])
inputs = self.processor( # type: ignore
text=[formatted_text],
voice_samples=[voice_samples],
padding=True,
return_tensors="pt"
)
# Move tensors to device
target_device = self.device if self.device in ("cuda", "mps") else "cpu"
for k, v in inputs.items():
if torch.is_tensor(v):
inputs[k] = v.to(target_device)
# Create streamer and run generation
real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
audio_streamer = ProxyAudioStreamer(real_streamer, session_name=getattr(self, "session_name", None))
with torch.no_grad():
try:
self.model.generate( # type: ignore
**inputs,
max_new_tokens=None,
cfg_scale=cfg_scale,
tokenizer=self.processor.tokenizer, # type: ignore
generation_config={'do_sample': False},
verbose=False,
streamer=audio_streamer,
)
finally:
# ensure streamer end if model.generate returns
try:
audio_streamer.end()
except Exception:
pass
# Collect streamed chunks
collected = []
for audio_chunk in audio_streamer.get_stream(0):
try:
if torch.is_tensor(audio_chunk):
if audio_chunk.dtype == torch.bfloat16:
audio_chunk = audio_chunk.float()
audio_np = audio_chunk.cpu().numpy().astype(np.float32)
else:
audio_np = np.array(audio_chunk, dtype=np.float32)
if audio_np.ndim > 1:
audio_np = audio_np.squeeze()
collected.append(audio_np)
except Exception as e:
logger.error(f"VibeVoice TTS: Error collecting chunk: {e}")
if not collected:
logger.error("VibeVoice TTS: No audio chunks received from streamer")
return None
audio = np.concatenate(collected)
# Mix with background noise if enabled
noise_type = self.config.get('background_noise_type', 'none')
noise_volume = self.config.get('background_noise_volume', 0.0)
audio = self.mix_audio_with_background_noise(audio, noise_type, noise_volume)
# Resample to 16kHz for compatibility with existing audio pipeline
audio_16k = librosa.resample(audio, orig_sr=24000, target_sr=16000)
return audio_16k.astype(np.float32)
except Exception as e:
logger.error(f"VibeVoice TTS: Error generating speech via streamer: {e}")
return None
def generate_background_noise(self, duration_seconds: float, noise_type: str = "white", volume: float = 0.01, sample_rate: Optional[int] = None) -> np.ndarray:
"""Generate background noise of specified type and duration."""
if sample_rate is None:
sample_rate = self.sample_rate
if noise_type == "none":
return np.zeros(int(duration_seconds * sample_rate), dtype=np.float32)
num_samples = int(duration_seconds * sample_rate)
if noise_type == "white":
# White noise - equal power across all frequencies
noise = np.random.normal(0, 1, num_samples).astype(np.float32)
elif noise_type == "pink":
# Pink noise - 1/f frequency response (approximated)
white = np.random.normal(0, 1, num_samples).astype(np.float32)
# Simple pink noise approximation using IIR filter
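# The b/a values below are a commonly used low-order IIR approximation of a 1/f spectrum;
# scipy.signal.lfilter(b, a, white) would compute essentially the same filter
# (apart from the first few samples, which the explicit loop leaves at zero).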
b = [0.049922035, -0.095993537, 0.050612699, -0.004408786]
a = [1, -2.494956002, 2.017265875, -0.522189400]
noise = np.zeros_like(white)
for i in range(len(b), len(white)):
noise[i] = b[0] * white[i] + b[1] * white[i-1] + b[2] * white[i-2] + b[3] * white[i-3] - a[1] * noise[i-1] - a[2] * noise[i-2] - a[3] * noise[i-3]
elif noise_type == "brown":
# Brown noise - 1/f² frequency response (integrated white noise)
white = np.random.normal(0, 1, num_samples).astype(np.float32)
noise = np.cumsum(white)
# Normalize to prevent drift
noise = (noise - np.mean(noise)) / np.std(noise)
else:
# Default to white noise
noise = np.random.normal(0, 1, num_samples).astype(np.float32)
# Apply volume
noise *= volume
return noise
def mix_audio_with_background_noise(self, audio: np.ndarray, noise_type: str = "white", volume: float = 0.01) -> np.ndarray:
"""Mix generated audio with background noise."""
# Default to disabled when not present in config to avoid unexpected noise
if not self.config.get('background_noise_enabled', False):
return audio
# Generate background noise for the duration of the audio using the TTS sample rate
duration_seconds = len(audio) / self.sample_rate
background_noise = self.generate_background_noise(duration_seconds, noise_type, volume, self.sample_rate)
# Mix audio with background noise
mixed_audio = audio + background_noise
# Normalize to prevent clipping
max_val = np.max(np.abs(mixed_audio))
if max_val > 1.0:
mixed_audio /= max_val
return mixed_audio
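# Example (sketch): using the TTS wrapper standalone, assuming the VibeVoice weights and a
# populated voices directory are available. generate_speech() returns 16 kHz float32 mono
# audio, or None on failure.
#
#   tts = VibeVoiceTTS(device="cpu", inference_steps=10)
#   audio_16k = tts.generate_speech("Hello there", speaker="1")
#   if audio_16k is not None:
#       import soundfile as sf
#       sf.write("hello.wav", audio_16k, 16000)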
class VibeVoiceVideoTrack(MediaStreamTrack):
"""Video track that displays text being spoken."""
kind = "video"
def __init__(self, clock, config: Dict[str, Any], session_name: Optional[str] = None):
super().__init__()
self.clock = clock
self.config = config
# Keep session_name for looking up waveform buffers and status
self.session_name = session_name or config.get('session_name') or f"VibeVoice:{int(time.time())}"
self.width = config.get('width', 640)
self.height = config.get('height', 480)
self.fps = config.get('fps', 15)
# Text display state
self.current_text = ""
self.text_queue = queue.Queue()
self.display_start_time = 0
self.display_duration = 3.0 # seconds to display each text
self.frame_count = 0
# Font settings
self.font = cv2.FONT_HERSHEY_SIMPLEX
self.font_scale = min(self.width, self.height) / 800
self.font_thickness = max(1, int(self.font_scale * 2))
def update_text(self, text: str):
"""Update the text to display."""
self.text_queue.put(text)
logger.info(f"VibeVoice video: Queued text '{text}'")
def update_config(self, config_updates: Dict[str, Any]) -> bool:
"""Update video configuration."""
try:
self.config.update(config_updates)
if 'width' in config_updates:
self.width = config_updates['width']
if 'height' in config_updates:
self.height = config_updates['height']
if 'fps' in config_updates:
self.fps = config_updates['fps']
return True
except Exception as e:
logger.error(f"Error updating video config: {e}")
return False
async def next_timestamp(self) -> Tuple[int, float]:
"""Get next timestamp for video frame."""
pts = int(self.frame_count * (90000 / self.fps))
time_base = 1 / 90000
return pts, time_base
async def recv(self) -> VideoFrame:
"""Generate video frame with current text."""
# Update current text if needed
current_time = time.time()
if (not self.current_text or
current_time - self.display_start_time > self.display_duration):
try:
self.current_text = self.text_queue.get_nowait()
self.display_start_time = current_time
logger.info(f"VibeVoice video: Displaying '{self.current_text}'")
except queue.Empty:
self.current_text = ""
# Create frame
frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
if self.current_text:
# Add background
cv2.rectangle(frame, (0, 0), (self.width, self.height), (0, 0, 0), -1)
# Split text into lines if too long
words = self.current_text.split()
lines = []
current_line = ""
max_chars_per_line = int(self.width / (self.font_scale * 20))
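# Rough wrap heuristic: assume each character occupies about 20 * font_scale pixels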
for word in words:
if len(current_line + " " + word) <= max_chars_per_line:
current_line += " " + word if current_line else word
else:
if current_line:
lines.append(current_line)
current_line = word
if current_line:
lines.append(current_line)
# Draw text lines
line_height = int(self.font_scale * 40)
total_text_height = len(lines) * line_height
start_y = (self.height - total_text_height) // 2 + line_height
for i, line in enumerate(lines):
text_size = cv2.getTextSize(line, self.font, self.font_scale, self.font_thickness)[0]
text_x = (self.width - text_size[0]) // 2
text_y = start_y + i * line_height
# Add text shadow
cv2.putText(frame, line, (text_x + 2, text_y + 2),
self.font, self.font_scale, (0, 0, 0), self.font_thickness + 1)
# Add main text
cv2.putText(frame, line, (text_x, text_y),
self.font, self.font_scale, (255, 255, 255), self.font_thickness)
else:
# Default background when no text
cv2.putText(frame, "VibeVoice TTS", (50, self.height // 2),
self.font, self.font_scale * 2, (255, 255, 255), self.font_thickness)
# Draw waveform and status overlays from shared WaveformVideoTrack buffers
try:
pname = self.session_name
buf = WaveformVideoTrack.buffer.get(pname, None)
status = WaveformVideoTrack.speech_status.get(pname, {})
# Draw small status box in top-left
status_text = "Idle"
if status.get('is_processing'):
status_text = "Processing..."
elif status.get('is_speech'):
status_text = "Speaking"
elif buf is not None and len(buf) > 0:
# buffered seconds approx
sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000))
buffered_sec = len(buf) / float(sr) if sr > 0 else 0.0
status_text = f"Buffered: {buffered_sec:.1f}s"
box_w = int(self.width * 0.28)
box_h = int(self.height * 0.12)
cv2.rectangle(frame, (10, 10), (10 + box_w, 10 + box_h), (50, 50, 50), -1)
cv2.putText(frame, status_text, (20, 10 + int(box_h/2)), self.font, self.font_scale, (200, 200, 200), self.font_thickness)
# Draw small energy meter
energy = status.get('energy', 0.0)
meter_h = int(box_h * 0.4)
meter_w = int(box_w * 0.6)
mx = 20
my = 10 + box_h - 5
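# energy * 50 clamped to 1.0: an RMS of ~0.02 fills the meter completely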
filled = int(min(1.0, energy * 50.0) * meter_w)
cv2.rectangle(frame, (mx, my - meter_h), (mx + meter_w, my), (80, 80, 80), -1)
cv2.rectangle(frame, (mx, my - meter_h), (mx + filled, my), (0, 200, 0), -1)
# Draw waveform at bottom area
if buf is not None and buf.size > 4:
sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000))
# Use last N samples corresponding to width pixels
samples_to_show = min(buf.size, max(1, int(sr * 5))) # show up to last 5s
slice_buf = buf[-samples_to_show:]
# Downsample to width points
idx = (np.linspace(0, samples_to_show - 1, num=self.width)).astype(np.int32)
waveform = slice_buf[idx]
# Normalize waveform to -1..1
maxv = np.max(np.abs(waveform)) if waveform.size > 0 else 1.0
if maxv <= 0:
maxv = 1.0
waveform = waveform / maxv
# Map to pixel coordinates in bottom strip
wf_h = int(self.height * 0.22)
wf_y0 = self.height - wf_h - 10
pts = []
for i, v in enumerate(waveform):
px = int(i * (self.width / len(waveform)))
py = int(wf_y0 + (wf_h / 2) * (1 - v))
pts.append((px, py))
if len(pts) >= 2:
cv2.polylines(frame, [np.array(pts, dtype=np.int32)], False, (100, 200, 255), 1)
# Fill under curve for nicer look
fill_pts = pts + [(self.width - 1, wf_y0 + wf_h), (0, wf_y0 + wf_h)]
cv2.fillPoly(frame, [np.array(fill_pts, dtype=np.int32)], (30, 60, 80))
except Exception:
# Non-critical rendering failure shouldn't break video
pass
video_frame = VideoFrame.from_ndarray(frame, format="bgr24")
# Stamp the frame with the 90 kHz RTP clock so aiortc can pace and encode it correctly
video_frame.pts = int(self.frame_count * (90000 / self.fps))
video_frame.time_base = fractions.Fraction(1, 90000)
self.frame_count += 1
return video_frame
class VibeVoiceAudioTrack(MediaStreamTrack):
"""Audio track that plays TTS speech."""
kind = "audio"
def __init__(self, clock, config: Dict[str, Any], tts_engine: VibeVoiceTTS, session_name: Optional[str] = None):
super().__init__()
self.clock = clock
self.config = config
self.tts = tts_engine
self.sample_rate = config.get('sample_rate', 16000)
self.samples_per_frame = config.get('samples_per_frame', 960) # 60ms at 16kHz
# Audio playback state
self.audio_queue = queue.Queue()
self.current_audio = None
self.audio_position = 0
self.is_speaking = False
self.speaker = config.get('speaker', '1')  # numeric speaker IDs ('1'-'4') match the voice mapping
# Audio buffer for mixing multiple TTS segments
self.audio_buffer = np.array([], dtype=np.float32)
self.buffer_lock = threading.Lock()
# Optional looping and debug options
self.loop = config.get('loop', True)
self.debug_save_wav = config.get('debug_save_wav', True)
# Keep the last fully-generated audio to enable looping
self.last_generated_audio = np.array([], dtype=np.float32)
# Protect last_generated_audio updates
self._last_gen_lock = threading.Lock()
# Track total samples generated for proper PTS calculation
self._samples_generated = 0
# Optional session name used to publish waveform data for visualization
self.session_name = session_name or f"VibeVoice:{int(time.time())}"
def update_config(self, config_updates: Dict[str, Any]) -> bool:
"""Update audio configuration."""
try:
self.config.update(config_updates)
if 'sample_rate' in config_updates:
self.sample_rate = config_updates['sample_rate']
if 'samples_per_frame' in config_updates:
self.samples_per_frame = config_updates['samples_per_frame']
if 'speaker' in config_updates:
self.speaker = config_updates['speaker']
if 'loop' in config_updates:
self.loop = bool(config_updates['loop'])
logger.info(f"🔁 Looping {'enabled' if self.loop else 'disabled'} for session {self.session_name}")
if 'debug_save_wav' in config_updates:
self.debug_save_wav = bool(config_updates['debug_save_wav'])
logger.info(f"🐞 Debug save wav {'enabled' if self.debug_save_wav else 'disabled'} for session {self.session_name}")
# Log background noise configuration updates
background_noise_updated = False
if 'background_noise_enabled' in config_updates:
logger.info(f"🎵 Background noise enabled: {config_updates['background_noise_enabled']}")
background_noise_updated = True
if 'background_noise_type' in config_updates:
logger.info(f"🎵 Background noise type: {config_updates['background_noise_type']}")
background_noise_updated = True
if 'background_noise_volume' in config_updates:
logger.info(f"🎵 Background noise volume: {config_updates['background_noise_volume']}")
background_noise_updated = True
if background_noise_updated:
logger.info("🎵 Background noise configuration updated - changes will take effect on next audio frame")
return True
except Exception as e:
logger.error(f"Error updating audio config: {e}")
return False
def speak_text(self, text: str, cfg_scale: Optional[float] = None):
"""Queue text for speech synthesis."""
if cfg_scale is None:
cfg_scale = 1.3 # Default value
logger.info(f"VibeVoice audio: Starting background TTS generation for '{text}' with cfg_scale={cfg_scale}")
# Start TTS generation in a background thread
thread = threading.Thread(
target=self._generate_tts_background,
args=(text, self.speaker, cfg_scale),
daemon=True
)
thread.start()
def _generate_tts_background(self, text: str, speaker: str, cfg_scale: float):
"""Generate TTS in background thread and add to audio buffer."""
try:
logger.info(f"VibeVoice audio: Background TTS generation started for '{text}'")
# Log some diagnostic info about the TTS engine state
try:
logger.info(f"VibeVoice audio: TTS engine initialized={getattr(self.tts, 'is_initialized', False)}, device={getattr(self.tts, 'device', None)}, tts_sample_rate={getattr(self.tts, 'sample_rate', None)}")
# available_voices and speaker_mapping may be large; log summaries
try:
avv = getattr(self.tts, 'available_voices', {})
smap = getattr(self.tts, 'speaker_mapping', {})
logger.info(f"VibeVoice audio: available_voices={list(avv.keys())[:5]} (count={len(avv)}), speaker_mapping_count={len(smap)}")
except Exception:
pass
except Exception:
pass
# Mark processing state for video overlay
try:
WaveformVideoTrack.speech_status[self.session_name] = WaveformVideoTrack.speech_status.get(self.session_name, {})
WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = True
except Exception:
pass
# Require the model to be initialized before attempting streaming generation
if not self.tts.is_initialized:
logger.error("VibeVoice audio: TTS model not initialized - background generation disabled")
# Clear the processing flag set above so the overlay does not stay stuck on "Processing..."
if self.session_name in WaveformVideoTrack.speech_status:
WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False
return
# Prepare formatted text and inputs (same expectations as generate_speech)
formatted_text = f"Speaker {speaker}: {text}"
voice_samples = []
if speaker in self.tts.speaker_mapping:
voice_name = self.tts.speaker_mapping[speaker]
if voice_name in self.tts.available_voices:
audio_path = self.tts.available_voices[voice_name]
audio_data = self.tts.read_audio(audio_path)
if len(audio_data) > 0:
voice_samples.append(audio_data)
else:
voice_samples.append([])
else:
voice_samples.append([])
else:
voice_samples.append([])
inputs = self.tts.processor( # type: ignore
text=[formatted_text],
voice_samples=[voice_samples],
padding=True,
return_tensors="pt"
)
# Move tensors to device
target_device = self.tts.device if self.tts.device in ("cuda", "mps") else "cpu"
for k, v in inputs.items():
if torch.is_tensor(v):
inputs[k] = v.to(target_device)
# Log a summary of inputs for diagnostic purposes
try:
inp_summary = {}
for k, v in inputs.items():
if torch.is_tensor(v):
inp_summary[k] = f"tensor(shape={tuple(v.shape)}, dtype={v.dtype})"
else:
try:
inp_summary[k] = f"{type(v).__name__}(len={len(v)})"
except Exception:
inp_summary[k] = type(v).__name__
logger.info(f"VibeVoice audio: Input summary for generation: {inp_summary}")
except Exception:
pass
# Create audio streamer and start model.generate in a separate thread
real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name)
def _run_generate():
try:
logger.info(f"VibeVoice audio: model.generate starting for session {self.session_name}")
with torch.no_grad():
self.tts.model.generate( # type: ignore
**inputs,
max_new_tokens=None,
cfg_scale=cfg_scale,
tokenizer=self.tts.processor.tokenizer, # type: ignore
generation_config={'do_sample': False},
verbose=False,
streamer=audio_streamer,
)
except Exception as e:
logger.error(f"VibeVoice audio: Error during model.generate: {e}")
finally:
# Ensure streamer is ended
try:
audio_streamer.end()
except Exception:
pass
logger.info(f"VibeVoice audio: model.generate finished for session {self.session_name}")
gen_thread = threading.Thread(target=_run_generate, daemon=True)
gen_thread.start()
# Consume chunks from streamer and append to audio buffer as they arrive
generated_chunks = []
chunk_count = 0
total_samples_streamed = 0
logger.info(f"VibeVoice audio: Audio streamer started for session {self.session_name}")
try:
logger.info(f"VibeVoice audio: audio_streamer repr: {repr(audio_streamer)[:400]}")
gs = None
try:
gs = audio_streamer.get_stream(0)
logger.info(f"VibeVoice audio: get_stream returned object type: {type(gs)}")
except Exception as _e:
logger.error(f"VibeVoice audio: calling audio_streamer.get_stream raised: {_e}")
gs = None
except Exception:
gs = None
if gs is None:
logger.warning(f"VibeVoice audio: audio_streamer.get_stream did not return a stream for session {self.session_name}")
iterator = []
else:
iterator = gs
for audio_chunk in iterator:
try:
# Convert tensor to numpy if needed
if torch.is_tensor(audio_chunk):
if audio_chunk.dtype == torch.bfloat16:
audio_chunk = audio_chunk.float()
audio_np = audio_chunk.cpu().numpy().astype(np.float32)
else:
audio_np = np.array(audio_chunk, dtype=np.float32)
# Squeeze to 1D if needed
if audio_np.ndim > 1:
audio_np = audio_np.squeeze()
# Resample from model sampling rate (usually 24000) to track sample rate
if hasattr(self.tts, 'sample_rate') and self.tts.sample_rate != self.sample_rate:
try:
audio_np = librosa.resample(audio_np, orig_sr=self.tts.sample_rate, target_sr=self.sample_rate)
except Exception:
# If resample fails, keep original chunk
pass
# Append to internal buffer
with self.buffer_lock:
if len(self.audio_buffer) == 0:
self.audio_buffer = audio_np
else:
self.audio_buffer = np.concatenate([self.audio_buffer, audio_np])
# Also collect into generated_chunks for possible looping/debug save
try:
generated_chunks.append(audio_np.astype(np.float32))
except Exception:
pass
total_samples_streamed += len(audio_np)
chunk_count += 1
# Log every few chunks to avoid log spam
if chunk_count % 5 == 0:
logger.info(f"VibeVoice audio: Streamed {total_samples_streamed} samples so far for session {self.session_name} (chunks={chunk_count})")
else:
logger.debug(f"VibeVoice audio: Streamed {len(audio_np)} samples to buffer (total buffer: {len(self.audio_buffer)})")
# Also publish into the global waveform buffer used by WaveformVideoTrack
try:
if WaveformVideoTrack is not None:
pname = self.session_name
# Ensure buffer key exists
if pname not in WaveformVideoTrack.buffer:
WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32)
# Append to shared waveform buffer
WaveformVideoTrack.buffer[pname] = np.concatenate([
WaveformVideoTrack.buffer[pname], audio_np.astype(np.float32)
])
# Ensure sample rate is set for this session
WaveformVideoTrack.sample_rates[pname] = self.sample_rate
# Limit buffer to last 10 seconds for this track
max_samples = int(self.sample_rate * 10)
if len(WaveformVideoTrack.buffer[pname]) > max_samples:
WaveformVideoTrack.buffer[pname] = WaveformVideoTrack.buffer[pname][-max_samples:]
# Update a lightweight speech_status for display
energy = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2))) if audio_np.size > 0 else 0.0
# Approximate zero-crossing rate
try:
if audio_np.size > 1:
zcr = float(np.mean(np.abs(np.diff(np.sign(audio_np))) > 0))
else:
zcr = 0.0
except Exception:
zcr = 0.0
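# Fixed gate: RMS energy above 0.005 (about -46 dBFS) is treated as active speech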
is_speech = energy > 0.005
WaveformVideoTrack.speech_status[pname] = {
'is_speech': bool(is_speech),
'energy': float(energy),
'zcr': float(zcr),
'centroid': 0.0,
'rolloff': 0.0,
'flux': 0.0,
'harmonicity': 0.0,
'noise_floor_energy': 0.0,
'adaptive_threshold': 0.0,
'energy_check': bool(energy > 0.002),
'zcr_check': bool(zcr > 0.01),
'spectral_check': False,
'harmonic_check': False,
'temporal_consistency': True,
'is_processing': True,
'is_playing': False,
}
except Exception:
# Non-critical - don't break TTS on visualization failures
pass
except Exception as e:
logger.error(f"VibeVoice audio: Error processing audio chunk from streamer: {e}")
# Ensure generation thread finishes
gen_thread.join(timeout=5.0)
# If generation thread is still alive after join, log a warning
if gen_thread.is_alive():
logger.warning(f"VibeVoice audio: generation thread still alive after join for session {self.session_name}")
# When generation completes, store last_generated_audio for looping and optionally save debug WAV
logger.info(f"VibeVoice audio: Generation completed for session {self.session_name}. total_samples_streamed={total_samples_streamed}, chunks={chunk_count}")
# If no chunks were received, emit a diagnostic warning with some state to help debugging
if chunk_count == 0:
try:
# Provide more diagnostic info: inputs summary and streamer introspection
try:
sdi = {
'repr': repr(audio_streamer)[:400],
'dir': [n for n in dir(audio_streamer) if not n.startswith('_')][:40]
}
except Exception:
sdi = {'repr': 'unavailable', 'dir': []}
try:
logger.warning(
f"VibeVoice audio: No audio chunks were streamed for session {self.session_name}. "
f"is_initialized={getattr(self.tts, 'is_initialized', False)}, model_present={hasattr(self.tts, 'model')} ; "
f"audio_streamer={sdi}"
)
except Exception:
logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (diagnostics failed)")
except Exception:
logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (additional diagnostics unavailable)")
# Fallback: attempt a synchronous generation that returns a full numpy audio array
try:
logger.info(f"VibeVoice audio: Attempting synchronous fallback generation for session {self.session_name}")
fallback_audio = None
try:
fallback_audio = self.tts.generate_speech(text, speaker, cfg_scale=cfg_scale)
except Exception as e:
logger.error(f"VibeVoice audio: synchronous fallback generation raised: {e}")
if fallback_audio is not None and getattr(fallback_audio, 'size', 0) > 0:
try:
fa = fallback_audio.astype(np.float32)
except Exception:
fa = np.array(fallback_audio, dtype=np.float32)
# Resample if needed
try:
tts_sr = getattr(self.tts, 'sample_rate', 24000)
if tts_sr != self.sample_rate:
fa = librosa.resample(fa, orig_sr=tts_sr, target_sr=self.sample_rate)
except Exception:
pass
# Append into internal buffer and last_generated_audio
with self.buffer_lock:
if len(self.audio_buffer) == 0:
self.audio_buffer = fa
else:
self.audio_buffer = np.concatenate([self.audio_buffer, fa])
with self._last_gen_lock:
self.last_generated_audio = fa.copy()
# Publish to waveform buffer
try:
pname = self.session_name
if pname not in WaveformVideoTrack.buffer:
WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32)
WaveformVideoTrack.buffer[pname] = np.concatenate([WaveformVideoTrack.buffer[pname], fa.astype(np.float32)])
WaveformVideoTrack.sample_rates[pname] = self.sample_rate
except Exception:
pass
# Optionally save debug wav
if self.debug_save_wav:
try:
try:
import soundfile as sf
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
sf.write(fname, fa, samplerate=self.sample_rate)
logger.info(f"🐞 Saved fallback generated wav to {fname} (soundfile)")
except Exception:
try:
from scipy.io import wavfile
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
wavfile.write(fname, self.sample_rate, (fa * 32767).astype('int16'))
logger.info(f"🐞 Saved fallback generated wav to {fname} (scipy)")
except Exception:
try:
import wave
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
with wave.open(fname, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(self.sample_rate)
int_data = (fa * 32767).astype('int16')
wf.writeframes(int_data.tobytes())
logger.info(f"🐞 Saved fallback generated wav to {fname} (wave)")
except Exception as e:
logger.error(f"Error saving fallback debug wav (all methods failed): {e}")
except Exception as e:
logger.error(f"Error saving fallback debug wav: {e}")
logger.info(f"VibeVoice audio: Fallback synchronous generation successful for session {self.session_name} (samples={len(fa)})")
else:
logger.warning(f"VibeVoice audio: Fallback synchronous generation produced no audio for session {self.session_name}")
except Exception as e:
logger.error(f"VibeVoice audio: Exception during synchronous fallback generation: {e}")
try:
if len(generated_chunks) > 0:
try:
all_gen = np.concatenate(generated_chunks).astype(np.float32)
except Exception:
all_gen = np.array([], dtype=np.float32)
with self._last_gen_lock:
self.last_generated_audio = all_gen.copy()
# Optionally save to disk for debugging
if self.debug_save_wav:
try:
try:
import soundfile as sf
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
sf.write(fname, all_gen, samplerate=self.sample_rate)
logger.info(f"🐞 Saved generated wav to {fname} (soundfile)")
except Exception:
# Try scipy fallback
try:
from scipy.io import wavfile
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
# scipy expects int16
wavfile.write(fname, self.sample_rate, (all_gen * 32767).astype('int16'))
logger.info(f"🐞 Saved generated wav to {fname} (scipy)")
except Exception:
# Ultimate fallback: write raw wave via wave module
try:
import wave
fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
with wave.open(fname, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(self.sample_rate)
int_data = (all_gen * 32767).astype('int16')
wf.writeframes(int_data.tobytes())
logger.info(f"🐞 Saved generated wav to {fname} (wave)")
except Exception as e:
logger.error(f"Error saving debug wav (all methods failed): {e}")
except Exception as e:
logger.error(f"Error saving debug wav: {e}")
except Exception:
pass
# Clear processing flag when generation completes
try:
if self.session_name in WaveformVideoTrack.speech_status:
WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False
except Exception:
pass
except Exception as e:
logger.error(f"VibeVoice audio: Error in background TTS generation: {e}")
def _get_samples_from_buffer(self, num_samples: int) -> np.ndarray:
"""Get samples from audio buffer, removing them from buffer."""
# Try to refill from last_generated_audio if looping is enabled
with self._last_gen_lock:
last_gen = self.last_generated_audio.copy() if getattr(self, 'last_generated_audio', None) is not None else np.array([], dtype=np.float32)
with self.buffer_lock:
if len(self.audio_buffer) == 0:
# If we're configured to loop and have a generated sample, refill the buffer
if getattr(self, 'loop', False) and last_gen.size > 0:
try:
# Repeat last_gen as needed to reach at least num_samples
repeats = int(math.ceil(float(num_samples) / float(len(last_gen)))) if len(last_gen) > 0 else 1
refill = np.tile(last_gen, repeats)
self.audio_buffer = refill.astype(np.float32)
logger.debug(f"VibeVoice audio: Refilled audio_buffer from last_generated_audio (len={len(last_gen)}) repeats={repeats}")
except Exception:
# Fallback to silence on any failure
self.audio_buffer = np.zeros(num_samples, dtype=np.float32)
else:
return np.zeros(num_samples, dtype=np.float32)
if len(self.audio_buffer) >= num_samples:
samples = self.audio_buffer[:num_samples]
self.audio_buffer = self.audio_buffer[num_samples:]
return samples
else:
# Return remaining samples and pad with zeros
samples = self.audio_buffer
padding = np.zeros(num_samples - len(self.audio_buffer), dtype=np.float32)
self.audio_buffer = np.array([], dtype=np.float32)
return np.concatenate([samples, padding])
async def next_timestamp(self) -> Tuple[int, float]:
"""Get next timestamp for audio frame."""
pts = self._samples_generated
time_base = 1 / self.sample_rate
return pts, time_base
async def recv(self) -> AudioFrame:
"""Generate audio frame with TTS speech from buffer."""
# Get samples from buffer
samples = self._get_samples_from_buffer(self.samples_per_frame)
# If no TTS audio available, generate background noise
if np.all(samples == 0):
# Default to disabled when not present in config to avoid unexpected noise
if self.config.get('background_noise_enabled', False):
noise_type = self.config.get('background_noise_type', 'white')
noise_volume = self.config.get('background_noise_volume', 0.01)
# Generate noise for this frame duration
frame_duration = self.samples_per_frame / self.sample_rate
logger.debug(f"🎵 Generating background noise: type={noise_type}, volume={noise_volume}, duration={frame_duration:.3f}s")
background_noise = self.tts.generate_background_noise(frame_duration, noise_type, noise_volume, self.sample_rate)
logger.debug(f"🎵 Generated background noise: {len(background_noise)} samples")
samples = background_noise
else:
# Generate silence if background noise is disabled
logger.debug("🎵 Background noise disabled - generating silence")
samples = np.zeros(self.samples_per_frame, dtype=np.float32)
# Update shared speech_status for visualization: energy + playing flag
try:
energy = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2))) if samples.size > 0 else 0.0
pname = self.session_name
st = WaveformVideoTrack.speech_status.get(pname, {})
st['energy'] = float(energy)
# Consider playing when energy above small threshold
st['is_playing'] = bool(energy > 0.001)
st['is_speech'] = bool(energy > 0.003)
WaveformVideoTrack.speech_status[pname] = st
except Exception:
pass
# Convert to 16-bit PCM
samples_int16 = (samples * 32767).astype(np.int16)
# Create stereo audio (duplicate mono channel)
left = samples_int16
right = samples_int16.copy()
stereo = np.empty(self.samples_per_frame * 2, dtype=np.int16)
stereo[0::2] = left
stereo[1::2] = right
# Create audio frame
frame = AudioFrame.from_ndarray(stereo.reshape(1, -1), format="s16", layout="stereo")
frame.sample_rate = self.sample_rate
frame.pts = self._samples_generated
frame.time_base = fractions.Fraction(1, self.sample_rate)
# Increment sample counter
self._samples_generated += self.samples_per_frame
return frame
class VibeVoiceTTSBot:
"""VibeVoice Text-to-Speech Bot for voicebot framework."""
def __init__(self, session_name: str, config: Optional[Dict[str, Any]] = None):
self.session_name = session_name
self.config = config or {}
# Initialize TTS engine with enhanced parameters
device = self.config.get('device', 'cpu')
inference_steps = self.config.get('inference_steps', 10)
self.tts_engine = VibeVoiceTTS(device=device, inference_steps=inference_steps, config=self.config)
# Store generation parameters
self.cfg_scale = self.config.get('cfg_scale', 1.3)
self.speaker = self.config.get('speaker', '1')
# Initialize media components
self.media_clock = MediaClock()
# Pass session name into video track so it can show per-session waveform/status
self.video_track = VibeVoiceVideoTrack(self.media_clock, self.config, session_name=session_name)
self.audio_track = VibeVoiceAudioTrack(self.media_clock, self.config, self.tts_engine, session_name=session_name)
# Initialize shared waveform store sample rate and empty buffer/status
try:
WaveformVideoTrack.sample_rates[session_name] = self.config.get('sample_rate', 16000)
if session_name not in WaveformVideoTrack.buffer:
WaveformVideoTrack.buffer[session_name] = np.array([], dtype=np.float32)
if session_name not in WaveformVideoTrack.speech_status:
WaveformVideoTrack.speech_status[session_name] = {'is_speech': False, 'energy': 0.0, 'is_processing': False, 'is_playing': False}
except Exception:
pass
# Apply initial configuration values to ensure defaults from schema/config provider
try:
self.update_config(self.config)
except Exception:
# Don't let config application stop initialization
pass
logger.info(f"VibeVoice bot initialized for session {session_name} with cfg_scale={self.cfg_scale}, speaker={self.speaker}")
def get_tracks(self) -> Dict[str, MediaStreamTrack]:
"""Get video and audio tracks."""
return {
"video": self.video_track,
"audio": self.audio_track
}
def handle_chat_message(self, message: ChatMessageModel):
"""Handle incoming chat messages by converting them to speech."""
try:
text = message.message.strip()
if text:
logger.info(f"VibeVoice bot received chat: '{text}' from {message.sender_name}")
# Queue text for both video display and audio speech
self.video_track.update_text(text)
self.audio_track.speak_text(text, self.cfg_scale)
except Exception as e:
logger.error(f"Error handling chat message in VibeVoice bot: {e}")
def update_config(self, config_updates: Dict[str, Any]) -> bool:
"""Update bot configuration."""
try:
self.config.update(config_updates)
# Update TTS-specific parameters
if 'cfg_scale' in config_updates:
self.cfg_scale = config_updates['cfg_scale']
if 'speaker' in config_updates:
self.speaker = config_updates['speaker']
# Update tracks
video_success = self.video_track.update_config(config_updates)
audio_success = self.audio_track.update_config(config_updates)
if video_success and audio_success:
logger.info(f"VibeVoice bot configuration updated: {config_updates}")
return True
else:
logger.warning("Partial configuration update failure in VibeVoice bot")
return False
except Exception as e:
logger.error(f"Error updating VibeVoice bot configuration: {e}")
return False
# Global bot instance registry
_vibevoice_bots: Dict[str, VibeVoiceTTSBot] = {}
def create_vibevoice_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
"""
Create VibeVoice TTS bot tracks.
Args:
session_name: Name for the session
config: Configuration dictionary with options:
- width: video width (default 640)
- height: video height (default 480)
- fps: frames per second (default 15)
- sample_rate: audio sample rate (default 16000)
- samples_per_frame: audio samples per frame (default 960)
- speaker: TTS speaker name (default '1')
- device: device for TTS ('cpu', 'cuda', 'mps')
- cfg_scale: CFG scale for generation (default 1.3)
- inference_steps: Number of inference steps (default 10)
Returns:
Dictionary containing 'video' and 'audio' tracks
"""
if config is None:
config = {}
# Set defaults
default_config = {
'width': 640,
'height': 480,
'fps': 15,
'sample_rate': 16000,
'samples_per_frame': 960,
'speaker': '1',
'device': 'cpu',
'cfg_scale': 1.3,
'inference_steps': 10,
# Explicit background noise defaults - disabled by default
'background_noise_enabled': False,
'background_noise_type': 'none',
'background_noise_volume': 0.0,
}
default_config.update(config)
# Create bot instance
bot = VibeVoiceTTSBot(session_name, default_config)
_vibevoice_bots[session_name] = bot
logger.info(f"Created VibeVoice bot tracks for {session_name}")
return bot.get_tracks()
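# Example (sketch): create tracks for a session, then adjust parameters at runtime.
#
#   tracks = create_vibevoice_bot_tracks("demo-session", {"device": "cuda", "speaker": "2"})
#   handle_config_update("demo-session", {"cfg_scale": 1.5, "background_noise_enabled": False})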
def handle_config_update(session_name: str, config_values: Dict[str, Any]) -> bool:
"""
Handle runtime configuration updates for VibeVoice bot.
Args:
session_name: Name of the session/bot instance
config_values: Dictionary of configuration values to update
Returns:
bool: True if update was successful, False otherwise
"""
try:
if session_name in _vibevoice_bots:
return _vibevoice_bots[session_name].update_config(config_values)
else:
logger.warning(f"No VibeVoice bot found for session {session_name}")
return False
except Exception as e:
logger.error(f"Error updating VibeVoice bot configuration: {e}")
return False
async def handle_chat_message(
chat_message: ChatMessageModel,
send_message_func: Callable[[Union[str, ChatMessageModel]], Awaitable[None]]
) -> Optional[str]:
"""
Handle incoming chat messages and convert them to speech.
Args:
chat_message: The chat message to process
send_message_func: Function to send chat responses (not used by TTS bot)
"""
try:
# Find the bot instance - we need to get session name from somewhere
# For now, we'll use the first available bot instance
if _vibevoice_bots:
session_name = list(_vibevoice_bots.keys())[0]
_vibevoice_bots[session_name].handle_chat_message(chat_message)
logger.info(f"VibeVoice bot processed chat message from {chat_message.sender_name}: '{chat_message.message}'")
else:
logger.warning("No VibeVoice bot instances available to handle chat message")
except Exception as e:
logger.error(f"Error handling chat message in VibeVoice bot: {e}")
# TTS bot doesn't send chat responses, so return None
return None
# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "VibeVoice TTS Bot"
AGENT_DESCRIPTION = "Microsoft VibeVoice text-to-speech bot with visual text display"
def agent_info() -> Dict[str, str]:
"""Return agent metadata for discovery."""
return {
"name": AGENT_NAME,
"description": AGENT_DESCRIPTION,
"has_media": "true",
"configurable": "true",
"chat_enabled": "true"
}
def get_config_schema() -> Dict[str, Any]:
"""Get the configuration schema for the VibeVoice Bot."""
return {
"bot_name": AGENT_NAME,
"version": "1.0",
"parameters": [
{
"name": "width",
"type": "number",
"label": "Video Width",
"description": "Width of the video frame in pixels",
"default_value": 640,
"required": False,
"min_value": 320,
"max_value": 1920,
"step": 1
},
{
"name": "height",
"type": "number",
"label": "Video Height",
"description": "Height of the video frame in pixels",
"default_value": 480,
"required": False,
"min_value": 240,
"max_value": 1080,
"step": 1
},
{
"name": "fps",
"type": "number",
"label": "Frames Per Second",
"description": "Video frame rate",
"default_value": 15,
"required": False,
"min_value": 1,
"max_value": 60,
"step": 1
},
{
"name": "speaker",
"type": "select",
"label": "TTS Speaker",
"description": "Voice to use for text-to-speech",
"default_value": "1",
"required": True,
"options": [
{"value": "1", "label": "Speaker 1 (en-Alice_woman)"},
{"value": "2", "label": "Speaker 2 (en-Carter_man)"},
{"value": "3", "label": "Speaker 3 (en-Frank_man)"},
{"value": "4", "label": "Speaker 4 (en-Mary_woman_bgm)"}
]
},
{
"name": "background_noise_enabled",
"type": "boolean",
"label": "Enable Background Noise",
"description": "Add background noise to ensure continuous audio streaming",
"default_value": False,
"required": False
},
{
"name": "background_noise_type",
"type": "select",
"label": "Background Noise Type",
"description": "Type of background noise to generate",
# 'none' indicates no noise - matches default disabled behavior
"default_value": "none",
"required": False,
"options": [
{"value": "white", "label": "White Noise"},
{"value": "pink", "label": "Pink Noise"},
{"value": "brown", "label": "Brown Noise"},
{"value": "none", "label": "None"}
]
},
{
"name": "background_noise_volume",
"type": "number",
"label": "Background Noise Volume",
"description": "Volume level of background noise (0.0 to 1.0)",
"default_value": 0.01,
"required": False,
"min_value": 0.0,
"max_value": 1.0,
"step": 0.001
},
{
"name": "device",
"type": "select",
"label": "Processing Device",
"description": "Device to use for TTS processing",
"default_value": "cpu",
"required": True,
"options": [
{"value": "cpu", "label": "CPU"},
{"value": "cuda", "label": "CUDA (GPU)"},
{"value": "mps", "label": "MPS (Apple Silicon)"}
]
},
{
"name": "cfg_scale",
"type": "number",
"label": "CFG Scale",
"description": "Classifier-free guidance scale for controlling generation quality",
"default_value": 1.3,
"required": False,
"min_value": 1.0,
"max_value": 2.0,
"step": 0.05
},
{
"name": "inference_steps",
"type": "number",
"label": "Inference Steps",
"description": "Number of denoising steps for audio generation",
"default_value": 10,
"required": False,
"min_value": 5,
"max_value": 50,
"step": 1
}
],
"categories": [
{
"Video Settings": ["width", "height", "fps"]
},
{
"TTS Settings": ["speaker", "device", "cfg_scale", "inference_steps"]
},
{
"Background Noise": ["background_noise_enabled", "background_noise_type", "background_noise_volume"]
}
]
}
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
"""Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
return create_vibevoice_bot_tracks(session_name)
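# --- Minimal local smoke test (sketch) ---
# Hedged example of exercising the factories outside the FastAPI service. It assumes the
# VibeVoice weights can be loaded on this machine and that ChatMessageModel accepts
# `message` and `sender_name` keyword arguments (only those two attributes are read above);
# adjust the construction if the real model signature differs.
if __name__ == "__main__":
    import asyncio

    async def _smoke_test() -> None:
        tracks = create_vibevoice_bot_tracks("local-test", {"device": "cpu"})
        try:
            # Hypothetical constructor arguments; see note above.
            msg = ChatMessageModel(message="Hello from VibeVoice", sender_name="tester")
            _vibevoice_bots["local-test"].handle_chat_message(msg)
        except Exception as exc:
            logger.warning(f"Smoke test could not build/handle a chat message: {exc}")
        # Pull a few frames to confirm both tracks produce media
        for _ in range(5):
            await tracks["audio"].recv()
            await tracks["video"].recv()
        logger.info("VibeVoice smoke test finished")

    asyncio.run(_smoke_test())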