#!/usr/bin/env python3
"""
VibeVoice Text-to-Speech Bot for Voicebot Framework

Integrates Microsoft's VibeVoice TTS with the voicebot framework.
Watches for chat messages and converts them to speech with text display.
"""

import fractions
import math
import os
import queue
import threading
import time
from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, Union

import cv2
import librosa
import numpy as np
import torch
from aiortc import MediaStreamTrack
from av import VideoFrame
from av.audio.frame import AudioFrame

from shared.logger import logger
from shared.models import ChatMessageModel


# Implement a local WaveformVideoTrack-like helper to hold shared waveform buffers
# and lightweight speech status per session. This avoids depending on bots.whisper.
class WaveformVideoTrack:
    """Lightweight shared storage for waveform visualization and speech status.

    This class is not itself a MediaStreamTrack; it is a shared in-memory
    store that the video tracks in this file read from to render waveforms
    and status overlays.
    """

    # session_name -> np.ndarray(float32) containing recent audio samples (mono)
    buffer: Dict[str, np.ndarray] = {}

    # session_name -> dict with status flags (is_speech, energy, is_processing, is_playing, etc.)
    speech_status: Dict[str, Dict[str, Any]] = {}

    # session_name -> sample rate used for that buffer
    sample_rates: Dict[str, int] = {}
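

# Illustrative sketch of how a producer and a renderer share this store.
# The session name "demo" is hypothetical; the real keys are the per-bot
# session names used later in this file:
#
#   session = "demo"
#   WaveformVideoTrack.sample_rates[session] = 16000
#   WaveformVideoTrack.buffer[session] = np.zeros(16000, dtype=np.float32)
#   WaveformVideoTrack.speech_status[session] = {"is_speech": False, "energy": 0.0}
#   # A video track then reads WaveformVideoTrack.buffer[session] on each frame.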


# Proxy wrapper for AudioStreamer to log put() calls and basic stats without
# modifying upstream VibeVoice internals. We wrap any created AudioStreamer
# with this to capture whether model.generate() actually calls put().
class ProxyAudioStreamer:
    def __init__(self, real_streamer, session_name: Optional[str] = None):
        self._real = real_streamer
        self.session_name = session_name or "unknown"
        self.put_calls = 0
        self.total_samples = 0

    def put(self, audio_chunk, *args, **kwargs):
        # Try to measure the number of samples in the chunk for diagnostics
        try:
            if torch.is_tensor(audio_chunk):
                length = int(audio_chunk.numel())
            else:
                arr = np.array(audio_chunk)
                length = int(arr.size)
        except Exception:
            length = -1

        try:
            # Inspect a possible sample_indices positional argument for diagnostics
            si_info = None
            if len(args) >= 1:
                try:
                    si = args[0]
                    if torch.is_tensor(si):
                        si_info = f"tensor(shape={tuple(si.shape)}, min={int(torch.min(si).item())}, max={int(torch.max(si).item())}, unique={int(len(torch.unique(si)))})"
                    else:
                        arrsi = np.array(si)
                        si_info = f"array(shape={arrsi.shape}, min={int(arrsi.min()) if arrsi.size > 0 else -1}, max={int(arrsi.max()) if arrsi.size > 0 else -1}, unique={int(len(np.unique(arrsi))) if arrsi.size > 0 else 0})"
                except Exception:
                    si_info = str(type(args[0]))

            logger.info(f"VibeVoice audio: ProxyAudioStreamer.put called for session {self.session_name} - samples={length} sample_indices={si_info}")
        except Exception:
            pass

        self.put_calls += 1
        if length > 0:
            self.total_samples += length

        return self._real.put(audio_chunk, *args, **kwargs)

    def get_stream(self, *args, **kwargs):
        return self._real.get_stream(*args, **kwargs)

    def end(self, *args, **kwargs):
        return self._real.end(*args, **kwargs)

    def __getattr__(self, name):
        return getattr(self._real, name)
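

# Usage sketch for the proxy (constructor arguments mirror the AudioStreamer
# calls made later in this file; "demo" is a hypothetical session name):
#
#   real = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
#   streamer = ProxyAudioStreamer(real, session_name="demo")
#   # ... pass `streamer` to model.generate(streamer=...) ...
#   logger.info(f"put() called {streamer.put_calls} times, "
#               f"{streamer.total_samples} samples total")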


# Import VibeVoice components
try:
    from vibevoice import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
    from vibevoice.modular.streamer import AudioStreamer
except Exception:
    logger.warning(
        "VibeVoice not available. Install with: "
        "git clone https://github.com/microsoft/VibeVoice.git && cd VibeVoice && pip install -e ."
    )
    raise


class MediaClock:
    """Shared clock for media synchronization."""

    def __init__(self):
        self.t0 = time.perf_counter()

    def now(self) -> float:
        return time.perf_counter() - self.t0


class VibeVoiceTTS:
    """Minimal VibeVoice Text-to-Speech wrapper."""

    def __init__(self, device: str = "cpu", inference_steps: int = 10, config: Optional[Dict[str, Any]] = None):
        self.device = device
        self.inference_steps = inference_steps
        self.config = config or {}
        self.model = None
        self.processor = None
        self.sample_rate = 24000  # VibeVoice outputs 24 kHz audio
        self.is_initialized = False
        self.voice_presets = {}
        self.available_voices = {}
        # Populated by _setup_voice_presets; default to empty so lookups stay
        # safe even if initialization fails early
        self.speaker_mapping = {}

        try:
            self._initialize_model()
            self._setup_voice_presets()
        except Exception as e:
            logger.error(f"Failed to initialize VibeVoice: {e}")

    def _initialize_model(self):
        """Initialize the VibeVoice model with robust device handling."""
        try:
            logger.info("Loading VibeVoice model...")

            # Normalize the occasional 'mpx' alias to 'mps'
            if self.device.lower() == "mpx":
                logger.info("Note: device 'mpx' detected, treating it as 'mps'.")
                self.device = "mps"
            if self.device == "mps" and not torch.backends.mps.is_available():
                logger.warning("Warning: MPS not available. Falling back to CPU.")
                self.device = "cpu"

            logger.info(f"Using device: {self.device}")

            # Load processor
            self.processor = VibeVoiceProcessor.from_pretrained("vibevoice/VibeVoice-1.5B")

            # Decide dtype & attention implementation per device
            if self.device == "mps":
                load_dtype = torch.float32
                attn_impl_primary = "sdpa"
            elif self.device == "cuda":
                load_dtype = torch.bfloat16
                attn_impl_primary = "flash_attention_2"
            else:
                load_dtype = torch.float32
                attn_impl_primary = "sdpa"

            logger.info(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}")

            # Load model
            try:
                if self.device == "mps":
                    self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                        "vibevoice/VibeVoice-1.5B",
                        torch_dtype=load_dtype,
                        attn_implementation=attn_impl_primary,
                        device_map=None,
                    )
                    self.model.to("mps")
                elif self.device == "cuda":
                    self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                        "vibevoice/VibeVoice-1.5B",
                        torch_dtype=load_dtype,
                        device_map="cuda",
                        attn_implementation=attn_impl_primary,
                    )
                else:
                    self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                        "vibevoice/VibeVoice-1.5B",
                        torch_dtype=load_dtype,
                        device_map="cpu",
                        attn_implementation=attn_impl_primary,
                    )
            except Exception as e:
                if attn_impl_primary == "flash_attention_2":
                    logger.warning(f"Error with flash_attention_2: {e}")
                    logger.info("Falling back to attention implementation: sdpa")
                    fallback_attn = "sdpa"
                    self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                        "vibevoice/VibeVoice-1.5B",
                        torch_dtype=load_dtype,
                        device_map=(self.device if self.device in ("cuda", "cpu") else None),
                        attn_implementation=fallback_attn,
                    )
                    if self.device == "mps":
                        self.model.to("mps")
                else:
                    raise

            self.model.eval()

            # Use the SDE solver by default
            self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
                self.model.model.noise_scheduler.config,
                algorithm_type='sde-dpmsolver++',
                beta_schedule='squaredcos_cap_v2'
            )
            self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)

            if hasattr(self.model.model, 'language_model'):
                logger.info(f"Language model attention: {self.model.model.language_model.config._attn_implementation}")

            self.is_initialized = True
            logger.info("VibeVoice model loaded successfully!")

        except Exception as e:
            logger.error(f"Error loading VibeVoice model: {e}")
            raise

    def _setup_voice_presets(self):
        """Set up voice presets by scanning the voices directory."""
        # Look for a voices directory in several possible locations
        possible_voice_dirs = [
            os.path.join(os.path.dirname(__file__), "voices"),  # /voicebot/bots/voices/
            os.path.join(os.path.dirname(__file__), "..", "VibeVoice", "demo", "voices"),  # /voicebot/VibeVoice/demo/voices/
            "/voicebot/VibeVoice/demo/voices",  # Absolute path
        ]

        voices_dir = None
        for possible_dir in possible_voice_dirs:
            if os.path.exists(possible_dir):
                voices_dir = possible_dir
                break

        # Bail out if no voices directory exists
        if not voices_dir:
            logger.warning(f"Warning: Voices directory not found in any of: {possible_voice_dirs}")
            self.voice_presets = {}
            self.available_voices = {}
            self.speaker_mapping = {}
            return

        # Scan for all supported audio files in the voices directory
        self.voice_presets = {}

        audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
        audio_files = [f for f in os.listdir(voices_dir)
                       if f.lower().endswith(audio_extensions) and os.path.isfile(os.path.join(voices_dir, f))]

        # Key each preset by the filename without its extension
        for audio_file in audio_files:
            name = os.path.splitext(audio_file)[0]
            full_path = os.path.join(voices_dir, audio_file)
            self.voice_presets[name] = full_path

        # Sort the voice presets alphabetically by name for a stable UI order
        self.voice_presets = dict(sorted(self.voice_presets.items()))

        # Filter out voices whose files don't exist (redundant after the scan, kept for safety)
        self.available_voices = {
            name: path for name, path in self.voice_presets.items()
            if os.path.exists(path)
        }

        # Map speaker numbers ("1".."4") to available voice files
        self.speaker_mapping = {}
        available_voice_names = list(self.available_voices.keys())
        for i in range(1, 5):  # Support speakers 1-4
            if i <= len(available_voice_names):
                voice_name = available_voice_names[i - 1]  # 0-indexed
                self.speaker_mapping[str(i)] = voice_name
                logger.info(f"Mapped Speaker {i} to voice '{voice_name}'")
            else:
                logger.warning(f"No voice file available for Speaker {i}")

        if not self.available_voices:
            logger.warning("No voice presets found. Please add audio files to the voices directory.")
        else:
            logger.info(f"Found {len(self.available_voices)} voice files in {voices_dir}")
            logger.info(f"Available voices: {', '.join(self.available_voices.keys())}")
            logger.info(f"Speaker mapping: {self.speaker_mapping}")
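
    # Expected (assumed) layout of a voices directory, for illustration.
    # With these four files present, the alphabetical scan above yields:
    #
    #   voices/
    #     en-Alice_woman.wav     -> preset "en-Alice_woman",    Speaker 1
    #     en-Carter_man.wav      -> preset "en-Carter_man",     Speaker 2
    #     en-Frank_man.wav       -> preset "en-Frank_man",      Speaker 3
    #     en-Mary_woman_bgm.wav  -> preset "en-Mary_woman_bgm", Speaker 4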

    def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
        """Read an audio file, downmix to mono, and resample to target_sr."""
        try:
            import soundfile as sf
            wav, sr = sf.read(audio_path)
            if len(wav.shape) > 1:
                wav = np.mean(wav, axis=1)
            if sr != target_sr:
                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
            return wav
        except Exception as e:
            logger.error(f"Error reading audio {audio_path}: {e}")
            return np.array([])

    def generate_speech(self, text: str, speaker: str = "1", cfg_scale: float = 1.3) -> Optional[np.ndarray]:
        """Generate speech via the AudioStreamer and return one concatenated numpy array.

        This replaces the old synchronous model.generate path and uses the
        streamer-based generation even for blocking calls. Returns None if
        generation isn't possible.
        """
        # The model must be initialized and the streamer available
        if not self.is_initialized:
            logger.error("VibeVoice TTS: Model not initialized - cannot generate speech synchronously")
            return None

        try:
            # Prepare formatted text and voice samples (same as the demo)
            formatted_text = f"Speaker {speaker}: {text}"
            voice_samples = []
            if speaker in self.speaker_mapping:
                voice_name = self.speaker_mapping[speaker]
                if voice_name in self.available_voices:
                    audio_path = self.available_voices[voice_name]
                    audio_data = self.read_audio(audio_path)
                    if len(audio_data) > 0:
                        voice_samples.append(audio_data)
                    else:
                        voice_samples.append([])
                else:
                    voice_samples.append([])
            else:
                voice_samples.append([])

            inputs = self.processor(  # type: ignore
                text=[formatted_text],
                voice_samples=[voice_samples],
                padding=True,
                return_tensors="pt"
            )

            # Move tensors to the target device
            target_device = self.device if self.device in ("cuda", "mps") else "cpu"
            for k, v in inputs.items():
                if torch.is_tensor(v):
                    inputs[k] = v.to(target_device)

            # Create the streamer and run generation. VibeVoiceTTS has no
            # session name of its own, so the proxy falls back to its
            # "unknown" label here.
            real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
            audio_streamer = ProxyAudioStreamer(real_streamer, session_name=getattr(self, "session_name", None))

            with torch.no_grad():
                try:
                    self.model.generate(  # type: ignore
                        **inputs,
                        max_new_tokens=None,
                        cfg_scale=cfg_scale,
                        tokenizer=self.processor.tokenizer,  # type: ignore
                        generation_config={'do_sample': False},
                        verbose=False,
                        streamer=audio_streamer,
                    )
                finally:
                    # Ensure the streamer is ended when model.generate returns
                    try:
                        audio_streamer.end()
                    except Exception:
                        pass

            # Collect streamed chunks
            collected = []
            for audio_chunk in audio_streamer.get_stream(0):
                try:
                    if torch.is_tensor(audio_chunk):
                        if audio_chunk.dtype == torch.bfloat16:
                            audio_chunk = audio_chunk.float()
                        audio_np = audio_chunk.cpu().numpy().astype(np.float32)
                    else:
                        audio_np = np.array(audio_chunk, dtype=np.float32)

                    if audio_np.ndim > 1:
                        audio_np = audio_np.squeeze()

                    collected.append(audio_np)
                except Exception as e:
                    logger.error(f"VibeVoice TTS: Error collecting chunk: {e}")

            if not collected:
                logger.error("VibeVoice TTS: No audio chunks received from streamer")
                return None

            audio = np.concatenate(collected)

            # Mix with background noise if enabled
            noise_type = self.config.get('background_noise_type', 'none')
            noise_volume = self.config.get('background_noise_volume', 0.0)
            audio = self.mix_audio_with_background_noise(audio, noise_type, noise_volume)

            # Resample to 16 kHz for compatibility with the existing audio pipeline
            audio_16k = librosa.resample(audio, orig_sr=24000, target_sr=16000)
            return audio_16k.astype(np.float32)

        except Exception as e:
            logger.error(f"VibeVoice TTS: Error generating speech via streamer: {e}")
            return None
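
    # Minimal usage sketch (assumes a populated voices directory):
    #
    #   tts = VibeVoiceTTS(device="cpu")
    #   audio_16k = tts.generate_speech("Hello there", speaker="1")
    #   # audio_16k is a float32 numpy array at 16 kHz, or None on failure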

    def generate_background_noise(self, duration_seconds: float, noise_type: str = "white", volume: float = 0.01, sample_rate: Optional[int] = None) -> np.ndarray:
        """Generate background noise of the specified type and duration."""
        if sample_rate is None:
            sample_rate = self.sample_rate

        if noise_type == "none":
            return np.zeros(int(duration_seconds * sample_rate), dtype=np.float32)

        num_samples = int(duration_seconds * sample_rate)

        if noise_type == "white":
            # White noise - equal power across all frequencies
            noise = np.random.normal(0, 1, num_samples).astype(np.float32)
        elif noise_type == "pink":
            # Pink noise - 1/f power spectrum (approximated)
            white = np.random.normal(0, 1, num_samples).astype(np.float32)
            # Simple pink-noise approximation using an IIR filter
            b = [0.049922035, -0.095993537, 0.050612699, -0.004408786]
            a = [1, -2.494956002, 2.017265875, -0.522189400]
            noise = np.zeros_like(white)
            for i in range(len(b), len(white)):
                noise[i] = (b[0] * white[i] + b[1] * white[i - 1] + b[2] * white[i - 2] + b[3] * white[i - 3]
                            - a[1] * noise[i - 1] - a[2] * noise[i - 2] - a[3] * noise[i - 3])
        elif noise_type == "brown":
            # Brown noise - 1/f² power spectrum (integrated white noise)
            white = np.random.normal(0, 1, num_samples).astype(np.float32)
            noise = np.cumsum(white)
            # Normalize to prevent drift
            noise = (noise - np.mean(noise)) / np.std(noise)
        else:
            # Default to white noise
            noise = np.random.normal(0, 1, num_samples).astype(np.float32)

        # Apply volume
        noise *= volume
        return noise
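
    # Note: the pink-noise branch above applies the IIR filter sample by sample
    # in a Python loop. If scipy is available, the same filter can be applied
    # vectorized (a sketch, assuming scipy is installed):
    #
    #   from scipy.signal import lfilter
    #   noise = lfilter(b, a, white).astype(np.float32)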

    def mix_audio_with_background_noise(self, audio: np.ndarray, noise_type: str = "white", volume: float = 0.01) -> np.ndarray:
        """Mix generated audio with background noise."""
        # Default to disabled when not present in config to avoid unexpected noise
        if not self.config.get('background_noise_enabled', False):
            return audio

        # Generate background noise for the duration of the audio using the TTS sample rate
        duration_seconds = len(audio) / self.sample_rate
        background_noise = self.generate_background_noise(duration_seconds, noise_type, volume, self.sample_rate)

        # Mix audio with background noise
        mixed_audio = audio + background_noise

        # Normalize to prevent clipping
        max_val = np.max(np.abs(mixed_audio))
        if max_val > 1.0:
            mixed_audio /= max_val

        return mixed_audio
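

# For reference on the noise volumes used above: a volume of 0.01 places the
# noise floor roughly 40 dB below full scale (20 * log10(0.01) = -40 dB),
# enough to keep the stream "alive" without being obtrusive.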


class VibeVoiceVideoTrack(MediaStreamTrack):
    """Video track that displays the text being spoken."""

    kind = "video"

    def __init__(self, clock, config: Dict[str, Any], session_name: Optional[str] = None):
        super().__init__()
        self.clock = clock
        self.config = config
        # Keep session_name for looking up waveform buffers and status
        self.session_name = session_name or config.get('session_name') or f"VibeVoice:{int(time.time())}"
        self.width = config.get('width', 640)
        self.height = config.get('height', 480)
        self.fps = config.get('fps', 15)

        # Text display state
        self.current_text = ""
        self.text_queue = queue.Queue()
        self.display_start_time = 0
        self.display_duration = 3.0  # seconds to display each text
        self.frame_count = 0

        # Font settings
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self.font_scale = min(self.width, self.height) / 800
        self.font_thickness = max(1, int(self.font_scale * 2))

    def update_text(self, text: str):
        """Queue text for display."""
        self.text_queue.put(text)
        logger.info(f"VibeVoice video: Queued text '{text}'")

    def update_config(self, config_updates: Dict[str, Any]) -> bool:
        """Update video configuration."""
        try:
            self.config.update(config_updates)
            if 'width' in config_updates:
                self.width = config_updates['width']
            if 'height' in config_updates:
                self.height = config_updates['height']
            if 'fps' in config_updates:
                self.fps = config_updates['fps']
            return True
        except Exception as e:
            logger.error(f"Error updating video config: {e}")
            return False

    async def next_timestamp(self) -> Tuple[int, float]:
        """Get the next timestamp for a video frame (90 kHz clock)."""
        pts = int(self.frame_count * (90000 / self.fps))
        time_base = 1 / 90000
        return pts, time_base
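
    # At the default 15 fps, each frame advances the 90 kHz RTP clock by
    # 90000 / 15 = 6000 ticks, so pts runs 0, 6000, 12000, ...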

    async def recv(self) -> VideoFrame:
        """Generate a video frame showing the current text."""
        # Rotate to the next queued text once the current one has been shown long enough
        current_time = time.time()
        if (not self.current_text or
                current_time - self.display_start_time > self.display_duration):
            try:
                self.current_text = self.text_queue.get_nowait()
                self.display_start_time = current_time
                logger.info(f"VibeVoice video: Displaying '{self.current_text}'")
            except queue.Empty:
                self.current_text = ""

        # Create frame
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        if self.current_text:
            # Add background
            cv2.rectangle(frame, (0, 0), (self.width, self.height), (0, 0, 0), -1)

            # Word-wrap the text into lines
            words = self.current_text.split()
            lines = []
            current_line = ""
            max_chars_per_line = int(self.width / (self.font_scale * 20))

            for word in words:
                if len(current_line + " " + word) <= max_chars_per_line:
                    current_line += " " + word if current_line else word
                else:
                    if current_line:
                        lines.append(current_line)
                    current_line = word
            if current_line:
                lines.append(current_line)

            # Draw text lines
            line_height = int(self.font_scale * 40)
            total_text_height = len(lines) * line_height
            start_y = (self.height - total_text_height) // 2 + line_height

            for i, line in enumerate(lines):
                text_size = cv2.getTextSize(line, self.font, self.font_scale, self.font_thickness)[0]
                text_x = (self.width - text_size[0]) // 2
                text_y = start_y + i * line_height

                # Add text shadow
                cv2.putText(frame, line, (text_x + 2, text_y + 2),
                            self.font, self.font_scale, (0, 0, 0), self.font_thickness + 1)
                # Add main text
                cv2.putText(frame, line, (text_x, text_y),
                            self.font, self.font_scale, (255, 255, 255), self.font_thickness)
        else:
            # Default banner when no text is queued
            cv2.putText(frame, "VibeVoice TTS", (50, self.height // 2),
                        self.font, self.font_scale * 2, (255, 255, 255), self.font_thickness)

        # Draw waveform and status overlays from the shared WaveformVideoTrack buffers
        try:
            pname = self.session_name
            buf = WaveformVideoTrack.buffer.get(pname, None)
            status = WaveformVideoTrack.speech_status.get(pname, {})

            # Draw a small status box in the top-left corner
            status_text = "Idle"
            if status.get('is_processing'):
                status_text = "Processing..."
            elif status.get('is_speech'):
                status_text = "Speaking"
            elif buf is not None and len(buf) > 0:
                # Approximate buffered seconds
                sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000))
                buffered_sec = len(buf) / float(sr) if sr > 0 else 0.0
                status_text = f"Buffered: {buffered_sec:.1f}s"

            box_w = int(self.width * 0.28)
            box_h = int(self.height * 0.12)
            cv2.rectangle(frame, (10, 10), (10 + box_w, 10 + box_h), (50, 50, 50), -1)
            cv2.putText(frame, status_text, (20, 10 + int(box_h / 2)), self.font, self.font_scale, (200, 200, 200), self.font_thickness)

            # Draw a small energy meter
            energy = status.get('energy', 0.0)
            meter_h = int(box_h * 0.4)
            meter_w = int(box_w * 0.6)
            mx = 20
            my = 10 + box_h - 5
            filled = int(min(1.0, energy * 50.0) * meter_w)
            cv2.rectangle(frame, (mx, my - meter_h), (mx + meter_w, my), (80, 80, 80), -1)
            cv2.rectangle(frame, (mx, my - meter_h), (mx + filled, my), (0, 200, 0), -1)

            # Draw the waveform in the bottom strip
            if buf is not None and buf.size > 4:
                sr = WaveformVideoTrack.sample_rates.get(pname, self.config.get('sample_rate', 16000))
                # Use the last N samples, up to the last 5 seconds
                samples_to_show = min(buf.size, max(1, int(sr * 5)))
                slice_buf = buf[-samples_to_show:]

                # Downsample to one point per pixel of width
                idx = (np.linspace(0, samples_to_show - 1, num=self.width)).astype(np.int32)
                waveform = slice_buf[idx]
                # Normalize waveform to -1..1
                maxv = np.max(np.abs(waveform)) if waveform.size > 0 else 1.0
                if maxv <= 0:
                    maxv = 1.0
                waveform = waveform / maxv

                # Map to pixel coordinates in the bottom strip
                wf_h = int(self.height * 0.22)
                wf_y0 = self.height - wf_h - 10
                pts = []
                for i, v in enumerate(waveform):
                    px = int(i * (self.width / len(waveform)))
                    py = int(wf_y0 + (wf_h / 2) * (1 - v))
                    pts.append((px, py))

                if len(pts) >= 2:
                    cv2.polylines(frame, [np.array(pts, dtype=np.int32)], False, (100, 200, 255), 1)
                    # Fill under the curve for a nicer look
                    fill_pts = pts + [(self.width - 1, wf_y0 + wf_h), (0, wf_y0 + wf_h)]
                    cv2.fillPoly(frame, [np.array(fill_pts, dtype=np.int32)], (30, 60, 80))
        except Exception:
            # A non-critical rendering failure shouldn't break video
            pass

        # Stamp pts/time_base on the frame; aiortc expects these on frames
        # returned from recv()
        frame_pts, _ = await self.next_timestamp()
        self.frame_count += 1
        video_frame = VideoFrame.from_ndarray(frame, format="bgr24")
        video_frame.pts = frame_pts
        video_frame.time_base = fractions.Fraction(1, 90000)
        return video_frame


class VibeVoiceAudioTrack(MediaStreamTrack):
    """Audio track that plays TTS speech."""

    kind = "audio"

    def __init__(self, clock, config: Dict[str, Any], tts_engine: VibeVoiceTTS, session_name: Optional[str] = None):
        super().__init__()
        self.clock = clock
        self.config = config
        self.tts = tts_engine
        self.sample_rate = config.get('sample_rate', 16000)
        self.samples_per_frame = config.get('samples_per_frame', 960)  # 60 ms at 16 kHz

        # Audio playback state
        self.audio_queue = queue.Queue()
        self.current_audio = None
        self.audio_position = 0
        self.is_speaking = False
        # Speaker keys are "1".."4" (see VibeVoiceTTS.speaker_mapping)
        self.speaker = config.get('speaker', '1')

        # Audio buffer for mixing multiple TTS segments
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_lock = threading.Lock()

        # Optional looping and debug options
        self.loop = config.get('loop', True)
        self.debug_save_wav = config.get('debug_save_wav', True)
        # Keep the last fully generated audio to enable looping
        self.last_generated_audio = np.array([], dtype=np.float32)
        # Protect last_generated_audio updates
        self._last_gen_lock = threading.Lock()

        # Track total samples generated for proper PTS calculation
        self._samples_generated = 0
        # Optional session name used to publish waveform data for visualization
        self.session_name = session_name or f"VibeVoice:{int(time.time())}"

    def update_config(self, config_updates: Dict[str, Any]) -> bool:
        """Update audio configuration."""
        try:
            self.config.update(config_updates)
            if 'sample_rate' in config_updates:
                self.sample_rate = config_updates['sample_rate']
            if 'samples_per_frame' in config_updates:
                self.samples_per_frame = config_updates['samples_per_frame']
            if 'speaker' in config_updates:
                self.speaker = config_updates['speaker']
            if 'loop' in config_updates:
                self.loop = bool(config_updates['loop'])
                logger.info(f"🔁 Looping {'enabled' if self.loop else 'disabled'} for session {self.session_name}")
            if 'debug_save_wav' in config_updates:
                self.debug_save_wav = bool(config_updates['debug_save_wav'])
                logger.info(f"🐞 Debug save wav {'enabled' if self.debug_save_wav else 'disabled'} for session {self.session_name}")

            # Log background noise configuration updates
            background_noise_updated = False
            if 'background_noise_enabled' in config_updates:
                logger.info(f"🎵 Background noise enabled: {config_updates['background_noise_enabled']}")
                background_noise_updated = True
            if 'background_noise_type' in config_updates:
                logger.info(f"🎵 Background noise type: {config_updates['background_noise_type']}")
                background_noise_updated = True
            if 'background_noise_volume' in config_updates:
                logger.info(f"🎵 Background noise volume: {config_updates['background_noise_volume']}")
                background_noise_updated = True

            if background_noise_updated:
                logger.info("🎵 Background noise configuration updated - changes will take effect on the next audio frame")

            return True
        except Exception as e:
            logger.error(f"Error updating audio config: {e}")
            return False

    def speak_text(self, text: str, cfg_scale: Optional[float] = None):
        """Queue text for speech synthesis."""
        if cfg_scale is None:
            cfg_scale = 1.3  # Default value

        logger.info(f"VibeVoice audio: Starting background TTS generation for '{text}' with cfg_scale={cfg_scale}")

        # Start TTS generation in a background thread
        thread = threading.Thread(
            target=self._generate_tts_background,
            args=(text, self.speaker, cfg_scale),
            daemon=True
        )
        thread.start()

    def _generate_tts_background(self, text: str, speaker: str, cfg_scale: float):
        """Generate TTS in a background thread and append it to the audio buffer."""
        try:
            logger.info(f"VibeVoice audio: Background TTS generation started for '{text}'")

            # Log some diagnostic info about the TTS engine state
            try:
                logger.info(f"VibeVoice audio: TTS engine initialized={getattr(self.tts, 'is_initialized', False)}, device={getattr(self.tts, 'device', None)}, tts_sample_rate={getattr(self.tts, 'sample_rate', None)}")
                # available_voices and speaker_mapping may be large; log summaries
                try:
                    avv = getattr(self.tts, 'available_voices', {})
                    smap = getattr(self.tts, 'speaker_mapping', {})
                    logger.info(f"VibeVoice audio: available_voices={list(avv.keys())[:5]} (count={len(avv)}), speaker_mapping_count={len(smap)}")
                except Exception:
                    pass
            except Exception:
                pass

            # Mark the processing state for the video overlay
            try:
                WaveformVideoTrack.speech_status[self.session_name] = WaveformVideoTrack.speech_status.get(self.session_name, {})
                WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = True
            except Exception:
                pass

            # Require the model and streamer to be available for streaming generation
            if not self.tts.is_initialized:
                logger.error("VibeVoice audio: Model or AudioStreamer not available - background generation disabled")
                return

            # Prepare formatted text and inputs (same expectations as generate_speech)
            formatted_text = f"Speaker {speaker}: {text}"
            voice_samples = []
            if speaker in self.tts.speaker_mapping:
                voice_name = self.tts.speaker_mapping[speaker]
                if voice_name in self.tts.available_voices:
                    audio_path = self.tts.available_voices[voice_name]
                    audio_data = self.tts.read_audio(audio_path)
                    if len(audio_data) > 0:
                        voice_samples.append(audio_data)
                    else:
                        voice_samples.append([])
                else:
                    voice_samples.append([])
            else:
                voice_samples.append([])

            inputs = self.tts.processor(  # type: ignore
                text=[formatted_text],
                voice_samples=[voice_samples],
                padding=True,
                return_tensors="pt"
            )

            # Move tensors to the target device
            target_device = self.tts.device if self.tts.device in ("cuda", "mps") else "cpu"
            for k, v in inputs.items():
                if torch.is_tensor(v):
                    inputs[k] = v.to(target_device)

            # Log a summary of the inputs for diagnostic purposes
            try:
                inp_summary = {}
                for k, v in inputs.items():
                    if torch.is_tensor(v):
                        inp_summary[k] = f"tensor(shape={tuple(v.shape)}, dtype={v.dtype})"
                    else:
                        try:
                            inp_summary[k] = f"{type(v).__name__}(len={len(v)})"
                        except Exception:
                            inp_summary[k] = type(v).__name__
                logger.info(f"VibeVoice audio: Input summary for generation: {inp_summary}")
            except Exception:
                pass

            # Create the audio streamer and run model.generate in a separate thread
            real_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
            audio_streamer = ProxyAudioStreamer(real_streamer, session_name=self.session_name)

            def _run_generate():
                try:
                    logger.info(f"VibeVoice audio: model.generate starting for session {self.session_name}")
                    with torch.no_grad():
                        self.tts.model.generate(  # type: ignore
                            **inputs,
                            max_new_tokens=None,
                            cfg_scale=cfg_scale,
                            tokenizer=self.tts.processor.tokenizer,  # type: ignore
                            generation_config={'do_sample': False},
                            verbose=False,
                            streamer=audio_streamer,
                        )
                except Exception as e:
                    logger.error(f"VibeVoice audio: Error during model.generate: {e}")
                finally:
                    # Ensure the streamer is ended
                    try:
                        audio_streamer.end()
                    except Exception:
                        pass
                    logger.info(f"VibeVoice audio: model.generate finished for session {self.session_name}")

            gen_thread = threading.Thread(target=_run_generate, daemon=True)
            gen_thread.start()

            # Consume chunks from the streamer and append them to the audio buffer as they arrive
            generated_chunks = []
            chunk_count = 0
            total_samples_streamed = 0
            logger.info(f"VibeVoice audio: Audio streamer started for session {self.session_name}")
            try:
                logger.info(f"VibeVoice audio: audio_streamer repr: {repr(audio_streamer)[:400]}")
                gs = None
                try:
                    gs = audio_streamer.get_stream(0)
                    logger.info(f"VibeVoice audio: get_stream returned object type: {type(gs)}")
                except Exception as _e:
                    logger.error(f"VibeVoice audio: calling audio_streamer.get_stream raised: {_e}")
                    gs = None
            except Exception:
                gs = None

            if gs is None:
                logger.warning(f"VibeVoice audio: audio_streamer.get_stream did not return a stream for session {self.session_name}")
                iterator = []
            else:
                iterator = gs

            for audio_chunk in iterator:
                try:
                    # Convert tensor to numpy if needed
                    if torch.is_tensor(audio_chunk):
                        if audio_chunk.dtype == torch.bfloat16:
                            audio_chunk = audio_chunk.float()
                        audio_np = audio_chunk.cpu().numpy().astype(np.float32)
                    else:
                        audio_np = np.array(audio_chunk, dtype=np.float32)

                    # Squeeze to 1D if needed
                    if audio_np.ndim > 1:
                        audio_np = audio_np.squeeze()

                    # Resample from the model sampling rate (usually 24000) to the track sample rate
                    if hasattr(self.tts, 'sample_rate') and self.tts.sample_rate != self.sample_rate:
                        try:
                            audio_np = librosa.resample(audio_np, orig_sr=self.tts.sample_rate, target_sr=self.sample_rate)
                        except Exception:
                            # If resampling fails, keep the original chunk
                            pass

                    # Append to the internal buffer
                    with self.buffer_lock:
                        if len(self.audio_buffer) == 0:
                            self.audio_buffer = audio_np
                        else:
                            self.audio_buffer = np.concatenate([self.audio_buffer, audio_np])

                    # Also collect into generated_chunks for possible looping/debug save
                    try:
                        generated_chunks.append(audio_np.astype(np.float32))
                    except Exception:
                        pass

                    total_samples_streamed += len(audio_np)
                    chunk_count += 1
                    # Log every few chunks to avoid log spam
                    if chunk_count % 5 == 0:
                        logger.info(f"VibeVoice audio: Streamed {total_samples_streamed} samples so far for session {self.session_name} (chunks={chunk_count})")
                    else:
                        logger.debug(f"VibeVoice audio: Streamed {len(audio_np)} samples to buffer (total buffer: {len(self.audio_buffer)})")

                    # Also publish into the global waveform buffer used by WaveformVideoTrack
                    try:
                        pname = self.session_name
                        # Ensure the buffer key exists
                        if pname not in WaveformVideoTrack.buffer:
                            WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32)

                        # Append to the shared waveform buffer
                        WaveformVideoTrack.buffer[pname] = np.concatenate([
                            WaveformVideoTrack.buffer[pname], audio_np.astype(np.float32)
                        ])

                        # Ensure the sample rate is set for this session
                        WaveformVideoTrack.sample_rates[pname] = self.sample_rate

                        # Limit the buffer to the last 10 seconds for this track
                        max_samples = int(self.sample_rate * 10)
                        if len(WaveformVideoTrack.buffer[pname]) > max_samples:
                            WaveformVideoTrack.buffer[pname] = WaveformVideoTrack.buffer[pname][-max_samples:]

                        # Update a lightweight speech_status for display
                        energy = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2))) if audio_np.size > 0 else 0.0
                        # Approximate zero-crossing rate
                        try:
                            if audio_np.size > 1:
                                zcr = float(np.mean(np.abs(np.diff(np.sign(audio_np))) > 0))
                            else:
                                zcr = 0.0
                        except Exception:
                            zcr = 0.0

                        is_speech = energy > 0.005

                        WaveformVideoTrack.speech_status[pname] = {
                            'is_speech': bool(is_speech),
                            'energy': float(energy),
                            'zcr': float(zcr),
                            'centroid': 0.0,
                            'rolloff': 0.0,
                            'flux': 0.0,
                            'harmonicity': 0.0,
                            'noise_floor_energy': 0.0,
                            'adaptive_threshold': 0.0,
                            'energy_check': bool(energy > 0.002),
                            'zcr_check': bool(zcr > 0.01),
                            'spectral_check': False,
                            'harmonic_check': False,
                            'temporal_consistency': True,
                            'is_processing': True,
                            'is_playing': False,
                        }
                    except Exception:
                        # Non-critical - don't break TTS on visualization failures
                        pass
                except Exception as e:
                    logger.error(f"VibeVoice audio: Error processing audio chunk from streamer: {e}")

            # Ensure the generation thread finishes
            gen_thread.join(timeout=5.0)

            # If the generation thread is still alive after the join, log a warning
            if gen_thread.is_alive():
                logger.warning(f"VibeVoice audio: generation thread still alive after join for session {self.session_name}")

            # When generation completes, store last_generated_audio for looping and optionally save a debug WAV
            logger.info(f"VibeVoice audio: Generation completed for session {self.session_name}. total_samples_streamed={total_samples_streamed}, chunks={chunk_count}")

            # If no chunks were received, emit a diagnostic warning with some state to help debugging
            if chunk_count == 0:
                try:
                    # Provide more diagnostic info: inputs summary and streamer introspection
                    try:
                        sdi = {
                            'repr': repr(audio_streamer)[:400],
                            'dir': [n for n in dir(audio_streamer) if not n.startswith('_')][:40]
                        }
                    except Exception:
                        sdi = {'repr': 'unavailable', 'dir': []}

                    try:
                        logger.warning(
                            f"VibeVoice audio: No audio chunks were streamed for session {self.session_name}. "
                            f"is_initialized={getattr(self.tts, 'is_initialized', False)}, model_present={hasattr(self.tts, 'model')} ; "
                            f"audio_streamer={sdi}"
                        )
                    except Exception:
                        logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (diagnostics failed)")
                except Exception:
                    logger.warning(f"VibeVoice audio: No audio chunks were streamed for session {self.session_name} (additional diagnostics unavailable)")
                # Fallback: attempt a synchronous generation that returns a full numpy audio array
                try:
                    logger.info(f"VibeVoice audio: Attempting synchronous fallback generation for session {self.session_name}")
                    fallback_audio = None
                    try:
                        fallback_audio = self.tts.generate_speech(text, speaker, cfg_scale=cfg_scale)
                    except Exception as e:
                        logger.error(f"VibeVoice audio: synchronous fallback generation raised: {e}")

                    if fallback_audio is not None and getattr(fallback_audio, 'size', 0) > 0:
                        try:
                            fa = fallback_audio.astype(np.float32)
                        except Exception:
                            fa = np.array(fallback_audio, dtype=np.float32)

                        # generate_speech already returns audio resampled to 16 kHz,
                        # so only resample if the track rate differs from that
                        try:
                            if self.sample_rate != 16000:
                                fa = librosa.resample(fa, orig_sr=16000, target_sr=self.sample_rate)
                        except Exception:
                            pass

                        # Append into the internal buffer and last_generated_audio
                        with self.buffer_lock:
                            if len(self.audio_buffer) == 0:
                                self.audio_buffer = fa
                            else:
                                self.audio_buffer = np.concatenate([self.audio_buffer, fa])
                        with self._last_gen_lock:
                            self.last_generated_audio = fa.copy()

                        # Publish to the waveform buffer
                        try:
                            pname = self.session_name
                            if pname not in WaveformVideoTrack.buffer:
                                WaveformVideoTrack.buffer[pname] = np.array([], dtype=np.float32)
                            WaveformVideoTrack.buffer[pname] = np.concatenate([WaveformVideoTrack.buffer[pname], fa.astype(np.float32)])
                            WaveformVideoTrack.sample_rates[pname] = self.sample_rate
                        except Exception:
                            pass

                        # Optionally save a debug wav
                        if self.debug_save_wav:
                            try:
                                try:
                                    import soundfile as sf
                                    fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
                                    sf.write(fname, fa, samplerate=self.sample_rate)
                                    logger.info(f"🐞 Saved fallback generated wav to {fname} (soundfile)")
                                except Exception:
                                    try:
                                        from scipy.io import wavfile
                                        fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
                                        wavfile.write(fname, self.sample_rate, (fa * 32767).astype('int16'))
                                        logger.info(f"🐞 Saved fallback generated wav to {fname} (scipy)")
                                    except Exception:
                                        try:
                                            import wave
                                            fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}_fallback.wav"
                                            with wave.open(fname, 'wb') as wf:
                                                wf.setnchannels(1)
                                                wf.setsampwidth(2)
                                                wf.setframerate(self.sample_rate)
                                                int_data = (fa * 32767).astype('int16')
                                                wf.writeframes(int_data.tobytes())
                                            logger.info(f"🐞 Saved fallback generated wav to {fname} (wave)")
                                        except Exception as e:
                                            logger.error(f"Error saving fallback debug wav (all methods failed): {e}")
                            except Exception as e:
                                logger.error(f"Error saving fallback debug wav: {e}")

                        logger.info(f"VibeVoice audio: Fallback synchronous generation successful for session {self.session_name} (samples={len(fa)})")
                    else:
                        logger.warning(f"VibeVoice audio: Fallback synchronous generation produced no audio for session {self.session_name}")
                except Exception as e:
                    logger.error(f"VibeVoice audio: Exception during synchronous fallback generation: {e}")
            try:
                if len(generated_chunks) > 0:
                    try:
                        all_gen = np.concatenate(generated_chunks).astype(np.float32)
                    except Exception:
                        all_gen = np.array([], dtype=np.float32)
                    with self._last_gen_lock:
                        self.last_generated_audio = all_gen.copy()

                    # Optionally save to disk for debugging
                    if self.debug_save_wav:
                        try:
                            try:
                                import soundfile as sf
                                fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
                                sf.write(fname, all_gen, samplerate=self.sample_rate)
                                logger.info(f"🐞 Saved generated wav to {fname} (soundfile)")
                            except Exception:
                                # Try a scipy fallback
                                try:
                                    from scipy.io import wavfile
                                    fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
                                    # scipy expects int16
                                    wavfile.write(fname, self.sample_rate, (all_gen * 32767).astype('int16'))
                                    logger.info(f"🐞 Saved generated wav to {fname} (scipy)")
                                except Exception:
                                    # Ultimate fallback: write a raw wave via the wave module
                                    try:
                                        import wave
                                        fname = f"/tmp/vibevoice_{self.session_name}_{int(time.time())}.wav"
                                        with wave.open(fname, 'wb') as wf:
                                            wf.setnchannels(1)
                                            wf.setsampwidth(2)
                                            wf.setframerate(self.sample_rate)
                                            int_data = (all_gen * 32767).astype('int16')
                                            wf.writeframes(int_data.tobytes())
                                        logger.info(f"🐞 Saved generated wav to {fname} (wave)")
                                    except Exception as e:
                                        logger.error(f"Error saving debug wav (all methods failed): {e}")
                        except Exception as e:
                            logger.error(f"Error saving debug wav: {e}")

            except Exception:
                pass

            # Clear the processing flag when generation completes
            try:
                if self.session_name in WaveformVideoTrack.speech_status:
                    WaveformVideoTrack.speech_status[self.session_name]['is_processing'] = False
            except Exception:
                pass

        except Exception as e:
            logger.error(f"VibeVoice audio: Error in background TTS generation: {e}")

    def _get_samples_from_buffer(self, num_samples: int) -> np.ndarray:
        """Take samples from the audio buffer, removing them as they are consumed."""
        # Snapshot last_generated_audio in case looping needs to refill the buffer
        with self._last_gen_lock:
            last_gen = self.last_generated_audio.copy() if getattr(self, 'last_generated_audio', None) is not None else np.array([], dtype=np.float32)

        with self.buffer_lock:
            if len(self.audio_buffer) == 0:
                # If configured to loop and we have a generated sample, refill the buffer
                if getattr(self, 'loop', False) and last_gen.size > 0:
                    try:
                        # Repeat last_gen as needed to reach at least num_samples
                        repeats = int(math.ceil(float(num_samples) / float(len(last_gen)))) if len(last_gen) > 0 else 1
                        refill = np.tile(last_gen, repeats)
                        self.audio_buffer = refill.astype(np.float32)
                        logger.debug(f"VibeVoice audio: Refilled audio_buffer from last_generated_audio (len={len(last_gen)}) repeats={repeats}")
                    except Exception:
                        # Fall back to silence on any failure
                        self.audio_buffer = np.zeros(num_samples, dtype=np.float32)
                else:
                    return np.zeros(num_samples, dtype=np.float32)

            if len(self.audio_buffer) >= num_samples:
                samples = self.audio_buffer[:num_samples]
                self.audio_buffer = self.audio_buffer[num_samples:]
                return samples
            else:
                # Return the remaining samples, padded with zeros
                samples = self.audio_buffer
                padding = np.zeros(num_samples - len(self.audio_buffer), dtype=np.float32)
                self.audio_buffer = np.array([], dtype=np.float32)
                return np.concatenate([samples, padding])
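
    # Example of the refill semantics above: with loop=True and a 1000-sample
    # last_generated_audio, a request for 2500 samples refills the empty buffer
    # with ceil(2500 / 1000) = 3 copies (3000 samples) and returns the first 2500.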

    async def next_timestamp(self) -> Tuple[int, float]:
        """Get the next timestamp for an audio frame."""
        pts = self._samples_generated
        time_base = 1 / self.sample_rate
        return pts, time_base
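
    # pts counts samples here (time_base = 1 / sample_rate), so with the
    # defaults each recv() advances pts by 960 samples = 60 ms at 16 kHz.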

    async def recv(self) -> AudioFrame:
        """Generate an audio frame with TTS speech from the buffer."""
        # Get samples from the buffer
        samples = self._get_samples_from_buffer(self.samples_per_frame)

        # If no TTS audio is available, generate background noise
        if np.all(samples == 0):
            # Default to disabled when not present in config to avoid unexpected noise
            if self.config.get('background_noise_enabled', False):
                noise_type = self.config.get('background_noise_type', 'white')
                noise_volume = self.config.get('background_noise_volume', 0.01)
                # Generate noise for this frame's duration
                frame_duration = self.samples_per_frame / self.sample_rate
                logger.debug(f"🎵 Generating background noise: type={noise_type}, volume={noise_volume}, duration={frame_duration:.3f}s")
                background_noise = self.tts.generate_background_noise(frame_duration, noise_type, noise_volume, self.sample_rate)
                logger.debug(f"🎵 Generated background noise: {len(background_noise)} samples")
                samples = background_noise
            else:
                # Generate silence if background noise is disabled
                logger.debug("🎵 Background noise disabled - generating silence")
                samples = np.zeros(self.samples_per_frame, dtype=np.float32)

        # Update the shared speech_status for visualization: energy + playing flag
        try:
            energy = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2))) if samples.size > 0 else 0.0
            pname = self.session_name
            st = WaveformVideoTrack.speech_status.get(pname, {})
            st['energy'] = float(energy)
            # Consider the track playing when energy is above a small threshold
            st['is_playing'] = bool(energy > 0.001)
            st['is_speech'] = bool(energy > 0.003)
            WaveformVideoTrack.speech_status[pname] = st
        except Exception:
            pass

        # Convert to 16-bit PCM
        samples_int16 = (samples * 32767).astype(np.int16)

        # Create stereo audio (duplicate the mono channel)
        left = samples_int16
        right = samples_int16.copy()
        stereo = np.empty(self.samples_per_frame * 2, dtype=np.int16)
        stereo[0::2] = left
        stereo[1::2] = right

        # Create the audio frame
        frame = AudioFrame.from_ndarray(stereo.reshape(1, -1), format="s16", layout="stereo")
        frame.sample_rate = self.sample_rate
        frame.pts = self._samples_generated
        frame.time_base = fractions.Fraction(1, self.sample_rate)

        # Increment the sample counter
        self._samples_generated += self.samples_per_frame

        return frame


class VibeVoiceTTSBot:
    """VibeVoice Text-to-Speech Bot for the voicebot framework."""

    def __init__(self, session_name: str, config: Optional[Dict[str, Any]] = None):
        self.session_name = session_name
        self.config = config or {}

        # Initialize the TTS engine with the configured parameters
        device = self.config.get('device', 'cpu')
        inference_steps = self.config.get('inference_steps', 10)
        self.tts_engine = VibeVoiceTTS(device=device, inference_steps=inference_steps, config=self.config)

        # Store generation parameters
        self.cfg_scale = self.config.get('cfg_scale', 1.3)
        self.speaker = self.config.get('speaker', '1')

        # Initialize media components
        self.media_clock = MediaClock()
        # Pass the session name into the tracks so they can show per-session waveform/status
        self.video_track = VibeVoiceVideoTrack(self.media_clock, self.config, session_name=session_name)
        self.audio_track = VibeVoiceAudioTrack(self.media_clock, self.config, self.tts_engine, session_name=session_name)

        # Initialize the shared waveform store's sample rate and empty buffer/status
        try:
            WaveformVideoTrack.sample_rates[session_name] = self.config.get('sample_rate', 16000)
            if session_name not in WaveformVideoTrack.buffer:
                WaveformVideoTrack.buffer[session_name] = np.array([], dtype=np.float32)
            if session_name not in WaveformVideoTrack.speech_status:
                WaveformVideoTrack.speech_status[session_name] = {'is_speech': False, 'energy': 0.0, 'is_processing': False, 'is_playing': False}
        except Exception:
            pass

        # Apply initial configuration values to ensure defaults from the schema/config provider
        try:
            self.update_config(self.config)
        except Exception:
            # Don't let config application stop initialization
            pass

        logger.info(f"VibeVoice bot initialized for session {session_name} with cfg_scale={self.cfg_scale}, speaker={self.speaker}")

    def get_tracks(self) -> Dict[str, MediaStreamTrack]:
        """Get video and audio tracks."""
        return {
            "video": self.video_track,
            "audio": self.audio_track
        }

    def handle_chat_message(self, message: ChatMessageModel):
        """Handle incoming chat messages by converting them to speech."""
        try:
            text = message.message.strip()
            if text:
                logger.info(f"VibeVoice bot received chat: '{text}' from {message.sender_name}")

                # Queue the text for both video display and audio speech
                self.video_track.update_text(text)
                self.audio_track.speak_text(text, self.cfg_scale)

        except Exception as e:
            logger.error(f"Error handling chat message in VibeVoice bot: {e}")

    def update_config(self, config_updates: Dict[str, Any]) -> bool:
        """Update bot configuration."""
        try:
            self.config.update(config_updates)

            # Update TTS-specific parameters
            if 'cfg_scale' in config_updates:
                self.cfg_scale = config_updates['cfg_scale']
            if 'speaker' in config_updates:
                self.speaker = config_updates['speaker']

            # Update tracks
            video_success = self.video_track.update_config(config_updates)
            audio_success = self.audio_track.update_config(config_updates)

            if video_success and audio_success:
                logger.info(f"VibeVoice bot configuration updated: {config_updates}")
                return True
            else:
                logger.warning("Partial configuration update failure in VibeVoice bot")
                return False

        except Exception as e:
            logger.error(f"Error updating VibeVoice bot configuration: {e}")
            return False


# Global bot instance registry
_vibevoice_bots: Dict[str, VibeVoiceTTSBot] = {}


def create_vibevoice_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Create VibeVoice TTS bot tracks.

    Args:
        session_name: Name for the session
        config: Configuration dictionary with options:
            - width: video width (default 640)
            - height: video height (default 480)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 16000)
            - samples_per_frame: audio samples per frame (default 960)
            - speaker: TTS speaker number (default '1')
            - device: device for TTS ('cpu', 'cuda', 'mps')
            - cfg_scale: CFG scale for generation (default 1.3)
            - inference_steps: number of inference steps (default 10)

    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    if config is None:
        config = {}

    # Set defaults
    default_config = {
        'width': 640,
        'height': 480,
        'fps': 15,
        'sample_rate': 16000,
        'samples_per_frame': 960,
        'speaker': '1',
        'device': 'cpu',
        'cfg_scale': 1.3,
        'inference_steps': 10,
        # Explicit background noise defaults - disabled by default
        'background_noise_enabled': False,
        'background_noise_type': 'none',
        'background_noise_volume': 0.0,
    }
    default_config.update(config)

    # Create the bot instance
    bot = VibeVoiceTTSBot(session_name, default_config)
    _vibevoice_bots[session_name] = bot

    logger.info(f"Created VibeVoice bot tracks for {session_name}")
    return bot.get_tracks()
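

# Usage sketch ("demo" is a hypothetical session name; `pc` is assumed to be
# an aiortc RTCPeerConnection):
#
#   tracks = create_vibevoice_bot_tracks("demo", {"device": "cpu", "speaker": "1"})
#   pc.addTrack(tracks["video"])
#   pc.addTrack(tracks["audio"])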


def handle_config_update(session_name: str, config_values: Dict[str, Any]) -> bool:
    """
    Handle runtime configuration updates for a VibeVoice bot.

    Args:
        session_name: Name of the session/bot instance
        config_values: Dictionary of configuration values to update

    Returns:
        bool: True if the update was successful, False otherwise
    """
    try:
        if session_name in _vibevoice_bots:
            return _vibevoice_bots[session_name].update_config(config_values)
        else:
            logger.warning(f"No VibeVoice bot found for session {session_name}")
            return False
    except Exception as e:
        logger.error(f"Error updating VibeVoice bot configuration: {e}")
        return False
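

# Example runtime update (hypothetical session name):
#
#   handle_config_update("demo", {"speaker": "2", "background_noise_enabled": True})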


async def handle_chat_message(
    chat_message: ChatMessageModel,
    send_message_func: Callable[[Union[str, ChatMessageModel]], Awaitable[None]]
) -> Optional[str]:
    """
    Handle incoming chat messages and convert them to speech.

    Args:
        chat_message: The chat message to process
        send_message_func: Function to send chat responses (not used by the TTS bot)
    """
    try:
        # Find the bot instance. The session name isn't part of the message,
        # so for now we route to the first available bot instance.
        if _vibevoice_bots:
            session_name = list(_vibevoice_bots.keys())[0]
            _vibevoice_bots[session_name].handle_chat_message(chat_message)
            logger.info(f"VibeVoice bot processed chat message from {chat_message.sender_name}: '{chat_message.message}'")
        else:
            logger.warning("No VibeVoice bot instances available to handle chat message")
    except Exception as e:
        logger.error(f"Error handling chat message in VibeVoice bot: {e}")

    # The TTS bot doesn't send chat responses, so return None
    return None


# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "VibeVoice TTS Bot"
AGENT_DESCRIPTION = "Microsoft VibeVoice text-to-speech bot with visual text display"


def agent_info() -> Dict[str, str]:
    """Return agent metadata for discovery."""
    return {
        "name": AGENT_NAME,
        "description": AGENT_DESCRIPTION,
        "has_media": "true",
        "configurable": "true",
        "chat_enabled": "true"
    }


def get_config_schema() -> Dict[str, Any]:
    """Get the configuration schema for the VibeVoice Bot."""
    return {
        "bot_name": AGENT_NAME,
        "version": "1.0",
        "parameters": [
            {
                "name": "width",
                "type": "number",
                "label": "Video Width",
                "description": "Width of the video frame in pixels",
                "default_value": 640,
                "required": False,
                "min_value": 320,
                "max_value": 1920,
                "step": 1
            },
            {
                "name": "height",
                "type": "number",
                "label": "Video Height",
                "description": "Height of the video frame in pixels",
                "default_value": 480,
                "required": False,
                "min_value": 240,
                "max_value": 1080,
                "step": 1
            },
            {
                "name": "fps",
                "type": "number",
                "label": "Frames Per Second",
                "description": "Video frame rate",
                "default_value": 15,
                "required": False,
                "min_value": 1,
                "max_value": 60,
                "step": 1
            },
            {
                "name": "speaker",
                "type": "select",
                "label": "TTS Speaker",
                "description": "Voice to use for text-to-speech",
                "default_value": "1",
                "required": True,
                "options": [
                    {"value": "1", "label": "Speaker 1 (en-Alice_woman)"},
                    {"value": "2", "label": "Speaker 2 (en-Carter_man)"},
                    {"value": "3", "label": "Speaker 3 (en-Frank_man)"},
                    {"value": "4", "label": "Speaker 4 (en-Mary_woman_bgm)"}
                ]
            },
            {
                "name": "background_noise_enabled",
                "type": "boolean",
                "label": "Enable Background Noise",
                "description": "Add background noise to ensure continuous audio streaming",
                "default_value": False,
                "required": False
            },
            {
                "name": "background_noise_type",
                "type": "select",
                "label": "Background Noise Type",
                "description": "Type of background noise to generate",
                # 'none' indicates no noise - matches the default disabled behavior
                "default_value": "none",
                "required": False,
                "options": [
                    {"value": "white", "label": "White Noise"},
                    {"value": "pink", "label": "Pink Noise"},
                    {"value": "brown", "label": "Brown Noise"},
                    {"value": "none", "label": "None"}
                ]
            },
            {
                "name": "background_noise_volume",
                "type": "number",
                "label": "Background Noise Volume",
                "description": "Volume level of background noise (0.0 to 1.0)",
                "default_value": 0.01,
                "required": False,
                "min_value": 0.0,
                "max_value": 1.0,
                "step": 0.001
            },
            {
                "name": "device",
                "type": "select",
                "label": "Processing Device",
                "description": "Device to use for TTS processing",
                "default_value": "cpu",
                "required": True,
                "options": [
                    {"value": "cpu", "label": "CPU"},
                    {"value": "cuda", "label": "CUDA (GPU)"},
                    {"value": "mps", "label": "MPS (Apple Silicon)"}
                ]
            },
            {
                "name": "cfg_scale",
                "type": "number",
                "label": "CFG Scale",
                "description": "Classifier-free guidance scale for controlling generation quality",
                "default_value": 1.3,
                "required": False,
                "min_value": 1.0,
                "max_value": 2.0,
                "step": 0.05
            },
            {
                "name": "inference_steps",
                "type": "number",
                "label": "Inference Steps",
                "description": "Number of denoising steps for audio generation",
                "default_value": 10,
                "required": False,
                "min_value": 5,
                "max_value": 50,
                "step": 1
            }
        ],
        "categories": [
            {
                "Video Settings": ["width", "height", "fps"]
            },
            {
                "TTS Settings": ["speaker", "device", "cfg_scale", "inference_steps"]
            },
            {
                "Background Noise": ["background_noise_enabled", "background_noise_type", "background_noise_volume"]
            }
        ]
    }


def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
    """Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
    return create_vibevoice_bot_tracks(session_name)