Move vibevoice to GPU

This commit is contained in:
James Ketr 2025-09-19 13:25:18 -07:00
parent 781748a182
commit 30886d9fa8
3 changed files with 53 additions and 1166 deletions

File diff suppressed because it is too large Load Diff

View File

@ -5,9 +5,47 @@ from typing import Any, List, Tuple, Union, Optional
import time
import torch
import numpy as np
import sys
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
# Defer importing the external `vibevoice` package until we actually need it.
# In some environments the `vibevoice` package isn't installed into site-packages
# but the repo contains a local copy under `voicebot/VibeVoice`. Attempt a lazy
# import and, if that fails, add the local path(s) to sys.path and retry.
def _import_vibevoice_symbols():
try:
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
return VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
except Exception:
# If a required package (like `diffusers`) is missing inside the
# container or venv, importing deeper VibeVoice modules will raise
# ModuleNotFoundError. Detect that and raise a clearer error that
# includes install instructions.
import traceback as _tb
exc_type, exc_val, exc_tb = _tb.sys.exc_info()
if isinstance(exc_val, ModuleNotFoundError):
missing = str(exc_val).split("'")[1] if "'" in str(exc_val) else str(exc_val)
raise ModuleNotFoundError(
f"Missing dependency when importing VibeVoice: {missing}.\n"
"Install required packages inside the voicebot container.\n"
"Example (inside container):\n"
" PYTHONPATH=/shared:/voicebot uv run python3 -m pip install diffusers accelerate safetensors\n"
"Or add the packages to the voicebot service environment / pyproject and rebuild."
) from exc_val
# Try adding likely repository-local paths where VibeVoice lives
base = os.path.dirname(__file__) # voicebot/bots
candidates = [
os.path.abspath(os.path.join(base, "..", "VibeVoice")),
os.path.abspath(os.path.join("/", "voicebot", "VibeVoice")),
]
for p in candidates:
if os.path.isdir(p) and p not in sys.path:
sys.path.insert(0, p)
# Retry import
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
return VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
from shared.logger import logger
@ -155,6 +193,8 @@ class VibeVoiceTTS:
def _load_model(self):
"""Load the model and processor with device-specific configuration."""
logger.info(f"Loading processor & model from {self.model_path}")
# Ensure external vibevoice symbols are available (lazy import)
VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor = _import_vibevoice_symbols()
self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
# Decide dtype & attention implementation
@ -401,15 +441,16 @@ class VibeVoiceTTS:
logger.info(f"Generated tokens: {generated_tokens}")
logger.info(f"Total tokens: {output_tokens}")
# Return audio data as numpy array
# Return audio data as numpy array
if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
audio_tensor = outputs.speech_outputs[0]
# Convert to numpy array on CPU
# Convert to numpy array on CPU, ensuring compatible dtype
if hasattr(audio_tensor, 'cpu'):
audio_data = audio_tensor.cpu().numpy()
audio_data = audio_tensor.cpu().float().numpy() # Convert to float32 first
else:
audio_data = np.array(audio_tensor)
audio_data = np.array(audio_tensor, dtype=np.float32)
# Ensure it's a 1D array
if audio_data.ndim > 1:

View File

@ -1,3 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/xpu
about-time==4.2.1
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
@ -74,20 +75,6 @@ ninja==1.13.0
nncf==2.18.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvtx-cu12==12.8.90
onnx==1.19.0
openai==1.107.2
openai-whisper @ git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
@ -157,11 +144,13 @@ threadpoolctl==3.6.0
tiktoken==0.11.0
tokenizers==0.21.4
tomlkit==0.13.3
torch==2.8.0
torchvision==0.23.0
tqdm==4.67.1
torch==2.8.0+xpu
torchvision==0.23.0+xpu
torchaudio==2.8.0+xpu
transformers==4.53.3
triton==3.4.0
diffusers
accelerate
typer==0.17.4
typing-extensions==4.15.0
typing-inspection==0.4.1
@ -172,4 +161,4 @@ watchdog==6.0.0
websockets==15.0.1
wrapt==1.17.3
xxhash==3.5.0
yarl==1.20.1
yarl==1.20.1