Move vibevoice to GPU
Parent: 781748a182
Commit: 30886d9fa8

(One file's diff is suppressed because it is too large to display.)
@@ -5,9 +5,47 @@ from typing import Any, List, Tuple, Union, Optional
 import time
 import torch
 import numpy as np
 import sys
 
-from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
-from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+# Defer importing the external `vibevoice` package until we actually need it.
+# In some environments the `vibevoice` package isn't installed into site-packages
+# but the repo contains a local copy under `voicebot/VibeVoice`. Attempt a lazy
+# import and, if that fails, add the local path(s) to sys.path and retry.
+
+
+def _import_vibevoice_symbols():
+    try:
+        from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+        from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+        return VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
+    except Exception:
+        # If a required package (like `diffusers`) is missing inside the
+        # container or venv, importing deeper VibeVoice modules will raise
+        # ModuleNotFoundError. Detect that and raise a clearer error that
+        # includes install instructions.
+        import traceback as _tb
+        exc_type, exc_val, exc_tb = _tb.sys.exc_info()
+        if isinstance(exc_val, ModuleNotFoundError):
+            missing = str(exc_val).split("'")[1] if "'" in str(exc_val) else str(exc_val)
+            raise ModuleNotFoundError(
+                f"Missing dependency when importing VibeVoice: {missing}.\n"
+                "Install required packages inside the voicebot container.\n"
+                "Example (inside container):\n"
+                "  PYTHONPATH=/shared:/voicebot uv run python3 -m pip install diffusers accelerate safetensors\n"
+                "Or add the packages to the voicebot service environment / pyproject and rebuild."
+            ) from exc_val
+        # Try adding likely repository-local paths where VibeVoice lives
+        base = os.path.dirname(__file__)  # voicebot/bots
+        candidates = [
+            os.path.abspath(os.path.join(base, "..", "VibeVoice")),
+            os.path.abspath(os.path.join("/", "voicebot", "VibeVoice")),
+        ]
+        for p in candidates:
+            if os.path.isdir(p) and p not in sys.path:
+                sys.path.insert(0, p)
+        # Retry import
+        from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+        from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+        return VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
 
 from shared.logger import logger
@@ -155,6 +193,8 @@ class VibeVoiceTTS:
     def _load_model(self):
        """Load the model and processor with device-specific configuration."""
         logger.info(f"Loading processor & model from {self.model_path}")
+        # Ensure external vibevoice symbols are available (lazy import)
+        VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor = _import_vibevoice_symbols()
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
 
         # Decide dtype & attention implementation
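Note: the hunk above cuts off at the dtype and attention-implementation decision, which is where the actual GPU move happens in this commit. As a rough sketch only (the helper name, the bfloat16 choice, and the XPU-first ordering are assumptions, not the committed code), a device/dtype selection aimed at the +xpu wheels added in the requirements below could look like this:

import torch

def _pick_device_and_dtype():
    # Hypothetical helper, not from this commit: prefer Intel XPU,
    # then CUDA, then CPU; use bfloat16 on accelerators, float32 on CPU.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device("xpu"), torch.bfloat16
    if torch.cuda.is_available():
        return torch.device("cuda"), torch.bfloat16
    return torch.device("cpu"), torch.float32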
@@ -401,15 +441,16 @@ class VibeVoiceTTS:
         logger.info(f"Generated tokens: {generated_tokens}")
         logger.info(f"Total tokens: {output_tokens}")
 
         # Return audio data as numpy array
+        # Return audio data as numpy array
         if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
             audio_tensor = outputs.speech_outputs[0]
 
-            # Convert to numpy array on CPU
+            # Convert to numpy array on CPU, ensuring compatible dtype
             if hasattr(audio_tensor, 'cpu'):
-                audio_data = audio_tensor.cpu().numpy()
+                audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
             else:
-                audio_data = np.array(audio_tensor)
+                audio_data = np.array(audio_tensor, dtype=np.float32)
 
             # Ensure it's a 1D array
             if audio_data.ndim > 1:
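Note on the added .float() call: NumPy has no bfloat16 dtype, so calling .numpy() directly on a bf16 model output raises a TypeError; casting to float32 on the CPU first avoids that. A small illustration, not part of the commit:

import torch

t = torch.zeros(4, dtype=torch.bfloat16)
# t.numpy() would raise TypeError here: NumPy cannot represent bfloat16
audio = t.cpu().float().numpy()  # float32 ndarray, safe to hand to audio code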
@@ -1,3 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/xpu
 about-time==4.2.1
 aiofiles==24.1.0
 aiohappyeyeballs==2.6.1
@@ -74,20 +75,6 @@ ninja==1.13.0
 nncf==2.18.0
 numba==0.61.2
 numpy==2.2.6
-nvidia-cublas-cu12==12.8.4.1
-nvidia-cuda-cupti-cu12==12.8.90
-nvidia-cuda-nvrtc-cu12==12.8.93
-nvidia-cuda-runtime-cu12==12.8.90
-nvidia-cudnn-cu12==9.10.2.21
-nvidia-cufft-cu12==11.3.3.83
-nvidia-cufile-cu12==1.13.1.3
-nvidia-curand-cu12==10.3.9.90
-nvidia-cusolver-cu12==11.7.3.90
-nvidia-cusparse-cu12==12.5.8.93
-nvidia-cusparselt-cu12==0.7.1
-nvidia-nccl-cu12==2.27.3
-nvidia-nvjitlink-cu12==12.8.93
-nvidia-nvtx-cu12==12.8.90
 onnx==1.19.0
 openai==1.107.2
 openai-whisper @ git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
@@ -157,11 +144,13 @@ threadpoolctl==3.6.0
 tiktoken==0.11.0
 tokenizers==0.21.4
 tomlkit==0.13.3
-torch==2.8.0
-torchvision==0.23.0
-tqdm==4.67.1
+torch==2.8.0+xpu
+torchvision==0.23.0+xpu
+torchaudio==2.8.0+xpu
 transformers==4.53.3
 triton==3.4.0
+diffusers
+accelerate
 typer==0.17.4
 typing-extensions==4.15.0
 typing-inspection==0.4.1
@@ -172,4 +161,4 @@ watchdog==6.0.0
 websockets==15.0.1
 wrapt==1.17.3
 xxhash==3.5.0
-yarl==1.20.1
+yarl==1.20.1
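Note: the requirements changes swap the CUDA-specific wheels (the nvidia-* pins) for the Intel XPU index and the +xpu builds of torch, torchvision, and torchaudio, and add diffusers and accelerate, which the VibeVoice import path expects (the error message in the first hunk suggests installing them). A quick post-install sanity check, as a sketch (it assumes the installed +xpu build exposes the torch.xpu backend; the exact container invocation is not specified here):

import torch

print(torch.__version__)  # expected to end in "+xpu"
if hasattr(torch, "xpu") and torch.xpu.is_available():
    x = torch.ones(2, 2, device="xpu")
    print("XPU available:", x.device)  # e.g. xpu:0
else:
    print("No XPU device visible; the model will fall back to CPU")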