diff --git a/voicebot/bots/vibevoicetts.py b/voicebot/bots/vibevoicetts.py index a05bea4..9e6e26b 100644 --- a/voicebot/bots/vibevoicetts.py +++ b/voicebot/bots/vibevoicetts.py @@ -234,9 +234,16 @@ class VibeVoiceTTS: self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( self.model_path, torch_dtype=load_dtype, - device_map="xpu", attn_implementation=attn_impl_primary, + device_map={"": self.device}, # Ensure XPU is used + low_cpu_mem_usage=True, # Optimized for Intel GPUs ) + try: + import intel_extension_for_pytorch as ipex + logger.info("Applying IPEX optimizations") + self.model = ipex.optimize(self.model, dtype=torch.bfloat16, inplace=True) + except ImportError: + logger.info("intel_extension_for_pytorch not found, proceeding without IPEX optimizations.") else: # cpu self.model = VibeVoiceForConditionalGenerationInference.from_pretrained( self.model_path, @@ -399,10 +406,9 @@ class VibeVoiceTTS: ) # Move tensors to target device - target_device = self.device if self.device != "cpu" else "cpu" for k, v in inputs.items(): if torch.is_tensor(v): - inputs[k] = v.to(target_device) + inputs[k] = v.to(self.device, non_blocking=True) if verbose: logger.info(f"Starting generation with cfg_scale: {cfg_scale}") @@ -441,22 +447,25 @@ class VibeVoiceTTS: logger.info(f"Generated tokens: {generated_tokens}") logger.info(f"Total tokens: {output_tokens}") - # Return audio data as numpy array # Return audio data as numpy array if outputs.speech_outputs and outputs.speech_outputs[0] is not None: - audio_tensor = outputs.speech_outputs[0] + audio_tensor = outputs.speech_outputs[0].to("cpu", non_blocking=True).float() + audio_data = audio_tensor.numpy() + return audio_data.squeeze() + + # audio_tensor = outputs.speech_outputs[0] - # Convert to numpy array on CPU, ensuring compatible dtype - if hasattr(audio_tensor, 'cpu'): - audio_data = audio_tensor.cpu().float().numpy() # Convert to float32 first - else: - audio_data = np.array(audio_tensor, dtype=np.float32) + # # Convert to numpy array on CPU, ensuring compatible dtype + # if hasattr(audio_tensor, 'cpu'): + # audio_data = audio_tensor.cpu().float().numpy() # Convert to float32 first + # else: + # audio_data = np.array(audio_tensor, dtype=np.float32) - # Ensure it's a 1D array - if audio_data.ndim > 1: - audio_data = audio_data.squeeze() + # # Ensure it's a 1D array + # if audio_data.ndim > 1: + # audio_data = audio_data.squeeze() - return audio_data + # return audio_data else: raise RuntimeError("No audio output generated")