TTS is working, sometimes

James Ketr 2025-09-19 13:39:18 -07:00
parent 30886d9fa8
commit 86f0b9e29b


@@ -234,9 +234,16 @@ class VibeVoiceTTS:
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=load_dtype,
-                device_map="xpu",
                 attn_implementation=attn_impl_primary,
+                device_map={"": self.device},  # Ensure XPU is used
+                low_cpu_mem_usage=True,  # Optimized for Intel GPUs
             )
+            try:
+                import intel_extension_for_pytorch as ipex
+                logger.info("Applying IPEX optimizations")
+                self.model = ipex.optimize(self.model, dtype=torch.bfloat16, inplace=True)
+            except ImportError:
+                logger.info("intel_extension_for_pytorch not found, proceeding without IPEX optimizations.")
         else:  # cpu
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
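The XPU branch above pins every module to one device via device_map and then tries to wrap the loaded model in ipex.optimize, falling back quietly when Intel Extension for PyTorch is absent. A minimal sketch of that pattern, assuming a Hugging Face-style model class; the helper name load_model_for_xpu and the sdpa attention choice are illustrative, not part of this commit:

    import logging
    import torch

    logger = logging.getLogger(__name__)

    def load_model_for_xpu(model_cls, model_path: str, dtype=torch.bfloat16):
        # Prefer Intel XPU when PyTorch exposes it, otherwise stay on CPU.
        device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"
        model = model_cls.from_pretrained(
            model_path,
            torch_dtype=dtype,
            attn_implementation="sdpa",   # assumed backend; the commit uses attn_impl_primary
            device_map={"": device},      # pin all modules to a single device
            low_cpu_mem_usage=True,
        )
        model.eval()
        try:
            import intel_extension_for_pytorch as ipex
            logger.info("Applying IPEX optimizations")
            model = ipex.optimize(model, dtype=dtype, inplace=True)
        except ImportError:
            logger.info("intel_extension_for_pytorch not found; skipping IPEX optimizations")
        return model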
@@ -399,10 +406,9 @@ class VibeVoiceTTS:
         )

         # Move tensors to target device
-        target_device = self.device if self.device != "cpu" else "cpu"
         for k, v in inputs.items():
             if torch.is_tensor(v):
-                inputs[k] = v.to(target_device)
+                inputs[k] = v.to(self.device, non_blocking=True)

         if verbose:
             logger.info(f"Starting generation with cfg_scale: {cfg_scale}")
@@ -441,22 +447,25 @@ class VibeVoiceTTS:
                 logger.info(f"Generated tokens: {generated_tokens}")
                 logger.info(f"Total tokens: {output_tokens}")

-        # Return audio data as numpy array
         # Return audio data as numpy array
         if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
-            audio_tensor = outputs.speech_outputs[0]
-            # Convert to numpy array on CPU, ensuring compatible dtype
-            if hasattr(audio_tensor, 'cpu'):
-                audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
-            else:
-                audio_data = np.array(audio_tensor, dtype=np.float32)
-            # Ensure it's a 1D array
-            if audio_data.ndim > 1:
-                audio_data = audio_data.squeeze()
-            return audio_data
+            audio_tensor = outputs.speech_outputs[0].to("cpu", non_blocking=True).float()
+            audio_data = audio_tensor.numpy()
+            return audio_data.squeeze()
+            # audio_tensor = outputs.speech_outputs[0]
+            # # Convert to numpy array on CPU, ensuring compatible dtype
+            # if hasattr(audio_tensor, 'cpu'):
+            #     audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
+            # else:
+            #     audio_data = np.array(audio_tensor, dtype=np.float32)
+            # # Ensure it's a 1D array
+            # if audio_data.ndim > 1:
+            #     audio_data = audio_data.squeeze()
+            # return audio_data
         else:
             raise RuntimeError("No audio output generated")
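The last hunk replaces the old hasattr-based conversion with a direct copy back to host memory: cast to float32 (numpy has no bfloat16 dtype), convert to a numpy array, and squeeze to a 1-D waveform. A sketch of the same conversion, assuming the generated speech is a torch tensor; to_waveform is an illustrative name, and it uses a plain blocking copy for simplicity:

    import numpy as np
    import torch

    def to_waveform(speech: torch.Tensor) -> np.ndarray:
        # Detach from the graph, bring to CPU, cast to float32, then to numpy.
        audio = speech.detach().to("cpu").float().numpy()
        return audio.squeeze()  # drop batch/channel dims -> 1-D waveform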