TTS is working, sometimes
parent 30886d9fa8
commit 86f0b9e29b
@@ -234,9 +234,16 @@ class VibeVoiceTTS:
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=load_dtype,
-                device_map="xpu",
                 attn_implementation=attn_impl_primary,
+                device_map={"": self.device},  # Ensure XPU is used
+                low_cpu_mem_usage=True,  # Optimized for Intel GPUs
             )
+            try:
+                import intel_extension_for_pytorch as ipex
+                logger.info("Applying IPEX optimizations")
+                self.model = ipex.optimize(self.model, dtype=torch.bfloat16, inplace=True)
+            except ImportError:
+                logger.info("intel_extension_for_pytorch not found, proceeding without IPEX optimizations.")
         else:  # cpu
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
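For context on the XPU branch above: device_map={"": device} pins the whole model to a single device instead of letting it be sharded, and ipex.optimize is applied only when intel_extension_for_pytorch can be imported. Below is a minimal standalone sketch of that loading pattern, assuming a PyTorch build with XPU support and using a placeholder AutoModelForCausalLM class and model id rather than the repository's VibeVoice classes.

import torch
from transformers import AutoModelForCausalLM  # placeholder class, not the VibeVoice model

# Pick the Intel GPU when the XPU backend is present, otherwise fall back to CPU.
device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"

# device_map={"": device} places every module on that one device (no sharding);
# low_cpu_mem_usage avoids materialising a full extra copy of the weights in RAM.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",          # hypothetical model id
    torch_dtype=torch.bfloat16,
    device_map={"": device},
    low_cpu_mem_usage=True,
)
model.eval()

# IPEX is optional: use its inference optimizations when installed, otherwise skip.
try:
    import intel_extension_for_pytorch as ipex
    model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
except ImportError:
    pass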
@@ -399,10 +406,9 @@ class VibeVoiceTTS:
         )

         # Move tensors to target device
-        target_device = self.device if self.device != "cpu" else "cpu"
         for k, v in inputs.items():
             if torch.is_tensor(v):
-                inputs[k] = v.to(target_device)
+                inputs[k] = v.to(self.device, non_blocking=True)

         if verbose:
             logger.info(f"Starting generation with cfg_scale: {cfg_scale}")
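The change above replaces the per-tensor .to(target_device) with a non-blocking copy straight to self.device. A small generic sketch of that pattern follows; the move_inputs helper and the example batch are hypothetical, not part of the repository.

import torch

def move_inputs(inputs: dict, device) -> dict:
    # Move tensor values to the target device; leave non-tensor entries untouched.
    # non_blocking only overlaps the host-to-device copy when the source tensor
    # lives in pinned memory; otherwise it behaves like a regular synchronous copy.
    return {
        k: v.to(device, non_blocking=True) if torch.is_tensor(v) else v
        for k, v in inputs.items()
    }

device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"
batch = {
    "input_ids": torch.randint(0, 100, (1, 16)),
    "attention_mask": torch.ones(1, 16, dtype=torch.long),
    "sampling_rate": 24000,  # passes through unchanged
}
batch = move_inputs(batch, device)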
@@ -441,22 +447,25 @@ class VibeVoiceTTS:
             logger.info(f"Generated tokens: {generated_tokens}")
             logger.info(f"Total tokens: {output_tokens}")

-        # Return audio data as numpy array
+        # Return audio data as numpy array
         if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
-            audio_tensor = outputs.speech_outputs[0]
+            audio_tensor = outputs.speech_outputs[0].to("cpu", non_blocking=True).float()
+            audio_data = audio_tensor.numpy()
+            return audio_data.squeeze()

+            # audio_tensor = outputs.speech_outputs[0]

-            # Convert to numpy array on CPU, ensuring compatible dtype
-            if hasattr(audio_tensor, 'cpu'):
-                audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
-            else:
-                audio_data = np.array(audio_tensor, dtype=np.float32)
+            # # Convert to numpy array on CPU, ensuring compatible dtype
+            # if hasattr(audio_tensor, 'cpu'):
+            #     audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
+            # else:
+            #     audio_data = np.array(audio_tensor, dtype=np.float32)

-            # Ensure it's a 1D array
-            if audio_data.ndim > 1:
-                audio_data = audio_data.squeeze()
+            # # Ensure it's a 1D array
+            # if audio_data.ndim > 1:
+            #     audio_data = audio_data.squeeze()

-            return audio_data
+            # return audio_data
         else:
             raise RuntimeError("No audio output generated")
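On the new return path above: numpy cannot represent bfloat16 and cannot read device memory, so the speech tensor is moved to host and upcast to float32 before .numpy(). A hedged sketch with a dummy tensor standing in for outputs.speech_outputs[0] (shape and dtype are assumptions):

import numpy as np
import torch

# Dummy stand-in for outputs.speech_outputs[0]; real shape/dtype are assumptions.
speech = torch.randn(1, 24000, dtype=torch.bfloat16)

# .numpy() raises on bfloat16 and on non-CPU tensors, so: host copy -> float32 -> numpy,
# then squeeze the leading batch dimension. non_blocking is a no-op for this CPU tensor
# and is kept only to mirror the diff above.
audio = speech.to("cpu", non_blocking=True).float().numpy().squeeze()

assert audio.dtype == np.float32 and audio.ndim == 1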