TTS is working, sometimes
This commit is contained in:
parent 30886d9fa8
commit 86f0b9e29b
@@ -234,9 +234,16 @@ class VibeVoiceTTS:
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=load_dtype,
-                device_map="xpu",
                 attn_implementation=attn_impl_primary,
+                device_map={"": self.device},  # Ensure XPU is used
+                low_cpu_mem_usage=True,  # Optimized for Intel GPUs
             )
+            try:
+                import intel_extension_for_pytorch as ipex
+                logger.info("Applying IPEX optimizations")
+                self.model = ipex.optimize(self.model, dtype=torch.bfloat16, inplace=True)
+            except ImportError:
+                logger.info("intel_extension_for_pytorch not found, proceeding without IPEX optimizations.")
         else:  # cpu
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
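Pulled out of the diff, the new XPU loading path looks roughly like the sketch below. It is a standalone approximation, not the commit's exact code: AutoModelForCausalLM stands in for VibeVoiceForConditionalGenerationInference, and model_path / device are placeholders for what the class keeps in self.model_path / self.device.

import torch
from transformers import AutoModelForCausalLM  # stand-in for the VibeVoice inference class

model_path = "path/to/model"  # placeholder
device = "xpu"                # Intel GPU device string used by this commit

# device_map={"": device} places the whole model on a single device (requires accelerate);
# low_cpu_mem_usage avoids materializing a full copy of the weights in host RAM first.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map={"": device},
    low_cpu_mem_usage=True,
)

# IPEX is optional: apply its kernel/layout optimizations only when it is installed.
try:
    import intel_extension_for_pytorch as ipex
    model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
except ImportError:
    pass  # stock PyTorch kernels are used instead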
@@ -399,10 +406,9 @@ class VibeVoiceTTS:
             )

             # Move tensors to target device
-            target_device = self.device if self.device != "cpu" else "cpu"
             for k, v in inputs.items():
                 if torch.is_tensor(v):
-                    inputs[k] = v.to(target_device)
+                    inputs[k] = v.to(self.device, non_blocking=True)

             if verbose:
                 logger.info(f"Starting generation with cfg_scale: {cfg_scale}")
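This hunk drops the redundant target_device indirection (v.to("cpu") is already a no-op on CPU tensors) and adds non_blocking=True to the copy. In isolation the pattern is just a loop over the processor's output dict; the helper name move_to_device below is made up purely for illustration.

import torch

def move_to_device(inputs: dict, device: str) -> dict:
    # Move every tensor value onto the target device; leave non-tensor values untouched.
    # non_blocking=True lets host-to-device copies overlap with other work when the
    # source memory is pinned; otherwise it degrades to an ordinary blocking copy.
    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to(device, non_blocking=True)
    return inputs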
@@ -441,22 +447,25 @@ class VibeVoiceTTS:
                 logger.info(f"Generated tokens: {generated_tokens}")
                 logger.info(f"Total tokens: {output_tokens}")

-            # Return audio data as numpy array
             # Return audio data as numpy array
             if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
-                audio_tensor = outputs.speech_outputs[0]
+                audio_tensor = outputs.speech_outputs[0].to("cpu", non_blocking=True).float()
+                audio_data = audio_tensor.numpy()
+                return audio_data.squeeze()

-                # Convert to numpy array on CPU, ensuring compatible dtype
-                if hasattr(audio_tensor, 'cpu'):
-                    audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
-                else:
-                    audio_data = np.array(audio_tensor, dtype=np.float32)
+                # audio_tensor = outputs.speech_outputs[0]
+
+                # # Convert to numpy array on CPU, ensuring compatible dtype
+                # if hasattr(audio_tensor, 'cpu'):
+                #     audio_data = audio_tensor.cpu().float().numpy()  # Convert to float32 first
+                # else:
+                #     audio_data = np.array(audio_tensor, dtype=np.float32)

-                # Ensure it's a 1D array
-                if audio_data.ndim > 1:
-                    audio_data = audio_data.squeeze()
+                # # Ensure it's a 1D array
+                # if audio_data.ndim > 1:
+                #     audio_data = audio_data.squeeze()

-                return audio_data
+                # return audio_data
             else:
                 raise RuntimeError("No audio output generated")
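The reworked return path replaces the hasattr/ndim juggling with a single chain. The detail that matters is the .float() cast: NumPy has no bfloat16 dtype, so a bf16 tensor must become float32 before .numpy(). A minimal sketch, with a synthetic tensor standing in for outputs.speech_outputs[0]:

import torch

# Synthetic stand-in: bf16 audio of shape (1, num_samples), as the model might return it.
speech = torch.randn(1, 24_000, dtype=torch.bfloat16)

audio = (
    speech.to("cpu", non_blocking=True)  # no-op here; moves an XPU/CUDA tensor to host in the real code
    .float()                             # bfloat16 -> float32; NumPy cannot hold bfloat16
    .numpy()                             # NumPy array backed by the CPU float32 tensor
    .squeeze()                           # drop the leading batch dimension -> 1-D waveform
)

One caveat: a non_blocking device-to-host copy can return before the data has fully landed, so code that reads the array immediately (as this chain does) may need an explicit device synchronize, or simply non_blocking=False.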