ai-voicebot/voicebot/test_whisper_pipeline.py

111 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify if the issue is with audio processing or the transcription pipeline.
"""
import numpy as np
import time
import sys
import os
# Put this script's own directory on the import path so the ``bots`` package
# can be imported when the file is executed directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from bots.whisper import (
        extract_input_features,
        _pt_model,
        _processor,
        sample_rate,
    )
except ImportError as import_error:
    # Without the Whisper components nothing below can run; report and bail out.
    print(f"Error importing whisper components: {import_error}")
    print("Make sure you're running this from the voicebot directory")
    sys.exit(1)
def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Build a float32 sine tone whose loudness is modulated at 2 Hz.

    Args:
        duration_seconds: Length of the clip in seconds.
        frequency: Frequency of the tone in Hz.

    Returns:
        A float32 array holding ``int(duration_seconds * sample_rate)`` samples.
    """
    num_samples = int(duration_seconds * sample_rate)
    # endpoint=False: the time axis covers [0, duration_seconds) and never
    # lands on the end time itself.
    timeline = np.linspace(0, duration_seconds, num_samples, endpoint=False)

    # Slow 2 Hz envelope (0.05 .. 0.15) to give the tone a speech-like rise and fall.
    envelope = 0.1 * (1 + 0.5 * np.sin(2 * np.pi * 2 * timeline))
    carrier = np.sin(2 * np.pi * frequency * timeline)

    return (envelope * carrier).astype(np.float32)
def test_transcription_pipeline():
    """Run the transcription check over four synthetic clips, in a fixed order.

    The clips are: pure silence, low-level Gaussian noise, a single 440 Hz
    tone, and the average of 220/440/880 Hz tones. For each clip the banner is
    printed first, then the clip is built and handed to
    ``test_audio_transcription``.
    """
    print("Testing Whisper transcription pipeline...")

    clip_seconds = 2.0
    clip_samples = int(sample_rate * clip_seconds)

    def _silence() -> np.ndarray:
        return np.zeros(clip_samples, dtype=np.float32)

    def _quiet_noise() -> np.ndarray:
        return np.random.normal(0, 0.001, clip_samples).astype(np.float32)

    def _single_tone() -> np.ndarray:
        # A plain tone; should produce some output.
        return generate_test_audio(clip_seconds, 440.0)

    def _tone_mix() -> np.ndarray:
        low = generate_test_audio(clip_seconds, 220.0)
        mid = generate_test_audio(clip_seconds, 440.0)
        high = generate_test_audio(clip_seconds, 880.0)
        return (low + mid + high) / 3.0

    # (banner, clip factory, label) -- factories keep clip creation lazy so each
    # clip is built only after its banner has been printed.
    cases = (
        ("\n=== Test 1: Complete Silence ===", _silence, "Silent audio"),
        ("\n=== Test 2: Very Quiet Noise ===", _quiet_noise, "Quiet noise"),
        ("\n=== Test 3: Sine Wave ===", _single_tone, "Sine wave"),
        ("\n=== Test 4: Complex Sine Wave ===", _tone_mix, "Complex sine wave"),
    )
    for banner, make_clip, label in cases:
        print(banner)
        test_audio_transcription(make_clip(), label)
def test_audio_transcription(audio_array: np.ndarray, description: str):
    """Push one audio clip through feature extraction, generation and decoding.

    Prints basic signal statistics, the time spent in each of the three
    stages, and the decoded text. Any exception raised along the way is caught
    and printed as an ``ERROR`` line, so a failing clip does not prevent the
    caller from trying the next one.

    Args:
        audio_array: Audio samples to transcribe; treated as sampled at the
            module-level ``sample_rate``.
        description: Human-readable label used in the printed report.
    """
    try:
        # Signal statistics
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))
        print(f"Testing {description}:")
        print(f" Duration: {duration:.2f}s")
        print(f" Samples: {len(audio_array)}")
        print(f" RMS: {rms:.6f}")
        print(f" Peak: {peak:.6f}")

        # Stage timings use perf_counter(): it is monotonic and high-resolution,
        # whereas time.time() is a wall clock that can jump when the system
        # clock is adjusted.

        # Stage 1: feature extraction
        start_time = time.perf_counter()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.perf_counter() - start_time
        print(f" Feature extraction: {feature_time:.3f}s")

        # Stage 2: model inference
        start_time = time.perf_counter()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.perf_counter() - start_time
        print(f" Model inference: {inference_time:.3f}s")

        # Stage 3: decoding token ids to text
        start_time = time.perf_counter()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.perf_counter() - start_time
        print(f" Decoding: {decoding_time:.3f}s")

        # Report the first decoded string, or an empty result if nothing came back.
        text = transcription[0].strip() if transcription else ""
        print(f" Result: '{text}'" if text else " Result: (empty)")
        print(f" Result length: {len(text)}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic script and every clip should
        # be attempted even if an earlier one blows up.
        print(f" ERROR: {e}")
# Run the full set of synthetic-audio checks only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    test_transcription_pipeline()