#!/usr/bin/env python3
"""
|
|
Debug script to test Whisper transcription with synthetic audio.
|
|
This helps identify if the issue is with audio processing or the transcription pipeline.
|
|
"""
|
|
|
|
import numpy as np
|
|
import time
|
|
import sys
|
|
import os
|
|
|
|
# Put the directory that contains this script on the import path so the
# `bots.whisper` module can be looked up from there.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from bots.whisper import (
        _processor,
        _pt_model,
        extract_input_features,
        sample_rate,
    )
except ImportError as import_error:
    # None of the checks below can run without the Whisper helpers, so
    # report the problem and stop with a non-zero exit status.
    print(f"Error importing whisper components: {import_error}")
    print("Make sure you're running this from the voicebot directory")
    raise SystemExit(1)
|
|
|
|
|
|
def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Build an amplitude-modulated sine tone for exercising the pipeline.

    Args:
        duration_seconds: Length of the generated tone, in seconds.
        frequency: Frequency of the sine carrier, in Hz.

    Returns:
        A float32 array holding ``int(duration_seconds * sample_rate)``
        samples.
    """
    num_samples = int(duration_seconds * sample_rate)
    # endpoint=False leaves the stop value out of the time axis.
    timeline = np.linspace(0, duration_seconds, num_samples, endpoint=False)

    # A slow 2 Hz envelope (gain swinging between 0.05 and 0.15) gives the
    # tone a loosely speech-like loudness pattern.
    envelope = 0.1 * (1 + 0.5 * np.sin(2 * np.pi * 2 * timeline))
    carrier = np.sin(2 * np.pi * frequency * timeline)

    return (envelope * carrier).astype(np.float32)
|
|
|
|
|
|
def test_transcription_pipeline():
    """Run the Whisper pipeline against four synthetic signals in turn.

    The signals are two seconds each of: pure silence, very quiet Gaussian
    noise, a single modulated 440 Hz tone, and the average of three modulated
    tones (220/440/880 Hz). Each one is announced with a banner and handed to
    ``test_audio_transcription``.
    """
    print("Testing Whisper transcription pipeline...")

    frame_count = int(sample_rate * 2.0)

    # Each scenario is (banner, label, factory). The audio is built lazily so
    # that the banner is printed before the corresponding signal is created.
    scenarios = (
        (
            "\n=== Test 1: Complete Silence ===",
            "Silent audio",
            lambda: np.zeros(frame_count, dtype=np.float32),
        ),
        (
            "\n=== Test 2: Very Quiet Noise ===",
            "Quiet noise",
            lambda: np.random.normal(0, 0.001, frame_count).astype(np.float32),
        ),
        (
            # A plain tone; should produce some output.
            "\n=== Test 3: Sine Wave ===",
            "Sine wave",
            lambda: generate_test_audio(2.0, 440.0),
        ),
        (
            # Three tones an octave apart, averaged together.
            "\n=== Test 4: Complex Sine Wave ===",
            "Complex sine wave",
            lambda: (
                generate_test_audio(2.0, 220.0)
                + generate_test_audio(2.0, 440.0)
                + generate_test_audio(2.0, 880.0)
            )
            / 3.0,
        ),
    )

    for banner, label, make_audio in scenarios:
        print(banner)
        test_audio_transcription(make_audio(), label)
|
|
|
|
|
|
def test_audio_transcription(audio_array: np.ndarray, description: str):
    """Run one audio buffer through feature extraction, inference and decoding.

    Prints basic signal statistics (duration, sample count, RMS, peak), the
    elapsed time of each pipeline stage, and the decoded text. Any exception
    raised along the way is caught and printed rather than propagated, so a
    failing case does not stop the caller from running further cases.

    Args:
        audio_array: Audio samples to transcribe. Presumably a 1-D float32
            buffer sampled at ``sample_rate`` Hz -- confirm against
            ``extract_input_features``.
        description: Human-readable label printed ahead of the results.
    """
    try:
        # Basic signal metrics, useful for spotting silent or clipped input.
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))

        print(f"Testing {description}:")
        print(f" Duration: {duration:.2f}s")
        print(f" Samples: {len(audio_array)}")
        print(f" RMS: {rms:.6f}")
        print(f" Peak: {peak:.6f}")

        # Stage timings use perf_counter(): it is monotonic and
        # high-resolution, whereas time.time() follows the wall clock and can
        # jump when the system time is adjusted.

        # Stage 1: feature extraction
        start_time = time.perf_counter()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.perf_counter() - start_time
        print(f" Feature extraction: {feature_time:.3f}s")

        # Stage 2: model inference
        start_time = time.perf_counter()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.perf_counter() - start_time
        print(f" Model inference: {inference_time:.3f}s")

        # Stage 3: decoding token ids back to text
        start_time = time.perf_counter()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.perf_counter() - start_time
        print(f" Decoding: {decoding_time:.3f}s")

        # Only the first decoded entry is reported; an empty result yields "".
        text = transcription[0].strip() if transcription else ""
        print(f" Result: '{text}'" if text else " Result: (empty)")
        print(f" Result length: {len(text)}")

    except Exception as e:
        # Deliberately broad: this is a diagnostic script and the remaining
        # cases should still run after a failure.
        print(f" ERROR: {e}")
|
|
|
|
|
|
# Run the diagnostics only when executed as a script, not when imported.
if __name__ == "__main__":
    test_transcription_pipeline()
|