#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify if the issue is with audio processing or the transcription pipeline.
"""

import os
import sys
import time
import traceback

import numpy as np

# Add this script's directory (expected to be the voicebot directory) to the
# import path so that the `bots.whisper` import below can resolve.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
    print(f"Error importing whisper components: {e}")
    print("Make sure you're running this from the voicebot directory")
    sys.exit(1)


def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Generate a synthetic, amplitude-modulated sine wave for testing.

    Args:
        duration_seconds: Length of the generated clip in seconds.
        frequency: Frequency of the sine carrier in Hz.

    Returns:
        A float32 array holding ``int(duration_seconds * sample_rate)`` samples.
    """
    samples = int(duration_seconds * sample_rate)
    # endpoint=False keeps the sample spacing at exactly 1 / sample_rate.
    t = np.linspace(0, duration_seconds, samples, endpoint=False)

    # Amplitude modulation at 2Hz to simulate speech-like patterns; the
    # envelope swings between 0.05 and 0.15.
    amplitude = 0.1 * (1 + 0.5 * np.sin(2 * np.pi * 2 * t))
    audio = amplitude * np.sin(2 * np.pi * frequency * t)

    return audio.astype(np.float32)


def test_transcription_pipeline() -> None:
    """Run the Whisper transcription pipeline against several synthetic clips.

    Each clip is two seconds long and is passed to
    ``test_audio_transcription``, which prints its own report.
    """
    print("Testing Whisper transcription pipeline...")

    # Test 1: Complete silence
    print("\n=== Test 1: Complete Silence ===")
    silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
    test_audio_transcription(silent_audio, "Silent audio")

    # Test 2: Very quiet noise
    print("\n=== Test 2: Very Quiet Noise ===")
    quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
    test_audio_transcription(quiet_noise, "Quiet noise")

    # Test 3: Sine wave (should produce some output)
    print("\n=== Test 3: Sine Wave ===")
    sine_audio = generate_test_audio(2.0, 440.0)
    test_audio_transcription(sine_audio, "Sine wave")

    # Test 4: Multiple frequency sine wave (average of three tones)
    print("\n=== Test 4: Complex Sine Wave ===")
    complex_audio = (
        generate_test_audio(2.0, 220.0)
        + generate_test_audio(2.0, 440.0)
        + generate_test_audio(2.0, 880.0)
    ) / 3.0
    test_audio_transcription(complex_audio, "Complex sine wave")


def test_audio_transcription(audio_array: np.ndarray, description: str) -> None:
    """Transcribe one audio array and print metrics, stage timings and result.

    Failures are reported on stdout (plus a traceback on stderr) rather than
    raised, so that one failing clip does not stop the remaining tests.

    Args:
        audio_array: Audio samples; the duration is derived from its length
            and the imported ``sample_rate``.
        description: Human-readable label used in the printed report.
    """
    try:
        # An empty array has no RMS/peak (np.max raises on zero-size input),
        # so report it explicitly instead of failing with an opaque error.
        if len(audio_array) == 0:
            print(f"Testing {description}:")
            print(" Skipped: audio array is empty")
            return

        # Calculate metrics
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))

        print(f"Testing {description}:")
        print(f" Duration: {duration:.2f}s")
        print(f" Samples: {len(audio_array)}")
        print(f" RMS: {rms:.6f}")
        print(f" Peak: {peak:.6f}")

        # perf_counter is monotonic and high-resolution, so the measured
        # intervals are not distorted by wall-clock adjustments.

        # Test feature extraction
        start_time = time.perf_counter()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.perf_counter() - start_time
        print(f" Feature extraction: {feature_time:.3f}s")

        # Test model inference
        start_time = time.perf_counter()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.perf_counter() - start_time
        print(f" Model inference: {inference_time:.3f}s")

        # Test decoding
        start_time = time.perf_counter()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.perf_counter() - start_time
        print(f" Decoding: {decoding_time:.3f}s")

        # Show result
        text = transcription[0].strip() if transcription else ""
        print(f" Result: '{text}'" if text else " Result: (empty)")
        print(f" Result length: {len(text)}")

    except Exception as e:
        # Deliberately broad: this is a best-effort debug report. The
        # traceback shows which pipeline stage actually failed.
        print(f" ERROR: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    test_transcription_pipeline()