# ai-voicebot/voicebot/test_whisper_pipeline.py

#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify if the issue is with audio processing or the transcription pipeline.
"""

import numpy as np
import time
import sys
import os

# Add the directory containing this script (the voicebot directory) to the
# module search path before importing from the `bots` package below.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    # Names used by the functions in this script: the feature-extraction helper,
    # the model and processor objects, and the sample rate used to size and
    # describe the synthetic clips.
    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
    # Nothing below can run without these names: report the failure and exit
    # with a non-zero status.
    print(f"Error importing whisper components: {e}")
    print("Make sure you're running this from the voicebot directory")
    sys.exit(1)


def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Return a float32 sine tone whose loudness wobbles at 2 Hz.

    The clip holds ``int(duration_seconds * sample_rate)`` samples, using the
    module-level ``sample_rate``. The carrier is a sine at ``frequency`` Hz;
    its envelope swings between 0.05 and 0.15 to give the tone a loosely
    speech-like rise and fall.

    Args:
        duration_seconds: Length of the clip in seconds.
        frequency: Carrier frequency in Hz.

    Returns:
        A 1-D ``np.float32`` array of samples.
    """
    n_samples = int(duration_seconds * sample_rate)
    # Sample instants in seconds; the end point is excluded so the spacing is
    # exactly duration_seconds / n_samples.
    timeline = np.linspace(0, duration_seconds, n_samples, endpoint=False)

    # Slow 2 Hz amplitude modulation centred on 0.1.
    envelope = 0.1 * (1 + 0.5 * np.sin(2 * np.pi * 2 * timeline))
    carrier = np.sin(2 * np.pi * frequency * timeline)

    return (envelope * carrier).astype(np.float32)


def test_transcription_pipeline():
    """Feed four synthetic two-second clips through the transcription check.

    The clips are, in order: all-zero silence, low-level Gaussian noise, a
    single modulated 440 Hz tone, and the average of three modulated tones
    (220, 440 and 880 Hz). Each clip is announced with a banner and then
    handed to ``test_audio_transcription``.
    """
    print("Testing Whisper transcription pipeline...")

    def _silence() -> np.ndarray:
        return np.zeros(int(sample_rate * 2.0), dtype=np.float32)

    def _quiet_noise() -> np.ndarray:
        noise = np.random.normal(0, 0.001, int(sample_rate * 2.0))
        return noise.astype(np.float32)

    def _single_tone() -> np.ndarray:
        return generate_test_audio(2.0, 440.0)

    def _tone_stack() -> np.ndarray:
        low = generate_test_audio(2.0, 220.0)
        mid = generate_test_audio(2.0, 440.0)
        high = generate_test_audio(2.0, 880.0)
        return (low + mid + high) / 3.0

    # (banner, label passed to the checker, clip factory). Clips are built
    # lazily so each one is created only after its banner has been printed.
    scenarios = (
        ("\n=== Test 1: Complete Silence ===", "Silent audio", _silence),
        ("\n=== Test 2: Very Quiet Noise ===", "Quiet noise", _quiet_noise),
        # A plain tone should produce some output.
        ("\n=== Test 3: Sine Wave ===", "Sine wave", _single_tone),
        ("\n=== Test 4: Complex Sine Wave ===", "Complex sine wave", _tone_stack),
    )

    for banner, label, build_clip in scenarios:
        print(banner)
        test_audio_transcription(build_clip(), label)


def test_audio_transcription(audio_array: np.ndarray, description: str) -> None:
    """Run one audio clip through the pipeline and print what happened.

    Prints basic signal statistics (duration, sample count, RMS, peak), then
    times three stages separately: ``extract_input_features``,
    ``_pt_model.generate`` and ``_processor.batch_decode``. Finally prints the
    stripped text of the first decoded entry, or ``(empty)`` when there is none.

    Any exception raised along the way is caught and printed as
    ``  ERROR: ...`` so that a failing clip does not stop the remaining ones.

    Args:
        audio_array: Audio samples; the duration is computed from its length
            and the module-level ``sample_rate``.
        description: Human-readable label used in the printed report.
    """
    try:
        # Calculate metrics
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))

        print(f"Testing {description}:")
        print(f"  Duration: {duration:.2f}s")
        print(f"  Samples: {len(audio_array)}")
        print(f"  RMS: {rms:.6f}")
        print(f"  Peak: {peak:.6f}")

        # Stage timings use perf_counter: unlike time.time() it is monotonic
        # and high-resolution, so a wall-clock adjustment cannot skew a reading.

        # Test feature extraction
        start_time = time.perf_counter()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.perf_counter() - start_time
        print(f"  Feature extraction: {feature_time:.3f}s")

        # Test model inference
        start_time = time.perf_counter()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.perf_counter() - start_time
        print(f"  Model inference: {inference_time:.3f}s")

        # Test decoding
        start_time = time.perf_counter()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.perf_counter() - start_time
        print(f"  Decoding: {decoding_time:.3f}s")

        # Show result: only the first decoded entry is reported.
        text = transcription[0].strip() if transcription else ""
        print(f"  Result: '{text}'" if text else "  Result: (empty)")
        print(f"  Result length: {len(text)}")

    except Exception as e:
        # Deliberately broad: this is a diagnostic script and should keep
        # going so the other clips are still exercised.
        print(f"  ERROR: {e}")


# Run the full set of synthetic-audio checks when this file is executed as a script.
if __name__ == "__main__":
    test_transcription_pipeline()