# ai-voicebot/voicebot/force_transcription.py

#!/usr/bin/env python3
"""
Force transcription debug - processes any accumulated audio immediately.
Run this to force the whisper agent to attempt transcription of current audio buffer.
"""

import sys
import os
import asyncio
import numpy as np

# Append the directory containing this script to the module search path.
# NOTE(review): presumably this is what lets the ``from bots.whisper import ...``
# statements below resolve when the script is run directly - confirm that the
# ``bots`` package lives next to this file.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


# Phrases at or below this duration (in seconds) are reported but not transcribed.
_MIN_PHRASE_SECONDS = 0.3


def _phrase_rms(samples):
    """Return the RMS level of *samples*, computed in float64.

    Squaring in the array's native dtype silently wraps around for integer
    PCM data (e.g. int16), which yields a meaningless or NaN level, so the
    samples are widened before squaring.
    """
    widened = np.asarray(samples, dtype=np.float64)
    return float(np.sqrt(np.mean(widened**2)))


def _get_or_create_event_loop():
    """Return the current event loop, creating and installing one if needed."""
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop


def _force_peer_transcription(peer_name, audio_processor):
    """Print one peer's state, flush its buffer and transcribe its phrase."""
    print(f"\n👤 {peer_name}:")
    print(f"  - Running: {audio_processor.is_running}")
    print(f"  - Buffer size: {len(audio_processor.audio_buffer)} frames")
    print(f"  - Queue size: {audio_processor.processing_queue.qsize()}")
    print(
        f"  - Current phrase length: {len(audio_processor.current_phrase_audio)} samples"
    )

    # Force processing of whatever is currently buffered.
    buffered_frames = len(audio_processor.audio_buffer)
    if buffered_frames > 0:
        print(
            f"🔄 Forcing processing of {buffered_frames} buffered frames for {peer_name}..."
        )
        audio_processor._queue_for_processing()
    else:
        print(f"📭 No audio in buffer to process for {peer_name}")

    # Read the phrase only after the buffer flush above, as the original did.
    phrase = audio_processor.current_phrase_audio
    if len(phrase) == 0:
        print(f"🤐 No current phrase to transcribe for {peer_name}")
        return

    phrase_duration = len(phrase) / audio_processor.sample_rate
    phrase_rms = _phrase_rms(phrase)
    print(
        f"🎤 Current phrase for {peer_name}: {phrase_duration:.2f}s, RMS: {phrase_rms:.6f}"
    )

    if phrase_duration <= _MIN_PHRASE_SECONDS:
        print(
            f"⏱️ Current phrase too short for {peer_name} ({phrase_duration:.2f}s < {_MIN_PHRASE_SECONDS}s)"
        )
        return

    print(f"🚀 Forcing transcription of current phrase for {peer_name}...")

    # Acquire the loop before creating the coroutine so that a failure here
    # does not leave a never-awaited coroutine behind.
    # NOTE: run_until_complete() raises RuntimeError if that loop is already
    # running; the caller reports it as this peer's error.
    loop = _get_or_create_event_loop()
    loop.run_until_complete(
        audio_processor._transcribe_and_send(phrase.copy(), is_final=True)
    )
    print(f"✅ Forced transcription completed for {peer_name}")


def force_transcription():
    """Force transcription of any accumulated audio.

    Imports ``_audio_processors`` from ``bots.whisper`` and, for every
    ``peer_name -> audio_processor`` entry, prints the processor state, calls
    ``_queue_for_processing()`` when its buffer is non-empty, and runs
    ``_transcribe_and_send(..., is_final=True)`` on a copy of the current
    phrase when that phrase is longer than 0.3 seconds.

    All results are reported on stdout and nothing is returned. Exceptions
    are caught and printed rather than raised; a failure while handling one
    peer is reported for that peer and the remaining peers are still
    processed.
    """
    # Only a failure of this import means "the whisper agent is not available";
    # import errors raised later, during transcription, are reported per peer.
    try:
        from bots.whisper import _audio_processors
    except ImportError:
        print(
            "❌ Could not import whisper components. Make sure the whisper agent is loaded."
        )
        return
    except Exception as e:
        print(f"❌ Error: {e}")
        return

    try:
        if not _audio_processors:
            print(
                "❌ No audio processors found. Whisper agent may not be running or no peers connected."
            )
            return

        print(f"🔍 Found {len(_audio_processors)} active audio processors:")

        for peer_name, audio_processor in _audio_processors.items():
            try:
                _force_peer_transcription(peer_name, audio_processor)
            except Exception as e:
                # Keep going: one broken peer must not block the others.
                print(f"❌ Error processing {peer_name}: {e}")
    except Exception as e:
        print(f"❌ Error: {e}")


def _print_level_summary(title, samples, sample_rate):
    """Print duration, sample count, RMS and peak for one run of samples.

    Levels are computed in float64: squaring or taking ``abs`` in an integer
    dtype such as int16 silently wraps around and reports bogus values.
    """
    widened = np.asarray(samples, dtype=np.float64)
    duration = len(samples) / sample_rate
    rms = np.sqrt(np.mean(widened**2))
    peak = np.max(np.abs(widened))
    print(f"  {title}:")
    print(f"    Duration: {duration:.2f}s")
    print(f"    Samples: {len(samples)}")
    print(f"    RMS: {rms:.6f}")
    print(f"    Peak: {peak:.6f}")


def show_audio_stats():
    """Show detailed audio statistics.

    Imports ``_audio_processors`` from ``bots.whisper`` and prints, for each
    ``peer_name -> audio_processor`` entry, its configuration values followed
    by duration / sample count / RMS / peak for the current phrase and for
    the concatenated audio buffer (each only when non-empty).

    Everything is written to stdout and nothing is returned. Any exception,
    including a failed import, is caught and printed instead of raised.
    """
    try:
        from bots.whisper import _audio_processors

        if not _audio_processors:
            print("❌ No audio processors found")
            return

        print(
            f"\n📊 Detailed Audio Statistics for {len(_audio_processors)} processors:"
        )

        for peer_name, audio_processor in _audio_processors.items():
            print(f"\n👤 {peer_name}:")
            print(f"Sample rate: {audio_processor.sample_rate}Hz")
            print(f"Samples per frame: {audio_processor.samples_per_frame}")
            print(f"Phrase timeout: {audio_processor.phrase_timeout}s")
            print(f"Buffer max length: {audio_processor.audio_buffer.maxlen}")
            print(f"Current buffer size: {len(audio_processor.audio_buffer)}")
            print(f"Processing queue size: {audio_processor.processing_queue.qsize()}")

            if len(audio_processor.current_phrase_audio) > 0:
                _print_level_summary(
                    "Current phrase",
                    audio_processor.current_phrase_audio,
                    audio_processor.sample_rate,
                )

            if len(audio_processor.audio_buffer) > 0:
                # The buffer holds separate chunks; join them into one array
                # so the statistics describe the buffer as a whole.
                combined = np.concatenate(list(audio_processor.audio_buffer))
                _print_level_summary(
                    "Buffer contents", combined, audio_processor.sample_rate
                )

    except Exception as e:
        print(f"❌ Error getting stats: {e}")


if __name__ == "__main__":
    # Usage: pass "stats" as the first argument to print statistics only;
    # with any other (or no) argument, force transcription first.
    stats_only = sys.argv[1:2] == ["stats"]
    if not stats_only:
        force_transcription()
    show_audio_stats()