Audio is now routing to buffers

parent 1074eb48dc
commit 0691dbf97f
@@ -18,6 +18,7 @@ from pydantic import BaseModel
 import librosa
 from logger import logger
 from aiortc import MediaStreamTrack
+from aiortc.mediastreams import MediaStreamError
 from av import AudioFrame
 
 # Import shared models for chat functionality
@@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
     audio_processor = _audio_processors[peer.peer_name]
 
     logger.info(
-        f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
+        f"Received audio track from {peer.peer_name}, starting transcription"
     )
 
+    # Start the frame reception loop
+
     try:
+        frame_count = 0
         while True:
-            # Receive audio frame
-            frame = await track.recv()
+            try:
+                # Receive audio frame
+                frame = await track.recv()
+                frame_count += 1
+                # Log less frequently now that we know frames are being received
+                if frame_count % 100 == 0:
+                    logger.info(f"Received {frame_count} frames from {peer.peer_name}")
+            except MediaStreamError as e:
+                # Connection was closed or media stream ended - this is normal
+                logger.info(
+                    f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
+                )
+                break
+            except Exception as e:
+                # Other errors during frame reception
+                logger.error(
+                    f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
+                )
+                break
+
+            # Check if this is an audio frame and convert to numpy array for processing
             if isinstance(frame, AudioFrame):
                 # Convert AudioFrame to numpy array
-                audio_data = frame.to_ndarray()
+                try:
+                    audio_data = frame.to_ndarray()
+                except Exception as e:
+                    logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
+                    continue
+
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
 
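Note: the loop added above relies on aiortc's behaviour that `track.recv()` raises `MediaStreamError` once the remote track ends, so the `except MediaStreamError` branch is the normal exit path. A minimal, self-contained sketch of that reception pattern (the helper name `consume_audio` and its return value are illustrative, not part of this codebase):

    from aiortc import MediaStreamTrack
    from aiortc.mediastreams import MediaStreamError
    from av import AudioFrame

    async def consume_audio(track: MediaStreamTrack) -> int:
        """Count AudioFrames until the remote side stops sending."""
        received = 0
        while True:
            try:
                frame = await track.recv()  # raises MediaStreamError when the track ends
            except MediaStreamError:
                break  # normal shutdown, mirrors the break in the diff above
            if isinstance(frame, AudioFrame):
                received += 1
        return received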
@@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
 
-                # Handle different audio formats
+                # Handle different audio formats - convert stereo to mono if needed
                 if audio_data.ndim == 2:  # Stereo -> mono
-                    audio_data = np.mean(audio_data, axis=1)
-                    logger.debug(
-                        f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
-                    )
+                    if audio_data.shape[0] == 1:  # Shape is (1, samples) - just squeeze the first dimension
+                        audio_data = audio_data.squeeze(0)
+                        logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
+                    else:  # True stereo (2, samples) or (samples, 2) - average channels
+                        audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] > audio_data.shape[1] else 1)
+                        logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")
 
-                # Convert to float32 and normalize
+                # Convert to float32 and normalize based on data type
                 if audio_data.dtype == np.int16:
                     audio_data = audio_data.astype(np.float32) / 32768.0
                     logger.debug("Normalized int16 audio to float32")
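The branch above distinguishes a `(1, samples)` frame (one channel packed into a 2-D array, where squeezing is enough) from genuine two-channel audio (where the channels are averaged). A small standalone numpy sketch of that intent, assuming the channel axis is the smaller dimension:

    import numpy as np

    def to_mono(audio: np.ndarray) -> np.ndarray:
        """Collapse a 2-D audio frame to a 1-D mono signal."""
        if audio.ndim == 2:
            if audio.shape[0] == 1:
                return audio.squeeze(0)               # (1, N) -> (N,)
            channel_axis = 0 if audio.shape[0] < audio.shape[1] else 1
            return audio.mean(axis=channel_axis)      # average the channel axis
        return audio

    print(to_mono(np.zeros((1, 960))).shape)  # (960,)
    print(to_mono(np.zeros((2, 960))).shape)  # (960,)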
@@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     audio_data = audio_data.astype(np.float32) / 2147483648.0
                     logger.debug("Normalized int32 audio to float32")
 
-                # Resample to 16kHz if needed
+                # Resample to 16kHz if needed for Whisper model
                 if frame.sample_rate != sample_rate:
                     original_length = len(audio_data)
-                    audio_data = librosa.resample(  # type: ignore
-                        audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
-                    )
+                    # Use librosa to resample with explicit float64 conversion for better precision
+                    try:
+                        audio_float64 = audio_data.astype(np.float64)
+
+                        audio_data = librosa.resample(  # type: ignore
+                            audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
+                        )
+                    except Exception as e:
+                        logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
+                        # Fall back to original data
+                        audio_data = audio_data
+
                     logger.debug(
                         f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
                     )
+                else:
+                    # No resampling needed
+                    pass
 
-                # Ensure audio_data is AudioArray (float32)
+                # Ensure audio_data is properly typed as float32 and calculate frame metrics
                 audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
 
-                # Calculate audio quality metrics for this frame
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
 
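The normalisation divisors above are the full-scale values of the integer sample formats (2^15 for int16, 2^31 for int32), and current librosa versions take `orig_sr`/`target_sr` as keyword arguments. A short standalone sketch of the normalise-then-resample step (the 48000 -> 16000 rate pair is only an example):

    import numpy as np
    import librosa

    # One second of a 440 Hz tone as int16 PCM at 48 kHz
    pcm = (20000 * np.sin(2 * np.pi * 440 * np.arange(48000) / 48000)).astype(np.int16)

    samples = pcm.astype(np.float32) / 32768.0  # scale int16 full range into [-1.0, 1.0)
    resampled = librosa.resample(samples.astype(np.float64), orig_sr=48000, target_sr=16000)
    print(len(samples), "->", len(resampled))   # 48000 -> 16000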
@@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                         f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
 
-                # Send to audio processor
+                # Send processed audio to the audio processor for transcription
                 if audio_processor:
                     audio_processor.add_audio_data(audio_data_float32)
                 else:
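RMS and peak are cheap per-frame indicators of whether anything audible is arriving; the log line above uses RMS to flag silent frames. A self-contained sketch of those metrics (the 1e-4 silence threshold is an assumption for illustration, not the project's actual cutoff):

    import numpy as np

    def frame_metrics(frame: np.ndarray) -> tuple[float, float]:
        """Return (RMS, peak) of a float32 audio frame."""
        frame = frame.astype(np.float32)
        rms = float(np.sqrt(np.mean(frame ** 2)))
        peak = float(np.max(np.abs(frame)))
        return rms, peak

    silent = np.zeros(1600, dtype=np.float32)
    tone = (0.1 * np.sin(2 * np.pi * 440 * np.arange(1600) / 16000)).astype(np.float32)
    for name, frame in [("silent", silent), ("tone", tone)]:
        rms, peak = frame_metrics(frame)
        print(f"{name}: RMS={rms:.6f} peak={peak:.6f} silent={rms < 1e-4}")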
@@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
 
     except Exception as e:
         logger.error(
-            f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
+            f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
         )
+    finally:
+        # Clean up the audio processor when the stream ends
+        cleanup_peer_processor(peer.peer_name)
 
 
 def agent_info() -> Dict[str, str]:
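The new `finally` block guarantees the per-peer processor is released whether the receive loop ends normally (MediaStreamError) or via an unexpected exception. A minimal sketch of that lifecycle, with illustrative names rather than this project's API:

    import asyncio

    processors: dict[str, list] = {}

    def cleanup(peer_name: str) -> None:
        processors.pop(peer_name, None)  # idempotent teardown

    async def handle(peer_name: str) -> None:
        processors[peer_name] = []
        try:
            raise RuntimeError("stream failed")  # stand-in for any receive error
        finally:
            cleanup(peer_name)  # runs on success and on error alike

    try:
        asyncio.run(handle("alice"))
    except RuntimeError:
        pass
    print(processors)  # {} - the processor was cleaned up despite the error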
@@ -1,195 +0,0 @@
-"""
-Simple test to verify Step 5B enhanced bot functionality.
-
-This test verifies that the enhanced bot components work correctly
-when integrated with the existing voicebot system.
-"""
-
-import asyncio
-import os
-import time
-
-# Set up test environment variables
-os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant"
-os.environ["AI_CHATBOT_PROVIDER"] = "local"  # Use local provider for testing
-os.environ["AI_CHATBOT_STREAMING"] = "false"
-os.environ["AI_CHATBOT_MEMORY"] = "true"
-
-# Import test modules
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from shared.models import ChatMessageModel
-
-
-async def test_enhanced_ai_chatbot():
-    """Test the enhanced AI chatbot functionality."""
-    print("Testing Enhanced AI Chatbot...")
-
-    try:
-        # Import the enhanced bot
-        from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status
-
-        # Create a mock send function
-        responses = []
-        async def mock_send(message: str):
-            responses.append(message)
-            print(f"Bot Response: {message}")
-
-        # Test message handling
-        test_message = ChatMessageModel(
-            id="test_message_id",
-            sender_name="test_user",
-            sender_session_id="test_session",
-            lobby_id="test_lobby",
-            message="Hello, can you help me?",
-            timestamp=time.time()
-        )
-
-        print(f"Sending test message: {test_message.message}")
-        response = await handle_chat_message(test_message, mock_send)
-
-        if response:
-            print(f"✓ Bot responded successfully: {response[:50]}...")
-        else:
-            print("✗ Bot did not respond")
-
-        # Test bot status
-        print("\nTesting bot status...")
-        status = await get_bot_status()
-        print("✓ Bot status retrieved:")
-        print(f"  - Agent: {status.get('agent_name', 'unknown')}")
-        print(f"  - Features Available: {status.get('features_available', False)}")
-        print(f"  - Configuration: {status.get('configuration', {})}")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Enhanced bot test failed: {e}")
-        return False
-
-
-async def test_personality_system():
-    """Test the personality system components."""
-    print("\nTesting Personality System...")
-
-    try:
-        from voicebot.personality_system import personality_manager
-
-        # Test listing templates
-        templates = personality_manager.list_templates()
-        print(f"✓ Found {len(templates)} personality templates:")
-        for template in templates:
-            print(f"  - {template.id}: {template.description}")
-
-        # Test creating personality from template
-        personality = personality_manager.create_personality_from_template("helpful_assistant")
-        if personality:
-            print(f"✓ Created personality: {personality.name}")
-            print(f"  - Traits: {[trait.value for trait in personality.traits]}")
-            print(f"  - Communication Style: {personality.communication_style.value}")
-        else:
-            print("✗ Failed to create personality")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Personality system test failed: {e}")
-        return False
-
-
-async def test_conversation_context():
-    """Test the conversation context management."""
-    print("\nTesting Conversation Context...")
-
-    try:
-        from voicebot.conversation_context import context_manager
-
-        # Test creating context
-        context = context_manager.get_or_create_context(
-            session_id="test_session",
-            bot_name="test_bot",
-            conversation_id="test_conversation"
-        )
-
-        if context:
-            print(f"✓ Created conversation context: {context.conversation_id}")
-
-            # Test adding conversation turn
-            context_manager.add_conversation_turn(
-                conversation_id=context.conversation_id,
-                user_message="Test message",
-                bot_response="Test response",
-                context_used={"test": "context"},
-                metadata={"timestamp": time.time()}
-            )
-
-            print("✓ Added conversation turn")
-            print(f"  - Turns in context: {len(context.turns)}")
-
-            # Test context summary
-            summary = context_manager.get_context_for_response(context.conversation_id)
-            if summary:
-                print(f"✓ Generated context summary: {summary[:50]}...")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Conversation context test failed: {e}")
-        return False
-
-
-async def test_integration_orchestrator():
-    """Test the integration orchestrator."""
-    print("\nTesting Integration Orchestrator...")
-
-    try:
-        from step_5b_integration_demo import enhanced_orchestrator
-
-        # Test bot discovery
-        enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots()
-        print(f"✓ Discovered {len(enhanced_bots)} bots")
-
-        # Find enhanced bots
-        enhanced_count = sum(1 for bot_info in enhanced_bots.values()
-                             if bot_info.get('enhanced_features', False))
-        print(f"✓ Found {enhanced_count} enhanced bots")
-
-        # Test analytics
-        analytics = enhanced_orchestrator.get_bot_analytics()
-        print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Integration orchestrator test failed: {e}")
-        return False
-
-
-async def run_all_tests():
-    """Run all Step 5B tests."""
-    print("=== Step 5B Enhanced Bot Management Tests ===\n")
-
-    test_results = []
-
-    # Run individual tests
-    test_results.append(await test_enhanced_ai_chatbot())
-    test_results.append(await test_personality_system())
-    test_results.append(await test_conversation_context())
-    test_results.append(await test_integration_orchestrator())
-
-    # Summary
-    passed = sum(test_results)
-    total = len(test_results)
-
-    print(f"\n=== Test Results: {passed}/{total} tests passed ===")
-
-    if passed == total:
-        print("🎉 All Step 5B components are working correctly!")
-    else:
-        print("⚠️ Some tests failed - check the output above for details")
-
-    return passed == total
-
-
-if __name__ == "__main__":
-    asyncio.run(run_all_tests())
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-"""
-Debug script to test Whisper transcription with synthetic audio.
-This helps identify if the issue is with audio processing or the transcription pipeline.
-"""
-
-import numpy as np
-import time
-import sys
-import os
-
-# Add the voicebot directory to the path
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-try:
-    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
-except ImportError as e:
-    print(f"Error importing whisper components: {e}")
-    print("Make sure you're running this from the voicebot directory")
-    sys.exit(1)
-
-
-def generate_test_audio(
-    duration_seconds: float = 2.0, frequency: float = 440.0
-) -> np.ndarray:
-    """Generate a synthetic sine wave for testing."""
-    samples = int(duration_seconds * sample_rate)
-    t = np.linspace(0, duration_seconds, samples, False)
-    # Generate a sine wave with some amplitude modulation to simulate speech-like patterns
-    amplitude = 0.1 * (
-        1 + 0.5 * np.sin(2 * np.pi * 2 * t)
-    )  # Amplitude modulation at 2Hz
-    audio = amplitude * np.sin(2 * np.pi * frequency * t)
-    return audio.astype(np.float32)
-
-
-def test_transcription_pipeline():
-    """Test the Whisper transcription pipeline with synthetic audio."""
-    print("Testing Whisper transcription pipeline...")
-
-    # Test 1: Complete silence
-    print("\n=== Test 1: Complete Silence ===")
-    silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
-    test_audio_transcription(silent_audio, "Silent audio")
-
-    # Test 2: Very quiet noise
-    print("\n=== Test 2: Very Quiet Noise ===")
-    quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
-    test_audio_transcription(quiet_noise, "Quiet noise")
-
-    # Test 3: Sine wave (should produce some output)
-    print("\n=== Test 3: Sine Wave ===")
-    sine_audio = generate_test_audio(2.0, 440.0)
-    test_audio_transcription(sine_audio, "Sine wave")
-
-    # Test 4: Multiple frequency sine wave
-    print("\n=== Test 4: Complex Sine Wave ===")
-    complex_audio = (
-        generate_test_audio(2.0, 220.0)
-        + generate_test_audio(2.0, 440.0)
-        + generate_test_audio(2.0, 880.0)
-    ) / 3.0
-    test_audio_transcription(complex_audio, "Complex sine wave")
-
-
-def test_audio_transcription(audio_array: np.ndarray, description: str):
-    """Test transcription of a specific audio array."""
-    try:
-        # Calculate metrics
-        duration = len(audio_array) / sample_rate
-        rms = np.sqrt(np.mean(audio_array**2))
-        peak = np.max(np.abs(audio_array))
-
-        print(f"Testing {description}:")
-        print(f"  Duration: {duration:.2f}s")
-        print(f"  Samples: {len(audio_array)}")
-        print(f"  RMS: {rms:.6f}")
-        print(f"  Peak: {peak:.6f}")
-
-        # Test feature extraction
-        start_time = time.time()
-        input_features = extract_input_features(audio_array, sample_rate)
-        feature_time = time.time() - start_time
-        print(f"  Feature extraction: {feature_time:.3f}s")
-
-        # Test model inference
-        start_time = time.time()
-        predicted_ids = _pt_model.generate(input_features)
-        inference_time = time.time() - start_time
-        print(f"  Model inference: {inference_time:.3f}s")
-
-        # Test decoding
-        start_time = time.time()
-        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
-        decoding_time = time.time() - start_time
-        print(f"  Decoding: {decoding_time:.3f}s")
-
-        # Show result
-        text = (
-            transcription[0].strip() if transcription and len(transcription) > 0 else ""
-        )
-        print(f"  Result: '{text}'" if text else "  Result: (empty)")
-        print(f"  Result length: {len(text)}")
-
-    except Exception as e:
-        print(f"  ERROR: {e}")
-
-
-if __name__ == "__main__":
-    test_transcription_pipeline()