Audio is now routing to buffers

James Ketr 2025-09-07 22:46:45 -07:00
parent 1074eb48dc
commit 0691dbf97f
3 changed files with 64 additions and 325 deletions

View File

@ -18,6 +18,7 @@ from pydantic import BaseModel
import librosa
from logger import logger
from aiortc import MediaStreamTrack
from aiortc.mediastreams import MediaStreamError
from av import AudioFrame
# Import shared models for chat functionality
@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
audio_processor = _audio_processors[peer.peer_name]
logger.info(
f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
f"Received audio track from {peer.peer_name}, starting transcription"
)
# Start the frame reception loop
try:
frame_count = 0
while True:
# Receive audio frame
frame = await track.recv()
try:
# Receive audio frame
frame = await track.recv()
frame_count += 1
# Log less frequently now that we know frames are being received
if frame_count % 100 == 0:
logger.info(f"Received {frame_count} frames from {peer.peer_name}")
except MediaStreamError as e:
# Connection was closed or media stream ended - this is normal
logger.info(
f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
)
break
except Exception as e:
# Other errors during frame reception
logger.error(
f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
)
break
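For reference, the receive loop above follows a common aiortc pattern; a minimal standalone sketch (process_frame is a hypothetical callback, not part of this diff):

from aiortc import MediaStreamTrack
from aiortc.mediastreams import MediaStreamError

async def receive_frames(track: MediaStreamTrack, process_frame) -> None:
    # Pull frames until the remote peer ends the stream.
    while True:
        try:
            frame = await track.recv()
        except MediaStreamError:
            # Normal termination: the track stopped or the connection closed.
            break
        process_frame(frame)  # hypothetical per-frame handler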
# Check if this is an audio frame and convert to numpy array for processing
if isinstance(frame, AudioFrame):
# Convert AudioFrame to numpy array
audio_data = frame.to_ndarray()
try:
audio_data = frame.to_ndarray()
except Exception as e:
logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
continue
original_shape = audio_data.shape
original_dtype = audio_data.dtype
@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
)
# Handle different audio formats
# Handle different audio formats - convert stereo to mono if needed
if audio_data.ndim == 2: # Stereo -> mono
audio_data = np.mean(audio_data, axis=1)
logger.debug(
f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
)
if audio_data.shape[0] == 1: # Shape is (1, samples) - just squeeze the first dimension
audio_data = audio_data.squeeze(0)
logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
else: # True stereo (2, samples) or (samples, 2) - average channels
audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] < audio_data.shape[1] else 1)
logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")
# Convert to float32 and normalize
# Convert to float32 and normalize based on data type
if audio_data.dtype == np.int16:
audio_data = audio_data.astype(np.float32) / 32768.0
logger.debug("Normalized int16 audio to float32")
@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
audio_data = audio_data.astype(np.float32) / 2147483648.0
logger.debug("Normalized int32 audio to float32")
# Resample to 16kHz if needed
# Resample to 16kHz if needed for Whisper model
if frame.sample_rate != sample_rate:
original_length = len(audio_data)
audio_data = librosa.resample( # type: ignore
audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
)
# Use librosa to resample with explicit float64 conversion for better precision
try:
audio_float64 = audio_data.astype(np.float64)
audio_data = librosa.resample( # type: ignore
audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
)
except Exception as e:
logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
# Fall back to the original (unresampled) audio data
logger.debug(
f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
)
# Ensure audio_data is AudioArray (float32)
else:
# No resampling needed
pass
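The resampling branch reduces to a single librosa call; a sketch assuming a hypothetical resample_for_whisper helper and a 16 kHz target rate:

import numpy as np
import librosa

def resample_for_whisper(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
    # librosa.resample works on floating-point data; upcast first, downcast after.
    if orig_sr == target_sr:
        return audio.astype(np.float32)
    resampled = librosa.resample(audio.astype(np.float64), orig_sr=orig_sr, target_sr=target_sr)
    return resampled.astype(np.float32)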
# Ensure audio_data is properly typed as float32 and calculate frame metrics
audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
# Calculate audio quality metrics for this frame
frame_rms = np.sqrt(np.mean(audio_data_float32**2))
frame_peak = np.max(np.abs(audio_data_float32))
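RMS and peak are enough to flag silent frames; a sketch of the metrics, plus a dBFS conversion that is an addition here rather than something taken from the diff:

import numpy as np

def frame_levels(audio: np.ndarray) -> tuple[float, float]:
    # Return (rms, peak) for a float32 frame normalized to [-1.0, 1.0].
    rms = float(np.sqrt(np.mean(audio ** 2))) if audio.size else 0.0
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    return rms, peak

def to_dbfs(level: float) -> float:
    # Convert a linear level to dBFS, clamping at a floor for silence.
    return 20.0 * float(np.log10(max(level, 1e-10)))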
@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
)
# Send to audio processor
# Send processed audio to the audio processor for transcription
if audio_processor:
audio_processor.add_audio_data(audio_data_float32)
else:
@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
except Exception as e:
logger.error(
f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
)
finally:
# Clean up the audio processor when the stream ends
cleanup_peer_processor(peer.peer_name)
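The cleanup call pairs with the per-peer registry seen at the top of the handler; one possible shape, assuming _audio_processors is a module-level dict keyed by peer name (illustrative only, not the project's implementation):

def cleanup_peer_processor(peer_name: str) -> None:
    # Illustrative sketch: drop the processor for a peer whose stream ended.
    processor = _audio_processors.pop(peer_name, None)
    if processor is not None:
        logger.info(f"Cleaned up audio processor for {peer_name}")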
def agent_info() -> Dict[str, str]:

View File

@ -1,195 +0,0 @@
"""
Simple test to verify Step 5B enhanced bot functionality.
It checks that the enhanced bot components work correctly
when integrated with the existing voicebot system.
"""
import asyncio
import os
import time
# Set up test environment variables
os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant"
os.environ["AI_CHATBOT_PROVIDER"] = "local" # Use local provider for testing
os.environ["AI_CHATBOT_STREAMING"] = "false"
os.environ["AI_CHATBOT_MEMORY"] = "true"
# Import test modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import ChatMessageModel
async def test_enhanced_ai_chatbot():
"""Test the enhanced AI chatbot functionality."""
print("Testing Enhanced AI Chatbot...")
try:
# Import the enhanced bot
from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status
# Create a mock send function
responses = []
async def mock_send(message: str):
responses.append(message)
print(f"Bot Response: {message}")
# Test message handling
test_message = ChatMessageModel(
id="test_message_id",
sender_name="test_user",
sender_session_id="test_session",
lobby_id="test_lobby",
message="Hello, can you help me?",
timestamp=time.time()
)
print(f"Sending test message: {test_message.message}")
response = await handle_chat_message(test_message, mock_send)
if response:
print(f"✓ Bot responded successfully: {response[:50]}...")
else:
print("✗ Bot did not respond")
# Test bot status
print("\nTesting bot status...")
status = await get_bot_status()
print("✓ Bot status retrieved:")
print(f" - Agent: {status.get('agent_name', 'unknown')}")
print(f" - Features Available: {status.get('features_available', False)}")
print(f" - Configuration: {status.get('configuration', {})}")
return True
except Exception as e:
print(f"✗ Enhanced bot test failed: {e}")
return False
async def test_personality_system():
"""Test the personality system components."""
print("\nTesting Personality System...")
try:
from voicebot.personality_system import personality_manager
# Test listing templates
templates = personality_manager.list_templates()
print(f"✓ Found {len(templates)} personality templates:")
for template in templates:
print(f" - {template.id}: {template.description}")
# Test creating personality from template
personality = personality_manager.create_personality_from_template("helpful_assistant")
if personality:
print(f"✓ Created personality: {personality.name}")
print(f" - Traits: {[trait.value for trait in personality.traits]}")
print(f" - Communication Style: {personality.communication_style.value}")
else:
print("✗ Failed to create personality")
return True
except Exception as e:
print(f"✗ Personality system test failed: {e}")
return False
async def test_conversation_context():
"""Test the conversation context management."""
print("\nTesting Conversation Context...")
try:
from voicebot.conversation_context import context_manager
# Test creating context
context = context_manager.get_or_create_context(
session_id="test_session",
bot_name="test_bot",
conversation_id="test_conversation"
)
if context:
print(f"✓ Created conversation context: {context.conversation_id}")
# Test adding conversation turn
context_manager.add_conversation_turn(
conversation_id=context.conversation_id,
user_message="Test message",
bot_response="Test response",
context_used={"test": "context"},
metadata={"timestamp": time.time()}
)
print("✓ Added conversation turn")
print(f" - Turns in context: {len(context.turns)}")
# Test context summary
summary = context_manager.get_context_for_response(context.conversation_id)
if summary:
print(f"✓ Generated context summary: {summary[:50]}...")
return True
except Exception as e:
print(f"✗ Conversation context test failed: {e}")
return False
async def test_integration_orchestrator():
"""Test the integration orchestrator."""
print("\nTesting Integration Orchestrator...")
try:
from step_5b_integration_demo import enhanced_orchestrator
# Test bot discovery
enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots()
print(f"✓ Discovered {len(enhanced_bots)} bots")
# Find enhanced bots
enhanced_count = sum(1 for bot_info in enhanced_bots.values()
if bot_info.get('enhanced_features', False))
print(f"✓ Found {enhanced_count} enhanced bots")
# Test analytics
analytics = enhanced_orchestrator.get_bot_analytics()
print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured")
return True
except Exception as e:
print(f"✗ Integration orchestrator test failed: {e}")
return False
async def run_all_tests():
"""Run all Step 5B tests."""
print("=== Step 5B Enhanced Bot Management Tests ===\n")
test_results = []
# Run individual tests
test_results.append(await test_enhanced_ai_chatbot())
test_results.append(await test_personality_system())
test_results.append(await test_conversation_context())
test_results.append(await test_integration_orchestrator())
# Summary
passed = sum(test_results)
total = len(test_results)
print(f"\n=== Test Results: {passed}/{total} tests passed ===")
if passed == total:
print("🎉 All Step 5B components are working correctly!")
else:
print("⚠️ Some tests failed - check the output above for details")
return passed == total
if __name__ == "__main__":
asyncio.run(run_all_tests())

View File

@ -1,110 +0,0 @@
#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify whether the issue is with audio processing or with the transcription pipeline.
"""
import numpy as np
import time
import sys
import os
# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
print(f"Error importing whisper components: {e}")
print("Make sure you're running this from the voicebot directory")
sys.exit(1)
def generate_test_audio(
duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
"""Generate a synthetic sine wave for testing."""
samples = int(duration_seconds * sample_rate)
t = np.linspace(0, duration_seconds, samples, False)
# Generate a sine wave with some amplitude modulation to simulate speech-like patterns
amplitude = 0.1 * (
1 + 0.5 * np.sin(2 * np.pi * 2 * t)
) # Amplitude modulation at 2Hz
audio = amplitude * np.sin(2 * np.pi * frequency * t)
return audio.astype(np.float32)
def test_transcription_pipeline():
"""Test the Whisper transcription pipeline with synthetic audio."""
print("Testing Whisper transcription pipeline...")
# Test 1: Complete silence
print("\n=== Test 1: Complete Silence ===")
silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
test_audio_transcription(silent_audio, "Silent audio")
# Test 2: Very quiet noise
print("\n=== Test 2: Very Quiet Noise ===")
quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
test_audio_transcription(quiet_noise, "Quiet noise")
# Test 3: Sine wave (should produce some output)
print("\n=== Test 3: Sine Wave ===")
sine_audio = generate_test_audio(2.0, 440.0)
test_audio_transcription(sine_audio, "Sine wave")
# Test 4: Multiple frequency sine wave
print("\n=== Test 4: Complex Sine Wave ===")
complex_audio = (
generate_test_audio(2.0, 220.0)
+ generate_test_audio(2.0, 440.0)
+ generate_test_audio(2.0, 880.0)
) / 3.0
test_audio_transcription(complex_audio, "Complex sine wave")
def test_audio_transcription(audio_array: np.ndarray, description: str):
"""Test transcription of a specific audio array."""
try:
# Calculate metrics
duration = len(audio_array) / sample_rate
rms = np.sqrt(np.mean(audio_array**2))
peak = np.max(np.abs(audio_array))
print(f"Testing {description}:")
print(f" Duration: {duration:.2f}s")
print(f" Samples: {len(audio_array)}")
print(f" RMS: {rms:.6f}")
print(f" Peak: {peak:.6f}")
# Test feature extraction
start_time = time.time()
input_features = extract_input_features(audio_array, sample_rate)
feature_time = time.time() - start_time
print(f" Feature extraction: {feature_time:.3f}s")
# Test model inference
start_time = time.time()
predicted_ids = _pt_model.generate(input_features)
inference_time = time.time() - start_time
print(f" Model inference: {inference_time:.3f}s")
# Test decoding
start_time = time.time()
transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
decoding_time = time.time() - start_time
print(f" Decoding: {decoding_time:.3f}s")
# Show result
text = (
transcription[0].strip() if transcription and len(transcription) > 0 else ""
)
print(f" Result: '{text}'" if text else " Result: (empty)")
print(f" Result length: {len(text)}")
except Exception as e:
print(f" ERROR: {e}")
if __name__ == "__main__":
test_transcription_pipeline()
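If synthetic tones are not enough, a recorded clip could be pushed through the same helper; a sketch where sample.wav is a hypothetical local file:

import librosa

# Load a recording at the Whisper sample rate and reuse the same check.
speech, _ = librosa.load("sample.wav", sr=sample_rate, mono=True)  # hypothetical file
test_audio_transcription(speech, "Recorded speech")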