diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py
index de876a7..9d9a016 100644
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -18,6 +18,7 @@ from pydantic import BaseModel
 import librosa
 from logger import logger
 from aiortc import MediaStreamTrack
+from aiortc.mediastreams import MediaStreamError
 from av import AudioFrame
 
 # Import shared models for chat functionality
@@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
     audio_processor = _audio_processors[peer.peer_name]
 
     logger.info(
-        f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
+        f"Received audio track from {peer.peer_name}, starting transcription"
     )
 
+    # Start the frame reception loop
+    try:
+        frame_count = 0
         while True:
-            # Receive audio frame
-            frame = await track.recv()
+            try:
+                # Receive audio frame
+                frame = await track.recv()
+                frame_count += 1
+                # Log less frequently now that we know frames are being received
+                if frame_count % 100 == 0:
+                    logger.info(f"Received {frame_count} frames from {peer.peer_name}")
+            except MediaStreamError as e:
+                # Connection was closed or media stream ended - this is normal
+                logger.info(
+                    f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
+                )
+                break
+            except Exception as e:
+                # Other errors during frame reception
+                logger.error(
+                    f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
+                )
+                break
+
+            # Check if this is an audio frame and convert to numpy array for processing
             if isinstance(frame, AudioFrame):
                 # Convert AudioFrame to numpy array
-                audio_data = frame.to_ndarray()
+                try:
+                    audio_data = frame.to_ndarray()
+                except Exception as e:
+                    logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
+                    continue
+
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
 
@@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
 
-                # Handle different audio formats
+                # Handle different audio formats - convert stereo to mono if needed
                 if audio_data.ndim == 2:
                     # Stereo -> mono
-                    audio_data = np.mean(audio_data, axis=1)
-                    logger.debug(
-                        f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
-                    )
+                    if audio_data.shape[0] == 1:  # Shape is (1, samples) - just squeeze the first dimension
+                        audio_data = audio_data.squeeze(0)
+                        logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
+                    else:  # True stereo (2, samples) or (samples, 2) - average channels
+                        audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] > audio_data.shape[1] else 1)
+                        logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")
 
-                # Convert to float32 and normalize
+                # Convert to float32 and normalize based on data type
                 if audio_data.dtype == np.int16:
                     audio_data = audio_data.astype(np.float32) / 32768.0
                     logger.debug("Normalized int16 audio to float32")
@@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     audio_data = audio_data.astype(np.float32) / 2147483648.0
                     logger.debug("Normalized int32 audio to float32")
 
-                # Resample to 16kHz if needed
+                # Resample to 16kHz if needed for Whisper model
                 if frame.sample_rate != sample_rate:
                     original_length = len(audio_data)
-                    audio_data = librosa.resample(  # type: ignore
-                        audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
-                    )
+
+                    # Use librosa to resample with explicit float64 conversion for better precision
+                    try:
+                        audio_float64 = audio_data.astype(np.float64)
+
+                        audio_data = librosa.resample(  # type: ignore
+                            audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
+                        )
+                    except Exception as e:
+                        logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
+                        # Fall back to original data
+                        audio_data = audio_data
+
                     logger.debug(
                         f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
                     )
-
-                # Ensure audio_data is AudioArray (float32)
+                else:
+                    # No resampling needed
+                    pass
+
+                # Ensure audio_data is properly typed as float32 and calculate frame metrics
                 audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
-
-                # Calculate audio quality metrics for this frame
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
 
@@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                         f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
 
-                # Send to audio processor
+                # Send processed audio to the audio processor for transcription
                 if audio_processor:
                     audio_processor.add_audio_data(audio_data_float32)
                 else:
@@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
 
     except Exception as e:
         logger.error(
-            f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
+            f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
         )
+    finally:
+        # Clean up the audio processor when the stream ends
+        cleanup_peer_processor(peer.peer_name)
 
 
 def agent_info() -> Dict[str, str]:
diff --git a/voicebot/test_step_5b.py b/voicebot/test_step_5b.py
deleted file mode 100644
index 8f52914..0000000
--- a/voicebot/test_step_5b.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""
-Simple test to verify Step 5B enhanced bot functionality.
-
-This test verifies that the enhanced bot components work correctly
-when integrated with the existing voicebot system.
-""" - -import asyncio -import os -import time - -# Set up test environment variables -os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant" -os.environ["AI_CHATBOT_PROVIDER"] = "local" # Use local provider for testing -os.environ["AI_CHATBOT_STREAMING"] = "false" -os.environ["AI_CHATBOT_MEMORY"] = "true" - -# Import test modules -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from shared.models import ChatMessageModel - - -async def test_enhanced_ai_chatbot(): - """Test the enhanced AI chatbot functionality.""" - print("Testing Enhanced AI Chatbot...") - - try: - # Import the enhanced bot - from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status - - # Create a mock send function - responses = [] - async def mock_send(message: str): - responses.append(message) - print(f"Bot Response: {message}") - - # Test message handling - test_message = ChatMessageModel( - id="test_message_id", - sender_name="test_user", - sender_session_id="test_session", - lobby_id="test_lobby", - message="Hello, can you help me?", - timestamp=time.time() - ) - - print(f"Sending test message: {test_message.message}") - response = await handle_chat_message(test_message, mock_send) - - if response: - print(f"✓ Bot responded successfully: {response[:50]}...") - else: - print("✗ Bot did not respond") - - # Test bot status - print("\nTesting bot status...") - status = await get_bot_status() - print("✓ Bot status retrieved:") - print(f" - Agent: {status.get('agent_name', 'unknown')}") - print(f" - Features Available: {status.get('features_available', False)}") - print(f" - Configuration: {status.get('configuration', {})}") - - return True - - except Exception as e: - print(f"✗ Enhanced bot test failed: {e}") - return False - - -async def test_personality_system(): - """Test the personality system components.""" - print("\nTesting Personality System...") - - try: - from voicebot.personality_system import personality_manager - - # Test listing templates - templates = personality_manager.list_templates() - print(f"✓ Found {len(templates)} personality templates:") - for template in templates: - print(f" - {template.id}: {template.description}") - - # Test creating personality from template - personality = personality_manager.create_personality_from_template("helpful_assistant") - if personality: - print(f"✓ Created personality: {personality.name}") - print(f" - Traits: {[trait.value for trait in personality.traits]}") - print(f" - Communication Style: {personality.communication_style.value}") - else: - print("✗ Failed to create personality") - - return True - - except Exception as e: - print(f"✗ Personality system test failed: {e}") - return False - - -async def test_conversation_context(): - """Test the conversation context management.""" - print("\nTesting Conversation Context...") - - try: - from voicebot.conversation_context import context_manager - - # Test creating context - context = context_manager.get_or_create_context( - session_id="test_session", - bot_name="test_bot", - conversation_id="test_conversation" - ) - - if context: - print(f"✓ Created conversation context: {context.conversation_id}") - - # Test adding conversation turn - context_manager.add_conversation_turn( - conversation_id=context.conversation_id, - user_message="Test message", - bot_response="Test response", - context_used={"test": "context"}, - metadata={"timestamp": time.time()} - ) - - print("✓ Added conversation turn") - print(f" - Turns in context: {len(context.turns)}") - - # Test 
context summary - summary = context_manager.get_context_for_response(context.conversation_id) - if summary: - print(f"✓ Generated context summary: {summary[:50]}...") - - return True - - except Exception as e: - print(f"✗ Conversation context test failed: {e}") - return False - - -async def test_integration_orchestrator(): - """Test the integration orchestrator.""" - print("\nTesting Integration Orchestrator...") - - try: - from step_5b_integration_demo import enhanced_orchestrator - - # Test bot discovery - enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots() - print(f"✓ Discovered {len(enhanced_bots)} bots") - - # Find enhanced bots - enhanced_count = sum(1 for bot_info in enhanced_bots.values() - if bot_info.get('enhanced_features', False)) - print(f"✓ Found {enhanced_count} enhanced bots") - - # Test analytics - analytics = enhanced_orchestrator.get_bot_analytics() - print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured") - - return True - - except Exception as e: - print(f"✗ Integration orchestrator test failed: {e}") - return False - - -async def run_all_tests(): - """Run all Step 5B tests.""" - print("=== Step 5B Enhanced Bot Management Tests ===\n") - - test_results = [] - - # Run individual tests - test_results.append(await test_enhanced_ai_chatbot()) - test_results.append(await test_personality_system()) - test_results.append(await test_conversation_context()) - test_results.append(await test_integration_orchestrator()) - - # Summary - passed = sum(test_results) - total = len(test_results) - - print(f"\n=== Test Results: {passed}/{total} tests passed ===") - - if passed == total: - print("🎉 All Step 5B components are working correctly!") - else: - print("⚠️ Some tests failed - check the output above for details") - - return passed == total - - -if __name__ == "__main__": - asyncio.run(run_all_tests()) diff --git a/voicebot/test_whisper_pipeline.py b/voicebot/test_whisper_pipeline.py deleted file mode 100644 index 61e80fc..0000000 --- a/voicebot/test_whisper_pipeline.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 -""" -Debug script to test Whisper transcription with synthetic audio. -This helps identify if the issue is with audio processing or the transcription pipeline. 
-""" - -import numpy as np -import time -import sys -import os - -# Add the voicebot directory to the path -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -try: - from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate -except ImportError as e: - print(f"Error importing whisper components: {e}") - print("Make sure you're running this from the voicebot directory") - sys.exit(1) - - -def generate_test_audio( - duration_seconds: float = 2.0, frequency: float = 440.0 -) -> np.ndarray: - """Generate a synthetic sine wave for testing.""" - samples = int(duration_seconds * sample_rate) - t = np.linspace(0, duration_seconds, samples, False) - # Generate a sine wave with some amplitude modulation to simulate speech-like patterns - amplitude = 0.1 * ( - 1 + 0.5 * np.sin(2 * np.pi * 2 * t) - ) # Amplitude modulation at 2Hz - audio = amplitude * np.sin(2 * np.pi * frequency * t) - return audio.astype(np.float32) - - -def test_transcription_pipeline(): - """Test the Whisper transcription pipeline with synthetic audio.""" - print("Testing Whisper transcription pipeline...") - - # Test 1: Complete silence - print("\n=== Test 1: Complete Silence ===") - silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32) - test_audio_transcription(silent_audio, "Silent audio") - - # Test 2: Very quiet noise - print("\n=== Test 2: Very Quiet Noise ===") - quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32) - test_audio_transcription(quiet_noise, "Quiet noise") - - # Test 3: Sine wave (should produce some output) - print("\n=== Test 3: Sine Wave ===") - sine_audio = generate_test_audio(2.0, 440.0) - test_audio_transcription(sine_audio, "Sine wave") - - # Test 4: Multiple frequency sine wave - print("\n=== Test 4: Complex Sine Wave ===") - complex_audio = ( - generate_test_audio(2.0, 220.0) - + generate_test_audio(2.0, 440.0) - + generate_test_audio(2.0, 880.0) - ) / 3.0 - test_audio_transcription(complex_audio, "Complex sine wave") - - -def test_audio_transcription(audio_array: np.ndarray, description: str): - """Test transcription of a specific audio array.""" - try: - # Calculate metrics - duration = len(audio_array) / sample_rate - rms = np.sqrt(np.mean(audio_array**2)) - peak = np.max(np.abs(audio_array)) - - print(f"Testing {description}:") - print(f" Duration: {duration:.2f}s") - print(f" Samples: {len(audio_array)}") - print(f" RMS: {rms:.6f}") - print(f" Peak: {peak:.6f}") - - # Test feature extraction - start_time = time.time() - input_features = extract_input_features(audio_array, sample_rate) - feature_time = time.time() - start_time - print(f" Feature extraction: {feature_time:.3f}s") - - # Test model inference - start_time = time.time() - predicted_ids = _pt_model.generate(input_features) - inference_time = time.time() - start_time - print(f" Model inference: {inference_time:.3f}s") - - # Test decoding - start_time = time.time() - transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True) - decoding_time = time.time() - start_time - print(f" Decoding: {decoding_time:.3f}s") - - # Show result - text = ( - transcription[0].strip() if transcription and len(transcription) > 0 else "" - ) - print(f" Result: '{text}'" if text else " Result: (empty)") - print(f" Result length: {len(text)}") - - except Exception as e: - print(f" ERROR: {e}") - - -if __name__ == "__main__": - test_transcription_pipeline()