Audio is now routing to buffers

James Ketr 2025-09-07 22:46:45 -07:00
parent 1074eb48dc
commit 0691dbf97f
3 changed files with 64 additions and 325 deletions

View File

@ -18,6 +18,7 @@ from pydantic import BaseModel
import librosa
from logger import logger
from aiortc import MediaStreamTrack
from aiortc.mediastreams import MediaStreamError
from av import AudioFrame
# Import shared models for chat functionality
@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
audio_processor = _audio_processors[peer.peer_name]
logger.info(
f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
f"Received audio track from {peer.peer_name}, starting transcription"
)
# Start the frame reception loop
try:
frame_count = 0
while True:
# Receive audio frame
frame = await track.recv()
try:
# Receive audio frame
frame = await track.recv()
frame_count += 1
# Log less frequently now that we know frames are being received
if frame_count % 100 == 0:
logger.info(f"Received {frame_count} frames from {peer.peer_name}")
except MediaStreamError as e:
# Connection was closed or media stream ended - this is normal
logger.info(
f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
)
break
except Exception as e:
# Other errors during frame reception
logger.error(
f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
)
break
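For reference, the receive loop above follows a common aiortc pattern; a minimal standalone sketch (process_frame is a hypothetical callback, not part of this diff):

from aiortc import MediaStreamTrack
from aiortc.mediastreams import MediaStreamError

async def receive_frames(track: MediaStreamTrack, process_frame) -> None:
    # Pull frames until the remote peer ends the stream.
    while True:
        try:
            frame = await track.recv()
        except MediaStreamError:
            # Normal termination: the track stopped or the connection closed.
            break
        process_frame(frame)  # hypothetical per-frame handler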
# Check if this is an audio frame and convert to numpy array for processing
if isinstance(frame, AudioFrame):
# Convert AudioFrame to numpy array
audio_data = frame.to_ndarray()
try:
audio_data = frame.to_ndarray()
except Exception as e:
logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
continue
original_shape = audio_data.shape
original_dtype = audio_data.dtype
@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
)
# Handle different audio formats
# Handle different audio formats - convert stereo to mono if needed
if audio_data.ndim == 2: # Stereo -> mono
audio_data = np.mean(audio_data, axis=1)
logger.debug(
f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
)
if audio_data.shape[0] == 1: # Shape is (1, samples) - just squeeze the first dimension
audio_data = audio_data.squeeze(0)
logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
else: # True stereo (2, samples) or (samples, 2) - average channels
audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] < audio_data.shape[1] else 1)
logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")
# Convert to float32 and normalize
# Convert to float32 and normalize based on data type
if audio_data.dtype == np.int16:
audio_data = audio_data.astype(np.float32) / 32768.0
logger.debug("Normalized int16 audio to float32")
@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
audio_data = audio_data.astype(np.float32) / 2147483648.0
logger.debug("Normalized int32 audio to float32")
# Resample to 16kHz if needed
# Resample to 16kHz if needed for Whisper model
if frame.sample_rate != sample_rate:
original_length = len(audio_data)
audio_data = librosa.resample( # type: ignore
audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
)
# Use librosa to resample with explicit float64 conversion for better precision
try:
audio_float64 = audio_data.astype(np.float64)
audio_data = librosa.resample( # type: ignore
audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
)
except Exception as e:
logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
# Fall back to the original (unresampled) audio data
logger.debug(
f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
)
# Ensure audio_data is AudioArray (float32)
else:
# No resampling needed
pass
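The resampling branch reduces to a single librosa call; a sketch assuming a hypothetical resample_for_whisper helper and a 16 kHz target rate:

import numpy as np
import librosa

def resample_for_whisper(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
    # librosa.resample works on floating-point data; upcast first, downcast after.
    if orig_sr == target_sr:
        return audio.astype(np.float32)
    resampled = librosa.resample(audio.astype(np.float64), orig_sr=orig_sr, target_sr=target_sr)
    return resampled.astype(np.float32)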
# Ensure audio_data is properly typed as float32 and calculate frame metrics
audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
# Calculate audio quality metrics for this frame
frame_rms = np.sqrt(np.mean(audio_data_float32**2))
frame_peak = np.max(np.abs(audio_data_float32))
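RMS and peak are enough to flag silent frames; a sketch of the metrics, plus a dBFS conversion that is an addition here rather than something taken from the diff:

import numpy as np

def frame_levels(audio: np.ndarray) -> tuple[float, float]:
    # Return (rms, peak) for a float32 frame normalized to [-1.0, 1.0].
    rms = float(np.sqrt(np.mean(audio ** 2))) if audio.size else 0.0
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    return rms, peak

def to_dbfs(level: float) -> float:
    # Convert a linear level to dBFS, clamping at a floor for silence.
    return 20.0 * float(np.log10(max(level, 1e-10)))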
@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
)
# Send to audio processor
# Send processed audio to the audio processor for transcription
if audio_processor:
audio_processor.add_audio_data(audio_data_float32)
else:
@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
except Exception as e:
logger.error(
f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
)
finally:
# Clean up the audio processor when the stream ends
cleanup_peer_processor(peer.peer_name)
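The cleanup call pairs with the per-peer registry seen at the top of the handler; one possible shape, assuming _audio_processors is a module-level dict keyed by peer name (illustrative only, not the project's implementation):

def cleanup_peer_processor(peer_name: str) -> None:
    # Illustrative sketch: drop the processor for a peer whose stream ended.
    processor = _audio_processors.pop(peer_name, None)
    if processor is not None:
        logger.info(f"Cleaned up audio processor for {peer_name}")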
def agent_info() -> Dict[str, str]:

View File

@ -1,195 +0,0 @@
"""
Simple test to verify Step 5B enhanced bot functionality.
It checks that the enhanced bot components work correctly
when integrated with the existing voicebot system.
"""
import asyncio
import os
import time
# Set up test environment variables
os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant"
os.environ["AI_CHATBOT_PROVIDER"] = "local" # Use local provider for testing
os.environ["AI_CHATBOT_STREAMING"] = "false"
os.environ["AI_CHATBOT_MEMORY"] = "true"
# Import test modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import ChatMessageModel
async def test_enhanced_ai_chatbot():
"""Test the enhanced AI chatbot functionality."""
print("Testing Enhanced AI Chatbot...")
try:
# Import the enhanced bot
from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status
# Create a mock send function
responses = []
async def mock_send(message: str):
responses.append(message)
print(f"Bot Response: {message}")
# Test message handling
test_message = ChatMessageModel(
id="test_message_id",
sender_name="test_user",
sender_session_id="test_session",
lobby_id="test_lobby",
message="Hello, can you help me?",
timestamp=time.time()
)
print(f"Sending test message: {test_message.message}")
response = await handle_chat_message(test_message, mock_send)
if response:
print(f"✓ Bot responded successfully: {response[:50]}...")
else:
print("✗ Bot did not respond")
# Test bot status
print("\nTesting bot status...")
status = await get_bot_status()
print("✓ Bot status retrieved:")
print(f" - Agent: {status.get('agent_name', 'unknown')}")
print(f" - Features Available: {status.get('features_available', False)}")
print(f" - Configuration: {status.get('configuration', {})}")
return True
except Exception as e:
print(f"✗ Enhanced bot test failed: {e}")
return False
async def test_personality_system():
"""Test the personality system components."""
print("\nTesting Personality System...")
try:
from voicebot.personality_system import personality_manager
# Test listing templates
templates = personality_manager.list_templates()
print(f"✓ Found {len(templates)} personality templates:")
for template in templates:
print(f" - {template.id}: {template.description}")
# Test creating personality from template
personality = personality_manager.create_personality_from_template("helpful_assistant")
if personality:
print(f"✓ Created personality: {personality.name}")
print(f" - Traits: {[trait.value for trait in personality.traits]}")
print(f" - Communication Style: {personality.communication_style.value}")
else:
print("✗ Failed to create personality")
return True
except Exception as e:
print(f"✗ Personality system test failed: {e}")
return False
async def test_conversation_context():
"""Test the conversation context management."""
print("\nTesting Conversation Context...")
try:
from voicebot.conversation_context import context_manager
# Test creating context
context = context_manager.get_or_create_context(
session_id="test_session",
bot_name="test_bot",
conversation_id="test_conversation"
)
if context:
print(f"✓ Created conversation context: {context.conversation_id}")
# Test adding conversation turn
context_manager.add_conversation_turn(
conversation_id=context.conversation_id,
user_message="Test message",
bot_response="Test response",
context_used={"test": "context"},
metadata={"timestamp": time.time()}
)
print("✓ Added conversation turn")
print(f" - Turns in context: {len(context.turns)}")
# Test context summary
summary = context_manager.get_context_for_response(context.conversation_id)
if summary:
print(f"✓ Generated context summary: {summary[:50]}...")
return True
except Exception as e:
print(f"✗ Conversation context test failed: {e}")
return False
async def test_integration_orchestrator():
"""Test the integration orchestrator."""
print("\nTesting Integration Orchestrator...")
try:
from step_5b_integration_demo import enhanced_orchestrator
# Test bot discovery
enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots()
print(f"✓ Discovered {len(enhanced_bots)} bots")
# Find enhanced bots
enhanced_count = sum(1 for bot_info in enhanced_bots.values()
if bot_info.get('enhanced_features', False))
print(f"✓ Found {enhanced_count} enhanced bots")
# Test analytics
analytics = enhanced_orchestrator.get_bot_analytics()
print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured")
return True
except Exception as e:
print(f"✗ Integration orchestrator test failed: {e}")
return False
async def run_all_tests():
"""Run all Step 5B tests."""
print("=== Step 5B Enhanced Bot Management Tests ===\n")
test_results = []
# Run individual tests
test_results.append(await test_enhanced_ai_chatbot())
test_results.append(await test_personality_system())
test_results.append(await test_conversation_context())
test_results.append(await test_integration_orchestrator())
# Summary
passed = sum(test_results)
total = len(test_results)
print(f"\n=== Test Results: {passed}/{total} tests passed ===")
if passed == total:
print("🎉 All Step 5B components are working correctly!")
else:
print("⚠️ Some tests failed - check the output above for details")
return passed == total
if __name__ == "__main__":
asyncio.run(run_all_tests())

View File

@ -1,110 +0,0 @@
#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify whether the issue is with audio processing or with the transcription pipeline.
"""
import numpy as np
import time
import sys
import os
# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
print(f"Error importing whisper components: {e}")
print("Make sure you're running this from the voicebot directory")
sys.exit(1)
def generate_test_audio(
duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
"""Generate a synthetic sine wave for testing."""
samples = int(duration_seconds * sample_rate)
t = np.linspace(0, duration_seconds, samples, False)
# Generate a sine wave with some amplitude modulation to simulate speech-like patterns
amplitude = 0.1 * (
1 + 0.5 * np.sin(2 * np.pi * 2 * t)
) # Amplitude modulation at 2Hz
audio = amplitude * np.sin(2 * np.pi * frequency * t)
return audio.astype(np.float32)
def test_transcription_pipeline():
"""Test the Whisper transcription pipeline with synthetic audio."""
print("Testing Whisper transcription pipeline...")
# Test 1: Complete silence
print("\n=== Test 1: Complete Silence ===")
silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
test_audio_transcription(silent_audio, "Silent audio")
# Test 2: Very quiet noise
print("\n=== Test 2: Very Quiet Noise ===")
quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
test_audio_transcription(quiet_noise, "Quiet noise")
# Test 3: Sine wave (should produce some output)
print("\n=== Test 3: Sine Wave ===")
sine_audio = generate_test_audio(2.0, 440.0)
test_audio_transcription(sine_audio, "Sine wave")
# Test 4: Multiple frequency sine wave
print("\n=== Test 4: Complex Sine Wave ===")
complex_audio = (
generate_test_audio(2.0, 220.0)
+ generate_test_audio(2.0, 440.0)
+ generate_test_audio(2.0, 880.0)
) / 3.0
test_audio_transcription(complex_audio, "Complex sine wave")
def test_audio_transcription(audio_array: np.ndarray, description: str):
"""Test transcription of a specific audio array."""
try:
# Calculate metrics
duration = len(audio_array) / sample_rate
rms = np.sqrt(np.mean(audio_array**2))
peak = np.max(np.abs(audio_array))
print(f"Testing {description}:")
print(f" Duration: {duration:.2f}s")
print(f" Samples: {len(audio_array)}")
print(f" RMS: {rms:.6f}")
print(f" Peak: {peak:.6f}")
# Test feature extraction
start_time = time.time()
input_features = extract_input_features(audio_array, sample_rate)
feature_time = time.time() - start_time
print(f" Feature extraction: {feature_time:.3f}s")
# Test model inference
start_time = time.time()
predicted_ids = _pt_model.generate(input_features)
inference_time = time.time() - start_time
print(f" Model inference: {inference_time:.3f}s")
# Test decoding
start_time = time.time()
transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
decoding_time = time.time() - start_time
print(f" Decoding: {decoding_time:.3f}s")
# Show result
text = (
transcription[0].strip() if transcription and len(transcription) > 0 else ""
)
print(f" Result: '{text}'" if text else " Result: (empty)")
print(f" Result length: {len(text)}")
except Exception as e:
print(f" ERROR: {e}")
if __name__ == "__main__":
test_transcription_pipeline()
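If synthetic tones are not enough, a recorded clip could be pushed through the same helper; a sketch where sample.wav is a hypothetical local file:

import librosa

# Load a recording at the Whisper sample rate and reuse the same check.
speech, _ = librosa.load("sample.wav", sr=sample_rate, mono=True)  # hypothetical file
test_audio_transcription(speech, "Recorded speech")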