Audio is now routing to buffers
parent 1074eb48dc
commit 0691dbf97f
@@ -18,6 +18,7 @@ from pydantic import BaseModel
import librosa
from logger import logger
from aiortc import MediaStreamTrack
from aiortc.mediastreams import MediaStreamError
from av import AudioFrame

# Import shared models for chat functionality
@@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
    audio_processor = _audio_processors[peer.peer_name]

    logger.info(
        f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
        f"Received audio track from {peer.peer_name}, starting transcription"
    )

    # Start the frame reception loop

    try:
        frame_count = 0
        while True:
            # Receive audio frame
            frame = await track.recv()
            try:
                # Receive audio frame
                frame = await track.recv()
                frame_count += 1
                # Log less frequently now that we know frames are being received
                if frame_count % 100 == 0:
                    logger.info(f"Received {frame_count} frames from {peer.peer_name}")
            except MediaStreamError as e:
                # Connection was closed or media stream ended - this is normal
                logger.info(
                    f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
                )
                break
            except Exception as e:
                # Other errors during frame reception
                logger.error(
                    f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
                )
                break

            # Check if this is an audio frame and convert to numpy array for processing
            if isinstance(frame, AudioFrame):
                # Convert AudioFrame to numpy array
                audio_data = frame.to_ndarray()
                try:
                    audio_data = frame.to_ndarray()
                except Exception as e:
                    logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
                    continue

                original_shape = audio_data.shape
                original_dtype = audio_data.dtype

@@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                )

                # Handle different audio formats
                # Handle different audio formats - convert stereo to mono if needed
                if audio_data.ndim == 2:  # Stereo -> mono
                    audio_data = np.mean(audio_data, axis=1)
                    logger.debug(
                        f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
                    )
                    if audio_data.shape[0] == 1:  # Shape is (1, samples) - just squeeze the first dimension
                        audio_data = audio_data.squeeze(0)
                        logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
                    else:  # True stereo (2, samples) or (samples, 2) - average channels
                        audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] > audio_data.shape[1] else 1)
                        logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")

                # Convert to float32 and normalize
                # Convert to float32 and normalize based on data type
                if audio_data.dtype == np.int16:
                    audio_data = audio_data.astype(np.float32) / 32768.0
                    logger.debug("Normalized int16 audio to float32")
@@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                    audio_data = audio_data.astype(np.float32) / 2147483648.0
                    logger.debug("Normalized int32 audio to float32")

                # Resample to 16kHz if needed
                # Resample to 16kHz if needed for Whisper model
                if frame.sample_rate != sample_rate:
                    original_length = len(audio_data)
                    audio_data = librosa.resample(  # type: ignore
                        audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
                    )

                    # Use librosa to resample with explicit float64 conversion for better precision
                    try:
                        audio_float64 = audio_data.astype(np.float64)

                        audio_data = librosa.resample(  # type: ignore
                            audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
                        )
                    except Exception as e:
                        logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
                        # Fall back to original data
                        audio_data = audio_data

                    logger.debug(
                        f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
                    )

                # Ensure audio_data is AudioArray (float32)
                else:
                    # No resampling needed
                    pass

                # Ensure audio_data is properly typed as float32 and calculate frame metrics
                audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))

                # Calculate audio quality metrics for this frame
                frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                frame_peak = np.max(np.abs(audio_data_float32))

@@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                        f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                    )

                # Send to audio processor
                # Send processed audio to the audio processor for transcription
                if audio_processor:
                    audio_processor.add_audio_data(audio_data_float32)
                else:
@@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):

    except Exception as e:
        logger.error(
            f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
            f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
        )
    finally:
        # Clean up the audio processor when the stream ends
        cleanup_peer_processor(peer.peer_name)


def agent_info() -> Dict[str, str]:
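For orientation, the hunks above amount to one per-frame conversion path: decode the av.AudioFrame to a numpy array, fold it to mono (squeezing packed (1, samples) frames rather than averaging them away), normalize integer PCM to float32, resample to 16 kHz, and hand the samples to the processor's buffer via add_audio_data. Below is a minimal standalone sketch of that path, not the project's actual code: SAMPLE_RATE, frame_to_mono_float32, and the plain list standing in for AudioProcessor.add_audio_data are illustrative assumptions, the steps are reordered slightly so the sketch is self-consistent, and the channel-axis choice is a simplified reading of the expression in the diff.

# Minimal sketch (assumes numpy and librosa are installed; names are illustrative).
import numpy as np
import librosa

SAMPLE_RATE = 16000  # assumed Whisper input rate, mirroring `sample_rate` in the diff


def frame_to_mono_float32(audio_data: np.ndarray, src_rate: int) -> np.ndarray:
    """Convert one decoded audio frame to mono float32 at SAMPLE_RATE."""
    # Normalize integer PCM to [-1.0, 1.0] before any averaging.
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    elif audio_data.dtype == np.int32:
        audio_data = audio_data.astype(np.float32) / 2147483648.0
    else:
        audio_data = audio_data.astype(np.float32)

    # av.AudioFrame.to_ndarray() typically yields (channels, samples); a packed
    # mono frame arrives as (1, samples), so squeeze it instead of averaging it away.
    if audio_data.ndim == 2:
        if audio_data.shape[0] == 1:
            audio_data = audio_data.squeeze(0)
        else:
            channel_axis = 0 if audio_data.shape[0] < audio_data.shape[1] else 1
            audio_data = audio_data.mean(axis=channel_axis)

    # Resample to the model rate; upcast for librosa, downcast again for the buffer.
    if src_rate != SAMPLE_RATE:
        audio_data = librosa.resample(
            audio_data.astype(np.float64), orig_sr=src_rate, target_sr=SAMPLE_RATE
        )

    return audio_data.astype(np.float32)


if __name__ == "__main__":
    # Fake a 20 ms, 48 kHz stereo int16 frame and route it into a buffer.
    fake_frame = (np.random.randn(2, 960) * 1000).astype(np.int16)
    buffer: list[np.ndarray] = []  # stand-in for AudioProcessor.add_audio_data
    buffer.append(frame_to_mono_float32(fake_frame, 48000))
    print(buffer[0].shape, buffer[0].dtype)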
@@ -1,195 +0,0 @@
"""
Simple test to verify Step 5B enhanced bot functionality.

This test verifies that the enhanced bot components work correctly
when integrated with the existing voicebot system.
"""

import asyncio
import os
import time

# Set up test environment variables
os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant"
os.environ["AI_CHATBOT_PROVIDER"] = "local"  # Use local provider for testing
os.environ["AI_CHATBOT_STREAMING"] = "false"
os.environ["AI_CHATBOT_MEMORY"] = "true"

# Import test modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import ChatMessageModel


async def test_enhanced_ai_chatbot():
    """Test the enhanced AI chatbot functionality."""
    print("Testing Enhanced AI Chatbot...")

    try:
        # Import the enhanced bot
        from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status

        # Create a mock send function
        responses = []
        async def mock_send(message: str):
            responses.append(message)
            print(f"Bot Response: {message}")

        # Test message handling
        test_message = ChatMessageModel(
            id="test_message_id",
            sender_name="test_user",
            sender_session_id="test_session",
            lobby_id="test_lobby",
            message="Hello, can you help me?",
            timestamp=time.time()
        )

        print(f"Sending test message: {test_message.message}")
        response = await handle_chat_message(test_message, mock_send)

        if response:
            print(f"✓ Bot responded successfully: {response[:50]}...")
        else:
            print("✗ Bot did not respond")

        # Test bot status
        print("\nTesting bot status...")
        status = await get_bot_status()
        print("✓ Bot status retrieved:")
        print(f"  - Agent: {status.get('agent_name', 'unknown')}")
        print(f"  - Features Available: {status.get('features_available', False)}")
        print(f"  - Configuration: {status.get('configuration', {})}")

        return True

    except Exception as e:
        print(f"✗ Enhanced bot test failed: {e}")
        return False


async def test_personality_system():
    """Test the personality system components."""
    print("\nTesting Personality System...")

    try:
        from voicebot.personality_system import personality_manager

        # Test listing templates
        templates = personality_manager.list_templates()
        print(f"✓ Found {len(templates)} personality templates:")
        for template in templates:
            print(f"  - {template.id}: {template.description}")

        # Test creating personality from template
        personality = personality_manager.create_personality_from_template("helpful_assistant")
        if personality:
            print(f"✓ Created personality: {personality.name}")
            print(f"  - Traits: {[trait.value for trait in personality.traits]}")
            print(f"  - Communication Style: {personality.communication_style.value}")
        else:
            print("✗ Failed to create personality")

        return True

    except Exception as e:
        print(f"✗ Personality system test failed: {e}")
        return False


async def test_conversation_context():
    """Test the conversation context management."""
    print("\nTesting Conversation Context...")

    try:
        from voicebot.conversation_context import context_manager

        # Test creating context
        context = context_manager.get_or_create_context(
            session_id="test_session",
            bot_name="test_bot",
            conversation_id="test_conversation"
        )

        if context:
            print(f"✓ Created conversation context: {context.conversation_id}")

            # Test adding conversation turn
            context_manager.add_conversation_turn(
                conversation_id=context.conversation_id,
                user_message="Test message",
                bot_response="Test response",
                context_used={"test": "context"},
                metadata={"timestamp": time.time()}
            )

            print("✓ Added conversation turn")
            print(f"  - Turns in context: {len(context.turns)}")

            # Test context summary
            summary = context_manager.get_context_for_response(context.conversation_id)
            if summary:
                print(f"✓ Generated context summary: {summary[:50]}...")

        return True

    except Exception as e:
        print(f"✗ Conversation context test failed: {e}")
        return False


async def test_integration_orchestrator():
    """Test the integration orchestrator."""
    print("\nTesting Integration Orchestrator...")

    try:
        from step_5b_integration_demo import enhanced_orchestrator

        # Test bot discovery
        enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots()
        print(f"✓ Discovered {len(enhanced_bots)} bots")

        # Find enhanced bots
        enhanced_count = sum(1 for bot_info in enhanced_bots.values()
                             if bot_info.get('enhanced_features', False))
        print(f"✓ Found {enhanced_count} enhanced bots")

        # Test analytics
        analytics = enhanced_orchestrator.get_bot_analytics()
        print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured")

        return True

    except Exception as e:
        print(f"✗ Integration orchestrator test failed: {e}")
        return False


async def run_all_tests():
    """Run all Step 5B tests."""
    print("=== Step 5B Enhanced Bot Management Tests ===\n")

    test_results = []

    # Run individual tests
    test_results.append(await test_enhanced_ai_chatbot())
    test_results.append(await test_personality_system())
    test_results.append(await test_conversation_context())
    test_results.append(await test_integration_orchestrator())

    # Summary
    passed = sum(test_results)
    total = len(test_results)

    print(f"\n=== Test Results: {passed}/{total} tests passed ===")

    if passed == total:
        print("🎉 All Step 5B components are working correctly!")
    else:
        print("⚠️ Some tests failed - check the output above for details")

    return passed == total


if __name__ == "__main__":
    asyncio.run(run_all_tests())
@@ -1,110 +0,0 @@
#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify if the issue is with audio processing or the transcription pipeline.
"""

import numpy as np
import time
import sys
import os

# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
    print(f"Error importing whisper components: {e}")
    print("Make sure you're running this from the voicebot directory")
    sys.exit(1)


def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Generate a synthetic sine wave for testing."""
    samples = int(duration_seconds * sample_rate)
    t = np.linspace(0, duration_seconds, samples, False)
    # Generate a sine wave with some amplitude modulation to simulate speech-like patterns
    amplitude = 0.1 * (
        1 + 0.5 * np.sin(2 * np.pi * 2 * t)
    )  # Amplitude modulation at 2Hz
    audio = amplitude * np.sin(2 * np.pi * frequency * t)
    return audio.astype(np.float32)


def test_transcription_pipeline():
    """Test the Whisper transcription pipeline with synthetic audio."""
    print("Testing Whisper transcription pipeline...")

    # Test 1: Complete silence
    print("\n=== Test 1: Complete Silence ===")
    silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
    test_audio_transcription(silent_audio, "Silent audio")

    # Test 2: Very quiet noise
    print("\n=== Test 2: Very Quiet Noise ===")
    quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
    test_audio_transcription(quiet_noise, "Quiet noise")

    # Test 3: Sine wave (should produce some output)
    print("\n=== Test 3: Sine Wave ===")
    sine_audio = generate_test_audio(2.0, 440.0)
    test_audio_transcription(sine_audio, "Sine wave")

    # Test 4: Multiple frequency sine wave
    print("\n=== Test 4: Complex Sine Wave ===")
    complex_audio = (
        generate_test_audio(2.0, 220.0)
        + generate_test_audio(2.0, 440.0)
        + generate_test_audio(2.0, 880.0)
    ) / 3.0
    test_audio_transcription(complex_audio, "Complex sine wave")


def test_audio_transcription(audio_array: np.ndarray, description: str):
    """Test transcription of a specific audio array."""
    try:
        # Calculate metrics
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))

        print(f"Testing {description}:")
        print(f"  Duration: {duration:.2f}s")
        print(f"  Samples: {len(audio_array)}")
        print(f"  RMS: {rms:.6f}")
        print(f"  Peak: {peak:.6f}")

        # Test feature extraction
        start_time = time.time()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.time() - start_time
        print(f"  Feature extraction: {feature_time:.3f}s")

        # Test model inference
        start_time = time.time()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.time() - start_time
        print(f"  Model inference: {inference_time:.3f}s")

        # Test decoding
        start_time = time.time()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.time() - start_time
        print(f"  Decoding: {decoding_time:.3f}s")

        # Show result
        text = (
            transcription[0].strip() if transcription and len(transcription) > 0 else ""
        )
        print(f"  Result: '{text}'" if text else "  Result: (empty)")
        print(f"  Result length: {len(text)}")

    except Exception as e:
        print(f"  ERROR: {e}")


if __name__ == "__main__":
    test_transcription_pipeline()