Audio is now routing to buffers

parent 1074eb48dc
commit 0691dbf97f
@@ -18,6 +18,7 @@ from pydantic import BaseModel
 import librosa
 from logger import logger
 from aiortc import MediaStreamTrack
+from aiortc.mediastreams import MediaStreamError
 from av import AudioFrame
 
 # Import shared models for chat functionality
@@ -472,16 +473,43 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
     audio_processor = _audio_processors[peer.peer_name]
 
     logger.info(
-        f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
+        f"Received audio track from {peer.peer_name}, starting transcription"
     )
 
+    # Start the frame reception loop
+
     try:
+        frame_count = 0
         while True:
-            # Receive audio frame
-            frame = await track.recv()
+            try:
+                # Receive audio frame
+                frame = await track.recv()
+                frame_count += 1
+                # Log less frequently now that we know frames are being received
+                if frame_count % 100 == 0:
+                    logger.info(f"Received {frame_count} frames from {peer.peer_name}")
+            except MediaStreamError as e:
+                # Connection was closed or media stream ended - this is normal
+                logger.info(
+                    f"Audio stream ended for {peer.peer_name} (MediaStreamError: {e})"
+                )
+                break
+            except Exception as e:
+                # Other errors during frame reception
+                logger.error(
+                    f"Error receiving audio frame from {peer.peer_name}: {e}", exc_info=True
+                )
+                break
+
+            # Check if this is an audio frame and convert to numpy array for processing
             if isinstance(frame, AudioFrame):
                 # Convert AudioFrame to numpy array
-                audio_data = frame.to_ndarray()
+                try:
+                    audio_data = frame.to_ndarray()
+                except Exception as e:
+                    logger.error(f"Error converting frame to ndarray for {peer.peer_name}: {e}")
+                    continue
+
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
 
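Note: the loop added above relies on aiortc's behaviour that `track.recv()` raises `MediaStreamError` once the remote track ends, so the `except MediaStreamError` branch is the normal exit path. A minimal, self-contained sketch of that reception pattern (the helper name `consume_audio` and its return value are illustrative, not part of this codebase):

    from aiortc import MediaStreamTrack
    from aiortc.mediastreams import MediaStreamError
    from av import AudioFrame

    async def consume_audio(track: MediaStreamTrack) -> int:
        """Count AudioFrames until the remote side stops sending."""
        received = 0
        while True:
            try:
                frame = await track.recv()  # raises MediaStreamError when the track ends
            except MediaStreamError:
                break  # normal shutdown, mirrors the break in the diff above
            if isinstance(frame, AudioFrame):
                received += 1
        return received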
@@ -489,14 +517,16 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
 
-                # Handle different audio formats
+                # Handle different audio formats - convert stereo to mono if needed
                 if audio_data.ndim == 2:  # Stereo -> mono
-                    audio_data = np.mean(audio_data, axis=1)
-                    logger.debug(
-                        f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
-                    )
+                    if audio_data.shape[0] == 1:  # Shape is (1, samples) - just squeeze the first dimension
+                        audio_data = audio_data.squeeze(0)
+                        logger.debug(f"Squeezed single-channel audio: {original_shape} -> {audio_data.shape}")
+                    else:  # True stereo (2, samples) or (samples, 2) - average channels
+                        audio_data = np.mean(audio_data, axis=0 if audio_data.shape[0] > audio_data.shape[1] else 1)
+                        logger.debug(f"Converted stereo to mono: {original_shape} -> {audio_data.shape}")
 
-                # Convert to float32 and normalize
+                # Convert to float32 and normalize based on data type
                 if audio_data.dtype == np.int16:
                     audio_data = audio_data.astype(np.float32) / 32768.0
                     logger.debug("Normalized int16 audio to float32")
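The branch above distinguishes a `(1, samples)` frame (one channel packed into a 2-D array, where squeezing is enough) from genuine two-channel audio (where the channels are averaged). A small standalone numpy sketch of that intent, assuming the channel axis is the smaller dimension:

    import numpy as np

    def to_mono(audio: np.ndarray) -> np.ndarray:
        """Collapse a 2-D audio frame to a 1-D mono signal."""
        if audio.ndim == 2:
            if audio.shape[0] == 1:
                return audio.squeeze(0)               # (1, N) -> (N,)
            channel_axis = 0 if audio.shape[0] < audio.shape[1] else 1
            return audio.mean(axis=channel_axis)      # average the channel axis
        return audio

    print(to_mono(np.zeros((1, 960))).shape)  # (960,)
    print(to_mono(np.zeros((2, 960))).shape)  # (960,)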
@@ -504,20 +534,31 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                     audio_data = audio_data.astype(np.float32) / 2147483648.0
                     logger.debug("Normalized int32 audio to float32")
 
-                # Resample to 16kHz if needed
+                # Resample to 16kHz if needed for Whisper model
                 if frame.sample_rate != sample_rate:
                     original_length = len(audio_data)
-                    audio_data = librosa.resample(  # type: ignore
-                        audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
-                    )
+                    # Use librosa to resample with explicit float64 conversion for better precision
+                    try:
+                        audio_float64 = audio_data.astype(np.float64)
+
+                        audio_data = librosa.resample(  # type: ignore
+                            audio_float64, orig_sr=frame.sample_rate, target_sr=sample_rate
+                        )
+                    except Exception as e:
+                        logger.error(f"Resampling failed for {peer.peer_name}: {str(e)}")
+                        # Fall back to original data
+                        audio_data = audio_data
+
                     logger.debug(
                         f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
                     )
+                else:
+                    # No resampling needed
+                    pass
 
-                # Ensure audio_data is AudioArray (float32)
+                # Ensure audio_data is properly typed as float32 and calculate frame metrics
                 audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
 
-                # Calculate audio quality metrics for this frame
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
 
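The normalisation divisors above are the full-scale values of the integer sample formats (2^15 for int16, 2^31 for int32), and current librosa versions take `orig_sr`/`target_sr` as keyword arguments. A short standalone sketch of the normalise-then-resample step (the 48000 -> 16000 rate pair is only an example):

    import numpy as np
    import librosa

    # One second of a 440 Hz tone as int16 PCM at 48 kHz
    pcm = (20000 * np.sin(2 * np.pi * 440 * np.arange(48000) / 48000)).astype(np.int16)

    samples = pcm.astype(np.float32) / 32768.0  # scale int16 full range into [-1.0, 1.0)
    resampled = librosa.resample(samples.astype(np.float64), orig_sr=48000, target_sr=16000)
    print(len(samples), "->", len(resampled))   # 48000 -> 16000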
@@ -563,7 +604,7 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                         f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
 
-                # Send to audio processor
+                # Send processed audio to the audio processor for transcription
                 if audio_processor:
                     audio_processor.add_audio_data(audio_data_float32)
                 else:
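RMS and peak are cheap per-frame indicators of whether anything audible is arriving; the log line above uses RMS to flag silent frames. A self-contained sketch of those metrics (the 1e-4 silence threshold is an assumption for illustration, not the project's actual cutoff):

    import numpy as np

    def frame_metrics(frame: np.ndarray) -> tuple[float, float]:
        """Return (RMS, peak) of a float32 audio frame."""
        frame = frame.astype(np.float32)
        rms = float(np.sqrt(np.mean(frame ** 2)))
        peak = float(np.max(np.abs(frame)))
        return rms, peak

    silent = np.zeros(1600, dtype=np.float32)
    tone = (0.1 * np.sin(2 * np.pi * 440 * np.arange(1600) / 16000)).astype(np.float32)
    for name, frame in [("silent", silent), ("tone", tone)]:
        rms, peak = frame_metrics(frame)
        print(f"{name}: RMS={rms:.6f} peak={peak:.6f} silent={rms < 1e-4}")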
@@ -577,8 +618,11 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
 
     except Exception as e:
         logger.error(
-            f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
+            f"Unexpected error processing audio track from {peer.peer_name}: {e}", exc_info=True
         )
+    finally:
+        # Clean up the audio processor when the stream ends
+        cleanup_peer_processor(peer.peer_name)
 
 
 def agent_info() -> Dict[str, str]:
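The new `finally` block guarantees the per-peer processor is released whether the receive loop ends normally (MediaStreamError) or via an unexpected exception. A minimal sketch of that lifecycle, with illustrative names rather than this project's API:

    import asyncio

    processors: dict[str, list] = {}

    def cleanup(peer_name: str) -> None:
        processors.pop(peer_name, None)  # idempotent teardown

    async def handle(peer_name: str) -> None:
        processors[peer_name] = []
        try:
            raise RuntimeError("stream failed")  # stand-in for any receive error
        finally:
            cleanup(peer_name)  # runs on success and on error alike

    try:
        asyncio.run(handle("alice"))
    except RuntimeError:
        pass
    print(processors)  # {} - the processor was cleaned up despite the error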
@@ -1,195 +0,0 @@
-"""
-Simple test to verify Step 5B enhanced bot functionality.
-
-This test verifies that the enhanced bot components work correctly
-when integrated with the existing voicebot system.
-"""
-
-import asyncio
-import os
-import time
-
-# Set up test environment variables
-os.environ["AI_CHATBOT_PERSONALITY"] = "helpful_assistant"
-os.environ["AI_CHATBOT_PROVIDER"] = "local"  # Use local provider for testing
-os.environ["AI_CHATBOT_STREAMING"] = "false"
-os.environ["AI_CHATBOT_MEMORY"] = "true"
-
-# Import test modules
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from shared.models import ChatMessageModel
-
-
-async def test_enhanced_ai_chatbot():
-    """Test the enhanced AI chatbot functionality."""
-    print("Testing Enhanced AI Chatbot...")
-
-    try:
-        # Import the enhanced bot
-        from voicebot.bots.ai_chatbot import handle_chat_message, get_bot_status
-
-        # Create a mock send function
-        responses = []
-        async def mock_send(message: str):
-            responses.append(message)
-            print(f"Bot Response: {message}")
-
-        # Test message handling
-        test_message = ChatMessageModel(
-            id="test_message_id",
-            sender_name="test_user",
-            sender_session_id="test_session",
-            lobby_id="test_lobby",
-            message="Hello, can you help me?",
-            timestamp=time.time()
-        )
-
-        print(f"Sending test message: {test_message.message}")
-        response = await handle_chat_message(test_message, mock_send)
-
-        if response:
-            print(f"✓ Bot responded successfully: {response[:50]}...")
-        else:
-            print("✗ Bot did not respond")
-
-        # Test bot status
-        print("\nTesting bot status...")
-        status = await get_bot_status()
-        print("✓ Bot status retrieved:")
-        print(f"  - Agent: {status.get('agent_name', 'unknown')}")
-        print(f"  - Features Available: {status.get('features_available', False)}")
-        print(f"  - Configuration: {status.get('configuration', {})}")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Enhanced bot test failed: {e}")
-        return False
-
-
-async def test_personality_system():
-    """Test the personality system components."""
-    print("\nTesting Personality System...")
-
-    try:
-        from voicebot.personality_system import personality_manager
-
-        # Test listing templates
-        templates = personality_manager.list_templates()
-        print(f"✓ Found {len(templates)} personality templates:")
-        for template in templates:
-            print(f"  - {template.id}: {template.description}")
-
-        # Test creating personality from template
-        personality = personality_manager.create_personality_from_template("helpful_assistant")
-        if personality:
-            print(f"✓ Created personality: {personality.name}")
-            print(f"  - Traits: {[trait.value for trait in personality.traits]}")
-            print(f"  - Communication Style: {personality.communication_style.value}")
-        else:
-            print("✗ Failed to create personality")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Personality system test failed: {e}")
-        return False
-
-
-async def test_conversation_context():
-    """Test the conversation context management."""
-    print("\nTesting Conversation Context...")
-
-    try:
-        from voicebot.conversation_context import context_manager
-
-        # Test creating context
-        context = context_manager.get_or_create_context(
-            session_id="test_session",
-            bot_name="test_bot",
-            conversation_id="test_conversation"
-        )
-
-        if context:
-            print(f"✓ Created conversation context: {context.conversation_id}")
-
-            # Test adding conversation turn
-            context_manager.add_conversation_turn(
-                conversation_id=context.conversation_id,
-                user_message="Test message",
-                bot_response="Test response",
-                context_used={"test": "context"},
-                metadata={"timestamp": time.time()}
-            )
-
-            print("✓ Added conversation turn")
-            print(f"  - Turns in context: {len(context.turns)}")
-
-            # Test context summary
-            summary = context_manager.get_context_for_response(context.conversation_id)
-            if summary:
-                print(f"✓ Generated context summary: {summary[:50]}...")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Conversation context test failed: {e}")
-        return False
-
-
-async def test_integration_orchestrator():
-    """Test the integration orchestrator."""
-    print("\nTesting Integration Orchestrator...")
-
-    try:
-        from step_5b_integration_demo import enhanced_orchestrator
-
-        # Test bot discovery
-        enhanced_bots = await enhanced_orchestrator.discover_enhanced_bots()
-        print(f"✓ Discovered {len(enhanced_bots)} bots")
-
-        # Find enhanced bots
-        enhanced_count = sum(1 for bot_info in enhanced_bots.values()
-                             if bot_info.get('enhanced_features', False))
-        print(f"✓ Found {enhanced_count} enhanced bots")
-
-        # Test analytics
-        analytics = enhanced_orchestrator.get_bot_analytics()
-        print(f"✓ Analytics: {analytics['enhanced_bots_count']} enhanced bots configured")
-
-        return True
-
-    except Exception as e:
-        print(f"✗ Integration orchestrator test failed: {e}")
-        return False
-
-
-async def run_all_tests():
-    """Run all Step 5B tests."""
-    print("=== Step 5B Enhanced Bot Management Tests ===\n")
-
-    test_results = []
-
-    # Run individual tests
-    test_results.append(await test_enhanced_ai_chatbot())
-    test_results.append(await test_personality_system())
-    test_results.append(await test_conversation_context())
-    test_results.append(await test_integration_orchestrator())
-
-    # Summary
-    passed = sum(test_results)
-    total = len(test_results)
-
-    print(f"\n=== Test Results: {passed}/{total} tests passed ===")
-
-    if passed == total:
-        print("🎉 All Step 5B components are working correctly!")
-    else:
-        print("⚠️ Some tests failed - check the output above for details")
-
-    return passed == total
-
-
-if __name__ == "__main__":
-    asyncio.run(run_all_tests())
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-"""
-Debug script to test Whisper transcription with synthetic audio.
-This helps identify if the issue is with audio processing or the transcription pipeline.
-"""
-
-import numpy as np
-import time
-import sys
-import os
-
-# Add the voicebot directory to the path
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-try:
-    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
-except ImportError as e:
-    print(f"Error importing whisper components: {e}")
-    print("Make sure you're running this from the voicebot directory")
-    sys.exit(1)
-
-
-def generate_test_audio(
-    duration_seconds: float = 2.0, frequency: float = 440.0
-) -> np.ndarray:
-    """Generate a synthetic sine wave for testing."""
-    samples = int(duration_seconds * sample_rate)
-    t = np.linspace(0, duration_seconds, samples, False)
-    # Generate a sine wave with some amplitude modulation to simulate speech-like patterns
-    amplitude = 0.1 * (
-        1 + 0.5 * np.sin(2 * np.pi * 2 * t)
-    )  # Amplitude modulation at 2Hz
-    audio = amplitude * np.sin(2 * np.pi * frequency * t)
-    return audio.astype(np.float32)
-
-
-def test_transcription_pipeline():
-    """Test the Whisper transcription pipeline with synthetic audio."""
-    print("Testing Whisper transcription pipeline...")
-
-    # Test 1: Complete silence
-    print("\n=== Test 1: Complete Silence ===")
-    silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
-    test_audio_transcription(silent_audio, "Silent audio")
-
-    # Test 2: Very quiet noise
-    print("\n=== Test 2: Very Quiet Noise ===")
-    quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
-    test_audio_transcription(quiet_noise, "Quiet noise")
-
-    # Test 3: Sine wave (should produce some output)
-    print("\n=== Test 3: Sine Wave ===")
-    sine_audio = generate_test_audio(2.0, 440.0)
-    test_audio_transcription(sine_audio, "Sine wave")
-
-    # Test 4: Multiple frequency sine wave
-    print("\n=== Test 4: Complex Sine Wave ===")
-    complex_audio = (
-        generate_test_audio(2.0, 220.0)
-        + generate_test_audio(2.0, 440.0)
-        + generate_test_audio(2.0, 880.0)
-    ) / 3.0
-    test_audio_transcription(complex_audio, "Complex sine wave")
-
-
-def test_audio_transcription(audio_array: np.ndarray, description: str):
-    """Test transcription of a specific audio array."""
-    try:
-        # Calculate metrics
-        duration = len(audio_array) / sample_rate
-        rms = np.sqrt(np.mean(audio_array**2))
-        peak = np.max(np.abs(audio_array))
-
-        print(f"Testing {description}:")
-        print(f"  Duration: {duration:.2f}s")
-        print(f"  Samples: {len(audio_array)}")
-        print(f"  RMS: {rms:.6f}")
-        print(f"  Peak: {peak:.6f}")
-
-        # Test feature extraction
-        start_time = time.time()
-        input_features = extract_input_features(audio_array, sample_rate)
-        feature_time = time.time() - start_time
-        print(f"  Feature extraction: {feature_time:.3f}s")
-
-        # Test model inference
-        start_time = time.time()
-        predicted_ids = _pt_model.generate(input_features)
-        inference_time = time.time() - start_time
-        print(f"  Model inference: {inference_time:.3f}s")
-
-        # Test decoding
-        start_time = time.time()
-        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
-        decoding_time = time.time() - start_time
-        print(f"  Decoding: {decoding_time:.3f}s")
-
-        # Show result
-        text = (
-            transcription[0].strip() if transcription and len(transcription) > 0 else ""
-        )
-        print(f"  Result: '{text}'" if text else "  Result: (empty)")
-        print(f"  Result length: {len(text)}")
-
-    except Exception as e:
-        print(f"  ERROR: {e}")
-
-
-if __name__ == "__main__":
-    test_transcription_pipeline()