Improving VAD

James Ketr 2025-09-14 12:49:14 -07:00
parent 51b1ef7bc2
commit 3d5f63aa0a


@@ -17,6 +17,8 @@ from typing import Dict, Optional, Callable, Awaitable, Any, List, Union
from pathlib import Path
import numpy.typing as npt
from pydantic import BaseModel, Field, ConfigDict
from typing import Tuple

# Core dependencies
import librosa
@@ -246,6 +248,185 @@ def setup_intel_arc_environment() -> None:
    logger.info("Intel Arc B580 environment variables configured")
class AdvancedVAD:
    """Advanced Voice Activity Detection with noise rejection."""

    def __init__(self, sample_rate: int = SAMPLE_RATE):
        self.sample_rate = sample_rate

        # More conservative thresholds
        self.energy_threshold = 0.02  # Increased from 0.01
        self.zcr_min = 0.1
        self.zcr_max = 0.5  # Reject very high ZCR (digital noise)
        self.spectral_centroid_min = 300  # Human speech starts around 300Hz
        self.spectral_centroid_max = 3400  # Human speech typically under 3400Hz
        self.spectral_rolloff_threshold = 2000  # Speech energy concentrated lower
        self.minimum_duration = 0.3  # Require 300ms of consistent speech

        # Temporal consistency tracking
        self.speech_history = []
        self.max_history = 10  # Track last 10 frames (1 second at 100ms chunks)

        # Adaptive noise floor
        self.noise_floor_energy = 0.001
        self.noise_floor_centroid = 1000
        self.adaptation_rate = 0.02

    def analyze_frame(self, audio_data: AudioArray) -> Tuple[bool, dict]:
        """Analyze audio frame for speech vs noise."""
        # Basic energy features
        energy = np.sqrt(np.mean(audio_data**2))

        # Zero-crossing rate
        signs = np.sign(audio_data)
        signs[signs == 0] = 1
        zcr = np.sum(np.abs(np.diff(signs))) / (2 * len(audio_data))

        # Spectral features
        spectral_features = self._compute_spectral_features(audio_data)

        # Individual feature checks
        energy_check = energy > max(self.energy_threshold, self.noise_floor_energy * 3)
        zcr_check = self.zcr_min < zcr < self.zcr_max
        spectral_check = (
            self.spectral_centroid_min < spectral_features['centroid'] < self.spectral_centroid_max
            and spectral_features['rolloff'] < self.spectral_rolloff_threshold
            and spectral_features['flux'] > 0.01  # Spectral change indicates speech
        )

        # Harmonicity check (speech has harmonic structure)
        harmonic_check = spectral_features['harmonicity'] > 0.3

        # Combined decision with temporal consistency
        frame_has_speech = (
            energy_check and
            zcr_check and
            spectral_check and
            harmonic_check
        )

        # Update history
        self.speech_history.append(frame_has_speech)
        if len(self.speech_history) > self.max_history:
            self.speech_history.pop(0)

        # Require temporal consistency (at least 3 of last 5 frames)
        recent_speech = sum(self.speech_history[-5:]) >= 3 if len(self.speech_history) >= 5 else frame_has_speech

        # Update noise floor during silence
        if not frame_has_speech:
            self.noise_floor_energy = (
                self.adaptation_rate * energy +
                (1 - self.adaptation_rate) * self.noise_floor_energy
            )
            self.noise_floor_centroid = (
                self.adaptation_rate * spectral_features['centroid'] +
                (1 - self.adaptation_rate) * self.noise_floor_centroid
            )

        metrics = {
            'energy': energy,
            'zcr': zcr,
            'centroid': spectral_features['centroid'],
            'rolloff': spectral_features['rolloff'],
            'flux': spectral_features['flux'],
            'harmonicity': spectral_features['harmonicity'],
            'noise_floor_energy': self.noise_floor_energy,
            'energy_check': energy_check,
            'zcr_check': zcr_check,
            'spectral_check': spectral_check,
            'harmonic_check': harmonic_check,
            'temporal_consistency': recent_speech
        }

        return recent_speech, metrics

    def _compute_spectral_features(self, audio_data: AudioArray) -> dict:
        """Compute spectral features for speech detection."""
        # Apply window to reduce spectral leakage
        windowed = audio_data * np.hanning(len(audio_data))

        # FFT
        fft_data = np.fft.rfft(windowed)
        magnitude = np.abs(fft_data)
        freqs = np.fft.rfftfreq(len(windowed), 1/self.sample_rate)

        # Avoid division by zero
        if np.sum(magnitude) == 0:
            return {
                'centroid': 0, 'rolloff': 0, 'flux': 0, 'harmonicity': 0
            }

        # Spectral centroid
        centroid = np.sum(freqs * magnitude) / np.sum(magnitude)

        # Spectral rolloff (frequency below which 85% of energy is contained)
        cumsum = np.cumsum(magnitude)
        rolloff_point = 0.85 * cumsum[-1]
        rolloff_idx = np.where(cumsum >= rolloff_point)[0]
        rolloff = freqs[rolloff_idx[0]] if len(rolloff_idx) > 0 else freqs[-1]

        # Spectral flux (measure of spectral change)
        if hasattr(self, '_prev_magnitude'):
            flux = np.sum((magnitude - self._prev_magnitude) ** 2)
        else:
            flux = 0
        self._prev_magnitude = magnitude.copy()

        # Harmonicity (detect harmonic structure typical of speech)
        harmonicity = self._compute_harmonicity(magnitude, freqs)

        return {
            'centroid': centroid,
            'rolloff': rolloff,
            'flux': flux,
            'harmonicity': harmonicity
        }

    def _compute_harmonicity(self, magnitude: np.ndarray, freqs: np.ndarray) -> float:
        """Compute harmonicity score (0-1, higher = more harmonic/speech-like)."""
        # Find fundamental frequency candidate (peak in 80-400Hz range for speech)
        speech_range = (freqs >= 80) & (freqs <= 400)
        if not np.any(speech_range):
            return 0.0

        speech_magnitude = magnitude[speech_range]
        speech_freqs = freqs[speech_range]

        if len(speech_magnitude) == 0:
            return 0.0

        # Find strongest peak in speech range
        f0_idx = np.argmax(speech_magnitude)
        f0 = speech_freqs[f0_idx]
        f0_strength = speech_magnitude[f0_idx]

        if f0_strength < np.max(magnitude) * 0.1:  # F0 should be reasonably strong
            return 0.0

        # Check for harmonics (2*f0, 3*f0, etc.)
        harmonic_strength = 0.0
        total_harmonics = 0

        for harmonic in range(2, 6):  # Check 2nd through 5th harmonics
            harmonic_freq = f0 * harmonic
            if harmonic_freq > freqs[-1]:
                break

            # Find closest frequency bin
            harmonic_idx = np.argmin(np.abs(freqs - harmonic_freq))
            harmonic_strength += magnitude[harmonic_idx]
            total_harmonics += 1

        if total_harmonics == 0:
            return 0.0

        # Normalize by fundamental strength and number of harmonics
        harmonicity = (harmonic_strength / total_harmonics) / f0_strength

        return min(harmonicity, 1.0)
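
For illustration (not part of this commit), a minimal sketch of how AdvancedVAD is meant to be driven: stream a recording through analyze_frame in roughly 100 ms chunks and print the decision together with the metrics that feed the waveform overlay. The file name "sample.wav" and the 16 kHz rate are assumptions; in the application the chunks come from the WebRTC track at SAMPLE_RATE.

import librosa
import numpy as np

SR = 16000                 # assumed; the module uses its SAMPLE_RATE constant
CHUNK = SR // 10           # ~100 ms frames, matching the "100ms chunks" comment above

audio, _ = librosa.load("sample.wav", sr=SR, mono=True)   # hypothetical input file
vad = AdvancedVAD(sample_rate=SR)

for start in range(0, len(audio) - CHUNK, CHUNK):
    frame = audio[start:start + CHUNK].astype(np.float32)
    is_speech, m = vad.analyze_frame(frame)
    print(f"{start / SR:6.2f}s  speech={is_speech}  energy={m['energy']:.3f}  "
          f"zcr={m['zcr']:.3f}  harmonicity={m['harmonicity']:.2f}  "
          f"noise_floor={m['noise_floor_energy']:.4f}")
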
class OpenVINOWhisperModel:
    """OpenVINO optimized Whisper model for Intel Arc B580."""
@@ -758,34 +939,39 @@ def extract_input_features(audio_array: AudioArray, sampling_rate: int) -> torch
    )
    return inputs.input_features


class VoiceActivityDetector(BaseModel):
    has_speech: bool = False
    energy: float = 0.0
    zcr: float = 0.0
    centroid: float = 0.0


def simple_robust_vad(
    audio_data: AudioArray,
    energy_threshold: float = 0.01,
    sample_rate: int = SAMPLE_RATE,
) -> VoiceActivityDetector:
    """Simplified robust VAD."""
    # Energy-based detection (RMS)
    energy = np.sqrt(np.mean(audio_data**2))

    # Zero-crossing rate
    signs = np.sign(audio_data)
    signs[signs == 0] = 1
    zcr = np.sum(np.abs(np.diff(signs))) / (2 * len(audio_data))

    # Relaxed speech detection - use OR instead of AND for some conditions
    has_speech = (
-        energy > energy_threshold or  # Primary condition
-        (energy > energy_threshold * 0.5 and zcr > 0.05)  # Secondary condition
+        energy > energy_threshold  # Primary condition
+        or (energy > energy_threshold * 0.5 and zcr > 0.05)  # Secondary condition
    )
-    return VoiceActivityDetector(has_speech=has_speech, energy=energy, zcr=zcr, centroid=0.0)
+    return VoiceActivityDetector(
+        has_speech=has_speech, energy=energy, zcr=zcr, centroid=0.0
+    )


def enhanced_vad(
    audio_data: AudioArray,
@@ -823,7 +1009,9 @@ def enhanced_vad(
        and 200 < centroid < 3000  # Human speech frequency range
    )
-    return VoiceActivityDetector(has_speech=has_speech, energy=energy, zcr=zcr, centroid=centroid)
+    return VoiceActivityDetector(
+        has_speech=has_speech, energy=energy, zcr=zcr, centroid=centroid
+    )
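
For illustration (not part of this commit), a quick numeric check of the shared RMS-energy feature: under an assumed 16 kHz rate, a 100 ms frame of a 0.1-amplitude sine has an RMS of roughly 0.071 and clears the default 0.01 threshold, while near-silent noise fails both energy conditions; enhanced_vad additionally gates on ZCR and spectral centroid. The synthetic signals below are assumptions for the example only.

import numpy as np

SR = 16000                          # assumed sample rate
t = np.arange(SR // 10) / SR        # one ~100 ms frame

tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)   # RMS ~ 0.071
quiet = (0.001 * np.random.randn(t.size)).astype(np.float32)    # RMS ~ 0.001

print(simple_robust_vad(tone).has_speech)   # True: primary energy condition met
print(simple_robust_vad(quiet).has_speech)  # False: below both energy conditions
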
class OptimizedAudioProcessor:
@@ -849,24 +1037,13 @@ class OptimizedAudioProcessor:
        self.write_ptr = 0
        self.read_ptr = 0

-        # Enhanced VAD parameters with EMA for noise adaptation
-        self.vad_energy_threshold: float = VAD_THRESHOLD
-        self.vad_zcr_threshold: float = 0.1
-        self.noise_energy_ema: float = 0.001  # Initial noise estimate
-        self.noise_zcr_ema: float = 0.05
-        self.ema_alpha: float = 0.05  # Adaptation rate
-        self.energy_multiplier: float = 3.0  # Threshold = noise_ema * multiplier
-        self.zcr_multiplier: float = 2.0
-        self.min_energy_threshold: float = 0.005
-        self.min_zcr_threshold: float = 0.05
+        # Silence handling parameters
        self.silence_frames: int = 0
        self.max_silence_frames: int = MAX_SILENCE_FRAMES
        self.max_trailing_silence_frames: int = MAX_TRAILING_SILENCE_FRAMES

-        # VAD metrics tracking for adaptive thresholds
-        self.vad_metrics_history: list[VoiceActivityDetector] = []
-        self.adaptive_threshold_enabled: bool = True
+        # Enhanced VAD parameters with EMA for noise adaptation
+        self.advanced_vad = AdvancedVAD(sample_rate=self.sample_rate)

        # Processing state
        self.current_phrase_audio = np.array([], dtype=np.float32)
@@ -899,99 +1076,41 @@ class OptimizedAudioProcessor:
            logger.error("Processor not running or empty audio data")
            return

-        vad_metrics = simple_robust_vad(
-            audio_data,
-            energy_threshold=0.01,  # self.vad_energy_threshold,
-            sample_rate=self.sample_rate,
-        )
-        # Use enhanced VAD
-        # vad_metrics = enhanced_vad(
-        #     audio_data,
-        #     energy_threshold=self.vad_energy_threshold,
-        #     zcr_threshold=self.vad_zcr_threshold,
-        #     sample_rate=self.sample_rate,
-        # )
+        is_speech, vad_metrics = self.advanced_vad.analyze_frame(audio_data)

-        # Update noise estimates if no speech
-        if not vad_metrics.has_speech:
-            self.noise_energy_ema = (
-                self.ema_alpha * vad_metrics.energy
-                + (1 - self.ema_alpha) * self.noise_energy_ema
-            )
-            # self.noise_zcr_ema = (
-            #     self.ema_alpha * vad_metrics.zcr
-            #     + (1 - self.ema_alpha) * self.noise_zcr_ema
-            # )
-            # Adapt thresholds
-            self.vad_energy_threshold = max(
-                # self.noise_energy_ema * self.energy_multiplier, self.min_energy_threshold
-                self.noise_energy_ema * 2.0, 0.005
-            )
-            # self.vad_zcr_threshold = max(
-            #     self.noise_zcr_ema * self.zcr_multiplier, self.min_zcr_threshold
-            # )
-
-        # Store metrics for additional tracking
-        self.vad_metrics_history.append(vad_metrics)
-        if len(self.vad_metrics_history) > 100:
-            self.vad_metrics_history.pop(0)
-
-        # Log detailed VAD decision occasionally for debugging
-        if len(self.vad_metrics_history) % 50 == 0:
-            logger.debug(
-                f"VAD metrics for {self.peer_name}: "
-                f"energy={vad_metrics.energy:.4f}, "
-                f"zcr={vad_metrics.zcr:.4f}, "
-                f"centroid={vad_metrics.centroid:.1f}Hz, "
-                f"speech={vad_metrics.has_speech}, "
-                f"noise_energy_ema={self.noise_energy_ema:.4f}, "
-                f"threshold={self.vad_energy_threshold:.4f}"
-            )
-
-        # Decision logic to avoid leading silence and limit trailing silence
-        if vad_metrics.has_speech:
-            logger.info(f"Speech detected for {self.peer_name}: {vad_metrics} (current phrase length: {len(self.current_phrase_audio) / self.sample_rate})")
+        # Update visualization status
+        WaveformVideoTrack.speech_status[self.peer_name] = {
+            'is_speech': is_speech,
+            **vad_metrics
+        }
+
+        # Log VAD decisions periodically
+        if hasattr(self, '_vad_log_counter'):
+            self._vad_log_counter += 1
+        else:
+            self._vad_log_counter = 0
+
+        if self._vad_log_counter % 50 == 0:  # Every 5 seconds at 100ms chunks
+            logger.info(f"VAD Decision for {self.peer_name}: {is_speech}, "
+                        f"Energy: {vad_metrics['energy']:.3f}, "
+                        f"Harmonicity: {vad_metrics['harmonicity']:.2f}, "
+                        f"Noise floor: {vad_metrics['noise_floor_energy']:.4f}")
+
+        # Speech processing logic
+        if is_speech:
            self.silence_frames = 0
            self.last_activity_time = time.time()
            self._add_to_circular_buffer(audio_data)
-            # Update visualization buffer
-            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
-            # Limit buffer size to last 10 seconds
-            max_samples = SAMPLE_RATE * 10
-            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
-                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
-        elif (
-            len(self.current_phrase_audio) > 0
-            and self.silence_frames < self.max_trailing_silence_frames
-        ):
-            logger.info(f"Trailing silence accepted for {self.peer_name}")
+        elif (len(self.current_phrase_audio) > 0 and
+              self.silence_frames < self.max_trailing_silence_frames):
            self.silence_frames += 1
            self._add_to_circular_buffer(audio_data)
-            # Update visualization buffer
-            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
-            # Limit buffer size to last 10 seconds
-            max_samples = SAMPLE_RATE * 10
-            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
-                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
        else:
-            if (self.silence_frames % 10 == 0) and (self.silence_frames > 0):
-                logger.info(
-                    f"VAD metrics for {self.peer_name}: "
-                    f"energy={vad_metrics.energy:.4f}, "
-                    f"zcr={vad_metrics.zcr:.4f}, "
-                    f"centroid={vad_metrics.centroid:.1f}Hz, "
-                    f"speech={vad_metrics.has_speech}, "
-                    f"noise_energy_ema={self.noise_energy_ema:.4f}, "
-                    f"threshold={self.vad_energy_threshold:.4f}"
-                )
            self.silence_frames += 1
-            if (
-                self.silence_frames > self.max_silence_frames
-                and len(self.current_phrase_audio) > 0
-            ):
+            if (self.silence_frames > self.max_silence_frames and
+                len(self.current_phrase_audio) > 0):
                self._queue_final_transcription()
-            return  # Drop pure silence chunks (leading or excessive trailing)
+            return  # Drop non-speech audio

        # Check if we should process
        if self._available_samples() >= self.chunk_size:
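
For illustration (not part of this commit): the noise floor that feeds the energy check above adapts only on frames classified as non-speech, with adaptation_rate = 0.02 per frame. Assuming the ~100 ms chunk cadence mentioned in the class comments, a rough convergence estimate:

import math

# The EMA reaches ~95% of a changed noise level after log(0.05)/log(1 - 0.02) updates.
frames_to_adapt = math.log(0.05) / math.log(1 - 0.02)   # ~148 frames
print(f"~{frames_to_adapt:.0f} non-speech frames ~ {frames_to_adapt * 0.1:.1f} s to track a new noise floor")
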
@@ -1110,7 +1229,9 @@ class OptimizedAudioProcessor:
                    len(self.current_phrase_audio) > 0
                    and time.time() - self.last_activity_time > 2.0
                ):
-                    logger.info(f"Final transcription timeout for {self.peer_name} (asyncio.TimeoutError)")
+                    logger.info(
+                        f"Final transcription timeout for {self.peer_name} (asyncio.TimeoutError)"
+                    )
                    await self._transcribe_and_send(
                        self.current_phrase_audio.copy(), is_final=True
                    )
@@ -1141,7 +1262,9 @@ class OptimizedAudioProcessor:
                if phrase_duration >= 1.0:
                    if self.main_loop:
-                        logger.info(f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)")
+                        logger.info(
+                            f"Transcribing from thread for {self.peer_name} (_thread_processing_loop > 1s interval)"
+                        )
                        asyncio.run_coroutine_threadsafe(
                            self._transcribe_and_send(
                                self.current_phrase_audio.copy(), is_final=False
@@ -1156,7 +1279,9 @@ class OptimizedAudioProcessor:
                    and time.time() - self.last_activity_time > 2.0
                ):
                    if self.main_loop:
-                        logger.info(f"Final transcription from thread for {self.peer_name}")
+                        logger.info(
+                            f"Final transcription from thread for {self.peer_name}"
+                        )
                        asyncio.run_coroutine_threadsafe(
                            self._transcribe_and_send(
                                self.current_phrase_audio.copy(), is_final=True
@@ -1397,6 +1522,7 @@ class WaveformVideoTrack(MediaStreamTrack):
    # Shared buffer for audio data
    buffer: Dict[str, npt.NDArray[np.float32]] = {}
    speech_status: Dict[str, dict] = {}

    def __init__(
        self, session_name: str, width: int = 640, height: int = 480, fps: int = 15
@@ -1472,13 +1598,14 @@ class WaveformVideoTrack(MediaStreamTrack):
            2,
        )

-        # Select the most active audio buffer (highest RMS) and draw its waveform
+        # Select the most active audio buffer and get its speech status
        best_proc = None
        best_rms = 0.0
+        speech_info = None
        try:
            for pname, arr in self.__class__.buffer.items():
                try:
-                    # Allow empty arrays to be selected
                    if len(arr) == 0:
                        rms = 0.0
                    else:
@@ -1486,6 +1613,7 @@ class WaveformVideoTrack(MediaStreamTrack):
                    if rms > best_rms:
                        best_rms = rms
                        best_proc = (pname, arr.copy())
+                        speech_info = self.__class__.speech_status.get(pname, {})
                except Exception:
                    continue
        except Exception:
@@ -1502,7 +1630,9 @@ class WaveformVideoTrack(MediaStreamTrack):
                arr_segment = arr[-samples_needed:].copy()
            else:
                # Pad with zeros at the beginning
-                arr_segment = np.concatenate([np.zeros(samples_needed - len(arr), dtype=np.float32), arr])
+                arr_segment = np.concatenate(
+                    [np.zeros(samples_needed - len(arr), dtype=np.float32), arr]
+                )

            # Assume arr_segment is already in [-1, 1]
            norm = arr_segment
@@ -1523,22 +1653,34 @@ class WaveformVideoTrack(MediaStreamTrack):
                dtype=np.float32,
            )

-            # Create polyline points, avoid NaN
+            # Draw waveform with color coding for speech detection
            points: list[tuple[int, int]] = []
+            colors: list[tuple[int, int, int]] = []  # Color for each point
            for x in range(self.width):
                v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0
-                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 120)) + 100
                points.append((x, y))

+                # Color based on speech detection status
+                is_speech = speech_info.get('is_speech', False) if speech_info else False
+                energy_check = speech_info.get('energy_check', False) if speech_info else False
+                if is_speech:
+                    colors.append((0, 255, 0))  # Green for detected speech
+                elif energy_check:
+                    colors.append((255, 255, 0))  # Yellow for energy but not speech
+                else:
+                    colors.append((128, 128, 128))  # Gray for background noise
+
+            # Draw colored waveform
            if len(points) > 1:
-                pts_np = np.array(points, dtype=np.int32)
-                cv2.polylines(
-                    frame_array,
-                    [pts_np],
-                    isClosed=False,
-                    color=(255, 255, 255),
-                    thickness=4,
-                )
+                for i in range(len(points) - 1):
+                    cv2.line(frame_array, points[i], points[i+1], colors[i], 2)
+
+            # Add speech detection status overlay
+            if speech_info:
+                self._draw_speech_status(frame_array, speech_info, pname)

            cv2.putText(
                frame_array,
@@ -1549,47 +1691,51 @@ class WaveformVideoTrack(MediaStreamTrack):
                (255, 255, 255),
                2,
            )
-        else:
-            # Draw a test sine wave to verify drawing works
-            import math
-            test_points: list[tuple[int, int]] = []
-            for x in range(self.width):
-                # Create a sine wave with some amplitude
-                v = 0.5 * math.sin(2 * math.pi * x / self.width * 4)  # 4 cycles across width
-                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
-                test_points.append((x, y))
-
-            if len(test_points) > 1:
-                pts_np = np.array(test_points, dtype=np.int32)
-                cv2.polylines(
-                    frame_array,
-                    [pts_np],
-                    isClosed=False,
-                    color=(255, 255, 255),
-                    thickness=4,
-                )
-
-            # Show buffer status
-            buffer_keys = len(self.__class__.buffer)
-            total_samples = sum(len(arr) for arr in self.__class__.buffer.values())
-            cv2.putText(
-                frame_array,
-                f"Test waveform - Buffer: {buffer_keys} keys, {total_samples} samples",
-                (10, 400),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.8,
-                (255, 255, 255),
-                2,
-            )

        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")
        frame.pts = pts
        frame.time_base = fractions.Fraction(1 / 90000).limit_denominator(1000000)
        return frame
    def _draw_speech_status(self, frame_array: np.ndarray, speech_info: dict, pname: str):
        """Draw speech detection status information."""
        y_offset = 100
        line_height = 25

        # Main status
        is_speech = speech_info.get('is_speech', False)
        status_text = "🎤 SPEECH" if is_speech else "🔇 NOISE"
        status_color = (0, 255, 0) if is_speech else (128, 128, 128)
        cv2.putText(frame_array, f"{pname}: {status_text}",
                    (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.8, status_color, 2)

        # Detailed metrics (smaller text)
        metrics = [
            f"Energy: {speech_info.get('energy', 0):.3f} ({'✓' if speech_info.get('energy_check', False) else '✗'})",
            f"ZCR: {speech_info.get('zcr', 0):.3f} ({'✓' if speech_info.get('zcr_check', False) else '✗'})",
            f"Spectral: ({'✓' if speech_info.get('spectral_check', False) else '✗'})",
            f"Harmonic: ({'✓' if speech_info.get('harmonic_check', False) else '✗'})",
            f"Temporal: ({'✓' if speech_info.get('temporal_consistency', False) else '✗'})"
        ]
        for i, metric in enumerate(metrics):
            cv2.putText(frame_array, metric,
                        (320, y_offset + i * 15), cv2.FONT_HERSHEY_SIMPLEX, 0.4,
                        (255, 255, 255), 1)

        # Noise floor indicator
        noise_floor = speech_info.get('noise_floor_energy', 0)
        cv2.putText(frame_array, f"Noise Floor: {noise_floor:.4f}",
                    (10, y_offset + 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                    (200, 200, 200), 1)
async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
    """Handle incoming audio tracks from WebRTC peers."""
    logger.info(
        f"handle_track_received called for {peer.peer_name} with track kind: {track.kind}"
    )
    global _audio_processors, _send_chat_func

    if track.kind != "audio":
@@ -1600,6 +1746,32 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
    if peer.peer_name not in WaveformVideoTrack.buffer:
        WaveformVideoTrack.buffer[peer.peer_name] = np.array([], dtype=np.float32)

    # Start background task to load model and create processor
    async def init_processor():
        global _model_loading_status, _model_loading_progress

        # Load model asynchronously to avoid blocking frame reading
        _model_loading_status = "Initializing model loading..."
        _model_loading_progress = 0.0

        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, _ensure_model_loaded)

        _model_loading_status = "Model loaded, creating processor..."
        _model_loading_progress = 0.8

        logger.info(f"Creating OptimizedAudioProcessor for {peer.peer_name}")
        if _send_chat_func is None:
            logger.error(f"No send_chat_func available for {peer.peer_name}")
            _model_loading_status = "Error: No send function available"
            return

        _audio_processors[peer.peer_name] = OptimizedAudioProcessor(
            peer_name=peer.peer_name, send_chat_func=_send_chat_func
        )

        _model_loading_status = "Ready for transcription"
        _model_loading_progress = 1.0
        logger.info(f"OptimizedAudioProcessor ready for {peer.peer_name}")

    if peer.peer_name not in _audio_processors:
        if _send_chat_func is None:
            logger.error(
@@ -1607,82 +1779,63 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
            )
            return

-        # Start background task to load model and create processor
-        async def init_processor():
-            global _model_loading_status, _model_loading_progress
-
-            # Load model asynchronously to avoid blocking frame reading
-            _model_loading_status = "Initializing model loading..."
-            _model_loading_progress = 0.0
-
-            loop = asyncio.get_event_loop()
-            await loop.run_in_executor(None, _ensure_model_loaded)
-
-            _model_loading_status = "Model loaded, creating processor..."
-            _model_loading_progress = 0.8
-
-            logger.info(f"Creating OptimizedAudioProcessor for {peer.peer_name}")
-            if _send_chat_func is None:
-                logger.error(f"No send_chat_func available for {peer.peer_name}")
-                _model_loading_status = "Error: No send function available"
-                return
-
-            _audio_processors[peer.peer_name] = OptimizedAudioProcessor(
-                peer_name=peer.peer_name, send_chat_func=_send_chat_func
-            )
-            audio_processor = _audio_processors[peer.peer_name]
-            # asyncio.create_task(calibrate_vad(audio_processor))
-
-            _model_loading_status = "Ready for transcription"
-            _model_loading_progress = 1.0
-
-            logger.info(f"Starting OpenVINO audio processing for {peer.peer_name}")
-
-            # Now start processing frames
-            frame_count = 0
-            while True:
-                try:
-                    frame = await track.recv()
-                    frame_count += 1
-                    if frame_count % 100 == 0:
-                        logger.debug(
-                            f"Received {frame_count} frames from {peer.peer_name}"
-                        )
-                except MediaStreamError as e:
-                    logger.info(f"Audio stream ended for {peer.peer_name}: {e}")
-                    break
-                except Exception as e:
-                    logger.error(f"Error receiving frame from {peer.peer_name}: {e}")
-                    break
-
-                if isinstance(frame, AudioFrame):
-                    try:
-                        # Convert frame to numpy array
-                        audio_data = frame.to_ndarray()
-
-                        # Handle audio format conversion
-                        audio_data = _process_audio_frame(audio_data, frame)
-
-                        # Resample if needed
-                        if frame.sample_rate != SAMPLE_RATE:
-                            audio_data = _resample_audio(
-                                audio_data, frame.sample_rate, SAMPLE_RATE
-                            )
-
-                        # Convert to float32
-                        audio_data_float32 = audio_data.astype(np.float32)
-
-                        # Process with optimized processor
-                        audio_processor.add_audio_data(audio_data_float32)
-
-                    except Exception as e:
-                        logger.error(
-                            f"Error processing audio frame for {peer.peer_name}: {e}"
-                        )
-                        continue
-
+        # Start the processor initialization in background
        asyncio.create_task(init_processor())
-        return  # Exit early, processor is handled in the background
+
+        # Start processing frames immediately (before processor is ready)
+        logger.info(f"Starting frame processing loop for {peer.peer_name}")
+        frame_count = 0
+        while True:
+            try:
+                frame = await track.recv()
+                frame_count += 1
+                if frame_count % 100 == 0:
+                    logger.debug(f"Received {frame_count} frames from {peer.peer_name}")
+            except MediaStreamError as e:
+                logger.info(f"Audio stream ended for {peer.peer_name}: {e}")
+                break
+            except Exception as e:
+                logger.error(f"Error receiving frame from {peer.peer_name}: {e}")
+                break
+
+            if isinstance(frame, AudioFrame):
+                try:
+                    # Convert frame to numpy array
+                    audio_data = frame.to_ndarray()
+
+                    # Handle audio format conversion
+                    audio_data = _process_audio_frame(audio_data, frame)
+
+                    # Resample if needed
+                    if frame.sample_rate != SAMPLE_RATE:
+                        audio_data = _resample_audio(
+                            audio_data, frame.sample_rate, SAMPLE_RATE
+                        )
+
+                    # Convert to float32
+                    audio_data_float32 = audio_data.astype(np.float32)
+
+                    # Update visualization buffer immediately
+                    WaveformVideoTrack.buffer[peer.peer_name] = np.concatenate(
+                        [WaveformVideoTrack.buffer[peer.peer_name], audio_data_float32]
+                    )
+                    # Limit buffer size to last 10 seconds
+                    max_samples = SAMPLE_RATE * 10
+                    if len(WaveformVideoTrack.buffer[peer.peer_name]) > max_samples:
+                        WaveformVideoTrack.buffer[peer.peer_name] = (
+                            WaveformVideoTrack.buffer[peer.peer_name][-max_samples:]
+                        )
+
+                    # Process with optimized processor if available
+                    if peer.peer_name in _audio_processors:
+                        audio_processor = _audio_processors[peer.peer_name]
+                        audio_processor.add_audio_data(audio_data_float32)
+
+                except Exception as e:
+                    logger.error(f"Error processing audio frame for {peer.peer_name}: {e}")
+                    continue

    # If processor already exists, just continue processing
    audio_processor = _audio_processors[peer.peer_name]
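
For illustration (not part of this commit): the resampling step above calls a _resample_audio helper whose body is outside this diff. A plausible minimal stand-in, assuming librosa (already imported by this module) is acceptable for the conversion:

import librosa
import numpy as np

def _resample_audio_sketch(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    # Hypothetical stand-in; the real _resample_audio is not shown in this diff.
    if orig_sr == target_sr:
        return audio
    # librosa.resample expects floating-point input; the caller casts to float32 afterwards anyway.
    return librosa.resample(audio.astype(np.float32), orig_sr=orig_sr, target_sr=target_sr)
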
@@ -1726,7 +1879,9 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
                # Convert to float32
                audio_data_float32 = audio_data.astype(np.float32)
-                logger.debug(f"Processed audio frame {frame_count} from {peer.peer_name}: {len(audio_data_float32)} samples")
+                logger.debug(
+                    f"Processed audio frame {frame_count} from {peer.peer_name}: {len(audio_data_float32)} samples"
+                )

                # Process with optimized processor if available
                audio_processor.add_audio_data(audio_data_float32)