From d69037ff41ec3eb215b595e3affe3660ae276a70 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Mon, 1 Sep 2025 19:43:03 -0700
Subject: [PATCH] Messing with audio

---
 voicebot/synthetic_media.py | 125 ++++++++++++++++++++++++++++--------
 1 file changed, 100 insertions(+), 25 deletions(-)

diff --git a/voicebot/synthetic_media.py b/voicebot/synthetic_media.py
index c157084..e2d5b23 100644
--- a/voicebot/synthetic_media.py
+++ b/voicebot/synthetic_media.py
@@ -362,46 +362,121 @@ class SyntheticAudioTrack(MediaStreamTrack):
         return pts, time_base
 
     async def recv(self):
+        """
+        Generate an audio frame with position-based tone and bounce effects.
+
+        Audio Processing Pipeline:
+        1. Base tone generation (frequency based on ball Y position)
+        2. Bounce effect generation (separate, centered audio)
+        3. Stereo panning (applied to base tone only)
+        4. Volume compensation (based on ball Y position)
+        5. Audio mixing and clipping prevention
+        6. Final conversion to interleaved int16 stereo format
+        """
         pts, time_base = await self.next_timestamp()
 
-        # --- 1. Generate base tone based on ball Y position ---
+        # --- 1. TONE GENERATION: Create base frequency tone based on ball Y position ---
+        # Frequency mapping: top of screen = high pitch (400Hz), bottom = low pitch (200Hz)
         if self.video_track:
-            base_freq = self._get_ball_frequency()
+            base_freq = self._get_ball_frequency()  # 200-400Hz range
         else:
-            base_freq = 440.0  # default if no video track
+            base_freq = 440.0  # default A4 if no video track
 
+        # Generate sine wave at the calculated frequency
         t = (np.arange(self.samples_per_frame) + pts) / self.sample_rate
-        samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
+        base_samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
 
-        # --- 2. Add bounce sound effect if triggered ---
-        if getattr(self, "just_bounced", False):
-            logger.info("Audio: Generating bounce sound effect")
-            tb = np.arange(self.samples_per_frame) / self.sample_rate
-            bounce_freq = 600.0  # Hz
-            bounce_env = np.exp(-tb * 20.0)  # fast exponential decay
-            bounce_wave = 0.4 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
-            samples = samples + bounce_wave.astype(np.float32)
-            self.just_bounced = False
+        # --- 2. BOUNCE EFFECTS: Generate separate bounce sound effects (centered audio) ---
+        # Bounce effects are generated independently to avoid being affected by panning
+        bounce_samples = np.zeros(self.samples_per_frame, dtype=np.float32)
+        current_time_s = self.clock.now()
+        current_sample = int(current_time_s * self.sample_rate)
 
-        # --- 3. Stereo panning based on X position ---
+        for bounce in self._active_bounces:
+            if bounce["start_sample"] <= current_sample < bounce["end_sample"]:
+                # Calculate relative time within this specific bounce event
+                sample_offset = current_sample - bounce["start_sample"]
+                bounce_t = sample_offset / self.sample_rate
+
+                # Generate bounce waveform: 600Hz tone with exponential decay envelope
+                tb = np.arange(self.samples_per_frame) / self.sample_rate + bounce_t
+                bounce_freq = 600.0  # Hz (higher than base tone for clarity)
+                bounce_env = np.exp(
+                    -tb * 20.0
+                )  # Fast exponential decay (20.0 = decay rate)
+                bounce_wave = (
+                    0.8 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
+                )  # 0.8 = bounce amplitude (80% of full scale)
+
+                # Limit bounce duration to prevent runaway effects
+                valid_samples = tb < 0.2  # 200ms maximum bounce duration
+                bounce_wave[~valid_samples] = 0
+
+                # Accumulate bounce effects (multiple bounces can overlap)
+                bounce_samples = bounce_samples + bounce_wave.astype(np.float32)
+
+        # Clean up expired bounce events to prevent memory accumulation
+        self._active_bounces = [
+            bounce
+            for bounce in self._active_bounces
+            if bounce["end_sample"] > current_sample
+        ]
+
+        # --- 3. STEREO PANNING: Apply left/right positioning to base tone only ---
+        # Pan calculation: 0.0 = full left, 0.5 = center, 1.0 = full right
         if self.video_track:
-            pan = self.video_track.ball["x"] / self.video_track.width
+            pan = (
+                self.video_track.ball["x"] / self.video_track.width
+            )  # Normalize to 0-1
         else:
-            pan = 0.5  # center if no video
-        left_gain = math.cos(pan * math.pi / 2)
-        right_gain = math.sin(pan * math.pi / 2)
+            pan = 0.5  # Center positioning if no video track
 
-        # --- 4. Volume scaling based on Y position ---
+        # Equal-power panning: maintains perceived loudness across stereo field
+        left_gain = math.cos(pan * math.pi / 2)  # Left channel gain (1.0 to 0.0)
+        right_gain = math.sin(pan * math.pi / 2)  # Right channel gain (0.0 to 1.0)
+
+        # --- 4. VOLUME COMPENSATION: Apply Y-position based volume scaling ---
+        # Volume scaling compensates for perceptual frequency/amplitude relationship
         if self.video_track:
-            volume = (1.0 - (self.video_track.ball["y"] / self.video_track.height)) ** 2
+            # Quadratic scaling: top = loud (1.0), bottom = quiet (approaching 0.0)
+            # Formula: (1 - normalized_y)² provides smooth, natural volume curve
+            normalized_y = self.video_track.ball["y"] / self.video_track.height
+            volume = (1.0 - normalized_y) ** 2  # Squared for more dramatic effect
         else:
-            volume = 1.0
+            volume = 1.0  # Full volume if no video track
 
-        # --- 5. Apply gain and convert to int16 ---
-        left = (samples * left_gain * volume * 32767).astype(np.int16)
-        right = (samples * right_gain * volume * 32767).astype(np.int16)
+        # --- 5. AUDIO MIXING: Combine panned base tone with centered bounce effects ---
+        # Base tone: Apply stereo panning and volume compensation
+        left_base = base_samples * left_gain * volume
+        right_base = base_samples * right_gain * volume
 
-        # --- 6. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
+        # Final mix: Add bounce effects equally to both channels (no panning)
+        # This keeps bounce effects prominent and centered regardless of ball position
+        left_total = left_base + bounce_samples
+        right_total = right_base + bounce_samples
+
+        # --- 6. CLIPPING PREVENTION: Dynamic normalization with headroom management ---
+        # Check peak amplitude across both channels to detect potential clipping
+        max_left = np.max(np.abs(left_total))
+        max_right = np.max(np.abs(right_total))
+        max_amplitude = max(max_left, max_right)
+
+        # HEADROOM: Maintain 5% safety margin (0.95 threshold) to prevent digital artifacts
+        if max_amplitude > 0.95:  # Threshold chosen to leave headroom for codec/DAC
+            # NORMALIZATION: Scale down entire signal to prevent clipping while preserving dynamics
+            normalization_factor = 0.95 / max_amplitude  # Proportional scaling
+            left_total *= normalization_factor
+            right_total *= normalization_factor
+            logger.debug(
+                f"Audio normalization applied: peak={max_amplitude:.3f}, factor={normalization_factor:.3f}"
+            )
+
+        # FINAL CONVERSION: Convert to int16 with hard clipping as the ultimate safety net
+        # np.clip keeps values inside the symmetric int16 range used here (-32767 to 32767)
+        left = np.clip(left_total * 32767, -32767, 32767).astype(np.int16)
+        right = np.clip(right_total * 32767, -32767, 32767).astype(np.int16)
+
+        # --- 7. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
         # Create interleaved array: [left[0], right[0], left[1], right[1], ...]
         interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)
         interleaved[0::2] = left  # Even indices get left channel
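
The hunk above combines equal-power panning, peak normalization with 5% headroom, and
interleaved s16 conversion. The same math can be sanity-checked outside aiortc with the
minimal standalone sketch below; it assumes only numpy, and the function name
render_stereo_frame, its 48 kHz / 20 ms defaults, and the usage lines are illustrative,
not part of the patch.

# Standalone sketch of the hunk's audio math (illustrative, not from the patch).
import math

import numpy as np


def render_stereo_frame(
    freq_hz: float = 440.0,
    pan: float = 0.5,  # 0.0 = full left, 0.5 = center, 1.0 = full right
    volume: float = 1.0,  # pre-pan gain, 0.0..1.0
    sample_rate: int = 48000,  # assumed rate; the patch reads self.sample_rate
    samples_per_frame: int = 960,  # 20 ms at 48 kHz
    start_sample: int = 0,
) -> np.ndarray:
    """Return one interleaved s16 stereo frame: [L, R, L, R, ...]."""
    t = (np.arange(samples_per_frame) + start_sample) / sample_rate
    mono = np.sin(2 * np.pi * freq_hz * t).astype(np.float32)

    # Equal-power panning: cos/sin gains keep perceived loudness constant,
    # since left_gain**2 + right_gain**2 == 1 for every pan position.
    left = mono * math.cos(pan * math.pi / 2) * volume
    right = mono * math.sin(pan * math.pi / 2) * volume

    # Peak normalization with 5% headroom, mirroring the patch's 0.95 threshold.
    peak = max(np.max(np.abs(left)), np.max(np.abs(right)), 1e-9)
    if peak > 0.95:
        left *= 0.95 / peak
        right *= 0.95 / peak

    # Hard clip to the symmetric +/-32767 range, then interleave channels.
    interleaved = np.empty(samples_per_frame * 2, dtype=np.int16)
    interleaved[0::2] = np.clip(left * 32767, -32767, 32767).astype(np.int16)
    interleaved[1::2] = np.clip(right * 32767, -32767, 32767).astype(np.int16)
    return interleaved


if __name__ == "__main__":
    frame = render_stereo_frame(pan=0.25)  # ball on the left quarter of the screen
    print(frame.shape, frame.dtype)  # (1920,) int16

The equal-power curve is why the patch uses cos/sin rather than linear gains: total
acoustic power stays constant as the ball crosses the screen, instead of dipping by
about 3 dB at center the way a linear crossfade would.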