From d69037ff41ec3eb215b595e3affe3660ae276a70 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Mon, 1 Sep 2025 19:43:03 -0700
Subject: [PATCH] Messing with audio

---
 voicebot/synthetic_media.py | 125 ++++++++++++++++++++++++++++--------
 1 file changed, 100 insertions(+), 25 deletions(-)

diff --git a/voicebot/synthetic_media.py b/voicebot/synthetic_media.py
index c157084..e2d5b23 100644
--- a/voicebot/synthetic_media.py
+++ b/voicebot/synthetic_media.py
@@ -362,46 +362,121 @@ class SyntheticAudioTrack(MediaStreamTrack):
         return pts, time_base
 
     async def recv(self):
+        """
+        Generate an audio frame with position-based tone and bounce effects.
+
+        Audio Processing Pipeline:
+        1. Base tone generation (frequency based on ball Y position)
+        2. Bounce effect generation (separate, centered audio)
+        3. Stereo panning (applied to base tone only)
+        4. Volume compensation (based on ball Y position)
+        5. Audio mixing and clipping prevention
+        6. Final conversion to interleaved int16 stereo format
+        """
         pts, time_base = await self.next_timestamp()
 
-        # --- 1. Generate base tone based on ball Y position ---
+        # --- 1. TONE GENERATION: Create base frequency tone based on ball Y position ---
+        # Frequency mapping: top of screen = high pitch (400Hz), bottom = low pitch (200Hz)
         if self.video_track:
-            base_freq = self._get_ball_frequency()
+            base_freq = self._get_ball_frequency()  # 200-400Hz range
         else:
-            base_freq = 440.0  # default if no video track
+            base_freq = 440.0  # default A4 if no video track
 
+        # Generate sine wave at the calculated frequency
         t = (np.arange(self.samples_per_frame) + pts) / self.sample_rate
-        samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
+        base_samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
 
-        # --- 2. Add bounce sound effect if triggered ---
-        if getattr(self, "just_bounced", False):
-            logger.info("Audio: Generating bounce sound effect")
-            tb = np.arange(self.samples_per_frame) / self.sample_rate
-            bounce_freq = 600.0  # Hz
-            bounce_env = np.exp(-tb * 20.0)  # fast exponential decay
-            bounce_wave = 0.4 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
-            samples = samples + bounce_wave.astype(np.float32)
-            self.just_bounced = False
+        # --- 2. BOUNCE EFFECTS: Generate separate bounce sound effects (centered audio) ---
+        # Bounce effects are generated independently to avoid being affected by panning
+        bounce_samples = np.zeros(self.samples_per_frame, dtype=np.float32)
+        current_time_s = self.clock.now()
+        current_sample = int(current_time_s * self.sample_rate)
 
-        # --- 3. Stereo panning based on X position ---
+        for bounce in self._active_bounces:
+            if bounce["start_sample"] <= current_sample < bounce["end_sample"]:
+                # Calculate relative time within this specific bounce event
+                sample_offset = current_sample - bounce["start_sample"]
+                bounce_t = sample_offset / self.sample_rate
+
+                # Generate bounce waveform: 600Hz tone with exponential decay envelope
+                tb = np.arange(self.samples_per_frame) / self.sample_rate + bounce_t
+                bounce_freq = 600.0  # Hz (higher than base tone for clarity)
+                bounce_env = np.exp(
+                    -tb * 20.0
+                )  # Fast exponential decay (20.0 = decay rate)
+                bounce_wave = (
+                    0.8 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
+                )  # 0.8 = bounce amplitude (80% of full scale)
+
+                # Limit bounce duration to prevent runaway effects
+                valid_samples = tb < 0.2  # 200ms maximum bounce duration
+                bounce_wave[~valid_samples] = 0
+
+                # Accumulate bounce effects (multiple bounces can overlap)
+                bounce_samples = bounce_samples + bounce_wave.astype(np.float32)
+
+        # Clean up expired bounce events to prevent memory accumulation
+        self._active_bounces = [
+            bounce
+            for bounce in self._active_bounces
+            if bounce["end_sample"] > current_sample
+        ]
+
+        # --- 3. STEREO PANNING: Apply left/right positioning to base tone only ---
+        # Pan calculation: 0.0 = full left, 0.5 = center, 1.0 = full right
         if self.video_track:
-            pan = self.video_track.ball["x"] / self.video_track.width
+            pan = (
+                self.video_track.ball["x"] / self.video_track.width
+            )  # Normalize to 0-1
         else:
-            pan = 0.5  # center if no video
-        left_gain = math.cos(pan * math.pi / 2)
-        right_gain = math.sin(pan * math.pi / 2)
+            pan = 0.5  # Center positioning if no video track
 
-        # --- 4. Volume scaling based on Y position ---
+        # Equal-power panning: maintains perceived loudness across stereo field
+        left_gain = math.cos(pan * math.pi / 2)  # Left channel gain (1.0 to 0.0)
+        right_gain = math.sin(pan * math.pi / 2)  # Right channel gain (0.0 to 1.0)
+
+        # --- 4. VOLUME COMPENSATION: Apply Y-position based volume scaling ---
+        # Volume scaling compensates for perceptual frequency/amplitude relationship
         if self.video_track:
-            volume = (1.0 - (self.video_track.ball["y"] / self.video_track.height)) ** 2
+            # Quadratic scaling: top = loud (1.0), bottom = quiet (approaching 0.0)
+            # Formula: (1 - normalized_y)² provides smooth, natural volume curve
+            normalized_y = self.video_track.ball["y"] / self.video_track.height
+            volume = (1.0 - normalized_y) ** 2  # Squared for more dramatic effect
         else:
-            volume = 1.0
+            volume = 1.0  # Full volume if no video track
 
-        # --- 5. Apply gain and convert to int16 ---
-        left = (samples * left_gain * volume * 32767).astype(np.int16)
-        right = (samples * right_gain * volume * 32767).astype(np.int16)
+        # --- 5. AUDIO MIXING: Combine panned base tone with centered bounce effects ---
+        # Base tone: Apply stereo panning and volume compensation
+        left_base = base_samples * left_gain * volume
+        right_base = base_samples * right_gain * volume
 
-        # --- 6. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
+        # Final mix: Add bounce effects equally to both channels (no panning)
+        # This keeps bounce effects prominent and centered regardless of ball position
+        left_total = left_base + bounce_samples
+        right_total = right_base + bounce_samples
+
+        # --- 6. CLIPPING PREVENTION: Dynamic normalization with headroom management ---
+        # Check peak amplitude across both channels to detect potential clipping
+        max_left = np.max(np.abs(left_total))
+        max_right = np.max(np.abs(right_total))
+        max_amplitude = max(max_left, max_right)
+
+        # HEADROOM: Maintain 5% safety margin (0.95 threshold) to prevent digital artifacts
+        if max_amplitude > 0.95:  # Threshold chosen to leave headroom for codec/DAC
+            # NORMALIZATION: Scale down entire signal to prevent clipping while preserving dynamics
+            normalization_factor = 0.95 / max_amplitude  # Proportional scaling
+            left_total *= normalization_factor
+            right_total *= normalization_factor
+            logger.debug(
+                f"Audio normalization applied: peak={max_amplitude:.3f}, factor={normalization_factor:.3f}"
+            )
+
+        # FINAL CONVERSION: Convert to int16 with hard clipping as the ultimate safety net
+        # np.clip keeps values inside the symmetric int16 range used here (-32767 to 32767)
+        left = np.clip(left_total * 32767, -32767, 32767).astype(np.int16)
+        right = np.clip(right_total * 32767, -32767, 32767).astype(np.int16)
+
+        # --- 7. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
         # Create interleaved array: [left[0], right[0], left[1], right[1], ...]
         interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)
         interleaved[0::2] = left  # Even indices get left channel
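
The hunk above combines equal-power panning, peak normalization with 5% headroom, and
interleaved s16 conversion. The same math can be sanity-checked outside aiortc with the
minimal standalone sketch below; it assumes only numpy, and the function name
render_stereo_frame, its 48 kHz / 20 ms defaults, and the usage lines are illustrative,
not part of the patch.

# Standalone sketch of the hunk's audio math (illustrative, not from the patch).
import math

import numpy as np


def render_stereo_frame(
    freq_hz: float = 440.0,
    pan: float = 0.5,  # 0.0 = full left, 0.5 = center, 1.0 = full right
    volume: float = 1.0,  # pre-pan gain, 0.0..1.0
    sample_rate: int = 48000,  # assumed rate; the patch reads self.sample_rate
    samples_per_frame: int = 960,  # 20 ms at 48 kHz
    start_sample: int = 0,
) -> np.ndarray:
    """Return one interleaved s16 stereo frame: [L, R, L, R, ...]."""
    t = (np.arange(samples_per_frame) + start_sample) / sample_rate
    mono = np.sin(2 * np.pi * freq_hz * t).astype(np.float32)

    # Equal-power panning: cos/sin gains keep perceived loudness constant,
    # since left_gain**2 + right_gain**2 == 1 for every pan position.
    left = mono * math.cos(pan * math.pi / 2) * volume
    right = mono * math.sin(pan * math.pi / 2) * volume

    # Peak normalization with 5% headroom, mirroring the patch's 0.95 threshold.
    peak = max(np.max(np.abs(left)), np.max(np.abs(right)), 1e-9)
    if peak > 0.95:
        left *= 0.95 / peak
        right *= 0.95 / peak

    # Hard clip to the symmetric +/-32767 range, then interleave channels.
    interleaved = np.empty(samples_per_frame * 2, dtype=np.int16)
    interleaved[0::2] = np.clip(left * 32767, -32767, 32767).astype(np.int16)
    interleaved[1::2] = np.clip(right * 32767, -32767, 32767).astype(np.int16)
    return interleaved


if __name__ == "__main__":
    frame = render_stereo_frame(pan=0.25)  # ball on the left quarter of the screen
    print(frame.shape, frame.dtype)  # (1920,) int16

The equal-power curve is why the patch uses cos/sin rather than linear gains: total
acoustic power stays constant as the ball crosses the screen, instead of dipping by
about 3 dB at center the way a linear crossfade would.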