Messing with audio
This commit is contained in:
parent
bf46a45f89
commit
d69037ff41
@@ -362,46 +362,121 @@ class SyntheticAudioTrack(MediaStreamTrack):
         return pts, time_base
 
     async def recv(self):
+        """
+        Generate audio frame with position-based tone and bounce effects.
+
+        Audio Processing Pipeline:
+        1. Base tone generation (frequency based on ball Y position)
+        2. Bounce effect generation (separate, centered audio)
+        3. Stereo panning (applied to base tone only)
+        4. Volume compensation (based on ball Y position)
+        5. Audio mixing and clipping prevention
+        6. Final conversion to int16 stereo format
+        """
         pts, time_base = await self.next_timestamp()
 
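Note on the new docstring: recv() is the pull point of an aiortc-style MediaStreamTrack, so every await yields one frame's worth of stereo audio. A minimal consumer sketch (the function name drain and the frame count are illustrative, not from this commit):

    async def drain(track, n_frames=10):
        # Each recv() returns one av.AudioFrame covering samples_per_frame samples
        for _ in range(n_frames):
            frame = await track.recv()
            print(frame.pts, frame.sample_rate, frame.samples)
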
-        # --- 1. Generate base tone based on ball Y position ---
+        # --- 1. TONE GENERATION: Create base frequency tone based on ball Y position ---
+        # Frequency mapping: Top of screen = high pitch (400Hz), bottom = low pitch (200Hz)
         if self.video_track:
-            base_freq = self._get_ball_frequency()
+            base_freq = self._get_ball_frequency()  # 200-400Hz range
         else:
-            base_freq = 440.0  # default if no video track
+            base_freq = 440.0  # default A4 if no video track
 
+        # Generate sine wave at calculated frequency
         t = (np.arange(self.samples_per_frame) + pts) / self.sample_rate
-        samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
+        base_samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)
 
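One detail worth flagging in this hunk: offsetting t by pts keeps the sine phase continuous from frame to frame while base_freq holds steady, so frame boundaries don't click. A standalone sketch with assumed values (48 kHz, 960-sample frames; neither number comes from this diff):

    import numpy as np

    sr, n, freq = 48000, 960, 440.0
    frame0 = np.sin(2 * np.pi * freq * (np.arange(n) + 0) / sr)  # pts = 0
    frame1 = np.sin(2 * np.pi * freq * (np.arange(n) + n) / sr)  # pts advanced by n
    # The concatenation lies on one continuous sine -- no boundary discontinuity
    seamless = np.concatenate([frame0, frame1])
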
-        # --- 2. Add bounce sound effect if triggered ---
-        if getattr(self, "just_bounced", False):
-            logger.info("Audio: Generating bounce sound effect")
-            tb = np.arange(self.samples_per_frame) / self.sample_rate
-            bounce_freq = 600.0  # Hz
-            bounce_env = np.exp(-tb * 20.0)  # fast exponential decay
-            bounce_wave = 0.4 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
-            samples = samples + bounce_wave.astype(np.float32)
-            self.just_bounced = False
-
-        # --- 3. Stereo panning based on X position ---
+        # --- 2. BOUNCE EFFECTS: Generate separate bounce sound effects (centered audio) ---
+        # Bounce effects are generated independently to avoid being affected by panning
+        bounce_samples = np.zeros(self.samples_per_frame, dtype=np.float32)
+        current_time_s = self.clock.now()
+        current_sample = int(current_time_s * self.sample_rate)
+
+        for bounce in self._active_bounces:
+            if bounce["start_sample"] <= current_sample < bounce["end_sample"]:
+                # Calculate relative time within this specific bounce event
+                sample_offset = current_sample - bounce["start_sample"]
+                bounce_t = sample_offset / self.sample_rate
+
+                # Generate bounce waveform: 600Hz tone with exponential decay envelope
+                tb = np.arange(self.samples_per_frame) / self.sample_rate + bounce_t
+                bounce_freq = 600.0  # Hz (higher than base tone for clarity)
+                bounce_env = np.exp(
+                    -tb * 20.0
+                )  # Fast exponential decay (20.0 = decay rate)
+                bounce_wave = (
+                    0.8 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
+                )  # 0.8 = bounce amplitude (80% of full scale)
+
+                # Limit bounce duration to prevent runaway effects
+                valid_samples = tb < 0.2  # 200ms maximum bounce duration
+                bounce_wave[~valid_samples] = 0
+
+                # Accumulate bounce effects (multiple bounces can overlap)
+                bounce_samples = bounce_samples + bounce_wave.astype(np.float32)
+
+        # Clean up expired bounce events to prevent memory accumulation
+        self._active_bounces = [
+            bounce
+            for bounce in self._active_bounces
+            if bounce["end_sample"] > current_sample
+        ]
 
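The loop above consumes bounce events as dicts with start_sample/end_sample keys; whatever enqueues them lives outside this hunk. A hypothetical producer, assuming self.clock.now() returns seconds (the method name on_bounce and the 200 ms lifetime are illustrative only):

    def on_bounce(self):
        # Convert wall-clock seconds to an absolute sample index
        start = int(self.clock.now() * self.sample_rate)
        self._active_bounces.append({
            "start_sample": start,
            "end_sample": start + int(0.2 * self.sample_rate),  # matches the 200ms envelope cutoff
        })
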
+        # --- 3. STEREO PANNING: Apply left/right positioning to base tone only ---
+        # Pan calculation: 0.0 = full left, 0.5 = center, 1.0 = full right
         if self.video_track:
-            pan = self.video_track.ball["x"] / self.video_track.width
+            pan = (
+                self.video_track.ball["x"] / self.video_track.width
+            )  # Normalize to 0-1
         else:
-            pan = 0.5  # center if no video
-        left_gain = math.cos(pan * math.pi / 2)
-        right_gain = math.sin(pan * math.pi / 2)
+            pan = 0.5  # Center positioning if no video track
 
-        # --- 4. Volume scaling based on Y position ---
+        # Equal-power panning: maintains perceived loudness across stereo field
+        left_gain = math.cos(pan * math.pi / 2)  # Left channel gain (1.0 to 0.0)
+        right_gain = math.sin(pan * math.pi / 2)  # Right channel gain (0.0 to 1.0)
 
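The cos/sin pair is the standard equal-power pan law: left_gain² + right_gain² = 1 at every pan position, so total power stays constant as the ball crosses the screen. A quick self-contained check:

    import math

    for pan in (0.0, 0.25, 0.5, 0.75, 1.0):
        left = math.cos(pan * math.pi / 2)
        right = math.sin(pan * math.pi / 2)
        assert abs(left**2 + right**2 - 1.0) < 1e-12  # constant total power
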
+        # --- 4. VOLUME COMPENSATION: Apply Y-position based volume scaling ---
+        # Volume scaling compensates for perceptual frequency/amplitude relationship
         if self.video_track:
-            volume = (1.0 - (self.video_track.ball["y"] / self.video_track.height)) ** 2
+            # Quadratic scaling: top = loud (1.0), bottom = quiet (approaching 0.0)
+            # Formula: (1 - normalized_y)² provides smooth, natural volume curve
+            normalized_y = self.video_track.ball["y"] / self.video_track.height
+            volume = (1.0 - normalized_y) ** 2  # Squared for more dramatic effect
         else:
-            volume = 1.0
+            volume = 1.0  # Full volume if no video track
 
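Worked values for the quadratic curve: relative to linear scaling, (1 - normalized_y)² keeps the top of the screen loud and rolls off steeply toward the bottom:

    for y in (0.0, 0.25, 0.5, 0.75, 1.0):  # normalized ball Y
        print(f"y={y:.2f} -> volume={(1.0 - y) ** 2:.4f}")
    # y=0.00 -> 1.0000, y=0.25 -> 0.5625, y=0.50 -> 0.2500,
    # y=0.75 -> 0.0625, y=1.00 -> 0.0000
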
-        # --- 5. Apply gain and convert to int16 ---
-        left = (samples * left_gain * volume * 32767).astype(np.int16)
-        right = (samples * right_gain * volume * 32767).astype(np.int16)
-
-        # --- 6. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
+        # --- 5. AUDIO MIXING: Combine panned base tone with centered bounce effects ---
+        # Base tone: Apply stereo panning and volume compensation
+        left_base = base_samples * left_gain * volume
+        right_base = base_samples * right_gain * volume
+
+        # Final mix: Add bounce effects equally to both channels (no panning)
+        # This keeps bounce effects prominent and centered regardless of ball position
+        left_total = left_base + bounce_samples
+        right_total = right_base + bounce_samples
 
+        # --- 6. CLIPPING PREVENTION: Dynamic normalization with headroom management ---
+        # Check peak amplitude across both channels to detect potential clipping
+        max_left = np.max(np.abs(left_total))
+        max_right = np.max(np.abs(right_total))
+        max_amplitude = max(max_left, max_right)
+
+        # HEADROOM: Maintain 5% safety margin (0.95 threshold) to prevent digital artifacts
+        if max_amplitude > 0.95:  # Threshold chosen to leave headroom for codec/DAC
+            # NORMALIZATION: Scale down entire signal to prevent clipping while preserving dynamics
+            normalization_factor = 0.95 / max_amplitude  # Proportional scaling
+            left_total *= normalization_factor
+            right_total *= normalization_factor
+            logger.debug(
+                f"Audio normalization applied: peak={max_amplitude:.3f}, factor={normalization_factor:.3f}"
+            )
+
+        # FINAL CONVERSION: Convert to int16 with hard clipping as ultimate safety net
+        # np.clip ensures values never exceed the int16 range (-32768 to 32767)
+        left = np.clip(left_total * 32767, -32767, 32767).astype(np.int16)
+        right = np.clip(right_total * 32767, -32767, 32767).astype(np.int16)
 
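A worked instance of the normalization branch: a near-full-scale base tone plus an overlapping 0.8-amplitude bounce can push the float mix past 1.0; scaling both channels by the same factor brings the peak to exactly 0.95 without disturbing the stereo balance (the sample values below are made up for illustration):

    import numpy as np

    left_total = np.array([0.9, -1.6, 0.4], dtype=np.float32)
    right_total = np.array([0.3, -1.1, 0.2], dtype=np.float32)
    peak = max(np.max(np.abs(left_total)), np.max(np.abs(right_total)))  # 1.6
    if peak > 0.95:
        factor = 0.95 / peak      # 0.59375
        left_total *= factor      # new peak: exactly 0.95
        right_total *= factor     # same factor preserves the L/R ratio
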
+        # --- 7. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
         # Create interleaved array: [left[0], right[0], left[1], right[1], ...]
         interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)
         interleaved[0::2] = left  # Even indices get left channel
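The rendered diff is cut off after the left-channel assignment. A typical completion under aiortc/PyAV conventions (an assumption about the missing lines, not text recovered from this commit; it presumes import av at module scope) fills the odd indices and packs the buffer into a frame:

    interleaved[1::2] = right  # Odd indices get right channel

    # PyAV expects packed s16 stereo as shape (1, samples * channels)
    frame = av.AudioFrame.from_ndarray(
        interleaved.reshape(1, -1), format="s16", layout="stereo"
    )
    frame.sample_rate = self.sample_rate
    frame.pts = pts
    frame.time_base = time_base
    return frame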