Messing with audio

James Ketr 2025-09-01 19:32:57 -07:00
parent fb0ce3f203
commit bf46a45f89


@@ -6,17 +6,36 @@ Contains AnimatedVideoTrack and SyntheticAudioTrack implementations ported from
"""
import numpy as np
import math
import cv2
import fractions
import time
import random
from av.audio.frame import AudioFrame
from asyncio import Queue, create_task, sleep
from typing import TypedDict, TYPE_CHECKING
from aiortc import MediaStreamTrack
from av import VideoFrame
from logger import logger

if TYPE_CHECKING:
    pass

# Shared clock
from time import perf_counter


class MediaClock:
    def __init__(self):
        self.t0 = perf_counter()

    def now(self) -> float:
        return perf_counter() - self.t0


class BounceEvent(TypedDict):
    """Type definition for bounce events"""

    type: str
    start_sample: int
    end_sample: int
@@ -35,6 +54,7 @@ class AnimatedVideoTrack(MediaStreamTrack):

    def __init__(
        self,
        clock: MediaClock,
        width: int = 320,
        height: int = 240,
        name: str = "",
@@ -44,6 +64,10 @@ class AnimatedVideoTrack(MediaStreamTrack):
        self.width = width
        self.height = height
        self.name = name
        self.clock = clock
        self.fps = 15
        self._next_frame_index = 0
        self.audio_track = audio_track  # Reference to the audio track
        self.remote_video_tracks: list[
            MediaStreamTrack
@@ -73,6 +97,10 @@ class AnimatedVideoTrack(MediaStreamTrack):
        self._start_time = time.time()
        self._last_frame_time = time.time()
        self.fps = 15  # Target frames per second
        self._remote_latest = {}  # track -> np.ndarray
        self._remote_tasks: list[
            tuple[MediaStreamTrack, object, Queue[np.ndarray]]
        ] = []

    def set_ball_speed(self, speed_mps: float):
        """Set the ball speed in meters per second"""
@@ -83,6 +111,19 @@ class AnimatedVideoTrack(MediaStreamTrack):
        if track.kind == "video":
            self.remote_video_tracks.append(track)
            logger.info(f"Added remote video track: {track}")

            # Single-slot queue: holds only the most recent decoded frame
            q: Queue[np.ndarray] = Queue(maxsize=1)

            async def pump():
                while True:
                    frame = await track.recv()
                    if isinstance(frame, VideoFrame):
                        img: np.ndarray = frame.to_ndarray(format="bgr24")
                        # Drop the stale frame so the queue never blocks
                        if q.full():
                            _ = q.get_nowait()
                        await q.put(img)

            t = create_task(pump())
            self._remote_tasks.append((track, t, q))

    def remove_remote_video_track(self, track: MediaStreamTrack):
        """Remove a remote video track"""
@@ -90,36 +131,16 @@ class AnimatedVideoTrack(MediaStreamTrack):
            self.remote_video_tracks.remove(track)
            logger.info(f"Removed remote video track: {track}")
    def _calculate_velocity_components(self, dt: float) -> tuple[float, float]:
        """Convert speed_mps into per-frame pixel deltas (frame width = 1 meter)."""
        dir_x, dir_y = self.ball["direction_x"], self.ball["direction_y"]
        mag = np.hypot(dir_x, dir_y)
        if mag == 0:
            dir_x_norm, dir_y_norm = 1.0, 0.0
        else:
            dir_x_norm, dir_y_norm = dir_x / mag, dir_y / mag
        pixels_per_second = self.width * self.ball["speed_mps"]
        pixels_this_frame = pixels_per_second * dt
        return pixels_this_frame * dir_x_norm, pixels_this_frame * dir_y_norm
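    # A quick sanity check with assumed values (not part of the commit):
    # frame width maps to 1 meter, so with width=320, speed_mps=0.5, dt=1/15:
    #   pixels_this_frame = 320 * 0.5 / 15 ≈ 10.67 px of travel per frame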
    async def next_timestamp(self):
        """Returns (pts, time_base) for 15 FPS video"""
@@ -171,42 +192,36 @@ class AnimatedVideoTrack(MediaStreamTrack):
        """Generate video frames at 15 FPS"""
        pts, time_base = await self.next_timestamp()

        # Target timestamp for this frame (seconds since t0)
        target_t = self._next_frame_index / self.fps
        now = self.clock.now()
        if target_t > now:
            await sleep(target_t - now)

        # Use constant dt tied to fps (prevents physics jitter)
        dt = 1.0 / self.fps
        dx, dy = self._calculate_velocity_components(dt)

        # PTS derived from frame index, not wall clock
        pts = int(self._next_frame_index * (90000 / self.fps))
        time_base = 1 / 90000
        self._next_frame_index += 1

        # Create black background
        frame_array = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Process remote video tracks with edge detection
        for _track, _task, q in self._remote_tasks:
            try:
                img: np.ndarray = q.get_nowait()
            except Exception:
                continue
            edges = cv2.Canny(img, 100, 200)
            img_edges = cv2.cvtColor(edges, cv2.COLOR_GRAY2BGR)
            if img_edges.shape[:2] != (self.height, self.width):
                img_edges = cv2.resize(img_edges, (self.width, self.height))
            frame_array = cv2.addWeighted(frame_array, 0.7, img_edges, 0.3, 0.0)

        # Update ball position
        ball = self.ball
@@ -225,7 +240,7 @@ class AnimatedVideoTrack(MediaStreamTrack):
        # Trigger bounce sound if a bounce occurred
        if bounce_occurred and self.audio_track:
            logger.info("Video: Bounce detected, triggering audio event")
            self.audio_track.add_bounce_event_at(self.clock.now())

        # Keep ball in bounds
        ball["x"] = max(ball["radius"], min(self.width - ball["radius"], ball["x"]))
@@ -272,31 +287,61 @@

class SyntheticAudioTrack(MediaStreamTrack):
    """
    Synthetic audio track that generates a continuous tone based on ball position
    plus additional bounce sound effects.

    The frequency of the continuous tone is mapped to the ball's Y position:
    - Top of screen (Y=0): 400Hz (high pitch)
    - Bottom of screen (Y=height): 200Hz (low pitch)

    Bounce events add temporary audio effects on top of the continuous tone.
    """

    kind = "audio"

    def __init__(
        self, clock: MediaClock, video_track: "AnimatedVideoTrack | None" = None
    ):
        super().__init__()
        self.sample_rate = 48000
        self.samples_per_frame = 960
        self._samples_generated = 0
        self._active_bounces: list[BounceEvent] = []  # List of active bounce events
        self.video_track = video_track  # Reference to video track for ball position
        self.clock = clock
    def add_bounce_event_at(self, bounce_time_s: float):
        """Schedule a bounce effect at a shared-clock timestamp (seconds)."""
        start_sample = int(bounce_time_s * self.sample_rate)
        duration = int(0.2 * self.sample_rate)  # 200ms
        self._active_bounces.append(
            {
                "type": "bounce",
                "start_sample": start_sample,
                "end_sample": start_sample + duration,
            }
        )
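    # Worked example (assumed values): a bounce at clock time 2.5 s gives
    #   start_sample = int(2.5 * 48000) = 120000
    #   end_sample   = 120000 + int(0.2 * 48000) = 129600  (9600 samples = 200 ms)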
    def _get_ball_frequency(self) -> float:
        """Get the current frequency based on ball Y position"""
        if not self.video_track:
            return 440.0  # Default frequency if no video track

        # Map ball Y position to frequency range (200Hz to 400Hz)
        ball_y = self.video_track.ball["y"]
        height = self.video_track.height

        # Normalize Y position (0.0 at top, 1.0 at bottom)
        normalized_y = ball_y / height

        # Invert so top = high frequency, bottom = low frequency
        freq_min = 200.0
        freq_max = 400.0
        frequency = freq_max - (normalized_y * (freq_max - freq_min))
        return frequency
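    # The mapping is linear; sample points (positions assumed for illustration):
    #   normalized_y = 0.0 (top)    -> 400.0 Hz
    #   normalized_y = 0.5 (middle) -> 300.0 Hz
    #   normalized_y = 1.0 (bottom) -> 200.0 Hz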
    def _generate_bounce_sample(self, t: float) -> float:
        """Generate a single bounce sample at time t"""
        if t < 0 or t > 0.2:
@@ -318,52 +363,58 @@ class SyntheticAudioTrack(MediaStreamTrack):

    async def recv(self):
        pts, time_base = await self.next_timestamp()

        # --- 1. Generate base tone based on ball Y position ---
        if self.video_track:
            base_freq = self._get_ball_frequency()
        else:
            base_freq = 440.0  # default if no video track
        t = (np.arange(self.samples_per_frame) + pts) / self.sample_rate
        samples = np.sin(2 * np.pi * base_freq * t).astype(np.float32)

        # --- 2. Mix in active bounce events (sample-accurate) ---
        frame_start = self._samples_generated
        frame_end = frame_start + self.samples_per_frame
        for bounce in self._active_bounces:
            if bounce["end_sample"] <= frame_start or bounce["start_sample"] >= frame_end:
                continue
            logger.info("Audio: Generating bounce sound effect")
            # Time of each output sample relative to the bounce start
            tb = (
                np.arange(self.samples_per_frame) + frame_start - bounce["start_sample"]
            ) / self.sample_rate
            bounce_freq = 600.0  # Hz
            # Fast exponential decay, silenced outside the 200ms bounce window
            bounce_env = np.where((tb >= 0) & (tb < 0.2), np.exp(-np.abs(tb) * 20.0), 0.0)
            bounce_wave = 0.4 * np.sin(2 * np.pi * bounce_freq * tb) * bounce_env
            samples = samples + bounce_wave.astype(np.float32)
        # Drop bounces that have finished playing
        self._active_bounces = [
            b for b in self._active_bounces if b["end_sample"] > frame_end
        ]

        # --- 3. Stereo panning based on X position ---
        if self.video_track:
            pan = self.video_track.ball["x"] / self.video_track.width
        else:
            pan = 0.5  # center if no video
        left_gain = math.cos(pan * math.pi / 2)
        right_gain = math.sin(pan * math.pi / 2)

        # --- 4. Volume scaling based on Y position ---
        if self.video_track:
            volume = (1.0 - (self.video_track.ball["y"] / self.video_track.height)) ** 2
        else:
            volume = 1.0

        # --- 5. Clip, apply gain and convert to int16 ---
        samples = np.clip(samples, -1.0, 1.0)  # tone + bounce can exceed ±1.0
        left = (samples * left_gain * volume * 32767).astype(np.int16)
        right = (samples * right_gain * volume * 32767).astype(np.int16)

        # --- 6. Interleave channels for s16 format (samples arranged as [L, R, L, R, ...]) ---
        interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)
        interleaved[0::2] = left  # Even indices get left channel
        interleaved[1::2] = right  # Odd indices get right channel

        # Reshape to (1, samples*2) as expected by packed s16 stereo
        stereo = interleaved.reshape(1, -1)

        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)
        self._samples_generated += self.samples_per_frame
        return frame
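    # Note: the cos/sin pair in step 3 is a constant-power pan law, so
    # left_gain**2 + right_gain**2 == 1 for any pan in [0, 1] and loudness stays
    # steady as the ball crosses the screen. A quick check (illustrative only):
    #   for pan in (0.0, 0.25, 0.5, 1.0):
    #       lg, rg = math.cos(pan * math.pi / 2), math.sin(pan * math.pi / 2)
    #       assert abs(lg * lg + rg * rg - 1.0) < 1e-9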
@@ -378,13 +429,20 @@ def create_synthetic_tracks(session_name: str) -> dict[str, MediaStreamTrack]:
        Dictionary containing 'video' and 'audio' tracks

    Note:
        - To change ball speed, use: tracks["video"].set_ball_speed(speed_in_mps)
          where speed_in_mps is meters per second (frame width = 1 meter)
        - Audio generates a continuous tone based on ball Y position (200-400Hz)
        - Bounce events add additional audio on top of the continuous tone
    """
    media_clock = MediaClock()

    # Create video track first
    video_track = AnimatedVideoTrack(name=session_name, clock=media_clock)

    # Create audio track with reference to video track for ball position-based frequency
    audio_track = SyntheticAudioTrack(video_track=video_track, clock=media_clock)

    # Set the audio track reference on the video track for bounce events
    video_track.audio_track = audio_track

    return {"video": video_track, "audio": audio_track}