Added minimal reference example
This commit is contained in:
parent
90c3c6e19b
commit
b319776c99
258
voicebot/bots/minimal.py
Normal file
258
voicebot/bots/minimal.py
Normal file
@ -0,0 +1,258 @@
|
||||
"""
|
||||
Minimal Bot - Reference Example
|
||||
|
||||
A minimal bot that consumes and generates audio/video with configurable visualizations and audio creation.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import fractions
|
||||
import time
|
||||
from av.audio.frame import AudioFrame
|
||||
from av import VideoFrame
|
||||
from aiortc import MediaStreamTrack
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from shared.logger import logger
|
||||
|
||||
|
||||
class MediaClock:
    """Monotonic clock shared by audio/video tracks for synchronization."""

    def __init__(self):
        # perf_counter is monotonic, so elapsed time can never jump backwards.
        self.t0 = time.perf_counter()

    def now(self) -> float:
        """Seconds elapsed since this clock was created."""
        elapsed = time.perf_counter() - self.t0
        return elapsed
|
||||
|
||||
|
||||
class ConfigurableVideoTrack(MediaStreamTrack):
    """Configurable video track with different visualization modes.

    Supported modes (config key 'visualization'):
      - 'ball':     bouncing ball with a frame-counter overlay
      - 'waveform': animated sine waveform
      - 'static':   solid color frame
    Any unknown mode falls back to 'ball'.
    """

    kind = "video"

    # 90 kHz is the standard RTP clock rate for video timestamps.
    _CLOCK_RATE = 90000

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable video track.

        Args:
            clock: Media clock for synchronization (kept for API symmetry;
                pts is currently derived from the frame counter, not the clock)
            config: Configuration dictionary for video settings
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.width = config.get('width', 320)
        self.height = config.get('height', 240)
        self.fps = config.get('fps', 15)
        self.mode = config.get('visualization', 'ball')
        self.frame_count = 0
        self._start_time = time.time()  # wall-clock creation time (diagnostic only)

        # Mode-specific state
        if self.mode == 'ball':
            self.ball_x = self.width // 2
            self.ball_y = self.height // 2
            self.ball_dx = 2
            self.ball_dy = 2
            # Bound the radius so the ball always fits inside the frame,
            # even for very small configured resolutions.
            self.ball_radius = min(20, max(1, min(self.width, self.height) // 2 - 1))

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return (pts, time_base) for the next frame on the 90 kHz clock."""
        pts = int(self.frame_count * (self._CLOCK_RATE / self.fps))
        time_base = 1 / self._CLOCK_RATE
        return pts, time_base

    async def recv(self) -> VideoFrame:
        """Produce the next frame according to the configured visualization mode.

        Returns:
            VideoFrame in bgr24 format with pts/time_base set on the
            90 kHz clock.
        """
        pts, _ = await self.next_timestamp()

        # Create frame based on mode; unknown modes default to the ball.
        if self.mode == 'ball':
            frame_array = self._generate_ball_frame()
        elif self.mode == 'waveform':
            frame_array = self._generate_waveform_frame()
        elif self.mode == 'static':
            frame_array = self._generate_static_frame()
        else:
            frame_array = self._generate_ball_frame()  # default

        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
        frame.pts = pts
        # Use the exact rational time base instead of round-tripping the
        # float 1/90000 through limit_denominator().
        frame.time_base = fractions.Fraction(1, self._CLOCK_RATE)

        self.frame_count += 1
        return frame

    def _generate_ball_frame(self) -> Any:
        """Generate bouncing ball visualization."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Update ball position
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy

        # Bounce off walls, and clamp the position so the ball can never
        # leave the frame (flipping direction alone could overshoot when
        # the velocity exceeded the remaining margin).
        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
            self.ball_dx = -self.ball_dx
            self.ball_x = min(max(self.ball_x, self.ball_radius), self.width - self.ball_radius)
        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
            self.ball_dy = -self.ball_dy
            self.ball_y = min(max(self.ball_y, self.ball_radius), self.height - self.ball_radius)

        # Draw ball
        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)

        # Overlay the frame counter for debugging.
        timestamp = f"Frame: {self.frame_count}"
        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        return frame

    def _generate_waveform_frame(self) -> Any:
        """Generate waveform visualization (sine wave scrolling with frame count)."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Generate sine wave; the phase advances with the frame counter so
        # the wave appears to scroll over time.
        x = np.linspace(0, 4 * np.pi, self.width)
        y = np.sin(x + self.frame_count * 0.1) * self.height // 4 + self.height // 2

        # Draw waveform as connected line segments.
        for i in range(1, len(y)):
            cv2.line(frame, (i - 1, int(y[i - 1])), (i, int(y[i])), (255, 255, 255), 2)

        return frame

    def _generate_static_frame(self) -> Any:
        """Generate a solid color frame.

        NOTE(review): frames are emitted as bgr24, so 'static_color' is
        interpreted in BGR channel order even though the factory docstring
        calls it RGB — confirm the intended order.
        """
        color = self.config.get('static_color', (128, 128, 128))
        frame = np.full((self.height, self.width, 3), color, dtype=np.uint8)
        return frame
|
||||
|
||||
|
||||
class ConfigurableAudioTrack(MediaStreamTrack):
    """Audio track whose samples are synthesized on the fly.

    The generation mode is chosen via the 'audio_mode' config key:
    'tone' (sine wave), 'noise' (white noise) or 'silence'. Any other
    value falls back to 'tone'.
    """

    kind = "audio"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable audio track.

        Args:
            clock: Media clock for synchronization
            config: Configuration dictionary for audio settings
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.sample_rate = config.get('sample_rate', 48000)
        self.samples_per_frame = config.get('samples_per_frame', 960)
        self.mode = config.get('audio_mode', 'tone')
        self.frequency = config.get('frequency', 440.0)
        self.volume = config.get('volume', 0.5)
        self._samples_generated = 0

    async def next_timestamp(self) -> Tuple[int, float]:
        """pts counts samples emitted so far; time_base is one sample period."""
        return self._samples_generated, 1 / self.sample_rate

    async def recv(self) -> AudioFrame:
        """Synthesize and return the next stereo s16 audio frame."""
        pts, time_base = await self.next_timestamp()

        # Dispatch on the configured mode; unknown modes produce a tone.
        producers = {
            'tone': self._generate_tone,
            'noise': self._generate_noise,
            'silence': self._generate_silence,
        }
        mono = producers.get(self.mode, self._generate_tone)()

        # Scale float samples in [-1, 1] to signed 16-bit PCM.
        channel = (mono * self.volume * 32767).astype(np.int16)

        # Duplicate the mono signal into interleaved L/R channels.
        interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)  # type: ignore
        interleaved[0::2] = channel
        interleaved[1::2] = channel

        stereo = interleaved.reshape(1, -1)  # type: ignore

        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)

        self._samples_generated += self.samples_per_frame
        return frame

    def _generate_tone(self) -> Any:
        """Sine wave; phase-continuous across frames via the running sample count."""
        sample_index = np.arange(self.samples_per_frame) + self._samples_generated  # type: ignore
        t = sample_index / self.sample_rate
        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore

    def _generate_noise(self) -> Any:
        """Uniform white noise."""
        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore

    def _generate_silence(self) -> Any:
        """All-zero samples."""
        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
|
||||
|
||||
|
||||
def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Create minimal bot tracks with configurable audio/video generation.

    Args:
        session_name: Name for the session
        config: Configuration dictionary with options:
            - visualization: 'ball', 'waveform', 'static'
            - audio_mode: 'tone', 'noise', 'silence'
            - width: video width (default 320)
            - height: video height (default 240)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 48000)
            - frequency: tone frequency in Hz (default 440)
            - volume: audio volume 0-1 (default 0.5)
            - static_color: RGB tuple for static mode (default gray)

    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    # Start from the documented defaults; caller-supplied values win.
    effective = {  # type: ignore
        'visualization': 'ball',
        'audio_mode': 'tone',
        'width': 320,
        'height': 240,
        'fps': 15,
        'sample_rate': 48000,
        'samples_per_frame': 960,
        'frequency': 440.0,
        'volume': 0.5,
        'static_color': (128, 128, 128)
    }
    effective.update(config or {})  # type: ignore

    # One clock shared by both tracks keeps audio and video in sync.
    shared_clock = MediaClock()

    tracks = {
        "video": ConfigurableVideoTrack(shared_clock, effective),  # type: ignore
        "audio": ConfigurableAudioTrack(shared_clock, effective),  # type: ignore
    }

    logger.info(f"Created minimal bot tracks for {session_name} with config: {effective}")

    return tracks
|
||||
|
||||
|
||||
# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "Minimal Configurable Bot"
AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"


def agent_info() -> Dict[str, str]:
    """Return agent metadata for discovery."""
    metadata = {"name": AGENT_NAME, "description": AGENT_DESCRIPTION}
    return metadata
|
||||
|
||||
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
    """Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
    # Delegate to the configurable factory with its default configuration.
    tracks = create_minimal_bot_tracks(session_name)
    return tracks
|
Loading…
x
Reference in New Issue
Block a user