Added minimal reference example

2025-09-16 14:15:43 -07:00 · 2025-09-16 14:15:43 -07:00 · b319776c99
commit b319776c99
parent 90c3c6e19b
1 changed files with 258 additions and 0 deletions
--- a/voicebot/bots/minimal.py
+++ b/voicebot/bots/minimal.py
@@ -0,0 +1,258 @@
 """
 Minimal Bot - Reference Example
 A minimal bot that consumes and generates audio/video with configurable visualizations and audio creation.
 """
 import asyncio
 import fractions
 import time
 from typing import Dict, Any, Optional, Tuple

 import cv2
 import numpy as np
 from aiortc import MediaStreamTrack
 from av import VideoFrame
 from av.audio.frame import AudioFrame

 from shared.logger import logger
 class MediaClock:
    """Stopwatch-style clock that tracks can share as a common time origin."""

    def __init__(self):
        # The origin is captured once, at construction, from the performance counter.
        self.t0 = time.perf_counter()

    def now(self) -> float:
        """Return the number of seconds elapsed since this clock was created."""
        elapsed = time.perf_counter() - self.t0
        return elapsed
 class ConfigurableVideoTrack(MediaStreamTrack):
    """Configurable video track with different visualization modes.

    Supported ``visualization`` values are ``'ball'`` (bouncing ball),
    ``'waveform'`` (moving sine wave) and ``'static'`` (solid color). Any other
    value falls back to the bouncing ball. Frames are built as ``bgr24`` images
    and stamped on a 90 kHz time base. ``recv`` waits on the shared clock so
    that frames are handed out at roughly ``fps`` frames per second instead of
    as fast as the consumer polls.
    """
    kind = "video"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable video track.
        Args:
            clock: Media clock used to pace frame delivery
            config: Configuration dictionary for video settings. Keys read here:
                'width' (default 320), 'height' (default 240), 'fps' (default 15)
                and 'visualization' (default 'ball'). 'static_color' is read
                later, each time a static frame is rendered.
        Raises:
            ValueError: If 'fps' is not a positive number.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.width = config.get('width', 320)
        self.height = config.get('height', 240)
        self.fps = config.get('fps', 15)
        # Fail fast: a non-positive rate would otherwise surface later as a
        # ZeroDivisionError (or nonsensical timestamps) inside next_timestamp().
        if self.fps <= 0:
            raise ValueError(f"fps must be positive, got {self.fps!r}")
        self.mode = config.get('visualization', 'ball')
        self.frame_count = 0
        # Wall-clock creation time; not read anywhere inside this class.
        self._start_time = time.time()
        # Clock reading taken at the first recv(); None until a frame is requested.
        self._first_frame_at: Optional[float] = None
        # Ball state is initialized for every mode because recv() falls back to
        # the ball renderer when the configured mode is not recognized.
        self.ball_x = self.width // 2
        self.ball_y = self.height // 2
        self.ball_dx = 2
        self.ball_dy = 2
        self.ball_radius = 20

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return ``(pts, time_base)`` for the frame about to be produced.

        ``pts`` counts 90 kHz ticks derived from the number of frames already
        delivered; ``time_base`` is the tick duration in seconds.
        """
        pts = int(self.frame_count * (90000 / self.fps))
        time_base = 1 / 90000
        return pts, time_base

    async def _pace(self) -> None:
        """Sleep until the next frame is due according to the shared clock."""
        now = self.clock.now()
        if self._first_frame_at is None:
            # Anchor the schedule at the first request rather than at clock
            # creation, so a late first recv() does not trigger a burst of frames.
            self._first_frame_at = now
        due = self._first_frame_at + self.frame_count / self.fps
        delay = due - now
        if delay > 0:
            await asyncio.sleep(delay)

    async def recv(self) -> VideoFrame:
        """Wait for the next frame slot, render a frame and return it.

        Returns:
            A ``bgr24`` VideoFrame carrying the pts/time_base from next_timestamp().
        """
        await self._pace()
        pts, time_base = await self.next_timestamp()
        # Create frame based on mode
        if self.mode == 'ball':
            frame_array = self._generate_ball_frame()
        elif self.mode == 'waveform':
            frame_array = self._generate_waveform_frame()
        elif self.mode == 'static':
            frame_array = self._generate_static_frame()
        else:
            frame_array = self._generate_ball_frame()  # default
        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
        frame.pts = pts
        # next_timestamp() reports a float; convert it back to an exact fraction.
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)
        self.frame_count += 1
        return frame

    def _generate_ball_frame(self) -> Any:
        """Generate bouncing ball visualization.

        Advances the ball by one step, reverses direction at the frame edges and
        draws the ball plus a frame counter onto a black image.
        """
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        # Update ball position
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy
        # Bounce off walls
        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
            self.ball_dx = -self.ball_dx
        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
            self.ball_dy = -self.ball_dy
        # Draw ball
        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)
        # Add timestamp
        timestamp = f"Frame: {self.frame_count}"
        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        return frame

    def _generate_waveform_frame(self) -> Any:
        """Generate waveform visualization.

        Draws two periods of a sine wave across the frame width; the phase
        advances with the frame counter so the wave appears to move.
        """
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        # Generate sine wave
        x = np.linspace(0, 4*np.pi, self.width)
        y = np.sin(x + self.frame_count * 0.1) * self.height // 4 + self.height // 2
        # Convert to plain Python ints once instead of twice per segment.
        rows = y.astype(int).tolist()
        # Draw waveform as connected one-pixel-wide segments
        for col, (row_a, row_b) in enumerate(zip(rows, rows[1:])):
            cv2.line(frame, (col, row_a), (col + 1, row_b), (255, 255, 255), 2)
        return frame

    def _generate_static_frame(self) -> Any:
        """Generate static color frame.

        'static_color' is taken as an RGB triple; the channels are reversed
        because the frame is emitted in ``bgr24`` order.
        """
        color = np.asarray(self.config.get('static_color', (128, 128, 128)))
        if color.ndim == 1:
            # RGB -> BGR; a scalar fill value has no channel order and is left as is.
            color = color[::-1]
        frame = np.full((self.height, self.width, 3), color, dtype=np.uint8)
        return frame
 class ConfigurableAudioTrack(MediaStreamTrack):
    """Configurable audio track with different audio generation modes.

    Supported ``audio_mode`` values are ``'tone'`` (sine wave), ``'noise'``
    (uniform white noise) and ``'silence'``. Any other value falls back to the
    tone. Frames are packed 16-bit stereo with both channels carrying the same
    signal. ``recv`` waits on the shared clock so that audio is handed out in
    real time instead of as fast as the consumer polls.
    """
    kind = "audio"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable audio track.
        Args:
            clock: Media clock used to pace frame delivery
            config: Configuration dictionary for audio settings. Keys read:
                'sample_rate' (default 48000), 'samples_per_frame' (default 960),
                'audio_mode' (default 'tone'), 'frequency' (default 440.0) and
                'volume' (default 0.5).
        Raises:
            ValueError: If 'sample_rate' is not a positive number.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.sample_rate = config.get('sample_rate', 48000)
        # Fail fast: a zero rate would otherwise surface later as a
        # ZeroDivisionError inside next_timestamp().
        if self.sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {self.sample_rate!r}")
        self.samples_per_frame = config.get('samples_per_frame', 960)
        self.mode = config.get('audio_mode', 'tone')
        self.frequency = config.get('frequency', 440.0)
        self.volume = config.get('volume', 0.5)
        # Running count of samples (per channel) already delivered; doubles as pts.
        self._samples_generated = 0
        # Clock reading taken at the first recv(); None until a frame is requested.
        self._first_frame_at: Optional[float] = None

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return ``(pts, time_base)`` for the frame about to be produced.

        ``pts`` is the number of samples delivered so far; ``time_base`` is the
        duration of one sample in seconds.
        """
        pts = self._samples_generated
        time_base = 1 / self.sample_rate
        return pts, time_base

    async def _pace(self) -> None:
        """Sleep until the next audio frame is due according to the shared clock."""
        now = self.clock.now()
        if self._first_frame_at is None:
            # Anchor the schedule at the first request rather than at clock
            # creation, so a late first recv() does not trigger a burst of frames.
            self._first_frame_at = now
        due = self._first_frame_at + self._samples_generated / self.sample_rate
        delay = due - now
        if delay > 0:
            await asyncio.sleep(delay)

    async def recv(self) -> AudioFrame:
        """Wait for the next frame slot, synthesize one frame of audio and return it.

        Returns:
            A packed ``s16`` stereo AudioFrame of ``samples_per_frame`` samples.
        """
        await self._pace()
        pts, time_base = await self.next_timestamp()
        # Generate audio based on mode
        if self.mode == 'tone':
            samples = self._generate_tone()
        elif self.mode == 'noise':
            samples = self._generate_noise()
        elif self.mode == 'silence':
            samples = self._generate_silence()
        else:
            samples = self._generate_tone()  # default
        # Saturate before the int16 cast; without this a volume above 1.0 wraps
        # around and produces loud distortion instead of clipping.
        scaled = np.clip(samples * self.volume, -1.0, 1.0)
        mono = (scaled * 32767).astype(np.int16)
        # Packed stereo is L R L R ...; both channels carry the same signal, so
        # repeating every sample twice yields the interleaved buffer directly.
        stereo = np.repeat(mono, 2).reshape(1, -1)  # type: ignore
        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        # next_timestamp() reports a float; convert it back to an exact fraction.
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)
        self._samples_generated += self.samples_per_frame
        return frame

    def _generate_tone(self) -> Any:
        """Generate sine wave tone.

        The sample index continues from the samples already delivered, so the
        phase is continuous from one frame to the next.
        """
        t = (np.arange(self.samples_per_frame) + self._samples_generated) / self.sample_rate  # type: ignore
        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore

    def _generate_noise(self) -> Any:
        """Generate white noise drawn uniformly from [-1, 1)."""
        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore

    def _generate_silence(self) -> Any:
        """Generate silence (all-zero samples)."""
        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
 def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Build the generated video and audio tracks for a minimal bot session.
    Args:
        session_name: Name for the session; only used in the log message
        config: Optional overrides applied on top of the built-in defaults:
            - visualization: 'ball', 'waveform', 'static'
            - audio_mode: 'tone', 'noise', 'silence'
            - width: video width (default 320)
            - height: video height (default 240)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 48000)
            - samples_per_frame: samples per audio frame (default 960)
            - frequency: tone frequency in Hz (default 440)
            - volume: audio volume 0-1 (default 0.5)
            - static_color: color tuple for static mode (default gray)
    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    # Built-in defaults; caller-supplied entries win on conflict.
    default_config: Dict[str, Any] = dict(
        visualization='ball',
        audio_mode='tone',
        width=320,
        height=240,
        fps=15,
        sample_rate=48000,
        samples_per_frame=960,
        frequency=440.0,
        volume=0.5,
        static_color=(128, 128, 128),
    )
    if config is not None:
        default_config.update(config)
    # Both tracks are handed the same clock instance and the same merged config.
    shared_clock = MediaClock()
    tracks: Dict[str, MediaStreamTrack] = {
        "video": ConfigurableVideoTrack(shared_clock, default_config),
        "audio": ConfigurableAudioTrack(shared_clock, default_config),
    }
    logger.info(f"Created minimal bot tracks for {session_name} with config: {default_config}")
    return tracks
 # Agent descriptor exported for dynamic discovery by the FastAPI service.
 # These two constants are the values returned by agent_info() below.
 AGENT_NAME = "Minimal Configurable Bot"
 AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"
 def agent_info() -> Dict[str, str]:
    """Describe this agent for discovery as a mapping with 'name' and 'description'."""
    metadata = dict(name=AGENT_NAME, description=AGENT_DESCRIPTION)
    return metadata
 def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
    """Factory entry point used by the FastAPI service: build this agent's tracks with the default configuration."""
    tracks = create_minimal_bot_tracks(session_name, config=None)
    return tracks