diff --git a/voicebot/bots/minimal.py b/voicebot/bots/minimal.py
new file mode 100644
index 0000000..b482fd4
--- /dev/null
+++ b/voicebot/bots/minimal.py
@@ -0,0 +1,258 @@
+"""
+Minimal Bot - Reference Example
+
+A minimal reference bot that generates audio and video with configurable visualization and audio-generation modes.
+"""
+
+import numpy as np
+import cv2
+import fractions
+import time
+from av.audio.frame import AudioFrame
+from av import VideoFrame
+from aiortc import MediaStreamTrack
+from typing import Dict, Any, Optional, Tuple
+from shared.logger import logger
+
+
+class MediaClock:
+    """Shared clock for media synchronization."""
+
+    def __init__(self):
+        self.t0 = time.perf_counter()
+
+    def now(self) -> float:
+        return time.perf_counter() - self.t0
+
+
+class ConfigurableVideoTrack(MediaStreamTrack):
+    """Configurable video track with different visualization modes"""
+
+    kind = "video"
+
+    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
+        """Initialize the configurable video track.
+
+        Args:
+            clock: Media clock for synchronization
+            config: Configuration dictionary for video settings
+        """
+        super().__init__()
+        self.clock = clock
+        self.config = config
+        self.width = config.get('width', 320)
+        self.height = config.get('height', 240)
+        self.fps = config.get('fps', 15)
+        self.mode = config.get('visualization', 'ball')
+        self.frame_count = 0
+        self._start_time = time.time()
+
+        # Mode-specific state
+        if self.mode == 'ball':
+            self.ball_x = self.width // 2
+            self.ball_y = self.height // 2
+            self.ball_dx = 2
+            self.ball_dy = 2
+            self.ball_radius = 20
+
+    async def next_timestamp(self) -> Tuple[int, float]:
+        pts = int(self.frame_count * (90000 / self.fps))
+        time_base = 1 / 90000
+        return pts, time_base
+
+    async def recv(self) -> VideoFrame:
+        pts, time_base = await self.next_timestamp()
+
+        # Create frame based on mode
+        if self.mode == 'ball':
+            frame_array = self._generate_ball_frame()
+        elif self.mode == 'waveform':
+            frame_array = self._generate_waveform_frame()
+        elif self.mode == 'static':
+            frame_array = self._generate_static_frame()
+        else:
+            frame_array = self._generate_ball_frame()  # default
+
+        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
+        frame.pts = pts
+        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)
+
+        self.frame_count += 1
+        return frame
+
+    def _generate_ball_frame(self) -> Any:
+        """Generate bouncing ball visualization"""
+        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
+
+        # Update ball position
+        self.ball_x += self.ball_dx
+        self.ball_y += self.ball_dy
+
+        # Bounce off walls
+        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
+            self.ball_dx = -self.ball_dx
+        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
+            self.ball_dy = -self.ball_dy
+
+        # Draw ball
+        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)
+
+        # Add timestamp
+        timestamp = f"Frame: {self.frame_count}"
+        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+        return frame
+
+    def _generate_waveform_frame(self) -> Any:
+        """Generate waveform visualization"""
+        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
+
+        # Generate sine wave
+        x = np.linspace(0, 4 * np.pi, self.width)
+        y = np.sin(x + self.frame_count * 0.1) * self.height // 4 + self.height // 2
+
+        # Draw waveform
+        for i in range(1, len(y)):
+            cv2.line(frame, (i - 1, int(y[i - 1])), (i, int(y[i])), (255, 255, 255), 2)
+
+        return frame
+
+    def _generate_static_frame(self) -> Any:
+        """Generate static color frame"""
+        color = self.config.get('static_color', (128, 128, 128))
+        frame = np.full((self.height, self.width, 3), color, dtype=np.uint8)
+        return frame
+
+
+class ConfigurableAudioTrack(MediaStreamTrack):
+    """Configurable audio track with different audio generation modes"""
+
+    kind = "audio"
+
+    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
+        """Initialize the configurable audio track.
+
+        Args:
+            clock: Media clock for synchronization
+            config: Configuration dictionary for audio settings
+        """
+        super().__init__()
+        self.clock = clock
+        self.config = config
+        self.sample_rate = config.get('sample_rate', 48000)
+        self.samples_per_frame = config.get('samples_per_frame', 960)
+        self.mode = config.get('audio_mode', 'tone')
+        self.frequency = config.get('frequency', 440.0)
+        self.volume = config.get('volume', 0.5)
+        self._samples_generated = 0
+
+    async def next_timestamp(self) -> Tuple[int, float]:
+        pts = self._samples_generated
+        time_base = 1 / self.sample_rate
+        return pts, time_base
+
+    async def recv(self) -> AudioFrame:
+        pts, time_base = await self.next_timestamp()
+
+        # Generate audio based on mode
+        if self.mode == 'tone':
+            samples = self._generate_tone()
+        elif self.mode == 'noise':
+            samples = self._generate_noise()
+        elif self.mode == 'silence':
+            samples = self._generate_silence()
+        else:
+            samples = self._generate_tone()  # default
+
+        # Convert to stereo
+        left = (samples * self.volume * 32767).astype(np.int16)
+        right = left.copy()
+
+        # Interleave channels
+        interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)  # type: ignore
+        interleaved[0::2] = left
+        interleaved[1::2] = right
+
+        stereo = interleaved.reshape(1, -1)  # type: ignore
+
+        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
+        frame.sample_rate = self.sample_rate
+        frame.pts = pts
+        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)
+
+        self._samples_generated += self.samples_per_frame
+        return frame
+
+    def _generate_tone(self) -> Any:
+        """Generate sine wave tone"""
+        t = (np.arange(self.samples_per_frame) + self._samples_generated) / self.sample_rate  # type: ignore
+        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore
+
+    def _generate_noise(self) -> Any:
+        """Generate white noise"""
+        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore
+
+    def _generate_silence(self) -> Any:
+        """Generate silence"""
+        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
+
+
+def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
+    """
+    Create minimal bot tracks with configurable audio/video generation.
+
+    Args:
+        session_name: Name for the session
+        config: Configuration dictionary with options:
+            - visualization: 'ball', 'waveform', 'static'
+            - audio_mode: 'tone', 'noise', 'silence'
+            - width: video width (default 320)
+            - height: video height (default 240)
+            - fps: frames per second (default 15)
+            - sample_rate: audio sample rate (default 48000)
+            - frequency: tone frequency in Hz (default 440)
+            - volume: audio volume 0-1 (default 0.5)
+            - static_color: BGR tuple for static mode (default gray)
+
+    Returns:
+        Dictionary containing 'video' and 'audio' tracks
+    """
+    if config is None:
+        config = {}
+
+    # Set defaults
+    default_config = {  # type: ignore
+        'visualization': 'ball',
+        'audio_mode': 'tone',
+        'width': 320,
+        'height': 240,
+        'fps': 15,
+        'sample_rate': 48000,
+        'samples_per_frame': 960,
+        'frequency': 440.0,
+        'volume': 0.5,
+        'static_color': (128, 128, 128)
+    }
+    default_config.update(config)  # type: ignore
+
+    media_clock = MediaClock()
+
+    video_track = ConfigurableVideoTrack(media_clock, default_config)  # type: ignore
+    audio_track = ConfigurableAudioTrack(media_clock, default_config)  # type: ignore
+
+    logger.info(f"Created minimal bot tracks for {session_name} with config: {default_config}")
+
+    return {"video": video_track, "audio": audio_track}
+
+
+# Agent descriptor exported for dynamic discovery by the FastAPI service
+AGENT_NAME = "Minimal Configurable Bot"
+AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"
+
+def agent_info() -> Dict[str, str]:
+    """Return agent metadata for discovery."""
+    return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION}
+
+def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
+    """Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
+    return create_minimal_bot_tracks(session_name)
\ No newline at end of file
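
As a quick sanity check, the factory can be exercised directly, outside the FastAPI service, by pulling one frame from each track. The sketch below assumes the module is importable as voicebot.bots.minimal; the session name and config overrides are made-up example values, not taken from the service.

# Sketch: pull one frame from each minimal-bot track (illustrative values only).
import asyncio

from voicebot.bots.minimal import create_minimal_bot_tracks  # assumed import path


async def main() -> None:
    # "demo" and the overrides below are hypothetical example values.
    tracks = create_minimal_bot_tracks(
        "demo",
        {"visualization": "waveform", "audio_mode": "tone", "frequency": 220.0},
    )
    video_frame = await tracks["video"].recv()   # av.VideoFrame, bgr24
    audio_frame = await tracks["audio"].recv()   # av.AudioFrame, s16 stereo
    print(video_frame.width, video_frame.height, audio_frame.samples)


asyncio.run(main())

Note that create_agent_tracks, the wrapper the FastAPI service discovers, passes no config, so custom settings are only reachable by calling create_minimal_bot_tracks directly.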
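
One behavioral note: neither track sleeps in recv(), and the shared MediaClock is stored but never consulted, so whatever drives the tracks will receive frames as fast as it polls rather than at the configured fps or real-time audio rate. If wall-clock pacing is intended, one possible approach (an assumption about intended behavior, not part of this diff) is to wait on the clock before returning a timestamp, sketched here for the video track; the audio track could do the same using samples generated divided by sample rate.

# Sketch: pace video frames against the shared MediaClock (hypothetical subclass).
import asyncio
from typing import Tuple

from voicebot.bots.minimal import ConfigurableVideoTrack  # assumed import path


class PacedVideoTrack(ConfigurableVideoTrack):
    async def next_timestamp(self) -> Tuple[int, float]:
        pts = int(self.frame_count * (90000 / self.fps))
        time_base = 1 / 90000
        # Sleep until this frame is due on the shared clock, so recv() yields
        # at roughly self.fps instead of as fast as the sender polls.
        wait = (self.frame_count / self.fps) - self.clock.now()
        if wait > 0:
            await asyncio.sleep(wait)
        return pts, time_base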