# NOTE(review): stray extraction metadata ("386 lines / 13 KiB / Python")
# removed — it was not valid Python and broke the module at import time.

"""
Minimal Bot - Reference Example
A minimal bot that consumes and generates audio/video with configurable visualizations and audio creation.
"""
import numpy as np
import cv2
import fractions
import time
from av.audio.frame import AudioFrame
from av import VideoFrame
from aiortc import MediaStreamTrack
from typing import Dict, Any, Optional, Tuple
from shared.logger import logger
class MediaClock:
    """Shared clock for media synchronization.

    Anchored at construction; `now()` reports elapsed seconds since then,
    using a monotonic performance counter.
    """

    def __init__(self):
        # Capture the origin once; all subsequent readings are relative to it.
        self.t0 = time.perf_counter()

    def now(self) -> float:
        """Return seconds elapsed since this clock was created."""
        current = time.perf_counter()
        return current - self.t0
class ConfigurableVideoTrack(MediaStreamTrack):
    """Configurable video track with different visualization modes.

    Supported modes (config key 'visualization'): 'ball', 'waveform',
    'static'. Any unknown mode falls back to the bouncing ball.
    """

    kind = "video"

    # 90 kHz is the standard RTP clock rate for video.
    _CLOCK_RATE = 90000

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable video track.

        Args:
            clock: Media clock for synchronization (kept for API symmetry
                with the audio track; not read by the frame generators).
            config: Configuration dictionary for video settings.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.width = config.get('width', 320)
        self.height = config.get('height', 240)
        self.fps = config.get('fps', 15)
        self.mode = config.get('visualization', 'ball')
        self.frame_count = 0
        self._start_time = time.time()  # creation wall-clock time; not used internally
        # BUGFIX: initialize ball state unconditionally. recv() falls back to
        # the ball renderer for unknown modes, so leaving these attributes
        # unset when mode != 'ball' used to raise AttributeError there.
        self.ball_x = self.width // 2
        self.ball_y = self.height // 2
        self.ball_dx = 2
        self.ball_dy = 2
        self.ball_radius = 20

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return (pts, time_base) for the next frame on the 90 kHz clock."""
        pts = int(self.frame_count * (self._CLOCK_RATE / self.fps))
        time_base = 1 / self._CLOCK_RATE
        return pts, time_base

    async def recv(self) -> VideoFrame:
        """Produce the next video frame according to the configured mode."""
        pts, _ = await self.next_timestamp()
        # Dispatch table instead of an if/elif chain; unknown modes -> ball.
        generators = {
            'ball': self._generate_ball_frame,
            'waveform': self._generate_waveform_frame,
            'static': self._generate_static_frame,
        }
        frame_array = generators.get(self.mode, self._generate_ball_frame)()
        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
        frame.pts = pts
        # Exact rational time base; avoids round-tripping 1/90000 through a
        # float and then rationalizing it with limit_denominator().
        frame.time_base = fractions.Fraction(1, self._CLOCK_RATE)
        self.frame_count += 1
        return frame

    def _generate_ball_frame(self) -> Any:
        """Generate a bouncing-ball visualization frame (BGR uint8)."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        # Advance the ball, then reverse direction on wall contact.
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy
        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
            self.ball_dx = -self.ball_dx
        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
            self.ball_dy = -self.ball_dy
        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)
        # Overlay the frame counter for visual debugging.
        timestamp = f"Frame: {self.frame_count}"
        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        return frame

    def _generate_waveform_frame(self) -> Any:
        """Generate an animated sine-wave visualization frame (BGR uint8)."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        # Phase advances with frame_count so the wave scrolls over time.
        # Amplitude is a quarter of the frame height, centered vertically.
        x = np.linspace(0, 4 * np.pi, self.width)
        y = np.sin(x + self.frame_count * 0.1) * (self.height // 4) + self.height // 2
        for i in range(1, len(y)):
            cv2.line(frame, (i - 1, int(y[i - 1])), (i, int(y[i])), (255, 255, 255), 2)
        return frame

    def _generate_static_frame(self) -> Any:
        """Generate a solid-color frame; color taken from config 'static_color'."""
        color = self.config.get('static_color', (128, 128, 128))
        return np.full((self.height, self.width, 3), color, dtype=np.uint8)
class ConfigurableAudioTrack(MediaStreamTrack):
    """Configurable audio track with different audio generation modes.

    Supported modes (config key 'audio_mode'): 'tone', 'noise', 'silence'.
    Any unknown mode falls back to the tone generator. Output is signed
    16-bit interleaved stereo (the mono signal duplicated to both channels).
    """

    kind = "audio"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable audio track.

        Args:
            clock: Media clock for synchronization (kept for API symmetry;
                not read by the sample generators).
            config: Configuration dictionary for audio settings.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.sample_rate = config.get('sample_rate', 48000)
        self.samples_per_frame = config.get('samples_per_frame', 960)
        self.mode = config.get('audio_mode', 'tone')
        self.frequency = config.get('frequency', 440.0)
        self.volume = config.get('volume', 0.5)
        # Running sample counter: doubles as the pts and as the phase offset
        # for the tone generator, keeping the sine continuous across frames.
        self._samples_generated = 0

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return (pts, time_base) with pts counted in audio samples."""
        pts = self._samples_generated
        time_base = 1 / self.sample_rate
        return pts, time_base

    async def recv(self) -> AudioFrame:
        """Produce the next audio frame according to the configured mode."""
        pts, _ = await self.next_timestamp()
        # Dispatch table instead of an if/elif chain; unknown modes -> tone.
        generators = {
            'tone': self._generate_tone,
            'noise': self._generate_noise,
            'silence': self._generate_silence,
        }
        samples = generators.get(self.mode, self._generate_tone)()
        # Scale to int16, clipping first so a volume > 1.0 (or any generator
        # excursion beyond [-1, 1]) saturates instead of integer-wrapping.
        scaled = np.clip(samples * self.volume, -1.0, 1.0)
        left = (scaled * 32767).astype(np.int16)
        right = left.copy()
        # Interleave L/R into a single packed-stereo row: LRLRLR...
        interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)  # type: ignore
        interleaved[0::2] = left
        interleaved[1::2] = right
        stereo = interleaved.reshape(1, -1)  # type: ignore
        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        # Exact rational time base; avoids round-tripping through a float.
        frame.time_base = fractions.Fraction(1, self.sample_rate)
        self._samples_generated += self.samples_per_frame
        return frame

    def _generate_tone(self) -> Any:
        """Generate one frame of a sine tone, phase-continuous across frames."""
        t = (np.arange(self.samples_per_frame) + self._samples_generated) / self.sample_rate  # type: ignore
        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore

    def _generate_noise(self) -> Any:
        """Generate one frame of uniform white noise in [-1, 1)."""
        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore

    def _generate_silence(self) -> Any:
        """Generate one frame of silence (all zeros)."""
        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Create minimal bot tracks with configurable audio/video generation.

    Args:
        session_name: Name for the session
        config: Configuration dictionary with options:
            - visualization: 'ball', 'waveform', 'static'
            - audio_mode: 'tone', 'noise', 'silence'
            - width: video width (default 320)
            - height: video height (default 240)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 48000)
            - frequency: tone frequency in Hz (default 440)
            - volume: audio volume 0-1 (default 0.5)
            - static_color: RGB tuple for static mode (default gray)

    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    # Start from the full set of defaults, then overlay caller-supplied
    # values so anything not specified keeps its default.
    merged_config: Dict[str, Any] = {
        'visualization': 'ball',
        'audio_mode': 'tone',
        'width': 320,
        'height': 240,
        'fps': 15,
        'sample_rate': 48000,
        'samples_per_frame': 960,
        'frequency': 440.0,
        'volume': 0.5,
        'static_color': (128, 128, 128),
    }
    if config is not None:
        merged_config.update(config)

    # Both tracks share one clock so audio and video stay in sync.
    shared_clock = MediaClock()
    tracks: Dict[str, MediaStreamTrack] = {
        "video": ConfigurableVideoTrack(shared_clock, merged_config),  # type: ignore
        "audio": ConfigurableAudioTrack(shared_clock, merged_config),  # type: ignore
    }
    logger.info(f"Created minimal bot tracks for {session_name} with config: {merged_config}")
    return tracks
# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "Minimal Configurable Bot"
AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"


def agent_info() -> Dict[str, str]:
    """Return agent metadata for discovery."""
    # All values are strings (including the boolean flags) so the payload
    # serializes uniformly for the discovery endpoint.
    metadata: Dict[str, str] = {
        "name": AGENT_NAME,
        "description": AGENT_DESCRIPTION,
    }
    metadata["has_media"] = "true"
    metadata["configurable"] = "true"
    return metadata
def get_config_schema() -> Dict[str, Any]:
    """Get the configuration schema for the Minimal Configurable Bot.

    Returns a schema that defines all configurable parameters for the bot,
    allowing frontend applications to dynamically generate configuration UIs.
    """

    def _numeric(name: str, label: str, description: str,
                 default: Any, lo: Any, hi: Any, step: Any) -> Dict[str, Any]:
        # Shared shape for every "number"-typed parameter entry.
        return {
            "name": name,
            "type": "number",
            "label": label,
            "description": description,
            "default_value": default,
            "required": False,
            "min_value": lo,
            "max_value": hi,
            "step": step,
        }

    visualization_param: Dict[str, Any] = {
        "name": "visualization",
        "type": "select",
        "label": "Video Visualization Mode",
        "description": "Choose the type of video visualization to display",
        "default_value": "ball",
        "required": True,
        "options": [
            {"value": "ball", "label": "Bouncing Ball"},
            {"value": "waveform", "label": "Sine Wave Animation"},
            {"value": "static", "label": "Static Color Frame"},
        ],
    }

    audio_mode_param: Dict[str, Any] = {
        "name": "audio_mode",
        "type": "select",
        "label": "Audio Generation Mode",
        "description": "Choose the type of audio to generate",
        "default_value": "tone",
        "required": True,
        "options": [
            {"value": "tone", "label": "Sine Wave Tone"},
            {"value": "noise", "label": "White Noise"},
            {"value": "silence", "label": "Silence"},
        ],
    }

    volume_param: Dict[str, Any] = {
        "name": "volume",
        "type": "range",
        "label": "Audio Volume",
        "description": "Volume level (0.0 to 1.0)",
        "default_value": 0.5,
        "required": False,
        "min_value": 0.0,
        "max_value": 1.0,
        "step": 0.1,
    }

    static_color_param: Dict[str, Any] = {
        "name": "static_color",
        "type": "string",
        "label": "Static Color (RGB)",
        "description": "RGB color tuple for static mode (e.g., '128,128,128')",
        "default_value": "128,128,128",
        "required": False,
        "pattern": r"^\d{1,3},\d{1,3},\d{1,3}$",
        "max_length": 11,
    }

    # Parameter order matters to UI generators: selects first, then the
    # numeric settings, then the range slider and the advanced string field.
    parameters = [
        visualization_param,
        audio_mode_param,
        _numeric("width", "Video Width",
                 "Width of the video frame in pixels", 320, 160, 1920, 1),
        _numeric("height", "Video Height",
                 "Height of the video frame in pixels", 240, 120, 1080, 1),
        _numeric("fps", "Frames Per Second",
                 "Video frame rate", 15, 1, 60, 1),
        _numeric("sample_rate", "Audio Sample Rate",
                 "Audio sample rate in Hz", 48000, 8000, 96000, 1000),
        _numeric("frequency", "Audio Frequency (Hz)",
                 "Frequency of the generated tone in Hz", 440.0, 20.0, 20000.0, 1.0),
        volume_param,
        static_color_param,
    ]

    return {
        "bot_name": AGENT_NAME,
        "version": "1.0",
        "parameters": parameters,
        "categories": [
            {"Video Settings": ["visualization", "width", "height", "fps"]},
            {"Audio Settings": ["audio_mode", "sample_rate", "frequency", "volume"]},
            {"Advanced": ["static_color"]},
        ],
    }