Added minimal reference example
This commit is contained in:
parent
90c3c6e19b
commit
b319776c99
258
voicebot/bots/minimal.py
Normal file
258
voicebot/bots/minimal.py
Normal file
@ -0,0 +1,258 @@
|
||||
"""
|
||||
Minimal Bot - Reference Example
|
||||
|
||||
A minimal bot that consumes and generates audio/video with configurable visualizations and audio creation.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import fractions
|
||||
import time
|
||||
from av.audio.frame import AudioFrame
|
||||
from av import VideoFrame
|
||||
from aiortc import MediaStreamTrack
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from shared.logger import logger
|
||||
|
||||
|
||||
class MediaClock:
    """Monotonic clock shared by audio/video tracks for synchronization."""

    def __init__(self):
        # perf_counter is monotonic, so elapsed time can never jump backwards.
        self.t0 = time.perf_counter()

    def now(self) -> float:
        """Seconds elapsed since this clock was created."""
        elapsed = time.perf_counter() - self.t0
        return elapsed
|
||||
|
||||
|
||||
class ConfigurableVideoTrack(MediaStreamTrack):
    """Configurable video track with different visualization modes.

    Supported modes (config key 'visualization'):
      - 'ball':     bouncing ball with a frame-counter overlay
      - 'waveform': animated sine waveform
      - 'static':   solid color frame
    Any unknown mode falls back to 'ball'.
    """

    kind = "video"

    # 90 kHz is the standard RTP clock rate for video timestamps.
    _CLOCK_RATE = 90000

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable video track.

        Args:
            clock: Media clock for synchronization (kept for API symmetry;
                pts is currently derived from the frame counter, not the clock)
            config: Configuration dictionary for video settings
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.width = config.get('width', 320)
        self.height = config.get('height', 240)
        self.fps = config.get('fps', 15)
        self.mode = config.get('visualization', 'ball')
        self.frame_count = 0
        self._start_time = time.time()  # wall-clock creation time (diagnostic only)

        # Mode-specific state
        if self.mode == 'ball':
            self.ball_x = self.width // 2
            self.ball_y = self.height // 2
            self.ball_dx = 2
            self.ball_dy = 2
            # Bound the radius so the ball always fits inside the frame,
            # even for very small configured resolutions.
            self.ball_radius = min(20, max(1, min(self.width, self.height) // 2 - 1))

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return (pts, time_base) for the next frame on the 90 kHz clock."""
        pts = int(self.frame_count * (self._CLOCK_RATE / self.fps))
        time_base = 1 / self._CLOCK_RATE
        return pts, time_base

    async def recv(self) -> VideoFrame:
        """Produce the next frame according to the configured visualization mode.

        Returns:
            VideoFrame in bgr24 format with pts/time_base set on the
            90 kHz clock.
        """
        pts, _ = await self.next_timestamp()

        # Create frame based on mode; unknown modes default to the ball.
        if self.mode == 'ball':
            frame_array = self._generate_ball_frame()
        elif self.mode == 'waveform':
            frame_array = self._generate_waveform_frame()
        elif self.mode == 'static':
            frame_array = self._generate_static_frame()
        else:
            frame_array = self._generate_ball_frame()  # default

        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
        frame.pts = pts
        # Use the exact rational time base instead of round-tripping the
        # float 1/90000 through limit_denominator().
        frame.time_base = fractions.Fraction(1, self._CLOCK_RATE)

        self.frame_count += 1
        return frame

    def _generate_ball_frame(self) -> Any:
        """Generate bouncing ball visualization."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Update ball position
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy

        # Bounce off walls, and clamp the position so the ball can never
        # leave the frame (flipping direction alone could overshoot when
        # the velocity exceeded the remaining margin).
        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
            self.ball_dx = -self.ball_dx
            self.ball_x = min(max(self.ball_x, self.ball_radius), self.width - self.ball_radius)
        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
            self.ball_dy = -self.ball_dy
            self.ball_y = min(max(self.ball_y, self.ball_radius), self.height - self.ball_radius)

        # Draw ball
        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)

        # Overlay the frame counter for debugging.
        timestamp = f"Frame: {self.frame_count}"
        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        return frame

    def _generate_waveform_frame(self) -> Any:
        """Generate waveform visualization (sine wave scrolling with frame count)."""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Generate sine wave; the phase advances with the frame counter so
        # the wave appears to scroll over time.
        x = np.linspace(0, 4 * np.pi, self.width)
        y = np.sin(x + self.frame_count * 0.1) * self.height // 4 + self.height // 2

        # Draw waveform as connected line segments.
        for i in range(1, len(y)):
            cv2.line(frame, (i - 1, int(y[i - 1])), (i, int(y[i])), (255, 255, 255), 2)

        return frame

    def _generate_static_frame(self) -> Any:
        """Generate a solid color frame.

        NOTE(review): frames are emitted as bgr24, so 'static_color' is
        interpreted in BGR channel order even though the factory docstring
        calls it RGB — confirm the intended order.
        """
        color = self.config.get('static_color', (128, 128, 128))
        frame = np.full((self.height, self.width, 3), color, dtype=np.uint8)
        return frame
|
||||
|
||||
|
||||
class ConfigurableAudioTrack(MediaStreamTrack):
    """Audio track whose samples are synthesized on the fly.

    The generation mode is chosen via the 'audio_mode' config key:
    'tone' (sine wave), 'noise' (white noise) or 'silence'. Any other
    value falls back to 'tone'.
    """

    kind = "audio"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable audio track.

        Args:
            clock: Media clock for synchronization
            config: Configuration dictionary for audio settings
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.sample_rate = config.get('sample_rate', 48000)
        self.samples_per_frame = config.get('samples_per_frame', 960)
        self.mode = config.get('audio_mode', 'tone')
        self.frequency = config.get('frequency', 440.0)
        self.volume = config.get('volume', 0.5)
        self._samples_generated = 0

    async def next_timestamp(self) -> Tuple[int, float]:
        """pts counts samples emitted so far; time_base is one sample period."""
        return self._samples_generated, 1 / self.sample_rate

    async def recv(self) -> AudioFrame:
        """Synthesize and return the next stereo s16 audio frame."""
        pts, time_base = await self.next_timestamp()

        # Dispatch on the configured mode; unknown modes produce a tone.
        producers = {
            'tone': self._generate_tone,
            'noise': self._generate_noise,
            'silence': self._generate_silence,
        }
        mono = producers.get(self.mode, self._generate_tone)()

        # Scale float samples in [-1, 1] to signed 16-bit PCM.
        channel = (mono * self.volume * 32767).astype(np.int16)

        # Duplicate the mono signal into interleaved L/R channels.
        interleaved = np.empty(self.samples_per_frame * 2, dtype=np.int16)  # type: ignore
        interleaved[0::2] = channel
        interleaved[1::2] = channel

        stereo = interleaved.reshape(1, -1)  # type: ignore

        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)

        self._samples_generated += self.samples_per_frame
        return frame

    def _generate_tone(self) -> Any:
        """Sine wave; phase-continuous across frames via the running sample count."""
        sample_index = np.arange(self.samples_per_frame) + self._samples_generated  # type: ignore
        t = sample_index / self.sample_rate
        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore

    def _generate_noise(self) -> Any:
        """Uniform white noise."""
        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore

    def _generate_silence(self) -> Any:
        """All-zero samples."""
        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
|
||||
|
||||
|
||||
def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Create minimal bot tracks with configurable audio/video generation.

    Args:
        session_name: Name for the session
        config: Configuration dictionary with options:
            - visualization: 'ball', 'waveform', 'static'
            - audio_mode: 'tone', 'noise', 'silence'
            - width: video width (default 320)
            - height: video height (default 240)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 48000)
            - frequency: tone frequency in Hz (default 440)
            - volume: audio volume 0-1 (default 0.5)
            - static_color: RGB tuple for static mode (default gray)

    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    # Start from the documented defaults; caller-supplied values win.
    effective = {  # type: ignore
        'visualization': 'ball',
        'audio_mode': 'tone',
        'width': 320,
        'height': 240,
        'fps': 15,
        'sample_rate': 48000,
        'samples_per_frame': 960,
        'frequency': 440.0,
        'volume': 0.5,
        'static_color': (128, 128, 128)
    }
    effective.update(config or {})  # type: ignore

    # One clock shared by both tracks keeps audio and video in sync.
    shared_clock = MediaClock()

    tracks = {
        "video": ConfigurableVideoTrack(shared_clock, effective),  # type: ignore
        "audio": ConfigurableAudioTrack(shared_clock, effective),  # type: ignore
    }

    logger.info(f"Created minimal bot tracks for {session_name} with config: {effective}")

    return tracks
|
||||
|
||||
|
||||
# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "Minimal Configurable Bot"
AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"


def agent_info() -> Dict[str, str]:
    """Return agent metadata for discovery."""
    metadata = {"name": AGENT_NAME, "description": AGENT_DESCRIPTION}
    return metadata
|
||||
|
||||
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
    """Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
    # Delegate to the configurable factory with its default configuration.
    tracks = create_minimal_bot_tracks(session_name)
    return tracks
|
Loading…
x
Reference in New Issue
Block a user