Added minimal reference example
This commit is contained in:
parent
90c3c6e19b
commit
b319776c99
258
voicebot/bots/minimal.py
Normal file
258
voicebot/bots/minimal.py
Normal file
@ -0,0 +1,258 @@
|
|||||||
|
"""
|
||||||
|
Minimal Bot - Reference Example
|
||||||
|
|
||||||
|
A minimal bot that consumes and generates audio/video with configurable visualizations and audio creation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
import fractions
import time
from typing import Dict, Any, Optional, Tuple

import cv2
import numpy as np
from aiortc import MediaStreamTrack
from av import VideoFrame
from av.audio.frame import AudioFrame

from shared.logger import logger
|
||||||
|
|
||||||
|
|
||||||
|
class MediaClock:
    """Stopwatch shared by media tracks so they measure time from one origin.

    The origin is captured when the clock is constructed; ``now()`` reports
    how many seconds have elapsed since then.
    """

    def __init__(self):
        # perf_counter() is used for both the origin and every reading, so
        # the difference between the two is a plain elapsed-seconds value.
        self.t0 = time.perf_counter()

    def now(self) -> float:
        """Return the seconds elapsed since this clock was created."""
        elapsed = time.perf_counter() - self.t0
        return elapsed
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigurableVideoTrack(MediaStreamTrack):
    """Synthetic video track whose picture is chosen by ``config['visualization']``.

    Supported modes are ``'ball'`` (bouncing green ball with a frame counter),
    ``'waveform'`` (moving sine wave) and ``'static'`` (solid colour). Any
    other value falls back to ``'ball'``. Frames are built as ``bgr24`` arrays
    and released at ``fps`` frames per second, paced against the shared clock.
    """

    kind = "video"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable video track.

        Args:
            clock: Media clock used to pace frame delivery
            config: Configuration dictionary for video settings. Keys read
                here: ``width``, ``height``, ``fps``, ``visualization``;
                ``static_color`` is read when a static frame is generated.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.width = config.get('width', 320)
        self.height = config.get('height', 240)
        self.fps = config.get('fps', 15)
        self.mode = config.get('visualization', 'ball')
        self.frame_count = 0
        self._start_time = time.time()
        # Clock reading at the first delivered frame; None until recv() runs.
        self._first_frame_at: Optional[float] = None

        # Ball state is initialised for every mode: recv() falls back to the
        # ball renderer for unrecognised modes, which would otherwise fail
        # with AttributeError on the first frame.
        self.ball_x = self.width // 2
        self.ball_y = self.height // 2
        self.ball_dx = 2
        self.ball_dy = 2
        self.ball_radius = 20

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return ``(pts, time_base)`` for the frame about to be produced.

        ``pts`` counts ticks of a 90 kHz clock and ``time_base`` is the
        duration of one tick in seconds.
        """
        pts = int(self.frame_count * (90000 / self.fps))
        time_base = 1 / 90000
        return pts, time_base

    async def _wait_for_frame_slot(self) -> None:
        """Sleep until the next frame is due according to the shared clock."""
        now = self.clock.now()
        if self._first_frame_at is None:
            # Anchor the schedule at the first delivered frame so time that
            # passed before the consumer started pulling is not made up for
            # with a burst of frames.
            self._first_frame_at = now
            return
        delay = self._first_frame_at + self.frame_count / self.fps - now
        if delay > 0:
            await asyncio.sleep(delay)

    async def recv(self) -> VideoFrame:
        """Produce the next video frame, waiting until it is due.

        Without the wait, a consumer that awaits ``recv()`` in a loop would
        receive frames at an unbounded rate rather than at ``fps``.
        """
        await self._wait_for_frame_slot()
        pts, time_base = await self.next_timestamp()

        # Create frame based on mode; anything unrecognised renders the ball.
        if self.mode == 'waveform':
            frame_array = self._generate_waveform_frame()
        elif self.mode == 'static':
            frame_array = self._generate_static_frame()
        else:
            frame_array = self._generate_ball_frame()

        frame = VideoFrame.from_ndarray(frame_array, format="bgr24")  # type: ignore
        frame.pts = pts
        # time_base arrives as a float; recover the exact rational 1/90000.
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)

        self.frame_count += 1
        return frame

    def _generate_ball_frame(self) -> Any:
        """Generate bouncing ball visualization"""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Update ball position
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy

        # Bounce off walls: reverse direction once the ball's edge reaches a border
        if self.ball_x <= self.ball_radius or self.ball_x >= self.width - self.ball_radius:
            self.ball_dx = -self.ball_dx
        if self.ball_y <= self.ball_radius or self.ball_y >= self.height - self.ball_radius:
            self.ball_dy = -self.ball_dy

        # Draw ball (thickness -1 fills the circle)
        cv2.circle(frame, (int(self.ball_x), int(self.ball_y)), self.ball_radius, (0, 255, 0), -1)

        # Add frame counter overlay
        timestamp = f"Frame: {self.frame_count}"
        cv2.putText(frame, timestamp, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        return frame

    def _generate_waveform_frame(self) -> Any:
        """Generate waveform visualization"""
        frame = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Two full sine periods across the frame width, phase-shifted per frame.
        x = np.linspace(0, 4 * np.pi, self.width)
        # Scale by an integer amplitude, then centre vertically. The amplitude
        # is parenthesised explicitly: `sin * height // 4` would floor-divide
        # the float array instead of scaling by a quarter of the height.
        amplitude = self.height // 4
        y = np.sin(x + self.frame_count * 0.1) * amplitude + self.height // 2

        # Draw the whole curve with one polyline call instead of one
        # cv2.line call per pixel column.
        if self.width > 1:
            points = np.column_stack((np.arange(self.width), y)).astype(np.int32)
            cv2.polylines(frame, [points.reshape(-1, 1, 2)], False, (255, 255, 255), 2)

        return frame

    def _generate_static_frame(self) -> Any:
        """Generate static color frame.

        The configured ``static_color`` fills the three channels in array
        order; because the frame is emitted as ``bgr24`` the tuple is
        therefore interpreted as (blue, green, red).
        """
        color = self.config.get('static_color', (128, 128, 128))
        frame = np.full((self.height, self.width, 3), color, dtype=np.uint8)
        return frame
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigurableAudioTrack(MediaStreamTrack):
    """Synthetic audio track whose signal is chosen by ``config['audio_mode']``.

    Supported modes are ``'tone'`` (sine wave), ``'noise'`` (uniform white
    noise) and ``'silence'``. Any other value falls back to ``'tone'``. Each
    frame carries ``samples_per_frame`` samples per channel as packed 16-bit
    stereo, released in real time against the shared clock.
    """

    kind = "audio"

    def __init__(self, clock: MediaClock, config: Dict[str, Any]):
        """Initialize the configurable audio track.

        Args:
            clock: Media clock used to pace frame delivery
            config: Configuration dictionary for audio settings. Keys read:
                ``sample_rate``, ``samples_per_frame``, ``audio_mode``,
                ``frequency``, ``volume``.
        """
        super().__init__()
        self.clock = clock
        self.config = config
        self.sample_rate = config.get('sample_rate', 48000)
        self.samples_per_frame = config.get('samples_per_frame', 960)
        self.mode = config.get('audio_mode', 'tone')
        self.frequency = config.get('frequency', 440.0)
        self.volume = config.get('volume', 0.5)
        self._samples_generated = 0
        # Clock reading at the first delivered frame; None until recv() runs.
        self._first_frame_at: Optional[float] = None

    async def next_timestamp(self) -> Tuple[int, float]:
        """Return ``(pts, time_base)`` for the frame about to be produced.

        ``pts`` is the number of samples emitted so far and ``time_base`` is
        the duration of one sample in seconds.
        """
        pts = self._samples_generated
        time_base = 1 / self.sample_rate
        return pts, time_base

    async def _wait_for_frame_slot(self) -> None:
        """Sleep until the next audio frame is due according to the shared clock."""
        now = self.clock.now()
        if self._first_frame_at is None:
            # Anchor the schedule at the first delivered frame so time that
            # passed before the consumer started pulling is not made up for
            # with a burst of frames.
            self._first_frame_at = now
            return
        delay = self._first_frame_at + self._samples_generated / self.sample_rate - now
        if delay > 0:
            await asyncio.sleep(delay)

    async def recv(self) -> AudioFrame:
        """Produce the next audio frame, waiting until it is due.

        Without the wait, a consumer that awaits ``recv()`` in a loop would
        receive audio faster than real time.
        """
        await self._wait_for_frame_slot()
        pts, time_base = await self.next_timestamp()

        # Generate audio based on mode; anything unrecognised plays the tone.
        if self.mode == 'noise':
            samples = self._generate_noise()
        elif self.mode == 'silence':
            samples = self._generate_silence()
        else:
            samples = self._generate_tone()

        # Apply volume and clamp to [-1, 1] before scaling: a volume above
        # 1.0 would otherwise overflow int16 and wrap around.
        scaled = np.clip(samples * self.volume, -1.0, 1.0)
        mono = (scaled * 32767).astype(np.int16)

        # Both channels carry the same signal, so repeating every sample
        # twice yields the interleaved L, R, L, R, ... layout.
        stereo = np.repeat(mono, 2).reshape(1, -1)  # type: ignore

        frame = AudioFrame.from_ndarray(stereo, format="s16", layout="stereo")  # type: ignore
        frame.sample_rate = self.sample_rate
        frame.pts = pts
        # time_base arrives as a float; recover the exact rational 1/sample_rate.
        frame.time_base = fractions.Fraction(time_base).limit_denominator(1000000)

        self._samples_generated += self.samples_per_frame
        return frame

    def _generate_tone(self) -> Any:
        """Generate sine wave tone"""
        # Offsetting by the running sample count keeps the phase continuous
        # from one frame to the next.
        t = (np.arange(self.samples_per_frame) + self._samples_generated) / self.sample_rate  # type: ignore
        return np.sin(2 * np.pi * self.frequency * t).astype(np.float32)  # type: ignore

    def _generate_noise(self) -> Any:
        """Generate white noise"""
        return np.random.uniform(-1, 1, self.samples_per_frame).astype(np.float32)  # type: ignore

    def _generate_silence(self) -> Any:
        """Generate silence"""
        return np.zeros(self.samples_per_frame, dtype=np.float32)  # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def create_minimal_bot_tracks(session_name: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, MediaStreamTrack]:
    """
    Create minimal bot tracks with configurable audio/video generation.

    Args:
        session_name: Name for the session (used in the log message only)
        config: Configuration dictionary with options:
            - visualization: 'ball', 'waveform', 'static'
            - audio_mode: 'tone', 'noise', 'silence'
            - width: video width (default 320)
            - height: video height (default 240)
            - fps: frames per second (default 15)
            - sample_rate: audio sample rate (default 48000)
            - samples_per_frame: samples per channel in each audio frame (default 960)
            - frequency: tone frequency in Hz (default 440)
            - volume: audio volume 0-1 (default 0.5)
            - static_color: colour tuple for static mode in (blue, green, red)
              order, because video frames are built as bgr24 (default gray)

    Returns:
        Dictionary containing 'video' and 'audio' tracks
    """
    # Start from the defaults and overlay whatever the caller supplied; the
    # caller's dictionary itself is never modified.
    merged_config: Dict[str, Any] = {
        'visualization': 'ball',
        'audio_mode': 'tone',
        'width': 320,
        'height': 240,
        'fps': 15,
        'sample_rate': 48000,
        'samples_per_frame': 960,
        'frequency': 440.0,
        'volume': 0.5,
        'static_color': (128, 128, 128),
    }
    if config is not None:
        merged_config.update(config)

    # Both tracks receive the same clock instance and the same merged config.
    media_clock = MediaClock()
    video_track = ConfigurableVideoTrack(media_clock, merged_config)
    audio_track = ConfigurableAudioTrack(media_clock, merged_config)

    logger.info(f"Created minimal bot tracks for {session_name} with config: {merged_config}")

    return {"video": video_track, "audio": audio_track}
|
||||||
|
|
||||||
|
|
||||||
|
# Agent descriptor exported for dynamic discovery by the FastAPI service
AGENT_NAME = "Minimal Configurable Bot"
AGENT_DESCRIPTION = "Minimal bot with configurable audio/video generation modes"


def agent_info() -> Dict[str, str]:
    """Describe this agent as a mapping with ``name`` and ``description`` keys."""
    info: Dict[str, str] = dict(name=AGENT_NAME, description=AGENT_DESCRIPTION)
    return info
|
||||||
|
|
||||||
|
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
    """Build this agent's media tracks for ``session_name`` using the default configuration.

    This is the factory entry point used by the FastAPI service; it simply
    delegates to ``create_minimal_bot_tracks`` without a custom config.
    """
    tracks = create_minimal_bot_tracks(session_name)
    return tracks
|
Loading…
x
Reference in New Issue
Block a user