From 3674d57b0adfcd12176e3f8928b87b658c19e9cf Mon Sep 17 00:00:00 2001
From: James Ketrenos <james_git@ketrenos.com>
Date: Wed, 3 Sep 2025 11:51:18 -0700
Subject: [PATCH] Adding orchestration

---
 voicebot/bots/__init__.py              |   3 +
 voicebot/{ => bots}/synthetic_media.py |  24 +++-
 voicebot/{ => bots}/whisper.py         |  20 +++
 voicebot/main.py                       | 162 +++++++++++++++++++++++--
 4 files changed, 198 insertions(+), 11 deletions(-)
 create mode 100644 voicebot/bots/__init__.py
 rename voicebot/{ => bots}/synthetic_media.py (94%)
 rename voicebot/{ => bots}/whisper.py (85%)

diff --git a/voicebot/bots/__init__.py b/voicebot/bots/__init__.py
new file mode 100644
index 0000000..8e1c276
--- /dev/null
+++ b/voicebot/bots/__init__.py
@@ -0,0 +1,3 @@
+"""Bots package for discoverable agent modules."""
+
+__all__ = ["synthetic_media", "whisper"]
diff --git a/voicebot/synthetic_media.py b/voicebot/bots/synthetic_media.py
similarity index 94%
rename from voicebot/synthetic_media.py
rename to voicebot/bots/synthetic_media.py
index a69503b..ee4e533 100644
--- a/voicebot/synthetic_media.py
+++ b/voicebot/bots/synthetic_media.py
@@ -1,10 +1,13 @@
 """
-Synthetic Media Tracks Module
+Synthetic Media Tracks Module (bots/synthetic_media)
 
-This module provides synthetic audio and video track creation for WebRTC media streaming.
-Contains AnimatedVideoTrack and SyntheticAudioTrack implementations ported from JavaScript.
+Copied from voicebot/synthetic_media. This module contains the real implementation
+and heavy dependencies and is intended to be imported lazily by the orchestrator
+or runtime controllers.
 """
 
+# ...existing heavy implementation moved here (keeps same content as original file)
+
 import numpy as np
 import math
 import cv2
@@ -232,7 +235,7 @@ class SyntheticAudioTrack(MediaStreamTrack):
     - Top of screen (Y=0): 800Hz (high pitch)
     - Bottom of screen (Y=height): 200Hz (low pitch)
 
-    Bounce events add temporary audio effects on top of the continuous tone.
+    Bounce events add temporary audio on top of the continuous tone.
     """
 
     kind = "audio"
@@ -458,3 +461,16 @@ def create_synthetic_tracks(session_name: str) -> dict[str, MediaStreamTrack]:
     video_track.audio_track = audio_track
 
     return {"video": video_track, "audio": audio_track}
+
+
+# Agent descriptor exported for dynamic discovery by the FastAPI service
+AGENT_NAME = "synthetic_media"
+AGENT_DESCRIPTION = "Synthetic audio and video tracks (AnimatedVideoTrack + SyntheticAudioTrack)"
+
+def agent_info() -> dict[str, str]:
+    """Return agent metadata for discovery."""
+    return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION}
+
+def create_agent_tracks(session_name: str) -> dict[str, MediaStreamTrack]:
+    """Factory wrapper used by the FastAPI service to instantiate tracks for an agent."""
+    return create_synthetic_tracks(session_name)
diff --git a/voicebot/whisper.py b/voicebot/bots/whisper.py
similarity index 85%
rename from voicebot/whisper.py
rename to voicebot/bots/whisper.py
index 71cd167..fd78adb 100644
--- a/voicebot/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -1,8 +1,28 @@
+"""Bots package whisper agent (bots/whisper)
+
+Lightweight agent descriptor; heavy model loading must be done by a controller
+when the agent is actually used.
+"""
+
+from typing import Dict
 from typing import Any
 import librosa
 from logger import logger
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
+
+AGENT_NAME = "whisper"
+AGENT_DESCRIPTION = "Speech recognition agent (Whisper) - processes incoming audio"
+
+
+def agent_info() -> Dict[str, str]:
+    return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION}
+
+
+def create_agent_tracks(session_name: str) -> dict:
+    """Whisper is not a media source - return no local tracks."""
+    return {}
+
 model_ids = {
     "Distil-Whisper": [
         "distil-whisper/distil-large-v2",
diff --git a/voicebot/main.py b/voicebot/main.py
index 68c0f4d..4d88e1d 100644
--- a/voicebot/main.py
+++ b/voicebot/main.py
@@ -2,7 +2,7 @@
 WebRTC Media Agent for Python
 
 This module provides WebRTC signaling server communication and peer connection management.
-Synthetic audio/video track creation is handled by the synthetic_media module.
+Synthetic audio/video track creation is handled by the bots.synthetic_media module.
 """
 
 from __future__ import annotations
@@ -18,6 +18,7 @@ from typing import (
     Protocol,
     AsyncIterator,
     cast,
+    Any,
 )
 
 # test
@@ -57,7 +58,17 @@ from aiortc import (
     MediaStreamTrack,
 )
 from logger import logger
-from synthetic_media import create_synthetic_tracks, AnimatedVideoTrack
+from voicebot.bots.synthetic_media import create_synthetic_tracks, AnimatedVideoTrack
+
+# Bot orchestration imports
+import importlib
+import pkgutil
+import uuid
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import threading
+import uvicorn
+
 
 # import debug_aioice
 
@@ -103,12 +114,16 @@ class WebRTCSignalingClient:
         session_id: str,
         session_name: str,
         insecure: bool = False,
+        create_tracks: Optional[Callable[[str], Dict[str, MediaStreamTrack]]] = None,
     ):
         self.server_url = server_url
         self.lobby_id = lobby_id
         self.session_id = session_id
         self.session_name = session_name
         self.insecure = insecure
+        
+        # Optional factory to create local media tracks for this client (bot provided)
+        self.create_tracks = create_tracks
 
         # WebSocket client protocol instance (typed as object to avoid Any)
         self.websocket: Optional[object] = None
@@ -207,10 +222,19 @@ class WebRTCSignalingClient:
         logger.info("Disconnected from signaling server")
 
     async def _setup_local_media(self):
-        """Create local synthetic media tracks"""
-        # Create synthetic tracks using the new module
-        tracks = create_synthetic_tracks(self.session_name)
-        self.local_tracks.update(tracks)
+        """Create local media tracks"""
+        # If a bot provided a create_tracks callable, use it to create tracks.
+        # Otherwise, use default synthetic tracks.
+        try:
+            if self.create_tracks:
+                tracks = self.create_tracks(self.session_name)
+                self.local_tracks.update(tracks)
+            else:
+                # Default fallback to synthetic tracks
+                tracks = create_synthetic_tracks(self.session_name)
+                self.local_tracks.update(tracks)
+        except Exception:
+            logger.exception("Failed to create local tracks using bot factory")
 
         # Add local peer to peers dict
         local_peer = Peer(
@@ -221,7 +245,7 @@ class WebRTCSignalingClient:
         )
         self.peers[self.session_id] = local_peer
 
-        logger.info("Local synthetic media tracks created")
+        logger.info("Local media tracks created")
 
     async def _send_message(
         self, message_type: str, data: Optional[MessageData] = None
@@ -1103,6 +1127,130 @@ async def main():
         await client.disconnect()
 
 
+# --- FastAPI service for bot discovery and orchestration -------------------
+
+app = FastAPI(title="voicebot-bot-orchestrator")
+
+
+class JoinRequest(BaseModel):
+    lobby_id: str
+    session_id: str
+    nick: str
+    server_url: str
+    insecure: bool = False
+
+
+def discover_bots() -> Dict[str, Dict[str, Any]]:
+    """Discover bot modules under the voicebot.bots package that expose bot_info.
+
+    This intentionally imports modules under `voicebot.bots` so heavy bot
+    implementations can remain in that package and be imported lazily.
+    """
+    bots: Dict[str, Dict[str, Any]] = {}
+    try:
+        package = importlib.import_module("voicebot.bots")
+        package_path = package.__path__
+    except Exception:
+        logger.exception("Failed to import voicebot.bots package")
+        return bots
+
+    for _finder, name, _ispkg in pkgutil.iter_modules(package_path):
+        try:
+            mod = importlib.import_module(f"voicebot.bots.{name}")
+        except Exception:
+            logger.exception("Failed to import voicebot.bots.%s", name)
+            continue
+        info = None
+        create_tracks = None
+        if hasattr(mod, "agent_info") and callable(getattr(mod, "agent_info")):
+            try:
+                info = mod.agent_info()
+                # Note: Keep copy as is to maintain structure
+            except Exception:
+                logger.exception("agent_info() failed for %s", name)
+        if hasattr(mod, "create_agent_tracks") and callable(getattr(mod, "create_agent_tracks")):
+            create_tracks = getattr(mod, "create_agent_tracks")
+
+        if info:
+            bots[info.get("name", name)] = {"module": name, "info": info, "create_tracks": create_tracks}
+    return bots
+
+
+@app.get("/bots")
+def list_bots() -> Dict[str, Any]:
+    bots = discover_bots()
+    return {k: v["info"] for k, v in bots.items()}
+
+
+@app.post("/bots/{bot_name}/join")
+async def bot_join(bot_name: str, req: JoinRequest):
+    bots = discover_bots()
+    bot = bots.get(bot_name)
+    if not bot:
+        raise HTTPException(status_code=404, detail="Bot not found")
+
+    create_tracks = bot.get("create_tracks")
+
+    # Start the WebRTCSignalingClient in a background asyncio task and register it
+    client = WebRTCSignalingClient(
+        server_url=req.server_url,
+        lobby_id=req.lobby_id,
+        session_id=req.session_id,
+        session_name=req.nick,
+        insecure=req.insecure,
+        create_tracks=create_tracks,
+    )
+
+    run_id = str(uuid.uuid4())
+
+    async def run_client():
+        try:
+            registry[run_id] = client
+            await client.connect()
+        except Exception:
+            logger.exception("Bot client failed for run %s", run_id)
+        finally:
+            registry.pop(run_id, None)
+
+    loop = asyncio.get_event_loop()
+    threading.Thread(target=loop.run_until_complete, args=(run_client(),), daemon=True).start()
+
+    return {"status": "started", "bot": bot_name, "run_id": run_id}
+
+
+# Lightweight in-memory registry of running bot clients
+registry: Dict[str, WebRTCSignalingClient] = {}
+
+
+@app.post("/bots/runs/{run_id}/stop")
+async def stop_run(run_id: str):
+    client = registry.get(run_id)
+    if not client:
+        raise HTTPException(status_code=404, detail="Run not found")
+    try:
+        await client.disconnect()
+    except Exception:
+        logger.exception("Failed to stop run %s", run_id)
+        raise HTTPException(status_code=500, detail="Failed to stop run")
+    registry.pop(run_id, None)
+    return {"status": "stopped", "run_id": run_id}
+
+
+@app.get("/bots/runs")
+def list_runs() -> Dict[str, Any]:
+    return {
+        "runs": [
+            {"run_id": run_id, "session_id": client.session_id, "session_name": client.session_name}
+            for run_id, client in registry.items()
+        ]
+    }
+
+
+def start_bot_api(host: str = "0.0.0.0", port: int = 8788):
+    """Start the bot orchestration API server"""
+    uvicorn.run(app, host=host, port=port)
+
+
 if __name__ == "__main__":
     # Install required packages:
     # pip install aiortc websockets opencv-python numpy