diff --git a/client/src/BotManager.tsx b/client/src/BotManager.tsx index 65f3f87..4de6601 100644 --- a/client/src/BotManager.tsx +++ b/client/src/BotManager.tsx @@ -76,6 +76,8 @@ const BotManager: React.FC = ({ lobbyId, onBotAdded, sx }) => { if (!selectedBot) return; setAddingBot(true); + setError(""); // Clear any previous errors + try { const request: BotJoinLobbyRequest = { lobby_id: lobbyId, @@ -83,9 +85,30 @@ const BotManager: React.FC = ({ lobbyId, onBotAdded, sx }) => { provider_id: providers[selectedBot], }; - const response = await botsApi.requestJoinLobby(selectedBot, request); + // Retry logic for handling service restart scenarios + let retries = 3; + let response; + + while (retries > 0) { + try { + response = await botsApi.requestJoinLobby(selectedBot, request); + break; // Success, exit retry loop + } catch (err: any) { + retries--; + + // If it's a 404 error and we have retries left, wait and retry + if (err?.status === 404 && retries > 0) { + console.log(`Bot join failed with 404, retrying... (${retries} attempts left)`); + await new Promise(resolve => setTimeout(resolve, 1000)); // Wait 1 second + continue; + } + + // If it's not a 404 or we're out of retries, throw the error + throw err; + } + } - if (response.status === "requested") { + if (response && response.status === "requested") { setAddDialogOpen(false); setSelectedBot(""); setBotNick(""); @@ -95,7 +118,11 @@ const BotManager: React.FC = ({ lobbyId, onBotAdded, sx }) => { } } catch (err) { console.error("Failed to add bot:", err); - setError("Failed to add bot to lobby"); + if (err instanceof Error) { + setError(`Failed to add bot: ${err.message}`); + } else { + setError("Failed to add bot to lobby"); + } } finally { setAddingBot(false); } diff --git a/server/core/bot_manager.py b/server/core/bot_manager.py index 77d2592..b26b7f8 100644 --- a/server/core/bot_manager.py +++ b/server/core/bot_manager.py @@ -5,6 +5,8 @@ import uuid import secrets import threading import httpx +import json +import asyncio from typing import Dict, List, Optional from pydantic import ValidationError from logger import logger @@ -75,11 +77,107 @@ class BotManager: str, BotInstanceModel ] = {} # bot_instance_id -> BotInstanceModel self.lock = threading.RLock() + self.bot_providers_file = "bot_providers.json" + self.cleanup_task: Optional["asyncio.Task[None]"] = None + self._shutdown_event = asyncio.Event() # Check if provider authentication is enabled allowed_providers = BotProviderConfig.get_allowed_providers() if not allowed_providers: logger.warning("Bot provider authentication disabled. Any provider can register.") + + # Load persisted bot providers + self._load_bot_providers() + + # Note: Don't start cleanup task here - will be started when needed + + def start_cleanup(self): + """Start the cleanup task""" + try: + if self.cleanup_task is None: + self.cleanup_task = asyncio.create_task(self._periodic_cleanup()) + logger.debug("Bot provider cleanup task started") + except RuntimeError: + # No event loop running yet, cleanup will be started later + logger.debug("No event loop available for bot provider cleanup task") + + async def stop_cleanup(self): + """Stop the cleanup task""" + self._shutdown_event.set() + if self.cleanup_task: + self.cleanup_task.cancel() + try: + await self.cleanup_task + except asyncio.CancelledError: + pass + + async def _periodic_cleanup(self): + """Periodically clean up stale bot providers""" + cleanup_interval = 300 # 5 minutes + stale_threshold = 900 # 15 minutes + + while not self._shutdown_event.is_set(): + try: + await asyncio.sleep(cleanup_interval) + + now = time.time() + providers_to_remove = [] + + with self.lock: + for provider_id, provider in self.bot_providers.items(): + if now - provider.last_seen > stale_threshold: + providers_to_remove.append(provider_id) + logger.info(f"Marking stale bot provider for removal: {provider.name} (ID: {provider_id}, last_seen: {now - provider.last_seen:.1f}s ago)") + + if providers_to_remove: + with self.lock: + for provider_id in providers_to_remove: + if provider_id in self.bot_providers: + del self.bot_providers[provider_id] + + self._save_bot_providers() + logger.info(f"Cleaned up {len(providers_to_remove)} stale bot providers") + + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"Error in bot provider cleanup: {e}") + + def _save_bot_providers(self): + """Save bot providers to disk""" + try: + with self.lock: + providers_data = {} + for provider_id, provider in self.bot_providers.items(): + providers_data[provider_id] = provider.model_dump() + + with open(self.bot_providers_file, 'w') as f: + json.dump(providers_data, f, indent=2) + logger.debug(f"Saved {len(providers_data)} bot providers to {self.bot_providers_file}") + except Exception as e: + logger.error(f"Failed to save bot providers: {e}") + + def _load_bot_providers(self): + """Load bot providers from disk""" + try: + if not os.path.exists(self.bot_providers_file): + logger.debug(f"No bot providers file found at {self.bot_providers_file}") + return + + with open(self.bot_providers_file, 'r') as f: + providers_data = json.load(f) + + with self.lock: + for provider_id, provider_dict in providers_data.items(): + try: + provider = BotProviderModel.model_validate(provider_dict) + self.bot_providers[provider_id] = provider + except Exception as e: + logger.warning(f"Failed to load bot provider {provider_id}: {e}") + + logger.info(f"Loaded {len(self.bot_providers)} bot providers from {self.bot_providers_file}") + except Exception as e: + logger.error(f"Failed to load bot providers: {e}") async def register_provider(self, request: BotProviderRegisterRequest) -> BotProviderRegisterResponse: """Register a new bot provider with authentication""" @@ -103,6 +201,10 @@ class BotManager: # Remove stale providers for provider_id_to_remove in providers_to_remove: del self.bot_providers[provider_id_to_remove] + + # Save after removing stale providers + if providers_to_remove: + self._save_bot_providers() provider_id = str(uuid.uuid4()) now = time.time() @@ -120,6 +222,12 @@ class BotManager: with self.lock: self.bot_providers[provider_id] = provider + # Save to disk + self._save_bot_providers() + + # Start cleanup task if not already running + self.start_cleanup() + logger.info(f"Registered bot provider: {request.name} at {request.base_url} with key: {request.provider_key}") return BotProviderRegisterResponse(provider_id=provider_id) diff --git a/server/core/session_manager.py b/server/core/session_manager.py index 3a16e07..0f1333c 100644 --- a/server/core/session_manager.py +++ b/server/core/session_manager.py @@ -111,7 +111,11 @@ class Session: """Individual session representing a user or bot connection""" def __init__( - self, id: str, bot_instance_id: Optional[str] = None, has_media: bool = True + self, + id: str, + bot_instance_id: Optional[str] = None, + has_media: bool = True, + from_disk: bool = False, ): logger.info( f"Instantiating new session {id} (bot: {True if bot_instance_id else False}, media: {has_media})" @@ -290,9 +294,9 @@ class SessionManager: # Background task management self.cleanup_task_running = False - self.cleanup_task: Optional[asyncio.Task] = None + self.cleanup_task: Optional[asyncio.Task[None]] = None self.validation_task_running = False - self.validation_task: Optional[asyncio.Task] = None + self.validation_task: Optional[asyncio.Task[None]] = None def create_session( self, @@ -484,6 +488,7 @@ class SessionManager: s_saved.id, bot_instance_id=s_saved.bot_instance_id, has_media=s_saved.has_media, + from_disk=True, ) session.name = name session.created_at = created_at diff --git a/voicebot/bot_orchestrator.py b/voicebot/bot_orchestrator.py index 5d97663..838337d 100644 --- a/voicebot/bot_orchestrator.py +++ b/voicebot/bot_orchestrator.py @@ -187,13 +187,13 @@ async def check_provider_registration(server_url: str, provider_id: str, insecur verify = not insecure async with httpx.AsyncClient(verify=verify) as client: # Check if our provider is still in the provider list - response = await client.get(f"{server_url}/api/bots", timeout=5.0) + response = await client.get(f"{server_url}/api/bots/providers", timeout=5.0) if response.status_code == 200: data = response.json() - providers = data.get("providers", {}) - # providers is Dict[bot_name, provider_id], so check if our provider_id is in the values - is_registered = provider_id in providers.values() - logger.debug(f"Registration check: provider_id={provider_id}, providers={providers}, is_registered={is_registered}") + providers = data.get("providers", []) + # providers is a list of BotProviderModel objects, check if our provider_id is in the list + is_registered = any(provider.get("provider_id") == provider_id for provider in providers) + logger.debug(f"Registration check: provider_id={provider_id}, found_providers={len(providers)}, is_registered={is_registered}") return is_registered else: logger.warning(f"Registration check failed: HTTP {response.status_code}")