When backend services (server or voicebot) restart, active frontend UIs become unable to add bots, resulting in:

```
POST https://ketrenos.com/ai-voicebot/api/bots/ai_chatbot/join 404 (Not Found)
```

The issue was caused by three main problems:

1. **Incorrect Provider Registration Check**: The voicebot service was checking provider registration using the wrong API endpoint (`/api/bots` instead of `/api/bots/providers`)

2. **No Persistence for Bot Providers**: Bot providers were stored only in memory and lost on server restart, requiring re-registration

3. **AsyncIO Task Initialization Issue**: The cleanup task was being created during `__init__` when no event loop was running, causing FastAPI route registration failures

**File**: `voicebot/bot_orchestrator.py`

**Problem**: The `check_provider_registration` function was calling `/api/bots` (which returns available bots) instead of `/api/bots/providers` (which returns registered providers).

**Fix**: Updated the function to use the correct endpoint and parse the response properly:

```python
async def check_provider_registration(server_url: str, provider_id: str, insecure: bool = False) -> bool:
    """Check if the bot provider is still registered with the server."""
    try:
        import httpx

        verify = not insecure
        async with httpx.AsyncClient(verify=verify) as client:
            # Check if our provider is still in the provider list
            response = await client.get(f"{server_url}/api/bots/providers", timeout=5.0)
            if response.status_code == 200:
                data = response.json()
                providers = data.get("providers", [])
                # providers is a list of BotProviderModel objects, check if our provider_id is in the list
                is_registered = any(provider.get("provider_id") == provider_id for provider in providers)
                logger.debug(f"Registration check: provider_id={provider_id}, found_providers={len(providers)}, is_registered={is_registered}")
                return is_registered
            else:
                logger.warning(f"Registration check failed: HTTP {response.status_code}")
                return False
    except Exception as e:
        logger.debug(f"Provider registration check failed: {e}")
    return False
```

**File**: `server/core/bot_manager.py`

**Problem**: Bot providers were stored only in memory and lost on server restart.

**Fix**: Added persistence functionality to save/load bot providers to/from `bot_providers.json`:

```python
def _save_bot_providers(self):
    """Save bot providers to disk"""
    try:
        with self.lock:
            providers_data = {}
            for provider_id, provider in self.bot_providers.items():
                providers_data[provider_id] = provider.model_dump()

        with open(self.bot_providers_file, 'w') as f:
            json.dump(providers_data, f, indent=2)
        logger.debug(f"Saved {len(providers_data)} bot providers to {self.bot_providers_file}")
    except Exception as e:
        logger.error(f"Failed to save bot providers: {e}")

def _load_bot_providers(self):
    """Load bot providers from disk"""
    try:
        if not os.path.exists(self.bot_providers_file):
            logger.debug(f"No bot providers file found at {self.bot_providers_file}")
            return

        with open(self.bot_providers_file, 'r') as f:
            providers_data = json.load(f)

        with self.lock:
            for provider_id, provider_dict in providers_data.items():
                try:
                    provider = BotProviderModel.model_validate(provider_dict)
                    self.bot_providers[provider_id] = provider
                except Exception as e:
                    logger.warning(f"Failed to load bot provider {provider_id}: {e}")

        logger.info(f"Loaded {len(self.bot_providers)} bot providers from {self.bot_providers_file}")
    except Exception as e:
        logger.error(f"Failed to load bot providers: {e}")
```

**Integration**: The persistence functions are automatically called:
- `_load_bot_providers()` during `BotManager.__init__()`
- `_save_bot_providers()` when registering new providers or removing stale ones

**File**: `server/core/bot_manager.py`

**Problem**: The cleanup task was being created during `BotManager.__init__()` when no event loop was running, causing the FastAPI application to fail to register routes properly.

**Fix**: Deferred the cleanup task creation until it's actually needed:

```python
def __init__(self):
    # ... other initialization ...
    # Load persisted bot providers
    self._load_bot_providers()

    # Note: Don't start cleanup task here - will be started when needed

def start_cleanup(self):
    """Start the cleanup task"""
    try:
        if self.cleanup_task is None:
            self.cleanup_task = asyncio.create_task(self._periodic_cleanup())
            logger.debug("Bot provider cleanup task started")
    except RuntimeError:
        # No event loop running yet, cleanup will be started later
        logger.debug("No event loop available for bot provider cleanup task")

async def register_provider(self, request: BotProviderRegisterRequest) -> BotProviderRegisterResponse:
    # ... registration logic ...

    # Start cleanup task if not already running
    self.start_cleanup()

    return BotProviderRegisterResponse(provider_id=provider_id)
```

**File**: `server/core/bot_manager.py`

**Enhancement**: Added a background task that periodically removes providers that haven't been seen in 15 minutes:

```python
async def _periodic_cleanup(self):
    """Periodically clean up stale bot providers"""
    cleanup_interval = 300  # 5 minutes
    stale_threshold = 900   # 15 minutes

    while not self._shutdown_event.is_set():
        try:
            await asyncio.sleep(cleanup_interval)

            now = time.time()
            providers_to_remove = []

            with self.lock:
                for provider_id, provider in self.bot_providers.items():
                    if now - provider.last_seen > stale_threshold:
                        providers_to_remove.append(provider_id)
                        logger.info(f"Marking stale bot provider for removal: {provider.name} (ID: {provider_id}, last_seen: {now - provider.last_seen:.1f}s ago)")

            if providers_to_remove:
                with self.lock:
                    for provider_id in providers_to_remove:
                        if provider_id in self.bot_providers:
                            del self.bot_providers[provider_id]

                self._save_bot_providers()
                logger.info(f"Cleaned up {len(providers_to_remove)} stale bot providers")

        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error(f"Error in bot provider cleanup: {e}")
```

**File**: `client/src/BotManager.tsx`

**Enhancement**: Added retry logic to handle temporary 404s during service restarts:

```typescript
// Retry logic for handling service restart scenarios
let retries = 3;
let response;

while (retries > 0) {
  try {
    response = await botsApi.requestJoinLobby(selectedBot, request);
    break; // Success, exit retry loop
  } catch (err: any) {
    retries--;

    // If it's a 404 error and we have retries left, wait and retry
    if (err?.status === 404 && retries > 0) {
      console.log(`Bot join failed with 404, retrying... (${retries} attempts left)`);
      await new Promise(resolve => setTimeout(resolve, 1000)); // Wait 1 second
      continue;
    }

    // If it's not a 404 or we're out of retries, throw the error
    throw err;
  }
}
```

1. **Persistence**: Bot providers now survive server restarts and don't need to re-register immediately
2. **Correct Registration Checks**: Provider registration checks use the correct API endpoint
3. **Proper AsyncIO Task Management**: Cleanup tasks are started only when an event loop is available
4. **Automatic Cleanup**: Stale providers are automatically removed to prevent accumulation of dead entries
5. **Client Resilience**: Frontend can handle temporary 404s during service restarts with automatic retries
6. **Reduced Downtime**: Users experience fewer failed bot additions during service restarts

After implementing these fixes:

1. Bot providers are correctly persisted in `bot_providers.json`
2. Server restarts load existing providers from disk
3. Provider registration checks use the correct `/api/bots/providers` endpoint
4. AsyncIO cleanup tasks start properly without interfering with route registration
5. Client retries failed requests with 404 errors
6. Periodic cleanup prevents accumulation of stale providers
7. Bot join requests work correctly: `POST /api/bots/{bot_name}/join` returns 200 OK

Test the fix with these commands:

```bash
curl -k https://ketrenos.com/ai-voicebot/api/lobby

curl -k -X POST https://ketrenos.com/ai-voicebot/api/bots/ai_chatbot/join \
  -H "Content-Type: application/json" \
  -d '{"lobby_id":"<lobby_id>","nick":"test-bot","provider_id":"<provider_id>"}'

curl -k https://ketrenos.com/ai-voicebot/api/bots/providers

curl -k https://ketrenos.com/ai-voicebot/api/bots
```

1. `voicebot/bot_orchestrator.py` - Fixed registration check endpoint
2. `server/core/bot_manager.py` - Added persistence and cleanup
3. `client/src/BotManager.tsx` - Added retry logic

No additional configuration is required. The fixes work with existing environment variables and settings.
This commit is contained in:
James Ketr 2025-09-05 12:15:31 -07:00
parent 4d218864d8
commit e0548a128c
4 changed files with 151 additions and 11 deletions

View File

@ -76,6 +76,8 @@ const BotManager: React.FC<BotManagerProps> = ({ lobbyId, onBotAdded, sx }) => {
if (!selectedBot) return;
setAddingBot(true);
setError(""); // Clear any previous errors
try {
const request: BotJoinLobbyRequest = {
lobby_id: lobbyId,
@ -83,9 +85,30 @@ const BotManager: React.FC<BotManagerProps> = ({ lobbyId, onBotAdded, sx }) => {
provider_id: providers[selectedBot],
};
const response = await botsApi.requestJoinLobby(selectedBot, request);
// Retry logic for handling service restart scenarios
let retries = 3;
let response;
while (retries > 0) {
try {
response = await botsApi.requestJoinLobby(selectedBot, request);
break; // Success, exit retry loop
} catch (err: any) {
retries--;
// If it's a 404 error and we have retries left, wait and retry
if (err?.status === 404 && retries > 0) {
console.log(`Bot join failed with 404, retrying... (${retries} attempts left)`);
await new Promise(resolve => setTimeout(resolve, 1000)); // Wait 1 second
continue;
}
// If it's not a 404 or we're out of retries, throw the error
throw err;
}
}
if (response.status === "requested") {
if (response && response.status === "requested") {
setAddDialogOpen(false);
setSelectedBot("");
setBotNick("");
@ -95,7 +118,11 @@ const BotManager: React.FC<BotManagerProps> = ({ lobbyId, onBotAdded, sx }) => {
}
} catch (err) {
console.error("Failed to add bot:", err);
setError("Failed to add bot to lobby");
if (err instanceof Error) {
setError(`Failed to add bot: ${err.message}`);
} else {
setError("Failed to add bot to lobby");
}
} finally {
setAddingBot(false);
}

View File

@ -5,6 +5,8 @@ import uuid
import secrets
import threading
import httpx
import json
import asyncio
from typing import Dict, List, Optional
from pydantic import ValidationError
from logger import logger
@ -75,11 +77,107 @@ class BotManager:
str, BotInstanceModel
] = {} # bot_instance_id -> BotInstanceModel
self.lock = threading.RLock()
self.bot_providers_file = "bot_providers.json"
self.cleanup_task: Optional["asyncio.Task[None]"] = None
self._shutdown_event = asyncio.Event()
# Check if provider authentication is enabled
allowed_providers = BotProviderConfig.get_allowed_providers()
if not allowed_providers:
logger.warning("Bot provider authentication disabled. Any provider can register.")
# Load persisted bot providers
self._load_bot_providers()
# Note: Don't start cleanup task here - will be started when needed
def start_cleanup(self):
"""Start the cleanup task"""
try:
if self.cleanup_task is None:
self.cleanup_task = asyncio.create_task(self._periodic_cleanup())
logger.debug("Bot provider cleanup task started")
except RuntimeError:
# No event loop running yet, cleanup will be started later
logger.debug("No event loop available for bot provider cleanup task")
async def stop_cleanup(self):
"""Stop the cleanup task"""
self._shutdown_event.set()
if self.cleanup_task:
self.cleanup_task.cancel()
try:
await self.cleanup_task
except asyncio.CancelledError:
pass
async def _periodic_cleanup(self):
"""Periodically clean up stale bot providers"""
cleanup_interval = 300 # 5 minutes
stale_threshold = 900 # 15 minutes
while not self._shutdown_event.is_set():
try:
await asyncio.sleep(cleanup_interval)
now = time.time()
providers_to_remove = []
with self.lock:
for provider_id, provider in self.bot_providers.items():
if now - provider.last_seen > stale_threshold:
providers_to_remove.append(provider_id)
logger.info(f"Marking stale bot provider for removal: {provider.name} (ID: {provider_id}, last_seen: {now - provider.last_seen:.1f}s ago)")
if providers_to_remove:
with self.lock:
for provider_id in providers_to_remove:
if provider_id in self.bot_providers:
del self.bot_providers[provider_id]
self._save_bot_providers()
logger.info(f"Cleaned up {len(providers_to_remove)} stale bot providers")
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Error in bot provider cleanup: {e}")
def _save_bot_providers(self):
"""Save bot providers to disk"""
try:
with self.lock:
providers_data = {}
for provider_id, provider in self.bot_providers.items():
providers_data[provider_id] = provider.model_dump()
with open(self.bot_providers_file, 'w') as f:
json.dump(providers_data, f, indent=2)
logger.debug(f"Saved {len(providers_data)} bot providers to {self.bot_providers_file}")
except Exception as e:
logger.error(f"Failed to save bot providers: {e}")
def _load_bot_providers(self):
"""Load bot providers from disk"""
try:
if not os.path.exists(self.bot_providers_file):
logger.debug(f"No bot providers file found at {self.bot_providers_file}")
return
with open(self.bot_providers_file, 'r') as f:
providers_data = json.load(f)
with self.lock:
for provider_id, provider_dict in providers_data.items():
try:
provider = BotProviderModel.model_validate(provider_dict)
self.bot_providers[provider_id] = provider
except Exception as e:
logger.warning(f"Failed to load bot provider {provider_id}: {e}")
logger.info(f"Loaded {len(self.bot_providers)} bot providers from {self.bot_providers_file}")
except Exception as e:
logger.error(f"Failed to load bot providers: {e}")
async def register_provider(self, request: BotProviderRegisterRequest) -> BotProviderRegisterResponse:
"""Register a new bot provider with authentication"""
@ -103,6 +201,10 @@ class BotManager:
# Remove stale providers
for provider_id_to_remove in providers_to_remove:
del self.bot_providers[provider_id_to_remove]
# Save after removing stale providers
if providers_to_remove:
self._save_bot_providers()
provider_id = str(uuid.uuid4())
now = time.time()
@ -120,6 +222,12 @@ class BotManager:
with self.lock:
self.bot_providers[provider_id] = provider
# Save to disk
self._save_bot_providers()
# Start cleanup task if not already running
self.start_cleanup()
logger.info(f"Registered bot provider: {request.name} at {request.base_url} with key: {request.provider_key}")
return BotProviderRegisterResponse(provider_id=provider_id)

View File

@ -111,7 +111,11 @@ class Session:
"""Individual session representing a user or bot connection"""
def __init__(
self, id: str, bot_instance_id: Optional[str] = None, has_media: bool = True
self,
id: str,
bot_instance_id: Optional[str] = None,
has_media: bool = True,
from_disk: bool = False,
):
logger.info(
f"Instantiating new session {id} (bot: {True if bot_instance_id else False}, media: {has_media})"
@ -290,9 +294,9 @@ class SessionManager:
# Background task management
self.cleanup_task_running = False
self.cleanup_task: Optional[asyncio.Task] = None
self.cleanup_task: Optional[asyncio.Task[None]] = None
self.validation_task_running = False
self.validation_task: Optional[asyncio.Task] = None
self.validation_task: Optional[asyncio.Task[None]] = None
def create_session(
self,
@ -484,6 +488,7 @@ class SessionManager:
s_saved.id,
bot_instance_id=s_saved.bot_instance_id,
has_media=s_saved.has_media,
from_disk=True,
)
session.name = name
session.created_at = created_at

View File

@ -187,13 +187,13 @@ async def check_provider_registration(server_url: str, provider_id: str, insecur
verify = not insecure
async with httpx.AsyncClient(verify=verify) as client:
# Check if our provider is still in the provider list
response = await client.get(f"{server_url}/api/bots", timeout=5.0)
response = await client.get(f"{server_url}/api/bots/providers", timeout=5.0)
if response.status_code == 200:
data = response.json()
providers = data.get("providers", {})
# providers is Dict[bot_name, provider_id], so check if our provider_id is in the values
is_registered = provider_id in providers.values()
logger.debug(f"Registration check: provider_id={provider_id}, providers={providers}, is_registered={is_registered}")
providers = data.get("providers", [])
# providers is a list of BotProviderModel objects, check if our provider_id is in the list
is_registered = any(provider.get("provider_id") == provider_id for provider in providers)
logger.debug(f"Registration check: provider_id={provider_id}, found_providers={len(providers)}, is_registered={is_registered}")
return is_registered
else:
logger.warning(f"Registration check failed: HTTP {response.status_code}")