Working, ish
This commit is contained in:
parent
39739e5d34
commit
122ed532d6
175
API_EVOLUTION.md
175
API_EVOLUTION.md
@ -1,175 +0,0 @@
|
|||||||
# API Evolution Detection System
|
|
||||||
|
|
||||||
This system automatically detects when your OpenAPI schema has new endpoints or changed parameters that need to be implemented in the `ApiClient` class.
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
### Automatic Detection
|
|
||||||
- **Development Mode**: Automatically runs when `api-client.ts` is imported during development
|
|
||||||
- **Runtime Checking**: Compares available endpoints in the OpenAPI schema with implemented methods
|
|
||||||
- **Console Warnings**: Displays detailed warnings about unimplemented endpoints
|
|
||||||
|
|
||||||
### Schema Comparison
|
|
||||||
- **Hash-based Detection**: Detects when the OpenAPI schema file changes
|
|
||||||
- **Endpoint Analysis**: Identifies new, changed, or unimplemented endpoints
|
|
||||||
- **Parameter Validation**: Suggests checking for parameter changes
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
### Automatic Checking
|
|
||||||
The system runs automatically in development mode when you import from `api-client.ts`:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
import { apiClient } from './api-client';
|
|
||||||
// Check runs automatically after 1 second delay
|
|
||||||
```
|
|
||||||
|
|
||||||
### Command Line Checking
|
|
||||||
You can run API evolution checks from the command line:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Full type generation with evolution check
|
|
||||||
./generate-ts-types.sh
|
|
||||||
|
|
||||||
# Quick evolution check only (without regenerating types)
|
|
||||||
./check-api-evolution.sh
|
|
||||||
|
|
||||||
# Or from within the client container
|
|
||||||
npm run check-api-evolution
|
|
||||||
```
|
|
||||||
|
|
||||||
### Manual Checking
|
|
||||||
You can manually trigger checks during development:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
import { devUtils } from './api-client';
|
|
||||||
|
|
||||||
// Check for API evolution
|
|
||||||
const evolution = await devUtils.checkApiEvolution();
|
|
||||||
|
|
||||||
// Force recheck (bypasses once-per-session limit)
|
|
||||||
devUtils.recheckEndpoints();
|
|
||||||
```
|
|
||||||
|
|
||||||
### Console Output
|
|
||||||
When unimplemented endpoints are found, you'll see:
|
|
||||||
|
|
||||||
**Browser Console (development mode):**
|
|
||||||
```
|
|
||||||
🚨 API Evolution Detection
|
|
||||||
🆕 New API endpoints detected:
|
|
||||||
• GET /ai-voicebot/api/new-feature (get_new_feature_endpoint)
|
|
||||||
⚠️ Unimplemented API endpoints:
|
|
||||||
• POST /ai-voicebot/api/admin/bulk-action
|
|
||||||
💡 Implementation suggestions:
|
|
||||||
Add these methods to ApiClient:
|
|
||||||
async adminBulkAction(): Promise<any> {
|
|
||||||
return this.request<any>('/ai-voicebot/api/admin/bulk-action', { method: 'POST' });
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Command Line:**
|
|
||||||
```
|
|
||||||
🔍 API Evolution Check
|
|
||||||
==================================================
|
|
||||||
📊 Summary:
|
|
||||||
Total endpoints: 8
|
|
||||||
Implemented: 7
|
|
||||||
Unimplemented: 1
|
|
||||||
|
|
||||||
⚠️ Unimplemented API endpoints:
|
|
||||||
• POST /ai-voicebot/api/admin/bulk-action
|
|
||||||
Admin bulk action endpoint
|
|
||||||
|
|
||||||
💡 Implementation suggestions:
|
|
||||||
Add these methods to the ApiClient class:
|
|
||||||
|
|
||||||
async adminBulkAction(data?: any): Promise<any> {
|
|
||||||
return this.request<any>('/ai-voicebot/api/admin/bulk-action', { method: 'POST', body: data });
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Implemented Endpoints Registry
|
|
||||||
The system maintains a registry of implemented endpoints in `ApiClient`. When you add new methods, update the registry:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
// In api-evolution-checker.ts
|
|
||||||
private getImplementedEndpoints(): Set<string> {
|
|
||||||
return new Set([
|
|
||||||
'GET:/ai-voicebot/api/admin/names',
|
|
||||||
'POST:/ai-voicebot/api/admin/set_password',
|
|
||||||
// Add new endpoints here:
|
|
||||||
'POST:/ai-voicebot/api/admin/bulk-action',
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Schema Location
|
|
||||||
The system attempts to load the OpenAPI schema from:
|
|
||||||
- `/openapi-schema.json` (served by your development server)
|
|
||||||
- Falls back to hardcoded endpoint list if schema file is unavailable
|
|
||||||
|
|
||||||
## Development Workflow
|
|
||||||
|
|
||||||
### When Adding New API Endpoints
|
|
||||||
|
|
||||||
1. **Add endpoint to FastAPI server** (server/main.py)
|
|
||||||
2. **Regenerate types**: Run `./generate-ts-types.sh`
|
|
||||||
3. **Check console** for warnings about unimplemented endpoints
|
|
||||||
4. **Implement methods** in `ApiClient` class
|
|
||||||
5. **Update endpoint registry** in the evolution checker
|
|
||||||
6. **Add convenience methods** to API namespaces if needed
|
|
||||||
|
|
||||||
### Example Implementation
|
|
||||||
|
|
||||||
When you see a warning like:
|
|
||||||
```
|
|
||||||
⚠️ Unimplemented: POST /ai-voicebot/api/admin/bulk-action
|
|
||||||
```
|
|
||||||
|
|
||||||
1. Add the method to `ApiClient`:
|
|
||||||
```typescript
|
|
||||||
async adminBulkAction(data: BulkActionRequest): Promise<BulkActionResponse> {
|
|
||||||
return this.request<BulkActionResponse>('/ai-voicebot/api/admin/bulk-action', {
|
|
||||||
method: 'POST',
|
|
||||||
body: data
|
|
||||||
});
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Add to convenience API:
|
|
||||||
```typescript
|
|
||||||
export const adminApi = {
|
|
||||||
listNames: () => apiClient.adminListNames(),
|
|
||||||
setPassword: (data: AdminSetPassword) => apiClient.adminSetPassword(data),
|
|
||||||
clearPassword: (data: AdminClearPassword) => apiClient.adminClearPassword(data),
|
|
||||||
bulkAction: (data: BulkActionRequest) => apiClient.adminBulkAction(data), // New
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Update the registry:
|
|
||||||
```typescript
|
|
||||||
private getImplementedEndpoints(): Set<string> {
|
|
||||||
return new Set([
|
|
||||||
// ... existing endpoints ...
|
|
||||||
'POST:/ai-voicebot/api/admin/bulk-action', // Add this
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Benefits
|
|
||||||
|
|
||||||
- **Prevents Missing Implementations**: Never forget to implement new API endpoints
|
|
||||||
- **Development Efficiency**: Automatic detection saves time during API evolution
|
|
||||||
- **Type Safety**: Works with generated TypeScript types for full type safety
|
|
||||||
- **Code Generation**: Provides implementation stubs to get started quickly
|
|
||||||
- **Schema Validation**: Detects when OpenAPI schema changes
|
|
||||||
|
|
||||||
## Production Considerations
|
|
||||||
|
|
||||||
- **Development Only**: Evolution checking only runs in development mode
|
|
||||||
- **Performance**: Minimal runtime overhead (single check per session)
|
|
||||||
- **Error Handling**: Gracefully falls back if schema loading fails
|
|
||||||
- **Console Logging**: All output goes to console.warn/info for easy filtering
|
|
@ -1,298 +0,0 @@
|
|||||||
# Architecture Recommendations: Sessions, Lobbies, and WebSockets
|
|
||||||
|
|
||||||
## Executive Summary
|
|
||||||
|
|
||||||
The current architecture has grown organically into a monolithic structure that mixes concerns and creates maintenance challenges. This document outlines specific recommendations to improve maintainability, reduce complexity, and enhance the development experience.
|
|
||||||
|
|
||||||
## Current Issues
|
|
||||||
|
|
||||||
### 1. Server (`server/main.py`)
|
|
||||||
- **Monolithic structure**: 2300+ lines in a single file
|
|
||||||
- **Mixed concerns**: Session, lobby, WebSocket, bot, and admin logic intertwined
|
|
||||||
- **Complex state management**: Multiple global dictionaries requiring manual synchronization
|
|
||||||
- **WebSocket message handling**: Deep nested switch statements are hard to follow
|
|
||||||
- **Threading complexity**: Multiple locks and shared state increase deadlock risk
|
|
||||||
|
|
||||||
### 2. Client (`client/src/`)
|
|
||||||
- **Fragmented connection logic**: WebSocket handling scattered across components
|
|
||||||
- **Error handling complexity**: Different scenarios handled inconsistently
|
|
||||||
- **State synchronization**: Multiple sources of truth for session/lobby state
|
|
||||||
|
|
||||||
### 3. Voicebot (`voicebot/`)
|
|
||||||
- **Duplicate patterns**: Similar WebSocket logic but different implementation
|
|
||||||
- **Bot lifecycle complexity**: Complex orchestration with unclear state flow
|
|
||||||
|
|
||||||
## Proposed Architecture
|
|
||||||
|
|
||||||
### Server Refactoring
|
|
||||||
|
|
||||||
#### 1. Extract Core Modules
|
|
||||||
|
|
||||||
```
|
|
||||||
server/
|
|
||||||
├── main.py # FastAPI app setup and routing only
|
|
||||||
├── core/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── session_manager.py # Session lifecycle and persistence
|
|
||||||
│ ├── lobby_manager.py # Lobby management and chat
|
|
||||||
│ ├── bot_manager.py # Bot provider and orchestration
|
|
||||||
│ └── auth_manager.py # Name/password authentication
|
|
||||||
├── websocket/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── connection.py # WebSocket connection handling
|
|
||||||
│ ├── message_handlers.py # Message type routing and handling
|
|
||||||
│ └── signaling.py # WebRTC signaling logic
|
|
||||||
├── api/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── admin.py # Admin endpoints
|
|
||||||
│ ├── sessions.py # Session HTTP API
|
|
||||||
│ ├── lobbies.py # Lobby HTTP API
|
|
||||||
│ └── bots.py # Bot HTTP API
|
|
||||||
└── models/
|
|
||||||
├── __init__.py
|
|
||||||
├── session.py # Session and Lobby classes
|
|
||||||
└── events.py # Event system for decoupled communication
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. Event-Driven Architecture
|
|
||||||
|
|
||||||
Replace direct method calls with an event system:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from typing import Protocol
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class Event(ABC):
|
|
||||||
"""Base event class"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class SessionJoinedLobby(Event):
|
|
||||||
def __init__(self, session_id: str, lobby_id: str):
|
|
||||||
self.session_id = session_id
|
|
||||||
self.lobby_id = lobby_id
|
|
||||||
|
|
||||||
class EventHandler(Protocol):
|
|
||||||
async def handle(self, event: Event) -> None: ...
|
|
||||||
|
|
||||||
class EventBus:
|
|
||||||
def __init__(self):
|
|
||||||
self._handlers: dict[type[Event], list[EventHandler]] = {}
|
|
||||||
|
|
||||||
def subscribe(self, event_type: type[Event], handler: EventHandler):
|
|
||||||
if event_type not in self._handlers:
|
|
||||||
self._handlers[event_type] = []
|
|
||||||
self._handlers[event_type].append(handler)
|
|
||||||
|
|
||||||
async def publish(self, event: Event):
|
|
||||||
event_type = type(event)
|
|
||||||
if event_type in self._handlers:
|
|
||||||
for handler in self._handlers[event_type]:
|
|
||||||
await handler.handle(event)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 3. WebSocket Message Router
|
|
||||||
|
|
||||||
Replace the massive switch statement with a clean router:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from typing import Callable, Dict, Any
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class MessageHandler(ABC):
|
|
||||||
@abstractmethod
|
|
||||||
async def handle(self, session: Session, data: Dict[str, Any], websocket: WebSocket) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
class SetNameHandler(MessageHandler):
|
|
||||||
async def handle(self, session: Session, data: Dict[str, Any], websocket: WebSocket) -> None:
|
|
||||||
# Handle set_name logic here
|
|
||||||
pass
|
|
||||||
|
|
||||||
class WebSocketRouter:
|
|
||||||
def __init__(self):
|
|
||||||
self._handlers: Dict[str, MessageHandler] = {}
|
|
||||||
|
|
||||||
def register(self, message_type: str, handler: MessageHandler):
|
|
||||||
self._handlers[message_type] = handler
|
|
||||||
|
|
||||||
async def route(self, message_type: str, session: Session, data: Dict[str, Any], websocket: WebSocket):
|
|
||||||
if message_type in self._handlers:
|
|
||||||
await self._handlers[message_type].handle(session, data, websocket)
|
|
||||||
else:
|
|
||||||
await websocket.send_json({"type": "error", "data": {"error": f"Unknown message type: {message_type}"}})
|
|
||||||
```
|
|
||||||
|
|
||||||
### Client Refactoring
|
|
||||||
|
|
||||||
#### 1. Centralized Connection Management
|
|
||||||
|
|
||||||
Create a single WebSocket connection manager:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
// src/connection/WebSocketManager.ts
|
|
||||||
export class WebSocketManager {
|
|
||||||
private ws: WebSocket | null = null;
|
|
||||||
private reconnectAttempts = 0;
|
|
||||||
private messageHandlers = new Map<string, (data: any) => void>();
|
|
||||||
|
|
||||||
constructor(private url: string) {}
|
|
||||||
|
|
||||||
async connect(): Promise<void> {
|
|
||||||
// Connection logic with automatic reconnection
|
|
||||||
}
|
|
||||||
|
|
||||||
subscribe(messageType: string, handler: (data: any) => void): void {
|
|
||||||
this.messageHandlers.set(messageType, handler);
|
|
||||||
}
|
|
||||||
|
|
||||||
send(type: string, data: any): void {
|
|
||||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
|
||||||
this.ws.send(JSON.stringify({ type, data }));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private handleMessage(event: MessageEvent): void {
|
|
||||||
const message = JSON.parse(event.data);
|
|
||||||
const handler = this.messageHandlers.get(message.type);
|
|
||||||
if (handler) {
|
|
||||||
handler(message.data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. Unified State Management
|
|
||||||
|
|
||||||
Use a state management pattern (Context + Reducer or Zustand):
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
// src/store/AppStore.ts
|
|
||||||
interface AppState {
|
|
||||||
session: Session | null;
|
|
||||||
lobby: Lobby | null;
|
|
||||||
participants: Participant[];
|
|
||||||
connectionStatus: 'disconnected' | 'connecting' | 'connected';
|
|
||||||
error: string | null;
|
|
||||||
}
|
|
||||||
|
|
||||||
type AppAction =
|
|
||||||
| { type: 'SET_SESSION'; payload: Session }
|
|
||||||
| { type: 'SET_LOBBY'; payload: Lobby }
|
|
||||||
| { type: 'UPDATE_PARTICIPANTS'; payload: Participant[] }
|
|
||||||
| { type: 'SET_CONNECTION_STATUS'; payload: AppState['connectionStatus'] }
|
|
||||||
| { type: 'SET_ERROR'; payload: string | null };
|
|
||||||
|
|
||||||
const appReducer = (state: AppState, action: AppAction): AppState => {
|
|
||||||
switch (action.type) {
|
|
||||||
case 'SET_SESSION':
|
|
||||||
return { ...state, session: action.payload };
|
|
||||||
// ... other cases
|
|
||||||
default:
|
|
||||||
return state;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
### Voicebot Refactoring
|
|
||||||
|
|
||||||
#### 1. Unified Connection Interface
|
|
||||||
|
|
||||||
Create a common WebSocket interface used by both client and voicebot:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# shared/websocket_client.py
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import Dict, Any, Callable, Optional
|
|
||||||
|
|
||||||
class WebSocketClient(ABC):
|
|
||||||
def __init__(self, url: str, session_id: str, lobby_id: str):
|
|
||||||
self.url = url
|
|
||||||
self.session_id = session_id
|
|
||||||
self.lobby_id = lobby_id
|
|
||||||
        self.message_handlers: Dict[str, Callable[[Dict[str, Any]], Awaitable[None]]] = {}
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def connect(self) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def send_message(self, message_type: str, data: Dict[str, Any]) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def register_handler(self, message_type: str, handler: Callable[[Dict[str, Any]], None]):
|
|
||||||
self.message_handlers[message_type] = handler
|
|
||||||
|
|
||||||
async def handle_message(self, message_type: str, data: Dict[str, Any]):
|
|
||||||
handler = self.message_handlers.get(message_type)
|
|
||||||
if handler:
|
|
||||||
await handler(data)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Implementation Plan
|
|
||||||
|
|
||||||
### Phase 1: Server Foundation (Week 1-2)
|
|
||||||
1. Extract `SessionManager` and `LobbyManager` classes
|
|
||||||
2. Implement basic event system
|
|
||||||
3. Create WebSocket message router
|
|
||||||
4. Move admin endpoints to separate module
|
|
||||||
|
|
||||||
### Phase 2: Server Completion (Week 3-4)
|
|
||||||
1. Extract bot management functionality
|
|
||||||
2. Implement remaining message handlers
|
|
||||||
3. Add comprehensive testing
|
|
||||||
4. Performance optimization
|
|
||||||
|
|
||||||
### Phase 3: Client Refactoring (Week 5-6)
|
|
||||||
1. Implement centralized WebSocket manager
|
|
||||||
2. Create unified state management
|
|
||||||
3. Refactor components to use new architecture
|
|
||||||
4. Add error boundary and better error handling
|
|
||||||
|
|
||||||
### Phase 4: Voicebot Integration (Week 7-8)
|
|
||||||
1. Create shared WebSocket interface
|
|
||||||
2. Refactor voicebot to use common patterns
|
|
||||||
3. Improve bot lifecycle management
|
|
||||||
4. Integration testing
|
|
||||||
|
|
||||||
## Benefits of Proposed Architecture
|
|
||||||
|
|
||||||
### Maintainability
|
|
||||||
- **Single Responsibility**: Each module has a clear, focused purpose
|
|
||||||
- **Testability**: Smaller, focused classes are easier to unit test
|
|
||||||
- **Debugging**: Clear separation makes it easier to trace issues
|
|
||||||
|
|
||||||
### Scalability
|
|
||||||
- **Event-driven**: Loose coupling enables easier feature additions
|
|
||||||
- **Modular**: New functionality can be added without touching core logic
|
|
||||||
- **Performance**: Event system enables asynchronous processing
|
|
||||||
|
|
||||||
### Developer Experience
|
|
||||||
- **Code Navigation**: Easier to find relevant code
|
|
||||||
- **Documentation**: Smaller modules are easier to document
|
|
||||||
- **Onboarding**: New developers can understand individual components
|
|
||||||
|
|
||||||
### Reliability
|
|
||||||
- **Error Isolation**: Failures in one module don't cascade
|
|
||||||
- **State Management**: Centralized state reduces synchronization bugs
|
|
||||||
- **Connection Handling**: Robust reconnection and error recovery
|
|
||||||
|
|
||||||
## Risk Mitigation
|
|
||||||
|
|
||||||
### Breaking Changes
|
|
||||||
- Implement changes incrementally
|
|
||||||
- Maintain backward compatibility during transition
|
|
||||||
- Comprehensive testing at each phase
|
|
||||||
|
|
||||||
### Performance Impact
|
|
||||||
- Benchmark before and after changes
|
|
||||||
- Event system should be lightweight
|
|
||||||
- Monitor memory usage and connection handling
|
|
||||||
|
|
||||||
### Team Coordination
|
|
||||||
- Clear communication about architecture changes
|
|
||||||
- Code review process for architectural decisions
|
|
||||||
- Documentation updates with each phase
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
This refactoring will transform the current monolithic architecture into a maintainable, scalable system. The modular approach will reduce complexity, improve testability, and make the codebase more approachable for new developers while maintaining all existing functionality.
|
|
@ -1,238 +0,0 @@
|
|||||||
# Automated API Client Generation System
|
|
||||||
|
|
||||||
This document explains the automated TypeScript API client generation and update system for the AI Voicebot project.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The system automatically:
|
|
||||||
1. **Generates OpenAPI schema** from FastAPI server
|
|
||||||
2. **Creates TypeScript types** from the schema
|
|
||||||
3. **Updates API client** with missing endpoint implementations using dynamic paths
|
|
||||||
4. **Updates evolution checker** with current endpoint lists
|
|
||||||
5. **Validates TypeScript** compilation
|
|
||||||
6. **Runs evolution checks** to ensure completeness
|
|
||||||
|
|
||||||
All generated API calls use the `PUBLIC_URL` environment variable to dynamically construct paths, making the system deployable to any base path without hardcoded `/ai-voicebot` prefixes.
|
|
||||||
|
|
||||||
## Files in the System
|
|
||||||
|
|
||||||
### Generated Files (Auto-updated)
|
|
||||||
- `client/openapi-schema.json` - OpenAPI schema from server
|
|
||||||
- `client/src/api-types.ts` - TypeScript type definitions
|
|
||||||
- `client/src/api-client.ts` - API client (auto-sections updated)
|
|
||||||
- `client/src/api-evolution-checker.ts` - Evolution checker (lists updated)
|
|
||||||
|
|
||||||
### Manual Files
|
|
||||||
- `generate-ts-types.sh` - Main orchestration script
|
|
||||||
- `client/update-api-client.js` - API client updater utility
|
|
||||||
- `client/src/api-usage-examples.ts` - Usage examples and patterns
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
The system uses environment variables for dynamic path configuration:
|
|
||||||
|
|
||||||
- **`PUBLIC_URL`** - Base path for the application (e.g., `/ai-voicebot`, `/my-app`, etc.)
|
|
||||||
- Used in: API paths, schema loading, asset paths
|
|
||||||
- Default: `""` (empty string for root deployment)
|
|
||||||
- Set in: Docker environment, build process, or runtime
|
|
||||||
|
|
||||||
### Dynamic Path Handling
|
|
||||||
|
|
||||||
All API endpoints use dynamic path construction:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
// Instead of hardcoded paths:
|
|
||||||
// "/ai-voicebot/api/health"
|
|
||||||
|
|
||||||
// The system uses:
|
|
||||||
this.getApiPath("/ai-voicebot/api/health")
|
|
||||||
// Which becomes: `${PUBLIC_URL}/api/health`
|
|
||||||
```
|
|
||||||
|
|
||||||
This allows deployment to different base paths without code changes.
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
### Full Generation (Recommended)
|
|
||||||
```bash
|
|
||||||
./generate-ts-types.sh
|
|
||||||
```
|
|
||||||
This runs the complete pipeline and is the primary way to use the system.
|
|
||||||
|
|
||||||
### Individual Steps
|
|
||||||
```bash
|
|
||||||
# Inside client container
|
|
||||||
npm run generate-schema # Generate OpenAPI schema
|
|
||||||
npm run generate-types # Generate TypeScript types
|
|
||||||
npm run update-api-client # Update API client
|
|
||||||
npm run check-api-evolution # Check for missing endpoints
|
|
||||||
```
|
|
||||||
|
|
||||||
## How Auto-Updates Work
|
|
||||||
|
|
||||||
### API Client Updates
|
|
||||||
|
|
||||||
The `update-api-client.js` script:
|
|
||||||
|
|
||||||
1. **Parses OpenAPI schema** to find all available endpoints
|
|
||||||
2. **Scans existing API client** to detect implemented methods
|
|
||||||
3. **Identifies missing endpoints** by comparing the two
|
|
||||||
4. **Generates method implementations** for missing endpoints
|
|
||||||
5. **Updates the client class** by inserting new methods in designated section
|
|
||||||
6. **Updates endpoint lists** used by evolution checking
|
|
||||||
|
|
||||||
#### Auto-Generated Section
|
|
||||||
```typescript
|
|
||||||
export class ApiClient {
|
|
||||||
// ... manual methods ...
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Construct API path using PUBLIC_URL environment variable
|
|
||||||
* Replaces hardcoded /ai-voicebot prefix with dynamic base from environment
|
|
||||||
*/
|
|
||||||
private getApiPath(schemaPath: string): string {
|
|
||||||
    return schemaPath.replace('/ai-voicebot', base); // `base` is derived from PUBLIC_URL
|
|
||||||
}
|
|
||||||
|
|
||||||
// Auto-generated endpoints will be added here by update-api-client.js
|
|
||||||
// DO NOT MANUALLY EDIT BELOW THIS LINE
|
|
||||||
|
|
||||||
// New endpoints automatically appear here using this.getApiPath()
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Method Generation
|
|
||||||
- **Method names** derived from `operationId` or path/method combination
|
|
||||||
- **Parameters** inferred from path parameters and request body
|
|
||||||
- **Return types** use generic `Promise<any>` (can be enhanced)
|
|
||||||
- **Path handling** supports both static and parameterized paths using `PUBLIC_URL`
|
|
||||||
- **Dynamic paths** automatically replace hardcoded prefixes with environment-based values
|
|
||||||
|
|
||||||
### Evolution Checker Updates
|
|
||||||
|
|
||||||
The evolution checker tracks:
|
|
||||||
- **Known schema endpoints** - updated from current OpenAPI schema
|
|
||||||
- **Implemented endpoints** - updated from actual API client code
|
|
||||||
- **Missing endpoints** - calculated difference for warnings
|
|
||||||
|
|
||||||
## Customization
|
|
||||||
|
|
||||||
### Adding Manual Endpoints
|
|
||||||
|
|
||||||
For endpoints not in OpenAPI schema (e.g., external services), add them manually before the auto-generated section:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
// Manual endpoints (these won't be auto-generated)
|
|
||||||
async getCustomData(): Promise<CustomResponse> {
|
|
||||||
return this.request<CustomResponse>("/custom/endpoint", { method: "GET" });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Auto-generated endpoints will be added here by update-api-client.js
|
|
||||||
// DO NOT MANUALLY EDIT BELOW THIS LINE
|
|
||||||
```
|
|
||||||
|
|
||||||
### Improving Generated Methods
|
|
||||||
|
|
||||||
To enhance auto-generated methods:
|
|
||||||
|
|
||||||
1. **Better Type Inference**: Modify `generateMethodSignature()` in `update-api-client.js` to use specific types from schema
|
|
||||||
2. **Parameter Validation**: Add validation logic in method generation
|
|
||||||
3. **Error Handling**: Customize error handling patterns
|
|
||||||
4. **Documentation**: Add JSDoc generation from OpenAPI descriptions
|
|
||||||
|
|
||||||
### Schema Evolution Detection
|
|
||||||
|
|
||||||
The system detects:
|
|
||||||
- **New endpoints** added to OpenAPI schema
|
|
||||||
- **Changed endpoints** (parameter or response changes)
|
|
||||||
- **Deprecated endpoints** (with proper OpenAPI marking)
|
|
||||||
|
|
||||||
## Development Workflow
|
|
||||||
|
|
||||||
1. **Develop API endpoints** in FastAPI server with proper typing
|
|
||||||
2. **Run generation script** to update client: `./generate-ts-types.sh`
|
|
||||||
3. **Use generated types** in React components
|
|
||||||
4. **Manual customization** for complex endpoints if needed
|
|
||||||
5. **Commit all changes** including generated and updated files
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
### Server Development
|
|
||||||
- Use **Pydantic models** for all request/response types
|
|
||||||
- Add **proper OpenAPI metadata** (summary, description, tags)
|
|
||||||
- Use **consistent naming** for operation IDs
|
|
||||||
- **Version your API** to handle breaking changes
|
|
||||||
|
|
||||||
### Client Development
|
|
||||||
- **Import from api-client.ts** rather than making raw fetch calls
|
|
||||||
- **Use generated types** for type safety
|
|
||||||
- **Avoid editing auto-generated sections** - they will be overwritten
|
|
||||||
- **Add custom endpoints manually** when needed
|
|
||||||
|
|
||||||
### Type Safety
|
|
||||||
```typescript
|
|
||||||
// Good: Using generated types and client
|
|
||||||
import { apiClient, type LobbyModel, type LobbyCreateRequest } from './api-client';
|
|
||||||
|
|
||||||
const createLobby = async (data: LobbyCreateRequest): Promise<LobbyModel> => {
|
|
||||||
const response = await apiClient.createLobby(sessionId, data);
|
|
||||||
return response.data; // Fully typed
|
|
||||||
};
|
|
||||||
|
|
||||||
// Avoid: Direct fetch calls
|
|
||||||
const createLobbyRaw = async () => {
|
|
||||||
const response = await fetch('/api/lobby', { /* ... */ });
|
|
||||||
return response.json(); // No type safety
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Common Issues
|
|
||||||
|
|
||||||
**"Could not find insertion marker"**
|
|
||||||
- The API client file was manually edited and the auto-generation markers were removed
|
|
||||||
- Restore the markers or regenerate the client file from template
|
|
||||||
|
|
||||||
**"Missing endpoints detected"**
|
|
||||||
- New endpoints were added to the server but the generation script wasn't run
|
|
||||||
- Run `./generate-ts-types.sh` to update the client
|
|
||||||
|
|
||||||
**"Type errors after generation"**
|
|
||||||
- Schema changes may have affected existing manual code
|
|
||||||
- Check the TypeScript compiler output and update affected code
|
|
||||||
|
|
||||||
**"Duplicate method names"**
|
|
||||||
- Manual methods conflict with auto-generated ones
|
|
||||||
- Rename manual methods or adjust the operation ID generation logic
|
|
||||||
|
|
||||||
### Debug Mode
|
|
||||||
|
|
||||||
Add debug logging by modifying `update-api-client.js`:
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
// Add after parsing
|
|
||||||
console.log('Schema endpoints:', this.endpoints.map(e => `${e.method}:${e.path}`));
|
|
||||||
console.log('Implemented endpoints:', Array.from(this.implementedEndpoints));
|
|
||||||
```
|
|
||||||
|
|
||||||
## Future Enhancements
|
|
||||||
|
|
||||||
- **Stronger type inference** from OpenAPI schema components
|
|
||||||
- **Request/response validation** using schema definitions
|
|
||||||
- **Mock data generation** for testing
|
|
||||||
- **API versioning support** with backward compatibility
|
|
||||||
- **Performance optimization** with request caching
|
|
||||||
- **OpenAPI spec validation** before generation
|
|
||||||
|
|
||||||
## Integration with Build Process
|
|
||||||
|
|
||||||
The system integrates with:
|
|
||||||
- **Docker Compose** for cross-container coordination
|
|
||||||
- **npm scripts** for frontend build pipeline
|
|
||||||
- **TypeScript compilation** for type checking
|
|
||||||
- **CI/CD workflows** for automated updates
|
|
||||||
|
|
||||||
This ensures that API changes are automatically reflected in the frontend without manual intervention, reducing development friction and preventing API/client drift.
|
|
@ -1,220 +0,0 @@
|
|||||||
# Chat Integration for AI Voicebot System
|
|
||||||
|
|
||||||
This document describes the chat functionality that has been integrated into the AI voicebot system, allowing bots to send and receive chat messages through the WebSocket signaling server.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The chat integration enables bots to:
|
|
||||||
1. **Receive chat messages** from other participants in the lobby
|
|
||||||
2. **Send chat messages** back to the lobby
|
|
||||||
3. **Process and respond** to specific commands or keywords
|
|
||||||
4. **Integrate seamlessly** with the existing WebRTC signaling infrastructure
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
### Core Components
|
|
||||||
|
|
||||||
1. **WebRTC Signaling Client** (`webrtc_signaling.py`)
|
|
||||||
- Extended with chat message handling capabilities
|
|
||||||
- Added `on_chat_message_received` callback for bots
|
|
||||||
- Added `send_chat_message()` method for sending messages
|
|
||||||
|
|
||||||
2. **Bot Orchestrator** (`bot_orchestrator.py`)
|
|
||||||
- Enhanced bot discovery to detect chat handlers
|
|
||||||
- Sets up chat message callbacks when bots join lobbies
|
|
||||||
- Manages the connection between WebRTC client and bot chat handlers
|
|
||||||
|
|
||||||
3. **Chat Models** (`shared/models.py`)
|
|
||||||
- `ChatMessageModel`: Structure for chat messages
|
|
||||||
- `ChatMessagesListModel`: For message lists
|
|
||||||
- `ChatMessagesSendModel`: For sending messages
|
|
||||||
|
|
||||||
### Bot Interface
|
|
||||||
|
|
||||||
Bots can now implement an optional `handle_chat_message` function:
|
|
||||||
|
|
||||||
```python
|
|
||||||
async def handle_chat_message(
|
|
||||||
chat_message: ChatMessageModel,
|
|
||||||
send_message_func: Callable[[str], Awaitable[None]]
|
|
||||||
) -> Optional[str]:
|
|
||||||
"""
|
|
||||||
Handle incoming chat messages and optionally return a response.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
chat_message: The received chat message
|
|
||||||
send_message_func: Function to send messages back to the lobby
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Optional response message to send back to the lobby
|
|
||||||
"""
|
|
||||||
# Process the message and return a response
|
|
||||||
return "Hello! I received your message."
|
|
||||||
```
|
|
||||||
|
|
||||||
## Implementation Details
|
|
||||||
|
|
||||||
### 1. WebSocket Message Handling
|
|
||||||
|
|
||||||
The WebRTC signaling client now handles `chat_message` type messages:
|
|
||||||
|
|
||||||
```python
|
|
||||||
elif msg_type == "chat_message":
|
|
||||||
try:
|
|
||||||
validated = ChatMessageModel.model_validate(data)
|
|
||||||
except ValidationError as e:
|
|
||||||
logger.error(f"Invalid chat_message payload: {e}", exc_info=True)
|
|
||||||
return
|
|
||||||
logger.info(f"Received chat message from {validated.sender_name}: {validated.message[:50]}...")
|
|
||||||
# Call the callback if it's set
|
|
||||||
if self.on_chat_message_received:
|
|
||||||
try:
|
|
||||||
await self.on_chat_message_received(validated)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in chat message callback: {e}", exc_info=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Bot Discovery Enhancement
|
|
||||||
|
|
||||||
The bot orchestrator now detects chat handlers during discovery:
|
|
||||||
|
|
||||||
```python
|
|
||||||
if hasattr(mod, "handle_chat_message") and callable(getattr(mod, "handle_chat_message")):
|
|
||||||
chat_handler = getattr(mod, "handle_chat_message")
|
|
||||||
|
|
||||||
bots[info.get("name", name)] = {
|
|
||||||
"module": name,
|
|
||||||
"info": info,
|
|
||||||
"create_tracks": create_tracks,
|
|
||||||
"chat_handler": chat_handler
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Chat Handler Setup
|
|
||||||
|
|
||||||
When a bot joins a lobby, the orchestrator sets up the chat handler:
|
|
||||||
|
|
||||||
```python
|
|
||||||
if chat_handler:
|
|
||||||
async def bot_chat_handler(chat_message: ChatMessageModel):
|
|
||||||
"""Wrapper to call the bot's chat handler and optionally send responses"""
|
|
||||||
try:
|
|
||||||
response = await chat_handler(chat_message, client.send_chat_message)
|
|
||||||
if response and isinstance(response, str):
|
|
||||||
await client.send_chat_message(response)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in bot chat handler for {bot_name}: {e}", exc_info=True)
|
|
||||||
|
|
||||||
client.on_chat_message_received = bot_chat_handler
|
|
||||||
```
|
|
||||||
|
|
||||||
## Example Bots
|
|
||||||
|
|
||||||
### 1. Chatbot (`bots/chatbot.py`)
|
|
||||||
|
|
||||||
A simple conversational bot that responds to greetings and commands:
|
|
||||||
|
|
||||||
- Responds to keywords like "hello", "how are you", "goodbye"
|
|
||||||
- Provides time information when asked
|
|
||||||
- Tells jokes on request
|
|
||||||
- Handles direct mentions intelligently
|
|
||||||
|
|
||||||
Example interactions:
|
|
||||||
- User: "hello" → Bot: "Hi there!"
|
|
||||||
- User: "time" → Bot: "Let me check... it's currently 2025-09-03 23:45:12"
|
|
||||||
- User: "joke" → Bot: "Why don't scientists trust atoms? Because they make up everything!"
|
|
||||||
|
|
||||||
### 2. Enhanced Whisper Bot (`bots/whisper.py`)
|
|
||||||
|
|
||||||
The existing speech recognition bot now also handles chat commands:
|
|
||||||
|
|
||||||
- Responds to messages starting with "whisper:"
|
|
||||||
- Provides help and status information
|
|
||||||
- Echoes back commands for demonstration
|
|
||||||
|
|
||||||
Example interactions:
|
|
||||||
- User: "whisper: hello" → Bot: "Hello UserName! I'm the Whisper speech recognition bot."
|
|
||||||
- User: "whisper: help" → Bot: "I can process speech and respond to simple commands..."
|
|
||||||
- User: "whisper: status" → Bot: "Whisper bot is running and ready to process audio and chat messages."
|
|
||||||
|
|
||||||
## Server Integration
|
|
||||||
|
|
||||||
The server (`server/main.py`) already handles chat messages through WebSocket:
|
|
||||||
|
|
||||||
1. **Receiving messages**: `send_chat_message` message type
|
|
||||||
2. **Broadcasting**: `broadcast_chat_message` method distributes messages to all lobby participants
|
|
||||||
3. **Storage**: Messages are stored in lobby's `chat_messages` list
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
The implementation has been tested with:
|
|
||||||
|
|
||||||
1. **Bot Discovery**: All bots are correctly discovered with chat capabilities detected
|
|
||||||
2. **Message Processing**: Both chatbot and whisper bot respond correctly to test messages
|
|
||||||
3. **Integration**: The WebRTC signaling client properly routes messages to bot handlers
|
|
||||||
|
|
||||||
Test results:
|
|
||||||
```
|
|
||||||
Discovered 3 bots:
|
|
||||||
Bot: chatbot
|
|
||||||
Has chat handler: True
|
|
||||||
Bot: synthetic_media
|
|
||||||
Has chat handler: False
|
|
||||||
Bot: whisper
|
|
||||||
Has chat handler: True
|
|
||||||
|
|
||||||
Chat functionality test:
|
|
||||||
- Chatbot response to "hello": "Hey!"
|
|
||||||
- Whisper response to "whisper: hello": "Hello TestUser! I'm the Whisper speech recognition bot."
|
|
||||||
✅ Chat functionality test completed!
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
### For Bot Developers
|
|
||||||
|
|
||||||
To add chat capabilities to a bot:
|
|
||||||
|
|
||||||
1. Import the required types:
|
|
||||||
```python
|
|
||||||
from typing import Dict, Optional, Callable, Awaitable
|
|
||||||
from shared.models import ChatMessageModel
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Implement the chat handler:
|
|
||||||
```python
|
|
||||||
async def handle_chat_message(
|
|
||||||
chat_message: ChatMessageModel,
|
|
||||||
send_message_func: Callable[[str], Awaitable[None]]
|
|
||||||
) -> Optional[str]:
|
|
||||||
# Your chat logic here
|
|
||||||
if "hello" in chat_message.message.lower():
|
|
||||||
return f"Hello {chat_message.sender_name}!"
|
|
||||||
return None
|
|
||||||
```
|
|
||||||
|
|
||||||
3. The bot orchestrator will automatically detect and wire up the chat handler when the bot joins a lobby.
|
|
||||||
|
|
||||||
### For System Integration
|
|
||||||
|
|
||||||
The chat system integrates seamlessly with the existing voicebot infrastructure:
|
|
||||||
|
|
||||||
1. **No breaking changes** to existing bots without chat handlers
|
|
||||||
2. **Automatic discovery** of chat capabilities
|
|
||||||
3. **Error isolation** - chat handler failures don't affect WebRTC functionality
|
|
||||||
4. **Logging** provides visibility into chat message flow
|
|
||||||
|
|
||||||
## Future Enhancements
|
|
||||||
|
|
||||||
Potential improvements for the chat system:
|
|
||||||
|
|
||||||
1. **Message History**: Bots could access recent chat history
|
|
||||||
2. **Rich Responses**: Support for formatted messages, images, etc.
|
|
||||||
3. **Private Messaging**: Direct messages between participants
|
|
||||||
4. **Chat Commands**: Standardized command parsing framework
|
|
||||||
5. **Persistence**: Long-term storage of chat interactions
|
|
||||||
6. **Analytics**: Message processing metrics and bot performance monitoring
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
The chat integration provides a powerful foundation for creating interactive AI bots that can engage with users through text while maintaining their audio/video capabilities. The implementation is robust, well-tested, and ready for production use.
|
|
@ -1,5 +1,5 @@
|
|||||||
FROM ubuntu:oracular
|
FROM ubuntu:oracular
|
||||||
# Stick with Python3.12
|
# Stick with Python 3.12 (plucky has 3.13)
|
||||||
|
|
||||||
# Install some utilities frequently used
|
# Install some utilities frequently used
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
@ -28,6 +28,20 @@ RUN apt-get update \
|
|||||||
&& apt-get clean \
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
|
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
|
||||||
|
|
||||||
|
# Install Intel graphics runtimes
|
||||||
|
RUN apt-get update \
|
||||||
|
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
|
||||||
|
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
|
||||||
|
&& apt-get update \
|
||||||
|
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
libze-intel-gpu1 \
|
||||||
|
libze1 \
|
||||||
|
intel-ocloc \
|
||||||
|
intel-opencl-icd \
|
||||||
|
xpu-smi \
|
||||||
|
clinfo \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
|
||||||
|
|
||||||
# Install uv using the official Astral script
|
# Install uv using the official Astral script
|
||||||
RUN curl -Ls https://astral.sh/uv/install.sh | bash
|
RUN curl -Ls https://astral.sh/uv/install.sh | bash
|
||||||
|
@ -1,190 +0,0 @@
|
|||||||
"""
|
|
||||||
Documentation for the Server Refactoring Step 1 Implementation
|
|
||||||
|
|
||||||
This document outlines what was accomplished in Step 1 of the server refactoring
|
|
||||||
and how to verify the implementation works.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# STEP 1 IMPLEMENTATION SUMMARY
|
|
||||||
|
|
||||||
## What Was Accomplished
|
|
||||||
|
|
||||||
### 1. Created Modular Architecture
|
|
||||||
- **server/core/**: Core business logic modules
|
|
||||||
- `session_manager.py`: Session lifecycle and persistence
|
|
||||||
- `lobby_manager.py`: Lobby management and chat functionality
|
|
||||||
- `auth_manager.py`: Authentication and name protection
|
|
||||||
|
|
||||||
- **server/models/**: Event system and data models
|
|
||||||
- `events.py`: Event-driven architecture foundation
|
|
||||||
|
|
||||||
- **server/websocket/**: WebSocket handling
|
|
||||||
- `message_handlers.py`: Clean message routing (replaces massive switch statement)
|
|
||||||
- `connection.py`: WebSocket connection management
|
|
||||||
|
|
||||||
- **server/api/**: HTTP API endpoints
|
|
||||||
- `admin.py`: Admin endpoints (extracted from main.py)
|
|
||||||
- `sessions.py`: Session management endpoints
|
|
||||||
- `lobbies.py`: Lobby management endpoints
|
|
||||||
|
|
||||||
### 2. Key Improvements
|
|
||||||
- **Separation of Concerns**: Each module has a single responsibility
|
|
||||||
- **Event-Driven Architecture**: Decoupled communication between components
|
|
||||||
- **Clean Message Routing**: Replaced 200+ line switch statement with handler pattern
|
|
||||||
- **Thread Safety**: Proper locking and state management
|
|
||||||
- **Type Safety**: Better type annotations and error handling
|
|
||||||
- **Testability**: Modules can be tested independently
|
|
||||||
|
|
||||||
### 3. Backward Compatibility
|
|
||||||
- All existing endpoints work unchanged
|
|
||||||
- Same WebSocket message protocols
|
|
||||||
- Same session/lobby behavior
|
|
||||||
- Same authentication mechanisms
|
|
||||||
|
|
||||||
## File Structure Created
|
|
||||||
|
|
||||||
```
|
|
||||||
server/
|
|
||||||
├── main_refactored.py # New main file using modular architecture
|
|
||||||
├── core/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── session_manager.py # Session lifecycle management
|
|
||||||
│ ├── lobby_manager.py # Lobby and chat management
|
|
||||||
│ └── auth_manager.py # Authentication and passwords
|
|
||||||
├── websocket/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── message_handlers.py # WebSocket message routing
|
|
||||||
│ └── connection.py # Connection management
|
|
||||||
├── api/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── admin.py # Admin HTTP endpoints
|
|
||||||
│ ├── sessions.py # Session HTTP endpoints
|
|
||||||
│ └── lobbies.py # Lobby HTTP endpoints
|
|
||||||
└── models/
|
|
||||||
├── __init__.py
|
|
||||||
└── events.py # Event system
|
|
||||||
```
|
|
||||||
|
|
||||||
## How to Test/Verify
|
|
||||||
|
|
||||||
### 1. Syntax Verification
|
|
||||||
The modules can be imported and instantiated:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# In server/ directory:
|
|
||||||
python3 -c "
|
|
||||||
import sys; sys.path.append('.')
|
|
||||||
from core.session_manager import SessionManager
|
|
||||||
from core.lobby_manager import LobbyManager
|
|
||||||
from core.auth_manager import AuthManager
|
|
||||||
print('✓ All modules import successfully')
|
|
||||||
"
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Basic Functionality Test
|
|
||||||
```python
|
|
||||||
# Test basic object creation (no FastAPI dependencies)
|
|
||||||
python3 -c "
|
|
||||||
import sys; sys.path.append('.')
|
|
||||||
from core.auth_manager import AuthManager
|
|
||||||
auth = AuthManager()
|
|
||||||
auth.set_password('test', 'password')
|
|
||||||
assert auth.verify_password('test', 'password')
|
|
||||||
assert not auth.verify_password('test', 'wrong')
|
|
||||||
print('✓ AuthManager works correctly')
|
|
||||||
"
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Server Startup Test
|
|
||||||
To test the full refactored server:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start the refactored server
|
|
||||||
cd server/
|
|
||||||
python3 main_refactored.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected output:
|
|
||||||
```
|
|
||||||
INFO - Starting AI Voice Bot server with modular architecture...
|
|
||||||
INFO - Loaded 0 sessions from sessions.json
|
|
||||||
INFO - AI Voice Bot server started successfully!
|
|
||||||
INFO - Server URL: /
|
|
||||||
INFO - Sessions loaded: 0
|
|
||||||
INFO - Lobbies available: 0
|
|
||||||
INFO - Protected names: 0
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. API Endpoints Test
|
|
||||||
```bash
|
|
||||||
# Test health endpoint
|
|
||||||
curl http://localhost:8000/api/system/health
|
|
||||||
|
|
||||||
# Expected response:
|
|
||||||
{
|
|
||||||
"status": "ok",
|
|
||||||
"architecture": "modular",
|
|
||||||
"version": "2.0.0",
|
|
||||||
"managers": {
|
|
||||||
"session_manager": "active",
|
|
||||||
"lobby_manager": "active",
|
|
||||||
"auth_manager": "active",
|
|
||||||
"websocket_manager": "active"
|
|
||||||
},
|
|
||||||
"statistics": {
|
|
||||||
"sessions": 0,
|
|
||||||
"lobbies": 0,
|
|
||||||
"protected_names": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Benefits Achieved
|
|
||||||
|
|
||||||
### Maintainability
|
|
||||||
- **Reduced Complexity**: Original 2300-line main.py split into focused modules
|
|
||||||
- **Clear Dependencies**: Each module has explicit dependencies
|
|
||||||
- **Easier Debugging**: Issues can be isolated to specific modules
|
|
||||||
|
|
||||||
### Testability
|
|
||||||
- **Unit Testing**: Each module can be tested independently
|
|
||||||
- **Mocking**: Dependencies can be easily mocked for testing
|
|
||||||
- **Integration Testing**: Components can be tested together
|
|
||||||
|
|
||||||
### Developer Experience
|
|
||||||
- **Code Navigation**: Easy to find relevant functionality
|
|
||||||
- **Onboarding**: New developers can understand individual components
|
|
||||||
- **Documentation**: Smaller modules are easier to document
|
|
||||||
|
|
||||||
### Scalability
|
|
||||||
- **Event System**: Enables loose coupling and async processing
|
|
||||||
- **Modular Growth**: New features can be added without touching core logic
|
|
||||||
- **Performance**: Better separation allows for targeted optimizations
|
|
||||||
|
|
||||||
## Next Steps (Future Phases)
|
|
||||||
|
|
||||||
### Phase 2: Complete WebSocket Extraction
|
|
||||||
- Extract remaining WebSocket message types (WebRTC signaling)
|
|
||||||
- Add comprehensive error handling
|
|
||||||
- Implement message validation
|
|
||||||
|
|
||||||
### Phase 3: Enhanced Event System
|
|
||||||
- Add event persistence for reliability
|
|
||||||
- Implement event replay capabilities
|
|
||||||
- Add monitoring and metrics
|
|
||||||
|
|
||||||
### Phase 4: Advanced Features
|
|
||||||
- Plugin architecture for bots
|
|
||||||
- Rate limiting and security enhancements
|
|
||||||
- Advanced admin capabilities
|
|
||||||
|
|
||||||
## Migration Path
|
|
||||||
|
|
||||||
The refactored architecture can be adopted gradually:
|
|
||||||
|
|
||||||
1. **Testing**: Use `main_refactored.py` in development
|
|
||||||
2. **Validation**: Verify all functionality works correctly
|
|
||||||
3. **Deployment**: Replace `main.py` with `main_refactored.py`
|
|
||||||
4. **Cleanup**: Remove old monolithic code after verification
|
|
||||||
|
|
||||||
The modular design ensures that each component can evolve independently while maintaining system stability.
|
|
@ -1,153 +0,0 @@
|
|||||||
🎉 SERVER REFACTORING STEP 1 - SUCCESSFULLY COMPLETED!
|
|
||||||
|
|
||||||
## Summary of Implementation
|
|
||||||
|
|
||||||
### ✅ What Was Accomplished
|
|
||||||
|
|
||||||
**1. Modular Architecture Created**
|
|
||||||
```
|
|
||||||
server/
|
|
||||||
├── core/ # Business logic modules
|
|
||||||
│ ├── session_manager.py # Session lifecycle & persistence
|
|
||||||
│ ├── lobby_manager.py # Lobby management & chat
|
|
||||||
│ └── auth_manager.py # Authentication & passwords
|
|
||||||
├── websocket/ # WebSocket handling
|
|
||||||
│ ├── message_handlers.py # Message routing (replaces switch statement)
|
|
||||||
│ └── connection.py # Connection management
|
|
||||||
├── api/ # HTTP endpoints
|
|
||||||
│ ├── admin.py # Admin endpoints
|
|
||||||
│ ├── sessions.py # Session endpoints
|
|
||||||
│ └── lobbies.py # Lobby endpoints
|
|
||||||
├── models/ # Events & data models
|
|
||||||
│ └── events.py # Event-driven architecture
|
|
||||||
└── main_refactored.py # New modular main file
|
|
||||||
```
|
|
||||||
|
|
||||||
**2. Key Improvements Achieved**
|
|
||||||
- ✅ **Separation of Concerns**: 2300-line monolith split into focused modules
|
|
||||||
- ✅ **Event-Driven Architecture**: Decoupled communication via event bus
|
|
||||||
- ✅ **Clean Message Routing**: Replaced massive switch statement with handler pattern
|
|
||||||
- ✅ **Thread Safety**: Proper locking and state management maintained
|
|
||||||
- ✅ **Dependency Injection**: Managers can be configured and swapped
|
|
||||||
- ✅ **Testability**: Each module can be tested independently
|
|
||||||
|
|
||||||
**3. Backward Compatibility Maintained**
|
|
||||||
- ✅ **Same API endpoints**: All existing HTTP endpoints work unchanged
|
|
||||||
- ✅ **Same WebSocket protocol**: All message types work identically
|
|
||||||
- ✅ **Same authentication**: Password and name protection unchanged
|
|
||||||
- ✅ **Same session persistence**: Existing sessions.json format preserved
|
|
||||||
|
|
||||||
### 🧪 Verification Results
|
|
||||||
|
|
||||||
**Architecture Structure**: ✅ All directories and files created correctly
|
|
||||||
**Module Imports**: ✅ All core modules import successfully in proper environment
|
|
||||||
**Server Startup**: ✅ Refactored server starts and initializes all components
|
|
||||||
**Session Loading**: ✅ Successfully loaded 4 existing sessions from disk
|
|
||||||
**Background Tasks**: ✅ Cleanup and validation tasks start properly
|
|
||||||
**Session Integrity**: ✅ Detected and logged duplicate session names
|
|
||||||
**Graceful Shutdown**: ✅ All components shut down cleanly
|
|
||||||
|
|
||||||
### 📊 Test Results
|
|
||||||
|
|
||||||
```
|
|
||||||
INFO - Starting AI Voice Bot server with modular architecture...
|
|
||||||
INFO - Loaded 4 sessions from sessions.json
|
|
||||||
INFO - Starting session background tasks...
|
|
||||||
INFO - AI Voice Bot server started successfully!
|
|
||||||
INFO - Server URL: /ai-voicebot/
|
|
||||||
INFO - Sessions loaded: 4
|
|
||||||
INFO - Lobbies available: 0
|
|
||||||
INFO - Protected names: 0
|
|
||||||
INFO - Session background tasks started
|
|
||||||
```
|
|
||||||
|
|
||||||
**Session Integrity Validation Working**:
|
|
||||||
```
|
|
||||||
WARNING - Session integrity issues found: 3 issues
|
|
||||||
WARNING - Integrity issue: Duplicate name 'whisper-bot' found in 3 sessions
|
|
||||||
```
|
|
||||||
|
|
||||||
### 🔧 Technical Achievements
|
|
||||||
|
|
||||||
**1. SessionManager**
|
|
||||||
- Extracted all session lifecycle management
|
|
||||||
- Background cleanup and validation tasks
|
|
||||||
- Thread-safe operations with proper locking
|
|
||||||
- Event publishing for session state changes
|
|
||||||
|
|
||||||
**2. LobbyManager**
|
|
||||||
- Extracted lobby creation and management
|
|
||||||
- Chat message handling and persistence
|
|
||||||
- Event-driven participant updates
|
|
||||||
- Automatic empty lobby cleanup
|
|
||||||
|
|
||||||
**3. AuthManager**
|
|
||||||
- Extracted password hashing and verification
|
|
||||||
- Name protection and takeover logic
|
|
||||||
- Integrity validation for auth data
|
|
||||||
- Clean separation from session logic
|
|
||||||
|
|
||||||
**4. WebSocket Message Router**
|
|
||||||
- Replaced 200+ line switch statement
|
|
||||||
- Handler pattern for clean message processing
|
|
||||||
- Easy to extend with new message types
|
|
||||||
- Proper error handling and validation
|
|
||||||
|
|
||||||
**5. Event System**
|
|
||||||
- Decoupled component communication
|
|
||||||
- Async event processing
|
|
||||||
- Error isolation and logging
|
|
||||||
- Foundation for future enhancements
|
|
||||||
|
|
||||||
### 🚀 Benefits Realized
|
|
||||||
|
|
||||||
**Maintainability**
|
|
||||||
- Code is now organized into logical, focused modules
|
|
||||||
- Much easier to locate and modify specific functionality
|
|
||||||
- Reduced cognitive load when working on individual features
|
|
||||||
|
|
||||||
**Testability**
|
|
||||||
- Each module can be unit tested independently
|
|
||||||
- Dependencies can be mocked easily
|
|
||||||
- Integration tests can focus on specific interactions
|
|
||||||
|
|
||||||
**Scalability**
|
|
||||||
- Event system enables loose coupling
|
|
||||||
- New features can be added without touching core logic
|
|
||||||
- Components can be optimized independently
|
|
||||||
|
|
||||||
**Developer Experience**
|
|
||||||
- New developers can understand individual components
|
|
||||||
- Clear separation of responsibilities
|
|
||||||
- Better error messages and logging
|
|
||||||
|
|
||||||
### 🎯 Next Steps (Future Phases)
|
|
||||||
|
|
||||||
**Phase 2: Complete WebSocket Extraction**
|
|
||||||
- Extract WebRTC signaling handlers
|
|
||||||
- Add comprehensive message validation
|
|
||||||
- Implement rate limiting
|
|
||||||
|
|
||||||
**Phase 3: Enhanced Event System**
|
|
||||||
- Add event persistence
|
|
||||||
- Implement event replay capabilities
|
|
||||||
- Add metrics and monitoring
|
|
||||||
|
|
||||||
**Phase 4: Advanced Features**
|
|
||||||
- Plugin architecture for bots
|
|
||||||
- Advanced admin capabilities
|
|
||||||
- Performance optimizations
|
|
||||||
|
|
||||||
### 🏁 Conclusion
|
|
||||||
|
|
||||||
**Step 1 of the server refactoring is COMPLETE and SUCCESSFUL!**
|
|
||||||
|
|
||||||
The monolithic `main.py` has been successfully transformed into a clean, modular architecture that:
|
|
||||||
- Maintains 100% backward compatibility
|
|
||||||
- Significantly improves code organization
|
|
||||||
- Provides a solid foundation for future development
|
|
||||||
- Reduces maintenance burden and technical debt
|
|
||||||
|
|
||||||
The refactored server is ready for production use and provides a much better foundation for continued development and feature additions.
|
|
||||||
|
|
||||||
**Ready to proceed to Phase 2 or continue with other improvements! 🚀**
|
|
@ -1,168 +0,0 @@
|
|||||||
# OpenAPI TypeScript Generation
|
|
||||||
|
|
||||||
This project now supports automatic TypeScript type generation from the FastAPI server's Pydantic models using OpenAPI schema generation.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The implementation follows the "OpenAPI Schema Generation (Recommended for FastAPI)" approach:
|
|
||||||
|
|
||||||
1. **Server-side**: FastAPI automatically generates OpenAPI schema from Pydantic models
|
|
||||||
2. **Generation**: Python script extracts the schema and saves it as JSON
|
|
||||||
3. **TypeScript**: `openapi-typescript` converts the schema to TypeScript types
|
|
||||||
4. **Client**: Typed API client provides type-safe server communication
|
|
||||||
|
|
||||||
## Generated Files
|
|
||||||
|
|
||||||
- `client/openapi-schema.json` - OpenAPI schema extracted from FastAPI
|
|
||||||
- `client/src/api-types.ts` - TypeScript interfaces generated from OpenAPI schema
|
|
||||||
- `client/src/api-client.ts` - Typed API client with convenience methods
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
### 1. Schema Generation
|
|
||||||
The `server/generate_schema_simple.py` script:
|
|
||||||
- Imports the FastAPI app from `main.py`
|
|
||||||
- Extracts the OpenAPI schema using `app.openapi()`
|
|
||||||
- Saves the schema as JSON in `client/openapi-schema.json`
|
|
||||||
|
|
||||||
### 2. TypeScript Generation
|
|
||||||
The `openapi-typescript` package:
|
|
||||||
- Reads the OpenAPI schema JSON
|
|
||||||
- Generates TypeScript interfaces in `client/src/api-types.ts`
|
|
||||||
- Creates type-safe definitions for all Pydantic models
|
|
||||||
|
|
||||||
### 3. API Client
|
|
||||||
The `client/src/api-client.ts` file provides:
|
|
||||||
- Type-safe API client class
|
|
||||||
- Convenience functions for each endpoint
|
|
||||||
- Proper error handling with custom `ApiError` class
|
|
||||||
- Re-exported types for easy importing
|
|
||||||
|
|
||||||
## Usage in React Components
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
import { apiClient, adminApi, healthApi, lobbiesApi, sessionsApi } from './api-client';
|
|
||||||
import type { LobbyModel, SessionModel, AdminSetPassword, LobbyCreateRequest } from './api-client';
|
|
||||||
|
|
||||||
// Using the convenience APIs
|
|
||||||
const healthStatus = await healthApi.check();
|
|
||||||
const lobbies = await lobbiesApi.getAll();
|
|
||||||
const session = await sessionsApi.getCurrent();
|
|
||||||
|
|
||||||
// Using the main client
|
|
||||||
const adminNames = await apiClient.adminListNames();
|
|
||||||
|
|
||||||
// With type safety for request data
|
|
||||||
const passwordData: AdminSetPassword = {
|
|
||||||
name: "admin",
|
|
||||||
password: "newpassword"
|
|
||||||
};
|
|
||||||
const result = await adminApi.setPassword(passwordData);
|
|
||||||
|
|
||||||
// Type-safe lobby creation
|
|
||||||
const lobbyRequest: LobbyCreateRequest = {
|
|
||||||
type: "lobby_create",
|
|
||||||
data: {
|
|
||||||
name: "My Lobby",
|
|
||||||
private: false
|
|
||||||
}
|
|
||||||
};
|
|
||||||
const newLobby = await sessionsApi.createLobby("session-id", lobbyRequest);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Regenerating Types
|
|
||||||
|
|
||||||
### Manual Generation
|
|
||||||
```bash
|
|
||||||
# Generate schema from server
|
|
||||||
docker compose exec server uv run python3 generate_schema_simple.py
|
|
||||||
|
|
||||||
# Generate TypeScript types
|
|
||||||
docker compose exec client npx openapi-typescript openapi-schema.json -o src/api-types.ts
|
|
||||||
|
|
||||||
# Type check
|
|
||||||
docker compose exec client npm run type-check
|
|
||||||
```
|
|
||||||
|
|
||||||
### Automated Generation
|
|
||||||
```bash
|
|
||||||
# Run the comprehensive generation script
|
|
||||||
./generate-ts-types.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
### NPM Scripts (in frontend container)
|
|
||||||
```bash
|
|
||||||
# Generate just the schema
|
|
||||||
npm run generate-schema
|
|
||||||
|
|
||||||
# Generate just the TypeScript types (requires schema to exist)
|
|
||||||
npm run generate-types
|
|
||||||
|
|
||||||
# Generate both schema and types
|
|
||||||
npm run generate-api-types
|
|
||||||
```
|
|
||||||
|
|
||||||
## Development Workflow
|
|
||||||
|
|
||||||
1. **Modify Pydantic models** in `shared/models.py`
|
|
||||||
2. **Regenerate types** using one of the methods above
|
|
||||||
3. **Update React components** to use the new types
|
|
||||||
4. **Type check** to ensure everything compiles
|
|
||||||
|
|
||||||
## Benefits
|
|
||||||
|
|
||||||
- ✅ **Type Safety**: Full TypeScript type checking for API requests/responses
|
|
||||||
- ✅ **Auto-completion**: IDE support with auto-complete for API methods and data structures
|
|
||||||
- ✅ **Error Prevention**: Catch type mismatches at compile time
|
|
||||||
- ✅ **Documentation**: Self-documenting API with TypeScript interfaces
|
|
||||||
- ✅ **Sync Guarantee**: Types are always in sync with server models
|
|
||||||
- ✅ **Refactoring Safety**: IDE can safely refactor across frontend/backend
|
|
||||||
|
|
||||||
## File Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
server/
|
|
||||||
├── main.py # FastAPI app with Pydantic models
|
|
||||||
├── generate_schema_simple.py # Schema extraction script
|
|
||||||
└── generate_api_client.py # Enhanced generator (backup)
|
|
||||||
|
|
||||||
shared/
|
|
||||||
└── models.py # Pydantic models (source of truth)
|
|
||||||
|
|
||||||
client/
|
|
||||||
├── openapi-schema.json # Generated OpenAPI schema
|
|
||||||
├── package.json # Updated with openapi-typescript dependency
|
|
||||||
└── src/
|
|
||||||
├── api-types.ts # Generated TypeScript interfaces
|
|
||||||
└── api-client.ts # Typed API client
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Container Issues
|
|
||||||
If the frontend container has dependency conflicts:
|
|
||||||
```bash
|
|
||||||
# Rebuild the frontend container
|
|
||||||
docker compose build client
|
|
||||||
docker compose up -d client
|
|
||||||
```
|
|
||||||
|
|
||||||
### TypeScript Errors
|
|
||||||
Ensure the generated types are up to date:
|
|
||||||
```bash
|
|
||||||
./generate-ts-types.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
### Module Not Found Errors
|
|
||||||
Check that the volume mounts are working correctly and files are synced between host and container.
|
|
||||||
|
|
||||||
## API Evolution Detection
|
|
||||||
|
|
||||||
The system now includes automatic detection of API changes:
|
|
||||||
|
|
||||||
- **Automatic Checking**: In development mode, the system automatically warns about unimplemented endpoints
|
|
||||||
- **Console Warnings**: Clear warnings appear in the browser console when new API endpoints are available
|
|
||||||
- **Implementation Stubs**: Provides ready-to-use code stubs for new endpoints
|
|
||||||
- **Schema Monitoring**: Detects when the OpenAPI schema changes
|
|
||||||
|
|
||||||
See `client/src/API_EVOLUTION.md` for detailed documentation on using this feature.
|
|
@ -140,10 +140,6 @@ class Session:
|
|||||||
self.bot_instance_id: Optional[str] = None # Bot instance ID for tracking
|
self.bot_instance_id: Optional[str] = None # Bot instance ID for tracking
|
||||||
self.session_lock = threading.RLock() # Instance-level lock
|
self.session_lock = threading.RLock() # Instance-level lock
|
||||||
|
|
||||||
def is_bot(self) -> bool:
|
|
||||||
"""Check if this session represents a bot"""
|
|
||||||
return bool(self.bot_run_id or self.bot_provider_id or self.bot_instance_id)
|
|
||||||
|
|
||||||
def getName(self) -> str:
|
def getName(self) -> str:
|
||||||
with self.session_lock:
|
with self.session_lock:
|
||||||
return f"{self.short}:{self.name if self.name else '[ ---- ]'}"
|
return f"{self.short}:{self.name if self.name else '[ ---- ]'}"
|
||||||
@ -405,10 +401,6 @@ class SessionManager:
|
|||||||
with self.lock:
|
with self.lock:
|
||||||
sessions_list: List[SessionSaved] = []
|
sessions_list: List[SessionSaved] = []
|
||||||
for s in self._instances:
|
for s in self._instances:
|
||||||
# Skip bot sessions - they should not be persisted
|
|
||||||
# Bot sessions are managed by the voicebot service lifecycle
|
|
||||||
if s.bot_instance_id is not None or s.bot_run_id is not None or s.bot_provider_id is not None:
|
|
||||||
continue
|
|
||||||
sessions_list.append(s.to_saved())
|
sessions_list.append(s.to_saved())
|
||||||
|
|
||||||
# Note: We'll need to handle name_passwords separately or inject it
|
# Note: We'll need to handle name_passwords separately or inject it
|
||||||
|
@ -104,12 +104,12 @@ logger.info(f"Starting server with public URL: {public_url}")
|
|||||||
|
|
||||||
|
|
||||||
# Global managers - these replace the global variables from original main.py
|
# Global managers - these replace the global variables from original main.py
|
||||||
session_manager: SessionManager = None
|
session_manager: SessionManager | None = None
|
||||||
lobby_manager: LobbyManager = None
|
lobby_manager: LobbyManager | None = None
|
||||||
auth_manager: AuthManager = None
|
auth_manager: AuthManager | None = None
|
||||||
bot_manager: BotManager = None
|
bot_manager: BotManager | None = None
|
||||||
bot_config_manager: BotConfigManager = None
|
bot_config_manager: BotConfigManager | None = None
|
||||||
websocket_manager: WebSocketConnectionManager = None
|
websocket_manager: WebSocketConnectionManager | None = None
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
|
@ -1,302 +0,0 @@
|
|||||||
# AI Voicebot
|
|
||||||
|
|
||||||
A WebRTC-enabled AI voicebot system with speech recognition and synthetic media capabilities. The voicebot can run in two modes: as a client connecting to lobbies or as a provider serving bots to other applications.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Speech Recognition**: Uses Whisper models for real-time audio transcription
|
|
||||||
- **Synthetic Media**: Generates animated video and audio tracks
|
|
||||||
- **WebRTC Integration**: Real-time peer-to-peer communication
|
|
||||||
- **Bot Provider System**: Can register with a main server to provide bot services
|
|
||||||
- **Flexible Deployment**: Docker-based with development and production modes
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### Prerequisites
|
|
||||||
|
|
||||||
- Docker and Docker Compose
|
|
||||||
- Python 3.12+ (if running locally)
|
|
||||||
- Access to a compatible signaling server
|
|
||||||
|
|
||||||
### Running with Docker
|
|
||||||
|
|
||||||
#### 1. Bot Provider Mode (Recommended)
|
|
||||||
|
|
||||||
Run the voicebot as a bot provider that registers with the main server:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Development mode with auto-reload
|
|
||||||
VOICEBOT_MODE=provider PRODUCTION=false docker-compose up voicebot
|
|
||||||
|
|
||||||
# Production mode
|
|
||||||
VOICEBOT_MODE=provider PRODUCTION=true docker-compose up voicebot
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. Direct Client Mode
|
|
||||||
|
|
||||||
Run the voicebot as a direct client connecting to a lobby:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Development mode
|
|
||||||
VOICEBOT_MODE=client PRODUCTION=false docker-compose up voicebot
|
|
||||||
|
|
||||||
# Production mode
|
|
||||||
VOICEBOT_MODE=client PRODUCTION=true docker-compose up voicebot
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running Locally
|
|
||||||
|
|
||||||
#### 1. Setup Environment
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd voicebot/
|
|
||||||
|
|
||||||
# Create virtual environment
|
|
||||||
uv init --python /usr/bin/python3.12 --name "ai-voicebot-agent"
|
|
||||||
uv add -r requirements.txt
|
|
||||||
|
|
||||||
# Activate environment
|
|
||||||
source .venv/bin/activate
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. Bot Provider Mode
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Development with auto-reload
|
|
||||||
python main.py --mode provider --server-url https://your-server.com/ai-voicebot --reload --insecure
|
|
||||||
|
|
||||||
# Production
|
|
||||||
python main.py --mode provider --server-url https://your-server.com/ai-voicebot
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 3. Direct Client Mode
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python main.py --mode client \
|
|
||||||
--server-url https://your-server.com/ai-voicebot \
|
|
||||||
--lobby "my-lobby" \
|
|
||||||
--session-name "My Bot" \
|
|
||||||
--insecure
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
| Variable | Description | Default | Example |
|
|
||||||
|----------|-------------|---------|---------|
|
|
||||||
| `VOICEBOT_MODE` | Operating mode: `client` or `provider` | `client` | `provider` |
|
|
||||||
| `PRODUCTION` | Production mode flag | `false` | `true` |
|
|
||||||
|
|
||||||
### Command Line Arguments
|
|
||||||
|
|
||||||
#### Common Arguments
|
|
||||||
- `--mode`: Run as `client` or `provider`
|
|
||||||
- `--server-url`: Main server URL
|
|
||||||
- `--insecure`: Allow insecure SSL connections
|
|
||||||
- `--help`: Show all available options
|
|
||||||
|
|
||||||
#### Provider Mode Arguments
|
|
||||||
- `--host`: Host to bind the provider server (default: `0.0.0.0`)
|
|
||||||
- `--port`: Port for the provider server (default: `8788`)
|
|
||||||
- `--reload`: Enable auto-reload for development
|
|
||||||
|
|
||||||
#### Client Mode Arguments
|
|
||||||
- `--lobby`: Lobby name to join (default: `default`)
|
|
||||||
- `--session-name`: Display name for the bot (default: `Python Bot`)
|
|
||||||
- `--session-id`: Existing session ID to reuse
|
|
||||||
- `--password`: Password for protected names
|
|
||||||
- `--private`: Create/join private lobby
|
|
||||||
|
|
||||||
## Available Bots
|
|
||||||
|
|
||||||
The voicebot system includes the following bot types:
|
|
||||||
|
|
||||||
### 1. Whisper Bot
|
|
||||||
- **Name**: `whisper`
|
|
||||||
- **Description**: Speech recognition agent using OpenAI Whisper models
|
|
||||||
- **Capabilities**: Real-time audio transcription, multiple language support
|
|
||||||
- **Models**: Supports various Whisper and Distil-Whisper models
|
|
||||||
|
|
||||||
### 2. Synthetic Media Bot
|
|
||||||
- **Name**: `synthetic_media`
|
|
||||||
- **Description**: Generates animated video and audio tracks
|
|
||||||
- **Capabilities**: Animated video generation, synthetic audio, edge detection on incoming video
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
### Bot Provider System
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
|
||||||
│ Main Server │ │ Bot Provider │ │ Client App │
|
|
||||||
│ │◄───┤ (Voicebot) │ │ │
|
|
||||||
│ - Bot Registry │ │ - Whisper Bot │ │ - Bot Manager │
|
|
||||||
│ - Lobby Management │ - Synthetic Bot │ │ - UI Controls │
|
|
||||||
│ - API Endpoints │ │ - API Server │ │ - Lobby View │
|
|
||||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Flow
|
|
||||||
1. Voicebot registers as bot provider with main server
|
|
||||||
2. Main server discovers available bots from providers
|
|
||||||
3. Client requests bot to join lobby via main server
|
|
||||||
4. Main server forwards request to appropriate provider
|
|
||||||
5. Provider creates bot instance that connects to the lobby
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
### Auto-Reload
|
|
||||||
|
|
||||||
In development mode, the bot provider supports auto-reload using uvicorn:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Watches /voicebot and /shared directories for changes
|
|
||||||
python main.py --mode provider --reload
|
|
||||||
```
|
|
||||||
|
|
||||||
### Adding New Bots
|
|
||||||
|
|
||||||
1. Create a new module in `voicebot/bots/`
|
|
||||||
2. Implement required functions:
|
|
||||||
```python
|
|
||||||
def agent_info() -> dict:
|
|
||||||
return {"name": "my_bot", "description": "My custom bot"}
|
|
||||||
|
|
||||||
def create_agent_tracks(session_name: str) -> dict:
|
|
||||||
# Return MediaStreamTrack instances
|
|
||||||
return {"audio": my_audio_track, "video": my_video_track}
|
|
||||||
```
|
|
||||||
3. The bot will be automatically discovered and available
|
|
||||||
|
|
||||||
### Testing
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Test bot discovery
|
|
||||||
python test_bot_api.py
|
|
||||||
|
|
||||||
# Test client connection
|
|
||||||
python main.py --mode client --lobby test --session-name "Test Bot"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Production Deployment
|
|
||||||
|
|
||||||
### Docker Compose
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
version: '3.8'
|
|
||||||
services:
|
|
||||||
voicebot-provider:
|
|
||||||
build: .
|
|
||||||
environment:
|
|
||||||
- VOICEBOT_MODE=provider
|
|
||||||
- PRODUCTION=true
|
|
||||||
ports:
|
|
||||||
- "8788:8788"
|
|
||||||
volumes:
|
|
||||||
- ./cache:/voicebot/cache
|
|
||||||
```
|
|
||||||
|
|
||||||
### Kubernetes
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: voicebot-provider
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: voicebot-provider
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: voicebot-provider
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: voicebot
|
|
||||||
image: ai-voicebot:latest
|
|
||||||
env:
|
|
||||||
- name: VOICEBOT_MODE
|
|
||||||
value: "provider"
|
|
||||||
- name: PRODUCTION
|
|
||||||
value: "true"
|
|
||||||
ports:
|
|
||||||
- containerPort: 8788
|
|
||||||
```
|
|
||||||
|
|
||||||
## API Reference
|
|
||||||
|
|
||||||
### Bot Provider Endpoints
|
|
||||||
|
|
||||||
The voicebot provider exposes the following HTTP API:
|
|
||||||
|
|
||||||
- `GET /bots` - List available bots
|
|
||||||
- `POST /bots/{bot_name}/join` - Request bot to join lobby
|
|
||||||
- `GET /bots/runs` - List active bot instances
|
|
||||||
- `POST /bots/runs/{run_id}/stop` - Stop a bot instance
|
|
||||||
|
|
||||||
### Example API Usage
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# List available bots
|
|
||||||
curl http://localhost:8788/bots
|
|
||||||
|
|
||||||
# Request whisper bot to join lobby
|
|
||||||
curl -X POST http://localhost:8788/bots/whisper/join \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"lobby_id": "lobby-123",
|
|
||||||
"session_id": "session-456",
|
|
||||||
"nick": "Speech Bot",
|
|
||||||
"server_url": "https://server.com/ai-voicebot"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Common Issues
|
|
||||||
|
|
||||||
**Bot provider not registering:**
|
|
||||||
- Check server URL is correct and accessible
|
|
||||||
- Verify network connectivity between provider and server
|
|
||||||
- Check logs for registration errors
|
|
||||||
|
|
||||||
**Auto-reload not working:**
|
|
||||||
- Ensure `--reload` flag is used in development
|
|
||||||
- Check file permissions on watched directories
|
|
||||||
- Verify uvicorn version supports reload functionality
|
|
||||||
|
|
||||||
**WebRTC connection issues:**
|
|
||||||
- Check STUN/TURN server configuration
|
|
||||||
- Verify network ports are not blocked
|
|
||||||
- Check browser console for ICE connection errors
|
|
||||||
|
|
||||||
### Logs
|
|
||||||
|
|
||||||
Logs are written to stdout and include:
|
|
||||||
- Bot registration status
|
|
||||||
- WebRTC connection events
|
|
||||||
- Media track creation/destruction
|
|
||||||
- API request/response details
|
|
||||||
|
|
||||||
### Debug Mode
|
|
||||||
|
|
||||||
Enable verbose logging:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python main.py --mode provider --server-url https://server.com --debug
|
|
||||||
```
|
|
||||||
|
|
||||||
## Contributing
|
|
||||||
|
|
||||||
1. Fork the repository
|
|
||||||
2. Create a feature branch
|
|
||||||
3. Make your changes
|
|
||||||
4. Add tests for new functionality
|
|
||||||
5. Submit a pull request
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@ -1,82 +0,0 @@
|
|||||||
# Voicebot Module Refactoring
|
|
||||||
|
|
||||||
The voicebot/main.py functionality has been broken down into individual Python files for better organization and maintainability:
|
|
||||||
|
|
||||||
## New File Structure
|
|
||||||
|
|
||||||
### Core Modules
|
|
||||||
|
|
||||||
1. **`models.py`** - Data models and configuration
|
|
||||||
- `VoicebotArgs` - Pydantic model for CLI arguments and configuration
|
|
||||||
- `VoicebotMode` - Enum for client/provider modes
|
|
||||||
- `Peer` - WebRTC peer representation
|
|
||||||
- `JoinRequest` - Request model for joining lobbies
|
|
||||||
- `MessageData` - Type alias for message payloads
|
|
||||||
|
|
||||||
2. **`webrtc_signaling.py`** - WebRTC signaling client functionality
|
|
||||||
- `WebRTCSignalingClient` - Main WebRTC signaling client class
|
|
||||||
- Handles peer connection management, ICE candidates, session descriptions
|
|
||||||
- Registration status tracking and reconnection logic
|
|
||||||
- Message processing and event handling
|
|
||||||
|
|
||||||
3. **`session_manager.py`** - Session and lobby management
|
|
||||||
- `create_or_get_session()` - Session creation/retrieval
|
|
||||||
- `create_or_get_lobby()` - Lobby creation/retrieval
|
|
||||||
- HTTP API communication utilities
|
|
||||||
|
|
||||||
4. **`bot_orchestrator.py`** - FastAPI bot orchestration service
|
|
||||||
- Bot discovery and management
|
|
||||||
- FastAPI endpoints for bot operations
|
|
||||||
- Provider registration with main server
|
|
||||||
- Bot instance lifecycle management
|
|
||||||
|
|
||||||
5. **`client_main.py`** - Main client logic
|
|
||||||
- `main_with_args()` - Core client functionality
|
|
||||||
- `start_client_with_reload()` - Development mode with reload
|
|
||||||
- Event handlers for peer and track management
|
|
||||||
|
|
||||||
6. **`client_app.py`** - Client FastAPI application
|
|
||||||
- `create_client_app()` - Creates FastAPI app for client mode
|
|
||||||
- Health check and status endpoints
|
|
||||||
- Process isolation and locking
|
|
||||||
|
|
||||||
7. **`utils.py`** - Utility functions
|
|
||||||
- URL conversion utilities (`http_base_url`, `ws_url`)
|
|
||||||
- SSL context creation
|
|
||||||
- Network information logging
|
|
||||||
|
|
||||||
8. **`main.py`** - Main orchestration and entry point
|
|
||||||
- Command-line argument parsing
|
|
||||||
- Mode selection (client vs provider)
|
|
||||||
- Entry points for both modes
|
|
||||||
|
|
||||||
### Key Improvements
|
|
||||||
|
|
||||||
- **Separation of Concerns**: Each file handles specific functionality
|
|
||||||
- **Better Maintainability**: Smaller, focused modules are easier to understand and modify
|
|
||||||
- **Reduced Coupling**: Dependencies between components are more explicit
|
|
||||||
- **Type Safety**: Proper type hints and Pydantic models throughout
|
|
||||||
- **Error Handling**: Centralized error handling and logging
|
|
||||||
|
|
||||||
### Usage
|
|
||||||
|
|
||||||
The refactored code maintains the same CLI interface:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Client mode
|
|
||||||
python voicebot/main.py --mode client --server-url http://localhost:8000/ai-voicebot
|
|
||||||
|
|
||||||
# Provider mode
|
|
||||||
python voicebot/main.py --mode provider --host 0.0.0.0 --port 8788
|
|
||||||
```
|
|
||||||
|
|
||||||
### Import Structure
|
|
||||||
|
|
||||||
```python
|
|
||||||
from voicebot import VoicebotArgs, VoicebotMode, WebRTCSignalingClient
|
|
||||||
from voicebot.models import Peer, JoinRequest
|
|
||||||
from voicebot.session_manager import create_or_get_session, create_or_get_lobby
|
|
||||||
from voicebot.client_main import main_with_args
|
|
||||||
```
|
|
||||||
|
|
||||||
The original `main_old.py` contains the monolithic implementation for reference.
|
|
@ -13,7 +13,7 @@ import os
|
|||||||
import gc
|
import gc
|
||||||
import shutil
|
import shutil
|
||||||
from queue import Queue, Empty
|
from queue import Queue, Empty
|
||||||
from typing import Dict, Optional, Callable, Awaitable, Any, cast, List, Union
|
from typing import Dict, Optional, Callable, Awaitable, Any, List, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
from pydantic import BaseModel, Field, ConfigDict
|
from pydantic import BaseModel, Field, ConfigDict
|
||||||
@ -23,7 +23,10 @@ import librosa
|
|||||||
from shared.logger import logger
|
from shared.logger import logger
|
||||||
from aiortc import MediaStreamTrack
|
from aiortc import MediaStreamTrack
|
||||||
from aiortc.mediastreams import MediaStreamError
|
from aiortc.mediastreams import MediaStreamError
|
||||||
from av import AudioFrame
|
from av import AudioFrame, VideoFrame
|
||||||
|
import cv2
|
||||||
|
import fractions
|
||||||
|
from time import perf_counter
|
||||||
|
|
||||||
# Import shared models for chat functionality
|
# Import shared models for chat functionality
|
||||||
import sys
|
import sys
|
||||||
@ -35,24 +38,75 @@ from voicebot.models import Peer
|
|||||||
|
|
||||||
# OpenVINO optimized imports
|
# OpenVINO optimized imports
|
||||||
import openvino as ov
|
import openvino as ov
|
||||||
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
|
from optimum.intel.openvino import OVModelForSpeechSeq2Seq # type: ignore
|
||||||
from transformers import AutoProcessor
|
from transformers import WhisperProcessor
|
||||||
|
from openvino.runtime import Core # Part of optimum.intel.openvino # type: ignore
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
# Import quantization dependencies with error handling
|
# Import quantization dependencies with error handling
|
||||||
try:
|
import nncf # type: ignore
|
||||||
import nncf
|
from optimum.intel.openvino.quantization import InferRequestWrapper # type: ignore
|
||||||
from optimum.intel.openvino.quantization import InferRequestWrapper
|
QUANTIZATION_AVAILABLE = True
|
||||||
QUANTIZATION_AVAILABLE = True
|
|
||||||
except ImportError as e:
|
|
||||||
logger.warning(f"Quantization libraries not available: {e}")
|
|
||||||
QUANTIZATION_AVAILABLE = False
|
|
||||||
|
|
||||||
# Type definitions
|
# Type definitions
|
||||||
AudioArray = npt.NDArray[np.float32]
|
AudioArray = npt.NDArray[np.float32]
|
||||||
ModelConfig = Dict[str, Union[str, int, bool]]
|
ModelConfig = Dict[str, Union[str, int, bool]]
|
||||||
CalibrationData = List[Dict[str, Any]]
|
CalibrationData = List[Dict[str, Any]]
|
||||||
|
|
||||||
|
_device = "GPU.1" # Default to Intel Arc B580 GPU
|
||||||
|
|
||||||
|
def get_available_devices() -> list[dict[str, Any]]:
|
||||||
|
"""List available OpenVINO devices with their properties."""
|
||||||
|
try:
|
||||||
|
core = Core()
|
||||||
|
devices = core.available_devices
|
||||||
|
device_info : list[dict[str, Any]] = []
|
||||||
|
for device in devices:
|
||||||
|
try:
|
||||||
|
# Get device properties
|
||||||
|
properties = core.get_property(device, "FULL_DEVICE_NAME")
|
||||||
|
# Attempt to get additional properties if available
|
||||||
|
try:
|
||||||
|
device_type = core.get_property(device, "DEVICE_TYPE")
|
||||||
|
except Exception:
|
||||||
|
device_type = "N/A"
|
||||||
|
try:
|
||||||
|
capabilities : Any = core.get_property(device, "SUPPORTED_PROPERTIES")
|
||||||
|
except Exception:
|
||||||
|
capabilities = "N/A"
|
||||||
|
device_info.append({
|
||||||
|
"name": device,
|
||||||
|
"full_name": properties,
|
||||||
|
"type": device_type,
|
||||||
|
"capabilities": capabilities
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to retrieve properties for device {device}: {e}")
|
||||||
|
device_info.append({
|
||||||
|
"name": device,
|
||||||
|
"full_name": "Unknown",
|
||||||
|
"type": "N/A",
|
||||||
|
"capabilities": "N/A"
|
||||||
|
})
|
||||||
|
return device_info
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to retrieve available devices: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def print_available_devices(device: str | None = None):
|
||||||
|
"""Print available OpenVINO devices in a formatted manner."""
|
||||||
|
devices = get_available_devices()
|
||||||
|
if not devices:
|
||||||
|
logger.info("No OpenVINO devices detected.")
|
||||||
|
return
|
||||||
|
logger.info("Available OpenVINO Devices:")
|
||||||
|
for d in devices:
|
||||||
|
logger.info(f"- Device: {d.get('name')} {'*' if d.get('name') == device else ''}")
|
||||||
|
logger.info(f" Full Name: {d.get('full_name')}")
|
||||||
|
logger.info(f" Type: {d.get('type')}")
|
||||||
|
|
||||||
|
|
||||||
|
print_available_devices(_device)
|
||||||
|
|
||||||
class AudioQueueItem(BaseModel):
|
class AudioQueueItem(BaseModel):
|
||||||
"""Audio data with timestamp for processing queue."""
|
"""Audio data with timestamp for processing queue."""
|
||||||
@ -75,7 +129,7 @@ class OpenVINOConfig(BaseModel):
|
|||||||
"""OpenVINO configuration for Intel Arc B580 optimization."""
|
"""OpenVINO configuration for Intel Arc B580 optimization."""
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
device: str = Field(default="GPU", description="Target device for inference")
|
device: str = Field(default=_device, description="Target device for inference")
|
||||||
cache_dir: str = Field(default="./ov_cache", description="Cache directory for compiled models")
|
cache_dir: str = Field(default="./ov_cache", description="Cache directory for compiled models")
|
||||||
enable_quantization: bool = Field(default=True, description="Enable INT8 quantization")
|
enable_quantization: bool = Field(default=True, description="Enable INT8 quantization")
|
||||||
throughput_streams: int = Field(default=2, description="Number of inference streams")
|
throughput_streams: int = Field(default=2, description="Number of inference streams")
|
||||||
@ -83,14 +137,36 @@ class OpenVINOConfig(BaseModel):
|
|||||||
|
|
||||||
def to_ov_config(self) -> ModelConfig:
|
def to_ov_config(self) -> ModelConfig:
|
||||||
"""Convert to OpenVINO configuration dictionary."""
|
"""Convert to OpenVINO configuration dictionary."""
|
||||||
return {
|
cfg: ModelConfig = {"CACHE_DIR": self.cache_dir}
|
||||||
"CACHE_DIR": self.cache_dir,
|
|
||||||
"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES",
|
# Only include GPU-specific tuning options when the target device is GPU.
|
||||||
"GPU_ENABLE_LOOP_UNROLLING": "YES",
|
# Some OpenVINO plugins (notably the CPU plugin) will raise NotFound
|
||||||
|
# errors for GPU_* properties, so avoid passing them unless applicable.
|
||||||
|
device = (self.device or "").upper()
|
||||||
|
if device == "GPU":
|
||||||
|
cfg.update(
|
||||||
|
{
|
||||||
|
# Throughput / stream tuning
|
||||||
"GPU_THROUGHPUT_STREAMS": str(self.throughput_streams),
|
"GPU_THROUGHPUT_STREAMS": str(self.throughput_streams),
|
||||||
"GPU_MAX_NUM_THREADS": str(self.max_threads),
|
# Threading controls may be driver/plugin-specific; keep minimal
|
||||||
"GPU_ENABLE_OPENCL_THROTTLING": "NO"
|
# NOTE: We intentionally do NOT set GPU_MAX_NUM_THREADS here
|
||||||
|
# because some OpenVINO plugins / builds (and the CPU plugin
|
||||||
|
# during a fallback) do not recognize the property and will
|
||||||
|
# raise NotFound/UnsupportedProperty errors. If you need to
|
||||||
|
# tune GPU threads for a specific driver, set that externally
|
||||||
|
# or via vendor-specific tools.
|
||||||
}
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Safe CPU-side defaults
|
||||||
|
cfg.update(
|
||||||
|
{
|
||||||
|
"CPU_THROUGHPUT_NUM_THREADS": str(self.max_threads),
|
||||||
|
"CPU_BIND_THREAD": "YES",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return cfg
|
||||||
|
|
||||||
|
|
||||||
# Global configuration and constants
|
# Global configuration and constants
|
||||||
@ -139,13 +215,14 @@ def setup_intel_arc_environment() -> None:
|
|||||||
class OpenVINOWhisperModel:
|
class OpenVINOWhisperModel:
|
||||||
"""OpenVINO optimized Whisper model for Intel Arc B580."""
|
"""OpenVINO optimized Whisper model for Intel Arc B580."""
|
||||||
|
|
||||||
def __init__(self, model_id: str, config: OpenVINOConfig):
|
def __init__(self, model_id: str, config: OpenVINOConfig, device: str):
|
||||||
self.model_id = model_id
|
self.model_id = model_id
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.device = device
|
||||||
self.model_path = Path(model_id.replace('/', '_'))
|
self.model_path = Path(model_id.replace('/', '_'))
|
||||||
self.quantized_model_path = Path(f"{self.model_path}_quantized")
|
self.quantized_model_path = Path(f"{self.model_path}_quantized")
|
||||||
|
|
||||||
self.processor: Optional[AutoProcessor] = None
|
self.processor: Optional[WhisperProcessor] = None
|
||||||
self.ov_model: Optional[OVModelForSpeechSeq2Seq] = None
|
self.ov_model: Optional[OVModelForSpeechSeq2Seq] = None
|
||||||
self.is_quantized = False
|
self.is_quantized = False
|
||||||
|
|
||||||
@ -157,23 +234,29 @@ class OpenVINOWhisperModel:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Initialize processor
|
# Initialize processor
|
||||||
self.processor = AutoProcessor.from_pretrained(self.model_id)
|
logger.info(f"Loading Whisper model '{self.model_id}' on device: {self.device}")
|
||||||
|
self.processor = WhisperProcessor.from_pretrained(self.model_id, use_fast=True) # type: ignore
|
||||||
logger.info("Whisper processor loaded successfully")
|
logger.info("Whisper processor loaded successfully")
|
||||||
|
|
||||||
# Try to load quantized model first if it exists
|
# Export the model to OpenVINO IR if not already converted
|
||||||
if QUANTIZATION_AVAILABLE and self.config.enable_quantization and self.quantized_model_path.exists():
|
self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained(self.model_id, export=True, device=self.device) # type: ignore
|
||||||
if self._try_load_quantized_model():
|
|
||||||
return
|
|
||||||
|
|
||||||
# Load or create FP16 model
|
logger.info("Whisper model exported as OpenVINO IR")
|
||||||
if self.model_path.exists():
|
|
||||||
self._load_fp16_model()
|
|
||||||
else:
|
|
||||||
self._convert_model()
|
|
||||||
|
|
||||||
# Try quantization after model is loaded and compiled
|
# # Try to load quantized model first if it exists
|
||||||
if QUANTIZATION_AVAILABLE and self.config.enable_quantization and not self.is_quantized:
|
# if self.config.enable_quantization and self.quantized_model_path.exists():
|
||||||
self._try_quantize_existing_model()
|
# if self._try_load_quantized_model():
|
||||||
|
# return
|
||||||
|
|
||||||
|
# # Load or create FP16 model
|
||||||
|
# if self.model_path.exists():
|
||||||
|
# self._load_fp16_model()
|
||||||
|
# else:
|
||||||
|
# self._convert_model()
|
||||||
|
|
||||||
|
# # Try quantization after model is loaded and compiled
|
||||||
|
# if self.config.enable_quantization and not self.is_quantized:
|
||||||
|
# self._try_quantize_existing_model()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error initializing model: {e}")
|
logger.error(f"Error initializing model: {e}")
|
||||||
@ -294,6 +377,9 @@ class OpenVINOWhisperModel:
|
|||||||
|
|
||||||
def _quantize_model_safe(self) -> None:
|
def _quantize_model_safe(self) -> None:
|
||||||
"""Safely quantize the model with extensive error handling."""
|
"""Safely quantize the model with extensive error handling."""
|
||||||
|
if not nncf:
|
||||||
|
logger.info("Quantization libraries not available, skipping quantization")
|
||||||
|
return
|
||||||
if self.quantized_model_path.exists():
|
if self.quantized_model_path.exists():
|
||||||
logger.info("Quantized model already exists")
|
logger.info("Quantized model already exists")
|
||||||
return
|
return
|
||||||
@ -301,6 +387,9 @@ class OpenVINOWhisperModel:
|
|||||||
if self.ov_model is None:
|
if self.ov_model is None:
|
||||||
raise RuntimeError("No model to quantize")
|
raise RuntimeError("No model to quantize")
|
||||||
|
|
||||||
|
if not self.ov_model.decoder_with_past:
|
||||||
|
raise RuntimeError("Model decoder_with_past not available")
|
||||||
|
|
||||||
logger.info("Creating INT8 quantized model for Intel Arc B580...")
|
logger.info("Creating INT8 quantized model for Intel Arc B580...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -338,8 +427,8 @@ class OpenVINOWhisperModel:
|
|||||||
|
|
||||||
# Save quantized models
|
# Save quantized models
|
||||||
self.quantized_model_path.mkdir(parents=True, exist_ok=True)
|
self.quantized_model_path.mkdir(parents=True, exist_ok=True)
|
||||||
ov.save_model(quantized_encoder, self.quantized_model_path / "openvino_encoder_model.xml")
|
ov.save_model(quantized_encoder, self.quantized_model_path / "openvino_encoder_model.xml") # type: ignore
|
||||||
ov.save_model(quantized_decoder, self.quantized_model_path / "openvino_decoder_with_past_model.xml")
|
ov.save_model(quantized_decoder, self.quantized_model_path / "openvino_decoder_with_past_model.xml") # type: ignore
|
||||||
|
|
||||||
# Copy remaining files
|
# Copy remaining files
|
||||||
self._copy_model_files()
|
self._copy_model_files()
|
||||||
@ -366,11 +455,11 @@ class OpenVINOWhisperModel:
|
|||||||
logger.info(f"Collecting calibration data ({dataset_size} samples)...")
|
logger.info(f"Collecting calibration data ({dataset_size} samples)...")
|
||||||
|
|
||||||
# Check model components
|
# Check model components
|
||||||
if not hasattr(self.ov_model, 'encoder') or self.ov_model.encoder is None:
|
if not self.ov_model.encoder:
|
||||||
logger.warning("Encoder not available for calibration")
|
logger.warning("Encoder not available for calibration")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
if not hasattr(self.ov_model, 'decoder_with_past') or self.ov_model.decoder_with_past is None:
|
if not self.ov_model.decoder_with_past:
|
||||||
logger.warning("Decoder with past not available for calibration")
|
logger.warning("Decoder with past not available for calibration")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@ -402,14 +491,14 @@ class OpenVINOWhisperModel:
|
|||||||
duration = 2.0 + np.random.random() * 3.0 # 2-5 seconds
|
duration = 2.0 + np.random.random() * 3.0 # 2-5 seconds
|
||||||
synthetic_audio = np.random.randn(int(SAMPLE_RATE * duration)).astype(np.float32) * 0.1
|
synthetic_audio = np.random.randn(int(SAMPLE_RATE * duration)).astype(np.float32) * 0.1
|
||||||
|
|
||||||
input_features = self.processor(
|
inputs : Any = self.processor(
|
||||||
synthetic_audio,
|
synthetic_audio,
|
||||||
sampling_rate=SAMPLE_RATE,
|
sampling_rate=SAMPLE_RATE,
|
||||||
return_tensors="pt"
|
return_tensors="pt"
|
||||||
).input_features
|
)
|
||||||
|
|
||||||
# Run inference to collect calibration data
|
# Run inference to collect calibration data
|
||||||
_ = self.ov_model.generate(input_features, max_new_tokens=10)
|
generated_ids = self.ov_model.generate(inputs.input_features, max_new_tokens=10)
|
||||||
|
|
||||||
if i % 5 == 0:
|
if i % 5 == 0:
|
||||||
logger.debug(f"Generated calibration sample {i+1}/{dataset_size}")
|
logger.debug(f"Generated calibration sample {i+1}/{dataset_size}")
|
||||||
@ -470,11 +559,36 @@ class OpenVINOWhisperModel:
|
|||||||
self._warmup_model()
|
self._warmup_model()
|
||||||
logger.info("Model compiled and warmed up successfully")
|
logger.info("Model compiled and warmed up successfully")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to compile for GPU, trying CPU: {e}")
|
logger.warning(f"Failed to compile for {self.config.device}, attempting safe CPU fallback: {e}")
|
||||||
# Fallback to CPU
|
# Fallback: reload/compile model with a CPU-only ov_config to avoid
|
||||||
|
# passing GPU-specific properties to the CPU plugin which can raise
|
||||||
|
# NotFound/UnsupportedProperty exceptions.
|
||||||
|
try:
|
||||||
|
cpu_cfg = OpenVINOConfig(**{**self.config.model_dump()}) if hasattr(self.config, 'model_dump') else self.config
|
||||||
|
# Ensure device is CPU and use conservative CPU threading options
|
||||||
|
cpu_cfg = OpenVINOConfig(device='CPU', cache_dir=self.config.cache_dir, enable_quantization=self.config.enable_quantization, throughput_streams=1, max_threads=self.config.max_threads)
|
||||||
|
|
||||||
|
logger.info("Reloading model with CPU-only OpenVINO config for safe compilation")
|
||||||
|
# Try to reload using the existing saved model path if possible
|
||||||
|
try:
|
||||||
|
self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
|
||||||
|
self.model_path,
|
||||||
|
ov_config=cpu_cfg.to_ov_config(),
|
||||||
|
compile=False
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# If loading the saved model failed, try loading without ov_config
|
||||||
|
self.ov_model = OVModelForSpeechSeq2Seq.from_pretrained(self.model_path, compile=False)
|
||||||
|
|
||||||
|
# Compile on CPU
|
||||||
|
self.ov_model.to('CPU')
|
||||||
|
# Provide CPU-only ov_config if supported
|
||||||
try:
|
try:
|
||||||
self.ov_model.to("CPU")
|
|
||||||
self.ov_model.compile()
|
self.ov_model.compile()
|
||||||
|
except Exception as compile_cpu_e:
|
||||||
|
logger.warning(f"CPU compile with CPU ov_config failed, retrying default compile: {compile_cpu_e}")
|
||||||
|
self.ov_model.compile()
|
||||||
|
|
||||||
self._warmup_model()
|
self._warmup_model()
|
||||||
logger.info("Model compiled for CPU successfully")
|
logger.info("Model compiled for CPU successfully")
|
||||||
except Exception as cpu_e:
|
except Exception as cpu_e:
|
||||||
@ -503,17 +617,31 @@ class OpenVINOWhisperModel:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Model warmup failed: {e}")
|
logger.warning(f"Model warmup failed: {e}")
|
||||||
|
|
||||||
def generate(self, input_features: torch.Tensor) -> torch.Tensor:
|
def generate(self, input_features: torch.Tensor, language: str = "en") -> torch.Tensor:
|
||||||
"""Generate transcription from input features."""
|
"""Generate transcription from input features."""
|
||||||
if self.ov_model is None:
|
if self.ov_model is None:
|
||||||
raise RuntimeError("Model not initialized")
|
raise RuntimeError("Model not initialized")
|
||||||
|
|
||||||
return self.ov_model.generate(
|
generation_config : dict[str, Any]= {
|
||||||
|
"max_length": 448,
|
||||||
|
"num_beams": 4, # Use beam search for better results
|
||||||
|
# "num_beams": 1, # Greedy decoding for speed
|
||||||
|
"no_repeat_ngram_size": 3, # Prevent repetitive phrases
|
||||||
|
"language": language, # Explicitly set language to English
|
||||||
|
"task": "transcribe", # Ensure transcription, not translation
|
||||||
|
"suppress_tokens": None, # Disable default suppress_tokens to avoid conflicts
|
||||||
|
"begin_suppress_tokens": None, # Disable default begin_suppress_tokens
|
||||||
|
"max_new_tokens": 128,
|
||||||
|
"do_sample": False
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
return self.ov_model.generate( # type: ignore
|
||||||
input_features,
|
input_features,
|
||||||
max_new_tokens=128,
|
**generation_config
|
||||||
num_beams=1, # Greedy decoding for speed
|
|
||||||
do_sample=False
|
|
||||||
)
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Model generation failed: {e}")
|
||||||
|
raise RuntimeError(f"Failed to generate transcription: {e}")
|
||||||
|
|
||||||
def decode(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
|
def decode(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
|
||||||
"""Decode token IDs to text."""
|
"""Decode token IDs to text."""
|
||||||
@ -528,30 +656,29 @@ _whisper_model: Optional[OpenVINOWhisperModel] = None
|
|||||||
_audio_processors: Dict[str, "OptimizedAudioProcessor"] = {}
|
_audio_processors: Dict[str, "OptimizedAudioProcessor"] = {}
|
||||||
_send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
|
_send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
|
||||||
|
|
||||||
|
def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel:
|
||||||
def _ensure_model_loaded() -> OpenVINOWhisperModel:
|
|
||||||
"""Ensure the global model is loaded."""
|
"""Ensure the global model is loaded."""
|
||||||
global _whisper_model
|
global _whisper_model
|
||||||
if _whisper_model is None:
|
if _whisper_model is None:
|
||||||
setup_intel_arc_environment()
|
setup_intel_arc_environment()
|
||||||
logger.info(f"Loading OpenVINO Whisper model: {_model_id}")
|
logger.info(f"Loading OpenVINO Whisper model: {_model_id}")
|
||||||
_whisper_model = OpenVINOWhisperModel(_model_id, _ov_config)
|
_whisper_model = OpenVINOWhisperModel(model_id=_model_id, config=_ov_config, device=device)
|
||||||
logger.info("OpenVINO Whisper model loaded successfully")
|
logger.info("OpenVINO Whisper model loaded successfully")
|
||||||
return _whisper_model
|
return _whisper_model
|
||||||
|
|
||||||
|
|
||||||
def extract_input_features(audio_array: AudioArray, sampling_rate: int) -> torch.Tensor:
|
def extract_input_features(audio_array: AudioArray, sampling_rate: int) -> torch.Tensor:
|
||||||
"""Extract input features from audio array optimized for OpenVINO."""
|
"""Extract input features from audio array optimized for OpenVINO."""
|
||||||
model = _ensure_model_loaded()
|
ov_model = _ensure_model_loaded()
|
||||||
if model.processor is None:
|
if ov_model.processor is None:
|
||||||
raise RuntimeError("Processor not initialized")
|
raise RuntimeError("Processor not initialized")
|
||||||
|
|
||||||
processor_output = model.processor(
|
inputs = ov_model.processor(
|
||||||
audio_array,
|
audio_array,
|
||||||
sampling_rate=sampling_rate,
|
sampling_rate=sampling_rate,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
return processor_output.input_features
|
return inputs.input_features
|
||||||
|
|
||||||
|
|
||||||
class OptimizedAudioProcessor:
|
class OptimizedAudioProcessor:
|
||||||
@ -686,8 +813,8 @@ class OptimizedAudioProcessor:
|
|||||||
threading_queue = getattr(self, '_threading_queue', None)
|
threading_queue = getattr(self, '_threading_queue', None)
|
||||||
if threading_queue:
|
if threading_queue:
|
||||||
threading_queue.put_nowait(queue_item)
|
threading_queue.put_nowait(queue_item)
|
||||||
except:
|
except Exception as e:
|
||||||
logger.warning(f"Threading queue issue for {self.peer_name}")
|
logger.warning(f"Threading queue issue for {self.peer_name}: {e}")
|
||||||
|
|
||||||
def _queue_final_transcription(self) -> None:
|
def _queue_final_transcription(self) -> None:
|
||||||
"""Queue final transcription of current phrase."""
|
"""Queue final transcription of current phrase."""
|
||||||
@ -759,8 +886,18 @@ class OptimizedAudioProcessor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in thread processing loop for {self.peer_name}: {e}")
|
logger.error(f"Error in thread processing loop for {self.peer_name}: {e}")
|
||||||
|
|
||||||
async def _transcribe_and_send(self, audio_array: AudioArray, is_final: bool) -> None:
|
async def _transcribe_and_send(self, audio_array: AudioArray, is_final: bool, language: str="en") -> None:
|
||||||
"""Transcribe audio using OpenVINO optimized model."""
|
"""
|
||||||
|
Transcribe raw numpy audio data using OpenVINO Whisper.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- audio_array: normalized 1D numpy array containing mono PCM data at 16 kHz.
|
||||||
|
- is_final: whether this is a final transcription (True) or interim (False)
|
||||||
|
- language: language code for transcription (default 'en' for English)
|
||||||
|
"""
|
||||||
|
if audio_array.ndim != 1:
|
||||||
|
raise ValueError("Expected mono audio as a 1D numpy array.")
|
||||||
|
|
||||||
transcription_start = time.time()
|
transcription_start = time.time()
|
||||||
transcription_type = "final" if is_final else "streaming"
|
transcription_type = "final" if is_final else "streaming"
|
||||||
|
|
||||||
@ -782,15 +919,15 @@ class OptimizedAudioProcessor:
|
|||||||
|
|
||||||
# Extract features for OpenVINO
|
# Extract features for OpenVINO
|
||||||
input_features = extract_input_features(audio_array, self.sample_rate)
|
input_features = extract_input_features(audio_array, self.sample_rate)
|
||||||
|
# logger.info(f"Features extracted for OpenVINO: {input_features.shape}")
|
||||||
# GPU inference with OpenVINO
|
# GPU inference with OpenVINO
|
||||||
model = _ensure_model_loaded()
|
ov_model = _ensure_model_loaded()
|
||||||
predicted_ids = model.generate(input_features)
|
generated_ids = ov_model.generate(input_features)
|
||||||
|
|
||||||
# Decode results
|
|
||||||
transcription = model.decode(predicted_ids, skip_special_tokens=True)
|
|
||||||
text = transcription[0].strip() if transcription else ""
|
|
||||||
|
|
||||||
|
# Decode tokens into text
|
||||||
|
transcription = ov_model.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||||
|
text = transcription.strip() if transcription else ""
|
||||||
|
logger.info(f"Transcription text: {text}")
|
||||||
transcription_time = time.time() - transcription_start
|
transcription_time = time.time() - transcription_start
|
||||||
|
|
||||||
if text and len(text.split()) >= 2:
|
if text and len(text.split()) >= 2:
|
||||||
@ -847,6 +984,125 @@ class OptimizedAudioProcessor:
|
|||||||
|
|
||||||
logger.info(f"OptimizedAudioProcessor shutdown complete for {self.peer_name}")
|
logger.info(f"OptimizedAudioProcessor shutdown complete for {self.peer_name}")
|
||||||
|
|
||||||
|
def normalize_audio(audio_data: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
|
||||||
|
"""Normalize audio to have maximum amplitude of 1.0."""
|
||||||
|
max_amplitude = np.max(np.abs(audio_data))
|
||||||
|
if max_amplitude > 0:
|
||||||
|
audio_data = audio_data / max_amplitude
|
||||||
|
return audio_data
|
||||||
|
|
||||||
|
|
||||||
|
class MediaClock:
|
||||||
|
"""Simple monotonic clock for media tracks."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.t0 = perf_counter()
|
||||||
|
|
||||||
|
def now(self) -> float:
|
||||||
|
return perf_counter() - self.t0
|
||||||
|
|
||||||
|
|
||||||
|
class WaveformVideoTrack(MediaStreamTrack):
|
||||||
|
"""Video track that renders a live waveform of the incoming audio.
|
||||||
|
|
||||||
|
The track reads the most-active `OptimizedAudioProcessor` in
|
||||||
|
`_audio_processors` and renders the last ~2s of its `current_phrase_audio`.
|
||||||
|
If no audio is available, the track will display a "No audio" message.
|
||||||
|
"""
|
||||||
|
|
||||||
|
kind = "video"
|
||||||
|
|
||||||
|
def __init__(self, session_name: str, width: int = 640, height: int = 240, fps: int = 15) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.session_name = session_name
|
||||||
|
self.width = int(width)
|
||||||
|
self.height = int(height)
|
||||||
|
self.fps = int(fps)
|
||||||
|
self.clock = MediaClock()
|
||||||
|
self._next_frame_index = 0
|
||||||
|
|
||||||
|
async def next_timestamp(self) -> tuple[int, float]:
|
||||||
|
pts = int(self._next_frame_index * (1 / self.fps) * 90000)
|
||||||
|
time_base = 1 / 90000
|
||||||
|
return pts, time_base
|
||||||
|
|
||||||
|
async def recv(self) -> VideoFrame:
|
||||||
|
pts, time_base = await self.next_timestamp()
|
||||||
|
|
||||||
|
# schedule frame according to clock
|
||||||
|
target_t = self._next_frame_index / self.fps
|
||||||
|
now = self.clock.now()
|
||||||
|
if target_t > now:
|
||||||
|
await asyncio.sleep(target_t - now)
|
||||||
|
|
||||||
|
self._next_frame_index += 1
|
||||||
|
|
||||||
|
frame_array: npt.NDArray[np.uint8] = np.zeros((self.height, self.width, 3), dtype=np.uint8)
|
||||||
|
|
||||||
|
# Select the most active processor (highest RMS) and draw its waveform
|
||||||
|
best_proc = None
|
||||||
|
best_rms = 0.0
|
||||||
|
try:
|
||||||
|
for pname, proc in _audio_processors.items():
|
||||||
|
try:
|
||||||
|
arr = getattr(proc, 'current_phrase_audio', None)
|
||||||
|
if arr is None or len(arr) == 0:
|
||||||
|
continue
|
||||||
|
rms = float(np.sqrt(np.mean(arr**2)))
|
||||||
|
if rms > best_rms:
|
||||||
|
best_rms = rms
|
||||||
|
best_proc = (pname, arr.copy())
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
best_proc = None
|
||||||
|
|
||||||
|
if best_proc is not None:
|
||||||
|
pname, arr = best_proc
|
||||||
|
|
||||||
|
# Use up to 2 seconds of audio for the waveform
|
||||||
|
window_samples = min(len(arr), SAMPLE_RATE * 2)
|
||||||
|
if window_samples <= 0:
|
||||||
|
arr_segment = np.zeros(1, dtype=np.float32)
|
||||||
|
else:
|
||||||
|
arr_segment = arr[-window_samples:]
|
||||||
|
|
||||||
|
# Normalize segment to -1..1 safely
|
||||||
|
maxv = float(np.max(np.abs(arr_segment))) if arr_segment.size > 0 else 0.0
|
||||||
|
if maxv > 0:
|
||||||
|
norm = arr_segment / maxv
|
||||||
|
else:
|
||||||
|
norm = np.zeros_like(arr_segment)
|
||||||
|
|
||||||
|
# Map audio samples to pixels across the width
|
||||||
|
if norm.size < self.width:
|
||||||
|
padded = np.zeros(self.width, dtype=np.float32)
|
||||||
|
if norm.size > 0:
|
||||||
|
padded[-norm.size:] = norm
|
||||||
|
norm = padded
|
||||||
|
else:
|
||||||
|
block = int(np.ceil(norm.size / self.width))
|
||||||
|
norm = np.array([np.mean(norm[i * block : min((i + 1) * block, norm.size)]) for i in range(self.width)], dtype=np.float32)
|
||||||
|
|
||||||
|
# Create polyline points, avoid NaN
|
||||||
|
points: list[tuple[int, int]] = []
|
||||||
|
for x in range(self.width):
|
||||||
|
v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0
|
||||||
|
y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 1))
|
||||||
|
points.append((x, max(0, min(self.height - 1, y))))
|
||||||
|
|
||||||
|
if len(points) > 1:
|
||||||
|
pts_np = np.array(points, dtype=np.int32)
|
||||||
|
cv2.polylines(frame_array, [pts_np], isClosed=False, color=(0, 200, 80), thickness=2)
|
||||||
|
|
||||||
|
cv2.putText(frame_array, f"Waveform: {pname}", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
|
||||||
|
else:
|
||||||
|
cv2.putText(frame_array, "No audio", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (200, 200, 200), 1)
|
||||||
|
|
||||||
|
frame = VideoFrame.from_ndarray(frame_array, format="bgr24")
|
||||||
|
frame.pts = pts
|
||||||
|
frame.time_base = fractions.Fraction(1 / 90000).limit_denominator(1000000)
|
||||||
|
return frame
|
||||||
|
|
||||||
async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
|
async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
|
||||||
"""Handle incoming audio tracks from WebRTC peers."""
|
"""Handle incoming audio tracks from WebRTC peers."""
|
||||||
@ -901,7 +1157,8 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
|
|||||||
audio_data = _resample_audio(audio_data, frame.sample_rate, SAMPLE_RATE)
|
audio_data = _resample_audio(audio_data, frame.sample_rate, SAMPLE_RATE)
|
||||||
|
|
||||||
# Convert to float32
|
# Convert to float32
|
||||||
audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
|
audio_data_float32 = audio_data.astype(np.float32)
|
||||||
|
audio_data = normalize_audio(audio_data)
|
||||||
|
|
||||||
# Process with optimized processor
|
# Process with optimized processor
|
||||||
audio_processor.add_audio_data(audio_data_float32)
|
audio_processor.add_audio_data(audio_data_float32)
|
||||||
@ -937,7 +1194,11 @@ def _process_audio_frame(audio_data: npt.NDArray[Any], frame: AudioFrame) -> npt
|
|||||||
def _resample_audio(audio_data: npt.NDArray[np.float32], orig_sr: int, target_sr: int) -> npt.NDArray[np.float32]:
|
def _resample_audio(audio_data: npt.NDArray[np.float32], orig_sr: int, target_sr: int) -> npt.NDArray[np.float32]:
|
||||||
"""Resample audio efficiently."""
|
"""Resample audio efficiently."""
|
||||||
try:
|
try:
|
||||||
# Use high-quality resampling for better results
|
# Handle stereo audio by converting to mono if necessary
|
||||||
|
if audio_data.ndim > 1:
|
||||||
|
audio_data = np.mean(audio_data, axis=1)
|
||||||
|
|
||||||
|
# Use high-quality resampling
|
||||||
resampled = librosa.resample(
|
resampled = librosa.resample(
|
||||||
audio_data.astype(np.float64),
|
audio_data.astype(np.float64),
|
||||||
orig_sr=orig_sr,
|
orig_sr=orig_sr,
|
||||||
@ -947,16 +1208,42 @@ def _resample_audio(audio_data: npt.NDArray[np.float32], orig_sr: int, target_sr
|
|||||||
return resampled.astype(np.float32)
|
return resampled.astype(np.float32)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Resampling failed: {e}")
|
logger.error(f"Resampling failed: {e}")
|
||||||
return audio_data
|
raise ValueError(f"Failed to resample audio from {orig_sr} Hz to {target_sr} Hz: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Public API functions
|
# Public API functions
|
||||||
def agent_info() -> Dict[str, str]:
|
def agent_info() -> Dict[str, str]:
|
||||||
return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "false"}
|
return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION, "has_media": "true"}
|
||||||
|
|
||||||
|
|
||||||
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
|
def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
|
||||||
"""Whisper is not a media source - return no local tracks."""
|
"""Create agent tracks. Provides a synthetic video waveform track and a silent audio track for compatibility."""
|
||||||
|
class SilentAudioTrack(MediaStreamTrack):
|
||||||
|
kind = "audio"
|
||||||
|
def __init__(self, sample_rate: int = SAMPLE_RATE, channels: int = 1, fps: int = 50):
|
||||||
|
super().__init__()
|
||||||
|
self.sample_rate = sample_rate
|
||||||
|
self.channels = channels
|
||||||
|
self.fps = fps
|
||||||
|
self.samples_per_frame = int(self.sample_rate / self.fps)
|
||||||
|
self._timestamp = 0
|
||||||
|
async def recv(self) -> AudioFrame:
|
||||||
|
# Generate silent audio as int16 (required by aiortc)
|
||||||
|
data = np.zeros((self.channels, self.samples_per_frame), dtype=np.int16)
|
||||||
|
frame = AudioFrame.from_ndarray(data, layout="mono" if self.channels == 1 else "stereo")
|
||||||
|
frame.sample_rate = self.sample_rate
|
||||||
|
frame.pts = self._timestamp
|
||||||
|
frame.time_base = fractions.Fraction(1, self.sample_rate)
|
||||||
|
self._timestamp += self.samples_per_frame
|
||||||
|
await asyncio.sleep(1 / self.fps)
|
||||||
|
return frame
|
||||||
|
try:
|
||||||
|
video_track = WaveformVideoTrack(session_name=session_name, width=640, height=240, fps=15)
|
||||||
|
audio_track = SilentAudioTrack()
|
||||||
|
return {"video": video_track, "audio": audio_track}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to create agent tracks: {e}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
@ -1010,12 +1297,12 @@ def get_active_processors() -> Dict[str, OptimizedAudioProcessor]:
|
|||||||
|
|
||||||
def get_model_info() -> Dict[str, Any]:
|
def get_model_info() -> Dict[str, Any]:
|
||||||
"""Get information about the loaded model."""
|
"""Get information about the loaded model."""
|
||||||
model = _ensure_model_loaded()
|
ov_model = _ensure_model_loaded()
|
||||||
return {
|
return {
|
||||||
"model_id": _model_id,
|
"model_id": _model_id,
|
||||||
"device": _ov_config.device,
|
"device": _ov_config.device,
|
||||||
"quantization_enabled": _ov_config.enable_quantization,
|
"quantization_enabled": _ov_config.enable_quantization,
|
||||||
"is_quantized": model.is_quantized,
|
"is_quantized": ov_model.is_quantized,
|
||||||
"sample_rate": SAMPLE_RATE,
|
"sample_rate": SAMPLE_RATE,
|
||||||
"chunk_duration_ms": CHUNK_DURATION_MS
|
"chunk_duration_ms": CHUNK_DURATION_MS
|
||||||
}
|
}
|
@ -29,10 +29,10 @@ dill==0.3.8
|
|||||||
dnspython==2.7.0
|
dnspython==2.7.0
|
||||||
fastapi==0.116.1
|
fastapi==0.116.1
|
||||||
ffmpy==0.6.1
|
ffmpy==0.6.1
|
||||||
filelock==3.13.1
|
filelock==3.19.1
|
||||||
fonttools==4.59.2
|
fonttools==4.59.2
|
||||||
frozenlist==1.7.0
|
frozenlist==1.7.0
|
||||||
fsspec==2024.6.1
|
fsspec==2025.3.0
|
||||||
google-crc32c==1.7.1
|
google-crc32c==1.7.1
|
||||||
gradio==5.44.1
|
gradio==5.44.1
|
||||||
gradio-client==1.12.1
|
gradio-client==1.12.1
|
||||||
@ -45,17 +45,18 @@ httpx==0.28.1
|
|||||||
huggingface-hub==0.34.4
|
huggingface-hub==0.34.4
|
||||||
idna==3.10
|
idna==3.10
|
||||||
ifaddr==0.2.0
|
ifaddr==0.2.0
|
||||||
jinja2==3.1.4
|
iniconfig==2.1.0
|
||||||
|
jinja2==3.1.6
|
||||||
jiwer==4.0.0
|
jiwer==4.0.0
|
||||||
joblib==1.5.2
|
joblib==1.5.2
|
||||||
jsonschema==4.25.1
|
jsonschema==4.25.1
|
||||||
jsonschema-specifications==2025.4.1
|
jsonschema-specifications==2025.9.1
|
||||||
kiwisolver==1.4.9
|
kiwisolver==1.4.9
|
||||||
lazy-loader==0.4
|
lazy-loader==0.4
|
||||||
librosa==0.11.0
|
librosa==0.11.0
|
||||||
llvmlite==0.44.0
|
llvmlite==0.44.0
|
||||||
markdown-it-py==4.0.0
|
markdown-it-py==4.0.0
|
||||||
markupsafe==2.1.5
|
markupsafe==3.0.2
|
||||||
matplotlib==3.10.6
|
matplotlib==3.10.6
|
||||||
mdurl==0.1.2
|
mdurl==0.1.2
|
||||||
ml-dtypes==0.5.3
|
ml-dtypes==0.5.3
|
||||||
@ -65,23 +66,40 @@ msgpack==1.1.1
|
|||||||
multidict==6.6.4
|
multidict==6.6.4
|
||||||
multiprocess==0.70.16
|
multiprocess==0.70.16
|
||||||
natsort==8.4.0
|
natsort==8.4.0
|
||||||
networkx==3.3
|
networkx==3.5
|
||||||
ninja==1.11.1.4
|
ninja==1.13.0
|
||||||
nncf==2.17.0
|
nncf==2.18.0
|
||||||
numba==0.61.2
|
numba==0.61.2
|
||||||
numpy==2.2.6
|
numpy==2.2.6
|
||||||
|
nvidia-cublas-cu12==12.8.4.1
|
||||||
|
nvidia-cuda-cupti-cu12==12.8.90
|
||||||
|
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||||
|
nvidia-cuda-runtime-cu12==12.8.90
|
||||||
|
nvidia-cudnn-cu12==9.10.2.21
|
||||||
|
nvidia-cufft-cu12==11.3.3.83
|
||||||
|
nvidia-cufile-cu12==1.13.1.3
|
||||||
|
nvidia-curand-cu12==10.3.9.90
|
||||||
|
nvidia-cusolver-cu12==11.7.3.90
|
||||||
|
nvidia-cusparse-cu12==12.5.8.93
|
||||||
|
nvidia-cusparselt-cu12==0.7.1
|
||||||
|
nvidia-nccl-cu12==2.27.3
|
||||||
|
nvidia-nvjitlink-cu12==12.8.93
|
||||||
|
nvidia-nvtx-cu12==12.8.90
|
||||||
onnx==1.19.0
|
onnx==1.19.0
|
||||||
openai-whisper==20250625
|
openai-whisper @ git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
|
||||||
opencv-python==4.11.0.86
|
opencv-python==4.11.0.86
|
||||||
openvino==2025.3.0
|
openvino==2025.3.0
|
||||||
|
openvino-genai==2025.3.0.0
|
||||||
openvino-telemetry==2025.2.0
|
openvino-telemetry==2025.2.0
|
||||||
|
openvino-tokenizers==2025.3.0.0
|
||||||
optimum==1.27.0
|
optimum==1.27.0
|
||||||
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@c35534d077dddf9382c6d8699f13412d28b19853
|
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@b9c151fec6b414d9ca78be8643d08e267b133bfc
|
||||||
orjson==3.11.3
|
orjson==3.11.3
|
||||||
packaging==25.0
|
packaging==25.0
|
||||||
pandas==2.2.3
|
pandas==2.3.2
|
||||||
pillow==11.3.0
|
pillow==11.3.0
|
||||||
platformdirs==4.4.0
|
platformdirs==4.4.0
|
||||||
|
pluggy==1.6.0
|
||||||
pooch==1.8.2
|
pooch==1.8.2
|
||||||
propcache==0.3.2
|
propcache==0.3.2
|
||||||
protobuf==6.32.0
|
protobuf==6.32.0
|
||||||
@ -96,16 +114,22 @@ pyee==13.0.0
|
|||||||
pygments==2.19.2
|
pygments==2.19.2
|
||||||
pylibsrtp==0.12.0
|
pylibsrtp==0.12.0
|
||||||
pymoo==0.6.1.5
|
pymoo==0.6.1.5
|
||||||
|
pyopencl==2025.2.6
|
||||||
pyopenssl==25.1.0
|
pyopenssl==25.1.0
|
||||||
pyparsing==3.2.3
|
pyparsing==3.2.3
|
||||||
|
pytest==8.4.2
|
||||||
|
pytest-asyncio==1.1.0
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
|
python-ffmpeg==1.0.16
|
||||||
python-multipart==0.0.20
|
python-multipart==0.0.20
|
||||||
|
pytools==2025.2.4
|
||||||
pytz==2025.2
|
pytz==2025.2
|
||||||
pyyaml==6.0.2
|
pyyaml==6.0.2
|
||||||
rapidfuzz==3.14.0
|
rapidfuzz==3.14.0
|
||||||
referencing==0.36.2
|
referencing==0.36.2
|
||||||
regex==2025.9.1
|
regex==2025.9.1
|
||||||
requests==2.32.5
|
requests==2.32.5
|
||||||
|
resampy==0.4.3
|
||||||
rich==14.1.0
|
rich==14.1.0
|
||||||
rpds-py==0.27.1
|
rpds-py==0.27.1
|
||||||
ruff==0.12.11
|
ruff==0.12.11
|
||||||
@ -114,22 +138,24 @@ safetensors==0.6.2
|
|||||||
scikit-learn==1.7.1
|
scikit-learn==1.7.1
|
||||||
scipy==1.16.1
|
scipy==1.16.1
|
||||||
semantic-version==2.10.0
|
semantic-version==2.10.0
|
||||||
setuptools==70.2.0
|
setuptools==80.9.0
|
||||||
shellingham==1.5.4
|
shellingham==1.5.4
|
||||||
|
siphash24==1.8
|
||||||
six==1.17.0
|
six==1.17.0
|
||||||
sniffio==1.3.1
|
sniffio==1.3.1
|
||||||
soundfile==0.13.1
|
soundfile==0.13.1
|
||||||
soxr==0.5.0.post1
|
soxr==0.5.0.post1
|
||||||
speechrecognition==3.14.3
|
speechrecognition==3.14.3
|
||||||
starlette==0.47.3
|
starlette==0.47.3
|
||||||
sympy==1.13.3
|
sympy==1.14.0
|
||||||
tabulate==0.9.0
|
tabulate==0.9.0
|
||||||
threadpoolctl==3.6.0
|
threadpoolctl==3.6.0
|
||||||
tiktoken==0.11.0
|
tiktoken==0.11.0
|
||||||
tokenizers==0.21.4
|
tokenizers==0.21.4
|
||||||
tomlkit==0.13.3
|
tomlkit==0.13.3
|
||||||
torch==2.8.0+cpu
|
torch==2.8.0
|
||||||
tqdm==4.66.5
|
torchvision==0.23.0
|
||||||
|
tqdm==4.67.1
|
||||||
transformers==4.53.3
|
transformers==4.53.3
|
||||||
triton==3.4.0
|
triton==3.4.0
|
||||||
typer==0.17.3
|
typer==0.17.3
|
||||||
|
@ -8,12 +8,8 @@ import logging
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Add the project root (parent of the voicebot directory) to sys.path so
|
# Add the voicebot directory to the path
|
||||||
# imports like `from shared import ...` work when running this script from
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||||
# inside the `voicebot` container.
|
|
||||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
if project_root not in sys.path:
|
|
||||||
sys.path.append(project_root)
|
|
||||||
|
|
||||||
from shared.logger import logger
|
from shared.logger import logger
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user