# ai-voicebot/server/api/monitoring.py
"""
Performance and Health Monitoring API Endpoints
Provides REST API endpoints for monitoring system performance, health status,
cache statistics, and operational metrics.
Endpoints:
- /api/health - Health check summary
- /api/health/ready - Readiness probe (Kubernetes)
- /api/health/live - Liveness probe (Kubernetes)
- /api/metrics - Performance metrics
- /api/metrics/history - Historical metrics
- /api/cache/stats - Cache statistics
- /api/system/info - System information
"""
from typing import Dict, Any, Optional
from fastapi import APIRouter, HTTPException, Query
from datetime import datetime, timedelta

from logger import logger

# Import monitoring components. If any are missing, degrade gracefully:
# endpoints report 503 at request time instead of the module failing to import.
try:
    from core.performance import metrics_collector
    from core.health import health_monitor
    from core.cache import cache_manager
except ImportError as e:
    logger.warning(f"Some monitoring components not available: {e}")
    metrics_collector = None
    health_monitor = None
    cache_manager = None

router = APIRouter(prefix="/api", tags=["monitoring"])
@router.get("/health")
async def get_health_summary():
    """
    Get comprehensive health summary.

    Returns:
        Dict containing overall health status and component details

    Raises:
        HTTPException: 503 when health monitoring is unavailable,
            500 when the health check itself fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500,
    # which broke the intended "service unavailable" semantics.
    if not health_monitor:
        raise HTTPException(status_code=503, detail="Health monitoring not available")
    try:
        health_summary = await health_monitor.get_health_summary()
        return {
            "status": "success",
            "data": health_summary,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting health summary: {e}")
        raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
@router.get("/health/ready")
async def readiness_probe():
    """
    Kubernetes readiness probe endpoint.

    Returns:
        Ready status for load balancer inclusion
    """
    try:
        # No monitor available -> not ready, but still a 200 body so the
        # caller gets a structured answer.
        if not health_monitor:
            return {"ready": False, "reason": "Health monitoring not available"}
        status = health_monitor.get_readiness_status()
        payload = {"timestamp": datetime.now().isoformat(), **status}
        if status["ready"]:
            return {"status": "ready", **payload}
        # Not ready: signal 503 so load balancers drop this instance.
        raise HTTPException(
            status_code=503,
            detail={"status": "not_ready", **payload},
        )
    except HTTPException:
        # Preserve deliberate HTTP status codes (e.g. the 503 above).
        raise
    except Exception as exc:
        logger.error(f"Error in readiness probe: {exc}")
        raise HTTPException(status_code=500, detail=f"Readiness check failed: {str(exc)}")
@router.get("/health/live")
async def liveness_probe():
    """
    Kubernetes liveness probe endpoint.

    Returns:
        Alive status for container restart decisions
    """
    try:
        # Without a monitor we optimistically report alive so the container
        # is not restarted merely because monitoring is unavailable.
        if not health_monitor:
            return {"alive": True, "reason": "Basic liveness check"}
        status = health_monitor.get_liveness_status()
        payload = {"timestamp": datetime.now().isoformat(), **status}
        if status["alive"]:
            return {"status": "alive", **payload}
        # Dead: signal 503 so the orchestrator restarts the container.
        raise HTTPException(
            status_code=503,
            detail={"status": "not_alive", **payload},
        )
    except HTTPException:
        # Preserve deliberate HTTP status codes (e.g. the 503 above).
        raise
    except Exception as exc:
        logger.error(f"Error in liveness probe: {exc}")
        raise HTTPException(status_code=500, detail=f"Liveness check failed: {str(exc)}")
@router.get("/metrics")
async def get_current_metrics():
    """
    Get current performance metrics.

    Returns:
        Current system and application metrics

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when gathering the metrics fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        current_metrics = metrics_collector.get_current_metrics()
        performance_summary = metrics_collector.get_performance_summary()
        return {
            "status": "success",
            "data": {
                "current": current_metrics,
                "summary": performance_summary
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics collection failed: {str(e)}")
@router.get("/metrics/history")
async def get_metrics_history(
    metric_name: str = Query(..., description="Name of the metric to retrieve"),
    minutes: int = Query(default=5, ge=1, le=60, description="Minutes of history to retrieve")
):
    """
    Get historical data for a specific metric.

    Args:
        metric_name: Name of the metric
        minutes: Number of minutes of history to retrieve (1-60)

    Returns:
        Historical metric data points

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when history retrieval fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        history = metrics_collector.get_metric_history(metric_name, minutes)
        return {
            "status": "success",
            "data": {
                "metric_name": metric_name,
                "minutes": minutes,
                "data_points": len(history),
                "history": history
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting metric history: {e}")
        raise HTTPException(status_code=500, detail=f"Metric history failed: {str(e)}")
@router.get("/cache/stats")
async def get_cache_statistics():
    """
    Get cache performance statistics.

    Returns:
        Cache hit rates, sizes, and performance metrics

    Raises:
        HTTPException: 503 when cache management is unavailable,
            500 when statistics collection fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not cache_manager:
        raise HTTPException(status_code=503, detail="Cache management not available")
    try:
        cache_stats = cache_manager.get_all_stats()
        # Aggregate hit/miss totals across all caches.
        total_hits = sum(stats['hits'] for stats in cache_stats.values())
        total_misses = sum(stats['misses'] for stats in cache_stats.values())
        total_requests = total_hits + total_misses
        # Avoid division by zero when no cache has been touched yet.
        overall_hit_rate = (total_hits / total_requests * 100) if total_requests > 0 else 0
        return {
            "status": "success",
            "data": {
                "overall": {
                    "total_hits": total_hits,
                    "total_misses": total_misses,
                    "overall_hit_rate_percent": overall_hit_rate,
                    "total_requests": total_requests
                },
                "by_cache": cache_stats
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting cache stats: {e}")
        raise HTTPException(status_code=500, detail=f"Cache statistics failed: {str(e)}")
@router.get("/system/info")
async def get_system_info():
    """
    Get system information and configuration.

    Returns:
        System details, configuration, and runtime information
    """
    try:
        import os
        import platform
        import sys

        # Pull the gauge metrics once; fall back to an empty mapping when
        # the collector is unavailable so the .get() lookups below return 0.
        gauges = {}
        if metrics_collector:
            gauges = metrics_collector.get_current_metrics().get('gauges', {})
        info = {
            "python": {
                "version": sys.version,
                "platform": platform.platform(),
                "architecture": platform.architecture()[0]
            },
            "runtime": {
                "uptime_seconds": gauges.get('uptime_seconds', 0),
                "process_id": os.getpid(),
                "working_directory": os.getcwd()
            },
            "performance": {
                "cpu_usage_percent": gauges.get('cpu_usage_percent', 0),
                "memory_usage_mb": gauges.get('memory_usage_mb', 0),
                "memory_usage_percent": gauges.get('memory_usage_percent', 0)
            }
        }
        # Attach health-check metadata only when a completed check exists.
        if health_monitor and health_monitor.last_full_check:
            info["health"] = {
                "last_check": health_monitor.last_full_check.isoformat(),
                "check_interval": health_monitor.check_interval_seconds
            }
        return {
            "status": "success",
            "data": info,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting system info: {e}")
        raise HTTPException(status_code=500, detail=f"System info failed: {str(e)}")
@router.post("/cache/clear")
async def clear_cache(cache_type: Optional[str] = Query(None, description="Specific cache to clear")):
    """
    Clear cache entries.

    Args:
        cache_type: Optional specific cache to clear (session, lobby, user, message, computed)

    Returns:
        Cache clear results
    """
    try:
        if not cache_manager:
            raise HTTPException(status_code=503, detail="Cache management not available")
        if cache_type:
            # Clear one named cache; reject names the manager doesn't expose.
            attr = f"{cache_type}_cache"
            if not hasattr(cache_manager, attr):
                raise HTTPException(status_code=400, detail=f"Unknown cache type: {cache_type}")
            getattr(cache_manager, attr).backend.clear()
            message = f"Cleared {cache_type} cache"
        else:
            # No type given: clear every known cache that actually exists.
            for name in ('session', 'lobby', 'user', 'message', 'computed'):
                attr = f"{name}_cache"
                if hasattr(cache_manager, attr):
                    getattr(cache_manager, attr).backend.clear()
            message = "Cleared all caches"
        return {
            "status": "success",
            "message": message,
            "timestamp": datetime.now().isoformat()
        }
    except HTTPException:
        # Preserve the deliberate 503/400 responses raised above.
        raise
    except Exception as e:
        logger.error(f"Error clearing cache: {e}")
        raise HTTPException(status_code=500, detail=f"Cache clear failed: {str(e)}")
@router.get("/metrics/export")
async def export_metrics_prometheus():
    """
    Export metrics in the Prometheus text exposition format.

    Returns:
        A text/plain response body. Previously this returned a bare ``str``,
        which FastAPI JSON-encodes (quoted, with escaped newlines) — a format
        Prometheus scrapers cannot parse.

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when the export fails.
    """
    # fastapi is already a hard dependency of this module; local import keeps
    # this block self-contained.
    from fastapi.responses import PlainTextResponse
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        current_metrics = metrics_collector.get_current_metrics()
        prometheus_lines = []
        # Gauges map 1:1 onto the Prometheus gauge type.
        for metric_name, value in current_metrics.get('gauges', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} gauge")
            prometheus_lines.append(f"{metric_name} {value}")
        # Counters map 1:1 onto the Prometheus counter type.
        for metric_name, value in current_metrics.get('counters', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} counter")
            prometheus_lines.append(f"{metric_name} {value}")
        # Histograms are exported in simplified form: count and sum only
        # (sum reconstructed as avg * count; no bucket data available).
        for metric_name, stats in current_metrics.get('histograms', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} histogram")
            prometheus_lines.append(f"{metric_name}_count {stats['count']}")
            prometheus_lines.append(f"{metric_name}_sum {stats['avg'] * stats['count']}")
        # The exposition format requires a trailing newline after the last line.
        prometheus_text = '\n'.join(prometheus_lines) + '\n'
        return PlainTextResponse(
            content=prometheus_text,
            media_type="text/plain; version=0.0.4",
        )
    except Exception as e:
        logger.error(f"Error exporting Prometheus metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics export failed: {str(e)}")
# Note: Middleware would be added to the main FastAPI app, not the router
# This is just an example of how metrics could be collected automatically
def create_metrics_middleware():
    """Create middleware for automatic metrics collection."""
    async def metrics_middleware(request, call_next):
        """Middleware to automatically collect API metrics."""
        import time
        started = time.time()
        try:
            response = await call_next(request)
        except Exception as exc:
            # Downstream failure: count it (tagged by exception class) and
            # let the exception propagate to the app's error handlers.
            if metrics_collector:
                metrics_collector.record_counter(
                    'api_errors_total',
                    labels={
                        'endpoint': request.url.path,
                        'method': request.method,
                        'error': type(exc).__name__,
                    }
                )
            raise
        # Success path: record the request count and its latency.
        if metrics_collector:
            elapsed = time.time() - started
            labels = {'endpoint': request.url.path, 'method': request.method}
            metrics_collector.record_counter(
                'api_requests_total',
                labels={**labels, 'status': str(response.status_code)}
            )
            metrics_collector.record_histogram(
                'api_request_duration_seconds',
                elapsed,
                labels=labels
            )
        return response
    return metrics_middleware