""" Performance and Health Monitoring API Endpoints Provides REST API endpoints for monitoring system performance, health status, cache statistics, and operational metrics. Endpoints: - /api/health - Health check summary - /api/health/ready - Readiness probe (Kubernetes) - /api/health/live - Liveness probe (Kubernetes) - /api/metrics - Performance metrics - /api/metrics/history - Historical metrics - /api/cache/stats - Cache statistics - /api/system/info - System information """ from typing import Optional from fastapi import APIRouter, HTTPException, Query from datetime import datetime from shared.logger import logger # Import monitoring components try: from core.performance import metrics_collector from core.health import health_monitor from core.cache import cache_manager except ImportError as e: logger.warning(f"Some monitoring components not available: {e}") metrics_collector = None health_monitor = None cache_manager = None router = APIRouter(prefix="/api", tags=["monitoring"]) @router.get("/health") async def get_health_summary(): """ Get comprehensive health summary. Returns: Dict containing overall health status and component details """ try: if not health_monitor: raise HTTPException(status_code=503, detail="Health monitoring not available") health_summary = await health_monitor.get_health_summary() return { "status": "success", "data": health_summary, "timestamp": datetime.now().isoformat() } except Exception as e: logger.error(f"Error getting health summary: {e}") raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}") @router.get("/health/ready") async def readiness_probe(): """ Kubernetes readiness probe endpoint. Returns: Ready status for load balancer inclusion """ try: if not health_monitor: return {"ready": False, "reason": "Health monitoring not available"} readiness = health_monitor.get_readiness_status() if readiness["ready"]: return { "status": "ready", "timestamp": datetime.now().isoformat(), **readiness } else: raise HTTPException( status_code=503, detail={ "status": "not_ready", "timestamp": datetime.now().isoformat(), **readiness } ) except HTTPException: raise except Exception as e: logger.error(f"Error in readiness probe: {e}") raise HTTPException(status_code=500, detail=f"Readiness check failed: {str(e)}") @router.get("/health/live") async def liveness_probe(): """ Kubernetes liveness probe endpoint. Returns: Alive status for container restart decisions """ try: if not health_monitor: return {"alive": True, "reason": "Basic liveness check"} liveness = health_monitor.get_liveness_status() if liveness["alive"]: return { "status": "alive", "timestamp": datetime.now().isoformat(), **liveness } else: raise HTTPException( status_code=503, detail={ "status": "not_alive", "timestamp": datetime.now().isoformat(), **liveness } ) except HTTPException: raise except Exception as e: logger.error(f"Error in liveness probe: {e}") raise HTTPException(status_code=500, detail=f"Liveness check failed: {str(e)}") @router.get("/metrics") async def get_current_metrics(): """ Get current performance metrics. 
@router.get("/metrics")
async def get_current_metrics():
    """
    Get current performance metrics.

    Returns:
        Current system and application metrics
    """
    try:
        if not metrics_collector:
            raise HTTPException(status_code=503, detail="Metrics collection not available")

        current_metrics = metrics_collector.get_current_metrics()
        performance_summary = metrics_collector.get_performance_summary()

        return {
            "status": "success",
            "data": {
                "current": current_metrics,
                "summary": performance_summary
            },
            "timestamp": datetime.now().isoformat()
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics collection failed: {str(e)}")


@router.get("/metrics/history")
async def get_metrics_history(
    metric_name: str = Query(..., description="Name of the metric to retrieve"),
    minutes: int = Query(default=5, ge=1, le=60, description="Minutes of history to retrieve")
):
    """
    Get historical data for a specific metric.

    Args:
        metric_name: Name of the metric
        minutes: Number of minutes of history to retrieve (1-60)

    Returns:
        Historical metric data points
    """
    try:
        if not metrics_collector:
            raise HTTPException(status_code=503, detail="Metrics collection not available")

        history = metrics_collector.get_metric_history(metric_name, minutes)

        return {
            "status": "success",
            "data": {
                "metric_name": metric_name,
                "minutes": minutes,
                "data_points": len(history),
                "history": history
            },
            "timestamp": datetime.now().isoformat()
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting metric history: {e}")
        raise HTTPException(status_code=500, detail=f"Metric history failed: {str(e)}")


@router.get("/cache/stats")
async def get_cache_statistics():
    """
    Get cache performance statistics.

    Returns:
        Cache hit rates, sizes, and performance metrics
    """
    try:
        if not cache_manager:
            raise HTTPException(status_code=503, detail="Cache management not available")

        cache_stats = cache_manager.get_all_stats()

        # Calculate aggregate statistics
        total_hits = sum(stats['hits'] for stats in cache_stats.values())
        total_misses = sum(stats['misses'] for stats in cache_stats.values())
        total_requests = total_hits + total_misses
        overall_hit_rate = (total_hits / total_requests * 100) if total_requests > 0 else 0

        return {
            "status": "success",
            "data": {
                "overall": {
                    "total_hits": total_hits,
                    "total_misses": total_misses,
                    "overall_hit_rate_percent": overall_hit_rate,
                    "total_requests": total_requests
                },
                "by_cache": cache_stats
            },
            "timestamp": datetime.now().isoformat()
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting cache stats: {e}")
        raise HTTPException(status_code=500, detail=f"Cache statistics failed: {str(e)}")
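
# The aggregation above assumes `cache_manager.get_all_stats()` returns a mapping of
# cache name to per-cache counters that includes at least 'hits' and 'misses', e.g.
# {"session": {"hits": 750, "misses": 250, ...}, ...}. With those example numbers the
# overall hit rate would be 750 / (750 + 250) * 100 = 75.0%.
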
@router.get("/system/info")
async def get_system_info():
    """
    Get system information and configuration.

    Returns:
        System details, configuration, and runtime information
    """
    try:
        import sys
        import platform
        import os

        # Get current metrics if available
        current_metrics = {}
        if metrics_collector:
            current_metrics = metrics_collector.get_current_metrics()

        system_info = {
            "python": {
                "version": sys.version,
                "platform": platform.platform(),
                "architecture": platform.architecture()[0]
            },
            "runtime": {
                "uptime_seconds": current_metrics.get('gauges', {}).get('uptime_seconds', 0),
                "process_id": os.getpid(),
                "working_directory": os.getcwd()
            },
            "performance": {
                "cpu_usage_percent": current_metrics.get('gauges', {}).get('cpu_usage_percent', 0),
                "memory_usage_mb": current_metrics.get('gauges', {}).get('memory_usage_mb', 0),
                "memory_usage_percent": current_metrics.get('gauges', {}).get('memory_usage_percent', 0)
            }
        }

        # Add health status if available
        if health_monitor and health_monitor.last_full_check:
            system_info["health"] = {
                "last_check": health_monitor.last_full_check.isoformat(),
                "check_interval": health_monitor.check_interval_seconds
            }

        return {
            "status": "success",
            "data": system_info,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting system info: {e}")
        raise HTTPException(status_code=500, detail=f"System info failed: {str(e)}")


@router.post("/cache/clear")
async def clear_cache(cache_type: Optional[str] = Query(None, description="Specific cache to clear")):
    """
    Clear cache entries.

    Args:
        cache_type: Optional specific cache to clear (session, lobby, user, message, computed)

    Returns:
        Cache clear results
    """
    try:
        if not cache_manager:
            raise HTTPException(status_code=503, detail="Cache management not available")

        if cache_type:
            # Clear specific cache
            cache_attr = f"{cache_type}_cache"
            if hasattr(cache_manager, cache_attr):
                cache = getattr(cache_manager, cache_attr)
                cache.backend.clear()
                return {
                    "status": "success",
                    "message": f"Cleared {cache_type} cache",
                    "timestamp": datetime.now().isoformat()
                }
            else:
                raise HTTPException(status_code=400, detail=f"Unknown cache type: {cache_type}")
        else:
            # Clear all caches
            for cache_name in ['session', 'lobby', 'user', 'message', 'computed']:
                cache_attr = f"{cache_name}_cache"
                if hasattr(cache_manager, cache_attr):
                    cache = getattr(cache_manager, cache_attr)
                    cache.backend.clear()

            return {
                "status": "success",
                "message": "Cleared all caches",
                "timestamp": datetime.now().isoformat()
            }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error clearing cache: {e}")
        raise HTTPException(status_code=500, detail=f"Cache clear failed: {str(e)}")
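
# Example requests (illustrative only; host and port depend on deployment):
#
#   curl -X POST "http://localhost:8000/api/cache/clear?cache_type=session"
#   curl -X POST "http://localhost:8000/api/cache/clear"    # clears all known caches
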
@router.get("/metrics/export")
async def export_metrics_prometheus():
    """
    Export metrics in Prometheus format.

    Returns:
        Metrics in Prometheus text format
    """
    try:
        if not metrics_collector:
            raise HTTPException(status_code=503, detail="Metrics collection not available")

        current_metrics = metrics_collector.get_current_metrics()

        prometheus_lines = []

        # Convert gauges to Prometheus format
        for metric_name, value in current_metrics.get('gauges', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} gauge")
            prometheus_lines.append(f"{metric_name} {value}")

        # Convert counters to Prometheus format
        for metric_name, value in current_metrics.get('counters', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} counter")
            prometheus_lines.append(f"{metric_name} {value}")

        # Convert histograms to Prometheus format (simplified)
        for metric_name, stats in current_metrics.get('histograms', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} histogram")
            prometheus_lines.append(f"{metric_name}_count {stats['count']}")
            prometheus_lines.append(f"{metric_name}_sum {stats['avg'] * stats['count']}")

        prometheus_text = '\n'.join(prometheus_lines)

        # Return as plain text so scrapers receive the exposition format,
        # not a JSON-encoded string
        return PlainTextResponse(prometheus_text)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error exporting Prometheus metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics export failed: {str(e)}")


# Note: Middleware would be added to the main FastAPI app, not the router.
# This is just an example of how metrics could be collected automatically.
def create_metrics_middleware():
    """Create middleware for automatic metrics collection."""

    async def metrics_middleware(request, call_next):
        """Middleware to automatically collect API metrics."""
        import time

        start_time = time.time()

        try:
            response = await call_next(request)

            # Record metrics if collector is available
            if metrics_collector:
                duration = time.time() - start_time
                endpoint = request.url.path
                method = request.method
                status_code = response.status_code

                # Record request metrics
                metrics_collector.record_counter(
                    'api_requests_total',
                    labels={'endpoint': endpoint, 'method': method, 'status': str(status_code)}
                )
                metrics_collector.record_histogram(
                    'api_request_duration_seconds',
                    duration,
                    labels={'endpoint': endpoint, 'method': method}
                )

            return response
        except Exception as e:
            # Record error metrics
            if metrics_collector:
                duration = time.time() - start_time
                endpoint = request.url.path
                method = request.method

                metrics_collector.record_counter(
                    'api_errors_total',
                    labels={'endpoint': endpoint, 'method': method, 'error': type(e).__name__}
                )
            raise

    return metrics_middleware
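
# Example wiring in the main application (a sketch only -- the module path and
# the name of the FastAPI instance are assumptions about the surrounding project):
#
#   from fastapi import FastAPI
#   from api.monitoring import router as monitoring_router, create_metrics_middleware
#
#   app = FastAPI()
#   app.include_router(monitoring_router)
#   app.middleware("http")(create_metrics_middleware())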