433 lines
14 KiB
Python
433 lines
14 KiB
Python
"""
|
|
Performance and Health Monitoring API Endpoints
|
|
|
|
Provides REST API endpoints for monitoring system performance, health status,
|
|
cache statistics, and operational metrics.
|
|
|
|
Endpoints:
|
|
- /api/health - Health check summary
|
|
- /api/health/ready - Readiness probe (Kubernetes)
|
|
- /api/health/live - Liveness probe (Kubernetes)
|
|
- /api/metrics - Performance metrics
|
|
- /api/metrics/history - Historical metrics
|
|
- /api/cache/stats - Cache statistics
|
|
- /api/system/info - System information
|
|
"""
|
|
|
|
from typing import Dict, Any, Optional
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
from datetime import datetime, timedelta
|
|
|
|
from logger import logger
|
|
|
|
# Import monitoring components
|
|
try:
|
|
from core.performance import metrics_collector
|
|
from core.health import health_monitor
|
|
from core.cache import cache_manager
|
|
except ImportError as e:
|
|
logger.warning(f"Some monitoring components not available: {e}")
|
|
metrics_collector = None
|
|
health_monitor = None
|
|
cache_manager = None
|
|
|
|
|
|
router = APIRouter(prefix="/api", tags=["monitoring"])
|
|
|
|
|
|
@router.get("/health")
|
|
async def get_health_summary():
|
|
"""
|
|
Get comprehensive health summary.
|
|
|
|
Returns:
|
|
Dict containing overall health status and component details
|
|
"""
|
|
try:
|
|
if not health_monitor:
|
|
raise HTTPException(status_code=503, detail="Health monitoring not available")
|
|
|
|
health_summary = await health_monitor.get_health_summary()
|
|
return {
|
|
"status": "success",
|
|
"data": health_summary,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting health summary: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
|
|
|
|
|
|
@router.get("/health/ready")
|
|
async def readiness_probe():
|
|
"""
|
|
Kubernetes readiness probe endpoint.
|
|
|
|
Returns:
|
|
Ready status for load balancer inclusion
|
|
"""
|
|
try:
|
|
if not health_monitor:
|
|
return {"ready": False, "reason": "Health monitoring not available"}
|
|
|
|
readiness = health_monitor.get_readiness_status()
|
|
|
|
if readiness["ready"]:
|
|
return {
|
|
"status": "ready",
|
|
"timestamp": datetime.now().isoformat(),
|
|
**readiness
|
|
}
|
|
else:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail={
|
|
"status": "not_ready",
|
|
"timestamp": datetime.now().isoformat(),
|
|
**readiness
|
|
}
|
|
)
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Error in readiness probe: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Readiness check failed: {str(e)}")
|
|
|
|
|
|
@router.get("/health/live")
|
|
async def liveness_probe():
|
|
"""
|
|
Kubernetes liveness probe endpoint.
|
|
|
|
Returns:
|
|
Alive status for container restart decisions
|
|
"""
|
|
try:
|
|
if not health_monitor:
|
|
return {"alive": True, "reason": "Basic liveness check"}
|
|
|
|
liveness = health_monitor.get_liveness_status()
|
|
|
|
if liveness["alive"]:
|
|
return {
|
|
"status": "alive",
|
|
"timestamp": datetime.now().isoformat(),
|
|
**liveness
|
|
}
|
|
else:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail={
|
|
"status": "not_alive",
|
|
"timestamp": datetime.now().isoformat(),
|
|
**liveness
|
|
}
|
|
)
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Error in liveness probe: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Liveness check failed: {str(e)}")
|
|
|
|
|
|
@router.get("/metrics")
|
|
async def get_current_metrics():
|
|
"""
|
|
Get current performance metrics.
|
|
|
|
Returns:
|
|
Current system and application metrics
|
|
"""
|
|
try:
|
|
if not metrics_collector:
|
|
raise HTTPException(status_code=503, detail="Metrics collection not available")
|
|
|
|
current_metrics = metrics_collector.get_current_metrics()
|
|
performance_summary = metrics_collector.get_performance_summary()
|
|
|
|
return {
|
|
"status": "success",
|
|
"data": {
|
|
"current": current_metrics,
|
|
"summary": performance_summary
|
|
},
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting metrics: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Metrics collection failed: {str(e)}")
|
|
|
|
|
|
@router.get("/metrics/history")
|
|
async def get_metrics_history(
|
|
metric_name: str = Query(..., description="Name of the metric to retrieve"),
|
|
minutes: int = Query(default=5, ge=1, le=60, description="Minutes of history to retrieve")
|
|
):
|
|
"""
|
|
Get historical data for a specific metric.
|
|
|
|
Args:
|
|
metric_name: Name of the metric
|
|
minutes: Number of minutes of history to retrieve (1-60)
|
|
|
|
Returns:
|
|
Historical metric data points
|
|
"""
|
|
try:
|
|
if not metrics_collector:
|
|
raise HTTPException(status_code=503, detail="Metrics collection not available")
|
|
|
|
history = metrics_collector.get_metric_history(metric_name, minutes)
|
|
|
|
return {
|
|
"status": "success",
|
|
"data": {
|
|
"metric_name": metric_name,
|
|
"minutes": minutes,
|
|
"data_points": len(history),
|
|
"history": history
|
|
},
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting metric history: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Metric history failed: {str(e)}")
|
|
|
|
|
|
@router.get("/cache/stats")
|
|
async def get_cache_statistics():
|
|
"""
|
|
Get cache performance statistics.
|
|
|
|
Returns:
|
|
Cache hit rates, sizes, and performance metrics
|
|
"""
|
|
try:
|
|
if not cache_manager:
|
|
raise HTTPException(status_code=503, detail="Cache management not available")
|
|
|
|
cache_stats = cache_manager.get_all_stats()
|
|
|
|
# Calculate aggregate statistics
|
|
total_hits = sum(stats['hits'] for stats in cache_stats.values())
|
|
total_misses = sum(stats['misses'] for stats in cache_stats.values())
|
|
total_requests = total_hits + total_misses
|
|
overall_hit_rate = (total_hits / total_requests * 100) if total_requests > 0 else 0
|
|
|
|
return {
|
|
"status": "success",
|
|
"data": {
|
|
"overall": {
|
|
"total_hits": total_hits,
|
|
"total_misses": total_misses,
|
|
"overall_hit_rate_percent": overall_hit_rate,
|
|
"total_requests": total_requests
|
|
},
|
|
"by_cache": cache_stats
|
|
},
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting cache stats: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Cache statistics failed: {str(e)}")
|
|
|
|
|
|
@router.get("/system/info")
|
|
async def get_system_info():
|
|
"""
|
|
Get system information and configuration.
|
|
|
|
Returns:
|
|
System details, configuration, and runtime information
|
|
"""
|
|
try:
|
|
import sys
|
|
import platform
|
|
import os
|
|
|
|
# Get current metrics if available
|
|
current_metrics = {}
|
|
if metrics_collector:
|
|
current_metrics = metrics_collector.get_current_metrics()
|
|
|
|
system_info = {
|
|
"python": {
|
|
"version": sys.version,
|
|
"platform": platform.platform(),
|
|
"architecture": platform.architecture()[0]
|
|
},
|
|
"runtime": {
|
|
"uptime_seconds": current_metrics.get('gauges', {}).get('uptime_seconds', 0),
|
|
"process_id": os.getpid(),
|
|
"working_directory": os.getcwd()
|
|
},
|
|
"performance": {
|
|
"cpu_usage_percent": current_metrics.get('gauges', {}).get('cpu_usage_percent', 0),
|
|
"memory_usage_mb": current_metrics.get('gauges', {}).get('memory_usage_mb', 0),
|
|
"memory_usage_percent": current_metrics.get('gauges', {}).get('memory_usage_percent', 0)
|
|
}
|
|
}
|
|
|
|
# Add health status if available
|
|
if health_monitor and health_monitor.last_full_check:
|
|
system_info["health"] = {
|
|
"last_check": health_monitor.last_full_check.isoformat(),
|
|
"check_interval": health_monitor.check_interval_seconds
|
|
}
|
|
|
|
return {
|
|
"status": "success",
|
|
"data": system_info,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting system info: {e}")
|
|
raise HTTPException(status_code=500, detail=f"System info failed: {str(e)}")
|
|
|
|
|
|
@router.post("/cache/clear")
|
|
async def clear_cache(cache_type: Optional[str] = Query(None, description="Specific cache to clear")):
|
|
"""
|
|
Clear cache entries.
|
|
|
|
Args:
|
|
cache_type: Optional specific cache to clear (session, lobby, user, message, computed)
|
|
|
|
Returns:
|
|
Cache clear results
|
|
"""
|
|
try:
|
|
if not cache_manager:
|
|
raise HTTPException(status_code=503, detail="Cache management not available")
|
|
|
|
if cache_type:
|
|
# Clear specific cache
|
|
cache_attr = f"{cache_type}_cache"
|
|
if hasattr(cache_manager, cache_attr):
|
|
cache = getattr(cache_manager, cache_attr)
|
|
cache.backend.clear()
|
|
return {
|
|
"status": "success",
|
|
"message": f"Cleared {cache_type} cache",
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
else:
|
|
raise HTTPException(status_code=400, detail=f"Unknown cache type: {cache_type}")
|
|
else:
|
|
# Clear all caches
|
|
for cache_name in ['session', 'lobby', 'user', 'message', 'computed']:
|
|
cache_attr = f"{cache_name}_cache"
|
|
if hasattr(cache_manager, cache_attr):
|
|
cache = getattr(cache_manager, cache_attr)
|
|
cache.backend.clear()
|
|
|
|
return {
|
|
"status": "success",
|
|
"message": "Cleared all caches",
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Error clearing cache: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Cache clear failed: {str(e)}")
|
|
|
|
|
|
@router.get("/metrics/export")
|
|
async def export_metrics_prometheus():
|
|
"""
|
|
Export metrics in Prometheus format.
|
|
|
|
Returns:
|
|
Metrics in Prometheus text format
|
|
"""
|
|
try:
|
|
if not metrics_collector:
|
|
raise HTTPException(status_code=503, detail="Metrics collection not available")
|
|
|
|
current_metrics = metrics_collector.get_current_metrics()
|
|
prometheus_lines = []
|
|
|
|
# Convert gauges to Prometheus format
|
|
for metric_name, value in current_metrics.get('gauges', {}).items():
|
|
prometheus_lines.append(f"# TYPE {metric_name} gauge")
|
|
prometheus_lines.append(f"{metric_name} {value}")
|
|
|
|
# Convert counters to Prometheus format
|
|
for metric_name, value in current_metrics.get('counters', {}).items():
|
|
prometheus_lines.append(f"# TYPE {metric_name} counter")
|
|
prometheus_lines.append(f"{metric_name} {value}")
|
|
|
|
# Convert histograms to Prometheus format (simplified)
|
|
for metric_name, stats in current_metrics.get('histograms', {}).items():
|
|
prometheus_lines.append(f"# TYPE {metric_name} histogram")
|
|
prometheus_lines.append(f"{metric_name}_count {stats['count']}")
|
|
prometheus_lines.append(f"{metric_name}_sum {stats['avg'] * stats['count']}")
|
|
|
|
prometheus_text = '\n'.join(prometheus_lines)
|
|
|
|
return prometheus_text
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error exporting Prometheus metrics: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Metrics export failed: {str(e)}")
|
|
|
|
|
|
# Note: Middleware would be added to the main FastAPI app, not the router
|
|
# This is just an example of how metrics could be collected automatically
|
|
|
|
def create_metrics_middleware():
|
|
"""Create middleware for automatic metrics collection."""
|
|
async def metrics_middleware(request, call_next):
|
|
"""Middleware to automatically collect API metrics."""
|
|
import time
|
|
start_time = time.time()
|
|
|
|
try:
|
|
response = await call_next(request)
|
|
|
|
# Record metrics if collector is available
|
|
if metrics_collector:
|
|
duration = time.time() - start_time
|
|
endpoint = request.url.path
|
|
method = request.method
|
|
status_code = response.status_code
|
|
|
|
# Record request metrics
|
|
metrics_collector.record_counter(
|
|
'api_requests_total',
|
|
labels={'endpoint': endpoint, 'method': method, 'status': str(status_code)}
|
|
)
|
|
|
|
metrics_collector.record_histogram(
|
|
'api_request_duration_seconds',
|
|
duration,
|
|
labels={'endpoint': endpoint, 'method': method}
|
|
)
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
# Record error metrics
|
|
if metrics_collector:
|
|
duration = time.time() - start_time
|
|
endpoint = request.url.path
|
|
method = request.method
|
|
|
|
metrics_collector.record_counter(
|
|
'api_errors_total',
|
|
labels={'endpoint': endpoint, 'method': method, 'error': type(e).__name__}
|
|
)
|
|
|
|
raise
|
|
|
|
return metrics_middleware
|