# ai-voicebot/server/api/monitoring.py
"""
Performance and Health Monitoring API Endpoints
Provides REST API endpoints for monitoring system performance, health status,
cache statistics, and operational metrics.
Endpoints:
- /api/health - Health check summary
- /api/health/ready - Readiness probe (Kubernetes)
- /api/health/live - Liveness probe (Kubernetes)
- /api/metrics - Performance metrics
- /api/metrics/history - Historical metrics
- /api/cache/stats - Cache statistics
- /api/system/info - System information
"""
from typing import Dict, Any, Optional
from fastapi import APIRouter, HTTPException, Query
from datetime import datetime, timedelta

from logger import logger

# Import monitoring components. If any are missing, degrade gracefully:
# endpoints report 503 at request time instead of the module failing to import.
try:
    from core.performance import metrics_collector
    from core.health import health_monitor
    from core.cache import cache_manager
except ImportError as e:
    logger.warning(f"Some monitoring components not available: {e}")
    metrics_collector = None
    health_monitor = None
    cache_manager = None

router = APIRouter(prefix="/api", tags=["monitoring"])
@router.get("/health")
async def get_health_summary():
    """
    Get comprehensive health summary.

    Returns:
        Dict containing overall health status and component details

    Raises:
        HTTPException: 503 when health monitoring is unavailable,
            500 when the health check itself fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500,
    # which broke the intended "service unavailable" semantics.
    if not health_monitor:
        raise HTTPException(status_code=503, detail="Health monitoring not available")
    try:
        health_summary = await health_monitor.get_health_summary()
        return {
            "status": "success",
            "data": health_summary,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting health summary: {e}")
        raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
@router.get("/health/ready")
async def readiness_probe():
    """
    Kubernetes readiness probe endpoint.

    Returns:
        Ready status for load balancer inclusion
    """
    try:
        # No monitor available -> not ready, but still a 200 body so the
        # caller gets a structured answer.
        if not health_monitor:
            return {"ready": False, "reason": "Health monitoring not available"}
        status = health_monitor.get_readiness_status()
        payload = {"timestamp": datetime.now().isoformat(), **status}
        if status["ready"]:
            return {"status": "ready", **payload}
        # Not ready: signal 503 so load balancers drop this instance.
        raise HTTPException(
            status_code=503,
            detail={"status": "not_ready", **payload},
        )
    except HTTPException:
        # Preserve deliberate HTTP status codes (e.g. the 503 above).
        raise
    except Exception as exc:
        logger.error(f"Error in readiness probe: {exc}")
        raise HTTPException(status_code=500, detail=f"Readiness check failed: {str(exc)}")
@router.get("/health/live")
async def liveness_probe():
    """
    Kubernetes liveness probe endpoint.

    Returns:
        Alive status for container restart decisions
    """
    try:
        # Without a monitor we optimistically report alive so the container
        # is not restarted merely because monitoring is unavailable.
        if not health_monitor:
            return {"alive": True, "reason": "Basic liveness check"}
        status = health_monitor.get_liveness_status()
        payload = {"timestamp": datetime.now().isoformat(), **status}
        if status["alive"]:
            return {"status": "alive", **payload}
        # Dead: signal 503 so the orchestrator restarts the container.
        raise HTTPException(
            status_code=503,
            detail={"status": "not_alive", **payload},
        )
    except HTTPException:
        # Preserve deliberate HTTP status codes (e.g. the 503 above).
        raise
    except Exception as exc:
        logger.error(f"Error in liveness probe: {exc}")
        raise HTTPException(status_code=500, detail=f"Liveness check failed: {str(exc)}")
@router.get("/metrics")
async def get_current_metrics():
    """
    Get current performance metrics.

    Returns:
        Current system and application metrics

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when gathering the metrics fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        current_metrics = metrics_collector.get_current_metrics()
        performance_summary = metrics_collector.get_performance_summary()
        return {
            "status": "success",
            "data": {
                "current": current_metrics,
                "summary": performance_summary
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics collection failed: {str(e)}")
@router.get("/metrics/history")
async def get_metrics_history(
    metric_name: str = Query(..., description="Name of the metric to retrieve"),
    minutes: int = Query(default=5, ge=1, le=60, description="Minutes of history to retrieve")
):
    """
    Get historical data for a specific metric.

    Args:
        metric_name: Name of the metric
        minutes: Number of minutes of history to retrieve (1-60)

    Returns:
        Historical metric data points

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when history retrieval fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        history = metrics_collector.get_metric_history(metric_name, minutes)
        return {
            "status": "success",
            "data": {
                "metric_name": metric_name,
                "minutes": minutes,
                "data_points": len(history),
                "history": history
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting metric history: {e}")
        raise HTTPException(status_code=500, detail=f"Metric history failed: {str(e)}")
@router.get("/cache/stats")
async def get_cache_statistics():
    """
    Get cache performance statistics.

    Returns:
        Cache hit rates, sizes, and performance metrics

    Raises:
        HTTPException: 503 when cache management is unavailable,
            500 when statistics collection fails.
    """
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not cache_manager:
        raise HTTPException(status_code=503, detail="Cache management not available")
    try:
        cache_stats = cache_manager.get_all_stats()
        # Aggregate hit/miss totals across all caches.
        total_hits = sum(stats['hits'] for stats in cache_stats.values())
        total_misses = sum(stats['misses'] for stats in cache_stats.values())
        total_requests = total_hits + total_misses
        # Avoid division by zero when no cache has been touched yet.
        overall_hit_rate = (total_hits / total_requests * 100) if total_requests > 0 else 0
        return {
            "status": "success",
            "data": {
                "overall": {
                    "total_hits": total_hits,
                    "total_misses": total_misses,
                    "overall_hit_rate_percent": overall_hit_rate,
                    "total_requests": total_requests
                },
                "by_cache": cache_stats
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting cache stats: {e}")
        raise HTTPException(status_code=500, detail=f"Cache statistics failed: {str(e)}")
@router.get("/system/info")
async def get_system_info():
    """
    Get system information and configuration.

    Returns:
        System details, configuration, and runtime information
    """
    try:
        import os
        import platform
        import sys

        # Pull the gauge metrics once; fall back to an empty mapping when
        # the collector is unavailable so the .get() lookups below return 0.
        gauges = {}
        if metrics_collector:
            gauges = metrics_collector.get_current_metrics().get('gauges', {})
        info = {
            "python": {
                "version": sys.version,
                "platform": platform.platform(),
                "architecture": platform.architecture()[0]
            },
            "runtime": {
                "uptime_seconds": gauges.get('uptime_seconds', 0),
                "process_id": os.getpid(),
                "working_directory": os.getcwd()
            },
            "performance": {
                "cpu_usage_percent": gauges.get('cpu_usage_percent', 0),
                "memory_usage_mb": gauges.get('memory_usage_mb', 0),
                "memory_usage_percent": gauges.get('memory_usage_percent', 0)
            }
        }
        # Attach health-check metadata only when a completed check exists.
        if health_monitor and health_monitor.last_full_check:
            info["health"] = {
                "last_check": health_monitor.last_full_check.isoformat(),
                "check_interval": health_monitor.check_interval_seconds
            }
        return {
            "status": "success",
            "data": info,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting system info: {e}")
        raise HTTPException(status_code=500, detail=f"System info failed: {str(e)}")
@router.post("/cache/clear")
async def clear_cache(cache_type: Optional[str] = Query(None, description="Specific cache to clear")):
    """
    Clear cache entries.

    Args:
        cache_type: Optional specific cache to clear (session, lobby, user, message, computed)

    Returns:
        Cache clear results
    """
    try:
        if not cache_manager:
            raise HTTPException(status_code=503, detail="Cache management not available")
        if cache_type:
            # Clear one named cache; reject names the manager doesn't expose.
            attr = f"{cache_type}_cache"
            if not hasattr(cache_manager, attr):
                raise HTTPException(status_code=400, detail=f"Unknown cache type: {cache_type}")
            getattr(cache_manager, attr).backend.clear()
            message = f"Cleared {cache_type} cache"
        else:
            # No type given: clear every known cache that actually exists.
            for name in ('session', 'lobby', 'user', 'message', 'computed'):
                attr = f"{name}_cache"
                if hasattr(cache_manager, attr):
                    getattr(cache_manager, attr).backend.clear()
            message = "Cleared all caches"
        return {
            "status": "success",
            "message": message,
            "timestamp": datetime.now().isoformat()
        }
    except HTTPException:
        # Preserve the deliberate 503/400 responses raised above.
        raise
    except Exception as e:
        logger.error(f"Error clearing cache: {e}")
        raise HTTPException(status_code=500, detail=f"Cache clear failed: {str(e)}")
@router.get("/metrics/export")
async def export_metrics_prometheus():
    """
    Export metrics in the Prometheus text exposition format.

    Returns:
        A text/plain response body. Previously this returned a bare ``str``,
        which FastAPI JSON-encodes (quoted, with escaped newlines) — a format
        Prometheus scrapers cannot parse.

    Raises:
        HTTPException: 503 when metrics collection is unavailable,
            500 when the export fails.
    """
    # fastapi is already a hard dependency of this module; local import keeps
    # this block self-contained.
    from fastapi.responses import PlainTextResponse
    # Guard placed outside the try block: previously the 503 raised here was
    # caught by the generic `except Exception` below and re-raised as a 500.
    if not metrics_collector:
        raise HTTPException(status_code=503, detail="Metrics collection not available")
    try:
        current_metrics = metrics_collector.get_current_metrics()
        prometheus_lines = []
        # Gauges map 1:1 onto the Prometheus gauge type.
        for metric_name, value in current_metrics.get('gauges', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} gauge")
            prometheus_lines.append(f"{metric_name} {value}")
        # Counters map 1:1 onto the Prometheus counter type.
        for metric_name, value in current_metrics.get('counters', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} counter")
            prometheus_lines.append(f"{metric_name} {value}")
        # Histograms are exported in simplified form: count and sum only
        # (sum reconstructed as avg * count; no bucket data available).
        for metric_name, stats in current_metrics.get('histograms', {}).items():
            prometheus_lines.append(f"# TYPE {metric_name} histogram")
            prometheus_lines.append(f"{metric_name}_count {stats['count']}")
            prometheus_lines.append(f"{metric_name}_sum {stats['avg'] * stats['count']}")
        # The exposition format requires a trailing newline after the last line.
        prometheus_text = '\n'.join(prometheus_lines) + '\n'
        return PlainTextResponse(
            content=prometheus_text,
            media_type="text/plain; version=0.0.4",
        )
    except Exception as e:
        logger.error(f"Error exporting Prometheus metrics: {e}")
        raise HTTPException(status_code=500, detail=f"Metrics export failed: {str(e)}")
# Note: Middleware would be added to the main FastAPI app, not the router
# This is just an example of how metrics could be collected automatically
def create_metrics_middleware():
    """Create middleware for automatic metrics collection."""
    async def metrics_middleware(request, call_next):
        """Middleware to automatically collect API metrics."""
        import time
        started = time.time()
        try:
            response = await call_next(request)
        except Exception as exc:
            # Downstream failure: count it (tagged by exception class) and
            # let the exception propagate to the app's error handlers.
            if metrics_collector:
                metrics_collector.record_counter(
                    'api_errors_total',
                    labels={
                        'endpoint': request.url.path,
                        'method': request.method,
                        'error': type(exc).__name__,
                    }
                )
            raise
        # Success path: record the request count and its latency.
        if metrics_collector:
            elapsed = time.time() - started
            labels = {'endpoint': request.url.path, 'method': request.method}
            metrics_collector.record_counter(
                'api_requests_total',
                labels={**labels, 'status': str(response.status_code)}
            )
            metrics_collector.record_histogram(
                'api_request_duration_seconds',
                elapsed,
                labels=labels
            )
        return response
    return metrics_middleware