Files
wifi-densepose/src/tasks/monitoring.py
2025-06-07 17:11:45 +00:00

773 lines
28 KiB
Python

"""
Monitoring tasks for WiFi-DensePose API
"""
import asyncio
import logging
import psutil
import time
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from contextlib import asynccontextmanager
from sqlalchemy import select, func, and_, or_
from sqlalchemy.ext.asyncio import AsyncSession
from src.config.settings import Settings
from src.database.connection import get_database_manager
from src.database.models import SystemMetric, Device, Session, CSIData, PoseDetection
from src.logger import get_logger
logger = get_logger(__name__)
class MonitoringTask:
    """Base class for periodic monitoring tasks.

    Subclasses implement :meth:`collect_metrics` to return a list of metric
    dicts; :meth:`run` persists them as ``SystemMetric`` rows and tracks
    per-task run/error statistics.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True
        # Timestamp of the last successful run (None until the first run).
        self.last_run: Optional[datetime] = None
        self.run_count = 0
        self.error_count = 0
        self.interval_seconds = 60  # Default interval between runs.

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect metrics for this task. Subclasses must override."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Run the monitoring task with error handling.

        Returns a summary dict describing success or failure. Never raises,
        so one failing task cannot abort a whole monitoring round.
        """
        start_time = datetime.utcnow()
        try:
            logger.debug(f"Starting monitoring task: {self.name}")
            metrics = await self.collect_metrics(session)
            # Store metrics in database.
            for metric_data in metrics:
                metric = SystemMetric(
                    metric_name=metric_data["name"],
                    metric_type=metric_data["type"],
                    value=metric_data["value"],
                    unit=metric_data.get("unit"),
                    labels=metric_data.get("labels"),
                    tags=metric_data.get("tags"),
                    source=metric_data.get("source", self.name),
                    component=metric_data.get("component"),
                    description=metric_data.get("description"),
                    meta_data=metric_data.get("metadata"),
                )
                session.add(metric)
            await session.commit()
            self.last_run = start_time
            self.run_count += 1
            logger.debug(f"Monitoring task {self.name} completed: collected {len(metrics)} metrics")
            return {
                "task": self.name,
                "status": "success",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "metrics_collected": len(metrics),
            }
        except Exception as e:
            self.error_count += 1
            logger.error(f"Monitoring task {self.name} failed: {e}", exc_info=True)
            # BUGFIX: roll back the failed transaction. The session is shared
            # across tasks within a monitoring round (see
            # MonitoringManager.run_all_tasks), so leaving a failed transaction
            # open would break every subsequent task in the round.
            try:
                await session.rollback()
            except Exception:
                logger.warning(f"Rollback failed for monitoring task {self.name}", exc_info=True)
            return {
                "task": self.name,
                "status": "error",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "error": str(e),
                "metrics_collected": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Get task statistics (counters and last-run timestamp)."""
        return {
            "name": self.name,
            "enabled": self.enabled,
            "interval_seconds": self.interval_seconds,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
            "error_count": self.error_count,
        }
class SystemResourceMonitoring(MonitoringTask):
    """Monitor system resources (CPU, memory, disk, network) via psutil."""

    def __init__(self, settings: Settings):
        super().__init__("system_resources", settings)
        self.interval_seconds = settings.system_monitoring_interval

    @staticmethod
    def _metric(name, value, unit, component, description, timestamp, metric_type="gauge"):
        """Build one metric dict in the shape consumed by MonitoringTask.run."""
        return {
            "name": name,
            "type": metric_type,
            "value": value,
            "unit": unit,
            "component": component,
            "description": description,
            "metadata": {"timestamp": timestamp.isoformat()},
        }

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect system resource metrics.

        Returns gauge metrics for CPU/memory/disk usage and counter metrics
        for cumulative disk/network I/O totals.
        """
        metrics: List[Dict[str, Any]] = []
        timestamp = datetime.utcnow()

        # CPU metrics. BUGFIX: psutil.cpu_percent(interval=1) sleeps ~1 second
        # to sample; running it inline blocked the asyncio event loop, so it
        # is offloaded to the default thread-pool executor instead.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(
            None, lambda: psutil.cpu_percent(interval=1)
        )
        cpu_count = psutil.cpu_count()
        cpu_freq = psutil.cpu_freq()
        metrics.append(self._metric(
            "system_cpu_usage_percent", cpu_percent, "percent", "cpu",
            "CPU usage percentage", timestamp))
        metrics.append(self._metric(
            "system_cpu_count", cpu_count, "count", "cpu",
            "Number of CPU cores", timestamp))
        if cpu_freq:
            # cpu_freq can be None on platforms where frequency is unavailable.
            metrics.append(self._metric(
                "system_cpu_frequency_mhz", cpu_freq.current, "mhz", "cpu",
                "Current CPU frequency", timestamp))

        # Memory metrics (virtual memory + swap).
        memory = psutil.virtual_memory()
        swap = psutil.swap_memory()
        for name, value, unit, description in (
            ("system_memory_total_bytes", memory.total, "bytes", "Total system memory"),
            ("system_memory_used_bytes", memory.used, "bytes", "Used system memory"),
            ("system_memory_available_bytes", memory.available, "bytes", "Available system memory"),
            ("system_memory_usage_percent", memory.percent, "percent", "Memory usage percentage"),
            ("system_swap_total_bytes", swap.total, "bytes", "Total swap memory"),
            ("system_swap_used_bytes", swap.used, "bytes", "Used swap memory"),
        ):
            metrics.append(self._metric(name, value, unit, "memory", description, timestamp))

        # Disk metrics for the root filesystem.
        disk_usage = psutil.disk_usage('/')
        disk_io = psutil.disk_io_counters()
        # Guard against a zero total to avoid ZeroDivisionError.
        disk_percent = (disk_usage.used / disk_usage.total) * 100 if disk_usage.total else 0.0
        for name, value, unit, description in (
            ("system_disk_total_bytes", disk_usage.total, "bytes", "Total disk space"),
            ("system_disk_used_bytes", disk_usage.used, "bytes", "Used disk space"),
            ("system_disk_free_bytes", disk_usage.free, "bytes", "Free disk space"),
            ("system_disk_usage_percent", disk_percent, "percent", "Disk usage percentage"),
        ):
            metrics.append(self._metric(name, value, unit, "disk", description, timestamp))
        if disk_io:
            # disk_io_counters() can return None (e.g. in some containers).
            metrics.append(self._metric(
                "system_disk_read_bytes_total", disk_io.read_bytes, "bytes", "disk",
                "Total bytes read from disk", timestamp, metric_type="counter"))
            metrics.append(self._metric(
                "system_disk_write_bytes_total", disk_io.write_bytes, "bytes", "disk",
                "Total bytes written to disk", timestamp, metric_type="counter"))

        # Network metrics (cumulative since boot).
        network_io = psutil.net_io_counters()
        if network_io:
            for name, value, unit, description in (
                ("system_network_bytes_sent_total", network_io.bytes_sent, "bytes",
                 "Total bytes sent over network"),
                ("system_network_bytes_recv_total", network_io.bytes_recv, "bytes",
                 "Total bytes received over network"),
                ("system_network_packets_sent_total", network_io.packets_sent, "count",
                 "Total packets sent over network"),
                ("system_network_packets_recv_total", network_io.packets_recv, "count",
                 "Total packets received over network"),
            ):
                metrics.append(self._metric(
                    name, value, unit, "network", description, timestamp,
                    metric_type="counter"))
        return metrics
class DatabaseMonitoring(MonitoringTask):
    """Monitor database performance and statistics."""

    def __init__(self, settings: Settings):
        super().__init__("database", settings)
        self.interval_seconds = settings.database_monitoring_interval

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect database metrics: pool/connection gauges and table row counts."""
        collected: List[Dict[str, Any]] = []
        now = datetime.utcnow()

        # Pool/connection statistics from the shared database manager.
        db_manager = get_database_manager(self.settings)
        connection_stats = await db_manager.get_connection_stats()

        # PostgreSQL connection pool gauges.
        if "postgresql" in connection_stats:
            pg = connection_stats["postgresql"]
            pg_gauges = (
                ("database_connections_total", pg.get("total_connections", 0),
                 "Total database connections"),
                ("database_connections_active", pg.get("checked_out", 0),
                 "Active database connections"),
                ("database_connections_available", pg.get("available_connections", 0),
                 "Available database connections"),
            )
            for metric_name, value, description in pg_gauges:
                collected.append({
                    "name": metric_name,
                    "type": "gauge",
                    "value": value,
                    "unit": "count",
                    "component": "postgresql",
                    "description": description,
                    "metadata": {"timestamp": now.isoformat()},
                })

        # Redis client gauges (skipped when the stats report an error).
        if "redis" in connection_stats and not connection_stats["redis"].get("error"):
            redis = connection_stats["redis"]
            redis_gauges = (
                ("redis_connections_active", redis.get("connected_clients", 0),
                 "Active Redis connections"),
                ("redis_connections_blocked", redis.get("blocked_clients", 0),
                 "Blocked Redis connections"),
            )
            for metric_name, value, description in redis_gauges:
                collected.append({
                    "name": metric_name,
                    "type": "gauge",
                    "value": value,
                    "unit": "count",
                    "component": "redis",
                    "description": description,
                    "metadata": {"timestamp": now.isoformat()},
                })

        # One gauge per table with its current row count.
        for table_name, count in (await self._get_table_counts(session)).items():
            collected.append({
                "name": f"database_table_rows_{table_name}",
                "type": "gauge",
                "value": count,
                "unit": "count",
                "component": "postgresql",
                "description": f"Number of rows in {table_name} table",
                "metadata": {"timestamp": now.isoformat(), "table": table_name},
            })
        return collected

    async def _get_table_counts(self, session: AsyncSession) -> Dict[str, int]:
        """Get row counts for all tables."""
        tables = (
            ("devices", Device.id),
            ("sessions", Session.id),
            ("csi_data", CSIData.id),
            ("pose_detections", PoseDetection.id),
            ("system_metrics", SystemMetric.id),
        )
        counts: Dict[str, int] = {}
        for label, pk_column in tables:
            result = await session.execute(select(func.count(pk_column)))
            counts[label] = result.scalar() or 0
        return counts
class ApplicationMonitoring(MonitoringTask):
    """Monitor application-specific metrics."""

    def __init__(self, settings: Settings):
        super().__init__("application", settings)
        self.interval_seconds = settings.application_monitoring_interval
        # Used as the reference point for uptime reporting.
        self.start_time = datetime.utcnow()

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect application metrics: uptime, active entities, throughput."""
        now = datetime.utcnow()
        collected: List[Dict[str, Any]] = []

        def make_gauge(name, value, unit, description, extra_meta=None):
            # All application metrics share the same component and shape.
            meta = {"timestamp": now.isoformat()}
            if extra_meta:
                meta.update(extra_meta)
            return {
                "name": name,
                "type": "gauge",
                "value": value,
                "unit": unit,
                "component": "application",
                "description": description,
                "metadata": meta,
            }

        async def count_of(query) -> int:
            # Execute a COUNT query and normalize a missing result to 0.
            result = await session.execute(query)
            return result.scalar() or 0

        # Uptime since this task was instantiated.
        uptime_seconds = (now - self.start_time).total_seconds()
        collected.append(make_gauge(
            "application_uptime_seconds", uptime_seconds, "seconds",
            "Application uptime in seconds"))

        # Sessions and devices currently marked active.
        active_sessions = await count_of(
            select(func.count(Session.id)).where(Session.status == "active"))
        collected.append(make_gauge(
            "application_active_sessions", active_sessions, "count",
            "Number of active sessions"))
        active_devices = await count_of(
            select(func.count(Device.id)).where(Device.status == "active"))
        collected.append(make_gauge(
            "application_active_devices", active_devices, "count",
            "Number of active devices"))

        # Throughput over the trailing hour.
        one_hour_ago = now - timedelta(hours=1)
        recent_csi = await count_of(
            select(func.count(CSIData.id)).where(CSIData.created_at >= one_hour_ago))
        collected.append(make_gauge(
            "application_csi_data_hourly", recent_csi, "count",
            "CSI data records created in the last hour"))
        recent_poses = await count_of(
            select(func.count(PoseDetection.id)).where(PoseDetection.created_at >= one_hour_ago))
        collected.append(make_gauge(
            "application_pose_detections_hourly", recent_poses, "count",
            "Pose detections created in the last hour"))

        # Breakdown of CSI records by processing status.
        for status in ("pending", "processing", "completed", "failed"):
            status_total = await count_of(
                select(func.count(CSIData.id)).where(CSIData.processing_status == status))
            collected.append(make_gauge(
                f"application_csi_processing_{status}", status_total, "count",
                f"CSI data records with {status} processing status",
                extra_meta={"status": status}))
        return collected
class PerformanceMonitoring(MonitoringTask):
    """Monitor performance metrics and response times."""

    # Maximum number of response-time samples retained. BUGFIX: the original
    # code trimmed the sample list only during collection (and only when it
    # was non-empty), so the buffer could grow without bound between
    # monitoring rounds under heavy API traffic.
    MAX_RESPONSE_SAMPLES = 100

    def __init__(self, settings: Settings):
        super().__init__("performance", settings)
        self.interval_seconds = settings.performance_monitoring_interval
        # Recent API response-time samples in milliseconds (bounded).
        self.response_times: List[float] = []
        # Cumulative error counts keyed by error type.
        self.error_counts: Dict[str, int] = {}

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect performance metrics: DB latency probe, API averages, errors."""
        metrics: List[Dict[str, Any]] = []
        timestamp = datetime.utcnow()

        # Database query performance probe: time a trivial COUNT query.
        start_time = time.time()
        test_query = select(func.count(Device.id))
        await session.execute(test_query)
        db_response_time = (time.time() - start_time) * 1000  # Convert to milliseconds
        metrics.append({
            "name": "performance_database_query_time_ms",
            "type": "gauge",
            "value": db_response_time,
            "unit": "milliseconds",
            "component": "database",
            "description": "Database query response time",
            "metadata": {"timestamp": timestamp.isoformat()}
        })

        # Average API response time over the retained samples (if any).
        # The buffer is bounded at append time (record_response_time), so no
        # trimming is needed here.
        if self.response_times:
            avg_response_time = sum(self.response_times) / len(self.response_times)
            metrics.append({
                "name": "performance_avg_response_time_ms",
                "type": "gauge",
                "value": avg_response_time,
                "unit": "milliseconds",
                "component": "api",
                "description": "Average API response time",
                "metadata": {"timestamp": timestamp.isoformat()}
            })

        # Error rates (cumulative counters per error type).
        for error_type, count in self.error_counts.items():
            metrics.append({
                "name": f"performance_errors_{error_type}_total",
                "type": "counter",
                "value": count,
                "unit": "count",
                "component": "api",
                "description": f"Total {error_type} errors",
                "metadata": {"timestamp": timestamp.isoformat(), "error_type": error_type}
            })
        return metrics

    def record_response_time(self, response_time_ms: float):
        """Record an API response time.

        Keeps only the most recent MAX_RESPONSE_SAMPLES samples so the buffer
        cannot grow without bound between monitoring rounds.
        """
        self.response_times.append(response_time_ms)
        if len(self.response_times) > self.MAX_RESPONSE_SAMPLES:
            del self.response_times[:-self.MAX_RESPONSE_SAMPLES]

    def record_error(self, error_type: str):
        """Record an error occurrence (increments the per-type counter)."""
        self.error_counts[error_type] = self.error_counts.get(error_type, 0) + 1
class MonitoringManager:
    """Manager that owns and runs all monitoring tasks."""

    def __init__(self, settings: Settings):
        self.settings = settings
        self.db_manager = get_database_manager(settings)
        self.tasks = self._initialize_tasks()
        # Guard against overlapping monitoring rounds.
        self.running = False
        self.last_run = None
        self.run_count = 0

    def _initialize_tasks(self) -> List[MonitoringTask]:
        """Initialize all monitoring tasks, keeping only the enabled ones."""
        candidates = (
            SystemResourceMonitoring(self.settings),
            DatabaseMonitoring(self.settings),
            ApplicationMonitoring(self.settings),
            PerformanceMonitoring(self.settings),
        )
        enabled = [candidate for candidate in candidates if candidate.enabled]
        logger.info(f"Initialized {len(enabled)} monitoring tasks")
        return enabled

    async def run_all_tasks(self) -> Dict[str, Any]:
        """Run every enabled monitoring task once, sharing one DB session."""
        if self.running:
            return {"status": "already_running", "message": "Monitoring already in progress"}
        self.running = True
        started_at = datetime.utcnow()
        try:
            logger.debug("Starting monitoring tasks")
            task_results: List[Dict[str, Any]] = []
            collected_total = 0
            async with self.db_manager.get_async_session() as session:
                for task in self.tasks:
                    if not task.enabled:
                        continue
                    outcome = await task.run(session)
                    task_results.append(outcome)
                    collected_total += outcome.get("metrics_collected", 0)
            self.last_run = started_at
            self.run_count += 1
            elapsed = (datetime.utcnow() - started_at).total_seconds()
            logger.debug(
                f"Monitoring tasks completed: collected {collected_total} metrics "
                f"in {elapsed:.2f} seconds"
            )
            return {
                "status": "completed",
                "start_time": started_at.isoformat(),
                "duration_seconds": elapsed,
                "total_metrics": collected_total,
                "task_results": task_results,
            }
        except Exception as exc:
            logger.error(f"Monitoring tasks failed: {exc}", exc_info=True)
            return {
                "status": "error",
                "start_time": started_at.isoformat(),
                "duration_seconds": (datetime.utcnow() - started_at).total_seconds(),
                "error": str(exc),
                "total_metrics": 0,
            }
        finally:
            self.running = False

    async def run_task(self, task_name: str) -> Dict[str, Any]:
        """Run a single monitoring task by name."""
        for candidate in self.tasks:
            if candidate.name == task_name:
                task = candidate
                break
        else:
            return {
                "status": "error",
                "error": f"Task '{task_name}' not found",
                "available_tasks": [t.name for t in self.tasks],
            }
        if not task.enabled:
            return {
                "status": "error",
                "error": f"Task '{task_name}' is disabled",
            }
        async with self.db_manager.get_async_session() as session:
            return await task.run(session)

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics for the manager itself and each of its tasks."""
        manager_stats = {
            "running": self.running,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
        }
        return {
            "manager": manager_stats,
            "tasks": [task.get_stats() for task in self.tasks],
        }

    def get_performance_task(self) -> Optional[PerformanceMonitoring]:
        """Get the performance monitoring task for recording metrics."""
        for task in self.tasks:
            if isinstance(task, PerformanceMonitoring):
                return task
        return None
# Global monitoring manager instance (process-wide singleton).
_monitoring_manager: Optional[MonitoringManager] = None


def get_monitoring_manager(settings: Settings) -> MonitoringManager:
    """Return the process-wide monitoring manager, creating it on first use.

    Note: `settings` is only used on the first call; subsequent calls return
    the already-created instance.
    """
    global _monitoring_manager
    manager = _monitoring_manager
    if manager is None:
        manager = MonitoringManager(settings)
        _monitoring_manager = manager
    return manager
async def run_periodic_monitoring(settings: Settings):
    """Run the monitoring round loop until the task is cancelled.

    Each iteration runs all tasks, then sleeps for the configured interval.
    Unexpected errors are logged and retried after a short back-off.
    """
    monitoring_manager = get_monitoring_manager(settings)
    while True:
        try:
            await monitoring_manager.run_all_tasks()
            # Sleep until the next monitoring round; cancellation during the
            # sleep is handled by the CancelledError branch below.
            await asyncio.sleep(settings.monitoring_interval_seconds)
        except asyncio.CancelledError:
            logger.info("Periodic monitoring cancelled")
            break
        except Exception as exc:
            logger.error(f"Periodic monitoring error: {exc}", exc_info=True)
            # Back off briefly before retrying after an unexpected failure.
            await asyncio.sleep(30)