updates
This commit is contained in:
773
src/tasks/monitoring.py
Normal file
773
src/tasks/monitoring.py
Normal file
@@ -0,0 +1,773 @@
|
||||
"""
|
||||
Monitoring tasks for WiFi-DensePose API
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import psutil
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional, List
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from sqlalchemy import select, func, and_, or_
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.config.settings import Settings
|
||||
from src.database.connection import get_database_manager
|
||||
from src.database.models import SystemMetric, Device, Session, CSIData, PoseDetection
|
||||
from src.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MonitoringTask:
|
||||
"""Base class for monitoring tasks."""
|
||||
|
||||
def __init__(self, name: str, settings: Settings):
|
||||
self.name = name
|
||||
self.settings = settings
|
||||
self.enabled = True
|
||||
self.last_run = None
|
||||
self.run_count = 0
|
||||
self.error_count = 0
|
||||
self.interval_seconds = 60 # Default interval
|
||||
|
||||
async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
|
||||
"""Collect metrics for this task."""
|
||||
raise NotImplementedError
|
||||
|
||||
async def run(self, session: AsyncSession) -> Dict[str, Any]:
|
||||
"""Run the monitoring task with error handling."""
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
try:
|
||||
logger.debug(f"Starting monitoring task: {self.name}")
|
||||
|
||||
metrics = await self.collect_metrics(session)
|
||||
|
||||
# Store metrics in database
|
||||
for metric_data in metrics:
|
||||
metric = SystemMetric(
|
||||
metric_name=metric_data["name"],
|
||||
metric_type=metric_data["type"],
|
||||
value=metric_data["value"],
|
||||
unit=metric_data.get("unit"),
|
||||
labels=metric_data.get("labels"),
|
||||
tags=metric_data.get("tags"),
|
||||
source=metric_data.get("source", self.name),
|
||||
component=metric_data.get("component"),
|
||||
description=metric_data.get("description"),
|
||||
metadata=metric_data.get("metadata"),
|
||||
)
|
||||
session.add(metric)
|
||||
|
||||
await session.commit()
|
||||
|
||||
self.last_run = start_time
|
||||
self.run_count += 1
|
||||
|
||||
logger.debug(f"Monitoring task {self.name} completed: collected {len(metrics)} metrics")
|
||||
|
||||
return {
|
||||
"task": self.name,
|
||||
"status": "success",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
|
||||
"metrics_collected": len(metrics),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
self.error_count += 1
|
||||
logger.error(f"Monitoring task {self.name} failed: {e}", exc_info=True)
|
||||
|
||||
return {
|
||||
"task": self.name,
|
||||
"status": "error",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
|
||||
"error": str(e),
|
||||
"metrics_collected": 0,
|
||||
}
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get task statistics."""
|
||||
return {
|
||||
"name": self.name,
|
||||
"enabled": self.enabled,
|
||||
"interval_seconds": self.interval_seconds,
|
||||
"last_run": self.last_run.isoformat() if self.last_run else None,
|
||||
"run_count": self.run_count,
|
||||
"error_count": self.error_count,
|
||||
}
|
||||
|
||||
|
||||
class SystemResourceMonitoring(MonitoringTask):
|
||||
"""Monitor system resources (CPU, memory, disk, network)."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
super().__init__("system_resources", settings)
|
||||
self.interval_seconds = settings.system_monitoring_interval
|
||||
|
||||
async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
|
||||
"""Collect system resource metrics."""
|
||||
metrics = []
|
||||
timestamp = datetime.utcnow()
|
||||
|
||||
# CPU metrics
|
||||
cpu_percent = psutil.cpu_percent(interval=1)
|
||||
cpu_count = psutil.cpu_count()
|
||||
cpu_freq = psutil.cpu_freq()
|
||||
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "system_cpu_usage_percent",
|
||||
"type": "gauge",
|
||||
"value": cpu_percent,
|
||||
"unit": "percent",
|
||||
"component": "cpu",
|
||||
"description": "CPU usage percentage",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_cpu_count",
|
||||
"type": "gauge",
|
||||
"value": cpu_count,
|
||||
"unit": "count",
|
||||
"component": "cpu",
|
||||
"description": "Number of CPU cores",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
if cpu_freq:
|
||||
metrics.append({
|
||||
"name": "system_cpu_frequency_mhz",
|
||||
"type": "gauge",
|
||||
"value": cpu_freq.current,
|
||||
"unit": "mhz",
|
||||
"component": "cpu",
|
||||
"description": "Current CPU frequency",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Memory metrics
|
||||
memory = psutil.virtual_memory()
|
||||
swap = psutil.swap_memory()
|
||||
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "system_memory_total_bytes",
|
||||
"type": "gauge",
|
||||
"value": memory.total,
|
||||
"unit": "bytes",
|
||||
"component": "memory",
|
||||
"description": "Total system memory",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_memory_used_bytes",
|
||||
"type": "gauge",
|
||||
"value": memory.used,
|
||||
"unit": "bytes",
|
||||
"component": "memory",
|
||||
"description": "Used system memory",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_memory_available_bytes",
|
||||
"type": "gauge",
|
||||
"value": memory.available,
|
||||
"unit": "bytes",
|
||||
"component": "memory",
|
||||
"description": "Available system memory",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_memory_usage_percent",
|
||||
"type": "gauge",
|
||||
"value": memory.percent,
|
||||
"unit": "percent",
|
||||
"component": "memory",
|
||||
"description": "Memory usage percentage",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_swap_total_bytes",
|
||||
"type": "gauge",
|
||||
"value": swap.total,
|
||||
"unit": "bytes",
|
||||
"component": "memory",
|
||||
"description": "Total swap memory",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_swap_used_bytes",
|
||||
"type": "gauge",
|
||||
"value": swap.used,
|
||||
"unit": "bytes",
|
||||
"component": "memory",
|
||||
"description": "Used swap memory",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
# Disk metrics
|
||||
disk_usage = psutil.disk_usage('/')
|
||||
disk_io = psutil.disk_io_counters()
|
||||
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "system_disk_total_bytes",
|
||||
"type": "gauge",
|
||||
"value": disk_usage.total,
|
||||
"unit": "bytes",
|
||||
"component": "disk",
|
||||
"description": "Total disk space",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_disk_used_bytes",
|
||||
"type": "gauge",
|
||||
"value": disk_usage.used,
|
||||
"unit": "bytes",
|
||||
"component": "disk",
|
||||
"description": "Used disk space",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_disk_free_bytes",
|
||||
"type": "gauge",
|
||||
"value": disk_usage.free,
|
||||
"unit": "bytes",
|
||||
"component": "disk",
|
||||
"description": "Free disk space",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_disk_usage_percent",
|
||||
"type": "gauge",
|
||||
"value": (disk_usage.used / disk_usage.total) * 100,
|
||||
"unit": "percent",
|
||||
"component": "disk",
|
||||
"description": "Disk usage percentage",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
if disk_io:
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "system_disk_read_bytes_total",
|
||||
"type": "counter",
|
||||
"value": disk_io.read_bytes,
|
||||
"unit": "bytes",
|
||||
"component": "disk",
|
||||
"description": "Total bytes read from disk",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_disk_write_bytes_total",
|
||||
"type": "counter",
|
||||
"value": disk_io.write_bytes,
|
||||
"unit": "bytes",
|
||||
"component": "disk",
|
||||
"description": "Total bytes written to disk",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
# Network metrics
|
||||
network_io = psutil.net_io_counters()
|
||||
|
||||
if network_io:
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "system_network_bytes_sent_total",
|
||||
"type": "counter",
|
||||
"value": network_io.bytes_sent,
|
||||
"unit": "bytes",
|
||||
"component": "network",
|
||||
"description": "Total bytes sent over network",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_network_bytes_recv_total",
|
||||
"type": "counter",
|
||||
"value": network_io.bytes_recv,
|
||||
"unit": "bytes",
|
||||
"component": "network",
|
||||
"description": "Total bytes received over network",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_network_packets_sent_total",
|
||||
"type": "counter",
|
||||
"value": network_io.packets_sent,
|
||||
"unit": "count",
|
||||
"component": "network",
|
||||
"description": "Total packets sent over network",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "system_network_packets_recv_total",
|
||||
"type": "counter",
|
||||
"value": network_io.packets_recv,
|
||||
"unit": "count",
|
||||
"component": "network",
|
||||
"description": "Total packets received over network",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
class DatabaseMonitoring(MonitoringTask):
|
||||
"""Monitor database performance and statistics."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
super().__init__("database", settings)
|
||||
self.interval_seconds = settings.database_monitoring_interval
|
||||
|
||||
async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
|
||||
"""Collect database metrics."""
|
||||
metrics = []
|
||||
timestamp = datetime.utcnow()
|
||||
|
||||
# Get database connection stats
|
||||
db_manager = get_database_manager(self.settings)
|
||||
connection_stats = await db_manager.get_connection_stats()
|
||||
|
||||
# PostgreSQL connection metrics
|
||||
if "postgresql" in connection_stats:
|
||||
pg_stats = connection_stats["postgresql"]
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "database_connections_total",
|
||||
"type": "gauge",
|
||||
"value": pg_stats.get("total_connections", 0),
|
||||
"unit": "count",
|
||||
"component": "postgresql",
|
||||
"description": "Total database connections",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "database_connections_active",
|
||||
"type": "gauge",
|
||||
"value": pg_stats.get("checked_out", 0),
|
||||
"unit": "count",
|
||||
"component": "postgresql",
|
||||
"description": "Active database connections",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "database_connections_available",
|
||||
"type": "gauge",
|
||||
"value": pg_stats.get("available_connections", 0),
|
||||
"unit": "count",
|
||||
"component": "postgresql",
|
||||
"description": "Available database connections",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
# Redis connection metrics
|
||||
if "redis" in connection_stats and not connection_stats["redis"].get("error"):
|
||||
redis_stats = connection_stats["redis"]
|
||||
metrics.extend([
|
||||
{
|
||||
"name": "redis_connections_active",
|
||||
"type": "gauge",
|
||||
"value": redis_stats.get("connected_clients", 0),
|
||||
"unit": "count",
|
||||
"component": "redis",
|
||||
"description": "Active Redis connections",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
},
|
||||
{
|
||||
"name": "redis_connections_blocked",
|
||||
"type": "gauge",
|
||||
"value": redis_stats.get("blocked_clients", 0),
|
||||
"unit": "count",
|
||||
"component": "redis",
|
||||
"description": "Blocked Redis connections",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
}
|
||||
])
|
||||
|
||||
# Table row counts
|
||||
table_counts = await self._get_table_counts(session)
|
||||
for table_name, count in table_counts.items():
|
||||
metrics.append({
|
||||
"name": f"database_table_rows_{table_name}",
|
||||
"type": "gauge",
|
||||
"value": count,
|
||||
"unit": "count",
|
||||
"component": "postgresql",
|
||||
"description": f"Number of rows in {table_name} table",
|
||||
"metadata": {"timestamp": timestamp.isoformat(), "table": table_name}
|
||||
})
|
||||
|
||||
return metrics
|
||||
|
||||
async def _get_table_counts(self, session: AsyncSession) -> Dict[str, int]:
|
||||
"""Get row counts for all tables."""
|
||||
counts = {}
|
||||
|
||||
# Count devices
|
||||
result = await session.execute(select(func.count(Device.id)))
|
||||
counts["devices"] = result.scalar() or 0
|
||||
|
||||
# Count sessions
|
||||
result = await session.execute(select(func.count(Session.id)))
|
||||
counts["sessions"] = result.scalar() or 0
|
||||
|
||||
# Count CSI data
|
||||
result = await session.execute(select(func.count(CSIData.id)))
|
||||
counts["csi_data"] = result.scalar() or 0
|
||||
|
||||
# Count pose detections
|
||||
result = await session.execute(select(func.count(PoseDetection.id)))
|
||||
counts["pose_detections"] = result.scalar() or 0
|
||||
|
||||
# Count system metrics
|
||||
result = await session.execute(select(func.count(SystemMetric.id)))
|
||||
counts["system_metrics"] = result.scalar() or 0
|
||||
|
||||
return counts
|
||||
|
||||
|
||||
class ApplicationMonitoring(MonitoringTask):
|
||||
"""Monitor application-specific metrics."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
super().__init__("application", settings)
|
||||
self.interval_seconds = settings.application_monitoring_interval
|
||||
self.start_time = datetime.utcnow()
|
||||
|
||||
async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
|
||||
"""Collect application metrics."""
|
||||
metrics = []
|
||||
timestamp = datetime.utcnow()
|
||||
|
||||
# Application uptime
|
||||
uptime_seconds = (timestamp - self.start_time).total_seconds()
|
||||
metrics.append({
|
||||
"name": "application_uptime_seconds",
|
||||
"type": "gauge",
|
||||
"value": uptime_seconds,
|
||||
"unit": "seconds",
|
||||
"component": "application",
|
||||
"description": "Application uptime in seconds",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Active sessions count
|
||||
active_sessions_query = select(func.count(Session.id)).where(
|
||||
Session.status == "active"
|
||||
)
|
||||
result = await session.execute(active_sessions_query)
|
||||
active_sessions = result.scalar() or 0
|
||||
|
||||
metrics.append({
|
||||
"name": "application_active_sessions",
|
||||
"type": "gauge",
|
||||
"value": active_sessions,
|
||||
"unit": "count",
|
||||
"component": "application",
|
||||
"description": "Number of active sessions",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Active devices count
|
||||
active_devices_query = select(func.count(Device.id)).where(
|
||||
Device.status == "active"
|
||||
)
|
||||
result = await session.execute(active_devices_query)
|
||||
active_devices = result.scalar() or 0
|
||||
|
||||
metrics.append({
|
||||
"name": "application_active_devices",
|
||||
"type": "gauge",
|
||||
"value": active_devices,
|
||||
"unit": "count",
|
||||
"component": "application",
|
||||
"description": "Number of active devices",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Recent data processing metrics (last hour)
|
||||
one_hour_ago = timestamp - timedelta(hours=1)
|
||||
|
||||
# Recent CSI data count
|
||||
recent_csi_query = select(func.count(CSIData.id)).where(
|
||||
CSIData.created_at >= one_hour_ago
|
||||
)
|
||||
result = await session.execute(recent_csi_query)
|
||||
recent_csi_count = result.scalar() or 0
|
||||
|
||||
metrics.append({
|
||||
"name": "application_csi_data_hourly",
|
||||
"type": "gauge",
|
||||
"value": recent_csi_count,
|
||||
"unit": "count",
|
||||
"component": "application",
|
||||
"description": "CSI data records created in the last hour",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Recent pose detections count
|
||||
recent_pose_query = select(func.count(PoseDetection.id)).where(
|
||||
PoseDetection.created_at >= one_hour_ago
|
||||
)
|
||||
result = await session.execute(recent_pose_query)
|
||||
recent_pose_count = result.scalar() or 0
|
||||
|
||||
metrics.append({
|
||||
"name": "application_pose_detections_hourly",
|
||||
"type": "gauge",
|
||||
"value": recent_pose_count,
|
||||
"unit": "count",
|
||||
"component": "application",
|
||||
"description": "Pose detections created in the last hour",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Processing status metrics
|
||||
processing_statuses = ["pending", "processing", "completed", "failed"]
|
||||
for status in processing_statuses:
|
||||
status_query = select(func.count(CSIData.id)).where(
|
||||
CSIData.processing_status == status
|
||||
)
|
||||
result = await session.execute(status_query)
|
||||
status_count = result.scalar() or 0
|
||||
|
||||
metrics.append({
|
||||
"name": f"application_csi_processing_{status}",
|
||||
"type": "gauge",
|
||||
"value": status_count,
|
||||
"unit": "count",
|
||||
"component": "application",
|
||||
"description": f"CSI data records with {status} processing status",
|
||||
"metadata": {"timestamp": timestamp.isoformat(), "status": status}
|
||||
})
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
class PerformanceMonitoring(MonitoringTask):
|
||||
"""Monitor performance metrics and response times."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
super().__init__("performance", settings)
|
||||
self.interval_seconds = settings.performance_monitoring_interval
|
||||
self.response_times = []
|
||||
self.error_counts = {}
|
||||
|
||||
async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
|
||||
"""Collect performance metrics."""
|
||||
metrics = []
|
||||
timestamp = datetime.utcnow()
|
||||
|
||||
# Database query performance test
|
||||
start_time = time.time()
|
||||
test_query = select(func.count(Device.id))
|
||||
await session.execute(test_query)
|
||||
db_response_time = (time.time() - start_time) * 1000 # Convert to milliseconds
|
||||
|
||||
metrics.append({
|
||||
"name": "performance_database_query_time_ms",
|
||||
"type": "gauge",
|
||||
"value": db_response_time,
|
||||
"unit": "milliseconds",
|
||||
"component": "database",
|
||||
"description": "Database query response time",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Average response time (if we have data)
|
||||
if self.response_times:
|
||||
avg_response_time = sum(self.response_times) / len(self.response_times)
|
||||
metrics.append({
|
||||
"name": "performance_avg_response_time_ms",
|
||||
"type": "gauge",
|
||||
"value": avg_response_time,
|
||||
"unit": "milliseconds",
|
||||
"component": "api",
|
||||
"description": "Average API response time",
|
||||
"metadata": {"timestamp": timestamp.isoformat()}
|
||||
})
|
||||
|
||||
# Clear old response times (keep only recent ones)
|
||||
self.response_times = self.response_times[-100:] # Keep last 100
|
||||
|
||||
# Error rates
|
||||
for error_type, count in self.error_counts.items():
|
||||
metrics.append({
|
||||
"name": f"performance_errors_{error_type}_total",
|
||||
"type": "counter",
|
||||
"value": count,
|
||||
"unit": "count",
|
||||
"component": "api",
|
||||
"description": f"Total {error_type} errors",
|
||||
"metadata": {"timestamp": timestamp.isoformat(), "error_type": error_type}
|
||||
})
|
||||
|
||||
return metrics
|
||||
|
||||
def record_response_time(self, response_time_ms: float):
|
||||
"""Record an API response time."""
|
||||
self.response_times.append(response_time_ms)
|
||||
|
||||
def record_error(self, error_type: str):
|
||||
"""Record an error occurrence."""
|
||||
self.error_counts[error_type] = self.error_counts.get(error_type, 0) + 1
|
||||
|
||||
|
||||
class MonitoringManager:
|
||||
"""Manager for all monitoring tasks."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
self.settings = settings
|
||||
self.db_manager = get_database_manager(settings)
|
||||
self.tasks = self._initialize_tasks()
|
||||
self.running = False
|
||||
self.last_run = None
|
||||
self.run_count = 0
|
||||
|
||||
def _initialize_tasks(self) -> List[MonitoringTask]:
|
||||
"""Initialize all monitoring tasks."""
|
||||
tasks = [
|
||||
SystemResourceMonitoring(self.settings),
|
||||
DatabaseMonitoring(self.settings),
|
||||
ApplicationMonitoring(self.settings),
|
||||
PerformanceMonitoring(self.settings),
|
||||
]
|
||||
|
||||
# Filter enabled tasks
|
||||
enabled_tasks = [task for task in tasks if task.enabled]
|
||||
|
||||
logger.info(f"Initialized {len(enabled_tasks)} monitoring tasks")
|
||||
return enabled_tasks
|
||||
|
||||
async def run_all_tasks(self) -> Dict[str, Any]:
|
||||
"""Run all monitoring tasks."""
|
||||
if self.running:
|
||||
return {"status": "already_running", "message": "Monitoring already in progress"}
|
||||
|
||||
self.running = True
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
try:
|
||||
logger.debug("Starting monitoring tasks")
|
||||
|
||||
results = []
|
||||
total_metrics = 0
|
||||
|
||||
async with self.db_manager.get_async_session() as session:
|
||||
for task in self.tasks:
|
||||
if not task.enabled:
|
||||
continue
|
||||
|
||||
result = await task.run(session)
|
||||
results.append(result)
|
||||
total_metrics += result.get("metrics_collected", 0)
|
||||
|
||||
self.last_run = start_time
|
||||
self.run_count += 1
|
||||
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
logger.debug(
|
||||
f"Monitoring tasks completed: collected {total_metrics} metrics "
|
||||
f"in {duration:.2f} seconds"
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "completed",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_seconds": duration,
|
||||
"total_metrics": total_metrics,
|
||||
"task_results": results,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Monitoring tasks failed: {e}", exc_info=True)
|
||||
return {
|
||||
"status": "error",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_seconds": (datetime.utcnow() - start_time).total_seconds(),
|
||||
"error": str(e),
|
||||
"total_metrics": 0,
|
||||
}
|
||||
|
||||
finally:
|
||||
self.running = False
|
||||
|
||||
async def run_task(self, task_name: str) -> Dict[str, Any]:
|
||||
"""Run a specific monitoring task."""
|
||||
task = next((t for t in self.tasks if t.name == task_name), None)
|
||||
|
||||
if not task:
|
||||
return {
|
||||
"status": "error",
|
||||
"error": f"Task '{task_name}' not found",
|
||||
"available_tasks": [t.name for t in self.tasks]
|
||||
}
|
||||
|
||||
if not task.enabled:
|
||||
return {
|
||||
"status": "error",
|
||||
"error": f"Task '{task_name}' is disabled"
|
||||
}
|
||||
|
||||
async with self.db_manager.get_async_session() as session:
|
||||
return await task.run(session)
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get monitoring manager statistics."""
|
||||
return {
|
||||
"manager": {
|
||||
"running": self.running,
|
||||
"last_run": self.last_run.isoformat() if self.last_run else None,
|
||||
"run_count": self.run_count,
|
||||
},
|
||||
"tasks": [task.get_stats() for task in self.tasks],
|
||||
}
|
||||
|
||||
def get_performance_task(self) -> Optional[PerformanceMonitoring]:
|
||||
"""Get the performance monitoring task for recording metrics."""
|
||||
return next((t for t in self.tasks if isinstance(t, PerformanceMonitoring)), None)
|
||||
|
||||
|
||||
# Global monitoring manager instance
|
||||
_monitoring_manager: Optional[MonitoringManager] = None
|
||||
|
||||
|
||||
def get_monitoring_manager(settings: Settings) -> MonitoringManager:
|
||||
"""Get monitoring manager instance."""
|
||||
global _monitoring_manager
|
||||
if _monitoring_manager is None:
|
||||
_monitoring_manager = MonitoringManager(settings)
|
||||
return _monitoring_manager
|
||||
|
||||
|
||||
async def run_periodic_monitoring(settings: Settings):
|
||||
"""Run periodic monitoring tasks."""
|
||||
monitoring_manager = get_monitoring_manager(settings)
|
||||
|
||||
while True:
|
||||
try:
|
||||
await monitoring_manager.run_all_tasks()
|
||||
|
||||
# Wait for next monitoring interval
|
||||
await asyncio.sleep(settings.monitoring_interval_seconds)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Periodic monitoring cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Periodic monitoring error: {e}", exc_info=True)
|
||||
# Wait before retrying
|
||||
await asyncio.sleep(30)
|
||||
Reference in New Issue
Block a user