updates
This commit is contained in:
612
src/tasks/backup.py
Normal file
612
src/tasks/backup.py
Normal file
@@ -0,0 +1,612 @@
|
||||
"""
|
||||
Backup tasks for WiFi-DensePose API
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import gzip
|
||||
import json
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.config.settings import Settings
|
||||
from src.database.connection import get_database_manager
|
||||
from src.database.models import Device, Session, CSIData, PoseDetection, SystemMetric, AuditLog
|
||||
from src.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class BackupTask:
    """Base class for backup tasks.

    Subclasses implement :meth:`execute_backup`; :meth:`run` wraps it with
    timing, run/error bookkeeping and error handling so a failing task
    reports an error dict instead of raising.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True
        self.last_run = None
        self.run_count = 0
        self.error_count = 0
        # Make sure the destination directory exists before any task runs.
        self.backup_dir = Path(settings.backup_directory)
        self.backup_dir.mkdir(parents=True, exist_ok=True)

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute the backup task (implemented by subclasses)."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Run the backup task with error handling.

        Returns a result dict carrying status, timing and whatever the
        concrete task reported; never raises.
        """
        started_at = datetime.utcnow()

        try:
            logger.info(f"Starting backup task: {self.name}")

            outcome = await self.execute_backup(session)

            self.last_run = started_at
            self.run_count += 1

            size_mb = outcome.get('backup_size_mb', 0)
            logger.info(
                f"Backup task {self.name} completed: "
                f"backed up {size_mb:.2f} MB"
            )

            report = {
                "task": self.name,
                "status": "success",
                "start_time": started_at.isoformat(),
                "duration_ms": (datetime.utcnow() - started_at).total_seconds() * 1000,
            }
            report.update(outcome)
            return report

        except Exception as exc:
            self.error_count += 1
            logger.error(f"Backup task {self.name} failed: {exc}", exc_info=True)

            return {
                "task": self.name,
                "status": "error",
                "start_time": started_at.isoformat(),
                "duration_ms": (datetime.utcnow() - started_at).total_seconds() * 1000,
                "error": str(exc),
                "backup_size_mb": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Get task statistics."""
        last = self.last_run.isoformat() if self.last_run else None
        return {
            "name": self.name,
            "enabled": self.enabled,
            "last_run": last,
            "run_count": self.run_count,
            "error_count": self.error_count,
            "backup_directory": str(self.backup_dir),
        }

    def _get_backup_filename(self, prefix: str, extension: str = ".gz") -> str:
        """Generate backup filename with a UTC timestamp."""
        stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        return f"{prefix}_{stamp}{extension}"

    def _get_file_size_mb(self, file_path: Path) -> float:
        """Get file size in MB (0.0 when the file is missing)."""
        if not file_path.exists():
            return 0.0
        return file_path.stat().st_size / (1024 * 1024)

    def _cleanup_old_backups(self, pattern: str, retention_days: int):
        """Delete backups matching *pattern* older than *retention_days*.

        A non-positive retention disables cleanup entirely.
        """
        if retention_days <= 0:
            return

        cutoff_ts = (datetime.utcnow() - timedelta(days=retention_days)).timestamp()

        for candidate in self.backup_dir.glob(pattern):
            if candidate.stat().st_mtime >= cutoff_ts:
                continue
            try:
                candidate.unlink()
                logger.debug(f"Deleted old backup: {candidate}")
            except Exception as exc:
                # Best-effort: a stuck file should not fail the backup run.
                logger.warning(f"Failed to delete old backup {candidate}: {exc}")
|
||||
|
||||
|
||||
class DatabaseBackup(BackupTask):
    """Full database backup using pg_dump.

    The dump runs as an external ``pg_dump`` subprocess; the ``session``
    argument is unused by this task.  NOTE(review): the file is produced
    with ``--format=custom`` (pg_dump's own internally-compressed archive
    format), so the ``.sql.gz`` extension is misleading — the output is
    neither plain SQL nor a gzip stream.  Confirm restore tooling expects
    the custom format before relying on the extension.
    """

    def __init__(self, settings: Settings):
        super().__init__("database_backup", settings)
        # Dumps older than this many days are pruned after each run.
        self.retention_days = settings.database_backup_retention_days

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute database backup.

        Returns the backup file name/path, its size in MB and the retention
        window.  Raises ``Exception`` when pg_dump exits non-zero.
        """
        backup_filename = self._get_backup_filename("database_full", ".sql.gz")
        backup_path = self.backup_dir / backup_filename

        # Build pg_dump command
        pg_dump_cmd = [
            "pg_dump",
            "--verbose",
            "--no-password",  # never prompt; credentials go via PGPASSWORD below
            "--format=custom",
            "--compress=9",
            "--file", str(backup_path),
        ]

        # Add connection parameters — a full URL takes precedence over the
        # discrete host/port/user settings.
        if self.settings.database_url:
            pg_dump_cmd.append(self.settings.database_url)
        else:
            pg_dump_cmd.extend([
                "--host", self.settings.db_host,
                "--port", str(self.settings.db_port),
                "--username", self.settings.db_user,
                "--dbname", self.settings.db_name,
            ])

        # Set environment variables (PGPASSWORD avoids an interactive prompt)
        env = os.environ.copy()
        if self.settings.db_password:
            env["PGPASSWORD"] = self.settings.db_password

        # Execute pg_dump
        process = await asyncio.create_subprocess_exec(
            *pg_dump_cmd,
            env=env,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown pg_dump error"
            raise Exception(f"pg_dump failed: {error_msg}")

        backup_size_mb = self._get_file_size_mb(backup_path)

        # Clean up old backups
        self._cleanup_old_backups("database_full_*.sql.gz", self.retention_days)

        return {
            "backup_file": backup_filename,
            "backup_path": str(backup_path),
            "backup_size_mb": backup_size_mb,
            "retention_days": self.retention_days,
        }
|
||||
|
||||
|
||||
class ConfigurationBackup(BackupTask):
    """Backup configuration files and settings.

    Copies a fixed list of config files plus a JSON dump of selected
    (non-secret) settings into a temp directory, archives it with ``tar``,
    and always removes the temp directory afterwards.
    """

    def __init__(self, settings: Settings):
        super().__init__("configuration_backup", settings)
        self.retention_days = settings.config_backup_retention_days
        # Paths are relative to the process working directory — assumes the
        # service runs from the repository root; TODO confirm.
        self.config_files = [
            "src/config/settings.py",
            ".env",
            "pyproject.toml",
            "docker-compose.yml",
            "Dockerfile",
        ]

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute configuration backup.

        Returns the archive name/path, size in MB, and which of the listed
        config files were actually found and copied.  Raises ``Exception``
        when tar exits non-zero.
        """
        backup_filename = self._get_backup_filename("configuration", ".tar.gz")
        backup_path = self.backup_dir / backup_filename

        # Create temporary directory for config files
        temp_dir = self.backup_dir / "temp_config"
        temp_dir.mkdir(exist_ok=True)

        try:
            copied_files = []

            # Copy configuration files (missing files are skipped silently)
            for config_file in self.config_files:
                source_path = Path(config_file)
                if source_path.exists():
                    dest_path = temp_dir / source_path.name
                    shutil.copy2(source_path, dest_path)
                    copied_files.append(config_file)

            # Create settings dump — intentionally excludes secrets such as
            # passwords; only structural/connection settings are recorded.
            settings_dump = {
                "backup_timestamp": datetime.utcnow().isoformat(),
                "environment": self.settings.environment,
                "debug": self.settings.debug,
                "api_version": self.settings.api_version,
                "database_settings": {
                    "db_host": self.settings.db_host,
                    "db_port": self.settings.db_port,
                    "db_name": self.settings.db_name,
                    "db_pool_size": self.settings.db_pool_size,
                },
                "redis_settings": {
                    "redis_enabled": self.settings.redis_enabled,
                    "redis_host": self.settings.redis_host,
                    "redis_port": self.settings.redis_port,
                    "redis_db": self.settings.redis_db,
                },
                "monitoring_settings": {
                    "monitoring_interval_seconds": self.settings.monitoring_interval_seconds,
                    "cleanup_interval_seconds": self.settings.cleanup_interval_seconds,
                },
            }

            settings_file = temp_dir / "settings_dump.json"
            with open(settings_file, 'w') as f:
                json.dump(settings_dump, f, indent=2)

            # Create tar.gz archive; "-C temp_dir ." archives the directory
            # contents without the temp_config prefix.
            tar_cmd = [
                "tar", "-czf", str(backup_path),
                "-C", str(temp_dir),
                "."
            ]

            process = await asyncio.create_subprocess_exec(
                *tar_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await process.communicate()

            if process.returncode != 0:
                error_msg = stderr.decode() if stderr else "Unknown tar error"
                raise Exception(f"tar failed: {error_msg}")

            backup_size_mb = self._get_file_size_mb(backup_path)

            # Clean up old backups
            self._cleanup_old_backups("configuration_*.tar.gz", self.retention_days)

            return {
                "backup_file": backup_filename,
                "backup_path": str(backup_path),
                "backup_size_mb": backup_size_mb,
                "copied_files": copied_files,
                "retention_days": self.retention_days,
            }

        finally:
            # Clean up temporary directory even when tar or copying failed
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
|
||||
|
||||
|
||||
class DataExportBackup(BackupTask):
    """Export specific data tables to JSON format.

    Writes a gzip-compressed JSON snapshot: all devices and sessions, plus
    the last 7 days of CSI data and pose detections.
    """

    def __init__(self, settings: Settings):
        super().__init__("data_export_backup", settings)
        self.retention_days = settings.data_export_retention_days
        # NOTE(review): declared but never used — the export below loads each
        # table's rows in a single query rather than in batches of this size.
        self.export_batch_size = 1000

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute data export backup.

        Returns the export file name/path, size in MB, total record count
        and the list of exported table keys.
        """
        backup_filename = self._get_backup_filename("data_export", ".json.gz")
        backup_path = self.backup_dir / backup_filename

        export_data = {
            "backup_timestamp": datetime.utcnow().isoformat(),
            "export_version": "1.0",
            "tables": {}
        }

        # Export devices (full table)
        devices_data = await self._export_table_data(session, Device, "devices")
        export_data["tables"]["devices"] = devices_data

        # Export sessions (full table)
        sessions_data = await self._export_table_data(session, Session, "sessions")
        export_data["tables"]["sessions"] = sessions_data

        # Export recent CSI data (last 7 days)
        recent_date = datetime.utcnow() - timedelta(days=7)
        csi_query = select(CSIData).where(CSIData.created_at >= recent_date)
        csi_data = await self._export_query_data(session, csi_query, "csi_data")
        export_data["tables"]["csi_data_recent"] = csi_data

        # Export recent pose detections (last 7 days)
        pose_query = select(PoseDetection).where(PoseDetection.created_at >= recent_date)
        pose_data = await self._export_query_data(session, pose_query, "pose_detections")
        export_data["tables"]["pose_detections_recent"] = pose_data

        # Write compressed JSON; default=str coerces any non-serializable
        # value (UUIDs, datetimes, etc.) to its string form.
        with gzip.open(backup_path, 'wt', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, default=str)

        backup_size_mb = self._get_file_size_mb(backup_path)

        # Clean up old backups
        self._cleanup_old_backups("data_export_*.json.gz", self.retention_days)

        total_records = sum(
            table_data["record_count"]
            for table_data in export_data["tables"].values()
        )

        return {
            "backup_file": backup_filename,
            "backup_path": str(backup_path),
            "backup_size_mb": backup_size_mb,
            "total_records": total_records,
            "tables_exported": list(export_data["tables"].keys()),
            "retention_days": self.retention_days,
        }

    async def _export_table_data(self, session: AsyncSession, model_class, table_name: str) -> Dict[str, Any]:
        """Export all data from a table (unfiltered SELECT)."""
        query = select(model_class)
        return await self._export_query_data(session, query, table_name)

    async def _export_query_data(self, session: AsyncSession, query, table_name: str) -> Dict[str, Any]:
        """Export data from a query.

        Records are serialized via their ``to_dict`` method when available;
        otherwise each mapped column is read directly, with datetimes
        converted to ISO-8601 strings.
        """
        result = await session.execute(query)
        records = result.scalars().all()

        exported_records = []
        for record in records:
            if hasattr(record, 'to_dict'):
                exported_records.append(record.to_dict())
            else:
                # Fallback for records without to_dict method
                record_dict = {}
                for column in record.__table__.columns:
                    value = getattr(record, column.name)
                    if isinstance(value, datetime):
                        value = value.isoformat()
                    record_dict[column.name] = value
                exported_records.append(record_dict)

        return {
            "table_name": table_name,
            "record_count": len(exported_records),
            "export_timestamp": datetime.utcnow().isoformat(),
            "records": exported_records,
        }
|
||||
|
||||
|
||||
class LogsBackup(BackupTask):
    """Backup application logs.

    Archives the whole configured log directory into a ``tar.gz`` file in
    the backup directory.
    """

    def __init__(self, settings: Settings):
        super().__init__("logs_backup", settings)
        self.retention_days = settings.logs_backup_retention_days
        self.logs_directory = Path(settings.log_directory)

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute logs backup.

        When the log directory is absent the task succeeds with a zero-size
        result instead of failing.  Raises ``Exception`` on tar failure.
        """
        # Nothing to archive if logging never created its directory.
        if not self.logs_directory.exists():
            return {
                "backup_file": None,
                "backup_path": None,
                "backup_size_mb": 0,
                "message": "Logs directory does not exist",
            }

        archive_name = self._get_backup_filename("logs", ".tar.gz")
        archive_path = self.backup_dir / archive_name

        # Archive relative to the parent so the tarball keeps a single
        # top-level folder named after the log directory.
        command = [
            "tar",
            "-czf",
            str(archive_path),
            "-C",
            str(self.logs_directory.parent),
            self.logs_directory.name,
        ]

        proc = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr_output = await proc.communicate()

        if proc.returncode != 0:
            error_msg = stderr_output.decode() if stderr_output else "Unknown tar error"
            raise Exception(f"tar failed: {error_msg}")

        archive_size_mb = self._get_file_size_mb(archive_path)

        # Count log files present at backup time (rotated files included).
        log_file_count = sum(1 for _ in self.logs_directory.glob("*.log*"))

        # Prune expired archives.
        self._cleanup_old_backups("logs_*.tar.gz", self.retention_days)

        return {
            "backup_file": archive_name,
            "backup_path": str(archive_path),
            "backup_size_mb": archive_size_mb,
            "log_files_count": log_file_count,
            "retention_days": self.retention_days,
        }
|
||||
|
||||
|
||||
class BackupManager:
    """Manager for all backup tasks.

    Owns the set of backup tasks, runs them sequentially inside a single
    database session, and tracks aggregate statistics across runs.
    """

    # Glob pattern for each known task's backup files, used by list_backups().
    # Hoisted to a class constant: previously rebuilt on every loop iteration.
    BACKUP_FILE_PATTERNS = {
        "database_backup": "database_full_*.sql.gz",
        "configuration_backup": "configuration_*.tar.gz",
        "data_export_backup": "data_export_*.json.gz",
        "logs_backup": "logs_*.tar.gz",
    }

    def __init__(self, settings: Settings):
        self.settings = settings
        self.db_manager = get_database_manager(settings)
        self.tasks = self._initialize_tasks()
        self.running = False  # guards against overlapping runs
        self.last_run = None
        self.run_count = 0
        self.total_backup_size = 0  # cumulative MB across all runs

    def _initialize_tasks(self) -> List[BackupTask]:
        """Initialize all backup tasks, keeping only the enabled ones."""
        tasks = [
            DatabaseBackup(self.settings),
            ConfigurationBackup(self.settings),
            DataExportBackup(self.settings),
            LogsBackup(self.settings),
        ]

        # Filter enabled tasks
        enabled_tasks = [task for task in tasks if task.enabled]

        logger.info(f"Initialized {len(enabled_tasks)} backup tasks")
        return enabled_tasks

    async def run_all_tasks(self) -> Dict[str, Any]:
        """Run all enabled backup tasks sequentially.

        Returns a summary dict.  If a run is already in progress the call
        returns immediately with status "already_running" instead of
        starting a concurrent run.
        """
        if self.running:
            return {"status": "already_running", "message": "Backup already in progress"}

        self.running = True
        start_time = datetime.utcnow()

        try:
            logger.info("Starting backup tasks")

            results = []
            total_backup_size = 0

            async with self.db_manager.get_async_session() as session:
                for task in self.tasks:
                    if not task.enabled:
                        continue

                    # Task-level failures are captured inside task.run(), so
                    # one failing task does not abort the remaining ones.
                    result = await task.run(session)
                    results.append(result)
                    total_backup_size += result.get("backup_size_mb", 0)

            self.last_run = start_time
            self.run_count += 1
            self.total_backup_size += total_backup_size

            duration = (datetime.utcnow() - start_time).total_seconds()

            logger.info(
                f"Backup tasks completed: created {total_backup_size:.2f} MB "
                f"in {duration:.2f} seconds"
            )

            return {
                "status": "completed",
                "start_time": start_time.isoformat(),
                "duration_seconds": duration,
                "total_backup_size_mb": total_backup_size,
                "task_results": results,
            }

        except Exception as e:
            logger.error(f"Backup tasks failed: {e}", exc_info=True)
            return {
                "status": "error",
                "start_time": start_time.isoformat(),
                "duration_seconds": (datetime.utcnow() - start_time).total_seconds(),
                "error": str(e),
                "total_backup_size_mb": 0,
            }

        finally:
            self.running = False

    async def run_task(self, task_name: str) -> Dict[str, Any]:
        """Run a specific backup task by name.

        Returns an error dict (with the list of available tasks) when the
        name is unknown or the task is disabled.
        """
        task = next((t for t in self.tasks if t.name == task_name), None)

        if not task:
            return {
                "status": "error",
                "error": f"Task '{task_name}' not found",
                "available_tasks": [t.name for t in self.tasks]
            }

        if not task.enabled:
            return {
                "status": "error",
                "error": f"Task '{task_name}' is disabled"
            }

        async with self.db_manager.get_async_session() as session:
            return await task.run(session)

    def get_stats(self) -> Dict[str, Any]:
        """Get backup manager statistics (manager-level plus per-task)."""
        return {
            "manager": {
                "running": self.running,
                "last_run": self.last_run.isoformat() if self.last_run else None,
                "run_count": self.run_count,
                "total_backup_size_mb": self.total_backup_size,
            },
            "tasks": [task.get_stats() for task in self.tasks],
        }

    def list_backups(self) -> Dict[str, List[Dict[str, Any]]]:
        """List all backup files on disk, grouped by task, newest first."""
        backup_files = {}

        for task in self.tasks:
            # Unknown task names fall back to a name-prefixed glob.
            pattern = self.BACKUP_FILE_PATTERNS.get(task.name, f"{task.name}_*")

            task_backups = []
            for backup_file in task.backup_dir.glob(pattern):
                stat = backup_file.stat()
                task_backups.append({
                    "filename": backup_file.name,
                    "path": str(backup_file),
                    "size_mb": stat.st_size / (1024 * 1024),
                    "created_at": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                })

            # Sort by creation time (newest first)
            task_backups.sort(key=lambda x: x["created_at"], reverse=True)
            backup_files[task.name] = task_backups

        return backup_files
|
||||
|
||||
|
||||
# Global backup manager instance (lazily created singleton)
_backup_manager: Optional[BackupManager] = None


def get_backup_manager(settings: Settings) -> BackupManager:
    """Return the process-wide backup manager, creating it on first use.

    Note that *settings* is only consulted when the singleton is first
    constructed; later calls return the existing instance unchanged.
    """
    global _backup_manager
    manager = _backup_manager
    if manager is None:
        manager = BackupManager(settings)
        _backup_manager = manager
    return manager
|
||||
|
||||
|
||||
async def run_periodic_backup(settings: Settings):
    """Run periodic backup tasks forever.

    Sleeps ``settings.backup_interval_seconds`` between runs; on an
    unexpected error it logs and retries after five minutes.  Exits
    cleanly when the surrounding task is cancelled.
    """
    manager = get_backup_manager(settings)

    while True:
        try:
            await manager.run_all_tasks()
            # Wait for the next scheduled backup window.
            await asyncio.sleep(settings.backup_interval_seconds)
        except asyncio.CancelledError:
            logger.info("Periodic backup cancelled")
            break
        except Exception as e:
            logger.error(f"Periodic backup error: {e}", exc_info=True)
            # Back off before retrying after a failure.
            await asyncio.sleep(300)  # 5 minutes
|
||||
598
src/tasks/cleanup.py
Normal file
598
src/tasks/cleanup.py
Normal file
@@ -0,0 +1,598 @@
|
||||
"""
|
||||
Periodic cleanup tasks for WiFi-DensePose API
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional, List
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from sqlalchemy import delete, select, func, and_, or_
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.config.settings import Settings
|
||||
from src.database.connection import get_database_manager
|
||||
from src.database.models import (
|
||||
CSIData, PoseDetection, SystemMetric, AuditLog, Session, Device
|
||||
)
|
||||
from src.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class CleanupTask:
    """Base class for cleanup tasks.

    Subclasses implement :meth:`execute`; :meth:`run` adds timing, run and
    error counters, and error handling so a failing task reports an error
    dict instead of raising.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True
        self.last_run = None
        self.run_count = 0
        self.error_count = 0
        self.total_cleaned = 0

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute the cleanup task (implemented by subclasses)."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Run the cleanup task with error handling."""
        started_at = datetime.utcnow()

        try:
            logger.info(f"Starting cleanup task: {self.name}")

            outcome = await self.execute(session)

            self.last_run = started_at
            self.run_count += 1

            removed = outcome.get("cleaned_count", 0)
            if removed > 0:
                self.total_cleaned += outcome["cleaned_count"]
                logger.info(
                    f"Cleanup task {self.name} completed: "
                    f"cleaned {outcome['cleaned_count']} items"
                )
            else:
                logger.debug(f"Cleanup task {self.name} completed: no items to clean")

            report = {
                "task": self.name,
                "status": "success",
                "start_time": started_at.isoformat(),
                "duration_ms": (datetime.utcnow() - started_at).total_seconds() * 1000,
            }
            report.update(outcome)
            return report

        except Exception as exc:
            self.error_count += 1
            logger.error(f"Cleanup task {self.name} failed: {exc}", exc_info=True)

            return {
                "task": self.name,
                "status": "error",
                "start_time": started_at.isoformat(),
                "duration_ms": (datetime.utcnow() - started_at).total_seconds() * 1000,
                "error": str(exc),
                "cleaned_count": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Get task statistics."""
        last = self.last_run.isoformat() if self.last_run else None
        return {
            "name": self.name,
            "enabled": self.enabled,
            "last_run": last,
            "run_count": self.run_count,
            "error_count": self.error_count,
            "total_cleaned": self.total_cleaned,
        }
|
||||
|
||||
|
||||
class OldCSIDataCleanup(CleanupTask):
    """Cleanup old CSI data records.

    Deletes CSIData rows older than the configured retention window in
    batches, committing after each batch to keep transactions small.
    """

    def __init__(self, settings: Settings):
        super().__init__("old_csi_data_cleanup", settings)
        # Rows older than this many days are deleted; <= 0 disables the task.
        self.retention_days = settings.csi_data_retention_days
        # Rows deleted (and committed) per batch.
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute CSI data cleanup.

        Returns the number of rows removed, the retention window, and the
        cutoff timestamp used.
        """
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "CSI data retention disabled"}

        cutoff_date = datetime.utcnow() - timedelta(days=self.retention_days)

        # Count records to be deleted
        count_query = select(func.count(CSIData.id)).where(
            CSIData.created_at < cutoff_date
        )
        total_count = await session.scalar(count_query)

        if total_count == 0:
            return {"cleaned_count": 0, "message": "No old CSI data to clean"}

        # Delete in batches
        cleaned_count = 0
        while cleaned_count < total_count:
            # Get batch of IDs to delete (re-queried each pass; earlier
            # deletions shrink the candidate set)
            id_query = select(CSIData.id).where(
                CSIData.created_at < cutoff_date
            ).limit(self.batch_size)

            result = await session.execute(id_query)
            ids_to_delete = [row[0] for row in result.fetchall()]

            # No matching rows remain (e.g. removed concurrently) — stop early.
            if not ids_to_delete:
                break

            # Delete batch; commit per batch so a long cleanup does not hold
            # one huge transaction open.
            delete_query = delete(CSIData).where(CSIData.id.in_(ids_to_delete))
            await session.execute(delete_query)
            await session.commit()

            batch_size = len(ids_to_delete)
            cleaned_count += batch_size

            logger.debug(f"Deleted {batch_size} CSI data records (total: {cleaned_count})")

            # Small delay to avoid overwhelming the database
            await asyncio.sleep(0.1)

        return {
            "cleaned_count": cleaned_count,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff_date.isoformat()
        }
|
||||
|
||||
|
||||
class OldPoseDetectionCleanup(CleanupTask):
    """Cleanup old pose detection records.

    Deletes PoseDetection rows older than the configured retention window
    in batches, committing after each batch to keep transactions small.
    """

    def __init__(self, settings: Settings):
        super().__init__("old_pose_detection_cleanup", settings)
        # Rows older than this many days are deleted; <= 0 disables the task.
        self.retention_days = settings.pose_detection_retention_days
        # Rows deleted (and committed) per batch.
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute pose detection cleanup.

        Returns the number of rows removed, the retention window, and the
        cutoff timestamp used.
        """
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Pose detection retention disabled"}

        cutoff_date = datetime.utcnow() - timedelta(days=self.retention_days)

        # Count records to be deleted
        count_query = select(func.count(PoseDetection.id)).where(
            PoseDetection.created_at < cutoff_date
        )
        total_count = await session.scalar(count_query)

        if total_count == 0:
            return {"cleaned_count": 0, "message": "No old pose detections to clean"}

        # Delete in batches
        cleaned_count = 0
        while cleaned_count < total_count:
            # Get batch of IDs to delete (re-queried each pass; earlier
            # deletions shrink the candidate set)
            id_query = select(PoseDetection.id).where(
                PoseDetection.created_at < cutoff_date
            ).limit(self.batch_size)

            result = await session.execute(id_query)
            ids_to_delete = [row[0] for row in result.fetchall()]

            # No matching rows remain (e.g. removed concurrently) — stop early.
            if not ids_to_delete:
                break

            # Delete batch; commit per batch so a long cleanup does not hold
            # one huge transaction open.
            delete_query = delete(PoseDetection).where(PoseDetection.id.in_(ids_to_delete))
            await session.execute(delete_query)
            await session.commit()

            batch_size = len(ids_to_delete)
            cleaned_count += batch_size

            logger.debug(f"Deleted {batch_size} pose detection records (total: {cleaned_count})")

            # Small delay to avoid overwhelming the database
            await asyncio.sleep(0.1)

        return {
            "cleaned_count": cleaned_count,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff_date.isoformat()
        }
|
||||
|
||||
|
||||
class OldMetricsCleanup(CleanupTask):
    """Cleanup old system metrics.

    Deletes SystemMetric rows older than the configured retention window
    in batches, committing after each batch to keep transactions small.
    """

    def __init__(self, settings: Settings):
        super().__init__("old_metrics_cleanup", settings)
        # Rows older than this many days are deleted; <= 0 disables the task.
        self.retention_days = settings.metrics_retention_days
        # Rows deleted (and committed) per batch.
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute metrics cleanup.

        Returns the number of rows removed, the retention window, and the
        cutoff timestamp used.
        """
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Metrics retention disabled"}

        cutoff_date = datetime.utcnow() - timedelta(days=self.retention_days)

        # Count records to be deleted
        count_query = select(func.count(SystemMetric.id)).where(
            SystemMetric.created_at < cutoff_date
        )
        total_count = await session.scalar(count_query)

        if total_count == 0:
            return {"cleaned_count": 0, "message": "No old metrics to clean"}

        # Delete in batches
        cleaned_count = 0
        while cleaned_count < total_count:
            # Get batch of IDs to delete (re-queried each pass; earlier
            # deletions shrink the candidate set)
            id_query = select(SystemMetric.id).where(
                SystemMetric.created_at < cutoff_date
            ).limit(self.batch_size)

            result = await session.execute(id_query)
            ids_to_delete = [row[0] for row in result.fetchall()]

            # No matching rows remain (e.g. removed concurrently) — stop early.
            if not ids_to_delete:
                break

            # Delete batch; commit per batch so a long cleanup does not hold
            # one huge transaction open.
            delete_query = delete(SystemMetric).where(SystemMetric.id.in_(ids_to_delete))
            await session.execute(delete_query)
            await session.commit()

            batch_size = len(ids_to_delete)
            cleaned_count += batch_size

            logger.debug(f"Deleted {batch_size} metric records (total: {cleaned_count})")

            # Small delay to avoid overwhelming the database
            await asyncio.sleep(0.1)

        return {
            "cleaned_count": cleaned_count,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff_date.isoformat()
        }
|
||||
|
||||
|
||||
class OldAuditLogCleanup(CleanupTask):
    """Cleanup old audit logs.

    Deletes AuditLog rows older than the configured retention window in
    batches, committing after each batch to keep transactions small.
    """

    def __init__(self, settings: Settings):
        super().__init__("old_audit_log_cleanup", settings)
        # Rows older than this many days are deleted; <= 0 disables the task.
        self.retention_days = settings.audit_log_retention_days
        # Rows deleted (and committed) per batch.
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute audit log cleanup.

        Returns the number of rows removed, the retention window, and the
        cutoff timestamp used.
        """
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Audit log retention disabled"}

        cutoff_date = datetime.utcnow() - timedelta(days=self.retention_days)

        # Count records to be deleted
        count_query = select(func.count(AuditLog.id)).where(
            AuditLog.created_at < cutoff_date
        )
        total_count = await session.scalar(count_query)

        if total_count == 0:
            return {"cleaned_count": 0, "message": "No old audit logs to clean"}

        # Delete in batches
        cleaned_count = 0
        while cleaned_count < total_count:
            # Get batch of IDs to delete (re-queried each pass; earlier
            # deletions shrink the candidate set)
            id_query = select(AuditLog.id).where(
                AuditLog.created_at < cutoff_date
            ).limit(self.batch_size)

            result = await session.execute(id_query)
            ids_to_delete = [row[0] for row in result.fetchall()]

            # No matching rows remain (e.g. removed concurrently) — stop early.
            if not ids_to_delete:
                break

            # Delete batch; commit per batch so a long cleanup does not hold
            # one huge transaction open.
            delete_query = delete(AuditLog).where(AuditLog.id.in_(ids_to_delete))
            await session.execute(delete_query)
            await session.commit()

            batch_size = len(ids_to_delete)
            cleaned_count += batch_size

            logger.debug(f"Deleted {batch_size} audit log records (total: {cleaned_count})")

            # Small delay to avoid overwhelming the database
            await asyncio.sleep(0.1)

        return {
            "cleaned_count": cleaned_count,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff_date.isoformat()
        }
|
||||
|
||||
|
||||
class OrphanedSessionCleanup(CleanupTask):
    """Cleanup orphaned sessions (sessions without associated data).

    A session is considered orphaned when it is older than the threshold,
    in a terminal status, and referenced by no CSIData or PoseDetection row.
    """

    def __init__(self, settings: Settings):
        super().__init__("orphaned_session_cleanup", settings)
        # Sessions must be at least this old to qualify; <= 0 disables the task.
        self.orphan_threshold_days = settings.orphaned_session_threshold_days
        # NOTE(review): stored but unused — the delete below is issued as a
        # single statement rather than in batches of this size; confirm intent.
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute orphaned session cleanup.

        Returns the number of sessions removed, the threshold in days, and
        the cutoff timestamp used.
        """
        if self.orphan_threshold_days <= 0:
            return {"cleaned_count": 0, "message": "Orphaned session cleanup disabled"}

        cutoff_date = datetime.utcnow() - timedelta(days=self.orphan_threshold_days)

        # Find sessions that are old and have no associated CSI data or pose detections.
        # The CSIData subquery filters out NULL session_ids so the NOT IN
        # comparison stays well-defined.  NOTE(review): the PoseDetection
        # subquery has no such filter — if PoseDetection.session_id can be
        # NULL, the NOT IN test yields NULL and no session ever matches;
        # verify against the model definition.
        orphaned_sessions_query = select(Session.id).where(
            and_(
                Session.created_at < cutoff_date,
                Session.status.in_(["completed", "failed", "cancelled"]),
                ~Session.id.in_(select(CSIData.session_id).where(CSIData.session_id.isnot(None))),
                ~Session.id.in_(select(PoseDetection.session_id))
            )
        )

        result = await session.execute(orphaned_sessions_query)
        orphaned_ids = [row[0] for row in result.fetchall()]

        if not orphaned_ids:
            return {"cleaned_count": 0, "message": "No orphaned sessions to clean"}

        # Delete orphaned sessions in one statement, then commit.
        delete_query = delete(Session).where(Session.id.in_(orphaned_ids))
        await session.execute(delete_query)
        await session.commit()

        cleaned_count = len(orphaned_ids)

        return {
            "cleaned_count": cleaned_count,
            "orphan_threshold_days": self.orphan_threshold_days,
            "cutoff_date": cutoff_date.isoformat()
        }
|
||||
|
||||
|
||||
class InvalidDataCleanup(CleanupTask):
    """Cleanup invalid or corrupted data records.

    Removes CSI samples and pose detections that fail basic sanity checks
    (explicitly flagged invalid, missing required fields, or non-positive
    physical parameters).

    Fix over the original: ``self.batch_size`` was read from settings but
    never used — all matching ids were deleted in one unbounded ``IN (...)``
    statement. Deletions are now chunked by ``cleanup_batch_size``, matching
    the batching behaviour of the sibling cleanup tasks.
    """

    def __init__(self, settings: Settings):
        super().__init__("invalid_data_cleanup", settings)
        # Maximum rows deleted per DELETE statement.
        self.batch_size = settings.cleanup_batch_size

    async def _delete_in_batches(self, session: AsyncSession, model, ids) -> None:
        """Delete the given primary keys of *model* in chunks of batch_size."""
        for start in range(0, len(ids), self.batch_size):
            chunk = ids[start:start + self.batch_size]
            await session.execute(delete(model).where(model.id.in_(chunk)))
            # Commit per chunk so a large backlog never forms one huge transaction.
            await session.commit()

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute invalid data cleanup.

        Returns a summary dict with the total rows removed plus per-table
        counts. Runs two passes: invalid CSI data, then invalid pose
        detections.
        """
        total_cleaned = 0

        # Invalid CSI data. NOTE: `== False` / `== None` are SQLAlchemy
        # column-expression comparisons (compiled to boolean compare / IS
        # NULL), not Python identity tests — do not "fix" them to `is`.
        invalid_csi_query = select(CSIData.id).where(
            or_(
                CSIData.is_valid == False,
                CSIData.amplitude == None,
                CSIData.phase == None,
                CSIData.frequency <= 0,
                CSIData.bandwidth <= 0,
                CSIData.num_subcarriers <= 0
            )
        )
        result = await session.execute(invalid_csi_query)
        invalid_csi_ids = [row[0] for row in result.fetchall()]

        if invalid_csi_ids:
            await self._delete_in_batches(session, CSIData, invalid_csi_ids)
            total_cleaned += len(invalid_csi_ids)
            logger.debug(f"Deleted {len(invalid_csi_ids)} invalid CSI data records")

        # Invalid pose detections: flagged invalid, negative person counts,
        # or a confidence outside [0, 1] (NULL confidence is allowed).
        invalid_pose_query = select(PoseDetection.id).where(
            or_(
                PoseDetection.is_valid == False,
                PoseDetection.person_count < 0,
                and_(
                    PoseDetection.detection_confidence.isnot(None),
                    or_(
                        PoseDetection.detection_confidence < 0,
                        PoseDetection.detection_confidence > 1
                    )
                )
            )
        )
        result = await session.execute(invalid_pose_query)
        invalid_pose_ids = [row[0] for row in result.fetchall()]

        if invalid_pose_ids:
            await self._delete_in_batches(session, PoseDetection, invalid_pose_ids)
            total_cleaned += len(invalid_pose_ids)
            logger.debug(f"Deleted {len(invalid_pose_ids)} invalid pose detection records")

        # Final commit is a no-op when both passes already committed, but
        # preserves the original contract of leaving the session clean.
        await session.commit()

        return {
            "cleaned_count": total_cleaned,
            "invalid_csi_count": len(invalid_csi_ids) if invalid_csi_ids else 0,
            "invalid_pose_count": len(invalid_pose_ids) if invalid_pose_ids else 0,
        }
||||
class CleanupManager:
    """Manager for all cleanup tasks.

    Owns the set of CleanupTask instances, runs them sequentially inside a
    single async DB session, and tracks aggregate run statistics. A simple
    ``running`` flag prevents overlapping runs within one process (not
    across processes).
    """

    def __init__(self, settings: Settings):
        self.settings = settings
        self.db_manager = get_database_manager(settings)
        self.tasks = self._initialize_tasks()
        self.running = False        # guards against concurrent run_all_tasks
        self.last_run = None        # start time of the most recent completed run
        self.run_count = 0          # number of completed run_all_tasks calls
        self.total_cleaned = 0      # cumulative items cleaned across all runs

    def _initialize_tasks(self) -> List[CleanupTask]:
        """Initialize all cleanup tasks.

        Instantiates every known task, then keeps only those whose
        ``enabled`` flag is set (all tasks start enabled by default).
        """
        tasks = [
            OldCSIDataCleanup(self.settings),
            OldPoseDetectionCleanup(self.settings),
            OldMetricsCleanup(self.settings),
            OldAuditLogCleanup(self.settings),
            OrphanedSessionCleanup(self.settings),
            InvalidDataCleanup(self.settings),
        ]

        # Filter enabled tasks
        enabled_tasks = [task for task in tasks if task.enabled]

        logger.info(f"Initialized {len(enabled_tasks)} cleanup tasks")
        return enabled_tasks

    async def run_all_tasks(self) -> Dict[str, Any]:
        """Run all cleanup tasks sequentially and return a summary dict.

        Returns a status payload: ``already_running`` if a run is in
        progress, ``completed`` with per-task results on success, or
        ``error`` if any task raised (the remaining tasks are skipped).
        """
        if self.running:
            return {"status": "already_running", "message": "Cleanup already in progress"}

        self.running = True
        start_time = datetime.utcnow()

        try:
            logger.info("Starting cleanup tasks")

            results = []
            total_cleaned = 0

            # All tasks share one session; each task commits its own work.
            async with self.db_manager.get_async_session() as session:
                for task in self.tasks:
                    if not task.enabled:
                        continue

                    result = await task.run(session)
                    results.append(result)
                    total_cleaned += result.get("cleaned_count", 0)

            self.last_run = start_time
            self.run_count += 1
            self.total_cleaned += total_cleaned

            duration = (datetime.utcnow() - start_time).total_seconds()

            logger.info(
                f"Cleanup tasks completed: cleaned {total_cleaned} items "
                f"in {duration:.2f} seconds"
            )

            return {
                "status": "completed",
                "start_time": start_time.isoformat(),
                "duration_seconds": duration,
                "total_cleaned": total_cleaned,
                "task_results": results,
            }

        except Exception as e:
            logger.error(f"Cleanup tasks failed: {e}", exc_info=True)
            return {
                "status": "error",
                "start_time": start_time.isoformat(),
                "duration_seconds": (datetime.utcnow() - start_time).total_seconds(),
                "error": str(e),
                "total_cleaned": 0,
            }

        finally:
            self.running = False

    async def run_task(self, task_name: str) -> Dict[str, Any]:
        """Run one cleanup task by name in its own session.

        Returns an error payload (with the list of available task names)
        if the task is unknown or disabled.
        """
        task = next((t for t in self.tasks if t.name == task_name), None)

        if not task:
            return {
                "status": "error",
                "error": f"Task '{task_name}' not found",
                "available_tasks": [t.name for t in self.tasks]
            }

        if not task.enabled:
            return {
                "status": "error",
                "error": f"Task '{task_name}' is disabled"
            }

        async with self.db_manager.get_async_session() as session:
            return await task.run(session)

    def get_stats(self) -> Dict[str, Any]:
        """Return manager-level counters plus per-task statistics."""
        return {
            "manager": {
                "running": self.running,
                "last_run": self.last_run.isoformat() if self.last_run else None,
                "run_count": self.run_count,
                "total_cleaned": self.total_cleaned,
            },
            "tasks": [task.get_stats() for task in self.tasks],
        }

    def enable_task(self, task_name: str) -> bool:
        """Enable the named task; returns False if no such task exists."""
        task = next((t for t in self.tasks if t.name == task_name), None)
        if task:
            task.enabled = True
            return True
        return False

    def disable_task(self, task_name: str) -> bool:
        """Disable the named task; returns False if no such task exists."""
        task = next((t for t in self.tasks if t.name == task_name), None)
        if task:
            task.enabled = False
            return True
        return False
||||
# Global cleanup manager instance (process-wide singleton, created lazily
# by get_cleanup_manager(); not thread-safe to initialize concurrently).
_cleanup_manager: Optional[CleanupManager] = None
|
||||
|
||||
def get_cleanup_manager(settings: Settings) -> CleanupManager:
    """Return the process-wide CleanupManager, creating it on first call.

    ``settings`` is only consulted when the singleton is first built; later
    calls return the existing instance unchanged.
    """
    global _cleanup_manager
    # A CleanupManager instance is always truthy, so `or` is a safe lazy-init.
    _cleanup_manager = _cleanup_manager or CleanupManager(settings)
    return _cleanup_manager
|
||||
async def run_periodic_cleanup(settings: Settings):
    """Run cleanup tasks forever at ``settings.cleanup_interval_seconds``.

    Intended to be launched as a background asyncio task. Exits cleanly on
    cancellation; any other exception is logged and retried after a fixed
    60-second backoff so a transient DB outage does not kill the loop.
    """
    cleanup_manager = get_cleanup_manager(settings)

    while True:
        try:
            await cleanup_manager.run_all_tasks()

            # Wait for next cleanup interval
            await asyncio.sleep(settings.cleanup_interval_seconds)

        except asyncio.CancelledError:
            # Normal shutdown path: propagate by leaving the loop.
            logger.info("Periodic cleanup cancelled")
            break
        except Exception as e:
            logger.error(f"Periodic cleanup error: {e}", exc_info=True)
            # Wait before retrying
            await asyncio.sleep(60)
||||
773
src/tasks/monitoring.py
Normal file
773
src/tasks/monitoring.py
Normal file
@@ -0,0 +1,773 @@
|
||||
"""
|
||||
Monitoring tasks for WiFi-DensePose API
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import psutil
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional, List
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from sqlalchemy import select, func, and_, or_
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.config.settings import Settings
|
||||
from src.database.connection import get_database_manager
|
||||
from src.database.models import SystemMetric, Device, Session, CSIData, PoseDetection
|
||||
from src.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MonitoringTask:
    """Base class for monitoring tasks.

    Subclasses implement :meth:`collect_metrics`; :meth:`run` persists the
    returned metric dicts as ``SystemMetric`` rows and maintains run/error
    counters on the task instance.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True          # manager can toggle tasks on/off
        self.last_run = None         # start time of last successful run
        self.run_count = 0           # successful runs
        self.error_count = 0         # failed runs
        self.interval_seconds = 60  # Default interval; subclasses override from settings

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect metrics for this task.

        Must return a list of dicts with at least "name", "type", and
        "value" keys; optional keys ("unit", "labels", "tags", "source",
        "component", "description", "metadata") are mapped onto
        ``SystemMetric`` columns by :meth:`run`.
        """
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Run the monitoring task with error handling.

        Collects metrics, writes them in one commit, and returns a status
        payload. Failures increment ``error_count`` and are reported in the
        return value rather than raised.
        """
        start_time = datetime.utcnow()

        try:
            logger.debug(f"Starting monitoring task: {self.name}")

            metrics = await self.collect_metrics(session)

            # Store metrics in database
            for metric_data in metrics:
                # NOTE(review): SQLAlchemy declarative reserves the attribute
                # name "metadata" on mapped classes — assumes SystemMetric
                # maps this constructor kwarg to a differently-named
                # attribute/column; confirm against src.database.models.
                metric = SystemMetric(
                    metric_name=metric_data["name"],
                    metric_type=metric_data["type"],
                    value=metric_data["value"],
                    unit=metric_data.get("unit"),
                    labels=metric_data.get("labels"),
                    tags=metric_data.get("tags"),
                    source=metric_data.get("source", self.name),
                    component=metric_data.get("component"),
                    description=metric_data.get("description"),
                    metadata=metric_data.get("metadata"),
                )
                session.add(metric)

            await session.commit()

            self.last_run = start_time
            self.run_count += 1

            logger.debug(f"Monitoring task {self.name} completed: collected {len(metrics)} metrics")

            return {
                "task": self.name,
                "status": "success",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "metrics_collected": len(metrics),
            }

        except Exception as e:
            self.error_count += 1
            # NOTE(review): no session.rollback() here — a failed flush may
            # leave the shared session unusable for subsequent tasks; confirm.
            logger.error(f"Monitoring task {self.name} failed: {e}", exc_info=True)

            return {
                "task": self.name,
                "status": "error",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "error": str(e),
                "metrics_collected": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Return this task's configuration and run counters."""
        return {
            "name": self.name,
            "enabled": self.enabled,
            "interval_seconds": self.interval_seconds,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
            "error_count": self.error_count,
        }
|
||||
class SystemResourceMonitoring(MonitoringTask):
    """Monitor system resources (CPU, memory, disk, network).

    Samples psutil gauges/counters and returns them as metric dicts in the
    shape :meth:`MonitoringTask.run` persists.

    Fix over the original: ``psutil.cpu_percent(interval=1)`` sleeps for a
    full second, which blocked the asyncio event loop inside this async
    method; the sample is now taken on the default thread-pool executor.
    The 20x repeated metric-dict literal is factored into :meth:`_metric`.
    """

    def __init__(self, settings: Settings):
        super().__init__("system_resources", settings)
        self.interval_seconds = settings.system_monitoring_interval

    def _metric(self, name: str, metric_type: str, value, unit: str,
                component: str, description: str, timestamp: datetime) -> Dict[str, Any]:
        """Build one metric record (same shape the original dict literals had)."""
        return {
            "name": name,
            "type": metric_type,
            "value": value,
            "unit": unit,
            "component": component,
            "description": description,
            "metadata": {"timestamp": timestamp.isoformat()},
        }

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect system resource metrics."""
        metrics = []
        timestamp = datetime.utcnow()

        # CPU metrics — cpu_percent(interval=1) blocks for ~1s, so run it in
        # the default executor instead of stalling the event loop.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(None, psutil.cpu_percent, 1)
        cpu_count = psutil.cpu_count()
        cpu_freq = psutil.cpu_freq()

        metrics.append(self._metric("system_cpu_usage_percent", "gauge", cpu_percent,
                                    "percent", "cpu", "CPU usage percentage", timestamp))
        metrics.append(self._metric("system_cpu_count", "gauge", cpu_count,
                                    "count", "cpu", "Number of CPU cores", timestamp))
        if cpu_freq:  # cpu_freq() can return None on some platforms
            metrics.append(self._metric("system_cpu_frequency_mhz", "gauge", cpu_freq.current,
                                        "mhz", "cpu", "Current CPU frequency", timestamp))

        # Memory metrics
        memory = psutil.virtual_memory()
        swap = psutil.swap_memory()

        metrics.extend([
            self._metric("system_memory_total_bytes", "gauge", memory.total,
                         "bytes", "memory", "Total system memory", timestamp),
            self._metric("system_memory_used_bytes", "gauge", memory.used,
                         "bytes", "memory", "Used system memory", timestamp),
            self._metric("system_memory_available_bytes", "gauge", memory.available,
                         "bytes", "memory", "Available system memory", timestamp),
            self._metric("system_memory_usage_percent", "gauge", memory.percent,
                         "percent", "memory", "Memory usage percentage", timestamp),
            self._metric("system_swap_total_bytes", "gauge", swap.total,
                         "bytes", "memory", "Total swap memory", timestamp),
            self._metric("system_swap_used_bytes", "gauge", swap.used,
                         "bytes", "memory", "Used swap memory", timestamp),
        ])

        # Disk metrics (root filesystem usage plus cumulative IO counters)
        disk_usage = psutil.disk_usage('/')
        disk_io = psutil.disk_io_counters()

        metrics.extend([
            self._metric("system_disk_total_bytes", "gauge", disk_usage.total,
                         "bytes", "disk", "Total disk space", timestamp),
            self._metric("system_disk_used_bytes", "gauge", disk_usage.used,
                         "bytes", "disk", "Used disk space", timestamp),
            self._metric("system_disk_free_bytes", "gauge", disk_usage.free,
                         "bytes", "disk", "Free disk space", timestamp),
            self._metric("system_disk_usage_percent", "gauge",
                         (disk_usage.used / disk_usage.total) * 100,
                         "percent", "disk", "Disk usage percentage", timestamp),
        ])

        if disk_io:  # None when no physical disks are visible (e.g. some containers)
            metrics.extend([
                self._metric("system_disk_read_bytes_total", "counter", disk_io.read_bytes,
                             "bytes", "disk", "Total bytes read from disk", timestamp),
                self._metric("system_disk_write_bytes_total", "counter", disk_io.write_bytes,
                             "bytes", "disk", "Total bytes written to disk", timestamp),
            ])

        # Network metrics (cumulative counters since boot)
        network_io = psutil.net_io_counters()

        if network_io:
            metrics.extend([
                self._metric("system_network_bytes_sent_total", "counter", network_io.bytes_sent,
                             "bytes", "network", "Total bytes sent over network", timestamp),
                self._metric("system_network_bytes_recv_total", "counter", network_io.bytes_recv,
                             "bytes", "network", "Total bytes received over network", timestamp),
                self._metric("system_network_packets_sent_total", "counter", network_io.packets_sent,
                             "count", "network", "Total packets sent over network", timestamp),
                self._metric("system_network_packets_recv_total", "counter", network_io.packets_recv,
                             "count", "network", "Total packets received over network", timestamp),
            ])

        return metrics
|
||||
class DatabaseMonitoring(MonitoringTask):
    """Monitor database performance and statistics.

    Emits connection-pool gauges for PostgreSQL and Redis (when present in
    the connection stats) plus a per-table row-count gauge.
    """

    def __init__(self, settings: Settings):
        super().__init__("database", settings)
        self.interval_seconds = settings.database_monitoring_interval

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect database metrics."""
        now = datetime.utcnow()
        stamp = now.isoformat()

        def gauge(name, value, component, description, **extra_meta):
            # Every database metric is a count-unit gauge; only the
            # metadata may carry extra keys (e.g. the table name).
            meta = {"timestamp": stamp}
            meta.update(extra_meta)
            return {
                "name": name,
                "type": "gauge",
                "value": value,
                "unit": "count",
                "component": component,
                "description": description,
                "metadata": meta,
            }

        collected = []
        stats = await get_database_manager(self.settings).get_connection_stats()

        # PostgreSQL connection-pool gauges.
        if "postgresql" in stats:
            pool = stats["postgresql"]
            collected.append(gauge("database_connections_total",
                                   pool.get("total_connections", 0),
                                   "postgresql", "Total database connections"))
            collected.append(gauge("database_connections_active",
                                   pool.get("checked_out", 0),
                                   "postgresql", "Active database connections"))
            collected.append(gauge("database_connections_available",
                                   pool.get("available_connections", 0),
                                   "postgresql", "Available database connections"))

        # Redis client gauges (skipped when the stats carry an error marker).
        if "redis" in stats and not stats["redis"].get("error"):
            redis_info = stats["redis"]
            collected.append(gauge("redis_connections_active",
                                   redis_info.get("connected_clients", 0),
                                   "redis", "Active Redis connections"))
            collected.append(gauge("redis_connections_blocked",
                                   redis_info.get("blocked_clients", 0),
                                   "redis", "Blocked Redis connections"))

        # One row-count gauge per table.
        for table, rows in (await self._get_table_counts(session)).items():
            collected.append(gauge(f"database_table_rows_{table}", rows,
                                   "postgresql", f"Number of rows in {table} table",
                                   table=table))

        return collected

    async def _get_table_counts(self, session: AsyncSession) -> Dict[str, int]:
        """Return row counts keyed by table label, in a fixed order."""
        tables = (
            ("devices", Device.id),
            ("sessions", Session.id),
            ("csi_data", CSIData.id),
            ("pose_detections", PoseDetection.id),
            ("system_metrics", SystemMetric.id),
        )

        counts = {}
        for label, pk_column in tables:
            outcome = await session.execute(select(func.count(pk_column)))
            counts[label] = outcome.scalar() or 0
        return counts
|
||||
class ApplicationMonitoring(MonitoringTask):
    """Monitor application-specific metrics.

    Tracks uptime, active sessions/devices, hourly data throughput, and
    CSI processing-status backlogs.
    """

    def __init__(self, settings: Settings):
        super().__init__("application", settings)
        self.interval_seconds = settings.application_monitoring_interval
        self.start_time = datetime.utcnow()  # reference point for uptime

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect application metrics."""
        now = datetime.utcnow()
        stamp = now.isoformat()

        def gauge(name, value, description, unit="count", extra=None):
            meta = {"timestamp": stamp}
            if extra:
                meta.update(extra)
            return {
                "name": name,
                "type": "gauge",
                "value": value,
                "unit": unit,
                "component": "application",
                "description": description,
                "metadata": meta,
            }

        async def count(query):
            # Scalar COUNT helper; treats NULL results as zero.
            return (await session.execute(query)).scalar() or 0

        metrics = [
            gauge("application_uptime_seconds",
                  (now - self.start_time).total_seconds(),
                  "Application uptime in seconds", unit="seconds"),
        ]

        metrics.append(gauge(
            "application_active_sessions",
            await count(select(func.count(Session.id)).where(Session.status == "active")),
            "Number of active sessions"))

        metrics.append(gauge(
            "application_active_devices",
            await count(select(func.count(Device.id)).where(Device.status == "active")),
            "Number of active devices"))

        # Throughput over the trailing hour.
        hour_ago = now - timedelta(hours=1)

        metrics.append(gauge(
            "application_csi_data_hourly",
            await count(select(func.count(CSIData.id)).where(CSIData.created_at >= hour_ago)),
            "CSI data records created in the last hour"))

        metrics.append(gauge(
            "application_pose_detections_hourly",
            await count(select(func.count(PoseDetection.id)).where(PoseDetection.created_at >= hour_ago)),
            "Pose detections created in the last hour"))

        # Backlog per CSI processing status.
        for status in ("pending", "processing", "completed", "failed"):
            backlog = await count(
                select(func.count(CSIData.id)).where(CSIData.processing_status == status)
            )
            metrics.append(gauge(
                f"application_csi_processing_{status}", backlog,
                f"CSI data records with {status} processing status",
                extra={"status": status}))

        return metrics
|
||||
class PerformanceMonitoring(MonitoringTask):
    """Monitor performance metrics and response times.

    Hot-path recorders (:meth:`record_response_time` / :meth:`record_error`)
    are cheap and synchronous; :meth:`collect_metrics` turns the buffered
    samples into metric dicts.

    Fix over the original: the response-time buffer was only trimmed inside
    ``collect_metrics``, so it grew without bound whenever collection was
    stalled or disabled. It is now hard-capped at append time.
    """

    # Upper bound on buffered samples between collections (memory safety).
    _MAX_SAMPLES = 1000

    def __init__(self, settings: Settings):
        super().__init__("performance", settings)
        self.interval_seconds = settings.performance_monitoring_interval
        self.response_times = []  # recent API response times, in milliseconds
        self.error_counts = {}    # error type -> cumulative count

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect performance metrics."""
        metrics = []
        timestamp = datetime.utcnow()

        # Database latency probe: time a trivial COUNT query.
        start_time = time.time()
        test_query = select(func.count(Device.id))
        await session.execute(test_query)
        db_response_time = (time.time() - start_time) * 1000  # Convert to milliseconds

        metrics.append({
            "name": "performance_database_query_time_ms",
            "type": "gauge",
            "value": db_response_time,
            "unit": "milliseconds",
            "component": "database",
            "description": "Database query response time",
            "metadata": {"timestamp": timestamp.isoformat()}
        })

        # Average API response time over the buffered samples, if any.
        if self.response_times:
            avg_response_time = sum(self.response_times) / len(self.response_times)
            metrics.append({
                "name": "performance_avg_response_time_ms",
                "type": "gauge",
                "value": avg_response_time,
                "unit": "milliseconds",
                "component": "api",
                "description": "Average API response time",
                "metadata": {"timestamp": timestamp.isoformat()}
            })

            # Keep only the most recent samples for the next window.
            self.response_times = self.response_times[-100:]  # Keep last 100

        # Cumulative error counters by type.
        for error_type, count in self.error_counts.items():
            metrics.append({
                "name": f"performance_errors_{error_type}_total",
                "type": "counter",
                "value": count,
                "unit": "count",
                "component": "api",
                "description": f"Total {error_type} errors",
                "metadata": {"timestamp": timestamp.isoformat(), "error_type": error_type}
            })

        return metrics

    def record_response_time(self, response_time_ms: float):
        """Record an API response time (called from request handlers).

        The buffer is capped at ``_MAX_SAMPLES`` so memory cannot grow
        without bound when collect_metrics is not running.
        """
        self.response_times.append(response_time_ms)
        if len(self.response_times) > self._MAX_SAMPLES:
            del self.response_times[:-self._MAX_SAMPLES]

    def record_error(self, error_type: str):
        """Record an error occurrence, keyed by error type."""
        self.error_counts[error_type] = self.error_counts.get(error_type, 0) + 1
|
||||
class MonitoringManager:
|
||||
"""Manager for all monitoring tasks."""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
self.settings = settings
|
||||
self.db_manager = get_database_manager(settings)
|
||||
self.tasks = self._initialize_tasks()
|
||||
self.running = False
|
||||
self.last_run = None
|
||||
self.run_count = 0
|
||||
|
||||
def _initialize_tasks(self) -> List[MonitoringTask]:
|
||||
"""Initialize all monitoring tasks."""
|
||||
tasks = [
|
||||
SystemResourceMonitoring(self.settings),
|
||||
DatabaseMonitoring(self.settings),
|
||||
ApplicationMonitoring(self.settings),
|
||||
PerformanceMonitoring(self.settings),
|
||||
]
|
||||
|
||||
# Filter enabled tasks
|
||||
enabled_tasks = [task for task in tasks if task.enabled]
|
||||
|
||||
logger.info(f"Initialized {len(enabled_tasks)} monitoring tasks")
|
||||
return enabled_tasks
|
||||
|
||||
async def run_all_tasks(self) -> Dict[str, Any]:
|
||||
"""Run all monitoring tasks."""
|
||||
if self.running:
|
||||
return {"status": "already_running", "message": "Monitoring already in progress"}
|
||||
|
||||
self.running = True
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
try:
|
||||
logger.debug("Starting monitoring tasks")
|
||||
|
||||
results = []
|
||||
total_metrics = 0
|
||||
|
||||
async with self.db_manager.get_async_session() as session:
|
||||
for task in self.tasks:
|
||||
if not task.enabled:
|
||||
continue
|
||||
|
||||
result = await task.run(session)
|
||||
results.append(result)
|
||||
total_metrics += result.get("metrics_collected", 0)
|
||||
|
||||
self.last_run = start_time
|
||||
self.run_count += 1
|
||||
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
logger.debug(
|
||||
f"Monitoring tasks completed: collected {total_metrics} metrics "
|
||||
f"in {duration:.2f} seconds"
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "completed",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_seconds": duration,
|
||||
"total_metrics": total_metrics,
|
||||
"task_results": results,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Monitoring tasks failed: {e}", exc_info=True)
|
||||
return {
|
||||
"status": "error",
|
||||
"start_time": start_time.isoformat(),
|
||||
"duration_seconds": (datetime.utcnow() - start_time).total_seconds(),
|
||||
"error": str(e),
|
||||
"total_metrics": 0,
|
||||
}
|
||||
|
||||
finally:
|
||||
self.running = False
|
||||
|
||||
async def run_task(self, task_name: str) -> Dict[str, Any]:
    """Run one monitoring task, selected by its name.

    Returns an error dict when the name is unknown or the task is
    disabled; otherwise runs the task inside a fresh DB session.
    """
    target = None
    for candidate in self.tasks:
        if candidate.name == task_name:
            target = candidate
            break

    if target is None:
        return {
            "status": "error",
            "error": f"Task '{task_name}' not found",
            "available_tasks": [t.name for t in self.tasks],
        }

    if not target.enabled:
        return {
            "status": "error",
            "error": f"Task '{task_name}' is disabled"
        }

    async with self.db_manager.get_async_session() as session:
        return await target.run(session)
||||
def get_stats(self) -> Dict[str, Any]:
    """Report manager-level counters plus per-task statistics."""
    manager_info = {
        "running": self.running,
        "last_run": self.last_run.isoformat() if self.last_run else None,
        "run_count": self.run_count,
    }
    task_info = [task.get_stats() for task in self.tasks]
    return {"manager": manager_info, "tasks": task_info}
||||
def get_performance_task(self) -> Optional[PerformanceMonitoring]:
    """Return the registered PerformanceMonitoring task, or None.

    Callers use it to record request/latency metrics between runs.
    """
    for candidate in self.tasks:
        if isinstance(candidate, PerformanceMonitoring):
            return candidate
    return None
||||
|
||||
# Global monitoring manager instance (lazy-initialized singleton)
_monitoring_manager: Optional[MonitoringManager] = None


def get_monitoring_manager(settings: Settings) -> MonitoringManager:
    """Return the process-wide MonitoringManager, creating it on first use.

    NOTE: `settings` is only consulted on the first call; subsequent
    calls return the already-built singleton unchanged.
    """
    global _monitoring_manager
    if _monitoring_manager is not None:
        return _monitoring_manager
    _monitoring_manager = MonitoringManager(settings)
    return _monitoring_manager
||||
|
||||
async def run_periodic_monitoring(settings: Settings):
    """Run monitoring passes forever on a fixed interval.

    Stops cleanly when cancelled; any other error is logged and the
    loop retries after a 30-second backoff.
    """
    monitoring_manager = get_monitoring_manager(settings)

    keep_running = True
    while keep_running:
        try:
            await monitoring_manager.run_all_tasks()

            # Wait for next monitoring interval
            await asyncio.sleep(settings.monitoring_interval_seconds)

        except asyncio.CancelledError:
            # NOTE(review): cancellation is swallowed here (coroutine returns
            # normally instead of re-raising) — confirm callers rely on that.
            logger.info("Periodic monitoring cancelled")
            keep_running = False
        except Exception as e:
            logger.error(f"Periodic monitoring error: {e}", exc_info=True)
            # Wait before retrying
            await asyncio.sleep(30)
|
||||
Reference in New Issue
Block a user