feat: Complete Rust port of WiFi-DensePose with modular crates

Major changes:
- Organized Python v1 implementation into v1/ subdirectory
- Created Rust workspace with 9 modular crates:
  - wifi-densepose-core: Core types, traits, errors
  - wifi-densepose-signal: CSI processing, phase sanitization, FFT
  - wifi-densepose-nn: Neural network inference (ONNX/Candle/tch)
  - wifi-densepose-api: Axum-based REST/WebSocket API
  - wifi-densepose-db: SQLx database layer
  - wifi-densepose-config: Configuration management
  - wifi-densepose-hardware: Hardware abstraction
  - wifi-densepose-wasm: WebAssembly bindings
  - wifi-densepose-cli: Command-line interface

Documentation:
- ADR-001: Workspace structure
- ADR-002: Signal processing library selection
- ADR-003: Neural network inference strategy
- DDD domain model with bounded contexts

Testing:
- 69 tests passing across all crates
- Signal processing: 45 tests
- Neural networks: 21 tests
- Core: 3 doc tests

Performance targets:
- 10x faster CSI processing (~0.5ms vs ~5ms)
- 5x lower memory usage (~100MB vs ~500MB)
- WASM support for browser deployment
This commit is contained in:
Claude
2026-01-13 03:11:16 +00:00
parent 5101504b72
commit 6ed69a3d48
427 changed files with 90993 additions and 0 deletions

612
v1/src/tasks/backup.py Normal file
View File

@@ -0,0 +1,612 @@
"""
Backup tasks for WiFi-DensePose API
"""
import asyncio
import logging
import os
import shutil
import gzip
import json
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, Optional, List
from contextlib import asynccontextmanager
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
from src.config.settings import Settings
from src.database.connection import get_database_manager
from src.database.models import Device, Session, CSIData, PoseDetection, SystemMetric, AuditLog
from src.logger import get_logger
logger = get_logger(__name__)
class BackupTask:
    """Abstract base for a single backup job.

    Subclasses implement :meth:`execute_backup`; :meth:`run` wraps it with
    timing, logging, bookkeeping, and uniform success/error result dicts.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True                          # manager skips disabled tasks
        self.last_run: Optional[datetime] = None
        self.run_count = 0
        self.error_count = 0
        # Every task writes its artifacts into the same shared directory.
        self.backup_dir = Path(settings.backup_directory)
        self.backup_dir.mkdir(parents=True, exist_ok=True)

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Produce the backup; subclasses must override."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute the task, converting any failure into an error result dict."""
        started = datetime.utcnow()

        def _elapsed_ms() -> float:
            return (datetime.utcnow() - started).total_seconds() * 1000

        try:
            logger.info(f"Starting backup task: {self.name}")
            outcome = await self.execute_backup(session)
            self.last_run = started
            self.run_count += 1
            logger.info(
                f"Backup task {self.name} completed: "
                f"backed up {outcome.get('backup_size_mb', 0):.2f} MB"
            )
            return {
                "task": self.name,
                "status": "success",
                "start_time": started.isoformat(),
                "duration_ms": _elapsed_ms(),
                **outcome,
            }
        except Exception as e:
            self.error_count += 1
            logger.error(f"Backup task {self.name} failed: {e}", exc_info=True)
            return {
                "task": self.name,
                "status": "error",
                "start_time": started.isoformat(),
                "duration_ms": _elapsed_ms(),
                "error": str(e),
                "backup_size_mb": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Snapshot of this task's bookkeeping counters."""
        return {
            "name": self.name,
            "enabled": self.enabled,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
            "error_count": self.error_count,
            "backup_directory": str(self.backup_dir),
        }

    def _get_backup_filename(self, prefix: str, extension: str = ".gz") -> str:
        """Build ``<prefix>_<UTC timestamp><extension>``."""
        stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        return f"{prefix}_{stamp}{extension}"

    def _get_file_size_mb(self, file_path: Path) -> float:
        """Size of *file_path* in MB, or 0.0 if it does not exist."""
        return file_path.stat().st_size / (1024 * 1024) if file_path.exists() else 0.0

    def _cleanup_old_backups(self, pattern: str, retention_days: int):
        """Delete files matching *pattern* older than *retention_days* (no-op if <= 0)."""
        if retention_days <= 0:
            return
        oldest_allowed = (datetime.utcnow() - timedelta(days=retention_days)).timestamp()
        for candidate in self.backup_dir.glob(pattern):
            if candidate.stat().st_mtime >= oldest_allowed:
                continue
            try:
                candidate.unlink()
                logger.debug(f"Deleted old backup: {candidate}")
            except Exception as e:
                logger.warning(f"Failed to delete old backup {candidate}: {e}")
class DatabaseBackup(BackupTask):
    """Full database backup using pg_dump."""

    def __init__(self, settings: Settings):
        super().__init__("database_backup", settings)
        # Age limit handed to _cleanup_old_backups after each successful dump.
        self.retention_days = settings.database_backup_retention_days

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute database backup.

        Runs ``pg_dump`` as a subprocess and writes its output to a
        timestamped file in the shared backup directory. The ``session``
        argument is unused here (the dump talks to PostgreSQL directly).
        Raises Exception if pg_dump exits non-zero.
        """
        # NOTE(review): the filename says .sql.gz, but --format=custom below
        # produces a pg_restore archive (already compressed via --compress=9),
        # not gzipped SQL text — confirm the extension is intentional; restores
        # must use pg_restore, not gunzip+psql.
        backup_filename = self._get_backup_filename("database_full", ".sql.gz")
        backup_path = self.backup_dir / backup_filename
        # Build pg_dump command
        pg_dump_cmd = [
            "pg_dump",
            "--verbose",
            "--no-password",  # never prompt; credentials come from PGPASSWORD below
            "--format=custom",
            "--compress=9",
            "--file", str(backup_path),
        ]
        # Add connection parameters
        if self.settings.database_url:
            # NOTE(review): passed verbatim as the libpq connection string —
            # if this is an SQLAlchemy-style URL (e.g. postgresql+asyncpg://...)
            # pg_dump will reject it; verify the configured scheme.
            pg_dump_cmd.append(self.settings.database_url)
        else:
            pg_dump_cmd.extend([
                "--host", self.settings.db_host,
                "--port", str(self.settings.db_port),
                "--username", self.settings.db_user,
                "--dbname", self.settings.db_name,
            ])
        # Set environment variables
        env = os.environ.copy()
        if self.settings.db_password:
            # Pass the password via the environment rather than the command line
            # so it is not visible in the process list.
            env["PGPASSWORD"] = self.settings.db_password
        # Execute pg_dump
        process = await asyncio.create_subprocess_exec(
            *pg_dump_cmd,
            env=env,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await process.communicate()
        if process.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown pg_dump error"
            raise Exception(f"pg_dump failed: {error_msg}")
        backup_size_mb = self._get_file_size_mb(backup_path)
        # Clean up old backups
        self._cleanup_old_backups("database_full_*.sql.gz", self.retention_days)
        return {
            "backup_file": backup_filename,
            "backup_path": str(backup_path),
            "backup_size_mb": backup_size_mb,
            "retention_days": self.retention_days,
        }
class ConfigurationBackup(BackupTask):
    """Backup configuration files and a JSON dump of selected settings.

    Copies each known config file that exists into a per-run scratch
    directory, writes a curated settings snapshot beside them, then archives
    the directory as a ``.tar.gz`` via the system ``tar`` binary.
    """

    def __init__(self, settings: Settings):
        super().__init__("configuration_backup", settings)
        self.retention_days = settings.config_backup_retention_days
        # Paths are relative to the process working directory; files that do
        # not exist are skipped silently at backup time.
        self.config_files = [
            "src/config/settings.py",
            ".env",
            "pyproject.toml",
            "docker-compose.yml",
            "Dockerfile",
        ]

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute configuration backup.

        Returns:
            Dict with the archive filename/path/size (MB), the list of config
            files actually found and copied, and the retention policy.

        Raises:
            Exception: if the ``tar`` subprocess exits non-zero.
        """
        import tempfile  # local import: only this task needs it

        backup_filename = self._get_backup_filename("configuration", ".tar.gz")
        backup_path = self.backup_dir / backup_filename
        # Use a unique scratch directory per run. The previous fixed
        # "temp_config" directory (created with exist_ok=True) would silently
        # archive stale files left behind by an earlier failed run, and two
        # concurrent runs would clobber each other's staging area.
        temp_dir = Path(tempfile.mkdtemp(prefix="temp_config_", dir=self.backup_dir))
        try:
            copied_files: List[str] = []
            # Copy configuration files that exist; record which ones we got.
            for config_file in self.config_files:
                source_path = Path(config_file)
                if source_path.exists():
                    shutil.copy2(source_path, temp_dir / source_path.name)
                    copied_files.append(config_file)
            # Curated snapshot of live settings (no passwords/secrets included).
            settings_dump = {
                "backup_timestamp": datetime.utcnow().isoformat(),
                "environment": self.settings.environment,
                "debug": self.settings.debug,
                "version": self.settings.version,
                "database_settings": {
                    "db_host": self.settings.db_host,
                    "db_port": self.settings.db_port,
                    "db_name": self.settings.db_name,
                    "db_pool_size": self.settings.db_pool_size,
                },
                "redis_settings": {
                    "redis_enabled": self.settings.redis_enabled,
                    "redis_host": self.settings.redis_host,
                    "redis_port": self.settings.redis_port,
                    "redis_db": self.settings.redis_db,
                },
                "monitoring_settings": {
                    "monitoring_interval_seconds": self.settings.monitoring_interval_seconds,
                    "cleanup_interval_seconds": self.settings.cleanup_interval_seconds,
                },
            }
            settings_file = temp_dir / "settings_dump.json"
            with open(settings_file, 'w') as f:
                json.dump(settings_dump, f, indent=2)
            # Archive the staging directory contents ("." keeps paths flat).
            tar_cmd = [
                "tar", "-czf", str(backup_path),
                "-C", str(temp_dir),
                "."
            ]
            process = await asyncio.create_subprocess_exec(
                *tar_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await process.communicate()
            if process.returncode != 0:
                error_msg = stderr.decode() if stderr else "Unknown tar error"
                raise Exception(f"tar failed: {error_msg}")
            backup_size_mb = self._get_file_size_mb(backup_path)
            # Enforce retention on previously created configuration archives.
            self._cleanup_old_backups("configuration_*.tar.gz", self.retention_days)
            return {
                "backup_file": backup_filename,
                "backup_path": str(backup_path),
                "backup_size_mb": backup_size_mb,
                "copied_files": copied_files,
                "retention_days": self.retention_days,
            }
        finally:
            # Always remove the scratch directory, even on failure.
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
class DataExportBackup(BackupTask):
    """Export devices, sessions, and recent sensor data to compressed JSON."""

    def __init__(self, settings: Settings):
        super().__init__("data_export_backup", settings)
        self.retention_days = settings.data_export_retention_days
        self.export_batch_size = 1000  # reserved for paging exports

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Write a gzip-compressed JSON document containing the exported tables.

        Full dumps of devices and sessions, plus the last 7 days of CSI data
        and pose detections.
        """
        backup_filename = self._get_backup_filename("data_export", ".json.gz")
        backup_path = self.backup_dir / backup_filename
        tables: Dict[str, Any] = {}
        document = {
            "backup_timestamp": datetime.utcnow().isoformat(),
            "export_version": "1.0",
            "tables": tables,
        }
        tables["devices"] = await self._export_table_data(session, Device, "devices")
        tables["sessions"] = await self._export_table_data(session, Session, "sessions")
        # Only the trailing 7-day window of high-volume sensor tables.
        window_start = datetime.utcnow() - timedelta(days=7)
        tables["csi_data_recent"] = await self._export_query_data(
            session,
            select(CSIData).where(CSIData.created_at >= window_start),
            "csi_data",
        )
        tables["pose_detections_recent"] = await self._export_query_data(
            session,
            select(PoseDetection).where(PoseDetection.created_at >= window_start),
            "pose_detections",
        )
        # default=str keeps non-JSON types (UUIDs, datetimes) serializable.
        with gzip.open(backup_path, 'wt', encoding='utf-8') as f:
            json.dump(document, f, indent=2, default=str)
        backup_size_mb = self._get_file_size_mb(backup_path)
        self._cleanup_old_backups("data_export_*.json.gz", self.retention_days)
        record_total = sum(entry["record_count"] for entry in tables.values())
        return {
            "backup_file": backup_filename,
            "backup_path": str(backup_path),
            "backup_size_mb": backup_size_mb,
            "total_records": record_total,
            "tables_exported": list(tables.keys()),
            "retention_days": self.retention_days,
        }

    async def _export_table_data(self, session: AsyncSession, model_class, table_name: str) -> Dict[str, Any]:
        """Export every row of *model_class*."""
        return await self._export_query_data(session, select(model_class), table_name)

    async def _export_query_data(self, session: AsyncSession, query, table_name: str) -> Dict[str, Any]:
        """Run *query* and serialize each returned ORM object to a plain dict."""
        result = await session.execute(query)
        rows = result.scalars().all()
        serialized = [self._serialize_row(row) for row in rows]
        return {
            "table_name": table_name,
            "record_count": len(serialized),
            "export_timestamp": datetime.utcnow().isoformat(),
            "records": serialized,
        }

    @staticmethod
    def _serialize_row(record) -> Dict[str, Any]:
        """Dict form of one ORM row; prefers the model's own to_dict()."""
        if hasattr(record, 'to_dict'):
            return record.to_dict()
        fields: Dict[str, Any] = {}
        for column in record.__table__.columns:
            value = getattr(record, column.name)
            fields[column.name] = value.isoformat() if isinstance(value, datetime) else value
        return fields
class LogsBackup(BackupTask):
    """Archive the application log directory into a ``.tar.gz``."""

    def __init__(self, settings: Settings):
        super().__init__("logs_backup", settings)
        self.retention_days = settings.logs_backup_retention_days
        self.logs_directory = Path(settings.log_directory)

    async def execute_backup(self, session: AsyncSession) -> Dict[str, Any]:
        """Tar up the whole log directory; empty success result if it is absent."""
        if not self.logs_directory.exists():
            # Nothing to archive — report an empty, successful backup.
            return {
                "backup_file": None,
                "backup_path": None,
                "backup_size_mb": 0,
                "message": "Logs directory does not exist",
            }
        backup_filename = self._get_backup_filename("logs", ".tar.gz")
        backup_path = self.backup_dir / backup_filename
        # Archive from the parent so entries keep the directory name prefix.
        archive_cmd = [
            "tar", "-czf", str(backup_path),
            "-C", str(self.logs_directory.parent),
            self.logs_directory.name,
        ]
        process = await asyncio.create_subprocess_exec(
            *archive_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await process.communicate()
        if process.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown tar error"
            raise Exception(f"tar failed: {error_msg}")
        size_mb = self._get_file_size_mb(backup_path)
        # Count log files for the report (rotated files match *.log* too).
        log_file_count = len(list(self.logs_directory.glob("*.log*")))
        self._cleanup_old_backups("logs_*.tar.gz", self.retention_days)
        return {
            "backup_file": backup_filename,
            "backup_path": str(backup_path),
            "backup_size_mb": size_mb,
            "log_files_count": log_file_count,
            "retention_days": self.retention_days,
        }
class BackupManager:
    """Owns the set of backup tasks and coordinates their execution."""

    # Glob pattern used to locate each known task's artifacts on disk.
    _BACKUP_PATTERNS = {
        "database_backup": "database_full_*.sql.gz",
        "configuration_backup": "configuration_*.tar.gz",
        "data_export_backup": "data_export_*.json.gz",
        "logs_backup": "logs_*.tar.gz",
    }

    def __init__(self, settings: Settings):
        self.settings = settings
        self.db_manager = get_database_manager(settings)
        self.tasks = self._initialize_tasks()
        self.running = False                 # guards against overlapping runs
        self.last_run: Optional[datetime] = None
        self.run_count = 0
        self.total_backup_size = 0           # cumulative MB across all runs

    def _initialize_tasks(self) -> List[BackupTask]:
        """Instantiate every backup task and keep only the enabled ones."""
        candidates: List[BackupTask] = [
            DatabaseBackup(self.settings),
            ConfigurationBackup(self.settings),
            DataExportBackup(self.settings),
            LogsBackup(self.settings),
        ]
        active = [task for task in candidates if task.enabled]
        logger.info(f"Initialized {len(active)} backup tasks")
        return active

    async def run_all_tasks(self) -> Dict[str, Any]:
        """Run every enabled task once; refuses to overlap with itself."""
        if self.running:
            return {"status": "already_running", "message": "Backup already in progress"}
        self.running = True
        started = datetime.utcnow()
        try:
            logger.info("Starting backup tasks")
            results: List[Dict[str, Any]] = []
            size_this_run = 0
            async with self.db_manager.get_async_session() as session:
                for task in self.tasks:
                    if not task.enabled:
                        continue
                    outcome = await task.run(session)
                    results.append(outcome)
                    size_this_run += outcome.get("backup_size_mb", 0)
            self.last_run = started
            self.run_count += 1
            self.total_backup_size += size_this_run
            elapsed = (datetime.utcnow() - started).total_seconds()
            logger.info(
                f"Backup tasks completed: created {size_this_run:.2f} MB "
                f"in {elapsed:.2f} seconds"
            )
            return {
                "status": "completed",
                "start_time": started.isoformat(),
                "duration_seconds": elapsed,
                "total_backup_size_mb": size_this_run,
                "task_results": results,
            }
        except Exception as e:
            logger.error(f"Backup tasks failed: {e}", exc_info=True)
            return {
                "status": "error",
                "start_time": started.isoformat(),
                "duration_seconds": (datetime.utcnow() - started).total_seconds(),
                "error": str(e),
                "total_backup_size_mb": 0,
            }
        finally:
            self.running = False

    async def run_task(self, task_name: str) -> Dict[str, Any]:
        """Run one task by name, or describe why it cannot run."""
        task = next((t for t in self.tasks if t.name == task_name), None)
        if task is None:
            return {
                "status": "error",
                "error": f"Task '{task_name}' not found",
                "available_tasks": [t.name for t in self.tasks],
            }
        if not task.enabled:
            return {"status": "error", "error": f"Task '{task_name}' is disabled"}
        async with self.db_manager.get_async_session() as session:
            return await task.run(session)

    def get_stats(self) -> Dict[str, Any]:
        """Aggregate manager counters plus per-task statistics."""
        return {
            "manager": {
                "running": self.running,
                "last_run": self.last_run.isoformat() if self.last_run else None,
                "run_count": self.run_count,
                "total_backup_size_mb": self.total_backup_size,
            },
            "tasks": [task.get_stats() for task in self.tasks],
        }

    def list_backups(self) -> Dict[str, List[Dict[str, Any]]]:
        """Map each task name to its backup files on disk, newest first."""
        inventory: Dict[str, List[Dict[str, Any]]] = {}
        for task in self.tasks:
            pattern = self._BACKUP_PATTERNS.get(task.name, f"{task.name}_*")
            entries: List[Dict[str, Any]] = []
            for artifact in task.backup_dir.glob(pattern):
                info = artifact.stat()
                entries.append({
                    "filename": artifact.name,
                    "path": str(artifact),
                    "size_mb": info.st_size / (1024 * 1024),
                    "created_at": datetime.fromtimestamp(info.st_mtime).isoformat(),
                })
            entries.sort(key=lambda entry: entry["created_at"], reverse=True)
            inventory[task.name] = entries
        return inventory
# Module-level singleton so every caller shares one BackupManager.
_backup_manager: Optional[BackupManager] = None


def get_backup_manager(settings: Settings) -> BackupManager:
    """Return the process-wide BackupManager, creating it on first use.

    NOTE: *settings* is only honored on the first call; later calls return
    the existing singleton regardless of the settings passed in.
    """
    global _backup_manager
    if _backup_manager is None:
        _backup_manager = BackupManager(settings)
    return _backup_manager


async def run_periodic_backup(settings: Settings):
    """Loop forever: run all backups, then sleep for the configured interval.

    Exits cleanly on cancellation; any other failure is logged and retried
    after a fixed back-off.
    """
    manager = get_backup_manager(settings)
    while True:
        try:
            await manager.run_all_tasks()
            await asyncio.sleep(settings.backup_interval_seconds)
        except asyncio.CancelledError:
            logger.info("Periodic backup cancelled")
            break
        except Exception as e:
            logger.error(f"Periodic backup error: {e}", exc_info=True)
            await asyncio.sleep(300)  # 5-minute back-off before retrying

598
v1/src/tasks/cleanup.py Normal file
View File

@@ -0,0 +1,598 @@
"""
Periodic cleanup tasks for WiFi-DensePose API
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from contextlib import asynccontextmanager
from sqlalchemy import delete, select, func, and_, or_
from sqlalchemy.ext.asyncio import AsyncSession
from src.config.settings import Settings
from src.database.connection import get_database_manager
from src.database.models import (
CSIData, PoseDetection, SystemMetric, AuditLog, Session, Device
)
from src.logger import get_logger
logger = get_logger(__name__)
class CleanupTask:
    """Abstract base for a single cleanup job.

    Subclasses implement :meth:`execute`; :meth:`run` wraps it with timing,
    logging, bookkeeping, and uniform success/error result dicts.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True                          # manager skips disabled tasks
        self.last_run: Optional[datetime] = None
        self.run_count = 0
        self.error_count = 0
        self.total_cleaned = 0                       # lifetime count of removed items

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Perform the cleanup; subclasses must override."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Execute the task, converting any failure into an error result dict."""
        started = datetime.utcnow()

        def _elapsed_ms() -> float:
            return (datetime.utcnow() - started).total_seconds() * 1000

        try:
            logger.info(f"Starting cleanup task: {self.name}")
            outcome = await self.execute(session)
            self.last_run = started
            self.run_count += 1
            if outcome.get("cleaned_count", 0) > 0:
                self.total_cleaned += outcome["cleaned_count"]
                logger.info(
                    f"Cleanup task {self.name} completed: "
                    f"cleaned {outcome['cleaned_count']} items"
                )
            else:
                logger.debug(f"Cleanup task {self.name} completed: no items to clean")
            return {
                "task": self.name,
                "status": "success",
                "start_time": started.isoformat(),
                "duration_ms": _elapsed_ms(),
                **outcome,
            }
        except Exception as e:
            self.error_count += 1
            logger.error(f"Cleanup task {self.name} failed: {e}", exc_info=True)
            return {
                "task": self.name,
                "status": "error",
                "start_time": started.isoformat(),
                "duration_ms": _elapsed_ms(),
                "error": str(e),
                "cleaned_count": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Snapshot of this task's bookkeeping counters."""
        return {
            "name": self.name,
            "enabled": self.enabled,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
            "error_count": self.error_count,
            "total_cleaned": self.total_cleaned,
        }
class OldCSIDataCleanup(CleanupTask):
    """Delete CSI data rows older than the configured retention window."""

    def __init__(self, settings: Settings):
        super().__init__("old_csi_data_cleanup", settings)
        self.retention_days = settings.csi_data_retention_days
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Batch-delete CSIData rows created before the retention cutoff."""
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "CSI data retention disabled"}
        cutoff = datetime.utcnow() - timedelta(days=self.retention_days)
        stale = CSIData.created_at < cutoff
        total_stale = await session.scalar(select(func.count(CSIData.id)).where(stale))
        if total_stale == 0:
            return {"cleaned_count": 0, "message": "No old CSI data to clean"}
        deleted = 0
        # Bounded batches keep each delete transaction short.
        while deleted < total_stale:
            batch = (
                await session.execute(select(CSIData.id).where(stale).limit(self.batch_size))
            ).scalars().all()
            if not batch:
                break
            await session.execute(delete(CSIData).where(CSIData.id.in_(batch)))
            await session.commit()
            deleted += len(batch)
            logger.debug(f"Deleted {len(batch)} CSI data records (total: {deleted})")
            await asyncio.sleep(0.1)  # yield briefly between batches
        return {
            "cleaned_count": deleted,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff.isoformat(),
        }
class OldPoseDetectionCleanup(CleanupTask):
    """Delete pose detection rows older than the configured retention window."""

    def __init__(self, settings: Settings):
        super().__init__("old_pose_detection_cleanup", settings)
        self.retention_days = settings.pose_detection_retention_days
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Batch-delete PoseDetection rows created before the retention cutoff."""
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Pose detection retention disabled"}
        cutoff = datetime.utcnow() - timedelta(days=self.retention_days)
        stale = PoseDetection.created_at < cutoff
        total_stale = await session.scalar(select(func.count(PoseDetection.id)).where(stale))
        if total_stale == 0:
            return {"cleaned_count": 0, "message": "No old pose detections to clean"}
        deleted = 0
        # Bounded batches keep each delete transaction short.
        while deleted < total_stale:
            batch = (
                await session.execute(select(PoseDetection.id).where(stale).limit(self.batch_size))
            ).scalars().all()
            if not batch:
                break
            await session.execute(delete(PoseDetection).where(PoseDetection.id.in_(batch)))
            await session.commit()
            deleted += len(batch)
            logger.debug(f"Deleted {len(batch)} pose detection records (total: {deleted})")
            await asyncio.sleep(0.1)  # yield briefly between batches
        return {
            "cleaned_count": deleted,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff.isoformat(),
        }
class OldMetricsCleanup(CleanupTask):
    """Delete system metric rows older than the configured retention window."""

    def __init__(self, settings: Settings):
        super().__init__("old_metrics_cleanup", settings)
        self.retention_days = settings.metrics_retention_days
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Batch-delete SystemMetric rows created before the retention cutoff."""
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Metrics retention disabled"}
        cutoff = datetime.utcnow() - timedelta(days=self.retention_days)
        stale = SystemMetric.created_at < cutoff
        total_stale = await session.scalar(select(func.count(SystemMetric.id)).where(stale))
        if total_stale == 0:
            return {"cleaned_count": 0, "message": "No old metrics to clean"}
        deleted = 0
        # Bounded batches keep each delete transaction short.
        while deleted < total_stale:
            batch = (
                await session.execute(select(SystemMetric.id).where(stale).limit(self.batch_size))
            ).scalars().all()
            if not batch:
                break
            await session.execute(delete(SystemMetric).where(SystemMetric.id.in_(batch)))
            await session.commit()
            deleted += len(batch)
            logger.debug(f"Deleted {len(batch)} metric records (total: {deleted})")
            await asyncio.sleep(0.1)  # yield briefly between batches
        return {
            "cleaned_count": deleted,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff.isoformat(),
        }
class OldAuditLogCleanup(CleanupTask):
    """Delete audit log rows older than the configured retention window."""

    def __init__(self, settings: Settings):
        super().__init__("old_audit_log_cleanup", settings)
        self.retention_days = settings.audit_log_retention_days
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Batch-delete AuditLog rows created before the retention cutoff."""
        if self.retention_days <= 0:
            return {"cleaned_count": 0, "message": "Audit log retention disabled"}
        cutoff = datetime.utcnow() - timedelta(days=self.retention_days)
        stale = AuditLog.created_at < cutoff
        total_stale = await session.scalar(select(func.count(AuditLog.id)).where(stale))
        if total_stale == 0:
            return {"cleaned_count": 0, "message": "No old audit logs to clean"}
        deleted = 0
        # Bounded batches keep each delete transaction short.
        while deleted < total_stale:
            batch = (
                await session.execute(select(AuditLog.id).where(stale).limit(self.batch_size))
            ).scalars().all()
            if not batch:
                break
            await session.execute(delete(AuditLog).where(AuditLog.id.in_(batch)))
            await session.commit()
            deleted += len(batch)
            logger.debug(f"Deleted {len(batch)} audit log records (total: {deleted})")
            await asyncio.sleep(0.1)  # yield briefly between batches
        return {
            "cleaned_count": deleted,
            "retention_days": self.retention_days,
            "cutoff_date": cutoff.isoformat(),
        }
class OrphanedSessionCleanup(CleanupTask):
    """Cleanup orphaned sessions (sessions without associated data).

    A session is orphaned when it is older than the threshold, in a terminal
    status, and referenced by no CSI data and no pose detections.
    """

    def __init__(self, settings: Settings):
        super().__init__("orphaned_session_cleanup", settings)
        self.orphan_threshold_days = settings.orphaned_session_threshold_days
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Delete terminal sessions past the threshold with no dependent rows.

        Returns:
            Dict with ``cleaned_count``, the threshold, and the cutoff date.
        """
        if self.orphan_threshold_days <= 0:
            return {"cleaned_count": 0, "message": "Orphaned session cleanup disabled"}
        cutoff_date = datetime.utcnow() - timedelta(days=self.orphan_threshold_days)
        # A NULL inside a NOT IN subquery makes the whole predicate UNKNOWN,
        # which would silently match zero sessions. Both subqueries therefore
        # exclude NULL session_ids (the PoseDetection one previously lacked
        # this guard and could disable the cleanup entirely).
        orphaned_sessions_query = select(Session.id).where(
            and_(
                Session.created_at < cutoff_date,
                Session.status.in_(["completed", "failed", "cancelled"]),
                ~Session.id.in_(
                    select(CSIData.session_id).where(CSIData.session_id.isnot(None))
                ),
                ~Session.id.in_(
                    select(PoseDetection.session_id).where(PoseDetection.session_id.isnot(None))
                ),
            )
        )
        result = await session.execute(orphaned_sessions_query)
        orphaned_ids = [row[0] for row in result.fetchall()]
        if not orphaned_ids:
            return {"cleaned_count": 0, "message": "No orphaned sessions to clean"}
        # Delete all orphans in one statement; the candidate set is already
        # bounded by the age/status filters above.
        await session.execute(delete(Session).where(Session.id.in_(orphaned_ids)))
        await session.commit()
        return {
            "cleaned_count": len(orphaned_ids),
            "orphan_threshold_days": self.orphan_threshold_days,
            "cutoff_date": cutoff_date.isoformat(),
        }
class InvalidDataCleanup(CleanupTask):
    """Delete CSI and pose-detection rows flagged invalid or out of range.

    "Invalid" means the row's own validity flag is false, required payload
    columns are NULL, or numeric fields fall outside their legal range.
    """

    def __init__(self, settings: Settings):
        super().__init__("invalid_data_cleanup", settings)
        self.batch_size = settings.cleanup_batch_size

    async def execute(self, session: AsyncSession) -> Dict[str, Any]:
        """Remove invalid CSIData and PoseDetection rows in one pass.

        Returns:
            Dict with the combined ``cleaned_count`` plus per-table counts.
        """
        total_cleaned = 0
        # Invalid CSI rows: flag false, missing payload, or non-positive
        # physical parameters. Use .is_() for NULL/boolean comparisons —
        # idiomatic SQLAlchemy, same emitted SQL as `== None` / `== False`
        # but without tripping E711/E712 linting.
        invalid_csi_query = select(CSIData.id).where(
            or_(
                CSIData.is_valid.is_(False),
                CSIData.amplitude.is_(None),
                CSIData.phase.is_(None),
                CSIData.frequency <= 0,
                CSIData.bandwidth <= 0,
                CSIData.num_subcarriers <= 0,
            )
        )
        result = await session.execute(invalid_csi_query)
        invalid_csi_ids = [row[0] for row in result.fetchall()]
        if invalid_csi_ids:
            await session.execute(delete(CSIData).where(CSIData.id.in_(invalid_csi_ids)))
            total_cleaned += len(invalid_csi_ids)
            logger.debug(f"Deleted {len(invalid_csi_ids)} invalid CSI data records")
        # Invalid pose rows: flag false, negative person count, or a
        # confidence outside [0, 1] (NULL confidence is allowed).
        invalid_pose_query = select(PoseDetection.id).where(
            or_(
                PoseDetection.is_valid.is_(False),
                PoseDetection.person_count < 0,
                and_(
                    PoseDetection.detection_confidence.isnot(None),
                    or_(
                        PoseDetection.detection_confidence < 0,
                        PoseDetection.detection_confidence > 1,
                    ),
                ),
            )
        )
        result = await session.execute(invalid_pose_query)
        invalid_pose_ids = [row[0] for row in result.fetchall()]
        if invalid_pose_ids:
            await session.execute(delete(PoseDetection).where(PoseDetection.id.in_(invalid_pose_ids)))
            total_cleaned += len(invalid_pose_ids)
            logger.debug(f"Deleted {len(invalid_pose_ids)} invalid pose detection records")
        # Single commit covers both deletes (harmless no-op if nothing matched).
        await session.commit()
        return {
            "cleaned_count": total_cleaned,
            "invalid_csi_count": len(invalid_csi_ids) if invalid_csi_ids else 0,
            "invalid_pose_count": len(invalid_pose_ids) if invalid_pose_ids else 0,
        }
class CleanupManager:
"""Manager for all cleanup tasks."""
def __init__(self, settings: Settings):
self.settings = settings
self.db_manager = get_database_manager(settings)
self.tasks = self._initialize_tasks()
self.running = False
self.last_run = None
self.run_count = 0
self.total_cleaned = 0
def _initialize_tasks(self) -> List[CleanupTask]:
"""Initialize all cleanup tasks."""
tasks = [
OldCSIDataCleanup(self.settings),
OldPoseDetectionCleanup(self.settings),
OldMetricsCleanup(self.settings),
OldAuditLogCleanup(self.settings),
OrphanedSessionCleanup(self.settings),
InvalidDataCleanup(self.settings),
]
# Filter enabled tasks
enabled_tasks = [task for task in tasks if task.enabled]
logger.info(f"Initialized {len(enabled_tasks)} cleanup tasks")
return enabled_tasks
async def run_all_tasks(self) -> Dict[str, Any]:
"""Run all cleanup tasks."""
if self.running:
return {"status": "already_running", "message": "Cleanup already in progress"}
self.running = True
start_time = datetime.utcnow()
try:
logger.info("Starting cleanup tasks")
results = []
total_cleaned = 0
async with self.db_manager.get_async_session() as session:
for task in self.tasks:
if not task.enabled:
continue
result = await task.run(session)
results.append(result)
total_cleaned += result.get("cleaned_count", 0)
self.last_run = start_time
self.run_count += 1
self.total_cleaned += total_cleaned
duration = (datetime.utcnow() - start_time).total_seconds()
logger.info(
f"Cleanup tasks completed: cleaned {total_cleaned} items "
f"in {duration:.2f} seconds"
)
return {
"status": "completed",
"start_time": start_time.isoformat(),
"duration_seconds": duration,
"total_cleaned": total_cleaned,
"task_results": results,
}
except Exception as e:
logger.error(f"Cleanup tasks failed: {e}", exc_info=True)
return {
"status": "error",
"start_time": start_time.isoformat(),
"duration_seconds": (datetime.utcnow() - start_time).total_seconds(),
"error": str(e),
"total_cleaned": 0,
}
finally:
self.running = False
async def run_task(self, task_name: str) -> Dict[str, Any]:
    """Run a single cleanup task selected by name.

    Returns an error payload when the name is unknown or the task is
    disabled; otherwise delegates to the task with a fresh session.
    """
    selected = None
    for candidate in self.tasks:
        if candidate.name == task_name:
            selected = candidate
            break

    if selected is None:
        return {
            "status": "error",
            "error": f"Task '{task_name}' not found",
            "available_tasks": [t.name for t in self.tasks],
        }
    if not selected.enabled:
        return {
            "status": "error",
            "error": f"Task '{task_name}' is disabled",
        }

    async with self.db_manager.get_async_session() as session:
        return await selected.run(session)
def get_stats(self) -> Dict[str, Any]:
    """Return a snapshot of manager-level counters plus per-task stats."""
    last = self.last_run
    manager_stats = {
        "running": self.running,
        "last_run": None if last is None else last.isoformat(),
        "run_count": self.run_count,
        "total_cleaned": self.total_cleaned,
    }
    return {
        "manager": manager_stats,
        "tasks": [t.get_stats() for t in self.tasks],
    }
def enable_task(self, task_name: str) -> bool:
    """Mark the named task enabled; return False when no such task exists."""
    for task in self.tasks:
        if task.name == task_name:
            task.enabled = True
            return True
    return False
def disable_task(self, task_name: str) -> bool:
    """Mark the named task disabled; return False when no such task exists."""
    for task in self.tasks:
        if task.name == task_name:
            task.enabled = False
            return True
    return False
# Global cleanup manager instance (process-wide singleton, built lazily)
_cleanup_manager: Optional[CleanupManager] = None


def get_cleanup_manager(settings: Settings) -> CleanupManager:
    """Return the process-wide CleanupManager, creating it on first use.

    Note: ``settings`` is only consulted on the first call; later calls
    return the already-built singleton unchanged.
    """
    global _cleanup_manager
    manager = _cleanup_manager
    if manager is None:
        manager = CleanupManager(settings)
        _cleanup_manager = manager
    return manager
async def run_periodic_cleanup(settings: Settings):
    """Loop forever, running all cleanup tasks at the configured interval.

    Exits cleanly on task cancellation; any other failure is logged and
    the loop retries after a one-minute back-off.
    """
    manager = get_cleanup_manager(settings)
    while True:
        try:
            await manager.run_all_tasks()
            # Sleep until the next scheduled cleanup window.
            await asyncio.sleep(settings.cleanup_interval_seconds)
        except asyncio.CancelledError:
            logger.info("Periodic cleanup cancelled")
            return
        except Exception as exc:
            logger.error(f"Periodic cleanup error: {exc}", exc_info=True)
            # Back off briefly before trying again.
            await asyncio.sleep(60)

773
v1/src/tasks/monitoring.py Normal file
View File

@@ -0,0 +1,773 @@
"""
Monitoring tasks for WiFi-DensePose API
"""
import asyncio
import logging
import psutil
import time
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from contextlib import asynccontextmanager
from sqlalchemy import select, func, and_, or_
from sqlalchemy.ext.asyncio import AsyncSession
from src.config.settings import Settings
from src.database.connection import get_database_manager
from src.database.models import SystemMetric, Device, Session, CSIData, PoseDetection
from src.logger import get_logger
logger = get_logger(__name__)
class MonitoringTask:
    """Base class for monitoring tasks.

    Subclasses implement :meth:`collect_metrics`; :meth:`run` persists the
    collected metrics as ``SystemMetric`` rows and tracks task statistics.
    """

    def __init__(self, name: str, settings: Settings):
        self.name = name
        self.settings = settings
        self.enabled = True
        self.last_run = None  # datetime of last successful run, or None
        self.run_count = 0  # number of successful runs
        self.error_count = 0  # number of failed runs
        self.interval_seconds = 60  # Default interval

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect metrics for this task. Must be overridden by subclasses."""
        raise NotImplementedError

    async def run(self, session: AsyncSession) -> Dict[str, Any]:
        """Run the monitoring task with error handling.

        Persists each collected metric as a ``SystemMetric`` row and commits
        once. On failure the session is rolled back, because the manager
        shares one session across all tasks and partially-added rows would
        otherwise leak into a later task's commit.

        Returns:
            A summary dict with status, timing, and metric count (plus an
            ``error`` message on failure).
        """
        start_time = datetime.utcnow()
        try:
            logger.debug(f"Starting monitoring task: {self.name}")
            metrics = await self.collect_metrics(session)
            # Store metrics in database
            for metric_data in metrics:
                metric = SystemMetric(
                    metric_name=metric_data["name"],
                    metric_type=metric_data["type"],
                    value=metric_data["value"],
                    unit=metric_data.get("unit"),
                    labels=metric_data.get("labels"),
                    tags=metric_data.get("tags"),
                    source=metric_data.get("source", self.name),
                    component=metric_data.get("component"),
                    description=metric_data.get("description"),
                    meta_data=metric_data.get("metadata"),
                )
                session.add(metric)
            await session.commit()
            self.last_run = start_time
            self.run_count += 1
            logger.debug(f"Monitoring task {self.name} completed: collected {len(metrics)} metrics")
            return {
                "task": self.name,
                "status": "success",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "metrics_collected": len(metrics),
            }
        except Exception as e:
            self.error_count += 1
            logger.error(f"Monitoring task {self.name} failed: {e}", exc_info=True)
            # Discard anything added to the shared session before the
            # failure so the next task starts from a clean session state.
            try:
                await session.rollback()
            except Exception:
                logger.warning(f"Session rollback failed after {self.name} error", exc_info=True)
            return {
                "task": self.name,
                "status": "error",
                "start_time": start_time.isoformat(),
                "duration_ms": (datetime.utcnow() - start_time).total_seconds() * 1000,
                "error": str(e),
                "metrics_collected": 0,
            }

    def get_stats(self) -> Dict[str, Any]:
        """Get task statistics (name, enabled flag, interval, run counters)."""
        return {
            "name": self.name,
            "enabled": self.enabled,
            "interval_seconds": self.interval_seconds,
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "run_count": self.run_count,
            "error_count": self.error_count,
        }
class SystemResourceMonitoring(MonitoringTask):
    """Monitor system resources (CPU, memory, disk, network)."""

    def __init__(self, settings: Settings):
        super().__init__("system_resources", settings)
        self.interval_seconds = settings.system_monitoring_interval

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Sample host CPU, memory, disk and network figures via psutil.

        Every sample shares the same collection timestamp in its metadata.
        Note: ``psutil.cpu_percent(interval=1)`` blocks for one second.
        """
        now = datetime.utcnow()

        def sample(name, value, unit, component, description, mtype="gauge"):
            # Uniform metric record; timestamp is shared across the batch.
            return {
                "name": name,
                "type": mtype,
                "value": value,
                "unit": unit,
                "component": component,
                "description": description,
                "metadata": {"timestamp": now.isoformat()},
            }

        samples: List[Dict[str, Any]] = []

        # CPU figures.
        cpu_pct = psutil.cpu_percent(interval=1)
        core_count = psutil.cpu_count()
        freq = psutil.cpu_freq()
        samples.append(sample("system_cpu_usage_percent", cpu_pct, "percent",
                              "cpu", "CPU usage percentage"))
        samples.append(sample("system_cpu_count", core_count, "count",
                              "cpu", "Number of CPU cores"))
        if freq:
            samples.append(sample("system_cpu_frequency_mhz", freq.current, "mhz",
                                  "cpu", "Current CPU frequency"))

        # Memory and swap figures.
        vmem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        samples.append(sample("system_memory_total_bytes", vmem.total, "bytes",
                              "memory", "Total system memory"))
        samples.append(sample("system_memory_used_bytes", vmem.used, "bytes",
                              "memory", "Used system memory"))
        samples.append(sample("system_memory_available_bytes", vmem.available, "bytes",
                              "memory", "Available system memory"))
        samples.append(sample("system_memory_usage_percent", vmem.percent, "percent",
                              "memory", "Memory usage percentage"))
        samples.append(sample("system_swap_total_bytes", swap_mem.total, "bytes",
                              "memory", "Total swap memory"))
        samples.append(sample("system_swap_used_bytes", swap_mem.used, "bytes",
                              "memory", "Used swap memory"))

        # Disk usage and (when available) I/O counters.
        usage = psutil.disk_usage('/')
        io_counters = psutil.disk_io_counters()
        samples.append(sample("system_disk_total_bytes", usage.total, "bytes",
                              "disk", "Total disk space"))
        samples.append(sample("system_disk_used_bytes", usage.used, "bytes",
                              "disk", "Used disk space"))
        samples.append(sample("system_disk_free_bytes", usage.free, "bytes",
                              "disk", "Free disk space"))
        samples.append(sample("system_disk_usage_percent",
                              (usage.used / usage.total) * 100, "percent",
                              "disk", "Disk usage percentage"))
        if io_counters:
            samples.append(sample("system_disk_read_bytes_total", io_counters.read_bytes,
                                  "bytes", "disk", "Total bytes read from disk",
                                  mtype="counter"))
            samples.append(sample("system_disk_write_bytes_total", io_counters.write_bytes,
                                  "bytes", "disk", "Total bytes written to disk",
                                  mtype="counter"))

        # Network I/O counters (may be unavailable on some platforms).
        net = psutil.net_io_counters()
        if net:
            samples.append(sample("system_network_bytes_sent_total", net.bytes_sent,
                                  "bytes", "network", "Total bytes sent over network",
                                  mtype="counter"))
            samples.append(sample("system_network_bytes_recv_total", net.bytes_recv,
                                  "bytes", "network", "Total bytes received over network",
                                  mtype="counter"))
            samples.append(sample("system_network_packets_sent_total", net.packets_sent,
                                  "count", "network", "Total packets sent over network",
                                  mtype="counter"))
            samples.append(sample("system_network_packets_recv_total", net.packets_recv,
                                  "count", "network", "Total packets received over network",
                                  mtype="counter"))

        return samples
class DatabaseMonitoring(MonitoringTask):
    """Monitor database performance and statistics."""

    def __init__(self, settings: Settings):
        super().__init__("database", settings)
        self.interval_seconds = settings.database_monitoring_interval

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect connection-pool statistics and per-table row counts."""
        now = datetime.utcnow()

        def gauge(name, value, component, description, extra=None):
            # All database metrics are count-valued gauges.
            record = {
                "name": name,
                "type": "gauge",
                "value": value,
                "unit": "count",
                "component": component,
                "description": description,
                "metadata": {"timestamp": now.isoformat()},
            }
            if extra:
                record["metadata"].update(extra)
            return record

        collected: List[Dict[str, Any]] = []

        db_manager = get_database_manager(self.settings)
        stats = await db_manager.get_connection_stats()

        # PostgreSQL pool figures.
        if "postgresql" in stats:
            pg = stats["postgresql"]
            collected.append(gauge("database_connections_total",
                                   pg.get("total_connections", 0), "postgresql",
                                   "Total database connections"))
            collected.append(gauge("database_connections_active",
                                   pg.get("checked_out", 0), "postgresql",
                                   "Active database connections"))
            collected.append(gauge("database_connections_available",
                                   pg.get("available_connections", 0), "postgresql",
                                   "Available database connections"))

        # Redis client figures, skipped when the stats carry an error.
        if "redis" in stats and not stats["redis"].get("error"):
            redis_info = stats["redis"]
            collected.append(gauge("redis_connections_active",
                                   redis_info.get("connected_clients", 0), "redis",
                                   "Active Redis connections"))
            collected.append(gauge("redis_connections_blocked",
                                   redis_info.get("blocked_clients", 0), "redis",
                                   "Blocked Redis connections"))

        # One gauge per application table.
        row_counts = await self._get_table_counts(session)
        for table, rows in row_counts.items():
            collected.append(gauge(
                f"database_table_rows_{table}", rows, "postgresql",
                f"Number of rows in {table} table", extra={"table": table},
            ))

        return collected

    async def _get_table_counts(self, session: AsyncSession) -> Dict[str, int]:
        """Return row counts for the main application tables."""
        id_columns = [
            ("devices", Device.id),
            ("sessions", Session.id),
            ("csi_data", CSIData.id),
            ("pose_detections", PoseDetection.id),
            ("system_metrics", SystemMetric.id),
        ]
        counts: Dict[str, int] = {}
        for label, column in id_columns:
            result = await session.execute(select(func.count(column)))
            counts[label] = result.scalar() or 0
        return counts
class ApplicationMonitoring(MonitoringTask):
    """Monitor application-specific metrics."""

    def __init__(self, settings: Settings):
        super().__init__("application", settings)
        self.interval_seconds = settings.application_monitoring_interval
        self.start_time = datetime.utcnow()  # reference point for uptime

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Collect uptime, activity counts and processing-status gauges."""
        now = datetime.utcnow()

        def gauge(name, value, unit, description, extra=None):
            # All application metrics share component and timestamp.
            record = {
                "name": name,
                "type": "gauge",
                "value": value,
                "unit": unit,
                "component": "application",
                "description": description,
                "metadata": {"timestamp": now.isoformat()},
            }
            if extra:
                record["metadata"].update(extra)
            return record

        async def count_of(query):
            # Execute a COUNT(...) query, coercing NULL to zero.
            result = await session.execute(query)
            return result.scalar() or 0

        collected: List[Dict[str, Any]] = []

        # How long this monitoring task has been alive.
        collected.append(gauge(
            "application_uptime_seconds",
            (now - self.start_time).total_seconds(),
            "seconds", "Application uptime in seconds",
        ))

        active_sessions = await count_of(
            select(func.count(Session.id)).where(Session.status == "active")
        )
        collected.append(gauge("application_active_sessions", active_sessions,
                               "count", "Number of active sessions"))

        active_devices = await count_of(
            select(func.count(Device.id)).where(Device.status == "active")
        )
        collected.append(gauge("application_active_devices", active_devices,
                               "count", "Number of active devices"))

        # Throughput over the trailing hour.
        hour_ago = now - timedelta(hours=1)
        csi_last_hour = await count_of(
            select(func.count(CSIData.id)).where(CSIData.created_at >= hour_ago)
        )
        collected.append(gauge("application_csi_data_hourly", csi_last_hour,
                               "count", "CSI data records created in the last hour"))

        poses_last_hour = await count_of(
            select(func.count(PoseDetection.id)).where(PoseDetection.created_at >= hour_ago)
        )
        collected.append(gauge("application_pose_detections_hourly", poses_last_hour,
                               "count", "Pose detections created in the last hour"))

        # Breakdown of the CSI processing pipeline by status.
        for status in ("pending", "processing", "completed", "failed"):
            status_total = await count_of(
                select(func.count(CSIData.id)).where(CSIData.processing_status == status)
            )
            collected.append(gauge(
                f"application_csi_processing_{status}", status_total,
                "count", f"CSI data records with {status} processing status",
                extra={"status": status},
            ))

        return collected
class PerformanceMonitoring(MonitoringTask):
    """Monitor performance metrics and response times."""

    def __init__(self, settings: Settings):
        super().__init__("performance", settings)
        self.interval_seconds = settings.performance_monitoring_interval
        self.response_times = []  # recent API response-time samples (ms)
        self.error_counts = {}  # error type -> occurrence count

    async def collect_metrics(self, session: AsyncSession) -> List[Dict[str, Any]]:
        """Measure DB query latency and report API response/error stats."""
        now = datetime.utcnow()

        def record(name, value, unit, component, description, mtype="gauge", extra=None):
            payload = {
                "name": name,
                "type": mtype,
                "value": value,
                "unit": unit,
                "component": component,
                "description": description,
                "metadata": {"timestamp": now.isoformat()},
            }
            if extra:
                payload["metadata"].update(extra)
            return payload

        collected: List[Dict[str, Any]] = []

        # Probe the database with a cheap COUNT query and time it.
        began = time.time()
        await session.execute(select(func.count(Device.id)))
        elapsed_ms = (time.time() - began) * 1000  # Convert to milliseconds
        collected.append(record("performance_database_query_time_ms", elapsed_ms,
                                "milliseconds", "database",
                                "Database query response time"))

        # Average of the recorded API response times, if any were recorded.
        if self.response_times:
            mean_ms = sum(self.response_times) / len(self.response_times)
            collected.append(record("performance_avg_response_time_ms", mean_ms,
                                    "milliseconds", "api",
                                    "Average API response time"))
            # Keep only the most recent 100 samples.
            self.response_times = self.response_times[-100:]

        # One cumulative counter per observed error type.
        for error_type, total in self.error_counts.items():
            collected.append(record(
                f"performance_errors_{error_type}_total", total, "count", "api",
                f"Total {error_type} errors", mtype="counter",
                extra={"error_type": error_type},
            ))

        return collected

    def record_response_time(self, response_time_ms: float):
        """Record an API response-time sample (milliseconds)."""
        self.response_times.append(response_time_ms)

    def record_error(self, error_type: str):
        """Increment the occurrence counter for the given error type."""
        self.error_counts[error_type] = self.error_counts.get(error_type, 0) + 1
class MonitoringManager:
    """Coordinates the full set of monitoring tasks.

    Owns the task instances, runs them against a shared database session,
    and tracks aggregate run statistics.
    """

    def __init__(self, settings: Settings):
        self.settings = settings
        self.db_manager = get_database_manager(settings)
        self.tasks = self._initialize_tasks()
        self.running = False
        self.last_run = None
        self.run_count = 0

    def _initialize_tasks(self) -> List[MonitoringTask]:
        """Build the task list, keeping only the tasks that are enabled."""
        candidates = [
            SystemResourceMonitoring(self.settings),
            DatabaseMonitoring(self.settings),
            ApplicationMonitoring(self.settings),
            PerformanceMonitoring(self.settings),
        ]
        # Filter enabled tasks
        active = [task for task in candidates if task.enabled]
        logger.info(f"Initialized {len(active)} monitoring tasks")
        return active

    async def run_all_tasks(self) -> Dict[str, Any]:
        """Run every enabled task once; returns an aggregate summary.

        Guards against concurrent invocations via the ``running`` flag;
        all tasks share a single database session for the run.
        """
        if self.running:
            return {"status": "already_running", "message": "Monitoring already in progress"}

        self.running = True
        started = datetime.utcnow()
        try:
            logger.debug("Starting monitoring tasks")
            outcomes = []
            collected = 0
            async with self.db_manager.get_async_session() as session:
                for task in self.tasks:
                    if not task.enabled:
                        continue
                    outcome = await task.run(session)
                    outcomes.append(outcome)
                    collected += outcome.get("metrics_collected", 0)

            self.last_run = started
            self.run_count += 1
            elapsed = (datetime.utcnow() - started).total_seconds()
            logger.debug(
                f"Monitoring tasks completed: collected {collected} metrics "
                f"in {elapsed:.2f} seconds"
            )
            return {
                "status": "completed",
                "start_time": started.isoformat(),
                "duration_seconds": elapsed,
                "total_metrics": collected,
                "task_results": outcomes,
            }
        except Exception as e:
            logger.error(f"Monitoring tasks failed: {e}", exc_info=True)
            return {
                "status": "error",
                "start_time": started.isoformat(),
                "duration_seconds": (datetime.utcnow() - started).total_seconds(),
                "error": str(e),
                "total_metrics": 0,
            }
        finally:
            self.running = False

    async def run_task(self, task_name: str) -> Dict[str, Any]:
        """Run a single monitoring task by name, or return an error payload."""
        selected = None
        for candidate in self.tasks:
            if candidate.name == task_name:
                selected = candidate
                break
        if selected is None:
            return {
                "status": "error",
                "error": f"Task '{task_name}' not found",
                "available_tasks": [t.name for t in self.tasks],
            }
        if not selected.enabled:
            return {
                "status": "error",
                "error": f"Task '{task_name}' is disabled",
            }
        async with self.db_manager.get_async_session() as session:
            return await selected.run(session)

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of manager counters plus per-task statistics."""
        last = self.last_run
        return {
            "manager": {
                "running": self.running,
                "last_run": None if last is None else last.isoformat(),
                "run_count": self.run_count,
            },
            "tasks": [task.get_stats() for task in self.tasks],
        }

    def get_performance_task(self) -> Optional[PerformanceMonitoring]:
        """Return the PerformanceMonitoring task instance, if present."""
        for task in self.tasks:
            if isinstance(task, PerformanceMonitoring):
                return task
        return None
# Global monitoring manager instance (process-wide singleton, built lazily)
_monitoring_manager: Optional[MonitoringManager] = None


def get_monitoring_manager(settings: Settings) -> MonitoringManager:
    """Return the process-wide MonitoringManager, creating it on first use.

    Note: ``settings`` is only consulted on the first call; later calls
    return the already-built singleton unchanged.
    """
    global _monitoring_manager
    manager = _monitoring_manager
    if manager is None:
        manager = MonitoringManager(settings)
        _monitoring_manager = manager
    return manager
async def run_periodic_monitoring(settings: Settings):
    """Loop forever, collecting metrics at the configured interval.

    Exits cleanly on task cancellation; any other failure is logged and
    the loop retries after a 30-second back-off.
    """
    manager = get_monitoring_manager(settings)
    while True:
        try:
            await manager.run_all_tasks()
            # Sleep until the next scheduled collection window.
            await asyncio.sleep(settings.monitoring_interval_seconds)
        except asyncio.CancelledError:
            logger.info("Periodic monitoring cancelled")
            return
        except Exception as exc:
            logger.error(f"Periodic monitoring error: {exc}", exc_info=True)
            # Brief back-off before retrying.
            await asyncio.sleep(30)