Major changes: - Organized Python v1 implementation into v1/ subdirectory - Created Rust workspace with 9 modular crates: - wifi-densepose-core: Core types, traits, errors - wifi-densepose-signal: CSI processing, phase sanitization, FFT - wifi-densepose-nn: Neural network inference (ONNX/Candle/tch) - wifi-densepose-api: Axum-based REST/WebSocket API - wifi-densepose-db: SQLx database layer - wifi-densepose-config: Configuration management - wifi-densepose-hardware: Hardware abstraction - wifi-densepose-wasm: WebAssembly bindings - wifi-densepose-cli: Command-line interface Documentation: - ADR-001: Workspace structure - ADR-002: Signal processing library selection - ADR-003: Neural network inference strategy - DDD domain model with bounded contexts Testing: - 69 tests passing across all crates - Signal processing: 45 tests - Neural networks: 21 tests - Core: 3 doc tests Performance targets: - 10x faster CSI processing (~0.5ms vs ~5ms) - 5x lower memory usage (~100MB vs ~500MB) - WASM support for browser deployment
465 lines
16 KiB
Python
465 lines
16 KiB
Python
"""
|
|
Health check service for WiFi-DensePose API
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, timedelta
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
|
|
from src.config.settings import Settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HealthStatus(Enum):
    """Closed set of health states a service or check can report.

    The member values are the lowercase strings emitted in health reports.
    """

    # Check passed.
    HEALTHY = "healthy"
    # Service responded but did not report itself as healthy.
    DEGRADED = "degraded"
    # Check failed or raised an error.
    UNHEALTHY = "unhealthy"
    # No check has run yet, or the state could not be determined.
    UNKNOWN = "unknown"
|
|
|
|
|
|
@dataclass
class HealthCheck:
    """Outcome of a single health probe against one service."""

    # Key of the service that was probed.
    name: str
    # Status the probe resolved to.
    status: HealthStatus
    # Human-readable summary of the outcome.
    message: str
    # Moment this record was created (naive datetime from datetime.utcnow).
    # NOTE(review): utcnow() is deprecated in newer Python versions; moving to
    # an aware datetime would change the isoformat() output seen by callers.
    timestamp: datetime = field(default_factory=datetime.utcnow)
    # Time the probe took, in milliseconds.
    duration_ms: float = 0.0
    # Free-form, check-specific diagnostic data.
    details: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ServiceHealth:
    """Rolling health record kept for one monitored service."""

    # Key identifying the service.
    name: str
    # Status taken from the most recent check.
    status: HealthStatus
    # Timestamp of the most recent check, or None if none has run yet.
    last_check: Optional[datetime] = None
    # Recent check results, oldest first.
    checks: List[HealthCheck] = field(default_factory=list)
    # Uptime value in seconds, as recorded when the record was last updated.
    uptime: float = 0.0
    # Number of checks recorded with an unhealthy outcome.
    error_count: int = 0
    # Message of the most recent unhealthy check, if any.
    last_error: Optional[str] = None
|
|
|
|
|
|
class HealthCheckService:
    """Service for monitoring application health.

    Runs one probe per monitored service (API, database, Redis, hardware,
    pose, stream), keeps a short per-service history of the results, and
    exposes aggregated and per-service summaries as plain dictionaries.
    """

    # Maximum number of check results retained per service.
    _MAX_CHECK_HISTORY = 10
    # Number of most recent checks included in a per-service report.
    _RECENT_CHECKS_REPORTED = 5

    def __init__(self, settings: Settings):
        """Create the service without starting it.

        Args:
            settings: Application settings; the Redis check calls
                ``get_redis_url()`` on it.
        """
        self.settings = settings
        self._services: Dict[str, ServiceHealth] = {}
        # Wall-clock reference point for every reported uptime value.
        self._start_time = time.time()
        self._initialized = False
        self._running = False

    async def initialize(self):
        """Initialize health check service.

        Registers every monitored service with an ``UNKNOWN`` status. Calling
        this again after a successful initialization does nothing.
        """
        if self._initialized:
            return

        logger.info("Initializing health check service")

        # Every service starts as UNKNOWN until its first check completes.
        monitored = ("api", "database", "redis", "hardware", "pose", "stream")
        self._services = {
            name: ServiceHealth(name, HealthStatus.UNKNOWN) for name in monitored
        }

        self._initialized = True
        logger.info("Health check service initialized")

    async def start(self):
        """Start health check service, initializing it first if needed."""
        if not self._initialized:
            await self.initialize()

        self._running = True
        logger.info("Health check service started")

    async def shutdown(self):
        """Shutdown health check service.

        Only clears the running flag; recorded service history is kept.
        """
        self._running = False
        logger.info("Health check service shut down")

    async def perform_health_checks(self) -> Dict[str, HealthCheck]:
        """Perform all health checks concurrently.

        Returns:
            Mapping of service name to its ``HealthCheck`` result, or an
            empty dict when the service is not running. Each result is also
            recorded in the per-service history.
        """
        if not self._running:
            return {}

        logger.debug("Performing health checks")

        # Name and coroutine are declared together so they cannot drift
        # out of sync; dict order fixes the pairing with gather's results.
        pending = {
            "api": self._check_api_health(),
            "database": self._check_database_health(),
            "redis": self._check_redis_health(),
            "hardware": self._check_hardware_health(),
            "pose": self._check_pose_health(),
            "stream": self._check_stream_health(),
        }

        outcomes = await asyncio.gather(*pending.values(), return_exceptions=True)

        results: Dict[str, HealthCheck] = {}
        for check_name, outcome in zip(pending, outcomes):
            # BaseException rather than Exception: gather can hand back a
            # CancelledError for a cancelled child, and that is not an
            # Exception subclass on current Python versions.
            if isinstance(outcome, BaseException):
                health_check = HealthCheck(
                    name=check_name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Health check failed: {outcome}",
                )
            else:
                health_check = outcome

            results[check_name] = health_check
            self._update_service_health(check_name, health_check)

        logger.debug("Completed %d health checks", len(results))
        return results

    async def _run_check(self, name: str, label: str, probe) -> HealthCheck:
        """Run one probe and wrap its outcome in a timed ``HealthCheck``.

        Args:
            name: Service key stored on the resulting ``HealthCheck``.
            label: Human-readable service name used in the failure message.
            probe: Zero-argument coroutine function returning a
                ``(status, message, details)`` tuple.

        Returns:
            The probe's outcome, or an ``UNHEALTHY`` result carrying the
            error text when the probe raised an ``Exception``.
        """
        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump if the system clock is adjusted mid-check.
        started = time.perf_counter()
        try:
            status, message, details = await probe()
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"{label} health check failed: {e}"
            details = {"error": str(e)}
        duration_ms = (time.perf_counter() - started) * 1000

        return HealthCheck(
            name=name,
            status=status,
            message=message,
            duration_ms=duration_ms,
            details=details,
        )

    async def _check_api_health(self) -> HealthCheck:
        """Check API health."""
        return await self._run_check("api", "API", self._probe_api)

    async def _check_database_health(self) -> HealthCheck:
        """Check database health."""
        return await self._run_check("database", "Database", self._probe_database)

    async def _check_redis_health(self) -> HealthCheck:
        """Check Redis health."""
        return await self._run_check("redis", "Redis", self._probe_redis)

    async def _check_hardware_health(self) -> HealthCheck:
        """Check hardware service health."""
        return await self._run_check("hardware", "Hardware", self._probe_hardware)

    async def _check_pose_health(self) -> HealthCheck:
        """Check pose service health."""
        return await self._run_check("pose", "Pose", self._probe_pose)

    async def _check_stream_health(self) -> HealthCheck:
        """Check stream service health."""
        return await self._run_check("stream", "Stream", self._probe_stream)

    async def _probe_api(self):
        """Report the API as healthy together with its uptime."""
        uptime = time.time() - self._start_time
        details = {
            "uptime_seconds": uptime,
            "uptime_formatted": str(timedelta(seconds=int(uptime))),
        }
        return HealthStatus.HEALTHY, "API is running normally", details

    async def _probe_database(self):
        """Verify the database manager is connected and responds to a test."""
        # Import here to avoid circular imports
        from src.database.connection import get_database_manager

        db_manager = get_database_manager()

        if not db_manager.is_connected():
            return (
                HealthStatus.UNHEALTHY,
                "Database is not connected",
                {"connected": False},
            )

        # Raises on failure; the caller turns that into an UNHEALTHY result.
        await db_manager.test_connection()

        details = {
            "connected": True,
            "pool_size": db_manager.get_pool_size(),
            "active_connections": db_manager.get_active_connections(),
        }
        return HealthStatus.HEALTHY, "Database is connected and responsive", details

    async def _probe_redis(self):
        """Ping Redis when a URL is configured; report UNKNOWN otherwise."""
        redis_config = self.settings.get_redis_url()

        if not redis_config:
            return (
                HealthStatus.UNKNOWN,
                "Redis is not configured",
                {"configured": False},
            )

        # Imported only when Redis is configured, so the package is not
        # required for deployments that do not use it.
        import redis.asyncio as redis

        redis_client = redis.from_url(redis_config)
        try:
            await redis_client.ping()
        finally:
            # Always release the connection, including when ping() fails.
            await redis_client.close()

        return (
            HealthStatus.HEALTHY,
            "Redis is connected and responsive",
            {"connected": True},
        )

    async def _probe_hardware(self):
        """Probe the hardware service obtained from the API dependencies."""
        # Import here to avoid circular imports
        from src.api.dependencies import get_hardware_service

        return await self._probe_status_provider("Hardware", get_hardware_service())

    async def _probe_pose(self):
        """Probe the pose service obtained from the API dependencies."""
        # Import here to avoid circular imports
        from src.api.dependencies import get_pose_service

        return await self._probe_status_provider("Pose", get_pose_service())

    async def _probe_stream(self):
        """Probe the stream service obtained from the API dependencies."""
        # Import here to avoid circular imports
        from src.api.dependencies import get_stream_service

        return await self._probe_status_provider("Stream", get_stream_service())

    async def _probe_status_provider(self, label: str, service):
        """Translate a service's own ``get_status()`` report into a result.

        Args:
            label: Human-readable service name used in the messages.
            service: Object that may expose an awaitable ``get_status()``
                returning a mapping with a ``"status"`` entry.

        Returns:
            ``(status, message, details)``: UNKNOWN when the service has no
            ``get_status``, HEALTHY when it reports ``"healthy"``, and
            DEGRADED for any other reported value.
        """
        if not hasattr(service, 'get_status'):
            return HealthStatus.UNKNOWN, f"{label} service status unavailable", {}

        status_info = await service.get_status()

        if status_info.get("status") == "healthy":
            return (
                HealthStatus.HEALTHY,
                f"{label} service is operational",
                status_info,
            )

        return (
            HealthStatus.DEGRADED,
            f"{label} service status: {status_info.get('status', 'unknown')}",
            status_info,
        )

    def _update_service_health(self, service_name: str, health_check: HealthCheck):
        """Update service health information.

        Args:
            service_name: Key of the service the check belongs to; a record
                is created on first use if the name is not yet tracked.
            health_check: Result to fold into the service's record.
        """
        if service_name not in self._services:
            self._services[service_name] = ServiceHealth(
                service_name, HealthStatus.UNKNOWN
            )

        service_health = self._services[service_name]
        service_health.status = health_check.status
        service_health.last_check = health_check.timestamp
        service_health.uptime = time.time() - self._start_time

        # Keep only the newest entries; slice deletion trims in one step
        # even if the history somehow grew past the cap.
        service_health.checks.append(health_check)
        del service_health.checks[:-self._MAX_CHECK_HISTORY]

        # Update error tracking
        if health_check.status == HealthStatus.UNHEALTHY:
            service_health.error_count += 1
            service_health.last_error = health_check.message

    async def get_overall_health(self) -> Dict[str, Any]:
        """Get overall system health.

        Returns:
            Dict with ``status``, ``message``, ``timestamp``, ``uptime`` and
            a per-service ``services`` mapping. The overall status is
            HEALTHY only if every service is healthy, UNHEALTHY if any
            service is unhealthy, otherwise DEGRADED if any is degraded,
            otherwise UNKNOWN.
        """
        timestamp = datetime.utcnow().isoformat()
        uptime = time.time() - self._start_time

        if not self._services:
            # Same shape as the populated report so consumers can read
            # "services"/"uptime" before initialization without a KeyError.
            return {
                "status": HealthStatus.UNKNOWN.value,
                "message": "Health checks not initialized",
                "timestamp": timestamp,
                "uptime": uptime,
                "services": {},
            }

        unhealthy_services = [
            name for name, service in self._services.items()
            if service.status == HealthStatus.UNHEALTHY
        ]
        degraded_services = [
            name for name, service in self._services.items()
            if service.status == HealthStatus.DEGRADED
        ]

        # NOTE(review): a mix of HEALTHY and UNKNOWN services (e.g. Redis not
        # configured) resolves to UNKNOWN overall; confirm that is intended.
        if all(
            service.status == HealthStatus.HEALTHY
            for service in self._services.values()
        ):
            overall_status = HealthStatus.HEALTHY
            message = "All services are healthy"
        elif unhealthy_services:
            overall_status = HealthStatus.UNHEALTHY
            message = f"Unhealthy services: {', '.join(unhealthy_services)}"
        elif degraded_services:
            overall_status = HealthStatus.DEGRADED
            message = f"Degraded services: {', '.join(degraded_services)}"
        else:
            overall_status = HealthStatus.UNKNOWN
            message = "System health status unknown"

        return {
            "status": overall_status.value,
            "message": message,
            "timestamp": timestamp,
            "uptime": uptime,
            "services": {
                name: {
                    "status": service.status.value,
                    "last_check": (
                        service.last_check.isoformat() if service.last_check else None
                    ),
                    "error_count": service.error_count,
                    "last_error": service.last_error,
                }
                for name, service in self._services.items()
            },
        }

    async def get_service_health(self, service_name: str) -> Optional[Dict[str, Any]]:
        """Get health information for a specific service.

        Args:
            service_name: Key of the service to report on.

        Returns:
            Dict describing the service and its most recent checks, or
            ``None`` when the name is not tracked.
        """
        service = self._services.get(service_name)
        if service is None:
            return None

        recent_checks = [
            {
                "timestamp": check.timestamp.isoformat(),
                "status": check.status.value,
                "message": check.message,
                "duration_ms": check.duration_ms,
                "details": check.details,
            }
            for check in service.checks[-self._RECENT_CHECKS_REPORTED:]
        ]

        return {
            "name": service.name,
            "status": service.status.value,
            "last_check": service.last_check.isoformat() if service.last_check else None,
            "uptime": service.uptime,
            "error_count": service.error_count,
            "last_error": service.last_error,
            "recent_checks": recent_checks,
        }

    async def get_status(self) -> Dict[str, Any]:
        """Get health check service status.

        Returns:
            Dict with the service's own state: ``status`` ("healthy" while
            running, "stopped" otherwise), the ``initialized`` and
            ``running`` flags, the number of monitored services, and uptime
            in seconds.
        """
        return {
            "status": "healthy" if self._running else "stopped",
            "initialized": self._initialized,
            "running": self._running,
            "services_monitored": len(self._services),
            "uptime": time.time() - self._start_time,
        }