feat: Complete Rust port of WiFi-DensePose with modular crates
Major changes: - Organized Python v1 implementation into v1/ subdirectory - Created Rust workspace with 9 modular crates: - wifi-densepose-core: Core types, traits, errors - wifi-densepose-signal: CSI processing, phase sanitization, FFT - wifi-densepose-nn: Neural network inference (ONNX/Candle/tch) - wifi-densepose-api: Axum-based REST/WebSocket API - wifi-densepose-db: SQLx database layer - wifi-densepose-config: Configuration management - wifi-densepose-hardware: Hardware abstraction - wifi-densepose-wasm: WebAssembly bindings - wifi-densepose-cli: Command-line interface Documentation: - ADR-001: Workspace structure - ADR-002: Signal processing library selection - ADR-003: Neural network inference strategy - DDD domain model with bounded contexts Testing: - 69 tests passing across all crates - Signal processing: 45 tests - Neural networks: 21 tests - Core: 3 doc tests Performance targets: - 10x faster CSI processing (~0.5ms vs ~5ms) - 5x lower memory usage (~100MB vs ~500MB) - WASM support for browser deployment
This commit is contained in:
465
v1/src/services/health_check.py
Normal file
465
v1/src/services/health_check.py
Normal file
@@ -0,0 +1,465 @@
|
||||
"""
|
||||
Health check service for WiFi-DensePose API
|
||||
"""
|
||||
|
||||
import asyncio
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from src.config.settings import Settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HealthStatus(Enum):
    """Closed set of health states a check or service can be in.

    The member values are the lowercase strings emitted in status payloads
    (via ``.value``).
    """

    # The check completed and the component reported itself as working.
    HEALTHY = "healthy"
    # The component answered, but reported a status other than "healthy".
    DEGRADED = "degraded"
    # The check failed or raised an error.
    UNHEALTHY = "unhealthy"
    # No check has run yet, or the component cannot report a status.
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
@dataclass
class HealthCheck:
    """Outcome of one health check run against a single component.

    Attributes:
        name: Identifier of the component that was checked.
        status: Resulting health state.
        message: Human-readable summary of the outcome.
        timestamp: Creation time of the result as a naive UTC ``datetime``.
        duration_ms: Time the check took, in milliseconds.
        details: Free-form, check-specific diagnostic data.
    """

    name: str
    status: HealthStatus
    message: str
    # ``datetime.utcnow`` is deprecated since Python 3.12. Dropping tzinfo
    # from an aware "now" yields the same naive-UTC value it used to return,
    # so consumers (and ``isoformat()`` output) are unaffected.
    timestamp: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    duration_ms: float = 0.0
    details: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ServiceHealth:
    """Rolling health record kept for one monitored service.

    Attributes:
        name: Identifier of the service.
        status: Status taken from the most recent check.
        last_check: Timestamp of the most recent check, ``None`` until one
            has been recorded.
        checks: Recent check results, oldest first.
        uptime: Seconds elapsed since the monitoring service was created,
            as of the most recent check.
        error_count: Number of checks recorded as unhealthy.
        last_error: Message of the most recent unhealthy check, if any.
    """

    name: str
    status: HealthStatus
    last_check: Optional[datetime] = None
    checks: List[HealthCheck] = field(default_factory=list)
    uptime: float = 0.0
    error_count: int = 0
    last_error: Optional[str] = None
|
||||
|
||||
|
||||
class HealthCheckService:
    """Service for monitoring application health.

    Keeps one :class:`ServiceHealth` record per monitored component (API,
    database, Redis, hardware, pose and stream services), runs their checks
    concurrently on demand and exposes aggregated status summaries.
    """

    # Monitored services, in the order their records are created.
    _SERVICE_NAMES = ("api", "database", "redis", "hardware", "pose", "stream")

    # Number of most recent check results retained per service.
    _MAX_CHECK_HISTORY = 10

    def __init__(self, settings: Settings):
        """Create the service in a non-initialized, non-running state.

        Args:
            settings: Application settings; its ``get_redis_url()`` is read
                by the Redis check.
        """
        self.settings = settings
        self._services: Dict[str, ServiceHealth] = {}
        self._start_time = time.time()
        self._initialized = False
        self._running = False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    async def initialize(self):
        """Create the per-service health records (no-op if already done)."""
        if self._initialized:
            return

        logger.info("Initializing health check service")

        # Every service starts as UNKNOWN until its first check is recorded.
        self._services = {
            name: ServiceHealth(name, HealthStatus.UNKNOWN)
            for name in self._SERVICE_NAMES
        }

        self._initialized = True
        logger.info("Health check service initialized")

    async def start(self):
        """Start the service, initializing it first when necessary."""
        if not self._initialized:
            await self.initialize()

        self._running = True
        logger.info("Health check service started")

    async def shutdown(self):
        """Stop the service; subsequent check requests return no results."""
        self._running = False
        logger.info("Health check service shut down")

    # ------------------------------------------------------------------
    # Running checks
    # ------------------------------------------------------------------

    async def perform_health_checks(self) -> Dict[str, HealthCheck]:
        """Run every health check concurrently and record the outcomes.

        Returns:
            Mapping of service name to its check result, or an empty dict
            when the service is not running.
        """
        if not self._running:
            return {}

        logger.debug("Performing health checks")

        # Name and coroutine are kept together so results cannot be
        # attributed to the wrong service if the set of checks changes.
        pending = {
            "api": self._check_api_health(),
            "database": self._check_database_health(),
            "redis": self._check_redis_health(),
            "hardware": self._check_hardware_health(),
            "pose": self._check_pose_health(),
            "stream": self._check_stream_health(),
        }

        outcomes = await asyncio.gather(*pending.values(), return_exceptions=True)

        results: Dict[str, HealthCheck] = {}
        for check_name, outcome in zip(pending, outcomes):
            # BaseException rather than Exception: a cancelled check yields
            # asyncio.CancelledError, which is not an Exception subclass and
            # must not be stored as if it were a HealthCheck.
            if isinstance(outcome, BaseException):
                health_check = HealthCheck(
                    name=check_name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Health check failed: {outcome}",
                )
            else:
                health_check = outcome

            results[check_name] = health_check
            self._update_service_health(check_name, health_check)

        logger.debug("Completed %d health checks", len(results))
        return results

    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive ``datetime``.

        Equivalent to the deprecated ``datetime.utcnow()``.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _finish_check(
        name: str,
        status: HealthStatus,
        message: str,
        details: Dict[str, Any],
        started: float,
    ) -> HealthCheck:
        """Build a check result, timing it from a ``perf_counter`` reading."""
        return HealthCheck(
            name=name,
            status=status,
            message=message,
            # perf_counter is monotonic, so the duration cannot go negative
            # or jump when the wall clock is adjusted.
            duration_ms=(time.perf_counter() - started) * 1000,
            details=details,
        )

    async def _check_api_health(self) -> HealthCheck:
        """Check API health; reports uptime since this service was created."""
        started = time.perf_counter()

        try:
            # Basic API health check
            uptime = time.time() - self._start_time

            status = HealthStatus.HEALTHY
            message = "API is running normally"
            details = {
                "uptime_seconds": uptime,
                "uptime_formatted": str(timedelta(seconds=int(uptime))),
            }
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"API health check failed: {e}"
            details = {"error": str(e)}

        return self._finish_check("api", status, message, details, started)

    async def _check_database_health(self) -> HealthCheck:
        """Check database health via the shared database manager."""
        started = time.perf_counter()

        try:
            # Import here to avoid circular imports
            from src.database.connection import get_database_manager

            db_manager = get_database_manager()

            if not db_manager.is_connected():
                status = HealthStatus.UNHEALTHY
                message = "Database is not connected"
                details = {"connected": False}
            else:
                # Test database connection
                await db_manager.test_connection()

                status = HealthStatus.HEALTHY
                message = "Database is connected and responsive"
                details = {
                    "connected": True,
                    "pool_size": db_manager.get_pool_size(),
                    "active_connections": db_manager.get_active_connections(),
                }
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"Database health check failed: {e}"
            details = {"error": str(e)}

        return self._finish_check("database", status, message, details, started)

    async def _check_redis_health(self) -> HealthCheck:
        """Check Redis health by pinging the configured server.

        Reports UNKNOWN when no Redis URL is configured.
        """
        started = time.perf_counter()

        try:
            redis_url = self.settings.get_redis_url()

            if not redis_url:
                status = HealthStatus.UNKNOWN
                message = "Redis is not configured"
                details = {"configured": False}
            else:
                # Test Redis connection
                import redis.asyncio as redis

                redis_client = redis.from_url(redis_url)
                try:
                    await redis_client.ping()
                finally:
                    # Always release the client, even when the ping fails.
                    # Newer redis-py deprecates close() in favour of
                    # aclose(); use whichever this version provides.
                    close_client = getattr(redis_client, "aclose", None)
                    if close_client is None:
                        close_client = redis_client.close
                    await close_client()

                status = HealthStatus.HEALTHY
                message = "Redis is connected and responsive"
                details = {"connected": True}
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"Redis health check failed: {e}"
            details = {"error": str(e)}

        return self._finish_check("redis", status, message, details, started)

    async def _check_dependency_health(
        self, name: str, label: str, resolve_service
    ) -> HealthCheck:
        """Check a service that may expose an async ``get_status()``.

        Args:
            name: Service name stored on the result (e.g. ``"pose"``).
            label: Capitalized name used in messages (e.g. ``"Pose"``).
            resolve_service: Zero-argument callable returning the service
                object. It is called inside the ``try`` so import or lookup
                failures are reported as an unhealthy result.
        """
        started = time.perf_counter()

        try:
            service = resolve_service()

            if hasattr(service, "get_status"):
                status_info = await service.get_status()
                reported = status_info.get("status", "unknown")

                if reported == "healthy":
                    status = HealthStatus.HEALTHY
                    message = f"{label} service is operational"
                else:
                    status = HealthStatus.DEGRADED
                    message = f"{label} service status: {reported}"

                details = status_info
            else:
                status = HealthStatus.UNKNOWN
                message = f"{label} service status unavailable"
                details = {}
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"{label} health check failed: {e}"
            details = {"error": str(e)}

        return self._finish_check(name, status, message, details, started)

    async def _check_hardware_health(self) -> HealthCheck:
        """Check hardware service health."""

        def resolve():
            # Import here to avoid circular imports
            from src.api.dependencies import get_hardware_service

            return get_hardware_service()

        return await self._check_dependency_health("hardware", "Hardware", resolve)

    async def _check_pose_health(self) -> HealthCheck:
        """Check pose service health."""

        def resolve():
            # Import here to avoid circular imports
            from src.api.dependencies import get_pose_service

            return get_pose_service()

        return await self._check_dependency_health("pose", "Pose", resolve)

    async def _check_stream_health(self) -> HealthCheck:
        """Check stream service health."""

        def resolve():
            # Import here to avoid circular imports
            from src.api.dependencies import get_stream_service

            return get_stream_service()

        return await self._check_dependency_health("stream", "Stream", resolve)

    # ------------------------------------------------------------------
    # Bookkeeping and reporting
    # ------------------------------------------------------------------

    def _update_service_health(self, service_name: str, health_check: HealthCheck):
        """Fold one check result into the service's rolling record.

        A record is created on the fly for a service name that is not
        tracked yet.
        """
        if service_name not in self._services:
            self._services[service_name] = ServiceHealth(
                service_name, HealthStatus.UNKNOWN
            )

        service_health = self._services[service_name]
        service_health.status = health_check.status
        service_health.last_check = health_check.timestamp
        service_health.uptime = time.time() - self._start_time

        # Keep only the most recent results (no-op while under the limit).
        service_health.checks.append(health_check)
        del service_health.checks[: -self._MAX_CHECK_HISTORY]

        # Update error tracking
        if health_check.status == HealthStatus.UNHEALTHY:
            service_health.error_count += 1
            service_health.last_error = health_check.message

    async def get_overall_health(self) -> Dict[str, Any]:
        """Get overall system health.

        The overall status is HEALTHY only when every service is healthy;
        otherwise UNHEALTHY takes precedence over DEGRADED, and anything
        else is reported as UNKNOWN.

        Returns:
            Summary dict with ``status``, ``message``, ``timestamp``,
            ``uptime`` and a per-service breakdown, or just ``status`` and
            ``message`` when no services are being tracked.
        """
        if not self._services:
            return {
                "status": HealthStatus.UNKNOWN.value,
                "message": "Health checks not initialized",
            }

        def names_with(wanted: HealthStatus) -> List[str]:
            return [
                name
                for name, service in self._services.items()
                if service.status == wanted
            ]

        statuses = [service.status for service in self._services.values()]

        if all(status == HealthStatus.HEALTHY for status in statuses):
            overall_status = HealthStatus.HEALTHY
            message = "All services are healthy"
        elif HealthStatus.UNHEALTHY in statuses:
            overall_status = HealthStatus.UNHEALTHY
            message = f"Unhealthy services: {', '.join(names_with(HealthStatus.UNHEALTHY))}"
        elif HealthStatus.DEGRADED in statuses:
            overall_status = HealthStatus.DEGRADED
            message = f"Degraded services: {', '.join(names_with(HealthStatus.DEGRADED))}"
        else:
            overall_status = HealthStatus.UNKNOWN
            message = "System health status unknown"

        return {
            "status": overall_status.value,
            "message": message,
            "timestamp": self._utc_now().isoformat(),
            "uptime": time.time() - self._start_time,
            "services": {
                name: {
                    "status": service.status.value,
                    "last_check": (
                        service.last_check.isoformat() if service.last_check else None
                    ),
                    "error_count": service.error_count,
                    "last_error": service.last_error,
                }
                for name, service in self._services.items()
            },
        }

    async def get_service_health(self, service_name: str) -> Optional[Dict[str, Any]]:
        """Get health information for a specific service.

        Returns:
            Detail dict including up to the five most recent checks, or
            ``None`` when the service name is not tracked.
        """
        service = self._services.get(service_name)
        if service is None:
            return None

        return {
            "name": service.name,
            "status": service.status.value,
            "last_check": service.last_check.isoformat() if service.last_check else None,
            "uptime": service.uptime,
            "error_count": service.error_count,
            "last_error": service.last_error,
            "recent_checks": [
                {
                    "timestamp": check.timestamp.isoformat(),
                    "status": check.status.value,
                    "message": check.message,
                    "duration_ms": check.duration_ms,
                    "details": check.details,
                }
                for check in service.checks[-5:]  # Last 5 checks
            ],
        }

    async def get_status(self) -> Dict[str, Any]:
        """Get the status of the health check service itself."""
        return {
            "status": "healthy" if self._running else "stopped",
            "initialized": self._initialized,
            "running": self._running,
            "services_monitored": len(self._services),
            "uptime": time.time() - self._start_time,
        }
|
||||
Reference in New Issue
Block a user