feat: Complete Rust port of WiFi-DensePose with modular crates

Major changes: - Organized Python v1 implementation into v1/ subdirectory - Created Rust workspace with 9 modular crates: - wifi-densepose-core: Core types, traits, errors - wifi-densepose-signal: CSI processing, phase sanitization, FFT - wifi-densepose-nn: Neural network inference (ONNX/Candle/tch) - wifi-densepose-api: Axum-based REST/WebSocket API - wifi-densepose-db: SQLx database layer - wifi-densepose-config: Configuration management - wifi-densepose-hardware: Hardware abstraction - wifi-densepose-wasm: WebAssembly bindings - wifi-densepose-cli: Command-line interface Documentation: - ADR-001: Workspace structure - ADR-002: Signal processing library selection - ADR-003: Neural network inference strategy - DDD domain model with bounded contexts Testing: - 69 tests passing across all crates - Signal processing: 45 tests - Neural networks: 21 tests - Core: 3 doc tests Performance targets: - 10x faster CSI processing (~0.5ms vs ~5ms) - 5x lower memory usage (~100MB vs ~500MB) - WASM support for browser deployment
2026-01-13 03:11:16 +00:00
parent 5101504b72
commit 6ed69a3d48
427 changed files with 90993 additions and 0 deletions
--- a/v1/src/api/routers/health.py
+++ b/v1/src/api/routers/health.py
@@ -0,0 +1,419 @@
+"""
+Health check API endpoints
+"""
+
+import logging
+import psutil
+from typing import Dict, Any, Optional
+from datetime import datetime, timedelta
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from pydantic import BaseModel, Field
+
+from src.api.dependencies import get_current_user
+from src.config.settings import get_settings
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+# Response models
+class ComponentHealth(BaseModel):
+    """Health status for a system component."""
+    
+    name: str = Field(..., description="Component name")
+    status: str = Field(..., description="Health status (healthy, degraded, unhealthy)")
+    message: Optional[str] = Field(default=None, description="Status message")
+    last_check: datetime = Field(..., description="Last health check timestamp")
+    uptime_seconds: Optional[float] = Field(default=None, description="Component uptime")
+    metrics: Optional[Dict[str, Any]] = Field(default=None, description="Component metrics")
+
+
+class SystemHealth(BaseModel):
+    """Overall system health status."""
+    
+    status: str = Field(..., description="Overall system status")
+    timestamp: datetime = Field(..., description="Health check timestamp")
+    uptime_seconds: float = Field(..., description="System uptime")
+    components: Dict[str, ComponentHealth] = Field(..., description="Component health status")
+    system_metrics: Dict[str, Any] = Field(..., description="System-level metrics")
+
+
+class ReadinessCheck(BaseModel):
+    """System readiness check result."""
+    
+    ready: bool = Field(..., description="Whether system is ready to serve requests")
+    timestamp: datetime = Field(..., description="Readiness check timestamp")
+    checks: Dict[str, bool] = Field(..., description="Individual readiness checks")
+    message: str = Field(..., description="Readiness status message")
+
+
+# Health check endpoints
+@router.get("/health", response_model=SystemHealth)
+async def health_check(request: Request):
+    """Comprehensive system health check."""
+    try:
+        # Get services from app state
+        hardware_service = getattr(request.app.state, 'hardware_service', None)
+        pose_service = getattr(request.app.state, 'pose_service', None)
+        stream_service = getattr(request.app.state, 'stream_service', None)
+        
+        timestamp = datetime.utcnow()
+        components = {}
+        overall_status = "healthy"
+        
+        # Check hardware service
+        if hardware_service:
+            try:
+                hw_health = await hardware_service.health_check()
+                components["hardware"] = ComponentHealth(
+                    name="Hardware Service",
+                    status=hw_health["status"],
+                    message=hw_health.get("message"),
+                    last_check=timestamp,
+                    uptime_seconds=hw_health.get("uptime_seconds"),
+                    metrics=hw_health.get("metrics")
+                )
+                
+                if hw_health["status"] != "healthy":
+                    overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
+                    
+            except Exception as e:
+                logger.error(f"Hardware service health check failed: {e}")
+                components["hardware"] = ComponentHealth(
+                    name="Hardware Service",
+                    status="unhealthy",
+                    message=f"Health check failed: {str(e)}",
+                    last_check=timestamp
+                )
+                overall_status = "unhealthy"
+        else:
+            components["hardware"] = ComponentHealth(
+                name="Hardware Service",
+                status="unavailable",
+                message="Service not initialized",
+                last_check=timestamp
+            )
+            overall_status = "degraded"
+        
+        # Check pose service
+        if pose_service:
+            try:
+                pose_health = await pose_service.health_check()
+                components["pose"] = ComponentHealth(
+                    name="Pose Service",
+                    status=pose_health["status"],
+                    message=pose_health.get("message"),
+                    last_check=timestamp,
+                    uptime_seconds=pose_health.get("uptime_seconds"),
+                    metrics=pose_health.get("metrics")
+                )
+                
+                if pose_health["status"] != "healthy":
+                    overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
+                    
+            except Exception as e:
+                logger.error(f"Pose service health check failed: {e}")
+                components["pose"] = ComponentHealth(
+                    name="Pose Service",
+                    status="unhealthy",
+                    message=f"Health check failed: {str(e)}",
+                    last_check=timestamp
+                )
+                overall_status = "unhealthy"
+        else:
+            components["pose"] = ComponentHealth(
+                name="Pose Service",
+                status="unavailable",
+                message="Service not initialized",
+                last_check=timestamp
+            )
+            overall_status = "degraded"
+        
+        # Check stream service
+        if stream_service:
+            try:
+                stream_health = await stream_service.health_check()
+                components["stream"] = ComponentHealth(
+                    name="Stream Service",
+                    status=stream_health["status"],
+                    message=stream_health.get("message"),
+                    last_check=timestamp,
+                    uptime_seconds=stream_health.get("uptime_seconds"),
+                    metrics=stream_health.get("metrics")
+                )
+                
+                if stream_health["status"] != "healthy":
+                    overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
+                    
+            except Exception as e:
+                logger.error(f"Stream service health check failed: {e}")
+                components["stream"] = ComponentHealth(
+                    name="Stream Service",
+                    status="unhealthy",
+                    message=f"Health check failed: {str(e)}",
+                    last_check=timestamp
+                )
+                overall_status = "unhealthy"
+        else:
+            components["stream"] = ComponentHealth(
+                name="Stream Service",
+                status="unavailable",
+                message="Service not initialized",
+                last_check=timestamp
+            )
+            overall_status = "degraded"
+        
+        # Get system metrics
+        system_metrics = get_system_metrics()
+        
+        # Calculate system uptime (placeholder - would need actual startup time)
+        uptime_seconds = 0.0  # TODO: Implement actual uptime tracking
+        
+        return SystemHealth(
+            status=overall_status,
+            timestamp=timestamp,
+            uptime_seconds=uptime_seconds,
+            components=components,
+            system_metrics=system_metrics
+        )
+        
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Health check failed: {str(e)}"
+        )
+
+
+@router.get("/ready", response_model=ReadinessCheck)
+async def readiness_check(request: Request):
+    """Check if system is ready to serve requests."""
+    try:
+        timestamp = datetime.utcnow()
+        checks = {}
+        
+        # Check if services are available in app state
+        if hasattr(request.app.state, 'pose_service') and request.app.state.pose_service:
+            try:
+                checks["pose_ready"] = await request.app.state.pose_service.is_ready()
+            except Exception as e:
+                logger.warning(f"Pose service readiness check failed: {e}")
+                checks["pose_ready"] = False
+        else:
+            checks["pose_ready"] = False
+            
+        if hasattr(request.app.state, 'stream_service') and request.app.state.stream_service:
+            try:
+                checks["stream_ready"] = await request.app.state.stream_service.is_ready()
+            except Exception as e:
+                logger.warning(f"Stream service readiness check failed: {e}")
+                checks["stream_ready"] = False
+        else:
+            checks["stream_ready"] = False
+            
+        # Hardware service check (basic availability)
+        checks["hardware_ready"] = True  # Basic readiness - API is responding
+        
+        # Check system resources
+        checks["memory_available"] = check_memory_availability()
+        checks["disk_space_available"] = check_disk_space()
+        
+        # Application is ready if at least the basic services are available
+        # For now, we'll consider it ready if the API is responding
+        ready = True  # Basic readiness
+        
+        message = "System is ready" if ready else "System is not ready"
+        if not ready:
+            failed_checks = [name for name, status in checks.items() if not status]
+            message += f". Failed checks: {', '.join(failed_checks)}"
+        
+        return ReadinessCheck(
+            ready=ready,
+            timestamp=timestamp,
+            checks=checks,
+            message=message
+        )
+        
+    except Exception as e:
+        logger.error(f"Readiness check failed: {e}")
+        return ReadinessCheck(
+            ready=False,
+            timestamp=datetime.utcnow(),
+            checks={},
+            message=f"Readiness check failed: {str(e)}"
+        )
+
+
+@router.get("/live")
+async def liveness_check():
+    """Simple liveness check for load balancers."""
+    return {
+        "status": "alive",
+        "timestamp": datetime.utcnow().isoformat()
+    }
+
+
+@router.get("/metrics")
+async def get_health_metrics(
+    request: Request,
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Get detailed system metrics."""
+    try:
+        metrics = get_system_metrics()
+        
+        # Add additional metrics if authenticated
+        if current_user:
+            metrics.update(get_detailed_metrics())
+        
+        return {
+            "timestamp": datetime.utcnow().isoformat(),
+            "metrics": metrics
+        }
+        
+    except Exception as e:
+        logger.error(f"Error getting system metrics: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to get system metrics: {str(e)}"
+        )
+
+
+@router.get("/version")
+async def get_version_info():
+    """Get application version information."""
+    settings = get_settings()
+    
+    return {
+        "name": settings.app_name,
+        "version": settings.version,
+        "environment": settings.environment,
+        "debug": settings.debug,
+        "timestamp": datetime.utcnow().isoformat()
+    }
+
+
+def get_system_metrics() -> Dict[str, Any]:
+    """Get basic system metrics."""
+    try:
+        # CPU metrics
+        cpu_percent = psutil.cpu_percent(interval=1)
+        cpu_count = psutil.cpu_count()
+        
+        # Memory metrics
+        memory = psutil.virtual_memory()
+        memory_metrics = {
+            "total_gb": round(memory.total / (1024**3), 2),
+            "available_gb": round(memory.available / (1024**3), 2),
+            "used_gb": round(memory.used / (1024**3), 2),
+            "percent": memory.percent
+        }
+        
+        # Disk metrics
+        disk = psutil.disk_usage('/')
+        disk_metrics = {
+            "total_gb": round(disk.total / (1024**3), 2),
+            "free_gb": round(disk.free / (1024**3), 2),
+            "used_gb": round(disk.used / (1024**3), 2),
+            "percent": round((disk.used / disk.total) * 100, 2)
+        }
+        
+        # Network metrics (basic)
+        network = psutil.net_io_counters()
+        network_metrics = {
+            "bytes_sent": network.bytes_sent,
+            "bytes_recv": network.bytes_recv,
+            "packets_sent": network.packets_sent,
+            "packets_recv": network.packets_recv
+        }
+        
+        return {
+            "cpu": {
+                "percent": cpu_percent,
+                "count": cpu_count
+            },
+            "memory": memory_metrics,
+            "disk": disk_metrics,
+            "network": network_metrics
+        }
+        
+    except Exception as e:
+        logger.error(f"Error getting system metrics: {e}")
+        return {}
+
+
+def get_detailed_metrics() -> Dict[str, Any]:
+    """Get detailed system metrics (requires authentication)."""
+    try:
+        # Process metrics
+        process = psutil.Process()
+        process_metrics = {
+            "pid": process.pid,
+            "cpu_percent": process.cpu_percent(),
+            "memory_mb": round(process.memory_info().rss / (1024**2), 2),
+            "num_threads": process.num_threads(),
+            "create_time": datetime.fromtimestamp(process.create_time()).isoformat()
+        }
+        
+        # Load average (Unix-like systems)
+        load_avg = None
+        try:
+            load_avg = psutil.getloadavg()
+        except AttributeError:
+            # Windows doesn't have load average
+            pass
+        
+        # Temperature sensors (if available)
+        temperatures = {}
+        try:
+            temps = psutil.sensors_temperatures()
+            for name, entries in temps.items():
+                temperatures[name] = [
+                    {"label": entry.label, "current": entry.current}
+                    for entry in entries
+                ]
+        except AttributeError:
+            # Not available on all systems
+            pass
+        
+        detailed = {
+            "process": process_metrics
+        }
+        
+        if load_avg:
+            detailed["load_average"] = {
+                "1min": load_avg[0],
+                "5min": load_avg[1],
+                "15min": load_avg[2]
+            }
+        
+        if temperatures:
+            detailed["temperatures"] = temperatures
+        
+        return detailed
+        
+    except Exception as e:
+        logger.error(f"Error getting detailed metrics: {e}")
+        return {}
+
+
+def check_memory_availability() -> bool:
+    """Check if sufficient memory is available."""
+    try:
+        memory = psutil.virtual_memory()
+        # Consider system ready if less than 90% memory is used
+        return memory.percent < 90.0
+    except Exception:
+        return False
+
+
+def check_disk_space() -> bool:
+    """Check if sufficient disk space is available."""
+    try:
+        disk = psutil.disk_usage('/')
+        # Consider system ready if more than 1GB free space
+        free_gb = disk.free / (1024**3)
+        return free_gb > 1.0
+    except Exception:
+        return False