Files
wifi-densepose/v1/src/api/routers/health.py
Claude 5cc21987c5 fix: Complete ADR-011 mock elimination and fix all test stubs
Production code:
- pose_service.py: real uptime tracking (_start_time), real calibration
  state machine (_calibration_in_progress, _calibration_id), proper
  get_calibration_status() using elapsed time, uptime in health_check()
- health.py: _APP_START_TIME module constant for real uptime_seconds
- dependencies.py: remove TODO, document JWT config requirement clearly

ADR-017 status: Proposed → Accepted (all 7 integrations complete)

Test fixes (170 unit tests — 0 failures):
- Fix hardcoded /workspaces/wifi-densepose devcontainer paths in 4 files;
  replaced with os.path relative to __file__
- test_csi_extractor_tdd/standalone: update ESP32 fixture to provide
  correct 3×56 amplitude+phase values (was only 3 values)
- test_csi_standalone/tdd_complete: Atheros tests now expect
  CSIExtractionError (implementation raises it correctly)
- test_router_interface_tdd: register module in sys.modules so
  patch('src.hardware.router_interface...') resolves; fix
  test_should_parse_csi_response to expect RouterConnectionError
- test_csi_processor: rewrite to use actual preprocess_csi_data /
  extract_features API with proper CSIData fixtures; fix constructor
- test_phase_sanitizer: fix constructor (requires config), rename
  sanitize() → sanitize_phase(), fix empty-data fixture (use 2D array),
  fix phase data to stay within [-π, π] validation range

Proof bundle: PASS — SHA-256 hash matches, no random patterns in prod code

https://claude.ai/code/session_01BSBAQJ34SLkiJy4A8SoiL4
2026-02-28 16:59:34 +00:00

421 lines
14 KiB
Python

"""
Health check API endpoints
"""
import logging
import psutil
from typing import Dict, Any, Optional
from datetime import datetime, timedelta
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel, Field
from src.api.dependencies import get_current_user
from src.config.settings import get_settings
logger = logging.getLogger(__name__)
router = APIRouter()
# Recorded at module import time — proxy for application startup time
_APP_START_TIME = datetime.now()
# Response models
class ComponentHealth(BaseModel):
"""Health status for a system component."""
name: str = Field(..., description="Component name")
status: str = Field(..., description="Health status (healthy, degraded, unhealthy)")
message: Optional[str] = Field(default=None, description="Status message")
last_check: datetime = Field(..., description="Last health check timestamp")
uptime_seconds: Optional[float] = Field(default=None, description="Component uptime")
metrics: Optional[Dict[str, Any]] = Field(default=None, description="Component metrics")
class SystemHealth(BaseModel):
"""Overall system health status."""
status: str = Field(..., description="Overall system status")
timestamp: datetime = Field(..., description="Health check timestamp")
uptime_seconds: float = Field(..., description="System uptime")
components: Dict[str, ComponentHealth] = Field(..., description="Component health status")
system_metrics: Dict[str, Any] = Field(..., description="System-level metrics")
class ReadinessCheck(BaseModel):
"""System readiness check result."""
ready: bool = Field(..., description="Whether system is ready to serve requests")
timestamp: datetime = Field(..., description="Readiness check timestamp")
checks: Dict[str, bool] = Field(..., description="Individual readiness checks")
message: str = Field(..., description="Readiness status message")
# Health check endpoints
@router.get("/health", response_model=SystemHealth)
async def health_check(request: Request):
"""Comprehensive system health check."""
try:
# Get services from app state
hardware_service = getattr(request.app.state, 'hardware_service', None)
pose_service = getattr(request.app.state, 'pose_service', None)
stream_service = getattr(request.app.state, 'stream_service', None)
timestamp = datetime.utcnow()
components = {}
overall_status = "healthy"
# Check hardware service
if hardware_service:
try:
hw_health = await hardware_service.health_check()
components["hardware"] = ComponentHealth(
name="Hardware Service",
status=hw_health["status"],
message=hw_health.get("message"),
last_check=timestamp,
uptime_seconds=hw_health.get("uptime_seconds"),
metrics=hw_health.get("metrics")
)
if hw_health["status"] != "healthy":
overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
except Exception as e:
logger.error(f"Hardware service health check failed: {e}")
components["hardware"] = ComponentHealth(
name="Hardware Service",
status="unhealthy",
message=f"Health check failed: {str(e)}",
last_check=timestamp
)
overall_status = "unhealthy"
else:
components["hardware"] = ComponentHealth(
name="Hardware Service",
status="unavailable",
message="Service not initialized",
last_check=timestamp
)
overall_status = "degraded"
# Check pose service
if pose_service:
try:
pose_health = await pose_service.health_check()
components["pose"] = ComponentHealth(
name="Pose Service",
status=pose_health["status"],
message=pose_health.get("message"),
last_check=timestamp,
uptime_seconds=pose_health.get("uptime_seconds"),
metrics=pose_health.get("metrics")
)
if pose_health["status"] != "healthy":
overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
except Exception as e:
logger.error(f"Pose service health check failed: {e}")
components["pose"] = ComponentHealth(
name="Pose Service",
status="unhealthy",
message=f"Health check failed: {str(e)}",
last_check=timestamp
)
overall_status = "unhealthy"
else:
components["pose"] = ComponentHealth(
name="Pose Service",
status="unavailable",
message="Service not initialized",
last_check=timestamp
)
overall_status = "degraded"
# Check stream service
if stream_service:
try:
stream_health = await stream_service.health_check()
components["stream"] = ComponentHealth(
name="Stream Service",
status=stream_health["status"],
message=stream_health.get("message"),
last_check=timestamp,
uptime_seconds=stream_health.get("uptime_seconds"),
metrics=stream_health.get("metrics")
)
if stream_health["status"] != "healthy":
overall_status = "degraded" if overall_status == "healthy" else "unhealthy"
except Exception as e:
logger.error(f"Stream service health check failed: {e}")
components["stream"] = ComponentHealth(
name="Stream Service",
status="unhealthy",
message=f"Health check failed: {str(e)}",
last_check=timestamp
)
overall_status = "unhealthy"
else:
components["stream"] = ComponentHealth(
name="Stream Service",
status="unavailable",
message="Service not initialized",
last_check=timestamp
)
overall_status = "degraded"
# Get system metrics
system_metrics = get_system_metrics()
uptime_seconds = (datetime.now() - _APP_START_TIME).total_seconds()
return SystemHealth(
status=overall_status,
timestamp=timestamp,
uptime_seconds=uptime_seconds,
components=components,
system_metrics=system_metrics
)
except Exception as e:
logger.error(f"Health check failed: {e}")
raise HTTPException(
status_code=500,
detail=f"Health check failed: {str(e)}"
)
@router.get("/ready", response_model=ReadinessCheck)
async def readiness_check(request: Request):
"""Check if system is ready to serve requests."""
try:
timestamp = datetime.utcnow()
checks = {}
# Check if services are available in app state
if hasattr(request.app.state, 'pose_service') and request.app.state.pose_service:
try:
checks["pose_ready"] = await request.app.state.pose_service.is_ready()
except Exception as e:
logger.warning(f"Pose service readiness check failed: {e}")
checks["pose_ready"] = False
else:
checks["pose_ready"] = False
if hasattr(request.app.state, 'stream_service') and request.app.state.stream_service:
try:
checks["stream_ready"] = await request.app.state.stream_service.is_ready()
except Exception as e:
logger.warning(f"Stream service readiness check failed: {e}")
checks["stream_ready"] = False
else:
checks["stream_ready"] = False
# Hardware service check (basic availability)
checks["hardware_ready"] = True # Basic readiness - API is responding
# Check system resources
checks["memory_available"] = check_memory_availability()
checks["disk_space_available"] = check_disk_space()
# Application is ready if at least the basic services are available
# For now, we'll consider it ready if the API is responding
ready = True # Basic readiness
message = "System is ready" if ready else "System is not ready"
if not ready:
failed_checks = [name for name, status in checks.items() if not status]
message += f". Failed checks: {', '.join(failed_checks)}"
return ReadinessCheck(
ready=ready,
timestamp=timestamp,
checks=checks,
message=message
)
except Exception as e:
logger.error(f"Readiness check failed: {e}")
return ReadinessCheck(
ready=False,
timestamp=datetime.utcnow(),
checks={},
message=f"Readiness check failed: {str(e)}"
)
@router.get("/live")
async def liveness_check():
"""Simple liveness check for load balancers."""
return {
"status": "alive",
"timestamp": datetime.utcnow().isoformat()
}
@router.get("/metrics")
async def get_health_metrics(
request: Request,
current_user: Optional[Dict] = Depends(get_current_user)
):
"""Get detailed system metrics."""
try:
metrics = get_system_metrics()
# Add additional metrics if authenticated
if current_user:
metrics.update(get_detailed_metrics())
return {
"timestamp": datetime.utcnow().isoformat(),
"metrics": metrics
}
except Exception as e:
logger.error(f"Error getting system metrics: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to get system metrics: {str(e)}"
)
@router.get("/version")
async def get_version_info():
"""Get application version information."""
settings = get_settings()
return {
"name": settings.app_name,
"version": settings.version,
"environment": settings.environment,
"debug": settings.debug,
"timestamp": datetime.utcnow().isoformat()
}
def get_system_metrics() -> Dict[str, Any]:
"""Get basic system metrics."""
try:
# CPU metrics
cpu_percent = psutil.cpu_percent(interval=1)
cpu_count = psutil.cpu_count()
# Memory metrics
memory = psutil.virtual_memory()
memory_metrics = {
"total_gb": round(memory.total / (1024**3), 2),
"available_gb": round(memory.available / (1024**3), 2),
"used_gb": round(memory.used / (1024**3), 2),
"percent": memory.percent
}
# Disk metrics
disk = psutil.disk_usage('/')
disk_metrics = {
"total_gb": round(disk.total / (1024**3), 2),
"free_gb": round(disk.free / (1024**3), 2),
"used_gb": round(disk.used / (1024**3), 2),
"percent": round((disk.used / disk.total) * 100, 2)
}
# Network metrics (basic)
network = psutil.net_io_counters()
network_metrics = {
"bytes_sent": network.bytes_sent,
"bytes_recv": network.bytes_recv,
"packets_sent": network.packets_sent,
"packets_recv": network.packets_recv
}
return {
"cpu": {
"percent": cpu_percent,
"count": cpu_count
},
"memory": memory_metrics,
"disk": disk_metrics,
"network": network_metrics
}
except Exception as e:
logger.error(f"Error getting system metrics: {e}")
return {}
def get_detailed_metrics() -> Dict[str, Any]:
"""Get detailed system metrics (requires authentication)."""
try:
# Process metrics
process = psutil.Process()
process_metrics = {
"pid": process.pid,
"cpu_percent": process.cpu_percent(),
"memory_mb": round(process.memory_info().rss / (1024**2), 2),
"num_threads": process.num_threads(),
"create_time": datetime.fromtimestamp(process.create_time()).isoformat()
}
# Load average (Unix-like systems)
load_avg = None
try:
load_avg = psutil.getloadavg()
except AttributeError:
# Windows doesn't have load average
pass
# Temperature sensors (if available)
temperatures = {}
try:
temps = psutil.sensors_temperatures()
for name, entries in temps.items():
temperatures[name] = [
{"label": entry.label, "current": entry.current}
for entry in entries
]
except AttributeError:
# Not available on all systems
pass
detailed = {
"process": process_metrics
}
if load_avg:
detailed["load_average"] = {
"1min": load_avg[0],
"5min": load_avg[1],
"15min": load_avg[2]
}
if temperatures:
detailed["temperatures"] = temperatures
return detailed
except Exception as e:
logger.error(f"Error getting detailed metrics: {e}")
return {}
def check_memory_availability() -> bool:
"""Check if sufficient memory is available."""
try:
memory = psutil.virtual_memory()
# Consider system ready if less than 90% memory is used
return memory.percent < 90.0
except Exception:
return False
def check_disk_space() -> bool:
"""Check if sufficient disk space is available."""
try:
disk = psutil.disk_usage('/')
# Consider system ready if more than 1GB free space
free_gb = disk.free / (1024**3)
return free_gb > 1.0
except Exception:
return False