Files
wifi-densepose/monitoring/alerting-rules.yml
2025-06-07 11:44:19 +00:00

410 lines
16 KiB
YAML

# WiFi-DensePose Alerting Rules
# This file defines alerting rules for monitoring the WiFi-DensePose application
groups:
- name: wifi-densepose.application
  rules:
  # Application health alerts. Every rule in this group selects series from
  # the scrape job "wifi-densepose-app".

  # The job's `up` series has been 0 for one minute.
  - alert: ApplicationDown
    expr: 'up{job="wifi-densepose-app"} == 0'
    for: 1m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'WiFi-DensePose application is down'
      description: 'WiFi-DensePose application on {{ $labels.instance }} has been down for more than 1 minute.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/application-down'

  # Requests with a 5xx status as a percentage of all requests (5m rate
  # window) stay above 5 for five minutes.
  - alert: HighErrorRate
    expr: |
      (
      sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
      sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
      ) * 100 > 5
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'High error rate detected'
      description: 'Error rate is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-error-rate'

  # Same ratio as HighErrorRate, with a threshold of 10 and a shorter
  # two-minute hold, raised at critical severity.
  - alert: CriticalErrorRate
    expr: |
      (
      sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
      sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
      ) * 100 > 10
    for: 2m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'Critical error rate detected'
      description: 'Error rate is {{ $value }}% for the last 2 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/critical-error-rate'

  # 0.95 quantile of the request-duration histogram, with buckets summed
  # across all series by `le`, stays above 1 for five minutes.
  - alert: HighResponseTime
    expr: |
      histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{job="wifi-densepose-app"}[5m])) by (le)
      ) > 1
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'High response time detected'
      description: '95th percentile response time is {{ $value }}s for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-response-time'

  # Total per-second request rate (5m window) stays below 1 for ten minutes.
  - alert: LowRequestRate
    expr: 'sum(rate(http_requests_total{job="wifi-densepose-app"}[5m])) < 1'
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'Low request rate detected'
      description: 'Request rate is {{ $value }} requests/second for the last 10 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/low-request-rate'
- name: wifi-densepose.infrastructure
  rules:
  # Infrastructure alerts for pods and deployments in namespaces whose name
  # matches the regex "wifi-densepose.*".

  # Per-pod CPU usage as a percentage of the pod's summed CPU limit
  # (quota / period) stays above 80 for five minutes.
  # Both sides exclude container="" as well as container="POD": the
  # empty-container series is the pod-level cgroup total, and including it
  # next to the per-container series counts the same usage twice.
  - alert: HighCPUUsage
    expr: |
      (
        sum(rate(container_cpu_usage_seconds_total{namespace=~"wifi-densepose.*",container!="",container!="POD"}[5m])) by (pod) /
        sum(
          container_spec_cpu_quota{namespace=~"wifi-densepose.*",container!="",container!="POD"} /
          container_spec_cpu_period{namespace=~"wifi-densepose.*",container!="",container!="POD"}
        ) by (pod)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'High CPU usage detected'
      description: 'Pod {{ $labels.pod }} CPU usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-cpu-usage'

  # Per-pod working-set memory as a percentage of the pod's summed memory
  # limit stays above 80 for five minutes.
  # A container without a memory limit is reported with a limit of 0. The
  # `> 0` filter on the denominator drops pods whose summed limit is 0, so
  # the division cannot produce +Inf and leave the alert permanently firing.
  # container="" is excluded for the same double-counting reason as in
  # HighCPUUsage.
  - alert: HighMemoryUsage
    expr: |
      (
        sum(container_memory_working_set_bytes{namespace=~"wifi-densepose.*",container!="",container!="POD"}) by (pod) /
        (sum(container_spec_memory_limit_bytes{namespace=~"wifi-densepose.*",container!="",container!="POD"}) by (pod) > 0)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'High memory usage detected'
      description: 'Pod {{ $labels.pod }} memory usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-memory-usage'

  # The container restart counter has a non-zero 5m rate for five minutes.
  - alert: PodCrashLooping
    expr: 'rate(kube_pod_container_status_restarts_total{namespace=~"wifi-densepose.*"}[5m]) > 0'
    for: 5m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'Pod is crash looping'
      description: 'Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/pod-crash-looping'

  # The pod's readiness series with condition="false" is 1 for five minutes.
  - alert: PodNotReady
    expr: 'kube_pod_status_ready{namespace=~"wifi-densepose.*",condition="false"} == 1'
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'Pod is not ready'
      description: 'Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been not ready for more than 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/pod-not-ready'

  # Available replicas differ from the desired replica count for ten minutes.
  # The available-replicas series is the LEFT operand on purpose: `!=` keeps
  # the left-hand sample, so {{ $value }} in the description really is the
  # available count. The desired count is not a label on either series, so
  # the description looks it up with a template query.
  - alert: DeploymentReplicasMismatch
    expr: |
      kube_deployment_status_replicas_available{namespace=~"wifi-densepose.*"} !=
      kube_deployment_spec_replicas{namespace=~"wifi-densepose.*"}
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'Deployment replicas mismatch'
      description: >-
        Deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }}
        has {{ $value }} available replicas, expected
        {{ with printf "kube_deployment_spec_replicas{namespace='%s',deployment='%s'}" $labels.namespace $labels.deployment | query }}{{ . | first | value }}{{ end }}.
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/deployment-replicas-mismatch'
- name: wifi-densepose.database
  rules:
  # PostgreSQL alerts: exporter reachability, connection saturation,
  # long-running transactions and data-volume disk usage.

  # `pg_up` has been 0 for one minute.
  - alert: DatabaseDown
    expr: 'pg_up == 0'
    for: 1m
    labels:
      severity: critical
      service: database
      team: platform
    annotations:
      summary: 'PostgreSQL database is down'
      description: 'PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/database-down'

  # Backends connected to the wifi_densepose database as a percentage of
  # max_connections stay above 80 for five minutes.
  # The numerator is selected by `datname`, a label the max_connections
  # series is not expected to carry. A plain `/` matches on the full label
  # set, so it would pair nothing and the alert could never fire. Both sides
  # are therefore aggregated to (job, instance) before dividing.
  - alert: HighDatabaseConnections
    expr: |
      (
        sum by (job, instance) (pg_stat_database_numbackends{datname="wifi_densepose"}) /
        max by (job, instance) (pg_settings_max_connections)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: 'High database connection usage'
      description: 'Database connection usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-database-connections'

  # Longest transaction duration for the wifi_densepose database stays above
  # 300 for two minutes (the description reports the value in seconds).
  - alert: DatabaseSlowQueries
    expr: 'pg_stat_activity_max_tx_duration{datname="wifi_densepose"} > 300'
    for: 2m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: 'Slow database queries detected'
      description: 'Longest running query has been active for {{ $value }} seconds.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/database-slow-queries'

  # Used space (size minus free) on the filesystem mounted at
  # /var/lib/postgresql stays above 85% for five minutes.
  - alert: DatabaseDiskSpaceHigh
    expr: |
      (
      (node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} - node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
      node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"}
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: 'Database disk space usage high'
      description: 'Database disk usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/database-disk-space-high'
- name: wifi-densepose.redis
  rules:
  # Redis alerts: exporter reachability, memory saturation and client count.

  # `redis_up` has been 0 for one minute.
  - alert: RedisDown
    expr: 'redis_up == 0'
    for: 1m
    labels:
      severity: critical
      service: redis
      team: platform
    annotations:
      summary: 'Redis is down'
      description: 'Redis on {{ $labels.instance }} has been down for more than 1 minute.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/redis-down'

  # Used memory as a percentage of the configured maximum stays above 80 for
  # five minutes.
  # redis_memory_max_bytes is 0 when no `maxmemory` is configured. Dividing
  # by it would give +Inf and keep the alert firing forever, so the `> 0`
  # filter drops such instances from the denominator: they have no limit to
  # be measured against.
  - alert: RedisHighMemoryUsage
    expr: |
      (
        redis_memory_used_bytes /
        (redis_memory_max_bytes > 0)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: redis
      team: platform
    annotations:
      summary: 'Redis high memory usage'
      description: 'Redis memory usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/redis-high-memory-usage'

  # More than 100 connected clients for five minutes.
  - alert: RedisHighConnections
    expr: 'redis_connected_clients > 100'
    for: 5m
    labels:
      severity: warning
      service: redis
      team: platform
    annotations:
      summary: 'Redis high connection count'
      description: 'Redis has {{ $value }} connected clients for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/redis-high-connections'
- name: wifi-densepose.kubernetes
  rules:
  # Cluster-level alerts. The selectors in this group carry no namespace
  # filter, so they match every node, filesystem and PVC that is scraped.

  # The node's condition="Ready", status="true" series is 0 for five minutes.
  - alert: KubernetesNodeNotReady
    expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
    for: 5m
    labels:
      severity: critical
      service: kubernetes
      team: platform
    annotations:
      summary: 'Kubernetes node not ready'
      description: 'Node {{ $labels.node }} has been not ready for more than 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/kubernetes-node-not-ready'

  # One minus the per-instance average idle-CPU rate (5m window), as a
  # percentage, stays above 80 for five minutes.
  - alert: KubernetesNodeHighCPU
    expr: |
      (
      1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: 'Kubernetes node high CPU usage'
      description: 'Node {{ $labels.instance }} CPU usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-cpu'

  # One minus MemAvailable / MemTotal, as a percentage, stays above 85 for
  # five minutes.
  - alert: KubernetesNodeHighMemory
    expr: |
      (
      1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: 'Kubernetes node high memory usage'
      description: 'Node {{ $labels.instance }} memory usage is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-memory'

  # Used space (size minus free) on any non-tmpfs filesystem stays above 85%
  # for five minutes; the description names the affected mountpoint.
  - alert: KubernetesNodeDiskSpaceHigh
    expr: |
      (
      (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_free_bytes{fstype!="tmpfs"}) /
      node_filesystem_size_bytes{fstype!="tmpfs"}
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: 'Kubernetes node high disk usage'
      description: 'Node {{ $labels.instance }} disk usage is {{ $value }}% on {{ $labels.mountpoint }}.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/kubernetes-node-disk-space-high'

  # A PVC's phase="Pending" series is 1 for five minutes.
  - alert: KubernetesPersistentVolumeClaimPending
    expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: 'PersistentVolumeClaim pending'
      description: 'PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/kubernetes-pvc-pending'
- name: wifi-densepose.security
  rules:
  # Security alerts: bursts of 401/403 responses from the application job
  # and TLS certificate expiry.

  # A request-counter series with status="401" grew by more than 10 over the
  # 5m window, and the condition has held for one minute.
  - alert: UnauthorizedAPIAccess
    expr: 'increase(http_requests_total{job="wifi-densepose-app",status="401"}[5m]) > 10'
    for: 1m
    labels:
      severity: warning
      service: wifi-densepose
      team: security
    annotations:
      summary: 'High number of unauthorized API access attempts'
      description: '{{ $value }} unauthorized access attempts in the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/unauthorized-api-access'

  # A request-counter series with status="403" grew by more than 20 over the
  # 5m window, and the condition has held for one minute.
  - alert: SuspiciousActivity
    expr: 'increase(http_requests_total{job="wifi-densepose-app",status="403"}[5m]) > 20'
    for: 1m
    labels:
      severity: critical
      service: wifi-densepose
      team: security
    annotations:
      summary: 'Suspicious activity detected'
      description: '{{ $value }} forbidden access attempts in the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/suspicious-activity'

  # Time until the earliest probed certificate expiry, divided by 86400 to
  # convert seconds to days, stays below 30 for one hour.
  - alert: CertificateExpiringSoon
    expr: '(probe_ssl_earliest_cert_expiry - time()) / 86400 < 30'
    for: 1h
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: 'SSL certificate expiring soon'
      description: 'SSL certificate for {{ $labels.instance }} expires in {{ $value }} days.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/certificate-expiring-soon'
- name: wifi-densepose.business
  rules:
  # Business-logic alerts built on the application's own
  # wifi_densepose_* metrics.

  # Per-second rate of the processed-items counter (5m window) stays below
  # 10 for ten minutes.
  - alert: LowDataProcessingRate
    expr: 'rate(wifi_densepose_data_processed_total[5m]) < 10'
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: product
    annotations:
      summary: 'Low data processing rate'
      description: 'Data processing rate is {{ $value }} items/second for the last 10 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/low-data-processing-rate'

  # Processing-error rate as a percentage of the processed-items rate stays
  # above 5 for five minutes.
  # NOTE(review): the division pairs series by identical label sets; confirm
  # both counters are exported with the same labels.
  - alert: HighDataProcessingErrors
    expr: |
      (
      rate(wifi_densepose_data_processing_errors_total[5m]) /
      rate(wifi_densepose_data_processed_total[5m])
      ) * 100 > 5
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: product
    annotations:
      summary: 'High data processing error rate'
      description: 'Data processing error rate is {{ $value }}% for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-data-processing-errors'

  # 0.95 quantile of the model-inference duration histogram stays above 2
  # for five minutes. The buckets are not aggregated, so the quantile is
  # computed separately for each label set.
  - alert: ModelInferenceLatencyHigh
    expr: |
      histogram_quantile(0.95,
      rate(wifi_densepose_model_inference_duration_seconds_bucket[5m])
      ) > 2
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: ml
    annotations:
      summary: 'High model inference latency'
      description: '95th percentile model inference latency is {{ $value }}s for the last 5 minutes.'
      runbook_url: 'https://docs.wifi-densepose.com/runbooks/high-model-inference-latency'