# WiFi-DensePose Alerting Rules
# This file defines alerting rules for monitoring the WiFi-DensePose application.

groups:
# Application-level rules: scrape availability, HTTP 5xx ratio, latency and
# traffic volume for the wifi-densepose-app job.
- name: wifi-densepose.application
  rules:
  # Application Health Alerts

  # The scrape target for the app job has reported up == 0 for a full minute.
  - alert: ApplicationDown
    expr: up{job="wifi-densepose-app"} == 0
    for: 1m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: "WiFi-DensePose application is down"
      description: "WiFi-DensePose application on {{ $labels.instance }} has been down for more than 1 minute."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/application-down"

  # Share of requests answered with a 5xx status (5m rate window), expressed
  # as a percentage; warns above 5% sustained for 5 minutes.
  - alert: HighErrorRate
    expr: |
      (
        sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
        sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
      ) * 100 > 5
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-error-rate"

  # Same ratio as HighErrorRate with a higher threshold (10%) and a shorter
  # hold time (2 minutes), raised as critical.
  - alert: CriticalErrorRate
    expr: |
      (
        sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
        sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
      ) * 100 > 10
    for: 2m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: "Critical error rate detected"
      description: "Error rate is {{ $value }}% for the last 2 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/critical-error-rate"

  # 95th-percentile request duration, estimated from the histogram buckets
  # aggregated by `le`, above 1 for 5 minutes.
  - alert: HighResponseTime
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket{job="wifi-densepose-app"}[5m])) by (le)
      ) > 1
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "High response time detected"
      description: "95th percentile response time is {{ $value }}s for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-response-time"

  # Total request rate across the job below 1 request/second for 10 minutes.
  - alert: LowRequestRate
    expr: sum(rate(http_requests_total{job="wifi-densepose-app"}[5m])) < 1
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "Low request rate detected"
      description: "Request rate is {{ $value }} requests/second for the last 10 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/low-request-rate"

# Pod- and deployment-level rules for namespaces matching wifi-densepose.*.
- name: wifi-densepose.infrastructure
  rules:
  # Infrastructure Alerts

  # Per-pod CPU usage rate divided by the per-pod CPU limit (quota / period),
  # as a percentage; warns above 80% for 5 minutes.
  - alert: HighCPUUsage
    expr: |
      (
        sum(rate(container_cpu_usage_seconds_total{namespace=~"wifi-densepose.*",container!="POD"}[5m])) by (pod) /
        sum(container_spec_cpu_quota{namespace=~"wifi-densepose.*",container!="POD"} / container_spec_cpu_period{namespace=~"wifi-densepose.*",container!="POD"}) by (pod)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "High CPU usage detected"
      description: "Pod {{ $labels.pod }} CPU usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-cpu-usage"

  # Per-pod working-set memory divided by the per-pod memory limit, as a
  # percentage; warns above 80% for 5 minutes.
  # The limit series is filtered with `> 0` because a container without a
  # memory limit reports a limit of 0; dividing by it yields +Inf and would
  # keep this alert firing for every unlimited pod.
  - alert: HighMemoryUsage
    expr: |
      (
        sum(container_memory_working_set_bytes{namespace=~"wifi-densepose.*",container!="POD"}) by (pod) /
        sum(container_spec_memory_limit_bytes{namespace=~"wifi-densepose.*",container!="POD"} > 0) by (pod)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "High memory usage detected"
      description: "Pod {{ $labels.pod }} memory usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-memory-usage"

  # The container restart counter has kept increasing (5m rate above 0) for
  # 5 minutes.
  - alert: PodCrashLooping
    expr: rate(kube_pod_container_status_restarts_total{namespace=~"wifi-densepose.*"}[5m]) > 0
    for: 5m
    labels:
      severity: critical
      service: wifi-densepose
      team: platform
    annotations:
      summary: "Pod is crash looping"
      description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-crash-looping"

  # The pod's ready condition "false" series has been 1 for 5 minutes.
  - alert: PodNotReady
    expr: kube_pod_status_ready{namespace=~"wifi-densepose.*",condition="false"} == 1
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "Pod is not ready"
      description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been not ready for more than 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-not-ready"

  # Available replicas differ from the desired replica count for 10 minutes.
  # The available-replica series is deliberately the left operand: a filtering
  # comparison keeps the left-hand sample, so $value in the description is the
  # available count. The metrics carry no `spec_replicas` label, so the
  # description no longer references one (it rendered as an empty string).
  - alert: DeploymentReplicasMismatch
    expr: |
      kube_deployment_status_replicas_available{namespace=~"wifi-densepose.*"} !=
      kube_deployment_spec_replicas{namespace=~"wifi-densepose.*"}
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "Deployment replicas mismatch"
      description: "Deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has {{ $value }} available replicas, which does not match the desired replica count."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/deployment-replicas-mismatch"

# PostgreSQL rules: availability, connection saturation, long transactions
# and data-volume disk usage.
- name: wifi-densepose.database
  rules:
  # Database Alerts

  # The exporter has reported pg_up == 0 for a full minute.
  - alert: DatabaseDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
      service: database
      team: platform
    annotations:
      summary: "PostgreSQL database is down"
      description: "PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/database-down"

  # Backends connected to the wifi_densepose database as a percentage of
  # max_connections; warns above 80% for 5 minutes.
  # The per-database series carries `datid`/`datname` labels that the
  # max_connections series lacks. Without `ignoring (...)` the default
  # one-to-one matching finds no pairs and the alert can never fire.
  - alert: HighDatabaseConnections
    expr: |
      (
        pg_stat_database_numbackends{datname="wifi_densepose"} /
        ignoring (datid, datname) pg_settings_max_connections
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: "High database connection usage"
      description: "Database connection usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-database-connections"

  # pg_stat_activity_max_tx_duration above 300 for 2 minutes.
  # NOTE(review): the metric name refers to transaction duration while the
  # annotations speak of queries; confirm the wording is what is intended.
  - alert: DatabaseSlowQueries
    expr: pg_stat_activity_max_tx_duration{datname="wifi_densepose"} > 300
    for: 2m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: "Slow database queries detected"
      description: "Longest running query has been active for {{ $value }} seconds."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/database-slow-queries"

  # Used share (size minus free, over size) of the filesystem mounted at
  # /var/lib/postgresql, as a percentage; warns above 85% for 5 minutes.
  - alert: DatabaseDiskSpaceHigh
    expr: |
      (
        (node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} - node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
        node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"}
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: database
      team: platform
    annotations:
      summary: "Database disk space usage high"
      description: "Database disk usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/database-disk-space-high"

# Redis rules: availability, memory saturation and client connection count.
- name: wifi-densepose.redis
  rules:
  # Redis Alerts

  # The exporter has reported redis_up == 0 for a full minute.
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
      service: redis
      team: platform
    annotations:
      summary: "Redis is down"
      description: "Redis on {{ $labels.instance }} has been down for more than 1 minute."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-down"

  # Used memory as a percentage of the configured maximum; warns above 80%
  # for 5 minutes.
  # The denominator is filtered with `> 0`: when no maxmemory is configured
  # the max-bytes metric is 0, the division yields +Inf, and the alert would
  # fire permanently on instances that simply have no memory cap.
  - alert: RedisHighMemoryUsage
    expr: |
      (
        redis_memory_used_bytes /
        (redis_memory_max_bytes > 0)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: redis
      team: platform
    annotations:
      summary: "Redis high memory usage"
      description: "Redis memory usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-memory-usage"

  # More than 100 connected clients for 5 minutes.
  - alert: RedisHighConnections
    expr: redis_connected_clients > 100
    for: 5m
    labels:
      severity: warning
      service: redis
      team: platform
    annotations:
      summary: "Redis high connection count"
      description: "Redis has {{ $value }} connected clients for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-connections"

# Cluster-wide rules: node readiness, node CPU / memory / disk saturation and
# pending PersistentVolumeClaims. These selectors are not namespace-scoped.
- name: wifi-densepose.kubernetes
  rules:
  # Kubernetes Cluster Alerts

  # The node's Ready condition "true" series has been 0 for 5 minutes.
  - alert: KubernetesNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 5m
    labels:
      severity: critical
      service: kubernetes
      team: platform
    annotations:
      summary: "Kubernetes node not ready"
      description: "Node {{ $labels.node }} has been not ready for more than 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-not-ready"

  # 1 minus the average idle-mode CPU rate per instance, as a percentage;
  # warns above 80% for 5 minutes.
  - alert: KubernetesNodeHighCPU
    expr: |
      (
        1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: "Kubernetes node high CPU usage"
      description: "Node {{ $labels.instance }} CPU usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-cpu"

  # 1 minus MemAvailable / MemTotal, as a percentage; warns above 85% for
  # 5 minutes.
  - alert: KubernetesNodeHighMemory
    expr: |
      (
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: "Kubernetes node high memory usage"
      description: "Node {{ $labels.instance }} memory usage is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-memory"

  # Used share (size minus free, over size) of every non-tmpfs filesystem, as
  # a percentage; warns above 85% for 5 minutes.
  - alert: KubernetesNodeDiskSpaceHigh
    expr: |
      (
        (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_free_bytes{fstype!="tmpfs"}) /
        node_filesystem_size_bytes{fstype!="tmpfs"}
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: "Kubernetes node high disk usage"
      description: "Node {{ $labels.instance }} disk usage is {{ $value }}% on {{ $labels.mountpoint }}."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-disk-space-high"

  # A PersistentVolumeClaim has been in phase "Pending" for 5 minutes.
  - alert: KubernetesPersistentVolumeClaimPending
    expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
    for: 5m
    labels:
      severity: warning
      service: kubernetes
      team: platform
    annotations:
      summary: "PersistentVolumeClaim pending"
      description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-pvc-pending"

# Security-oriented rules: bursts of 401 / 403 responses and TLS certificate
# expiry.
- name: wifi-densepose.security
  rules:
  # Security Alerts

  # A series of 401 responses grew by more than 10 within the 5m window,
  # held for 1 minute.
  - alert: UnauthorizedAPIAccess
    expr: increase(http_requests_total{job="wifi-densepose-app",status="401"}[5m]) > 10
    for: 1m
    labels:
      severity: warning
      service: wifi-densepose
      team: security
    annotations:
      summary: "High number of unauthorized API access attempts"
      description: "{{ $value }} unauthorized access attempts in the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/unauthorized-api-access"

  # A series of 403 responses grew by more than 20 within the 5m window,
  # held for 1 minute.
  - alert: SuspiciousActivity
    expr: increase(http_requests_total{job="wifi-densepose-app",status="403"}[5m]) > 20
    for: 1m
    labels:
      severity: critical
      service: wifi-densepose
      team: security
    annotations:
      summary: "Suspicious activity detected"
      description: "{{ $value }} forbidden access attempts in the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/suspicious-activity"

  # Seconds until the earliest certificate expiry, divided by 86400 to get
  # days; warns when fewer than 30 days remain, held for 1 hour.
  - alert: CertificateExpiringSoon
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
    for: 1h
    labels:
      severity: warning
      service: wifi-densepose
      team: platform
    annotations:
      summary: "SSL certificate expiring soon"
      description: "SSL certificate for {{ $labels.instance }} expires in {{ $value }} days."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/certificate-expiring-soon"

# Business-logic rules built on the application's own wifi_densepose_*
# metrics: processing throughput, processing error ratio, inference latency.
- name: wifi-densepose.business
  rules:
  # Business Logic Alerts

  # Processed-items rate below 10 per second for 10 minutes.
  - alert: LowDataProcessingRate
    expr: rate(wifi_densepose_data_processed_total[5m]) < 10
    for: 10m
    labels:
      severity: warning
      service: wifi-densepose
      team: product
    annotations:
      summary: "Low data processing rate"
      description: "Data processing rate is {{ $value }} items/second for the last 10 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/low-data-processing-rate"

  # Processing-error rate divided by processed rate, as a percentage; warns
  # above 5% for 5 minutes.
  - alert: HighDataProcessingErrors
    expr: |
      (
        rate(wifi_densepose_data_processing_errors_total[5m]) /
        rate(wifi_densepose_data_processed_total[5m])
      ) * 100 > 5
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: product
    annotations:
      summary: "High data processing error rate"
      description: "Data processing error rate is {{ $value }}% for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-data-processing-errors"

  # 95th-percentile model inference duration, estimated from the histogram
  # bucket rates, above 2 for 5 minutes.
  - alert: ModelInferenceLatencyHigh
    expr: |
      histogram_quantile(0.95,
        rate(wifi_densepose_model_inference_duration_seconds_bucket[5m])
      ) > 2
    for: 5m
    labels:
      severity: warning
      service: wifi-densepose
      team: ml
    annotations:
      summary: "High model inference latency"
      description: "95th percentile model inference latency is {{ $value }}s for the last 5 minutes."
      runbook_url: "https://docs.wifi-densepose.com/runbooks/high-model-inference-latency"