This commit is contained in:
rUv
2025-06-07 11:44:19 +00:00
parent 43e92c5494
commit c378b705ca
95 changed files with 43677 additions and 0 deletions

View File

@@ -0,0 +1,410 @@
# WiFi-DensePose Alerting Rules
# This file defines alerting rules for monitoring the WiFi-DensePose application
groups:
- name: wifi-densepose.application
  rules:
    # Application health: availability, HTTP error ratio, latency, traffic.

    # The application scrape target reports up == 0.
    - alert: ApplicationDown
      expr: up{job="wifi-densepose-app"} == 0
      for: 1m
      labels:
        severity: critical
        service: wifi-densepose
        team: platform
      annotations:
        summary: "WiFi-DensePose application is down"
        description: "WiFi-DensePose application on {{ $labels.instance }} has been down for more than 1 minute."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/application-down"

    # Share of 5xx responses over all responses (5m rate) above 5 %.
    - alert: HighErrorRate
      expr: |
        (
          sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
          sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "High error rate detected"
        description: "Error rate is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-error-rate"

    # Same ratio as HighErrorRate with a 10 % threshold and a shorter hold time.
    - alert: CriticalErrorRate
      expr: |
        (
          sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
          sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
        ) * 100 > 10
      for: 2m
      labels:
        severity: critical
        service: wifi-densepose
        team: platform
      annotations:
        summary: "Critical error rate detected"
        description: "Error rate is {{ $value }}% for the last 2 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/critical-error-rate"

    # 95th percentile of the request-duration histogram above 1 second.
    - alert: HighResponseTime
      expr: |
        histogram_quantile(0.95,
          sum(rate(http_request_duration_seconds_bucket{job="wifi-densepose-app"}[5m])) by (le)
        ) > 1
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "High response time detected"
        description: "95th percentile response time is {{ $value }}s for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-response-time"

    # Total request rate for the job below 1 request per second.
    - alert: LowRequestRate
      expr: sum(rate(http_requests_total{job="wifi-densepose-app"}[5m])) < 1
      for: 10m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "Low request rate detected"
        description: "Request rate is {{ $value }} requests/second for the last 10 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/low-request-rate"
- name: wifi-densepose.infrastructure
  rules:
    # Infrastructure alerts: container resources, pod health, deployments.

    # Per-pod CPU usage rate relative to the pod's CPU quota (quota / period).
    - alert: HighCPUUsage
      expr: |
        (
          sum(rate(container_cpu_usage_seconds_total{namespace=~"wifi-densepose.*",container!="POD"}[5m])) by (pod) /
          sum(container_spec_cpu_quota{namespace=~"wifi-densepose.*",container!="POD"} / container_spec_cpu_period{namespace=~"wifi-densepose.*",container!="POD"}) by (pod)
        ) * 100 > 80
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "High CPU usage detected"
        description: "Pod {{ $labels.pod }} CPU usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-cpu-usage"

    # Per-pod working-set memory relative to the pod's summed memory limit.
    # The "> 0" filter drops pods whose summed limit is 0 (no limit set);
    # without it the division yields +Inf and the alert fires permanently.
    - alert: HighMemoryUsage
      expr: |
        (
          sum(container_memory_working_set_bytes{namespace=~"wifi-densepose.*",container!="POD"}) by (pod) /
          (sum(container_spec_memory_limit_bytes{namespace=~"wifi-densepose.*",container!="POD"}) by (pod) > 0)
        ) * 100 > 80
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "High memory usage detected"
        description: "Pod {{ $labels.pod }} memory usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-memory-usage"

    # Container restart counter has been increasing over the last 5 minutes.
    - alert: PodCrashLooping
      expr: rate(kube_pod_container_status_restarts_total{namespace=~"wifi-densepose.*"}[5m]) > 0
      for: 5m
      labels:
        severity: critical
        service: wifi-densepose
        team: platform
      annotations:
        summary: "Pod is crash looping"
        description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-crash-looping"

    # Pod readiness condition "false" is set.
    - alert: PodNotReady
      expr: kube_pod_status_ready{namespace=~"wifi-densepose.*",condition="false"} == 1
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "Pod is not ready"
        description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been not ready for more than 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-not-ready"

    # Available replicas differ from the desired replica count.
    # A vector comparison keeps the LEFT operand's value, so the available
    # count is placed on the left to make {{ $value }} match the description.
    # The desired count is not a label, so it is not templated from $labels.
    - alert: DeploymentReplicasMismatch
      expr: |
        kube_deployment_status_replicas_available{namespace=~"wifi-densepose.*"} !=
        kube_deployment_spec_replicas{namespace=~"wifi-densepose.*"}
      for: 10m
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "Deployment replicas mismatch"
        description: "Deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has {{ $value }} available replicas, which does not match the desired replica count."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/deployment-replicas-mismatch"
- name: wifi-densepose.database
  rules:
    # Database alerts (PostgreSQL exporter and node filesystem metrics).

    # The exporter reports pg_up == 0.
    - alert: DatabaseDown
      expr: pg_up == 0
      for: 1m
      labels:
        severity: critical
        service: database
        team: platform
      annotations:
        summary: "PostgreSQL database is down"
        description: "PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/database-down"

    # Backends connected to the wifi_densepose database as a percentage of
    # max_connections. The left side carries a datname label that the right
    # side lacks, so a plain "/" (one-to-one match on all labels) returns
    # nothing. Aggregate to the instance and match on that label only.
    # NOTE(review): assumes one max_connections series per instance - confirm.
    - alert: HighDatabaseConnections
      expr: |
        (
          sum by (instance) (pg_stat_database_numbackends{datname="wifi_densepose"}) /
          on (instance) pg_settings_max_connections
        ) * 100 > 80
      for: 5m
      labels:
        severity: warning
        service: database
        team: platform
      annotations:
        summary: "High database connection usage"
        description: "Database connection usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-database-connections"

    # Longest transaction duration reported for the database exceeds 300 s.
    - alert: DatabaseSlowQueries
      expr: pg_stat_activity_max_tx_duration{datname="wifi_densepose"} > 300
      for: 2m
      labels:
        severity: warning
        service: database
        team: platform
      annotations:
        summary: "Slow database queries detected"
        description: "Longest running query has been active for {{ $value }} seconds."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/database-slow-queries"

    # Used share of the filesystem mounted at /var/lib/postgresql above 85 %.
    - alert: DatabaseDiskSpaceHigh
      expr: |
        (
          (node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} - node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
          node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"}
        ) * 100 > 85
      for: 5m
      labels:
        severity: warning
        service: database
        team: platform
      annotations:
        summary: "Database disk space usage high"
        description: "Database disk usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/database-disk-space-high"
- name: wifi-densepose.redis
  rules:
    # Redis alerts (redis exporter metrics).

    # The exporter reports redis_up == 0.
    - alert: RedisDown
      expr: redis_up == 0
      for: 1m
      labels:
        severity: critical
        service: redis
        team: platform
      annotations:
        summary: "Redis is down"
        description: "Redis on {{ $labels.instance }} has been down for more than 1 minute."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-down"

    # Used memory as a percentage of the configured maximum.
    # redis_memory_max_bytes is 0 when no maxmemory is configured; dividing by
    # it gives +Inf and a permanently firing alert, so instances without a
    # limit are excluded by the trailing "and ... > 0" clause.
    - alert: RedisHighMemoryUsage
      expr: |
        (
          redis_memory_used_bytes /
          redis_memory_max_bytes
        ) * 100 > 80
        and on (instance) redis_memory_max_bytes > 0
      for: 5m
      labels:
        severity: warning
        service: redis
        team: platform
      annotations:
        summary: "Redis high memory usage"
        description: "Redis memory usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-memory-usage"

    # More than 100 connected clients.
    - alert: RedisHighConnections
      expr: redis_connected_clients > 100
      for: 5m
      labels:
        severity: warning
        service: redis
        team: platform
      annotations:
        summary: "Redis high connection count"
        description: "Redis has {{ $value }} connected clients for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-connections"
- name: wifi-densepose.kubernetes
  rules:
    # Cluster-level alerts: node readiness, node resources, PVC binding.

    # The node's Ready=true condition series has value 0.
    - alert: KubernetesNodeNotReady
      expr: kube_node_status_condition{condition="Ready",status="true"} == 0
      for: 5m
      labels:
        severity: critical
        service: kubernetes
        team: platform
      annotations:
        summary: "Kubernetes node not ready"
        description: "Node {{ $labels.node }} has been not ready for more than 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-not-ready"

    # 1 minus the average idle CPU rate per instance, as a percentage.
    - alert: KubernetesNodeHighCPU
      expr: |
        (
          1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
        ) * 100 > 80
      for: 5m
      labels:
        severity: warning
        service: kubernetes
        team: platform
      annotations:
        summary: "Kubernetes node high CPU usage"
        description: "Node {{ $labels.instance }} CPU usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-cpu"

    # 1 minus the MemAvailable / MemTotal ratio, as a percentage.
    - alert: KubernetesNodeHighMemory
      expr: |
        (
          1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
        ) * 100 > 85
      for: 5m
      labels:
        severity: warning
        service: kubernetes
        team: platform
      annotations:
        summary: "Kubernetes node high memory usage"
        description: "Node {{ $labels.instance }} memory usage is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-memory"

    # Used share of every non-tmpfs filesystem above 85 %.
    - alert: KubernetesNodeDiskSpaceHigh
      expr: |
        (
          (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_free_bytes{fstype!="tmpfs"}) /
          node_filesystem_size_bytes{fstype!="tmpfs"}
        ) * 100 > 85
      for: 5m
      labels:
        severity: warning
        service: kubernetes
        team: platform
      annotations:
        summary: "Kubernetes node high disk usage"
        description: "Node {{ $labels.instance }} disk usage is {{ $value }}% on {{ $labels.mountpoint }}."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-disk-space-high"

    # A PersistentVolumeClaim reports phase "Pending".
    - alert: KubernetesPersistentVolumeClaimPending
      expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
      for: 5m
      labels:
        severity: warning
        service: kubernetes
        team: platform
      annotations:
        summary: "PersistentVolumeClaim pending"
        description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-pvc-pending"
- name: wifi-densepose.security
  rules:
    # Security alerts: auth failures and TLS certificate expiry.

    # More than 10 responses with status 401 on a series within 5 minutes.
    - alert: UnauthorizedAPIAccess
      expr: increase(http_requests_total{job="wifi-densepose-app",status="401"}[5m]) > 10
      for: 1m
      labels:
        severity: warning
        service: wifi-densepose
        team: security
      annotations:
        summary: "High number of unauthorized API access attempts"
        description: "{{ $value }} unauthorized access attempts in the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/unauthorized-api-access"

    # More than 20 responses with status 403 on a series within 5 minutes.
    - alert: SuspiciousActivity
      expr: increase(http_requests_total{job="wifi-densepose-app",status="403"}[5m]) > 20
      for: 1m
      labels:
        severity: critical
        service: wifi-densepose
        team: security
      annotations:
        summary: "Suspicious activity detected"
        description: "{{ $value }} forbidden access attempts in the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/suspicious-activity"

    # Seconds until the earliest certificate expiry, converted to days
    # (divided by 86400), is below 30.
    - alert: CertificateExpiringSoon
      expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
      for: 1h
      labels:
        severity: warning
        service: wifi-densepose
        team: platform
      annotations:
        summary: "SSL certificate expiring soon"
        description: "SSL certificate for {{ $labels.instance }} expires in {{ $value }} days."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/certificate-expiring-soon"
- name: wifi-densepose.business
  rules:
    # Business-logic alerts built on the application's own metrics.

    # Processed-items rate below 10 per second.
    - alert: LowDataProcessingRate
      expr: rate(wifi_densepose_data_processed_total[5m]) < 10
      for: 10m
      labels:
        severity: warning
        service: wifi-densepose
        team: product
      annotations:
        summary: "Low data processing rate"
        description: "Data processing rate is {{ $value }} items/second for the last 10 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/low-data-processing-rate"

    # Processing-error rate as a percentage of the processed-items rate.
    - alert: HighDataProcessingErrors
      expr: |
        (
          rate(wifi_densepose_data_processing_errors_total[5m]) /
          rate(wifi_densepose_data_processed_total[5m])
        ) * 100 > 5
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: product
      annotations:
        summary: "High data processing error rate"
        description: "Data processing error rate is {{ $value }}% for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-data-processing-errors"

    # 95th percentile of the inference-duration histogram above 2 seconds.
    - alert: ModelInferenceLatencyHigh
      expr: |
        histogram_quantile(0.95,
          rate(wifi_densepose_model_inference_duration_seconds_bucket[5m])
        ) > 2
      for: 5m
      labels:
        severity: warning
        service: wifi-densepose
        team: ml
      annotations:
        summary: "High model inference latency"
        description: "95th percentile model inference latency is {{ $value }}s for the last 5 minutes."
        runbook_url: "https://docs.wifi-densepose.com/runbooks/high-model-inference-latency"

View File

@@ -0,0 +1,472 @@
{
"dashboard": {
"id": null,
"title": "WiFi-DensePose Monitoring Dashboard",
"tags": ["wifi-densepose", "monitoring", "kubernetes"],
"style": "dark",
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 30,
"version": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
},
"templating": {
"list": [
{
"name": "namespace",
"type": "query",
"query": "label_values(kube_namespace_info, namespace)",
"refresh": 1,
"includeAll": true,
"allValue": ".*",
"multi": true,
"datasource": "Prometheus"
},
{
"name": "pod",
"type": "query",
"query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)",
"refresh": 1,
"includeAll": true,
"allValue": ".*",
"multi": true,
"datasource": "Prometheus"
},
{
"name": "instance",
"type": "query",
"query": "label_values(up, instance)",
"refresh": 1,
"includeAll": true,
"allValue": ".*",
"multi": true,
"datasource": "Prometheus"
}
]
},
"panels": [
{
"id": 1,
"title": "System Overview",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"collapsed": false
},
{
"id": 2,
"title": "Application Status",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 1},
"targets": [
{
"expr": "up{job=\"wifi-densepose-app\"}",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "green", "value": 1}
]
},
"mappings": [
{"options": {"0": {"text": "Down"}}, "type": "value"},
{"options": {"1": {"text": "Up"}}, "type": "value"}
]
}
},
"options": {
"reduceOptions": {
"values": false,
"calcs": ["lastNotNull"],
"fields": ""
},
"orientation": "auto",
"textMode": "auto",
"colorMode": "background"
}
},
{
"id": 3,
"title": "Request Rate",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 1},
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": {"mode": "palette-classic"},
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 100},
{"color": "red", "value": 1000}
]
}
}
}
},
{
"id": 4,
"title": "Error Rate",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 1},
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m])) * 100",
"legendFormat": "Error Rate %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {"mode": "thresholds"},
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 1},
{"color": "red", "value": 5}
]
}
}
}
},
{
"id": 5,
"title": "Response Time",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 1},
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
"legendFormat": "95th percentile",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"color": {"mode": "thresholds"},
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 0.5},
{"color": "red", "value": 1}
]
}
}
}
},
{
"id": 6,
"title": "Application Metrics",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 9},
"collapsed": false
},
{
"id": 7,
"title": "HTTP Request Rate",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m])) by (method, status)",
"legendFormat": "{{method}} {{status}}",
"refId": "A"
}
],
"yAxes": [
{"label": "Requests/sec", "min": 0},
{"show": false}
],
"xAxis": {"show": true},
"legend": {"show": true, "values": true, "current": true}
},
{
"id": 8,
"title": "Response Time Distribution",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
"legendFormat": "50th percentile",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
"legendFormat": "95th percentile",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
"legendFormat": "99th percentile",
"refId": "C"
}
],
"yAxes": [
{"label": "Response Time (s)", "min": 0},
{"show": false}
]
},
{
"id": 9,
"title": "Infrastructure Metrics",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18},
"collapsed": false
},
{
"id": 10,
"title": "CPU Usage",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 19},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod) * 100",
"legendFormat": "{{pod}}",
"refId": "A"
}
],
"yAxes": [
{"label": "CPU %", "min": 0, "max": 100},
{"show": false}
]
},
{
"id": 11,
"title": "Memory Usage",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 19},
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",pod=~\"$pod\"}) by (pod) / 1024 / 1024",
"legendFormat": "{{pod}}",
"refId": "A"
}
],
"yAxes": [
{"label": "Memory (MB)", "min": 0},
{"show": false}
]
},
{
"id": 12,
"title": "Network I/O",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 19},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod)",
"legendFormat": "{{pod}} RX",
"refId": "A"
},
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod)",
"legendFormat": "{{pod}} TX",
"refId": "B"
}
],
"yAxes": [
{"label": "Bytes/sec", "min": 0},
{"show": false}
]
},
{
"id": 13,
"title": "Database Metrics",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27},
"collapsed": false
},
{
"id": 14,
"title": "Database Connections",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 28},
"targets": [
{
"expr": "pg_stat_database_numbackends{datname=\"wifi_densepose\"}",
"legendFormat": "Active Connections",
"refId": "A"
},
{
"expr": "pg_settings_max_connections",
"legendFormat": "Max Connections",
"refId": "B"
}
],
"yAxes": [
{"label": "Connections", "min": 0},
{"show": false}
]
},
{
"id": 15,
"title": "Database Query Performance",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 28},
"targets": [
{
"expr": "rate(pg_stat_database_tup_fetched{datname=\"wifi_densepose\"}[5m])",
"legendFormat": "Tuples Fetched/sec",
"refId": "A"
},
{
"expr": "rate(pg_stat_database_tup_inserted{datname=\"wifi_densepose\"}[5m])",
"legendFormat": "Tuples Inserted/sec",
"refId": "B"
},
{
"expr": "rate(pg_stat_database_tup_updated{datname=\"wifi_densepose\"}[5m])",
"legendFormat": "Tuples Updated/sec",
"refId": "C"
}
],
"yAxes": [
{"label": "Operations/sec", "min": 0},
{"show": false}
]
},
{
"id": 16,
"title": "Redis Metrics",
"type": "graph",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 28},
"targets": [
{
"expr": "redis_connected_clients",
"legendFormat": "Connected Clients",
"refId": "A"
},
{
"expr": "rate(redis_total_commands_processed_total[5m])",
"legendFormat": "Commands/sec",
"refId": "B"
}
],
"yAxes": [
{"label": "Count", "min": 0},
{"show": false}
]
},
{
"id": 17,
"title": "Kubernetes Metrics",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 36},
"collapsed": false
},
{
"id": 18,
"title": "Pod Status",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 37},
"targets": [
{
"expr": "sum(kube_pod_status_phase{namespace=~\"$namespace\"}) by (phase)",
"legendFormat": "{{phase}}",
"refId": "A"
}
],
"yAxes": [
{"label": "Pod Count", "min": 0},
{"show": false}
]
},
{
"id": 19,
"title": "Node Resource Usage",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
"targets": [
{
"expr": "(1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100",
"legendFormat": "CPU Usage %",
"refId": "A"
},
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "Memory Usage %",
"refId": "B"
}
],
"yAxes": [
{"label": "Usage %", "min": 0, "max": 100},
{"show": false}
]
},
{
"id": 20,
"title": "Alerts and Logs",
"type": "row",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 45},
"collapsed": false
},
{
"id": 21,
"title": "Active Alerts",
"type": "table",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 46},
"targets": [
{
"expr": "ALERTS{alertstate=\"firing\"}",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"Time": true,
"job": true
},
"indexByName": {},
"renameByName": {
"alertname": "Alert",
"severity": "Severity",
"summary": "Summary",
"description": "Description"
}
}
}
]
}
],
"annotations": {
"list": [
{
"name": "Deployments",
"datasource": "Prometheus",
"expr": "increase(kube_deployment_status_observed_generation{namespace=~\"$namespace\"}[1m])",
"iconColor": "green",
"titleFormat": "Deployment: {{deployment}}"
}
]
}
},
"overwrite": true
}

View File

@@ -0,0 +1,325 @@
# Prometheus Configuration for WiFi-DensePose
# Top-level settings: global defaults, the Alertmanager target, the rule
# files to load, and the start of the scrape job list.
global:
  # Defaults used by jobs and rule groups that do not set their own interval.
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'wifi-densepose'
    environment: 'production'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Rule files are loaded once and evaluated every 'evaluation_interval'.
rule_files:
  - "alerting-rules.yml"
  - "recording-rules.yml"

# Scrape configuration
scrape_configs:
# Prometheus itself (self-scrape on the local port)
- job_name: 'prometheus'
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
    - targets: ['localhost:9090']
# Kubernetes API Server
- job_name: 'kubernetes-apiservers'
  kubernetes_sd_configs:
    - role: endpoints
      namespaces:
        names:
          - default
  scheme: https
  # The API server certificate is verified against the in-cluster CA below.
  # insecure_skip_verify was removed: it disabled that verification and made
  # the ca_file setting pointless.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
    # Keep only the https port of the "kubernetes" service in "default".
    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https
# Kubernetes Nodes (kubelet metrics, fetched through the API server proxy)
- job_name: 'kubernetes-nodes'
  kubernetes_sd_configs:
    - role: node
  scheme: https
  # Every scrape goes to kubernetes.default.svc:443 (see relabeling below),
  # so the certificate presented is the API server's and can be verified
  # against the in-cluster CA. insecure_skip_verify was removed for that reason.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
    # Copy node labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    # Send the request to the API server instead of the node address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Build the per-node proxy path from the node name.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/${1}/proxy/metrics
# Kubernetes Node Exporter
- job_name: 'kubernetes-node-exporter'
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    # Keep only endpoints objects named "node-exporter".
    - action: keep
      source_labels: [__meta_kubernetes_endpoints_name]
      regex: node-exporter
    # Expose the endpoint's target name as the "node" label.
    - source_labels: [__meta_kubernetes_endpoint_address_target_name]
      target_label: node
    # Copy service labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
# Kubernetes Pods (annotation-driven discovery)
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Scrape only pods annotated prometheus.io/scrape=true.
    - action: keep
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      regex: true
    # Use the prometheus.io/path annotation as metrics path when present.
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
    # Swap the port in the address for the prometheus.io/port annotation.
    - action: replace
      source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    # Copy pod labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - action: replace
      source_labels: [__meta_kubernetes_namespace]
      target_label: kubernetes_namespace
    - action: replace
      source_labels: [__meta_kubernetes_pod_name]
      target_label: kubernetes_pod_name
# WiFi-DensePose Application (production and staging namespaces)
- job_name: 'wifi-densepose-app'
  scrape_interval: 10s
  metrics_path: /metrics
  kubernetes_sd_configs:
    - role: pod
      namespaces:
        names:
          - wifi-densepose
          - wifi-densepose-staging
  relabel_configs:
    # Keep only pods labelled app=wifi-densepose ...
    - action: keep
      source_labels: [__meta_kubernetes_pod_label_app]
      regex: wifi-densepose
    # ... that are also annotated prometheus.io/scrape=true.
    - action: keep
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      regex: true
    # The prometheus.io/path annotation overrides the metrics path when set.
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
    # Swap the port in the address for the prometheus.io/port annotation.
    - action: replace
      source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    # Copy pod labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - action: replace
      source_labels: [__meta_kubernetes_namespace]
      target_label: kubernetes_namespace
    - action: replace
      source_labels: [__meta_kubernetes_pod_name]
      target_label: kubernetes_pod_name
# PostgreSQL Exporter
- job_name: 'postgres-exporter'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: service
      namespaces:
        names:
          - wifi-densepose
          - wifi-densepose-staging
  relabel_configs:
    # Keep services labelled app=postgres-exporter ...
    - action: keep
      source_labels: [__meta_kubernetes_service_label_app]
      regex: postgres-exporter
    # ... and of those only the port named "metrics".
    - action: keep
      source_labels: [__meta_kubernetes_service_port_name]
      regex: metrics
# Redis Exporter
- job_name: 'redis-exporter'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: service
      namespaces:
        names:
          - wifi-densepose
          - wifi-densepose-staging
  relabel_configs:
    # Keep services labelled app=redis-exporter ...
    - action: keep
      source_labels: [__meta_kubernetes_service_label_app]
      regex: redis-exporter
    # ... and of those only the port named "metrics".
    - action: keep
      source_labels: [__meta_kubernetes_service_port_name]
      regex: metrics
# NGINX Ingress Controller
- job_name: 'nginx-ingress'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: pod
      namespaces:
        names:
          - ingress-nginx
  relabel_configs:
    - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
      action: keep
      regex: ingress-nginx
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    # Keep the pod IP from __address__ and swap in the port taken from the
    # prometheus.io/port annotation. The previous rule used the annotation
    # value alone as the host part ("<port>:10254"), which dropped the pod IP
    # and produced an unreachable target.
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
# Kubernetes Services (probed through the blackbox exporter)
- job_name: 'kubernetes-services'
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
    - role: service
  relabel_configs:
    # Probe only services annotated prometheus.io/probe=true.
    - action: keep
      source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
      regex: true
    # The discovered address becomes the probe's "target" URL parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # The request itself is sent to the blackbox exporter.
    - target_label: __address__
      replacement: blackbox-exporter:9115
    # Report the probed target as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Copy service labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
# Blackbox Exporter for external endpoints
- job_name: 'blackbox-http'
  scrape_interval: 60s
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
    - targets:
        - https://wifi-densepose.com
        - https://staging.wifi-densepose.com
  relabel_configs:
    # The listed URL becomes the probe's "target" URL parameter ...
    - source_labels: [__address__]
      target_label: __param_target
    # ... and is also reported as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # The request itself is sent to the blackbox exporter.
    - target_label: __address__
      replacement: blackbox-exporter:9115
# cAdvisor for container metrics (fetched through the API server proxy)
- job_name: 'kubernetes-cadvisor'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: node
  scheme: https
  # Every scrape goes to kubernetes.default.svc:443 (see relabeling below),
  # so the certificate presented is the API server's and can be verified
  # against the in-cluster CA. insecure_skip_verify was removed for that reason.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
    # Copy node labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    # Send the request to the API server instead of the node address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Build the per-node cAdvisor proxy path from the node name.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
# Kube State Metrics
- job_name: 'kube-state-metrics'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: service
      namespaces:
        names:
          - kube-system
  relabel_configs:
    # Keep only services labelled app.kubernetes.io/name=kube-state-metrics.
    - action: keep
      source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
      regex: kube-state-metrics
# CoreDNS
- job_name: 'coredns'
  scrape_interval: 30s
  kubernetes_sd_configs:
    - role: pod
      namespaces:
        names:
          - kube-system
  relabel_configs:
    # Keep pods labelled k8s-app=kube-dns ...
    - action: keep
      source_labels: [__meta_kubernetes_pod_label_k8s_app]
      regex: kube-dns
    # ... and of those only the container port named "metrics".
    - action: keep
      source_labels: [__meta_kubernetes_pod_container_port_name]
      regex: metrics
# Kubernetes Ingress (probed through the blackbox exporter)
- job_name: 'kubernetes-ingresses'
  # The relabeling below redirects every request to the blackbox exporter,
  # so the probe endpoint and module must be set. Without them the job
  # scraped the exporter's own /metrics page instead of probing the ingress.
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
    - role: ingress
  relabel_configs:
    # Probe only ingresses annotated prometheus.io/probe=true.
    - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
      action: keep
      regex: true
    # Assemble scheme://host/path as the probe's "target" URL parameter.
    - source_labels: [__meta_kubernetes_ingress_scheme, __address__, __meta_kubernetes_ingress_path]
      regex: (.+);(.+);(.+)
      replacement: ${1}://${2}${3}
      target_label: __param_target
    # The request itself is sent to the blackbox exporter.
    - target_label: __address__
      replacement: blackbox-exporter:9115
    # Report the probed URL as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Copy ingress labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_ingress_label_(.+)
# Remote write configuration for long-term storage
remote_write:
  - url: "https://prometheus-remote-write.monitoring.svc.cluster.local/api/v1/write"
    # Series whose metric name starts with "go_" are not forwarded.
    write_relabel_configs:
      - action: drop
        source_labels: [__name__]
        regex: 'go_.*'
    queue_config:
      capacity: 2500
      max_shards: 200
      max_samples_per_send: 1000
# Storage and feature flags
#
# The keys previously listed here (storage.tsdb "retention.time",
# "retention.size", "wal-compression" and a top-level "feature_flags" list)
# are not configuration-file settings. Prometheus parses this file strictly
# and refuses to load it when it contains unknown fields. Set the same values
# as command-line flags on the Prometheus server instead:
#
#   --storage.tsdb.retention.time=15d
#   --storage.tsdb.retention.size=50GB
#   --storage.tsdb.wal-compression
#   --enable-feature=promql-at-modifier
#   --web.enable-remote-write-receiver
#
# NOTE(review): on recent releases the @ modifier is always enabled, and the
# "remote-write-receiver" feature flag was replaced by
# --web.enable-remote-write-receiver - confirm against the deployed version.