updates
This commit is contained in:
410
monitoring/alerting-rules.yml
Normal file
410
monitoring/alerting-rules.yml
Normal file
@@ -0,0 +1,410 @@
|
||||
# WiFi-DensePose Alerting Rules
|
||||
# This file defines alerting rules for monitoring the WiFi-DensePose application
|
||||
|
||||
groups:
|
||||
- name: wifi-densepose.application
|
||||
rules:
|
||||
# Application Health Alerts
|
||||
# Fires when the `up` series for job wifi-densepose-app has been 0 for 1 minute.
- alert: ApplicationDown
  expr: up{job="wifi-densepose-app"} == 0
  for: 1m
  labels:
    severity: critical
    service: wifi-densepose
    team: platform
  annotations:
    summary: "WiFi-DensePose application is down"
    description: "WiFi-DensePose application on {{ $labels.instance }} has been down for more than 1 minute."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/application-down"
|
||||
|
||||
# Fires when 5xx responses exceed 5% of all requests (5m rate) for 5 minutes.
- alert: HighErrorRate
  expr: |
    (
      sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
      sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
    ) * 100 > 5
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "High error rate detected"
    description: "Error rate is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-error-rate"
|
||||
|
||||
# Critical tier of the 5xx ratio check: above 10% (5m rate) sustained for 2 minutes.
- alert: CriticalErrorRate
  expr: |
    (
      sum(rate(http_requests_total{job="wifi-densepose-app",status=~"5.."}[5m])) /
      sum(rate(http_requests_total{job="wifi-densepose-app"}[5m]))
    ) * 100 > 10
  for: 2m
  labels:
    severity: critical
    service: wifi-densepose
    team: platform
  annotations:
    summary: "Critical error rate detected"
    description: "Error rate is {{ $value }}% for the last 2 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/critical-error-rate"
|
||||
|
||||
# Fires when the 95th percentile of http_request_duration_seconds
# (5m bucket rate, aggregated by `le`) stays above 1 for 5 minutes.
- alert: HighResponseTime
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{job="wifi-densepose-app"}[5m])) by (le)
    ) > 1
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "High response time detected"
    description: "95th percentile response time is {{ $value }}s for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-response-time"
|
||||
|
||||
# Fires when the total request rate (5m) stays below 1 request/second for 10 minutes.
- alert: LowRequestRate
  expr: sum(rate(http_requests_total{job="wifi-densepose-app"}[5m])) < 1
  for: 10m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "Low request rate detected"
    description: "Request rate is {{ $value }} requests/second for the last 10 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/low-request-rate"
|
||||
|
||||
- name: wifi-densepose.infrastructure
|
||||
rules:
|
||||
# Infrastructure Alerts
|
||||
# Per-pod CPU usage rate (5m) as a percentage of the pod's summed quota/period ratio;
# fires above 80% for 5 minutes.
- alert: HighCPUUsage
  expr: |
    (
      sum(rate(container_cpu_usage_seconds_total{namespace=~"wifi-densepose.*",container!="POD"}[5m])) by (pod) /
      sum(container_spec_cpu_quota{namespace=~"wifi-densepose.*",container!="POD"} / container_spec_cpu_period{namespace=~"wifi-densepose.*",container!="POD"}) by (pod)
    ) * 100 > 80
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "High CPU usage detected"
    description: "Pod {{ $labels.pod }} CPU usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-cpu-usage"
|
||||
|
||||
# Fires when a pod's summed working-set memory exceeds 80% of its summed memory limit
# for 5 minutes.
# The `> 0` filter on the denominator drops pods whose summed limit is 0 (cAdvisor reports
# 0 when no limit is configured); without it the division yields +Inf and the alert fires
# for every unlimited pod.
- alert: HighMemoryUsage
  expr: |
    (
      sum(container_memory_working_set_bytes{namespace=~"wifi-densepose.*",container!="POD"}) by (pod) /
      (sum(container_spec_memory_limit_bytes{namespace=~"wifi-densepose.*",container!="POD"}) by (pod) > 0)
    ) * 100 > 80
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "High memory usage detected"
    description: "Pod {{ $labels.pod }} memory usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-memory-usage"
|
||||
|
||||
# Fires when a container's restart counter keeps increasing (non-zero 5m rate) for 5 minutes.
- alert: PodCrashLooping
  expr: rate(kube_pod_container_status_restarts_total{namespace=~"wifi-densepose.*"}[5m]) > 0
  for: 5m
  labels:
    severity: critical
    service: wifi-densepose
    team: platform
  annotations:
    summary: "Pod is crash looping"
    description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-crash-looping"
|
||||
|
||||
# Fires when a pod's ready condition has reported "false" for 5 minutes.
- alert: PodNotReady
  expr: kube_pod_status_ready{namespace=~"wifi-densepose.*",condition="false"} == 1
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "Pod is not ready"
    description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been not ready for more than 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/pod-not-ready"
|
||||
|
||||
# Fires when a deployment's available replica count differs from its desired (spec)
# replica count for 10 minutes.
# A PromQL comparison returns the LEFT operand's sample, so the available-replicas metric
# is placed on the left to make {{ $value }} the available count quoted in the description.
# The desired count is not exposed as a label, so the description no longer references one.
- alert: DeploymentReplicasMismatch
  expr: |
    kube_deployment_status_replicas_available{namespace=~"wifi-densepose.*"} !=
    kube_deployment_spec_replicas{namespace=~"wifi-densepose.*"}
  for: 10m
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "Deployment replicas mismatch"
    description: "Deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has {{ $value }} available replicas, which does not match the desired replica count."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/deployment-replicas-mismatch"
|
||||
|
||||
- name: wifi-densepose.database
|
||||
rules:
|
||||
# Database Alerts
|
||||
# Fires when pg_up has been 0 for 1 minute.
- alert: DatabaseDown
  expr: pg_up == 0
  for: 1m
  labels:
    severity: critical
    service: database
    team: platform
  annotations:
    summary: "PostgreSQL database is down"
    description: "PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/database-down"
|
||||
|
||||
# Fires when backends connected to the wifi_densepose database exceed 80% of the server's
# max_connections setting for 5 minutes.
# `on (instance)` is required: the numerator carries per-database labels (e.g. datname)
# that pg_settings_max_connections lacks, so default one-to-one matching on all labels
# would return no series and the alert could never fire.
- alert: HighDatabaseConnections
  expr: |
    (
      pg_stat_database_numbackends{datname="wifi_densepose"} /
      on (instance) pg_settings_max_connections
    ) * 100 > 80
  for: 5m
  labels:
    severity: warning
    service: database
    team: platform
  annotations:
    summary: "High database connection usage"
    description: "Database connection usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-database-connections"
|
||||
|
||||
# Fires when the longest transaction duration reported for wifi_densepose stays above 300
# (seconds, per the description) for 2 minutes.
- alert: DatabaseSlowQueries
  expr: pg_stat_activity_max_tx_duration{datname="wifi_densepose"} > 300
  for: 2m
  labels:
    severity: warning
    service: database
    team: platform
  annotations:
    summary: "Slow database queries detected"
    description: "Longest running query has been active for {{ $value }} seconds."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/database-slow-queries"
|
||||
|
||||
# Used share (size minus free, over size) of the filesystem mounted at /var/lib/postgresql;
# fires above 85% for 5 minutes.
- alert: DatabaseDiskSpaceHigh
  expr: |
    (
      (node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} - node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
      node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"}
    ) * 100 > 85
  for: 5m
  labels:
    severity: warning
    service: database
    team: platform
  annotations:
    summary: "Database disk space usage high"
    description: "Database disk usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/database-disk-space-high"
|
||||
|
||||
- name: wifi-densepose.redis
|
||||
rules:
|
||||
# Redis Alerts
|
||||
# Fires when redis_up has been 0 for 1 minute.
- alert: RedisDown
  expr: redis_up == 0
  for: 1m
  labels:
    severity: critical
    service: redis
    team: platform
  annotations:
    summary: "Redis is down"
    description: "Redis on {{ $labels.instance }} has been down for more than 1 minute."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-down"
|
||||
|
||||
# Fires when Redis used memory exceeds 80% of its configured maximum for 5 minutes.
# The `> 0` filter skips instances where redis_memory_max_bytes is 0 (reported when no
# maxmemory is configured); dividing by 0 would yield +Inf and keep the alert firing.
- alert: RedisHighMemoryUsage
  expr: |
    (
      redis_memory_used_bytes /
      (redis_memory_max_bytes > 0)
    ) * 100 > 80
  for: 5m
  labels:
    severity: warning
    service: redis
    team: platform
  annotations:
    summary: "Redis high memory usage"
    description: "Redis memory usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-memory-usage"
|
||||
|
||||
# Fires when more than 100 clients stay connected to Redis for 5 minutes.
- alert: RedisHighConnections
  expr: redis_connected_clients > 100
  for: 5m
  labels:
    severity: warning
    service: redis
    team: platform
  annotations:
    summary: "Redis high connection count"
    description: "Redis has {{ $value }} connected clients for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/redis-high-connections"
|
||||
|
||||
- name: wifi-densepose.kubernetes
|
||||
rules:
|
||||
# Kubernetes Cluster Alerts
|
||||
# Fires when a node's Ready=true condition series has been 0 for 5 minutes.
- alert: KubernetesNodeNotReady
  expr: kube_node_status_condition{condition="Ready",status="true"} == 0
  for: 5m
  labels:
    severity: critical
    service: kubernetes
    team: platform
  annotations:
    summary: "Kubernetes node not ready"
    description: "Node {{ $labels.node }} has been not ready for more than 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-not-ready"
|
||||
|
||||
# Node CPU busy percentage, computed as 1 minus the per-instance average idle rate (5m);
# fires above 80% for 5 minutes.
- alert: KubernetesNodeHighCPU
  expr: |
    (
      1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
    ) * 100 > 80
  for: 5m
  labels:
    severity: warning
    service: kubernetes
    team: platform
  annotations:
    summary: "Kubernetes node high CPU usage"
    description: "Node {{ $labels.instance }} CPU usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-cpu"
|
||||
|
||||
# Node memory usage, computed as 1 minus MemAvailable/MemTotal; fires above 85% for 5 minutes.
- alert: KubernetesNodeHighMemory
  expr: |
    (
      1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
    ) * 100 > 85
  for: 5m
  labels:
    severity: warning
    service: kubernetes
    team: platform
  annotations:
    summary: "Kubernetes node high memory usage"
    description: "Node {{ $labels.instance }} memory usage is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-high-memory"
|
||||
|
||||
# Used share (size minus free, over size) of every non-tmpfs filesystem;
# fires above 85% for 5 minutes.
- alert: KubernetesNodeDiskSpaceHigh
  expr: |
    (
      (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_free_bytes{fstype!="tmpfs"}) /
      node_filesystem_size_bytes{fstype!="tmpfs"}
    ) * 100 > 85
  for: 5m
  labels:
    severity: warning
    service: kubernetes
    team: platform
  annotations:
    summary: "Kubernetes node high disk usage"
    description: "Node {{ $labels.instance }} disk usage is {{ $value }}% on {{ $labels.mountpoint }}."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-node-disk-space-high"
|
||||
|
||||
# Fires when a PersistentVolumeClaim has been in the Pending phase for 5 minutes.
- alert: KubernetesPersistentVolumeClaimPending
  expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
  for: 5m
  labels:
    severity: warning
    service: kubernetes
    team: platform
  annotations:
    summary: "PersistentVolumeClaim pending"
    description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/kubernetes-pvc-pending"
|
||||
|
||||
- name: wifi-densepose.security
|
||||
rules:
|
||||
# Security Alerts
|
||||
# Fires when a series of HTTP 401 responses grows by more than 10 within 5 minutes,
# sustained for 1 minute.
- alert: UnauthorizedAPIAccess
  expr: increase(http_requests_total{job="wifi-densepose-app",status="401"}[5m]) > 10
  for: 1m
  labels:
    severity: warning
    service: wifi-densepose
    team: security
  annotations:
    summary: "High number of unauthorized API access attempts"
    description: "{{ $value }} unauthorized access attempts in the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/unauthorized-api-access"
|
||||
|
||||
# Fires when a series of HTTP 403 responses grows by more than 20 within 5 minutes,
# sustained for 1 minute.
- alert: SuspiciousActivity
  expr: increase(http_requests_total{job="wifi-densepose-app",status="403"}[5m]) > 20
  for: 1m
  labels:
    severity: critical
    service: wifi-densepose
    team: security
  annotations:
    summary: "Suspicious activity detected"
    description: "{{ $value }} forbidden access attempts in the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/suspicious-activity"
|
||||
|
||||
# Days until the earliest probed certificate expiry (seconds remaining / 86400);
# fires when fewer than 30 days remain, sustained for 1 hour.
- alert: CertificateExpiringSoon
  expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
  for: 1h
  labels:
    severity: warning
    service: wifi-densepose
    team: platform
  annotations:
    summary: "SSL certificate expiring soon"
    description: "SSL certificate for {{ $labels.instance }} expires in {{ $value }} days."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/certificate-expiring-soon"
|
||||
|
||||
- name: wifi-densepose.business
|
||||
rules:
|
||||
# Business Logic Alerts
|
||||
# Fires when the processed-items rate (5m) stays below 10 per second for 10 minutes.
- alert: LowDataProcessingRate
  expr: rate(wifi_densepose_data_processed_total[5m]) < 10
  for: 10m
  labels:
    severity: warning
    service: wifi-densepose
    team: product
  annotations:
    summary: "Low data processing rate"
    description: "Data processing rate is {{ $value }} items/second for the last 10 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/low-data-processing-rate"
|
||||
|
||||
# Processing-error rate as a percentage of the processed rate (both over 5m);
# fires above 5% for 5 minutes.
- alert: HighDataProcessingErrors
  expr: |
    (
      rate(wifi_densepose_data_processing_errors_total[5m]) /
      rate(wifi_densepose_data_processed_total[5m])
    ) * 100 > 5
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: product
  annotations:
    summary: "High data processing error rate"
    description: "Data processing error rate is {{ $value }}% for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-data-processing-errors"
|
||||
|
||||
# Fires when the 95th percentile of model inference duration (5m bucket rate)
# stays above 2 for 5 minutes.
- alert: ModelInferenceLatencyHigh
  expr: |
    histogram_quantile(0.95,
      rate(wifi_densepose_model_inference_duration_seconds_bucket[5m])
    ) > 2
  for: 5m
  labels:
    severity: warning
    service: wifi-densepose
    team: ml
  annotations:
    summary: "High model inference latency"
    description: "95th percentile model inference latency is {{ $value }}s for the last 5 minutes."
    runbook_url: "https://docs.wifi-densepose.com/runbooks/high-model-inference-latency"
|
||||
472
monitoring/grafana-dashboard.json
Normal file
472
monitoring/grafana-dashboard.json
Normal file
@@ -0,0 +1,472 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "WiFi-DensePose Monitoring Dashboard",
|
||||
"tags": ["wifi-densepose", "monitoring", "kubernetes"],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 30,
|
||||
"version": 1,
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"query": "label_values(kube_namespace_info, namespace)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"datasource": "Prometheus"
|
||||
},
|
||||
{
|
||||
"name": "pod",
|
||||
"type": "query",
|
||||
"query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"datasource": "Prometheus"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"query": "label_values(up, instance)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"datasource": "Prometheus"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "System Overview",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Application Status",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 1},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=\"wifi-densepose-app\"}",
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "green", "value": 1}
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{"options": {"0": {"text": "Down"}}, "type": "value"},
|
||||
{"options": {"1": {"text": "Up"}}, "type": "value"}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": ""
|
||||
},
|
||||
"orientation": "auto",
|
||||
"textMode": "auto",
|
||||
"colorMode": "background"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Request Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 1},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m]))",
|
||||
"legendFormat": "Requests/sec",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"color": {"mode": "palette-classic"},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "green", "value": 0},
|
||||
{"color": "yellow", "value": 100},
|
||||
{"color": "red", "value": 1000}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Error Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 1},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m])) * 100",
|
||||
"legendFormat": "Error Rate %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "green", "value": 0},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 5}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Response Time",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 1},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
|
||||
"legendFormat": "95th percentile",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "green", "value": 0},
|
||||
{"color": "yellow", "value": 0.5},
|
||||
{"color": "red", "value": 1}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Application Metrics",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 9},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "HTTP Request Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"wifi-densepose-app\"}[5m])) by (method, status)",
|
||||
"legendFormat": "{{method}} {{status}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Requests/sec", "min": 0},
|
||||
{"show": false}
|
||||
],
|
||||
"xAxis": {"show": true},
|
||||
"legend": {"show": true, "values": true, "current": true}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Response Time Distribution",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
|
||||
"legendFormat": "50th percentile",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
|
||||
"legendFormat": "95th percentile",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"wifi-densepose-app\"}[5m])) by (le))",
|
||||
"legendFormat": "99th percentile",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Response Time (s)", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Infrastructure Metrics",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "CPU Usage",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 19},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod) * 100",
|
||||
"legendFormat": "{{pod}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "CPU %", "min": 0, "max": 100},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Memory Usage",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 19},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",pod=~\"$pod\"}) by (pod) / 1024 / 1024",
|
||||
"legendFormat": "{{pod}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Memory (MB)", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Network I/O",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 19},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod)",
|
||||
"legendFormat": "{{pod}} RX",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod=~\"$pod\"}[5m])) by (pod)",
|
||||
"legendFormat": "{{pod}} TX",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Bytes/sec", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Database Metrics",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "Database Connections",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 28},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_stat_database_numbackends{datname=\"wifi_densepose\"}",
|
||||
"legendFormat": "Active Connections",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "pg_settings_max_connections",
|
||||
"legendFormat": "Max Connections",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Connections", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"title": "Database Query Performance",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 28},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_fetched{datname=\"wifi_densepose\"}[5m])",
|
||||
"legendFormat": "Tuples Fetched/sec",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_inserted{datname=\"wifi_densepose\"}[5m])",
|
||||
"legendFormat": "Tuples Inserted/sec",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_updated{datname=\"wifi_densepose\"}[5m])",
|
||||
"legendFormat": "Tuples Updated/sec",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Operations/sec", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"title": "Redis Metrics",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 28},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "redis_connected_clients",
|
||||
"legendFormat": "Connected Clients",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(redis_total_commands_processed_total[5m])",
|
||||
"legendFormat": "Commands/sec",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Count", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"title": "Kubernetes Metrics",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 36},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"title": "Pod Status",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 37},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_status_phase{namespace=~\"$namespace\"}) by (phase)",
|
||||
"legendFormat": "{{phase}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Pod Count", "min": 0},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"title": "Node Resource Usage",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100",
|
||||
"legendFormat": "CPU Usage %",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
|
||||
"legendFormat": "Memory Usage %",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"yAxes": [
|
||||
{"label": "Usage %", "min": 0, "max": 100},
|
||||
{"show": false}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"title": "Alerts and Logs",
|
||||
"type": "row",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 45},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"title": "Active Alerts",
|
||||
"type": "table",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 46},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ALERTS{alertstate=\"firing\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"__name__": true,
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"alertname": "Alert",
|
||||
"severity": "Severity",
|
||||
"summary": "Summary",
|
||||
"description": "Description"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"name": "Deployments",
|
||||
"datasource": "Prometheus",
|
||||
"expr": "increase(kube_deployment_status_observed_generation{namespace=~\"$namespace\"}[1m])",
|
||||
"iconColor": "green",
|
||||
"titleFormat": "Deployment: {{deployment}}"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overwrite": true
|
||||
}
|
||||
325
monitoring/prometheus-config.yml
Normal file
325
monitoring/prometheus-config.yml
Normal file
@@ -0,0 +1,325 @@
|
||||
# Prometheus Configuration for WiFi-DensePose
|
||||
# This configuration sets up comprehensive monitoring for the WiFi-DensePose application
|
||||
|
||||
# Defaults inherited by every scrape job and rule group in this file
global:
  evaluation_interval: 15s
  scrape_interval: 15s
  # Static labels identifying this Prometheus server
  external_labels:
    environment: production
    cluster: wifi-densepose
|
||||
|
||||
# Alertmanager instances that receive alerts fired by the rule files
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'
|
||||
|
||||
# Rule files are loaded once, then evaluated periodically at the global
# 'evaluation_interval'.
rule_files:
- 'alerting-rules.yml'
- 'recording-rules.yml'
|
||||
|
||||
# Scrape configuration
|
||||
scrape_configs:
|
||||
# Self-monitoring: Prometheus scrapes its own metrics endpoint
- job_name: prometheus
  metrics_path: /metrics
  scrape_interval: 30s
  static_configs:
  - targets:
    - 'localhost:9090'
|
||||
|
||||
# Kubernetes API Server
- job_name: 'kubernetes-apiservers'
  scheme: https
  kubernetes_sd_configs:
  - role: endpoints
    namespaces:
      names:
      - default
  tls_config:
    # The server certificate is verified against the cluster CA mounted with
    # the service account. insecure_skip_verify is intentionally NOT set:
    # enabling it would make ca_file meaningless and would send the bearer
    # token below to an unauthenticated peer.
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  # Keep only the "https" port of the "kubernetes" service in "default"
  - source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    action: keep
    regex: 'default;kubernetes;https'
|
||||
|
||||
# Kubernetes Nodes (kubelet metrics, fetched through the API server proxy)
- job_name: 'kubernetes-nodes'
  scheme: https
  kubernetes_sd_configs:
  - role: node
  tls_config:
    # Every target is rewritten to kubernetes.default.svc:443 below, so the
    # TLS peer is always the API server and the cluster CA is sufficient.
    # insecure_skip_verify is intentionally NOT set: it would disable
    # verification and expose the bearer token to an unauthenticated peer.
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  # Copy Kubernetes node labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_node_label_(.+)'
  # Send every scrape to the API server instead of the node itself
  - target_label: __address__
    replacement: 'kubernetes.default.svc:443'
  # Build the per-node proxy path from the node name
  - source_labels: [__meta_kubernetes_node_name]
    regex: '(.+)'
    target_label: __metrics_path__
    replacement: '/api/v1/nodes/${1}/proxy/metrics'
|
||||
|
||||
# Kubernetes Node Exporter
- job_name: kubernetes-node-exporter
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # Keep only endpoints objects named "node-exporter"
  - action: keep
    source_labels:
    - __meta_kubernetes_endpoints_name
    regex: 'node-exporter'
  # Copy the endpoint address's target name into the "node" label
  - source_labels:
    - __meta_kubernetes_endpoint_address_target_name
    target_label: node
  # Copy Kubernetes service labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_service_label_(.+)'
|
||||
|
||||
# Kubernetes Pods (opt-in through prometheus.io/* pod annotations)
- job_name: kubernetes-pods
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Scrape only pods whose prometheus.io/scrape annotation is "true"
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
    regex: 'true'
  # A non-empty prometheus.io/path annotation overrides the metrics path
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_path
    regex: '(.+)'
    target_label: __metrics_path__
  # When prometheus.io/port is set, swap it in as the port of the address
  - action: replace
    source_labels:
    - __address__
    - __meta_kubernetes_pod_annotation_prometheus_io_port
    regex: '([^:]+)(?::\d+)?;(\d+)'
    replacement: '$1:$2'
    target_label: __address__
  # Copy Kubernetes pod labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: kubernetes_namespace
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_name
    target_label: kubernetes_pod_name
|
||||
|
||||
# WiFi-DensePose Application (production and staging namespaces)
- job_name: wifi-densepose-app
  scrape_interval: 10s
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - wifi-densepose
      - wifi-densepose-staging
  relabel_configs:
  # Keep only pods labelled app=wifi-densepose
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: 'wifi-densepose'
  # ... that also carry the prometheus.io/scrape annotation set to "true"
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
    regex: 'true'
  # A non-empty prometheus.io/path annotation overrides the metrics path
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_path
    regex: '(.+)'
    target_label: __metrics_path__
  # When prometheus.io/port is set, swap it in as the port of the address
  - action: replace
    source_labels:
    - __address__
    - __meta_kubernetes_pod_annotation_prometheus_io_port
    regex: '([^:]+)(?::\d+)?;(\d+)'
    replacement: '$1:$2'
    target_label: __address__
  # Copy Kubernetes pod labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: kubernetes_namespace
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_name
    target_label: kubernetes_pod_name
|
||||
|
||||
# PostgreSQL Exporter
- job_name: postgres-exporter
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: service
    namespaces:
      names:
      - wifi-densepose
      - wifi-densepose-staging
  relabel_configs:
  # Keep only services labelled app=postgres-exporter
  - action: keep
    source_labels:
    - __meta_kubernetes_service_label_app
    regex: 'postgres-exporter'
  # ... and of those, only the port named "metrics"
  - action: keep
    source_labels:
    - __meta_kubernetes_service_port_name
    regex: 'metrics'
|
||||
|
||||
# Redis Exporter
- job_name: redis-exporter
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: service
    namespaces:
      names:
      - wifi-densepose
      - wifi-densepose-staging
  relabel_configs:
  # Keep only services labelled app=redis-exporter
  - action: keep
    source_labels:
    - __meta_kubernetes_service_label_app
    regex: 'redis-exporter'
  # ... and of those, only the port named "metrics"
  - action: keep
    source_labels:
    - __meta_kubernetes_service_port_name
    regex: 'metrics'
|
||||
|
||||
# NGINX Ingress Controller
- job_name: 'nginx-ingress'
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - ingress-nginx
  relabel_configs:
  # Keep only pods labelled app.kubernetes.io/name=ingress-nginx
  - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
    action: keep
    regex: 'ingress-nginx'
  # ... that also carry the prometheus.io/scrape annotation set to "true"
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: 'true'
  # Point the scrape at port 10254 on the pod. The host part must come from
  # __address__; sourcing it from the port annotation (as before) produced an
  # address of the form "<port>:10254", which is not a reachable target.
  - source_labels: [__address__]
    action: replace
    regex: '([^:]+)(?::\d+)?'
    replacement: '$1:10254'
    target_label: __address__
|
||||
|
||||
# Kubernetes Services (HTTP probes issued through the blackbox exporter)
- job_name: kubernetes-services
  metrics_path: /probe
  params:
    module:
    - http_2xx
  kubernetes_sd_configs:
  - role: service
  relabel_configs:
  # Probe only services whose prometheus.io/probe annotation is "true"
  - action: keep
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_probe
    regex: 'true'
  # The discovered service address becomes the probe's "target" parameter
  - source_labels:
    - __address__
    target_label: __param_target
  # The scrape itself is sent to the blackbox exporter
  - target_label: __address__
    replacement: 'blackbox-exporter:9115'
  # Report the probed address, not the exporter, as the instance
  - source_labels:
    - __param_target
    target_label: instance
  # Copy Kubernetes service labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_service_label_(.+)'
|
||||
|
||||
# Blackbox Exporter for external endpoints
- job_name: blackbox-http
  scrape_interval: 60s
  metrics_path: /probe
  params:
    module:
    - http_2xx
  static_configs:
  - targets:
    - 'https://wifi-densepose.com'
    - 'https://staging.wifi-densepose.com'
  relabel_configs:
  # The listed URL becomes the probe's "target" parameter
  - source_labels:
    - __address__
    target_label: __param_target
  # Report the probed URL as the instance
  - source_labels:
    - __param_target
    target_label: instance
  # The scrape itself is sent to the blackbox exporter
  - target_label: __address__
    replacement: 'blackbox-exporter:9115'
|
||||
|
||||
# cAdvisor for container metrics (fetched through the API server proxy)
- job_name: 'kubernetes-cadvisor'
  scheme: https
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: node
  tls_config:
    # Every target is rewritten to kubernetes.default.svc:443 below, so the
    # TLS peer is always the API server and the cluster CA is sufficient.
    # insecure_skip_verify is intentionally NOT set: it would disable
    # verification and expose the bearer token to an unauthenticated peer.
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  # Copy Kubernetes node labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_node_label_(.+)'
  # Send every scrape to the API server instead of the node itself
  - target_label: __address__
    replacement: 'kubernetes.default.svc:443'
  # Build the per-node cAdvisor proxy path from the node name
  - source_labels: [__meta_kubernetes_node_name]
    regex: '(.+)'
    target_label: __metrics_path__
    replacement: '/api/v1/nodes/${1}/proxy/metrics/cadvisor'
|
||||
|
||||
# Kube State Metrics
- job_name: kube-state-metrics
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: service
    namespaces:
      names:
      - kube-system
  relabel_configs:
  # Keep only services labelled app.kubernetes.io/name=kube-state-metrics
  - action: keep
    source_labels:
    - __meta_kubernetes_service_label_app_kubernetes_io_name
    regex: 'kube-state-metrics'
|
||||
|
||||
# CoreDNS
- job_name: coredns
  scrape_interval: 30s
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - kube-system
  relabel_configs:
  # Keep only pods labelled k8s-app=kube-dns
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_k8s_app
    regex: 'kube-dns'
  # ... and of those, only the container port named "metrics"
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_container_port_name
    regex: 'metrics'
|
||||
|
||||
# Kubernetes Ingress (HTTP probes issued through the blackbox exporter)
- job_name: 'kubernetes-ingresses'
  # The relabeling below redirects every scrape to the blackbox exporter, so
  # the request must hit its /probe endpoint with a probe module selected.
  # Without these two settings the scrape would fetch the exporter's own
  # /metrics and no ingress would actually be probed.
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
  - role: ingress
  relabel_configs:
  # Probe only ingresses whose prometheus.io/probe annotation is "true"
  - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
    action: keep
    regex: 'true'
  # Assemble "<scheme>://<host><path>" as the probe's "target" parameter
  - source_labels:
    - __meta_kubernetes_ingress_scheme
    - __address__
    - __meta_kubernetes_ingress_path
    regex: '(.+);(.+);(.+)'
    replacement: '${1}://${2}${3}'
    target_label: __param_target
  # The scrape itself is sent to the blackbox exporter
  - target_label: __address__
    replacement: 'blackbox-exporter:9115'
  # Report the probed URL, not the exporter, as the instance
  - source_labels: [__param_target]
    target_label: instance
  # Copy Kubernetes ingress labels onto the target
  - action: labelmap
    regex: '__meta_kubernetes_ingress_label_(.+)'
|
||||
|
||||
# Remote write configuration for long-term storage
remote_write:
- url: 'https://prometheus-remote-write.monitoring.svc.cluster.local/api/v1/write'
  queue_config:
    capacity: 2500
    max_shards: 200
    max_samples_per_send: 1000
  write_relabel_configs:
  # Do not forward series whose metric name starts with "go_"
  - action: drop
    source_labels:
    - __name__
    regex: 'go_.*'
|
||||
|
||||
# Storage configuration
#
# Retention and WAL compression are NOT configuration-file fields; they are
# command-line flags of the prometheus binary. Leaving them here as
# "storage.tsdb.retention.time" etc. makes Prometheus reject the file as
# containing unknown fields. Pass the intended values on the command line
# (e.g. in the container args) instead:
#
#   --storage.tsdb.retention.time=15d
#   --storage.tsdb.retention.size=50GB
#   --storage.tsdb.wal-compression
|
||||
|
||||
# Feature flags
#
# There is no "feature_flags" key in the Prometheus configuration schema;
# keeping one here makes the file fail to load. Features are enabled on the
# command line of the prometheus binary:
#
#   --enable-feature=promql-at-modifier,remote-write-receiver
#
# NOTE(review): on recent Prometheus releases the @ modifier is believed to be
# always enabled and the remote-write receiver is switched on with
# --web.enable-remote-write-receiver; confirm against the deployed version.
|
||||
Reference in New Issue
Block a user