-
-
Save MAS150MD200/12bd497644b210d70b9da5c3b637637c to your computer and use it in GitHub Desktop.
Prometheus Recording and Alert Rules Collection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Prometheus Recording and Alert Rules Collection | |
| # Copyright (C) 2017 Matous Jan Fialka, <http://mjf.cz/> | |
| # Released under the terms of The MIT License | |
| # Fires when the non-idle share of CPU time, per (instance, job), stays above 95% for 5 minutes. | |
| # node_cpu is a cumulative seconds counter, so it is wrapped in rate(...[5m]) to measure | |
| # recent usage; summing the raw counter would yield the average since boot instead. | |
| # Busy time = all listed modes minus idle and iowait. | |
| ALERT processor_usage_too_high | |
| IF ((sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)) - (sum(rate(node_cpu{mode=~"idle|iowait"}[5m])) by (instance, job))) / (sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)) * 100 > 95 | |
| FOR 5m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} has processor above 95% (current value: {{ printf \"%.2f\" $value }}%) for over 5 minutes", summary="Processor usage above 95%"} | |
| # Fires when used swap, as a percentage of total swap, stays above 20% for 1 hour. | |
| # The threshold matches the 20% stated in the alert name, description and summary | |
| # (the expression previously compared against 50). | |
| ALERT swap_usage_above_20_percent | |
| IF (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 20 | |
| FOR 1h | |
| LABELS {severity="moderate"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} has swap usage above 20% (current value: {{ printf \"%.2f\" $value }}%) for over 1 hour", summary="Swap usage above 20%"} | |
| # Fires when used memory (total minus free minus page cache), as a percentage of total, | |
| # stays above 90% for 5 minutes. | |
| # The threshold matches the 90% stated in the alert name, description and summary | |
| # (the expression previously compared against 95). | |
| # NOTE(review): buffers (node_memory_Buffers) are counted as used here - confirm this is intended. | |
| ALERT memory_usage_above_90_percent | |
| IF (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) * 100)) > 90 | |
| FOR 5m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} has memory usage above 90% (current value: {{ printf \"%.2f\" $value }}%) for over 5 minutes", summary="Memory usage above 90%"} | |
| # Recording rule: fraction of the file-descriptor limit currently in use (open / max). | |
| instance:fd_utilization = process_open_fds / process_max_fds | |
| # Linearly extrapolates the last hour of instance:fd_utilization 14400 s (4 h) ahead and | |
| # fires when the projected utilization exceeds 1 (i.e. 100% of the limit) for 10 minutes. | |
| ALERT file_descriptors_exhausted_in_4_hours | |
| IF predict_linear(instance:fd_utilization[1h], 14400) > 1 | |
| FOR 10m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {summary="File descriptors will be exhausted soon", description="Instance {{ $labels.instance }} of job {{ $labels.job }} will have file descriptors exhausted in 4 hours"} | |
| # Linearly extrapolates the last hour of available filesystem space 8 hours ahead and | |
| # fires when the projection drops below zero for 20 minutes. | |
| # Uses node_filesystem_avail, the same metric as disk_space_almost_exhausted, so both | |
| # disk alerts measure the same quantity. | |
| # The description names the mountpoint because the expression yields one series per filesystem. | |
| ALERT disk_space_exhausted_in_8_hours | |
| IF predict_linear(node_filesystem_avail[1h], 8 * 3600) < 0 | |
| FOR 20m | |
| LABELS {severity="moderate"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} will have disk space on {{ $labels.mountpoint }} exhausted in 8 hours", summary="Disk space will be exhausted soon"} | |
| # Fires when available space on a filesystem, as a percentage of its size, stays at or | |
| # below 10% for 15 minutes. | |
| # The description names the mountpoint because the expression yields one series per filesystem. | |
| ALERT disk_space_almost_exhausted | |
| IF node_filesystem_avail / node_filesystem_size * 100 <= 10 | |
| FOR 15m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} has disk space on {{ $labels.mountpoint }} less than 10% (current value: {{ printf \"%.2f\" $value }}%) for 15 minutes", summary="Disk space almost exhausted"} | |
| # Fires when a scrape target's `up` series has been 0 for 1 minute. | |
| ALERT node_down | |
| IF up == 0 | |
| FOR 1m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {summary="Node down", description="Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for over 1 minute"} | |
| # Fires when any series whose metric name matches the regex [^_]+_up has been 0 for 3 minutes. | |
| # The pattern requires one or more non-underscore characters followed by "_up" | |
| # (for example a name shaped like "foo_up"); the plain `up` series does not match it. | |
| # NOTE(review): presumably intended to catch exporter-reported "<service>_up" gauges - confirm. | |
| ALERT service_down | |
| IF {__name__=~"[^_]+_up"} == 0 | |
| FOR 3m | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {description="Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for over 3 minutes", summary="Service down"} | |
| # Fires when the per-second rate of mysql_global_status_innodb_log_waits, computed over a | |
| # 5-minute window, exceeds 10. | |
| # NOTE(review): this rule has no FOR clause, so it fires on the first evaluation above the | |
| # threshold; the description's "for over 5 minutes" refers only to the rate window. | |
| # Confirm whether a FOR 5m was intended. | |
| ALERT mysql_innodb_log_waits | |
| IF rate(mysql_global_status_innodb_log_waits[5m]) > 10 | |
| LABELS {severity="critical"} | |
| ANNOTATIONS {description="The MySQL InnoDB logs are waiting for disk at a rate of {{ printf \"%.2f\" $value }} per second for over 5 minutes", summary="MySQL InnoDB log waits"} | |
| # Critical tier: fires when the magnitude of the NTP offset exceeds 0.05 s for 1 minute. | |
| # abs() makes the alert fire for a clock that is behind as well as one that is ahead. | |
| # NOTE(review): assumes node_ntp_drift_seconds is a signed offset - confirm for the exporter version in use. | |
| ALERT ntp_drifting | |
| IF abs(node_ntp_drift_seconds) > 0.05 | |
| FOR 1m | |
| LABELS {severity = "critical"} | |
| ANNOTATIONS {description="The NTP drifting has been too high for over 1 minute", summary="NTP drifting too high"} | |
| # Moderate tier: fires when the magnitude of the NTP offset exceeds 0.01 s for 1 minute. | |
| # abs() makes the alert fire for a clock that is behind as well as one that is ahead. | |
| # NOTE(review): assumes node_ntp_drift_seconds is a signed offset - confirm for the exporter version in use. | |
| # This tier has no upper bound, so it also fires whenever the offset is large enough for | |
| # the critical ntp_drifting alert. | |
| ALERT ntp_drifting | |
| IF abs(node_ntp_drift_seconds) > 0.01 | |
| FOR 1m | |
| LABELS {severity = "moderate"} | |
| ANNOTATIONS {description="The NTP has been drifting for over 1 minute", summary="NTP drift"} | |
| # vi:ft=prometheus:nowrap:tw=10000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment