# Prometheus Recoding and Alert Rules Collection
# Copyright (C) 2017 Matous Jan Fialka, <http://mjf.cz/>
# Released under the terms of The MIT License

groups:

# Host-level CPU, swap, memory and target-availability alerts.
# Rules in this group are evaluated every 30 seconds.
- name: node_common
  interval: 30s
  rules:

  # Fires when the non-idle share of CPU time stays above 95% for 5 minutes.
  # node_cpu is a cumulative per-mode seconds counter, so it is wrapped in
  # rate(); dividing the raw counters would give the average since boot
  # instead of recent usage. Busy time is all listed modes minus idle and
  # iowait. Selector regexes are fully anchored by Prometheus, so no explicit
  # ^...$ is needed.
  # NOTE(review): the 5m rate window assumes a scrape interval well below
  # 5 minutes - confirm against the scrape configuration.
  - alert: processor_usage_too_high
    expr: |
      (
        sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)
        -
        sum(rate(node_cpu{mode=~"idle|iowait"}[5m])) by (instance, job)
      )
      /
      sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)
      * 100 > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has processor above 95% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes'
      summary: 'Processor usage above 95%'

  # Fires when used swap, (SwapTotal - SwapFree) / SwapTotal, stays above 50%
  # for one hour. The annotations state the same 50% threshold as the alert
  # name and the expression.
  - alert: swap_usage_above_50_percent
    expr: |
      (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 50
    for: 1h
    labels:
      severity: moderate
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has swap usage above 50% (current value: {{ printf "%.2f" $value }}%) for over 1 hour'
      summary: 'Swap usage above 50%'

  # Fires when used memory, MemTotal - MemFree - Cached as a share of
  # MemTotal, stays above 90% for 5 minutes. The threshold in the expression
  # matches the 90% stated by the alert name and both annotations.
  - alert: memory_usage_above_90_percent
    expr: |
      (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) * 100)) > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has memory usage above 90% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes'
      summary: 'Memory usage above 90%'

  # Availability check: fires for any series whose `up` value has been 0 for
  # one minute. The selector is not restricted to a particular job.
  - alert: node_down
    expr: |
      up == 0
    for: 1m
    annotations:
      summary: 'Node down'
      description: >-
        Instance {{ $labels.instance }} of job {{ $labels.job }}
        has been down for over 1 minute
    labels:
      severity: critical

# Trend-based (predict_linear) alerts for file descriptors and disk space,
# plus one static free-disk-space threshold alert.
# Rules in this group are evaluated every 30 seconds.
- name: node_predictions
  interval: 30s
  rules:

  # Records the ratio of open to maximum file descriptors (0 = none in use,
  # 1 = limit reached). Consumed by file_descriptors_exhausted_in_4_hours.
  # NOTE(review): process_* metrics presumably describe each scraped process,
  # not the whole host - confirm this is the intended scope.
  - record: instance:fd_utilization
    expr: |
      process_open_fds / process_max_fds

  # Extrapolates the last hour of instance:fd_utilization four hours ahead
  # (4 * 3600 seconds) and fires when the projected ratio exceeds 1, i.e. the
  # descriptor limit, for 10 minutes.
  - alert: file_descriptors_exhausted_in_4_hours
    expr: |
      predict_linear(instance:fd_utilization[1h], 4 * 3600) > 1
    for: 10m
    annotations:
      summary: 'File descriptors will be exhausted soon'
      description: >-
        Instance {{ $labels.instance }} of job {{ $labels.job }}
        will have file descriptors exhausted in 4 hours
    labels:
      severity: critical

  # Extrapolates the last hour of node_filesystem_free eight hours ahead
  # (8 * 3600 seconds) and fires when the projection drops below zero for
  # 20 minutes.
  - alert: disk_space_exhausted_in_8_hours
    expr: |
      predict_linear(node_filesystem_free[1h], 8 * 3600) < 0
    for: 20m
    annotations:
      summary: 'Disk space will be exhausted soon'
      description: >-
        Instance {{ $labels.instance }} of job {{ $labels.job }}
        will have disk space exhausted in 8 hours
    labels:
      severity: moderate

  # Static threshold: fires when node_filesystem_avail is at most 10% of
  # node_filesystem_size for 15 minutes.
  - alert: disk_space_almost_exhausted
    expr: |
      node_filesystem_avail / node_filesystem_size * 100 <= 10
    for: 15m
    annotations:
      summary: 'Disk space almost exhausted'
      description: >-
        Instance {{ $labels.instance }} of job {{ $labels.job }}
        has disk space less than 10%
        (current value: {{ printf "%.2f" $value }}%) for 15 minutes
    labels:
      severity: critical

# Generic service availability alerts based on exporter "*_up" metrics.
# Rules in this group are evaluated every 15 seconds.
- name: service_common
  interval: 15s
  rules:

  # Matches every metric whose name is a single underscore-free prefix
  # followed by "_up" (the regex [^_]+_up) and fires when its value has been
  # 0 for 3 minutes.
  - alert: service_down
    expr: |
      {__name__=~"^(?:[^_]+_up)$"} == 0
    for: 3m
    annotations:
      summary: 'Service down'
      description: >-
        Instance {{ $labels.instance }} of job {{ $labels.job }}
        has been down for over 3 minutes
    labels:
      severity: critical

# MySQL recording and alert rules built on mysql_global_* metrics.
# Rules in this group are evaluated every 30 seconds.
- name: mysql
  interval: 30s
  rules:

  # Estimated upper bound of MySQL memory use: the global buffers plus
  # max_connections times the sum of the per-connection buffers.
  # `or up * 0` supplies a zero-valued series where
  # innodb_additional_mem_pool_size is not exported, so the sum still yields
  # a result (presumably for servers that no longer have that variable).
  - record: instance:mysql_estimated_max_used_mem_size
    expr: |
      (
        mysql_global_variables_key_buffer_size
        + mysql_global_variables_query_cache_size
        + mysql_global_variables_tmp_table_size
        + mysql_global_variables_innodb_buffer_pool_size
        + (mysql_global_variables_innodb_additional_mem_pool_size or up * 0)
        + mysql_global_variables_innodb_log_buffer_size
        + (
          mysql_global_variables_max_connections
          * (
            mysql_global_variables_sort_buffer_size
            + mysql_global_variables_read_buffer_size
            + mysql_global_variables_read_rnd_buffer_size
            + mysql_global_variables_join_buffer_size
            + mysql_global_variables_thread_stack
            + mysql_global_variables_binlog_cache_size
          )
        )
      )

  # Fires when InnoDB log waits exceed 10 per second, averaged over a
  # 5-minute rate() window. There is no `for:` clause, so the alert fires on
  # the first evaluation that crosses the threshold; the description reports
  # the averaging window rather than a sustained duration.
  - alert: mysql_innodb_log_waits
    expr: |
      rate(mysql_global_status_innodb_log_waits[5m]) > 10
    labels:
      severity: critical
    annotations:
      description: 'The MySQL InnoDB logs are waiting for disk at a rate of {{ printf "%.2f" $value }} per second (averaged over the last 5 minutes)'
      summary: 'MySQL InnoDB log waits'

# NTP clock drift alerts; the same alert name is used at two severities.
# Rules in this group are evaluated every 15 seconds.
- name: ntp
  interval: 15s
  rules:

  # Critical tier: clock offset magnitude above 50 ms for one minute.
  # abs() is used so that a clock running behind the NTP reference (negative
  # offset) is caught as well as one running ahead.
  # NOTE(review): assumes node_ntp_drift_seconds is a signed offset - confirm
  # against the exporter version in use.
  - alert: ntp_drifting
    expr: |
      abs(node_ntp_drift_seconds) > 0.05
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'The NTP drifting has been too high for over 1 minute'
      summary: 'NTP drifting too high'

  # Moderate tier: clock offset magnitude above 10 ms for one minute.
  # abs() is used so that a clock running behind the NTP reference (negative
  # offset) is caught as well as one running ahead.
  # NOTE(review): assumes node_ntp_drift_seconds is a signed offset - confirm
  # against the exporter version in use.
  - alert: ntp_drifting
    expr: |
      abs(node_ntp_drift_seconds) > 0.01
    for: 1m
    labels:
      severity: moderate
    annotations:
      description: 'The NTP has been drifting for over 1 minute'
      summary: 'NTP drift'

# vi:ft=yaml:nowrap: