Created
April 12, 2022 17:33
-
-
Save mattmattox/9d442945a40e7c0b06de875a6e14cf6c to your computer and use it in GitHub Desktop.
Custom PrometheusRule to page out to Slack
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# PrometheusRule custom resource (monitoring.coreos.com/v1) holding a single
# rule group, "slack.rules". The label/annotation set mirrors what the
# kube-prometheus-stack Helm release "monitoring" stamps on its own rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: monitoring-kube-prometheus-slack.rules
  namespace: monitoring
  annotations:
    meta.helm.sh/release-name: monitoring
    meta.helm.sh/release-namespace: monitoring
    prometheus-operator-validated: "true"
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/instance: monitoring
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    # Quoted so the version is unambiguously a string for every YAML parser.
    app.kubernetes.io/version: "34.9.0"
    chart: kube-prometheus-stack-34.9.0
    cluster: a-rke2-devops
    heritage: Helm
    # NOTE(review): presumably the label the Prometheus ruleSelector matches on;
    # confirm against the Prometheus custom resource.
    release: monitoring
spec:
  groups:
  - name: slack.rules
    rules:
# ---- Host memory ----
    # Available memory is below 10% of total memory, sustained for 2 minutes.
    - alert: HostOutOfMemory
      expr: >-
        node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
      for: 2m
      labels:
        severity: slack-alert
    # Per-second rate of major page faults (1m window) is above 1000.
    - alert: HostMemoryUnderMemoryPressure
      expr: >-
        rate(node_vmstat_pgmajfault[1m]) > 1000
      for: 2m
      labels:
        severity: slack-alert
# ---- Host network throughput ----
    # Receive rate summed per instance; bytes/s divided by 1024 twice gives
    # MiB/s, compared against 100.
    - alert: HostUnusualNetworkThroughputIn
      expr: >-
        sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
      for: 2m
      labels:
        severity: slack-alert
    # Same check for the transmit direction.
    - alert: HostUnusualNetworkThroughputOut
      expr: >-
        sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
      for: 2m
      labels:
        severity: slack-alert
# ---- Host disk throughput ----
    # Read rate summed per instance above 50 MiB/s, sustained for 5 minutes.
    - alert: HostUnusualDiskReadRate
      expr: >-
        sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
      for: 5m
      labels:
        severity: slack-alert
    # Write rate summed per instance above 50 MiB/s, sustained for 5 minutes.
    - alert: HostUnusualDiskWriteRate
      expr: >-
        sum by (instance) (rate(node_disk_write_bytes_total[2m])) / 1024 / 1024 > 50
      for: 5m
      labels:
        severity: slack-alert
# ---- Host filesystem capacity ----
    # Less than 10% of the filesystem is available, restricted to filesystems
    # whose node_filesystem_readonly value is 0.
    - alert: HostOutOfDiskSpace
      expr: >-
        (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
        and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: slack-alert
    # Already below 10% available AND a linear extrapolation of the last hour
    # predicts available bytes below zero 24h ahead (tmpfs excluded from the
    # prediction); read-only filesystems are excluded.
    - alert: HostDiskWillFillIn24Hours
      expr: >-
        (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
        and ON (instance, device, mountpoint)
        predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
        and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: slack-alert
    # Same as above with a 48h prediction horizon.
    - alert: HostDiskWillFillIn48Hours
      expr: >-
        (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
        and ON (instance, device, mountpoint)
        predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 48 * 3600) < 0
        and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: slack-alert
    # Fewer than 10% of inodes are free on the filesystem mounted at /rootfs.
    # NOTE(review): only series whose mountpoint label is exactly "/rootfs"
    # match; confirm node-exporter reports the host root under that path.
    - alert: HostOutOfInodes
      expr: >-
        node_filesystem_files_free{mountpoint ="/rootfs"}
        / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
        and ON (instance, device, mountpoint)
        node_filesystem_readonly{mountpoint="/rootfs"} == 0
      for: 2m
      labels:
        severity: slack-alert
# ---- Host disk latency ----
    # Read time divided by completed reads (both as 1m rates) is above 0.1,
    # evaluated only while reads are actually completing.
    - alert: HostUnusualDiskReadLatency
      expr: >-
        rate(node_disk_read_time_seconds_total[1m])
        / rate(node_disk_reads_completed_total[1m]) > 0.1
        and rate(node_disk_reads_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: slack-alert
    # Same ratio for writes.
    - alert: HostUnusualDiskWriteLatency
      expr: >-
        rate(node_disk_write_time_seconds_total[1m])
        / rate(node_disk_writes_completed_total[1m]) > 0.1
        and rate(node_disk_writes_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: slack-alert
# ---- Host CPU ----
    # 100 minus the per-instance average idle percentage (2m rate) is above 80.
    - alert: HostHighCpuLoad
      expr: >-
        100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
      for: 2m
      labels:
        severity: slack-alert
    # Per-instance average of CPU time in "steal" mode (5m rate) is above 10%.
    - alert: HostCpuStealNoisyNeighbor
      expr: >-
        avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
      for: 2m
      labels:
        severity: slack-alert
    # Context-switch rate divided by the number of idle-mode CPU series (one
    # per CPU once cpu/mode are aggregated away) is above 1000.
    - alert: HostContextSwitching
      expr: >-
        (rate(node_context_switches_total[5m]))
        / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
      for: 2m
      labels:
        severity: slack-alert
# ---- Node readiness ----
    # Fires for nodes whose Ready condition is not "true" for 10 minutes.
    # kube_node_status_condition exposes one series per (condition, status)
    # pair, valued 1 for the status currently in effect and 0 otherwise, so
    # `status="true"} == 0` selects nodes that are NOT Ready. The previous
    # selector (`status="false"} == 0`) was inverted: it matched every node
    # that was Ready and stayed silent for nodes that were actually down.
    - alert: KubernetesNodeNotReady
      expr: >-
        kube_node_status_condition{condition="Ready",status="true"} == 0
      for: 10m
      labels:
        severity: slack-alert
# ---- Node schedulability ----
    # Fires when a node has been marked unschedulable (cordoned) for 10 minutes.
    # "Schedulable" is not one of the standard node conditions, so the former
    # kube_node_status_condition{condition="Schedulable",...} selector returned
    # no series and the alert could never fire. kube-state-metrics reports the
    # cordon flag as kube_node_spec_unschedulable (1 = unschedulable).
    - alert: KubernetesNodeNotSchedulable
      expr: >-
        kube_node_spec_unschedulable == 1
      for: 10m
      labels:
        severity: slack-alert
# ---- Node pressure conditions ----
    # Each rule fires when the named node condition has status "true" for 10m.
    - alert: KubernetesNodeMemoryPressure
      expr: >-
        kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeDiskPressure
      expr: >-
        kube_node_status_condition{condition="DiskPressure",status="true"} == 1
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeNetworkUnavailable
      expr: >-
        kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
      for: 10m
      labels:
        severity: slack-alert
    # NOTE(review): OutOfDisk is not among the node conditions reported by
    # current Kubernetes releases; confirm this selector still returns series.
    - alert: KubernetesNodeOutOfDisk
      expr: >-
        kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
      for: 10m
      labels:
        severity: slack-alert
# ---- Node pod capacity ----
    # Running pods per node as a percentage of the node's allocatable pod count
    # is above 90. The `0 * kube_pod_info` term only contributes the `node`
    # label (joined on uid); it adds zero to the value.
    - alert: KubernetesOutOfCapacity
      expr: >-
        sum by (node) ((kube_pod_status_phase{phase="Running"} == 1)
        + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""}))
        / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
      for: 10m
      labels:
        severity: slack-alert
# ---- Container OOM kills ----
    # Fires when a container restarted at least once in the last 10 minutes and
    # its last termination reason during that window was OOMKilled.
    # The previous expression queried kube_container_status_last_seen_seconds_ago,
    # a metric kube-state-metrics does not export, so the alert could never fire.
    # `for` is 0m because the expression already looks back over a 10m window;
    # adding a 10m hold on top would suppress all but continuous OOM loops.
    - alert: KubernetesContainerOomKiller
      expr: >-
        (kube_pod_container_status_restarts_total -
        kube_pod_container_status_restarts_total offset 10m >= 1)
        and ignoring (reason)
        min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
        == 1
      for: 0m
      labels:
        severity: slack-alert
# ---- PersistentVolumeClaim phase ----
    # kube_persistentvolumeclaim_info carries no `status` label, so the previous
    # `{status="..."}` selectors matched nothing. The claim phase is exported as
    # kube_persistentvolumeclaim_status_phase: one series per phase, valued 1
    # for the phase the claim is currently in.
    - alert: KubernetesPersistentVolumeClaimPending
      expr: >-
        kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPersistentVolumeClaimLost
      expr: >-
        kube_persistentvolumeclaim_status_phase{phase="Lost"} == 1
      for: 10m
      labels:
        severity: slack-alert
    # NOTE(review): the documented claim phases are Pending, Bound and Lost.
    # "Failed" is kept for parity with the original rule set but is expected to
    # stay silent; confirm whether this rule is still wanted.
    - alert: KubernetesPersistentVolumeClaimFailed
      expr: >-
        kube_persistentvolumeclaim_status_phase{phase="Failed"} == 1
      for: 10m
      labels:
        severity: slack-alert
# ---- PersistentVolume phase ----
    # kube_persistentvolume_info carries no `status` label, so the previous
    # `{status="..."}` selectors matched nothing. The volume phase is exported
    # as kube_persistentvolume_status_phase (1 for the current phase).
    #
    # The info-based KubernetesVolumeOutOfDiskSpace rule that used to open this
    # section (status="OutOfDisk") has been dropped: it could never match, and
    # it repeated the alert name and labels of the kubelet_volume_stats-based
    # KubernetesVolumeOutOfDiskSpace rule later in this group, which is the one
    # that measures free space. promtool reports such pairs as duplicate rules.
    - alert: KubernetesVolumeFailed
      expr: >-
        kube_persistentvolume_status_phase{phase="Failed"} == 1
      for: 10m
      labels:
        severity: slack-alert
    # NOTE(review): the documented volume phases are Pending, Available, Bound,
    # Released and Failed. "Lost" is kept for parity with the original rule set
    # but is expected to stay silent; confirm whether this rule is still wanted.
    - alert: KubernetesVolumeLost
      expr: >-
        kube_persistentvolume_status_phase{phase="Lost"} == 1
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumePending
      expr: >-
        kube_persistentvolume_status_phase{phase="Pending"} == 1
      for: 10m
      labels:
        severity: slack-alert
# ---- Volume free space (kubelet stats) ----
    # Available bytes are below 10% of the volume's capacity for 2 minutes.
    - alert: KubernetesVolumeOutOfDiskSpace
      expr: >-
        kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
      for: 2m
      labels:
        severity: slack-alert
# ---- StatefulSet availability ----
    # kube_statefulset_status_replicas has no `status` label, so the previous
    # `{status="Failed"}` / `{status="Pending"}` selectors matched nothing.
    #
    # Down: the desired replica count differs from the ready replica count
    # (and the desired count is non-zero) for 10 minutes.
    - alert: KubernetesStatefulSetDown
      expr: >-
        kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0
      for: 10m
      labels:
        severity: slack-alert
    # Pending: pods have been created for the StatefulSet but not all of them
    # are ready yet.
    # NOTE(review): "pending" is interpreted here as created-but-not-ready
    # replicas; confirm this matches the intent of the original rule.
    - alert: KubernetesStatefulSetPending
      expr: >-
        kube_statefulset_status_replicas - kube_statefulset_status_replicas_ready > 0
      for: 10m
      labels:
        severity: slack-alert
# ---- Pod phase ----
    # Over the last 15 minutes (sampled every minute) the pod was continuously
    # in one of the Pending / Unknown / Failed phases.
    - alert: KubernetesPodNotHealthy
      expr: >-
        min_over_time(sum by (namespace, pod)
        (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
      for: 10m
      labels:
        severity: slack-alert
# ---- Pod restarts (containers named kube-*) ----
    # Crash-looping: a matching container is currently waiting with reason
    # CrashLoopBackOff. The previous expression filtered
    # kube_pod_container_status_restarts_total by a `reason` label that metric
    # does not have, so it matched nothing.
    - alert: KubernetesPodCrashLooping
      expr: >-
        max by (namespace, pod)
        (kube_pod_container_status_waiting_reason{container=~"kube-.*",reason="CrashLoopBackOff"})
        > 0
      for: 10m
      labels:
        severity: slack-alert
    # Restarting: the restart counter increased during the last 15 minutes.
    # The previous expression tested the raw counter value with min_over_time,
    # which remains above zero forever after a container's first restart, so the
    # alert would never resolve.
    - alert: KubernetesPodRestarting
      expr: >-
        sum by (namespace, pod)
        (increase(kube_pod_container_status_restarts_total{container=~"kube-.*"}[15m])) > 0
      for: 10m
      labels:
        severity: slack-alert
# ---- ReplicationController / Deployment replica counts ----
    # kube-state-metrics exports no `*_status_replicas_current` series for
    # ReplicationControllers or Deployments, so the previous comparisons had an
    # empty right-hand side and could never fire. Each rule now compares the
    # desired (spec) replica count with the ready/available count.
    - alert: KubernetesReplicasSetMismatch
      expr: >-
        kube_replicationcontroller_spec_replicas
        != kube_replicationcontroller_status_ready_replicas
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDeploymentReplicasMismatch
      expr: >-
        kube_deployment_spec_replicas != kube_deployment_status_replicas_available
      for: 10m
      labels:
        severity: slack-alert
    # Mismatch AND nothing available at all.
    - alert: KubernetesDeploymentFailed
      expr: >-
        kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        and kube_deployment_status_replicas_available == 0
      for: 10m
      labels:
        severity: slack-alert
# ---- StatefulSet replica counts ----
    # Replica count differs from the count of replicas at the current revision.
    - alert: KubernetesStatefulSetReplicasMismatch
      expr: >-
        kube_statefulset_status_replicas != kube_statefulset_status_replicas_current
      for: 10m
      labels:
        severity: slack-alert
    # Same mismatch, and no replica is available.
    - alert: KubernetesStatefulSetFailed
      expr: >-
        kube_statefulset_status_replicas != kube_statefulset_status_replicas_current
        and kube_statefulset_status_replicas_available == 0
      for: 10m
      labels:
        severity: slack-alert
# ---- DaemonSet pod counts ----
    # kube-state-metrics exports no kube_daemonset_status_replicas* series;
    # DaemonSets report desired/available pod counts instead, so the previous
    # expressions could never fire.
    - alert: KubernetesDaemonSetReplicasMismatch
      expr: >-
        kube_daemonset_status_desired_number_scheduled
        != kube_daemonset_status_number_available
      for: 10m
      labels:
        severity: slack-alert
    # Mismatch AND no pod of the DaemonSet is available.
    - alert: KubernetesDaemonSetFailed
      expr: >-
        kube_daemonset_status_desired_number_scheduled
        != kube_daemonset_status_number_available
        and kube_daemonset_status_number_available == 0
      for: 10m
      labels:
        severity: slack-alert
# ---- Job failures ----
    # A Job reports at least one failed pod for 10 minutes.
    - alert: KubernetesJobFailed
      expr: >-
        kube_job_status_failed > 0
      for: 10m
      labels:
        severity: slack-alert
# ---- Deployment generation ----
    # The generation observed by the deployment controller differs from the
    # generation in the object's metadata for 10 minutes, i.e. the latest spec
    # change has not been picked up. The previous right-hand side,
    # kube_deployment_status_replicas_current_generation, is not a metric
    # kube-state-metrics exports, so the comparison never produced a result.
    - alert: KubernetesDeploymentGenerationMismatch
      expr: >-
        kube_deployment_status_observed_generation != kube_deployment_metadata_generation
      for: 10m
      labels:
        severity: slack-alert
# ---- Rollout progress ----
    # Current revision has no matching update revision series (`unless`), and
    # the desired replica count differs from the updated replica count.
    - alert: KubernetesStatefulSetUpdateNotRolledOut
      expr: >-
        max without (revision)
        (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision)
        * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
      for: 10m
      labels:
        severity: slack-alert
    # Fewer than 100% of desired DaemonSet pods are ready, or fewer pods are
    # currently scheduled than desired.
    - alert: KubernetesDaemonSetRolloutStuck
      expr: >-
        kube_daemonset_status_number_ready
        / kube_daemonset_status_desired_number_scheduled * 100 < 100
        or kube_daemonset_status_desired_number_scheduled -
        kube_daemonset_status_current_number_scheduled > 0
      for: 10m
      labels:
        severity: slack-alert
    # At least one DaemonSet pod is reported as misscheduled.
    - alert: KubernetesDaemonSetMisscheduled
      expr: >-
        kube_daemonset_status_number_misscheduled > 0
      for: 1m
      labels:
        severity: slack-alert
# ---- CronJob / Job progress ----
    # The CronJob's next scheduled time lies more than 3600 seconds in the past.
    - alert: KubernetesCronJobTooLong
      expr: >-
        time() - kube_cronjob_next_schedule_time > 3600
      for: 1m
      labels:
        severity: slack-alert
    # Required completions still exceed succeeded pods after 12 hours.
    - alert: KubernetesJobSlowCompletion
      expr: >-
        kube_job_spec_completions - kube_job_status_succeeded > 0
      for: 12h
      labels:
        severity: slack-alert
# ---- API error ratios ----
    # Share of apiserver requests answered with a 5xx code (1m rate) above 3%.
    - alert: KubernetesApiServerErrors
      expr: >-
        sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m]))
        / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
      for: 2m
      labels:
        severity: slack-alert
    # Per (instance, job): share of REST client requests answered with a 4xx
    # or 5xx code (1m rate) above 1%.
    - alert: KubernetesApiClientErrors
      expr: >-
        (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job)
        / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
      for: 2m
      labels:
        severity: slack-alert
# ---- API server client certificate expiry ----
    # Each rule fires when the 1st percentile of the remaining lifetime of
    # client certificates seen by the apiserver drops below a threshold.
    #
    # The right-hand side is aggregated `by (job, le)` and, after
    # histogram_quantile, carries only the `job` label, while the left-hand
    # `_count` series keep all their labels. A bare `and` requires identical
    # label sets, so the original expressions never matched; `on (job)`
    # restricts the match to the one label both sides share.
    # No `for:` is set, so each alert fires on the first matching evaluation.

    # Threshold: 7 days.
    - alert: KubernetesClientCertificateExpiresNextWeek
      expr: >-
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
        and on (job)
        histogram_quantile(0.01, sum by (job, le)
        (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
        < 7*24*60*60
      labels:
        severity: slack-alert
    # Threshold: 24 hours.
    - alert: KubernetesClientCertificateExpiresSoon
      expr: >-
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
        and on (job)
        histogram_quantile(0.01, sum by (job, le)
        (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
        < 24*60*60
      labels:
        severity: slack-alert
    # Threshold: 0 seconds (already expired).
    - alert: KubernetesClientCertificateExpired
      expr: >-
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
        and on (job)
        histogram_quantile(0.01, sum by (job, le)
        (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
        < 0
      labels:
        severity: slack-alert
# ---- API server latency ----
    # Fires when the 99th percentile of apiserver request duration (5m rate)
    # exceeds 0.5 seconds.
    #
    # The quantile is aggregated `by (job, le)` and therefore carries only the
    # `job` label, while the `_sum` series on the left keep all their labels.
    # A bare `and` requires identical label sets and never matched; `on (job)`
    # restricts the match to the shared label.
    # NOTE(review): the alert name contains a typo ("ApiSserver"). It is kept
    # unchanged because silences or routes may reference it; rename deliberately.
    - alert: KubernetesApiSserverLatency
      expr: >-
        apiserver_request_duration_seconds_sum{job="apiserver"} > 0
        and on (job)
        histogram_quantile(0.99, sum by (job, le)
        (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])))
        > 0.5
      labels:
        severity: slack-alert
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment