apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations:
    meta.helm.sh/release-name: monitoring
    meta.helm.sh/release-namespace: monitoring
    prometheus-operator-validated: "true"
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/instance: monitoring
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: 34.9.0
    chart: kube-prometheus-stack-34.9.0
    cluster: a-rke2-devops
    heritage: Helm
    release: monitoring
  name: monitoring-kube-prometheus-slack.rules
  namespace: monitoring
spec:
  groups:
  - name: slack.rules
    rules:
    - alert: HostOutOfMemory
      expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostMemoryUnderMemoryPressure
      expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostUnusualNetworkThroughputIn
      expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostUnusualNetworkThroughputOut
      expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostUnusualDiskReadRate
      expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
      for: 5m
      labels:
        severity: slack-alert
    - alert: HostUnusualDiskWriteRate
      expr: 'sum by (instance) (rate(node_disk_write_bytes_total[2m])) / 1024 / 1024 > 50'
      for: 5m
      labels:
        severity: slack-alert
    - alert: HostOutOfDiskSpace
      expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostDiskWillFillIn24Hours
      expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostDiskWillFillIn48Hours
      expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 48 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostOutOfInodes
      expr: 'node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostUnusualDiskReadLatency
      expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostUnusualDiskWriteLatency
      expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostHighCpuLoad
      expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostCpuStealNoisyNeighbor
      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 2m
      labels:
        severity: slack-alert
    - alert: HostContextSwitching
      expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000'
      for: 2m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeNotReady
      expr: 'kube_node_status_condition{condition="Ready",status="false"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeNotSchedulable
      expr: 'kube_node_status_condition{condition="Schedulable",status="false"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeMemoryPressure
      expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeDiskPressure
      expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeNetworkUnavailable
      expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesNodeOutOfDisk
      expr: 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesOutOfCapacity
      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesContainerOomKiller
      expr: 'kube_container_status_last_seen_seconds_ago{state="OOMKilled",container=~"/kubelet.*"} > 10'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPersistentVolumeClaimPending
      expr: 'kube_persistentvolumeclaim_info{status="Pending"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPersistentVolumeClaimLost
      expr: 'kube_persistentvolumeclaim_info{status="Lost"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPersistentVolumeClaimFailed
      expr: 'kube_persistentvolumeclaim_info{status="Failed"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumeOutOfDiskSpace
      expr: 'kube_persistentvolume_info{status="OutOfDisk"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumeFailed
      expr: 'kube_persistentvolume_info{status="Failed"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumeLost
      expr: 'kube_persistentvolume_info{status="Lost"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumePending
      expr: 'kube_persistentvolume_info{status="Pending"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesVolumeOutOfDiskSpace
      expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
      for: 2m
      labels:
        severity: slack-alert
    - alert: KubernetesStatefulSetDown
      expr: 'kube_statefulset_status_replicas{status="Failed"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesStatefulSetPending
      expr: 'kube_statefulset_status_replicas{status="Pending"} > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPodNotHealthy
      expr: 'min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPodCrashLooping
      expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*",reason="CrashLooping"})[15m:1m]) > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesPodRestarting
      expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*"})[15m:1m]) > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesReplicasSetMismatch
      expr: 'kube_replicationcontroller_status_replicas != kube_replicationcontroller_status_replicas_current'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDeploymentReplicasMismatch
      expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDeploymentFailed
      expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current and kube_deployment_status_replicas_available == 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesStatefulSetReplicasMismatch
      expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesStatefulSetFailed
      expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current and kube_statefulset_status_replicas_available == 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDaemonSetReplicasMismatch
      expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDaemonSetFailed
      expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current and kube_daemonset_status_replicas_available == 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesJobFailed
      expr: 'kube_job_status_failed > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDeploymentGenerationMismatch
      expr: 'kube_deployment_status_observed_generation != kube_deployment_status_replicas_current_generation'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesStatefulSetUpdateNotRolledOut
      expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDaemonSetRolloutStuck
      expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
      for: 10m
      labels:
        severity: slack-alert
    - alert: KubernetesDaemonSetMisscheduled
      expr: 'kube_daemonset_status_number_misscheduled > 0'
      for: 1m
      labels:
        severity: slack-alert
    - alert: KubernetesCronJobTooLong
      expr: 'time() - kube_cronjob_next_schedule_time > 3600'
      for: 1m
      labels:
        severity: slack-alert
    - alert: KubernetesJobSlowCompletion
      expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0'
      for: 12h
      labels:
        severity: slack-alert
    - alert: KubernetesApiServerErrors
      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
      for: 2m
      labels:
        severity: slack-alert
    - alert: KubernetesApiClientErrors
      expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
      for: 2m
      labels:
        severity: slack-alert
    - alert: KubernetesClientCertificateExpiresNextWeek
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
      labels:
        severity: slack-alert
    - alert: KubernetesClientCertificateExpiresSoon
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
      labels:
        severity: slack-alert
    - alert: KubernetesClientCertificateExpired
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 0'
      labels:
        severity: slack-alert
    - alert: KubernetesApiServerLatency
      expr: 'apiserver_request_duration_seconds_sum{job="apiserver"} > 0 and histogram_quantile(0.99, sum by (job, le) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) > 0.5'
      labels:
        severity: slack-alert
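# The rules above only attach the label `severity: slack-alert`; getting those
# alerts into Slack still requires a matching Alertmanager route and receiver.
# Below is a minimal sketch (not part of the original gist) using the
# prometheus-operator AlertmanagerConfig CRD. The `slack-webhook` Secret
# (key `url`) and the `#alerts` channel are assumptions; rename them to match
# your environment, and make sure the Secret exists in the same namespace.
---
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: slack-alert-routing
  namespace: monitoring
  labels:
    release: monitoring
spec:
  route:
    receiver: slack
    groupBy:
      - alertname
    matchers:
      # Route only the alerts labeled by the PrometheusRule above.
      - name: severity
        value: slack-alert
        matchType: "="
  receivers:
    - name: slack
      slackConfigs:
        - channel: "#alerts"      # assumed channel name
          sendResolved: true
          apiURL:                 # Secret holding the Slack incoming-webhook URL
            name: slack-webhook
            key: url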