Created
August 5, 2020 09:17
-
-
Save yangchuansheng/4310ae9f41513899dc5f0176cdf804b1 to your computer and use it in GitHub Desktop.
prometheus-rules-system.yaml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Prometheus Operator PrometheusRule manifest holding recording and alerting rule groups for Kubernetes system components. | |
| apiVersion: monitoring.coreos.com/v1 | |
| kind: PrometheusRule | |
| metadata: | |
| labels: | |
| # NOTE(review): these labels are presumably what a Prometheus resource's ruleSelector matches on - confirm against the Prometheus CR. | |
| prometheus: system | |
| role: alert-rules | |
| name: prometheus-system-rules | |
| namespace: monitoring | |
| spec: | |
| groups: | |
| # Recording rules derived from kube-apiserver request metrics (job="apiserver"). | |
| - name: kube-apiserver.rules | |
| rules: | |
| # Read burn rate over a 1d window, recorded with label verb=read: | |
| # (LIST/GET requests slower than their per-scope latency bucket + LIST/GET requests answered with a 5xx code) / all LIST/GET requests. | |
| # Latency buckets used: le="0.1" for scope "resource" or empty scope, le="0.5" for scope "namespace", le="5" for scope "cluster". | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate1d | |
| # Same read burn-rate expression over a 1h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate1h | |
| # Same read burn-rate expression over a 2h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate2h | |
| # Same read burn-rate expression over a 30m window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate30m | |
| # Same read burn-rate expression over a 3d window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate3d | |
| # Same read burn-rate expression over a 5m window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate5m | |
| # Same read burn-rate expression over a 6h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) | |
| - | |
| ( | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) | |
| ) | |
| ) | |
| + | |
| # errors | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) | |
| labels: | |
| verb: read | |
| record: apiserver_request:burnrate6h | |
| # Write burn rate over a 1d window, recorded with label verb=write: | |
| # (POST/PUT/PATCH/DELETE requests not in the le="1" latency bucket + such requests answered with a 5xx code) / all such requests. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate1d | |
| # Same write burn-rate expression over a 1h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate1h | |
| # Same write burn-rate expression over a 2h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate2h | |
| # Same write burn-rate expression over a 30m window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate30m | |
| # Same write burn-rate expression over a 3d window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate3d | |
| # Same write burn-rate expression over a 5m window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate5m | |
| # Same write burn-rate expression over a 6h window. | |
| - expr: | | |
| ( | |
| ( | |
| # too slow | |
| sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
| - | |
| sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) | |
| ) | |
| + | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) | |
| ) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
| labels: | |
| verb: write | |
| record: apiserver_request:burnrate6h | |
| # Per-code, per-resource 5m request rate for LIST/GET requests, recorded with verb=read. | |
| - expr: | | |
| sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
| labels: | |
| verb: read | |
| record: code_resource:apiserver_request_total:rate5m | |
| # Per-code, per-resource 5m request rate for POST/PUT/PATCH/DELETE requests, recorded with verb=write. | |
| - expr: | | |
| sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
| labels: | |
| verb: write | |
| record: code_resource:apiserver_request_total:rate5m | |
| # 0.99 latency quantile per resource for LIST/GET requests; the trailing "> 0" drops non-positive results. | |
| - expr: | | |
| histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 | |
| labels: | |
| quantile: "0.99" | |
| verb: read | |
| record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
| # 0.99 latency quantile per resource for POST/PUT/PATCH/DELETE requests; the trailing "> 0" drops non-positive results. | |
| - expr: | | |
| histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 | |
| labels: | |
| quantile: "0.99" | |
| verb: write | |
| record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
| # Mean request duration over 5m (rate of _sum / rate of _count), aggregated over everything except instance and pod. | |
| # Excludes subresource "log" and the verbs LIST, WATCH, WATCHLIST, DELETECOLLECTION, PROXY and CONNECT. | |
| # NOTE(review): unlike the quantile rules that follow, these selectors carry no job="apiserver" matcher - confirm that is intended. | |
| - expr: | | |
| sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) | |
| / | |
| sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) | |
| record: cluster:apiserver_request_duration_seconds:mean5m | |
| # 0.99 / 0.9 / 0.5 latency quantiles over 5m with the same subresource and verb exclusions, aggregated without instance and pod. | |
| - expr: | | |
| histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.99" | |
| record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.9" | |
| record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.5" | |
| record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
| # 30-day availability recording rules for the API server; this group is evaluated every 3 minutes. | |
| - interval: 3m | |
| name: kube-apiserver-availability.rules | |
| rules: | |
| # Overall availability over 30d, recorded with verb=all: | |
| # 1 - (slow writes + slow reads + 5xx responses) / all requests. | |
| # The duration selectors are restricted to job="apiserver" so the numerator covers the same series as the | |
| # code:apiserver_request_total:increase30d denominator, which is built from job="apiserver" request counts. | |
| - expr: | | |
| 1 - ( | |
| ( | |
| # write too slow | |
| sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d])) | |
| - | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) | |
| ) + | |
| ( | |
| # read too slow | |
| sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) | |
| - | |
| ( | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) | |
| ) | |
| ) + | |
| # errors | |
| sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) | |
| ) | |
| / | |
| sum(code:apiserver_request_total:increase30d) | |
| labels: | |
| verb: all | |
| record: apiserver_request:availability30d | |
| # Read availability over 30d, recorded with verb=read: | |
| # 1 - (LIST/GET requests outside their per-scope latency bucket + read 5xx responses) / all read requests. | |
| # "or vector(0)" keeps the error term defined when no 5xx series exist. | |
| - expr: | | |
| 1 - ( | |
| sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) | |
| - | |
| ( | |
| # too slow | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) | |
| ) | |
| + | |
| # errors | |
| sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) | |
| ) | |
| / | |
| sum(code:apiserver_request_total:increase30d{verb="read"}) | |
| labels: | |
| verb: read | |
| record: apiserver_request:availability30d | |
| # Write availability over 30d, recorded with verb=write: | |
| # 1 - (POST/PUT/PATCH/DELETE requests outside the le="1" bucket + write 5xx responses) / all write requests. | |
| # The duration selectors are restricted to job="apiserver" so the numerator covers the same series as the | |
| # code:apiserver_request_total:increase30d denominator, which is built from job="apiserver" request counts. | |
| - expr: | | |
| 1 - ( | |
| ( | |
| # too slow | |
| sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d])) | |
| - | |
| sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) | |
| ) | |
| + | |
| # errors | |
| sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) | |
| ) | |
| / | |
| sum(code:apiserver_request_total:increase30d{verb="write"}) | |
| labels: | |
| verb: write | |
| record: apiserver_request:availability30d | |
| # 30-day request-count increases per (code, verb), one rule for each verb (LIST, GET, POST, PUT, PATCH, DELETE) | |
| # and each status-code class (2xx, 3xx, 4xx, 5xx). All rules record into the same series name. | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| - expr: | | |
| sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) | |
| record: code_verb:apiserver_request_total:increase30d | |
| # Roll the per-verb series up per status code for LIST/GET, recorded with verb=read. | |
| - expr: | | |
| sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) | |
| labels: | |
| verb: read | |
| record: code:apiserver_request_total:increase30d | |
| # Roll the per-verb series up per status code for POST/PUT/PATCH/DELETE, recorded with verb=write. | |
| - expr: | | |
| sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) | |
| labels: | |
| verb: write | |
| record: code:apiserver_request_total:increase30d | |
| # Scheduler latency quantiles (0.99, 0.9, 0.5) over 5m for three histograms from job="kube-scheduler": | |
| # e2e scheduling duration, scheduling algorithm duration, and binding duration. | |
| # Each rule aggregates the bucket rates without the instance and pod labels and records a "quantile" label. | |
| - name: kube-scheduler.rules | |
| rules: | |
| - expr: | | |
| histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.99" | |
| record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.99" | |
| record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.99" | |
| record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.9" | |
| record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.9" | |
| record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.9" | |
| record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.5" | |
| record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.5" | |
| record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
| labels: | |
| quantile: "0.5" | |
| record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
| # Kubelet PLEG relist duration quantiles (0.99, 0.9, 0.5) over 5m, per instance. | |
| # The bucket rates are joined on "instance" with kubelet_node_name via group_left(node) so the result carries a "node" label. | |
| # NOTE(review): the multiplication presumably relies on kubelet_node_name having the value 1 - confirm. | |
| - name: kubelet.rules | |
| rules: | |
| - expr: | | |
| histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
| labels: | |
| quantile: "0.99" | |
| record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
| labels: | |
| quantile: "0.9" | |
| record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile | |
| - expr: | | |
| histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
| labels: | |
| quantile: "0.5" | |
| record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile | |
| # Alerts on persistent volume capacity and status. | |
| - name: kubernetes-storage | |
| rules: | |
| # Critical: available/capacity ratio of a volume is below 0.03 (3% free) for 1 minute. | |
| - alert: KubePersistentVolumeFillingUp | |
| annotations: | |
| message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
| }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage | |
| }} free. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup | |
| expr: | | |
| kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} | |
| / | |
| kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} | |
| < 0.03 | |
| for: 1m | |
| labels: | |
| severity: critical | |
| # Warning (same alert name as above): less than 15% free AND a linear extrapolation of the last 6h of | |
| # available bytes goes below zero within 4 * 24 * 3600 seconds (four days); must hold for 1 hour. | |
| - alert: KubePersistentVolumeFillingUp | |
| annotations: | |
| message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
| }} in Namespace {{ $labels.namespace }} is expected to fill up within four | |
| days. Currently {{ $value | humanizePercentage }} is available. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup | |
| expr: | | |
| ( | |
| kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} | |
| / | |
| kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} | |
| ) < 0.15 | |
| and | |
| predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 | |
| for: 1h | |
| labels: | |
| severity: warning | |
| # Critical: a persistent volume reports phase Failed or Pending for 5 minutes. | |
| - alert: KubePersistentVolumeErrors | |
| annotations: | |
| message: The persistent volume {{ $labels.persistentvolume }} has status {{ | |
| $labels.phase }}. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors | |
| expr: | | |
| kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 | |
| for: 5m | |
| labels: | |
| severity: critical | |
| # Cluster-wide system alerts. | |
| - name: kubernetes-system | |
| rules: | |
| # Warning: more than one distinct gitVersion (rewritten by label_replace to its leading "vX.Y.Z"-style prefix) | |
| # is reported by kubernetes_build_info for 15 minutes; jobs kube-dns and coredns are excluded. | |
| # NOTE(review): the dots in the regex are unescaped, so they match any character, not only a literal "." - confirm this is acceptable. | |
| - alert: KubeVersionMismatch | |
| annotations: | |
| message: There are {{ $value }} different semantic versions of Kubernetes | |
| components running. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch | |
| expr: | | |
| count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 | |
| for: 15m | |
| labels: | |
| severity: warning | |
| # Warning: for a given (instance, job), more than 1% of REST client requests to the API server | |
| # returned a 5xx code over the last 5 minutes, sustained for 15 minutes. | |
| - alert: KubeClientErrors | |
| annotations: | |
| message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance | |
| }}' is experiencing {{ $value | humanizePercentage }} errors. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors | |
| expr: | | |
| (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) | |
| / | |
| sum(rate(rest_client_requests_total[5m])) by (instance, job)) | |
| > 0.01 | |
| for: 15m | |
| labels: | |
| severity: warning | |
| # Multi-window burn-rate alerts built on the apiserver_request:burnrate<window> recording rules. | |
| # Each alert requires both a long-window and a short-window burn rate, summed over the recorded series, | |
| # to exceed factor * 0.01; the windows are also exposed as "long" and "short" labels. | |
| # NOTE(review): 0.01 is presumably the error budget of a 99% availability target - confirm. | |
| - name: kube-apiserver-slos | |
| rules: | |
| # Critical: factor 14.40 over 1h and 5m, held for 2 minutes. | |
| - alert: KubeAPIErrorBudgetBurn | |
| annotations: | |
| message: The API server is burning too much error budget | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn | |
| expr: | | |
| sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) | |
| and | |
| sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) | |
| for: 2m | |
| labels: | |
| long: 1h | |
| severity: critical | |
| short: 5m | |
| # Critical: factor 6.00 over 6h and 30m, held for 15 minutes. | |
| - alert: KubeAPIErrorBudgetBurn | |
| annotations: | |
| message: The API server is burning too much error budget | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn | |
| expr: | | |
| sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) | |
| and | |
| sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) | |
| for: 15m | |
| labels: | |
| long: 6h | |
| severity: critical | |
| short: 30m | |
| # Warning: factor 3.00 over 1d and 2h, held for 1 hour. | |
| - alert: KubeAPIErrorBudgetBurn | |
| annotations: | |
| message: The API server is burning too much error budget | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn | |
| expr: | | |
| sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) | |
| and | |
| sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) | |
| for: 1h | |
| labels: | |
| long: 1d | |
| severity: warning | |
| short: 2h | |
| # Warning: factor 1.00 over 3d and 6h, held for 3 hours. | |
| - alert: KubeAPIErrorBudgetBurn | |
| annotations: | |
| message: The API server is burning too much error budget | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn | |
| expr: | | |
| sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) | |
| and | |
| sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) | |
| for: 3h | |
| labels: | |
| long: 3d | |
| severity: warning | |
| short: 6h | |
| # Alerts on API server latency, errors, certificates and availability. | |
| - name: kubernetes-system-apiserver | |
| rules: | |
| # Warning: the recorded 0.99 latency quantile exceeds 1 (seconds, per the message), and for the same | |
| # (verb, resource) the 5m mean latency is above both (per-verb average + 2 * per-verb stddev) and | |
| # 1.2 * the per-verb average of the mean; must hold for 5 minutes. | |
| # The ">= 0" filters inside avg/stddev drop negative samples (and NaN) before aggregating. | |
| - alert: KubeAPILatencyHigh | |
| annotations: | |
| message: The API server has an abnormal latency of {{ $value }} seconds for | |
| {{ $labels.verb }} {{ $labels.resource }}. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh | |
| expr: | | |
| cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} | |
| > | |
| 1 | |
| and on (verb,resource) | |
| ( | |
| cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} | |
| > | |
| on (verb) group_left() | |
| ( | |
| avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) | |
| + | |
| 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) | |
| ) | |
| ) > on (verb) group_left() | |
| 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) | |
| for: 5m | |
| labels: | |
| severity: warning | |
| # Warning: for a given (resource, subresource, verb), more than 5% of API server requests | |
| # returned a 5xx code over the last 5 minutes, sustained for 10 minutes. | |
| - alert: KubeAPIErrorsHigh | |
| annotations: | |
| message: API server is returning errors for {{ $value | humanizePercentage | |
| }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource | |
| }}. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh | |
| expr: | | |
| sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb) | |
| / | |
| sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05 | |
| for: 10m | |
| labels: | |
| severity: warning | |
| # Warning: the 0.01 quantile of client certificate expiration times seen by the API server over 5m | |
| # is below 604800 seconds (7 days), and at least one observation exists. No "for" clause is set. | |
| - alert: KubeClientCertificateExpiration | |
| annotations: | |
| message: A client certificate used to authenticate to the apiserver is expiring | |
| in less than 7.0 days. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration | |
| expr: | | |
| apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 | |
| labels: | |
| severity: warning | |
| # Critical: same check with a threshold of 86400 seconds (24 hours). | |
| - alert: KubeClientCertificateExpiration | |
| annotations: | |
| message: A client certificate used to authenticate to the apiserver is expiring | |
| in less than 24.0 hours. | |
| runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration | |
| expr: | | |
| apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 | |
| labels: | |
| severity: critical | |
# Warns when the aggregator_unavailable_apiservice_count counter, summed per
# (name, namespace), has increased by more than 2 within the last 5 minutes.
# There is no `for:` clause, so the alert fires on the first matching evaluation.
- alert: AggregatedAPIErrors
  expr: |
    sum by (name, namespace) (
      increase(aggregator_unavailable_apiservice_count[5m])
    ) > 2
  labels:
    severity: warning
  annotations:
    message: >-
      An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
      reported errors. The number of errors have increased for it in the past
      five minutes. High values indicate that the availability of the service
      changes too often.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
# Warns when an aggregated API has been less than 85% available over the last
# 10 minutes, sustained for 5 minutes.
#
# aggregator_unavailable_apiservice is averaged over the window, so the inner
# expression is the fraction of samples in which the APIService was reported
# unavailable; `max by(name, namespace)` takes the worst reporter, and
# (1 - fraction) * 100 converts it to an availability percentage.
#
# The previous form, `sum_over_time(...[5m]) > 0`, stayed true for five minutes
# after a single unavailable sample, so together with `for: 5m` one transient
# blip was enough to fire an alert whose message claimed continuous downtime.
- alert: AggregatedAPIDown
  annotations:
    # $value is the availability percentage computed by the expression.
    message: >-
      An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
      only {{ $value | humanize }}% available over the last 10m.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
  expr: |
    (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
  for: 5m
  labels:
    severity: warning
# Critical when, for 15 minutes, no series up{job="apiserver"} has the value 1,
# i.e. absent() finds no healthy apiserver target.
- alert: KubeAPIDown
  expr: |
    absent(
      up{job="apiserver"} == 1
    )
  for: 15m
  labels:
    severity: critical
  annotations:
    message: KubeAPI has disappeared from Prometheus target discovery.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
# Alerts about node health and the kubelet, built from kube-state-metrics
# (job="kube-state-metrics") and kubelet (job="kubelet") series.
- name: kubernetes-system-kubelet
  rules:
  # Node's Ready condition has been reported as not "true" (series value 0)
  # for 15 minutes.
  - alert: KubeNodeNotReady
    annotations:
      message: '{{ $labels.node }} has been unready for more than 15 minutes.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
    expr: |
      kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
    for: 15m
    labels:
      severity: warning
  # Node carries the node.kubernetes.io/unreachable NoSchedule taint and does
  # not also carry the ToBeDeletedByClusterAutoscaler taint.
  # NOTE(review): there is no `for:` clause, so this fires on the first
  # evaluation in which the taint is seen.
  - alert: KubeNodeUnreachable
    annotations:
      message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
    expr: |
      (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
    labels:
      severity: warning
  # Running pod count per node (kubelet series joined to the node name via
  # kubelet_node_name) divided by the node's pod capacity exceeds 95% for
  # 15 minutes. Capacity series whose value equals 1 are filtered out (`!= 1`).
  - alert: KubeletTooManyPods
    annotations:
      message: >-
        Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }}
        of its Pod capacity.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
    expr: |
      max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
    for: 15m
    labels:
      severity: warning
  # The Ready condition series of a node changed value more than twice within
  # 15 minutes, sustained for 15 minutes.
  # The selector is pinned to job="kube-state-metrics", like the other
  # kube_node_* selectors in this group, so that the same series exposed under
  # a different job cannot be summed in and inflate the change count.
  - alert: KubeNodeReadinessFlapping
    annotations:
      message: >-
        The readiness status of node {{ $labels.node }} has changed {{ $value }}
        times in the last 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
    expr: |
      sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node) > 2
    for: 15m
    labels:
      severity: warning
  # The 0.99-quantile series of the PLEG relist duration is at or above 10
  # for 5 minutes. The metric name follows the recording-rule naming pattern;
  # its definition is not visible in this part of the file.
  - alert: KubeletPlegDurationHigh
    annotations:
      message: >-
        The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
        of {{ $value }} seconds on node {{ $labels.node }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
    expr: |
      node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
    for: 5m
    labels:
      severity: warning
  # Per-instance 99th percentile of kubelet_pod_worker_duration_seconds
  # (5m rate), labelled with the node name, is above 60 for 15 minutes.
  - alert: KubeletPodStartUpLatencyHigh
    annotations:
      message: >-
        Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
        on node {{ $labels.node }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
    expr: |
      histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
    for: 15m
    labels:
      severity: warning
  # No kubelet /metrics target has reported up == 1 for 15 minutes.
  - alert: KubeletDown
    annotations:
      message: Kubelet has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
    expr: |
      absent(up{job="kubelet", metrics_path="/metrics"} == 1)
    for: 15m
    labels:
      severity: critical
# Availability alert for the kube-scheduler scrape job.
- name: kubernetes-system-scheduler
  rules:
  # Critical when no up{job="kube-scheduler"} series has had the value 1 for
  # 15 minutes.
  - alert: KubeSchedulerDown
    expr: |
      absent(
        up{job="kube-scheduler"} == 1
      )
    for: 15m
    labels:
      severity: critical
    annotations:
      message: KubeScheduler has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
# Availability alert for the kube-controller-manager scrape job.
- name: kubernetes-system-controller-manager
  rules:
  # Critical when no up{job="kube-controller-manager"} series has had the
  # value 1 for 15 minutes.
  - alert: KubeControllerManagerDown
    expr: |
      absent(
        up{job="kube-controller-manager"} == 1
      )
    for: 15m
    labels:
      severity: critical
    annotations:
      message: KubeControllerManager has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
# Generic alerts that are not tied to a specific component.
- name: general.rules
  rules:
  # Warns when, for a (job, namespace, service) group, more than 10% of the
  # scrape targets have up == 0 for 10 minutes.
  - alert: TargetDown
    expr: |
      100 * (
        count by (job, namespace, service) (up == 0)
          /
        count by (job, namespace, service) (up)
      ) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
  # Always-firing alert: vector(1) always returns a sample, so the rule is
  # permanently active.
  - alert: Watchdog
    expr: vector(1)
    labels:
      severity: none
    annotations:
      message: |
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.
# Alerts for the CoreDNS cluster DNS server.
- name: CoreDNS
  rules:
  # Critical when the CoreDNS panic counter has increased within the last
  # 10 minutes and the condition has held for 5 minutes.
  #
  # CoreDNS 1.7.0 renamed coredns_panic_count_total to coredns_panics_total.
  # Both names are queried so the alert keeps working on either side of that
  # rename; `or` returns the new-name result and only falls back to the old
  # name for label sets the new metric does not provide.
  - alert: CorednsPanicCount
    expr: |
      increase(coredns_panics_total[10m]) > 0
        or
      increase(coredns_panic_count_total[10m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "CoreDNS Panic Count (instance {{ $labels.instance }})"
      description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment