From b9b1386149b923b74204287bd874520dbfbd3e22 Mon Sep 17 00:00:00 2001
From: Daniel Berteaud
Date: Sat, 20 Apr 2024 23:57:30 +0200
Subject: [PATCH] Update rendered example

---
 example/monitoring-services.nomad.hcl | 194 +++++++++++++-------------
 1 file changed, 97 insertions(+), 97 deletions(-)

diff --git a/example/monitoring-services.nomad.hcl b/example/monitoring-services.nomad.hcl
index 025e2a4..b7aafad 100644
--- a/example/monitoring-services.nomad.hcl
+++ b/example/monitoring-services.nomad.hcl
@@ -579,103 +579,6 @@ _EOT
 
 groups:
 
-# Prometheus
-- name: Prometheus
-  rules:
-
-  - alert: PrometheusTargetMissing
-    expr: up{job!~"sftp-PR\\d+"} == 0
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
-      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusTooManyRestarts
-    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
-      description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusNotConnectedToAlertmanager
-    expr: prometheus_notifications_alertmanagers_discovered < 1
-    for: 2m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
-      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusRuleEvaluationFailures
-    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusRuleEvaluationSlow
-    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
-      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusNotificationsBacklog
-    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
-      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusAlertmanagerNotificationFailing
-    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
-      description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusTargetScrapingSlow
-    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
-      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-  - alert: PrometheusTsdbWalCorruptions
-    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-
-_EOT
-    destination = "local/rules/prometheus.yml"
-    left_delimiter = "{{{"
-    right_delimiter = "}}}"
-  }
-  template {
-    data = <<_EOT
-# vi: syntax=yaml
-
-groups:
-
 - name: Traefik
   rules:
 
@@ -829,6 +732,103 @@ _EOT
     left_delimiter = "{{{"
     right_delimiter = "}}}"
   }
+  template {
+    data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+# Prometheus
+- name: Prometheus
+  rules:
+
+  - alert: PrometheusTargetMissing
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
+      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
+      description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+      description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+
+_EOT
+    destination = "local/rules/prometheus.yml"
+    left_delimiter = "{{{"
+    right_delimiter = "}}}"
+  }
   template {
     data = <<_EOT