Refactor prometheus rules template handling

This commit is contained in:
Daniel Berteaud 2024-04-02 11:35:23 +02:00
parent f3ce524352
commit 8b04f6c7b6
4 changed files with 39 additions and 105 deletions

View File

@ -83,25 +83,31 @@ _EOT
}
# Alert rules
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
[[- range $bundle := file.ReadDir "bundles" ]]
# [[ $bundle ]]
[[- if file.Exists (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
# if file.Exists [[ $bundle ]]
[[- range $tpl := file.ReadDir (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
[[- if not (file.Exists (printf "prometheus/rules/%s" $tpl)) ]]
template {
data = <<_EOT
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
[[ tmpl.Inline (file.Read (printf "bundles/%s/templates/prometheus/rules/%s" $bundle $tpl)) $c ]]
_EOT
destination = "local/rules/[[ . ]]"
destination = "local/rules/[[ $tpl ]]"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
[[- end ]]
[[- end ]]
[[- if file.Exists "prometheus/rules" ]]
[[- range (file.ReadDir "prometheus/rules") ]]
[[- range $tpl := file.ReadDir "prometheus/rules" ]]
template {
data = <<_EOT
[[ file.Read (printf "prometheus/rules/%s" .) ]]
[[ tmpl.Inline (file.Read (printf "prometheus/rules/%s" $tpl)) $c ]]
_EOT
destination = "local/rules/[[ . ]]"
left_delimiter = "{{{"
@ -110,13 +116,22 @@ _EOT
[[- end ]]
[[- end ]]
[[- /* Support prometheus rules as artifacts or as raw content */]]
[[- range $k, $v := $c.alert_rules ]]
[[- if has $v "url" ]]
artifact {
source = "[[ $v.url ]]"
destination = "local/rules/[[ $k ]].yml"
mode = "file"
}
[[- else if has $v "content" ]]
template {
data = <<_EOT
[[ $v.content ]]
_EOT
destination = "local/rules/[[ $k ]].yml"
}
[[- end ]]
[[- end ]]
# A client cert, to connect to the AlertManager API

View File

@ -1,16 +0,0 @@
# vi: syntax=yaml
# Prometheus alerting rules for the "JVM" rule group.
groups:
- name: JVM
rules:
# Fires (severity: warning) when, per instance, heap memory in use divided by the
# maximum heap size has been above 90% for 2 minutes.
- alert: JvmMemoryFillingUp
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
for: 2m
labels:
severity: warning
annotations:
summary: JVM memory filling up (instance {{ $labels.instance }})
description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -1,80 +0,0 @@
# vi: syntax=yaml
# Prometheus alerting rules for the "Postgres" rule group.
groups:
- name: Postgres
rules:
# Fires immediately (for: 0m, severity: critical) when pg_up is 0.
- alert: PostgresqlDown
expr: 'pg_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires (severity: warning) when process_start_time_seconds of job "pg" changed more than
# 3 times within 15 minutes, and the condition has held for 1 minute.
- alert: PostgresTooManyRestarts
expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
for: 1m
labels:
severity: warning
annotations:
summary: Postgres too many restarts (instance {{ $labels.instance }})
description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires (severity: warning) when the per-database activity count exceeds 80% of
# pg_settings_max_connections for 2 minutes. Databases matching "template.*" or "postgres"
# are excluded.
- alert: PostgresqlTooManyConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately (severity: warning) when the deadlock counter increased by more than 5
# over the last minute. Databases matching "template.*" or "postgres" are excluded.
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The PostgresqlHighRollbackRate rule below is commented out, so it is not loaded.
# - alert: PostgresqlHighRollbackRate
# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Postgresql high rollback rate (instance {{ $labels.instance }})
# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately (severity: critical) when the rate of "statement_timeout" errors,
# computed over a 1 minute window, is above 3.
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately (severity: critical) when "deadlock_detected" errors increased by more
# than 1 over the last minute.
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires (severity: critical) when the total lock count exceeds 20% of
# max_locks_per_transaction * max_connections for 2 minutes.
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -37,7 +37,7 @@ monitoring:
# Ping exporter can ping external hosts and expose stats to prometheus
ping:
# Version of the exporter to use
version: 1.1.2
version: 1.1.3
# Docker image to use
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
# Custom env var to set in the container
@ -146,12 +146,27 @@ monitoring:
# - 10.11.2.3:9305
# - 192.168.6.20:782
jobs: {}
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load your own, either by downloading them when prometheus starts
# or by providing them as raw content. E.g.
# alert_rules:
# postgres:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
# patroni:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
# custom:
# content: |
# groups:
# - name: EmbeddedExporter
# rules:
# - alert: PrometheusJobMissing
# expr: 'absent(up{job="prometheus"})'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Prometheus job missing (instance {{ $labels.instance }})
# description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
# are in /local/rules/ inside the container
alert_rules: {}