Refactor prometheus rules template handling
This commit is contained in:
parent
f3ce524352
commit
8b04f6c7b6
|
@ -83,25 +83,31 @@ _EOT
|
|||
}
|
||||
|
||||
# Alert rules
|
||||
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
|
||||
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
|
||||
[[- range $bundle := file.ReadDir "bundles" ]]
|
||||
# [[ $bundle ]]
|
||||
[[- if file.Exists (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
|
||||
# if file.Exists [[ $bundle ]]
|
||||
[[- range $tpl := file.ReadDir (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
|
||||
[[- if not (file.Exists (printf "prometheus/rules/%s" $tpl)) ]]
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
|
||||
[[ tmpl.Inline (file.Read (printf "bundles/%s/templates/prometheus/rules/%s" $bundle $tpl)) $c ]]
|
||||
_EOT
|
||||
destination = "local/rules/[[ . ]]"
|
||||
destination = "local/rules/[[ $tpl ]]"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
[[- if file.Exists "prometheus/rules" ]]
|
||||
[[- range (file.ReadDir "prometheus/rules") ]]
|
||||
[[- range $tpl := file.ReadDir "prometheus/rules" ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ file.Read (printf "prometheus/rules/%s" .) ]]
|
||||
[[ tmpl.Inline (file.Read (printf "prometheus/rules/%s" $tpl)) $c ]]
|
||||
_EOT
|
||||
destination = "local/rules/[[ . ]]"
|
||||
left_delimiter = "{{{"
|
||||
|
@ -110,13 +116,22 @@ _EOT
|
|||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
[[- /* Support prometheus rules as artifacts or as raw content */]]
|
||||
[[- range $k, $v := $c.alert_rules ]]
|
||||
|
||||
[[- if has $v "url" ]]
|
||||
artifact {
|
||||
source = "[[ $v.url ]]"
|
||||
destination = "local/rules/[[ $k ]].yml"
|
||||
mode = "file"
|
||||
}
|
||||
[[- else if has $v "content" ]]
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ $v.content ]]
|
||||
_EOT
|
||||
destination = "local/rules/[[ $k ]].yml"
|
||||
}
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: JVM
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JvmMemoryFillingUp
|
||||
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM memory filling up (instance {{ $labels.instance }})
|
||||
description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -1,80 +0,0 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Postgres
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PostgresqlDown
|
||||
expr: 'pg_up == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql down (instance {{ $labels.instance }})
|
||||
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresTooManyRestarts
|
||||
expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgres too many restarts (instance {{ $labels.instance }})
|
||||
description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyConnections
|
||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: PostgresqlHighRollbackRate
|
||||
# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
|
||||
# for: 0m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||
# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -37,7 +37,7 @@ monitoring:
|
|||
# Ping exporter can ping external hosts and expose stats to prometheus
|
||||
ping:
|
||||
# Version of the exporter to use
|
||||
version: 1.1.2
|
||||
version: 1.1.3
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
|
||||
# Custom env var to set in the container
|
||||
|
@ -146,12 +146,27 @@ monitoring:
|
|||
# - 10.11.2.3:9305
|
||||
# - 192.168.6.20:782
|
||||
jobs: {}
|
||||
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
|
||||
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts,
|
||||
# or you can provide them as raw content. E.g.:
|
||||
# alert_rules:
|
||||
# postgres:
|
||||
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
|
||||
# patroni:
|
||||
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
|
||||
# custom:
|
||||
# content: |
|
||||
# groups:
|
||||
# - name: EmbeddedExporter
|
||||
# rules:
|
||||
# - alert: PrometheusJobMissing
|
||||
# expr: 'absent(up{job="prometheus"})'
|
||||
# for: 0m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Prometheus job missing (instance {{ $labels.instance }})
|
||||
# description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
|
||||
# are in /local/rules/ inside the container
|
||||
alert_rules: {}
|
||||
|
|
Loading…
Reference in New Issue