monitoring/templates/prometheus/rules/nomad.yml

52 lines
2.1 KiB
YAML

# vi: syntax=yaml
# Prometheus alerting rules for Nomad.
# A single rule group; every alert carries the label `severity: warning`.
# The `summary` and `description` annotations are Prometheus template strings
# that are expanded with the firing series' labels and value.
groups:
  - name: Nomad
    rules:
      # True when the failed-summary series ended the trailing 30-minute
      # window higher than it started (delta > 0). `for: 0m` means there is
      # no pending period: the alert fires on the first matching evaluation.
      - alert: NomadJobFailed
        expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})'
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # True whenever the lost-summary series is above zero; fires
      # immediately (no pending period).
      - alert: NomadJobLost
        expr: 'nomad_nomad_job_summary_lost > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})'
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # True while the queued-summary series is above zero; the condition
      # must hold for 3 minutes before the alert fires.
      - alert: NomadJobQueued
        expr: 'nomad_nomad_job_summary_queued > 0'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: 'Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})'
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # True while the blocked-evaluations series is above zero; the
      # condition must hold for 2 minutes before the alert fires.
      - alert: NomadBlockedEvaluation
        expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})'
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # True when the OOM-killed series has more than one sample in the
      # trailing hour; fires immediately (no pending period).
      # NOTE(review): count_over_time counts samples in the window, not
      # OOM kill events - confirm this threshold matches how the metric is
      # exported and expired before relying on it.
      - alert: NomadTaskOOM
        expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})'
          description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"