monitoring/templates/prometheus/rules/consul.yml

55 lines
2.2 KiB
YAML

# vi: syntax=yaml
groups:
  # Single rule group holding every Consul-related alert in this file.
  - name: ConsulExporter
    rules:
      # Fires when a catalog service instance has reported unhealthy (value 0)
      # for 2 minutes. Services whose name ends in "-sidecar-proxy" are
      # excluded: a sidecar can report a critical state while its main task
      # is still pending (e.g. waiting for a volume to become available).
      - alert: ConsulServiceHealthcheckFailed
        expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Consul service healthcheck failed (service {{ $labels.service_name }})'
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) whenever consul_raft_leader is anything
      # other than 1.
      # NOTE(review): the summary interpolates a `node` label; confirm that
      # the consul_raft_leader series actually carries one in this setup,
      # otherwise the placeholder renders empty.
      - alert: ConsulMissingMasterNode
        expr: 'consul_raft_leader != 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Consul missing master node (node {{ $labels.node }})'
          description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) when a node's status="critical" series
      # equals 1.
      - alert: ConsulAgentUnhealthy
        expr: 'consul_health_node_status{status="critical"} == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Consul agent unhealthy (node {{ $labels.node }})'
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires at warning severity when a service's status="warning" series
      # has been 1 for 2 minutes.
      # NOTE(review): unlike the critical-state rules in this group, this
      # expression does not exclude "-sidecar-proxy" services; confirm that
      # is intentional.
      - alert: ConsulServiceWarning
        expr: 'consul_health_service_status{status="warning"} == 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state'
          description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires at critical severity when a service's status="critical" series
      # has been 1 for 2 minutes. Services whose name ends in
      # "-sidecar-proxy" are excluded by the label matcher.
      - alert: ConsulServiceCritical
        expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state'
          description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"