monitoring/example/services.nomad.hcl

job "monitoring-services" {
datacenters = ["dc1"]
region = "global"
# The metrics-server group runs Prometheus
group "metrics-server" {
shutdown_delay = "6s"
count = 1
network {
mode = "bridge"
port "metrics" {}
}
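# Persistent CSI volume for the Prometheus TSDB (per_alloc gives each allocation its own volume)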
volume "data" {
source = "prometheus-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
per_alloc = true
}
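# Register Prometheus in the Consul catalog. The metrics-port meta is what the
# consul-services scrape job defined below uses to discover the mTLS metrics endpoint.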
service {
name = "prometheus"
port = 9090
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
grace = "5m"
}
}
tags = [
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
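# Nginx vhost terminating mTLS on the exposed metrics port and proxying to the
# plain HTTP Prometheus listener on 127.0.0.1:9090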
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://localhost:9090/metrics;
}
}
_EOT
destination = "local/default.conf"
}
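# Scraping this endpoint requires a client certificate issued by the monitoring CA.
# A quick manual test could look like this (hypothetical file names):
#   curl --cert client.pem --key client.pem --cacert monitoring.ca.pem \
#        https://<host_ip>:<metrics_port>/metrics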
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# The main prometheus task
task "prometheus" {
driver = "docker"
leader = true
config {
image = "danielberteaud/prometheus:2.51.1-1"
readonly_rootfs = true
pids_limit = 200
command = "prometheus"
args = [
"--config.file=/local/prometheus.yml",
"--log.level=info",
"--web.listen-address=127.0.0.1:9090",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.time=30d",
"--web.console.libraries=/opt/prometheus/console_libraries",
"--web.console.templates=/opt/prometheus/consoles",
"--web.external-url=https://prometheus.example.org",
"--web.route-prefix=/"
]
}
vault {
policies = ["prometheus"]
env = false
disable_file = true
change_mode = "noop"
}
# Main configuration for prometheus
template {
data = <<_EOT
global:
scrape_interval: 15s
evaluation_interval: 15s
#query_log_file: /dev/stdout
external_labels:
cluster: consul
env: default
rule_files:
- /local/rules/*.yml
alerting:
alertmanagers:
- scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
# Only keep alertmanagers
- source_labels: [__meta_consul_service]
action: keep
regex: alertmanager-tls
scrape_configs:
# Cluster services
- job_name: cluster-services
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
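# Note: the $${1} references below use a doubled dollar sign in the job file to escape
# Nomad/HCL interpolation; Prometheus itself sees the usual regex capture reference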
relabel_configs:
# Drop anything which is not Nomad, Consul or Vault
# Other services will be monitored with another job
- source_labels: [__meta_consul_service]
action: keep
regex: (nomad(\-client)?|consul|vault)
- source_labels: [__meta_consul_service,__meta_consul_node]
regex: (.+);(.+)
replacement: $${1}/$${2}
target_label: __metrics_path__
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: {{ range $idx, $instance := service "cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
target_label: __address__
# Rewrite the job label to the name of the service
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: $${1}
target_label: job
# Rewrite the instance label
- source_labels: [__meta_consul_node]
regex: (.+)
replacement: $${1}
target_label: instance
# Regular services discovered from the Consul catalog
- job_name: consul-services
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
# Drop sidecar services to prevent duplicates. Sidecars themselves are handled in another job
- source_labels: [__meta_consul_service]
action: drop
regex: (.+)-sidecar-proxy
# Drop Nomad, Consul and Vault, which are already handled above
- source_labels: [__meta_consul_service]
action: drop
regex: (nomad(\-client)?|consul|vault)
# Only keep services having a metrics-port set
- source_labels: [__meta_consul_service_metadata_metrics_port]
regex: \d+
action: keep
# Get metrics path from metadata
- source_labels: [__meta_consul_service_metadata_metrics_path]
target_label: __metrics_path__
regex: (.+)
# Rewrite the scheme if needed
- source_labels: [__meta_consul_service_metadata_metrics_scheme]
regex: (https?)
replacement: $${1}
target_label: __scheme__
# Rewrite the address to use the metrics port
- source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $${1}:$${2}
target_label: __address__
# Rewrite the job label to the name of the service
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: $${1}
target_label: job
# Set the default alloc to 0 if not set
- source_labels: [__meta_consul_service_metadata_alloc]
regex: ^$
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Keep the alloc meta in a label
# Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
- source_labels: [__meta_consul_service_metadata_alloc]
regex: (.+)
replacement: $${1}
target_label: alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: $${1}-$${2}
target_label: instance
# Envoy sidecars from Consul
- job_name: consul-envoy-services
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
# Only keep sidecar services with an envoy-metrics-port defined
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
action: keep
regex: (.+)-sidecar-proxy;\d+
# Rewrite the address to use the envoy-metrics-port
- source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $${1}:$${2}
target_label: __address__
# Rewrite the job label
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: $${1}
target_label: job
# Set the default alloc to 0 if not set
- source_labels: [__meta_consul_service_metadata_alloc]
regex: ^$
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: $${1}-$${2}
target_label: instance
_EOT
destination = "local/prometheus.yml"
uid = 100000
gid = 109090
perms = 640
change_mode = "signal"
change_signal = "SIGHUP"
}
# Alert rules
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Blackbox
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/blackbox.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: JVM
rules:
- alert: JvmMemoryFillingUp
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
for: 2m
labels:
severity: warning
annotations:
summary: JVM memory filling up (instance {{ $labels.instance }})
description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/jvm.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Nomad
rules:
- alert: NomadJobFailed
expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobLost
expr: 'nomad_nomad_job_summary_lost > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobQueued
expr: 'nomad_nomad_job_summary_queued > 0'
for: 3m
labels:
severity: warning
annotations:
summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadBlockedEvaluation
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadTaskOOM
expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/nomad.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Ping
rules:
- alert: HostDown
expr: ping_loss_ratio == 1
for: 3m
labels:
severity: critical
annotations:
summary: Host down (host {{ $labels.target }})
description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PingLoss
expr: |
avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
for: 0m
labels:
severity: warning
annotations:
summary: High packet loss (host {{ $labels.target }})
description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/ping.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Postgres
rules:
- alert: PostgresqlDown
expr: 'pg_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresTooManyRestarts
expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
for: 1m
labels:
severity: warning
annotations:
summary: Postgres too many restarts (instance {{ $labels.instance }})
description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PostgresqlHighRollbackRate
# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Postgresql high rollback rate (instance {{ $labels.instance }})
# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/postgres.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
# Prometheus
- name: Prometheus
rules:
- alert: PrometheusTargetMissing
expr: up{job!~"sftp-PR\\d+"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
for: 1m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 2m
labels:
severity: critical
annotations:
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/prometheus.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Traefik
rules:
- alert: TraefikHighHttp5xxErrorRateService
expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/traefik.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: HashicorpVault
rules:
- alert: VaultSealed
expr: 'vault_core_unsealed == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Vault sealed (instance {{ $labels.instance }})
description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/vault.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed
# Note: don't check sidecar service health, as sidecars can report a critical state while the main task is pending (e.g. waiting for a volume to be available)
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
for: 2m
labels:
severity: critical
annotations:
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: 'consul_raft_leader != 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul missing master node (node {{ $labels.node }})
description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: 'consul_health_node_status{status="critical"} == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul agent unhealthy (node {{ $labels.node }})
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceWarning
expr: 'consul_health_service_status{status="warning"} == 1'
for: 2m
labels:
severity: warning
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceCritical
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/consul.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/loki.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostSwapIsFillingUp
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host swap is filling up (instance {{ $labels.instance }})
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/node.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/prometheus"
(printf "common_name=prometheus-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/prometheus.bundle.pem"
uid = 100000
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# Persistent data
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 200
memory = 768
memory_max = 1024
}
}
}
group "alerts" {
shutdown_delay = "6s"
count = 1
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
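# Persistent CSI volume for Alertmanager state (silences and notification log)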
volume "data" {
source = "alertmanager-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
per_alloc = true
}
# This service is used by the different alertmanager instances to communicate with each other
service {
name = "alertmanager-gossip"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is used by prometheus. As prometheus needs to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
service {
name = "alertmanager-tls"
port = "web-tls"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is exposed through the service mesh
# and can be used to reach the web interface through Traefik
service {
name = "alertmanager"
port = 9093
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "20s"
timeout = "8s"
check_restart {
limit = 12
grace = "30s"
}
}
tags = [
]
}
# This task handles mTLS to the AlertManager API
# and exposes it as plain HTTP on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
driver = "docker"
user = 9093
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
readonly_rootfs = true
pids_limit = 30
volumes = [
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
]
mount {
type = "tmpfs"
target = "/tmp"
tmpfs_options {
size = 3000000
}
}
}
vault {
policies = ["metrics", "alertmanager"]
env = false
disable_file = true
change_mode = "noop"
}
lifecycle {
hook = "poststart"
sidecar = true
}
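# Nginx configuration: one vhost strips TLS from the AlertManager API for local consumers,
# the other exposes the /metrics endpoint with mTLS (same pattern as the metrics-proxy task)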
template {
data = <<_EOT
# UnTLS for the web API
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
allow 127.0.0.1;
deny all;
}
}
# Metrics proxy
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
}
}
_EOT
destination = "local/alertmanager.conf"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Certificate used to connect to the AlertManager API (client cert)
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
(printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 100000
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
resources {
cpu = 10
memory = 18
}
}
# The main alertmanager task
task "alertmanager" {
driver = "docker"
leader = true
config {
image = "danielberteaud/alertmanager:0.27.0-2"
readonly_rootfs = true
pids_limit = 200
command = "/local/alertmanager"
}
vault {
policies = ["metrics", "alertmanager"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
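# Base AlertManager configuration (global SMTP settings); rendered in secrets/ as it
# could hold credentials fetched from Vault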
template {
data = <<_EOT
global:
smtp_from: alertmanager@consul
smtp_require_tls: false
smtp_smarthost: localhost:25
_EOT
destination = "secrets/alertmanager.yml"
}
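# mTLS settings for the gossip (cluster) listener, referenced by --cluster.tls-config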
template {
data = <<_EOT
tls_server_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
client_auth_type: RequireAndVerifyClientCert
client_ca_file: /local/monitoring.ca.pem
tls_client_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
ca_file: /local/monitoring.ca.pem
_EOT
destination = "local/cluster_tls.yml"
}
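# mTLS settings for the web API listener, referenced by --web.config.file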
template {
data = <<_EOT
tls_server_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
client_auth_type: RequireAndVerifyClientCert
client_ca_file: /local/monitoring.ca.pem
_EOT
destination = "local/web_tls.yml"
}
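# Launcher script: builds the alertmanager command line and adds one --cluster.peer flag
# per other allocation discovered through the alertmanager-gossip service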
template {
data = <<_EOT
#!/bin/sh
set -euo pipefail
exec alertmanager \
--config.file=/secrets/alertmanager.yml \
--storage.path=/data \
--web.external-url=https://alert.example.org \
--web.route-prefix=/ \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
--cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "alertmanager-gossip" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
--cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}
{{- end -}}
--cluster.tls-config=/local/cluster_tls.yml \
--web.config.file=/local/web_tls.yml
_EOT
destination = "local/alertmanager"
uid = 100000
gid = 100000
perms = "0755"
}
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
(printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 50
memory = 64
memory_max = 80
}
}
}
group "logs-server" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
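# Persistent CSI volume for Loki data (chunks, index and compactor working directory)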
volume "data" {
source = "loki-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "loki"
port = 3100
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "ready"
type = "http"
path = "/ready"
expose = true
interval = "20s"
timeout = "8s"
check_restart {
limit = 6
grace = "5m"
}
}
tags = [
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://localhost:3100/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
task "loki" {
driver = "docker"
config {
image = "danielberteaud/loki:2.9.6-1"
command = "loki"
args = ["--config.file=/local/loki.yml"]
}
vault {
policies = ["loki"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
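# Loki configuration: single instance with filesystem storage under /data, 30 days (720h)
# retention, and a ruler shipping alerts to AlertManager over mTLS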
template {
data = <<_EOT
analytics:
reporting_enabled: false
auth_enabled: false
common:
instance_addr: 127.0.0.1
path_prefix: /data
replication_factor: 1
ring:
kvstore:
store: inmemory
storage:
filesystem:
chunks_directory: /data/chunks
rules_directory: /data/rules
compactor:
compaction_interval: 1h
deletion_mode: filter-and-delete
retention_enabled: true
shared_store: filesystem
working_directory: /data/compactor
ingester:
chunk_idle_period: 1h
limits_config:
ingestion_burst_size_mb: 100
ingestion_rate_mb: 20
max_entries_limit_per_query: 20000
max_query_parallelism: 128
retention_period: 720h
split_queries_by_interval: 0
ruler:
alertmanager_client:
tls_ca_path: /local/monitoring.ca.pem
tls_cert_path: /secrets/loki.bundle.pem
tls_key_path: /secrets/loki.bundle.pem
tls_server_name: alertmanager.monitoring
alertmanager_url: alertmanager-tls
enable_alertmanager_discovery: true
enable_alertmanager_v2: true
enable_api: true
ring:
kvstore:
store: inmemory
rule_path: /tmp/loki-rules
storage:
local:
directory: /local/rules
type: local
schema_config:
configs:
- from: "2020-10-24"
index:
period: 24h
prefix: index_
object_store: filesystem
schema: v11
store: boltdb-shipper
server:
grpc_listen_address: 127.0.0.1
grpc_listen_port: 9095
http_listen_address: 127.0.0.1
http_listen_port: 3100
storage_config:
boltdb_shipper:
active_index_directory: /data/index
cache_location: /data/boltdb-cache
shared_store: filesystem
_EOT
destination = "local/loki.yml"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/loki"
(printf "common_name=loki-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/loki.bundle.pem"
uid = 100000
gid = 103100
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
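# Mount volume in /data for persistence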
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 150
memory = 1024
}
}
}
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector, etc.)
# and a loki sink, so logs from various external sources can be collected and shipped to Loki
group "logs-aggregator" {
count = 1
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
# The main service is the vector source
# Its Connect sidecar also provides access to other services (like loki) through the mesh
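# Other jobs can ship logs here by pointing a vector sink of type "vector" at this
# service through a Connect upstream. A minimal sketch (the sink name, inputs and the
# 127.0.0.1:9000 local bind address are placeholders, not part of this job):
#   sinks:
#     to_aggregator:
#       type: vector
#       inputs: ["my_logs"]
#       address: 127.0.0.1:9000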
service {
name = "vector-aggregator"
port = 9000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
tags = [
]
}
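# The main vector task, receiving logs from the mesh, routing them per application and shipping them to loki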
task "vector" {
driver = "docker"
leader = true
config {
image = "danielberteaud/vector:0.37.0-1"
readonly_rootfs = true
pids_limit = 200
args = ["--config=/local/vector.yml"]
}
vault {
policies = ["metrics"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
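# Vector configuration. The {{{ }}} delimiters keep consul-template from interpreting
# vector's own {{ field }} templating used in the loki sink labels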
template {
data = <<_EOT
data_dir: /local
expire_metrics_secs: 600
sources:
logs_vector:
type: vector
address: 127.0.0.1:9000
vector_metrics:
type: internal_metrics
transforms:
split-by-app:
type: route
inputs: [ "logs_*" ]
route:
traefik: '.service == "traefik"'
postgres: '.service == "postgres"'
syslog: '.source_type == "syslog"'
parse-traefik:
type: remap
inputs: ["split-by-app.traefik"]
source: |
.http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
.loki_labels.http_method = .http.verb
.loki_labels.http_status = .http.response
.loki_labels.user = .http.auth
parse-postgres:
type: remap
inputs: ["split-by-app.postgres"]
source: |
if includes(array!(.nomad.tags), "master"){
.loki_labels.pg_role = "master"
} else if includes(array!(.nomad.tags), "replica"){
.loki_labels.pg_role = "replica"
}
parse-syslog:
type: remap
inputs: ["split-by-app.syslog"]
source: |
# pfSense sends /usr/sbin/cron as the appname, instead of cron
if string!(.appname) == "/usr/sbin/cron" {
.appname = "cron"
}
.service = .appname
sinks:
loki:
type: loki
inputs: [ "split-by-app._unmatched", "parse-*" ]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
job: "{{ .service }}"
host: "{{ .host }}"
_*: "{{ .loki_labels }}"
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
# Expose vector internal metrics
prometheus:
type: prometheus_exporter
inputs: ["vector_metrics"]
address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
change_mode = "signal"
change_signal = "SIGHUP"
}
resources {
cpu = 100
memory = 192
}
}
}
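# The grafana group runs Grafana, backed by postgres (through the mesh) and a local memcached cache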
group "grafana" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
volume "data" {
source = "grafana-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "grafana"
port = 3000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "postgres"
local_bind_port = 5432
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "prometheus"
local_bind_port = 9090
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
path = "/api/health"
expose = true
interval = "30s"
timeout = "8s"
}
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-grafana.entrypoints=https",
"traefik.http.routers.monitoring-grafana.rule=Host(`grafana.example.org`)",
"traefik.http.middlewares.csp-monitoring-grafana.headers.contentsecuritypolicy=connect-src 'self' https://grafana.com;default-src 'self';font-src 'self' data:;img-src 'self' data: blob: https://grafana.com;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-grafana.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-grafana",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
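# Nginx configuration: terminate mTLS on the metrics port and proxy /metrics to Grafana on port 3000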
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:3000/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# Local memcached instance
task "memcached" {
driver = "docker"
user = 11211
lifecycle {
hook = "prestart"
sidecar = true
}
config {
image = "memcached:alpine"
readonly_rootfs = true
force_pull = true
entrypoint = ["/local/memcached"]
}
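# Small wrapper script so the memcached cache size follows the task memory limit (minus a 5 MB margin)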
template {
data = <<_EOT
#!/bin/sh
set -eu
exec memcached -l 127.0.0.1 -p 11211 -m {{ env "NOMAD_MEMORY_LIMIT" | parseInt | subtract 5 }}
_EOT
destination = "local/memcached"
perms = 755
}
resources {
cpu = 10
memory = 20
}
}
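# The main grafana task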
task "grafana" {
driver = "docker"
leader = true
config {
image = "danielberteaud/grafana:10.4.1-1"
readonly_rootfs = true
pids_limit = 100
command = "grafana"
args = [
"server",
"--homepath=/opt/grafana",
"--config=/secrets/grafana.ini",
"--packaging=docker"
]
}
vault {
policies = ["grafana"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
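# Initial admin password, fetched from vault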
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
# Basic grafana configuration file
template {
data = <<_EOT
[server]
http_addr = 127.0.0.1
http_port = 3000
root_url = https://grafana.example.org
[database]
type = postgres
name = grafana
host = 127.0.0.1:5432
user = {{ with secret "database/creds/grafana" }}{{ .Data.username }}{{ end }}
password = {{ with secret "database/creds/grafana" }}{{ .Data.password }}{{ end }}
[remote_cache]
type = memcached
connstr = 127.0.0.1:11211
[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false
[security]
cookie_secure = true
cookie_samesite = strict
x_xss_protection = true
secret_key = {{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.secret_key }}{{ end }}
[dataproxy]
timeout = 120
[feature_toggles]
_EOT
destination = "secrets/grafana.ini"
uid = 103000
perms = 400
}
# Mount volume in /data for persistence
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 100
memory = 256
}
}
}
}