job "monitoring-services" {
datacenters = [ "dc1" ]
region = "global"
# The metrics-server group runs Prometheus
group "metrics-server" {
shutdown_delay = "6s"
count = 1
network {
mode = "bridge"
port "metrics" {}
}
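# per_alloc makes Nomad interpolate the volume source with the allocation index,
# so each allocation mounts its own pre-registered CSI volume (eg prometheus-data[0])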
volume "data" {
source = "prometheus-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
per_alloc = true
}
service {
name = "prometheus"
port = 9090
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
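# expose = true publishes the check endpoint through the Envoy sidecar, so the
# Consul agent can reach it even though the group runs in bridge network mode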
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
grace = "5m"
}
}
tags = [
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
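# As a poststart sidecar, this proxy starts after the main task and is kept
# running for the whole life of the allocation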
vault {
policies = [ "metrics" ]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;
  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;

  gzip on;
  gzip_types text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$) {
    return 405;
  }

  location /metrics {
    proxy_pass http://localhost:9090/metrics;
  }
}
_EOT
destination = "local/default.conf"
}
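# A quick manual check of the mTLS endpoint (hypothetical certificate paths,
# assuming a client certificate issued by the monitoring PKI) could look like:
#   curl --cacert monitoring.ca.pem --cert client.pem --key client.pem \
#        https://<node-ip>:<metrics-port>/metrics
# Requests without a valid client certificate are rejected by ssl_verify_client,
# and anything other than GET/HEAD gets a 405.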
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# The main prometheus task
task "prometheus" {
driver = "docker"
leader = true
config {
image = "danielberteaud/prometheus:2.51.1-1"
readonly_rootfs = true
pids_limit = 200
command = "prometheus"
args = [
"--config.file=/local/prometheus.yml",
"--log.level=info",
"--web.listen-address=127.0.0.1:9090",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.time=30d",
"--web.console.libraries=/opt/prometheus/console_libraries",
"--web.console.templates=/opt/prometheus/consoles",
"--web.external-url=https://prometheus.example.org",
"--web.route-prefix=/"
]
}
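# Prometheus only listens on 127.0.0.1:9090, so it's reachable exclusively from
# tasks sharing this allocation's network namespace (the Connect sidecar and
# the metrics-proxy task above)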
vault {
policies = [ "prometheus" ]
env = false
disable_file = true
change_mode = "noop"
}
# Main configuration for prometheus
template {
data = <<_EOT
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  #query_log_file: /dev/stdout
  external_labels:
    cluster: consul
    env: default

rule_files:
  - /local/rules/*.yml

alerting:
  alertmanagers:
    - scheme: https
      tls_config:
        ca_file: /local/monitoring.ca.pem
        cert_file: /secrets/prometheus.bundle.pem
        key_file: /secrets/prometheus.bundle.pem
      consul_sd_configs:
        - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
          scheme: http
          token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
          datacenter: dc1
      relabel_configs:
        # Only keep alertmanagers
        - source_labels: [__meta_consul_service]
          action: keep
          regex: alertmanager-tls

scrape_configs:
  # Cluster services
  - job_name: cluster-services
    scheme: https
    tls_config:
      ca_file: /local/monitoring.ca.pem
      cert_file: /secrets/prometheus.bundle.pem
      key_file: /secrets/prometheus.bundle.pem
    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1
    relabel_configs:
      # Drop anything which is not Nomad, Consul or Vault
      # Other services will be monitored with another job
      - source_labels: [__meta_consul_service]
        action: keep
        regex: (nomad(\-client)?|consul|vault)
      - source_labels: [__meta_consul_service, __meta_consul_node]
        regex: (.+);(.+)
        replacement: $${1}/$${2}
        target_label: __metrics_path__
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: {{ range $idx, $instance := service "cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
        target_label: __address__
      # Rewrite the job label to the name of the service
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job
      # Rewrite the instance label
      - source_labels: [__meta_consul_node]
        regex: (.+)
        replacement: $${1}
        target_label: instance

  # Regular services discovered from the Consul catalog
  - job_name: consul-services
    scheme: https
    tls_config:
      ca_file: /local/monitoring.ca.pem
      cert_file: /secrets/prometheus.bundle.pem
      key_file: /secrets/prometheus.bundle.pem
    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1
    relabel_configs:
      # Drop sidecar services to prevent duplicates. Sidecars themselves are handled in another job
      - source_labels: [__meta_consul_service]
        action: drop
        regex: (.+)-sidecar-proxy
      # Drop Nomad, Consul and Vault, already handled
      - source_labels: [__meta_consul_service]
        action: drop
        regex: (nomad(\-client)?|consul|vault)
      # Only keep services having a metrics-port set
      - source_labels: [__meta_consul_service_metadata_metrics_port]
        regex: \d+
        action: keep
      # Get metrics path from metadata
      - source_labels: [__meta_consul_service_metadata_metrics_path]
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scheme if needed
      - source_labels: [__meta_consul_service_metadata_metrics_scheme]
        regex: (https?)
        replacement: $${1}
        target_label: __scheme__
      # Rewrite the address to use the metrics port
      - source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $${1}:$${2}
        target_label: __address__
      # Rewrite the job label to the name of the service
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job
      # Set the default alloc to 0 if not set
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: ^$
        replacement: 0
        target_label: __meta_consul_service_metadata_alloc
      # Keep the alloc meta in a label
      # Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: (.+)
        replacement: $${1}
        target_label: alloc
      # Rewrite the instance label to be service-alloc
      - source_labels: [__meta_consul_service, alloc]
        regex: (.+);([a-zA-Z\d\-\.]+)
        replacement: $${1}-$${2}
        target_label: instance

  # Envoy sidecars from Consul
  - job_name: consul-envoy-services
    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1
    relabel_configs:
      # Only keep sidecar services with an envoy-metrics-port defined
      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
        action: keep
        regex: (.+)-sidecar-proxy;\d+
      # Rewrite the address to use the envoy-metrics-port
      - source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $${1}:$${2}
        target_label: __address__
      # Rewrite the job label
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job
      # Set the default alloc to 0 if not set
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: ^$
        replacement: 0
        target_label: __meta_consul_service_metadata_alloc
      # Rewrite the instance label to be service-alloc
      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
        regex: (.+);([a-zA-Z\d\-\.]+)
        replacement: $${1}-$${2}
        target_label: instance
_EOT
destination = "local/prometheus.yml"
uid = 100000
gid = 109090
perms = 640
change_mode = "signal"
change_signal = "SIGHUP"
}
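# Note: $${1} is HCL escaping and renders as a literal ${1} in the final file,
# so the relabeling capture groups are expanded by Prometheus, not by Nomad.
# With change_signal = "SIGHUP", Prometheus reloads the configuration in place
# whenever the template is re-rendered.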
# Alert rules
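# The rule templates below use "{{{" / "}}}" as consul-template delimiters, so
# the regular {{ ... }} markers in annotations are passed through untouched for
# Prometheus' own templating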
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Blackbox
  rules:
  - alert: BlackboxProbeFailed
    expr: 'probe_success == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe failed (instance {{ $labels.instance }})
      description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxSlowProbe
    expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Blackbox slow probe (instance {{ $labels.instance }})
      description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxProbeHttpFailure
    expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
      description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
      description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
      description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxSslCertificateExpired
    expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
      description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: BlackboxProbeSlowHttp
    expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
      description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/blackbox.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: JVM
  rules:
  - alert: JvmMemoryFillingUp
    expr: '(sum by (instance) (jvm_memory_used_bytes{area="heap"}) / sum by (instance) (jvm_memory_max_bytes{area="heap"})) * 100 > 90'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: JVM memory filling up (instance {{ $labels.instance }})
      description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/jvm.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Nomad
  rules:
  - alert: NomadJobFailed
    expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
      description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadJobLost
    expr: 'nomad_nomad_job_summary_lost > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
      description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadJobQueued
    expr: 'nomad_nomad_job_summary_queued > 0'
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
      description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadBlockedEvaluation
    expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
      description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadTaskOOM
    expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
      description: "Nomad task killed by OOM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/nomad.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Ping
  rules:
  - alert: HostDown
    expr: 'ping_loss_ratio == 1'
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: Host down (host {{ $labels.target }})
      description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PingLoss
    expr: |
      avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: High packet loss (host {{ $labels.target }})
      description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/ping.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Postgres
  rules:
  - alert: PostgresqlDown
    expr: 'pg_up == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Postgresql down (instance {{ $labels.instance }})
      description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresTooManyRestarts
    expr: 'changes(process_start_time_seconds{job="pg"}[15m]) > 3'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Postgres too many restarts (instance {{ $labels.instance }})
      description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresqlTooManyConnections
    expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Postgresql too many connections (instance {{ $labels.instance }})
      description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresqlDeadLocks
    expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql dead locks (instance {{ $labels.instance }})
      description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # - alert: PostgresqlHighRollbackRate
  #   expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
  #   for: 0m
  #   labels:
  #     severity: warning
  #   annotations:
  #     summary: Postgresql high rollback rate (instance {{ $labels.instance }})
  #     description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresqlHighRateStatementTimeout
    expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
      description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresqlHighRateDeadlock
    expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
      description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PostgresqlTooManyLocksAcquired
    expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
      description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/postgres.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
# Prometheus
- name: Prometheus
  rules:
  - alert: PrometheusTargetMissing
    expr: 'up{job!~"sftp-PR\\d+"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusTooManyRestarts
    expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
      description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusNotConnectedToAlertmanager
    expr: 'prometheus_notifications_alertmanagers_discovered < 1'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusRuleEvaluationFailures
    expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusRuleEvaluationSlow
    expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusNotificationsBacklog
    expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
      description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusTargetScrapingSlow
    expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PrometheusTsdbWalCorruptions
    expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/prometheus.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: Traefik
  rules:
  - alert: TraefikHighHttp5xxErrorRateService
    expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
      description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/traefik.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: HashicorpVault
  rules:
  - alert: VaultSealed
    expr: 'vault_core_unsealed == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Vault sealed (instance {{ $labels.instance }})
      description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/vault.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: ConsulExporter
  rules:
  - alert: ConsulServiceHealthcheckFailed
    # Note: don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
    expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: Consul service healthcheck failed (service {{ $labels.service_name }})
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ConsulMissingMasterNode
    expr: 'consul_raft_leader != 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Consul missing master node (node {{ $labels.node }})
      description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ConsulAgentUnhealthy
    expr: 'consul_health_node_status{status="critical"} == 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Consul agent unhealthy (node {{ $labels.node }})
      description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ConsulServiceWarning
    expr: 'consul_health_service_status{status="warning"} == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
      description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ConsulServiceCritical
    expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
      description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/consul.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: EmbeddedExporter
  rules:
  - alert: LokiProcessTooManyRestarts
    expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Loki process too many restarts (instance {{ $labels.instance }})
      description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: LokiRequestErrors
    expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: Loki request errors (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: LokiRequestPanic
    expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Loki request panic (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: LokiRequestLatency
    expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Loki request latency (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/loki.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: NodeExporter
  rules:
  - alert: HostOutOfMemory
    expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostMemoryUnderMemoryPressure
    expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ $labels.instance }})
      description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostMemoryIsUnderutilized
    expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 1w
    labels:
      severity: info
    annotations:
      summary: Host Memory is underutilized (instance {{ $labels.instance }})
      description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputIn
    expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputOut
    expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualDiskReadRate
    expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualDiskWriteRate
    expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOutOfDiskSpace
    expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostDiskWillFillIn24Hours
    expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOutOfInodes
    expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ $labels.instance }})
      description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostFilesystemDeviceError
    expr: 'node_filesystem_device_error == 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host filesystem device error (instance {{ $labels.instance }})
      description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostInodesWillFillIn24Hours
    expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualDiskReadLatency
    expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ $labels.instance }})
      description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualDiskWriteLatency
    expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ $labels.instance }})
      description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostHighCpuLoad
    expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # - alert: HostCpuIsUnderutilized
  #   expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  #   for: 1w
  #   labels:
  #     severity: info
  #   annotations:
  #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
  #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostCpuStealNoisyNeighbor
    expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostCpuHighIowait
    expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU high iowait (instance {{ $labels.instance }})
      description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualDiskIo
    expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk IO (instance {{ $labels.instance }})
      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostContextSwitching
    expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host context switching (instance {{ $labels.instance }})
      description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # - alert: HostSwapIsFillingUp
  #   expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  #   for: 2m
  #   labels:
  #     severity: warning
  #   annotations:
  #     summary: Host swap is filling up (instance {{ $labels.instance }})
  #     description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostSystemdServiceCrashed
    expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host systemd service crashed (instance {{ $labels.instance }})
      description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostPhysicalComponentTooHot
    expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostNodeOvertemperatureAlarm
    expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
      description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostRaidArrayGotInactive
    expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostRaidDiskFailure
    expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostKernelVersionDeviations
    expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 6h
    labels:
      severity: warning
    annotations:
      summary: Host kernel version deviations (instance {{ $labels.instance }})
      description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOomKillDetected
    expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})
      description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostEdacCorrectableErrorsDetected
    expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostEdacUncorrectableErrorsDetected
    expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostNetworkReceiveErrors
    expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ $labels.instance }})
      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostNetworkTransmitErrors
    expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ $labels.instance }})
      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostNetworkInterfaceSaturated
    expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Host Network Interface Saturated (instance {{ $labels.instance }})
      description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostNetworkBondDegraded
    expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Bond Degraded (instance {{ $labels.instance }})
      description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostConntrackLimit
    expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
      description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostClockSkew
    expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})
      description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostClockNotSynchronising
    expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})
      description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostRequiresReboot
    expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
    for: 4h
    labels:
      severity: info
    annotations:
      summary: Host requires reboot (instance {{ $labels.instance }})
      description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/node.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/prometheus"
  (printf "common_name=prometheus-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/prometheus.bundle.pem"
uid = 100000
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
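# The per-alloc TTL (72h + 24h x alloc index) presumably staggers certificate
# renewals so that all instances don't rotate their certificate at the same time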
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# Persistent data
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 200
memory = 768
memory_max = 1024
}
}
}
group "alerts" {
shutdown_delay = "6s"
count = 1
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
volume "data" {
source = "alertmanager-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
per_alloc = true
}
# This service is used by the different alertmanager instances to communicate with each other
service {
name = "alertmanager-gossip"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
service {
name = "alertmanager-tls"
port = "web-tls"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is exposed through the service mesh
# and can be used to reach the web interface through Traefik
service {
name = "alertmanager"
port = 9093
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "20s"
timeout = "8s"
check_restart {
limit = 12
grace = "30s"
}
}
tags = [
]
}
# This task handles mTLS to the AlertManager API
# and exposes it as plain HTTP on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
driver = "docker"
user = 9093
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
readonly_rootfs = true
pids_limit = 30
volumes = [
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro" ,
]
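# With readonly_rootfs = true, nginx still needs a writable /tmp for its
# temporary buffers, hence this tmpfs mount (size is in bytes, ~3 MB)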
mount {
type = "tmpfs"
target = "/tmp"
tmpfs_options {
size = 3000000
}
}
}
vault {
policies = [ "metrics", "alertmanager" ]
env = false
disable_file = true
change_mode = "noop"
}
lifecycle {
hook = "poststart"
sidecar = true
}
template {
data = <<_EOT
# UnTLS for the web API
server {
  listen 127.0.0.1:9093;
  location / {
    proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
    proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
    proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
    proxy_ssl_verify on;
    proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
    proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
    allow 127.0.0.1;
    deny all;
  }
}

# Metrics proxy
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;
  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;

  gzip on;
  gzip_types text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$) {
    return 405;
  }

  location /metrics {
    proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
    proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
    proxy_ssl_verify on;
    proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
    proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
    proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
  }
}
_EOT
destination = "local/alertmanager.conf"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = < < _EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ . Data . ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
  (printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 100000
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
resources {
cpu = 10
memory = 18
}
}
# The main alertmanager task
task "alertmanager" {
driver = "docker"
leader = true
config {
image = "danielberteaud/alertmanager:0.27.0-2"
readonly_rootfs = true
pids_limit = 200
command = "/local/alertmanager"
}
vault {
policies = [ "metrics", "alertmanager" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
template {
data = <<_EOT
global:
  smtp_from: alertmanager@consul
  smtp_require_tls: false
  smtp_smarthost: localhost:25
_EOT
destination = "secrets/alertmanager.yml"
}
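# mTLS settings for the gossip traffic between AlertManager instances: the
# process acts as both server and client, hence the two sections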
template {
data = <<_EOT
tls_server_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  client_auth_type: RequireAndVerifyClientCert
  client_ca_file: /local/monitoring.ca.pem
tls_client_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  ca_file: /local/monitoring.ca.pem
_EOT
destination = "local/cluster_tls.yml"
}
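# TLS settings for the web API (exporter-toolkit web config format), requiring
# a verified client certificate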
template {
data = <<_EOT
tls_server_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  client_auth_type: RequireAndVerifyClientCert
  client_ca_file: /local/monitoring.ca.pem
_EOT
destination = "local/web_tls.yml"
}
template {
data = <<_EOT
#!/bin/sh
set -euo pipefail
exec alertmanager \
  --config.file=/secrets/alertmanager.yml \
  --storage.path=/data \
  --web.external-url=https://alert.example.org \
  --web.route-prefix=/ \
  --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
  --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
  --cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "alertmanager-gossip" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
  --cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}
{{- end -}}
  --cluster.tls-config=/local/cluster_tls.yml \
  --web.config.file=/local/web_tls.yml
_EOT
destination = "local/alertmanager"
uid = 100000
gid = 100000
perms = "0755"
}
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
(printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 50
memory = 64
memory_max = 80
}
}
}
group "logs-server" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
volume "data" {
source = "loki-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "loki"
port = 3100
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
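# With expose = true, Consul reaches the /ready endpoint through the Envoy
# sidecar, as loki itself only listens on 127.0.0.1 inside the bridge network.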
check {
name = "ready"
type = "http"
path = "/ready"
expose = true
interval = "20s"
timeout = "8s"
check_restart {
limit = 6
grace = "5m"
}
}
tags = [
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = [ "metrics" ]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;
  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;

  gzip on;
  gzip_types text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$) {
    return 405;
  }

  location /metrics {
    proxy_pass http://localhost:3100/metrics;
  }
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
task "loki" {
driver = "docker"
config {
image = "danielberteaud/loki:2.9.6-1"
command = "loki"
args = ["--config.file=/local/loki.yml"]
}
vault {
policies = [ "loki" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
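# Loki configuration: single-node setup (inmemory ring, filesystem object
# store, boltdb-shipper index), 720h (30 days) retention enforced by the
# compactor, and a ruler pushing alerts to AlertManager over mTLS.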
template {
data = <<_EOT
analytics:
  reporting_enabled: false
auth_enabled: false
common:
  instance_addr: 127.0.0.1
  path_prefix: /data
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /data/chunks
      rules_directory: /data/rules
compactor:
  compaction_interval: 1h
  deletion_mode: filter-and-delete
  retention_enabled: true
  shared_store: filesystem
  working_directory: /data/compactor
ingester:
  chunk_idle_period: 1h
limits_config:
  ingestion_burst_size_mb: 100
  ingestion_rate_mb: 20
  max_entries_limit_per_query: 20000
  max_query_parallelism: 128
  retention_period: 720h
  split_queries_by_interval: 0
ruler:
  alertmanager_client:
    # The CA chain is rendered to local/, see the template below
    tls_ca_path: /local/monitoring.ca.pem
    tls_cert_path: /secrets/loki.bundle.pem
    tls_key_path: /secrets/loki.bundle.pem
    tls_server_name: alertmanager.monitoring
  alertmanager_url: alertmanager-tls
  enable_alertmanager_discovery: true
  enable_alertmanager_v2: true
  enable_api: true
  ring:
    kvstore:
      store: inmemory
  rule_path: /tmp/loki-rules
  storage:
    local:
      directory: /local/rules
    type: local
schema_config:
  configs:
  - from: "2020-10-24"
    index:
      period: 24h
      prefix: index_
    object_store: filesystem
    schema: v11
    store: boltdb-shipper
server:
  grpc_listen_address: 127.0.0.1
  grpc_listen_port: 9095
  http_listen_address: 127.0.0.1
  http_listen_port: 3100
storage_config:
  boltdb_shipper:
    active_index_directory: /data/index
    cache_location: /data/boltdb-cache
    shared_store: filesystem
_EOT
destination = "local/loki.yml"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/loki"
(printf "common_name=loki-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/loki.bundle.pem"
uid = 100000
gid = 103100
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 150
memory = 1024
}
}
}
# The aggregator group runs vector with different source connectors (syslog,
# fluentd, vector, etc.) and a loki sink. The goal is to be able to collect
# logs from various sources.
group "logs-aggregator" {
count = 1
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
# The main service is the vector source
# It will provide access to other services through the mesh (like loki)
service {
name = "vector-aggregator"
port = 9000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
tags = [
]
}
task "vector" {
driver = "docker"
leader = true
config {
image = "danielberteaud/vector:0.37.0-1"
readonly_rootfs = true
pids_limit = 200
args = ["--config=/local/vector.yml"]
}
vault {
policies = [ "metrics" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
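# Main vector configuration. Consul-template delimiters are switched to
# {{{ }}} below because the Loki sink labels rely on vector's own {{ }}
# templating. The $${NOMAD_ALLOC_PORT_metrics} reference is escaped with $$
# so (presumably) it is resolved by vector from its environment at runtime
# instead of being interpolated by Nomad when parsing the job.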
template {
data = <<_EOT
data_dir: /local
expire_metrics_secs: 600
sources:
  logs_vector:
    type: vector
    address: 127.0.0.1:9000
  vector_metrics:
    type: internal_metrics
transforms:
  split-by-app:
    type: route
    inputs: [ "logs_*" ]
    route:
      traefik: '.service == "traefik"'
      postgres: '.service == "postgres"'
      syslog: '.source_type == "syslog"'
  parse-traefik:
    type: remap
    inputs: [ "split-by-app.traefik" ]
    source: |
      .http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
      .loki_labels.http_method = .http.verb
      .loki_labels.http_status = .http.response
      .loki_labels.user = .http.auth
  parse-postgres:
    type: remap
    inputs: [ "split-by-app.postgres" ]
    source: |
      if includes(array!(.nomad.tags), "master") {
        .loki_labels.pg_role = "master"
      } else if includes(array!(.nomad.tags), "replica") {
        .loki_labels.pg_role = "replica"
      }
  parse-syslog:
    type: remap
    inputs: [ "split-by-app.syslog" ]
    source: |
      # PfSense sends /usr/sbin/cron as the appname, instead of cron
      if string!(.appname) == "/usr/sbin/cron" {
        .appname = "cron"
      }
      .service = .appname
sinks:
  loki:
    type: loki
    inputs: [ "split-by-app._unmatched", "parse-*" ]
    endpoint: http://127.0.0.1:3100
    encoding:
      codec: text
    labels:
      job: "{{ .service }}"
      host: "{{ .host }}"
      _*: "{{ .loki_labels }}"
    buffer:
      type: disk
      max_size: 268435488
    remove_label_fields: true
  # Expose vector internal metrics
  prometheus:
    type: prometheus_exporter
    inputs: [ "vector_metrics" ]
    address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
    tls:
      enabled: true
      crt_file: /secrets/metrics.bundle.pem
      key_file: /secrets/metrics.bundle.pem
      ca_file: /local/monitoring.ca.pem
      verify_certificate: true
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
change_mode = "signal"
change_signal = "SIGHUP"
}
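# On rendered config changes, consul-template sends SIGHUP, which vector
# (presumably) handles as a live configuration reload rather than a restart.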
resources {
cpu = 100
memory = 192
}
}
}
group "grafana" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
volume "data" {
source = "grafana-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "grafana"
port = 3000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "postgres"
local_bind_port = 5432
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "prometheus"
local_bind_port = 9090
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
path = "/api/health"
expose = true
interval = "30s"
timeout = "8s"
}
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-grafana.entrypoints=https",
"traefik.http.routers.monitoring-grafana.rule=Host(`grafana.example.org`)",
"traefik.http.middlewares.csp-monitoring-grafana.headers.contentsecuritypolicy=connect-src 'self' https://grafana.com; default-src 'self'; font-src 'self' data:; img-src 'self' data: blob: https://grafana.com; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-grafana.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-grafana",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = [ "metrics" ]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;
  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;

  gzip on;
  gzip_types text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$) {
    return 405;
  }

  location /metrics {
    proxy_pass http://127.0.0.1:3000/metrics;
  }
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# Local memcached instance
task "memcached" {
driver = "docker"
user = 11211
lifecycle {
hook = "prestart"
sidecar = true
}
config {
image = "memcached:alpine"
readonly_rootfs = true
force_pull = true
entrypoint = [ "/local/memcached" ]
}
template {
data = <<_EOT
#!/bin/sh
set -eu
exec memcached -l 127.0.0.1 -p 11211 -m {{ env "NOMAD_MEMORY_LIMIT" | parseInt | subtract 5 }}
_EOT
destination = "local/memcached"
perms = 755
}
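# memcached is sized from the task's memory limit: NOMAD_MEMORY_LIMIT
# minus 5 MB of headroom is passed as the cache size (-m).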
resources {
cpu = 10
memory = 20
}
}
task "grafana" {
driver = "docker"
leader = true
config {
image = "danielberteaud/grafana:10.4.1-1"
readonly_rootfs = true
pids_limit = 100
command = "grafana"
args = [
"server",
"--homepath=/opt/grafana",
"--config=/secrets/grafana.ini",
"--packaging=docker"
]
}
vault {
policies = [ "grafana" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
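# Database credentials in the config below are dynamic: they are issued at
# render time by Vault's database secrets engine (database/creds/grafana).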
# Basic grafana configuration file
template {
data = <<_EOT
[server]
http_addr = 127.0.0.1
http_port = 3000
root_url = https://grafana.example.org

[database]
type = postgres
name = grafana
host = 127.0.0.1:5432
user = {{ with secret "database/creds/grafana" }}{{ .Data.username }}{{ end }}
password = {{ with secret "database/creds/grafana" }}{{ .Data.password }}{{ end }}

[remote_cache]
type = memcached
connstr = 127.0.0.1:11211

[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false

[security]
cookie_secure = true
cookie_samesite = strict
x_xss_protection = true
secret_key = {{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.secret_key }}{{ end }}

[dataproxy]
timeout = 120

[feature_toggles]
_EOT
destination = "secrets/grafana.ini"
uid = 103000
perms = 400
}
# Mount volume in /data for persistence
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 100
memory = 256
}
}
}
}