job "monitoring-services" {
|
|
|
|
|
|
datacenters = ["dc1"]
|
|
region = "global"
|
|
|
|
|
|
|
|
|
|
# Metrics is running prometheus
|
|
group "metrics-server" {
|
|
|
|
shutdown_delay = "6s"
|
|
count = 1
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
|
|
volume "data" {
|
|
source = "prometheus-data"
|
|
type = "csi"
|
|
access_mode = "single-node-writer"
|
|
attachment_mode = "file-system"
|
|
per_alloc = true
|
|
}
|
|
|
|
|
|
service {
|
|
name = "prometheus"
|
|
port = 9090
|
|
|
|
meta {
|
|
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
datacenter = "${NOMAD_DC}"
|
|
group = "${NOMAD_GROUP_NAME}"
|
|
job = "${NOMAD_JOB_NAME}"
|
|
namespace = "${NOMAD_NAMESPACE}"
|
|
node = "${node.unique.name}"
|
|
region = "${NOMAD_REGION}"
|
|
}
|
|
|
|
connect {
|
|
sidecar_service {
|
|
}
|
|
sidecar_task {
|
|
config {
|
|
args = [
|
|
"-c",
|
|
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
|
"-l",
|
|
"${meta.connect.log_level}",
|
|
"--concurrency",
|
|
"${meta.connect.proxy_concurrency}",
|
|
"--disable-hot-restart"
|
|
]
|
|
}
|
|
|
|
resources {
|
|
cpu = 50
|
|
memory = 64
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
expose = true
|
|
path = "/-/healthy"
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 10
|
|
grace = "5m"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
|
|
|
|
]
|
|
}
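
    # The meta entries above are published in the Consul catalog and surface to
    # Prometheus as __meta_consul_service_metadata_* labels; the consul-services
    # scrape job in prometheus.yml below relies on them (notably metrics-port) to
    # build its targets.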

    # The prometheus metrics proxy, adding mTLS to the metrics endpoint
    task "metrics-proxy" {
      driver = "docker"
      user   = "8995"

      config {
        image      = "nginxinc/nginx-unprivileged:alpine"
        force_pull = true
        volumes = [
          "local/default.conf:/etc/nginx/conf.d/default.conf:ro"
        ]
        pids_limit = 100
      }

      lifecycle {
        hook    = "poststart"
        sidecar = true
      }

      vault {
        policies = ["metrics"]
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      template {
        data        = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2  on;

  ssl_certificate        /secrets/metrics.bundle.pem;
  ssl_certificate_key    /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client      on;
  ssl_protocols          TLSv1.2 TLSv1.3;
  ssl_ciphers            ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache      shared:SSL:10m;
  ssl_session_timeout    1h;
  ssl_session_tickets    off;
  gzip                   on;
  gzip_types             text/plain;
  gzip_vary              on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }

  location /metrics {
    proxy_pass http://localhost:9090/metrics;
  }
}
_EOT
        destination = "local/default.conf"
      }

      resources {
        cpu        = 10
        memory     = 10
        memory_max = 20
      }
    }
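
    # Illustrative check (not part of the job): with a client certificate issued
    # from the same monitoring PKI, the proxied endpoint should answer something like:
    #   curl --cert client.bundle.pem --key client.bundle.pem \
    #        --cacert monitoring.ca.pem https://<host_ip>:<metrics_port>/metrics
    # (file names and the <host_ip>:<metrics_port> placeholders are hypothetical)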

    # The main prometheus task
    task "prometheus" {
      driver = "docker"
      leader = true

      config {
        image           = "danielberteaud/prometheus:2.51.1-1"
        readonly_rootfs = true
        pids_limit      = 200
        command         = "prometheus"
        args = [
          "--config.file=/local/prometheus.yml",
          "--log.level=info",
          "--web.listen-address=127.0.0.1:9090",
          "--storage.tsdb.path=/data",
          "--storage.tsdb.retention.time=30d",
          "--web.console.libraries=/opt/prometheus/console_libraries",
          "--web.console.templates=/opt/prometheus/consoles",
          "--web.external-url=https://prometheus.example.org",
          "--web.route-prefix=/"
        ]
      }

      vault {
        policies     = ["prometheus"]
        env          = false
        disable_file = true
        change_mode  = "noop"
      }
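
      # Note: the config templates below use change_mode = "signal" with SIGHUP,
      # which Prometheus handles natively by reloading its configuration without
      # a restart.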

      # Main configuration for prometheus
      template {
        data = <<_EOT
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  #query_log_file: /dev/stdout
  external_labels:
    cluster: consul
    env: default

rule_files:
  - /local/rules/*.yml

alerting:
  alertmanagers:
    - scheme: https
      tls_config:
        ca_file: /local/monitoring.ca.pem
        cert_file: /secrets/prometheus.bundle.pem
        key_file: /secrets/prometheus.bundle.pem
      consul_sd_configs:
        - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
          scheme: http
          token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
          datacenter: dc1
      relabel_configs:
        # Only keep alertmanagers
        - source_labels: [__meta_consul_service]
          action: keep
          regex: alertmanager-tls

scrape_configs:

  # Cluster services
  - job_name: cluster-services
    scheme: https
    tls_config:
      ca_file: /local/monitoring.ca.pem
      cert_file: /secrets/prometheus.bundle.pem
      key_file: /secrets/prometheus.bundle.pem
    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1
    relabel_configs:

      # Drop anything which is not Nomad, Consul or Vault
      # Other services will be monitored with another job
      - source_labels: [__meta_consul_service]
        action: keep
        regex: (nomad(\-client)?|consul|vault)

      - source_labels: [__meta_consul_service,__meta_consul_node]
        regex: (.+);(.+)
        replacement: $${1}/$${2}
        target_label: __metrics_path__

      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: {{ range $idx, $instance := service "cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
        target_label: __address__

      # Rewrite the job label to the name of the service
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job

      # Rewrite the instance label
      - source_labels: [__meta_consul_node]
        regex: (.+)
        replacement: $${1}
        target_label: instance

  # Regular services discovered from the Consul catalog
  - job_name: consul-services
    scheme: https
    tls_config:
      ca_file: /local/monitoring.ca.pem
      cert_file: /secrets/prometheus.bundle.pem
      key_file: /secrets/prometheus.bundle.pem

    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1

    relabel_configs:

      # Drop sidecar services to prevent duplicates. Sidecars themselves are handled in another job
      - source_labels: [__meta_consul_service]
        action: drop
        regex: (.+)-sidecar-proxy

      # Drop Nomad, Consul and vault, already handled
      - source_labels: [__meta_consul_service]
        action: drop
        regex: (nomad(\-client)?|consul|vault)

      # Only keep services having a metrics-port set
      - source_labels: [__meta_consul_service_metadata_metrics_port]
        regex: \d+
        action: keep

      # Get metrics path from metadata
      - source_labels: [__meta_consul_service_metadata_metrics_path]
        target_label: __metrics_path__
        regex: (.+)

      # Rewrite the scheme if needed
      - source_labels: [__meta_consul_service_metadata_metrics_scheme]
        regex: (https?)
        replacement: $${1}
        target_label: __scheme__

      # Rewrite the address to use the metrics port
      - source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $${1}:$${2}
        target_label: __address__
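      # e.g. (illustrative values): a discovered target "10.0.0.5:23456" whose
      # Consul meta has metrics-port=9100 is rewritten to "10.0.0.5:9100"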

      # Rewrite the job label to the name of the service
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job

      # Set the default alloc to 0 if not set
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: ^$
        replacement: 0
        target_label: __meta_consul_service_metadata_alloc

      # Keep the alloc meta in a label
      # Note that most of the time, alloc is just the allocation index, but in some
      # cases it can be the host name (for system jobs)
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: (.+)
        replacement: $${1}
        target_label: alloc

      # Rewrite the instance label to be service-alloc
      - source_labels: [__meta_consul_service, alloc]
        regex: (.+);([a-zA-Z\d\-\.]+)
        replacement: $${1}-$${2}
        target_label: instance
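      # e.g. (illustrative): service "grafana" with alloc "0" gets instance "grafana-0"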

  # Envoy sidecars from Consul
  - job_name: consul-envoy-services
    consul_sd_configs:
      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
        scheme: http
        token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
        datacenter: dc1

    relabel_configs:

      # Only keep sidecar services with an envoy-metrics-port defined
      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
        action: keep
        regex: (.+)-sidecar-proxy;\d+

      # Rewrite the address to use the envoy-metrics-port
      - source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $${1}:$${2}
        target_label: __address__

      # Rewrite the job label
      - source_labels: [__meta_consul_service]
        regex: (.+)
        replacement: $${1}
        target_label: job

      # Set the default alloc to 0 if not set
      - source_labels: [__meta_consul_service_metadata_alloc]
        regex: ^$
        replacement: 0
        target_label: __meta_consul_service_metadata_alloc

      # Rewrite the instance label to be service-alloc
      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
        regex: (.+);([a-zA-Z\d\-\.]+)
        replacement: $${1}-$${2}
        target_label: instance

_EOT
        destination   = "local/prometheus.yml"
        uid           = 100000
        gid           = 109090
        perms         = "640"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # Alert rules
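      # Note: the rule templates below switch consul-template delimiters to "{{{" / "}}}"
      # so that the literal {{ $labels }} / {{ $value }} sequences reach Prometheus untouched.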

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: Blackbox
    rules:

      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSlowProbe
        expr: avg_over_time(probe_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox slow probe (instance {{ $labels.instance }})
          description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
          description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateWillExpireSoon
        expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateWillExpireSoon
        expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateExpired
        expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
          description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxProbeSlowHttp
        expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
          description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/blackbox.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: JVM

    rules:

      - alert: JvmMemoryFillingUp
        expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: JVM memory filling up (instance {{ $labels.instance }})
          description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/jvm.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: Nomad
    rules:

      - alert: NomadJobFailed
        expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NomadJobLost
        expr: 'nomad_nomad_job_summary_lost > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NomadJobQueued
        expr: 'nomad_nomad_job_summary_queued > 0'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NomadBlockedEvaluation
        expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NomadTaskOOM
        expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
          description: "Nomad task killed by OOM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/nomad.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: Ping
    rules:

      - alert: HostDown
        expr: ping_loss_ratio == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: Host down (host {{ $labels.target }})
          description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PingLoss
        expr: |
          avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: High packet loss (host {{ $labels.target }})
          description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/ping.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: Postgres

    rules:

      - alert: PostgresqlDown
        expr: 'pg_up == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresTooManyRestarts
        expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Postgres too many restarts (instance {{ $labels.instance }})
          description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyConnections
        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # - alert: PostgresqlHighRollbackRate
      #   expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
      #   for: 0m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Postgresql high rollback rate (instance {{ $labels.instance }})
      #     description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateStatementTimeout
        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateDeadlock
        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyLocksAcquired
        expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/postgres.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  # Prometheus
  - name: Prometheus
    rules:

      - alert: PrometheusTargetMissing
        expr: up{job!~"sftp-PR\\d+"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
          description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotConnectedToAlertmanager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationSlow
        expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slower storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotificationsBacklog
        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/prometheus.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: Traefik

    rules:

      - alert: TraefikHighHttp5xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/traefik.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: HashicorpVault

    rules:

      - alert: VaultSealed
        expr: 'vault_core_unsealed == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Vault sealed (instance {{ $labels.instance }})
          description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/vault.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
# vi: syntax=yaml

groups:

  - name: ConsulExporter

    rules:

      # Note: don't check sidecar service health, as sidecars can report a critical
      # state while the main task is pending (e.g. waiting for a volume to be available)
      - alert: ConsulServiceHealthcheckFailed
        expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Consul service healthcheck failed (service {{ $labels.service_name }})
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ConsulMissingMasterNode
        expr: 'consul_raft_leader != 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Consul missing master node (node {{ $labels.node }})
          description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ConsulAgentUnhealthy
        expr: 'consul_health_node_status{status="critical"} == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Consul agent unhealthy (node {{ $labels.node }})
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ConsulServiceWarning
        expr: 'consul_health_service_status{status="warning"} == 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
          description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ConsulServiceCritical
        expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
          description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/consul.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
groups:

  - name: EmbeddedExporter

    rules:

      - alert: LokiProcessTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A Loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestErrors
        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestPanic
        expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestLatency
        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/loki.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      template {
        data            = <<_EOT
groups:

  - name: NodeExporter

    rules:

      - alert: HostOutOfMemory
        expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryIsUnderutilized
        expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage has been < 20% for 1 week. Consider reducing the allocated memory. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputIn
        expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadRate
        expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskWriteRate
        expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfDiskSpace
        expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostDiskWillFillIn24Hours
        expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfInodes
        expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostInodesWillFillIn24Hours
        expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadLatency
        expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskWriteLatency
        expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostHighCpuLoad
        expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # - alert: HostCpuIsUnderutilized
      #   expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      #   for: 1w
      #   labels:
      #     severity: info
      #   annotations:
      #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
      #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuStealNoisyNeighbor
        expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuHighIowait
        expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait (instance {{ $labels.instance }})
          description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskIo
        expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostContextSwitching
        expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # - alert: HostSwapIsFillingUp
      #   expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      #   for: 2m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Host swap is filling up (instance {{ $labels.instance }})
      #     description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSystemdServiceCrashed
        expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostPhysicalComponentTooHot
        expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNodeOvertemperatureAlarm
        expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostRaidArrayGotInactive
        expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive (instance {{ $labels.instance }})
          description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostRaidDiskFailure
        expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure (instance {{ $labels.instance }})
          description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostKernelVersionDeviations
        expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOomKillDetected
        expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostEdacCorrectableErrorsDetected
        expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostEdacUncorrectableErrorsDetected
        expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkReceiveErrors
        expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkTransmitErrors
        expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkInterfaceSaturated
        expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
          description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkBondDegraded
        expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostConntrackLimit
        expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostClockSkew
        expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostClockNotSynchronising
        expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostRequiresReboot
        expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 4h
        labels:
          severity: info
        annotations:
          summary: Host requires reboot (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

_EOT
        destination     = "local/rules/node.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }

      # A client cert, to connect to the AlertManager API
      template {
        data          = <<_EOT
{{- with pkiCert "pki/monitoring/issue/prometheus"
  (printf "common_name=prometheus-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
        destination   = "secrets/prometheus.bundle.pem"
        uid           = 100000
        gid           = 109090
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }
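
      # The TTL above is staggered per allocation ((index * 24) + 72 hours): alloc 0
      # gets 72h, alloc 1 gets 96h, and so on, so instances don't all renew their
      # certificate at the same time.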

      # The monitoring CA chain, to validate AlertManager cert
      template {
        data          = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination   = "local/monitoring.ca.pem"
        uid           = 100000
        gid           = 100000
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # Persistent data
      volume_mount {
        volume      = "data"
        destination = "/data"
      }

      resources {
        cpu        = 200
        memory     = 768
        memory_max = 1024
      }
    }
  }

  group "alerts" {

    shutdown_delay = "6s"
    count          = 1

    network {
      mode = "bridge"
      # Port exposing the web API, with mTLS
      port "web-tls" {}
      # Port used for gossip between the different alertmanager instances
      port "cluster" {}
      # Port to expose metrics to prometheus
      port "metrics" {}
    }

    volume "data" {
      source          = "alertmanager-data"
      type            = "csi"
      access_mode     = "single-node-writer"
      attachment_mode = "file-system"
      per_alloc       = true
    }

    # This service is used by the different instances of alertmanager to communicate
    service {
      name = "alertmanager-gossip"
      port = "cluster"
      meta {
        alloc = "${NOMAD_ALLOC_INDEX}"
      }
    }

    # This service is used by prometheus. As it needs to be able to reach every
    # instance, it cannot use the service mesh. The exposed port uses mTLS, so it's
    # safe to expose it outside of the mesh
    service {
      name = "alertmanager-tls"
      port = "web-tls"
      meta {
        alloc = "${NOMAD_ALLOC_INDEX}"
      }
    }

    # This service is exposed through the service mesh
    # and can be used to reach the web interface through Traefik
    service {
      name = "alertmanager"
      port = 9093

      meta {
        metrics-port = "${NOMAD_HOST_PORT_metrics}"
        alloc        = "${NOMAD_ALLOC_INDEX}"
        datacenter   = "${NOMAD_DC}"
        group        = "${NOMAD_GROUP_NAME}"
        job          = "${NOMAD_JOB_NAME}"
        namespace    = "${NOMAD_NAMESPACE}"
        node         = "${node.unique.name}"
        region       = "${NOMAD_REGION}"
      }

      connect {
        sidecar_service {}
        sidecar_task {
          config {
            args = [
              "-c",
              "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
              "-l",
              "${meta.connect.log_level}",
              "--concurrency",
              "${meta.connect.proxy_concurrency}",
              "--disable-hot-restart"
            ]
          }

          resources {
            cpu    = 50
            memory = 64
          }
        }
      }

      check {
        name     = "health"
        type     = "http"
        expose   = true
        path     = "/-/healthy"
        interval = "20s"
        timeout  = "8s"
        check_restart {
          limit = 12
          grace = "30s"
        }
      }

      tags = []
    }

    # This task will handle mTLS to the AlertManager API
    # and expose it as plain HTTP on 127.0.0.1 for Traefik (through the service mesh)
    # and for the metrics proxy
    task "untls-proxy" {
      driver = "docker"
      user   = "9093"

      config {
        image           = "nginxinc/nginx-unprivileged:alpine"
        force_pull      = true
        readonly_rootfs = true
        pids_limit      = 30
        volumes = [
          "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
        ]
        mount {
          type   = "tmpfs"
          target = "/tmp"
          tmpfs_options {
            size = 3000000
          }
        }
      }

      vault {
        policies     = ["metrics", "alertmanager"]
        env          = false
        disable_file = true
        change_mode  = "noop"
      }

      lifecycle {
        hook    = "poststart"
        sidecar = true
      }

      template {
        data        = <<_EOT
# UnTLS for the web API
server {
  listen 127.0.0.1:9093;
  location / {
    proxy_pass                    https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
    proxy_ssl_certificate         /secrets/alertmanager.bundle.pem;
    proxy_ssl_certificate_key     /secrets/alertmanager.bundle.pem;
    proxy_ssl_verify              on;
    proxy_ssl_name                alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
    proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
    allow 127.0.0.1;
    deny  all;
  }
}

# Metrics proxy
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2  on;

  ssl_certificate        /secrets/metrics.bundle.pem;
  ssl_certificate_key    /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client      on;
  ssl_protocols          TLSv1.2 TLSv1.3;
  ssl_ciphers            ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache      shared:SSL:10m;
  ssl_session_timeout    1h;
  ssl_session_tickets    off;
  gzip                   on;
  gzip_types             text/plain;
  gzip_vary              on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }

  location /metrics {
    proxy_ssl_certificate         /secrets/alertmanager.bundle.pem;
    proxy_ssl_certificate_key     /secrets/alertmanager.bundle.pem;
    proxy_ssl_verify              on;
    proxy_ssl_name                alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
    proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
    proxy_pass                    https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
  }
}
_EOT
        destination = "local/alertmanager.conf"
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      # Certificate used by AlertManager
      template {
        data          = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
  (printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination   = "secrets/alertmanager.bundle.pem"
        uid           = 109093
        gid           = 100000
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      resources {
        cpu    = 10
        memory = 18
      }
    }

    # The main alertmanager task
    task "alertmanager" {
      driver = "docker"
      leader = true

      config {
        image           = "danielberteaud/alertmanager:0.27.0-2"
        readonly_rootfs = true
        pids_limit      = 200
        command         = "/local/alertmanager"
      }

      vault {
        policies     = ["metrics", "alertmanager"]
        env          = false
        disable_file = true
        change_mode  = "noop"
      }

      # Use a template block instead of env {} so we can fetch values from vault
      template {
        data        = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
        destination = "secrets/.env"
        perms       = "400"
        env         = true
      }
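
      # For example (hypothetical Vault path and key), an SMTP credential could be
      # pulled into the environment the same way:
      #   SMTP_PASSWORD={{ with secret "kv/data/alertmanager" }}{{ .Data.data.smtp_password }}{{ end }}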
|
|
|
|
|
|
template {
|
|
data = <<_EOT
|
|
global:
|
|
smtp_from: alertmanager@consul
|
|
smtp_require_tls: false
|
|
smtp_smarthost: localhost:25
|
|
|
|
_EOT
|
|
destination = "secrets/alertmanager.yml"
|
|
}

      template {
        data = <<_EOT
tls_server_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  client_auth_type: RequireAndVerifyClientCert
  client_ca_file: /local/monitoring.ca.pem

tls_client_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  ca_file: /local/monitoring.ca.pem
_EOT
        destination = "local/cluster_tls.yml"
      }

      template {
        data = <<_EOT
tls_server_config:
  cert_file: /secrets/alertmanager.bundle.pem
  key_file: /secrets/alertmanager.bundle.pem
  client_auth_type: RequireAndVerifyClientCert
  client_ca_file: /local/monitoring.ca.pem
_EOT
        destination = "local/web_tls.yml"
      }
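
      # cluster_tls.yml secures the gossip protocol, in both directions (hence
      # the tls_client_config section), while web_tls.yml only terminates TLS
      # on the web/API endpoint and requires a verified client certificate.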

      template {
        data = <<_EOT
#!/bin/sh

set -euo pipefail

exec alertmanager \
  --config.file=/secrets/alertmanager.yml \
  --storage.path=/data \
  --web.external-url=https://alert.example.org \
  --web.route-prefix=/ \
  --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
  --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
  --cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "alertmanager-gossip" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
  --cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}
{{- end -}}
  --cluster.tls-config=/local/cluster_tls.yml \
  --web.config.file=/local/web_tls.yml
_EOT
        destination = "local/alertmanager"
        uid = 100000
        gid = 100000
        perms = "0755"
      }
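
      # Peer discovery in the launcher above: every replica is expected to
      # register an alertmanager-gossip service carrying its alloc index in
      # the service meta, so the range loop adds each peer except the local
      # allocation as a --cluster.peer flag.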

      # Certificate used by AlertManager
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/alertmanager"
  (printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/alertmanager.bundle.pem"
        uid = 109093
        gid = 109090
        perms = "0440"
        change_mode = "signal"
        change_signal = "SIGHUP"
      }
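
      # With change_mode = "signal", a renewed certificate bundle triggers a
      # SIGHUP instead of a task restart; AlertManager reloads its
      # configuration on SIGHUP, so the new certificate should be picked up
      # without downtime.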

      # The trusted CA
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      volume_mount {
        volume = "data"
        destination = "/data"
      }

      resources {
        cpu = 50
        memory = 64
        memory_max = 80
      }
    }
  }
group "logs-server" {
|
|
|
|
shutdown_delay = "6s"
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
|
|
volume "data" {
|
|
source = "loki-data"
|
|
type = "csi"
|
|
access_mode = "single-node-writer"
|
|
attachment_mode = "file-system"
|
|
}
|
|
|
|
|
|
service {
|
|
name = "loki"
|
|
port = 3100
|
|
meta {
|
|
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
datacenter = "${NOMAD_DC}"
|
|
group = "${NOMAD_GROUP_NAME}"
|
|
job = "${NOMAD_JOB_NAME}"
|
|
namespace = "${NOMAD_NAMESPACE}"
|
|
node = "${node.unique.name}"
|
|
region = "${NOMAD_REGION}"
|
|
}
|
|
|
|
connect {
|
|
sidecar_service {
|
|
}
|
|
sidecar_task {
|
|
config {
|
|
args = [
|
|
"-c",
|
|
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
|
"-l",
|
|
"${meta.connect.log_level}",
|
|
"--concurrency",
|
|
"${meta.connect.proxy_concurrency}",
|
|
"--disable-hot-restart"
|
|
]
|
|
}
|
|
|
|
resources {
|
|
cpu = 50
|
|
memory = 64
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
|
|
check {
|
|
name = "ready"
|
|
type = "http"
|
|
path = "/ready"
|
|
expose = true
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 6
|
|
grace = "5m"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
|
|
|
|
]
|
|
}
|
|
|
|
|
|
    # The prometheus metrics proxy, adding mTLS to the metrics endpoint
    task "metrics-proxy" {
      driver = "docker"
      user = 8995

      config {
        image = "nginxinc/nginx-unprivileged:alpine"
        force_pull = true
        volumes = [
          "local/default.conf:/etc/nginx/conf.d/default.conf:ro"
        ]
        pids_limit = 100
      }

      lifecycle {
        hook = "poststart"
        sidecar = true
      }

      vault {
        policies = ["metrics"]
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      template {
        data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;

  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;
  gzip on;
  gzip_types
    text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }
  location /metrics {
    proxy_pass http://localhost:3100/metrics;
  }
}
_EOT
        destination = "local/default.conf"
      }

      resources {
        cpu = 10
        memory = 10
        memory_max = 20
      }
    }

    task "loki" {
      driver = "docker"

      config {
        image = "danielberteaud/loki:2.9.6-1"
        command = "loki"
        args = ["--config.file=/local/loki.yml"]
      }

      vault {
        policies = ["loki"]
        env = false
        disable_file = true
        change_mode = "noop"
      }

      # Use a template block instead of env {} so we can fetch values from vault
      template {
        data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
        destination = "secrets/.env"
        perms = 400
        env = true
      }

      template {
        data = <<_EOT
analytics:
  reporting_enabled: false
auth_enabled: false
common:
  instance_addr: 127.0.0.1
  path_prefix: /data
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /data/chunks
      rules_directory: /data/rules
compactor:
  compaction_interval: 1h
  deletion_mode: filter-and-delete
  retention_enabled: true
  shared_store: filesystem
  working_directory: /data/compactor
ingester:
  chunk_idle_period: 1h
limits_config:
  ingestion_burst_size_mb: 100
  ingestion_rate_mb: 20
  max_entries_limit_per_query: 20000
  max_query_parallelism: 128
  retention_period: 720h
  split_queries_by_interval: 0
ruler:
  alertmanager_client:
    tls_ca_path: /local/monitoring.ca.pem
    tls_cert_path: /secrets/loki.bundle.pem
    tls_key_path: /secrets/loki.bundle.pem
    tls_server_name: alertmanager.monitoring
  alertmanager_url: alertmanager-tls
  enable_alertmanager_discovery: true
  enable_alertmanager_v2: true
  enable_api: true
  ring:
    kvstore:
      store: inmemory
  rule_path: /tmp/loki-rules
  storage:
    local:
      directory: /local/rules
    type: local
schema_config:
  configs:
    - from: "2020-10-24"
      index:
        period: 24h
        prefix: index_
      object_store: filesystem
      schema: v11
      store: boltdb-shipper
server:
  grpc_listen_address: 127.0.0.1
  grpc_listen_port: 9095
  http_listen_address: 127.0.0.1
  http_listen_port: 3100
storage_config:
  boltdb_shipper:
    active_index_directory: /data/index
    cache_location: /data/boltdb-cache
    shared_store: filesystem
_EOT
        destination = "local/loki.yml"
      }
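
      # Retention in the config above: retention_period is 720h (30 days),
      # enforced by the compactor (retention_enabled), which also honors
      # deletion requests (deletion_mode: filter-and-delete).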

      # A client cert, to connect to the AlertManager API
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/loki"
  (printf "common_name=loki-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
        destination = "secrets/loki.bundle.pem"
        uid = 100000
        gid = 103100
        perms = "0440"
        change_mode = "signal"
        change_signal = "SIGHUP"
      }

      # The monitoring CA chain, to validate the AlertManager cert
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
        uid = 100000
        gid = 100000
        change_mode = "signal"
        change_signal = "SIGHUP"
      }

      volume_mount {
        volume = "data"
        destination = "/data"
      }

      resources {
        cpu = 150
        memory = 1024
      }
    }
  }
  # The aggregator group runs vector with different source connectors (syslog, fluentd, vector, etc.)
  # and a loki sink, so logs can be collected from various sources
  group "logs-aggregator" {

    count = 1
    shutdown_delay = "6s"

    network {
      mode = "bridge"
      port "metrics" {}
    }

    # The main service is the vector source
    # It will provide access to other services through the mesh (like loki)
    service {
      name = "vector-aggregator"
      port = 9000

      meta {
        metrics-port = "${NOMAD_HOST_PORT_metrics}"
        alloc = "${NOMAD_ALLOC_INDEX}"
        datacenter = "${NOMAD_DC}"
        group = "${NOMAD_GROUP_NAME}"
        job = "${NOMAD_JOB_NAME}"
        namespace = "${NOMAD_NAMESPACE}"
        node = "${node.unique.name}"
        region = "${NOMAD_REGION}"
      }

      connect {
        sidecar_service {
          proxy {
            upstreams {
              destination_name = "loki"
              local_bind_port = 3100
              # Workaround, see https://github.com/hashicorp/nomad/issues/18538
              destination_type = "service"
            }
          }
        }
        sidecar_task {
          config {
            args = [
              "-c",
              "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
              "-l",
              "${meta.connect.log_level}",
              "--concurrency",
              "${meta.connect.proxy_concurrency}",
              "--disable-hot-restart"
            ]
          }

          resources {
            cpu = 50
            memory = 64
          }
        }
      }

      tags = []
    }
task "vector" {
|
|
driver = "docker"
|
|
leader = true
|
|
|
|
config {
|
|
image = "danielberteaud/vector:0.37.0-1"
|
|
readonly_rootfs = true
|
|
pids_limit = 200
|
|
args = ["--config=/local/vector.yml"]
|
|
}
|
|
|
|
|
|
vault {
|
|
policies = ["metrics"]
|
|
env = false
|
|
disable_file = true
|
|
change_mode = "noop"
|
|
}
|
|
|
|
|
|
|
|
# Use a template block instead of env {} so we can fetch values from vault
|
|
template {
|
|
data = <<_EOT
|
|
LANG=fr_FR.utf8
|
|
TZ=Europe/Paris
|
|
_EOT
|
|
destination = "secrets/.env"
|
|
perms = 400
|
|
env = true
|
|
}
|
|
|
|
# Get a certificate from vault to protect the metrics endpoint
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end }}
|
|
_EOT
|
|
destination = "secrets/metrics.bundle.pem"
|
|
}
|
|
|
|
# Get the root CA
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
}
|
|
|
|
|
|
template {
|
|
data = <<_EOT
|
|
data_dir: /local
|
|
expire_metrics_secs: 600
|
|
|
|
sources:
|
|
|
|
logs_vector:
|
|
type: vector
|
|
address: 127.0.0.1:9000
|
|
|
|
vector_metrics:
|
|
type: internal_metrics
|
|
|
|
transforms:
|
|
split-by-app:
|
|
type: route
|
|
inputs: [ "logs_*" ]
|
|
route:
|
|
traefik: '.service == "traefik"'
|
|
postgres: '.service == "postgres"'
|
|
syslog: '.source_type == "syslog"'
|
|
|
|
parse-traefik:
|
|
type: remap
|
|
inputs: ["split-by-app.traefik"]
|
|
source: |
|
|
.http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
|
|
.loki_labels.http_method = .http.verb
|
|
.loki_labels.http_status = .http.response
|
|
.loki_labels.user = .http.auth
|
|
|
|
parse-postgres:
|
|
type: remap
|
|
inputs: ["split-by-app.postgres"]
|
|
source: |
|
|
if includes(array!(.nomad.tags), "master"){
|
|
.loki_labels.pg_role = "master"
|
|
} else if includes(array!(.nomad.tags), "replica"){
|
|
.loki_labels.pg_role = "replica"
|
|
}
|
|
|
|
parse-syslog:
|
|
type: remap
|
|
inputs: ["split-by-app.syslog"]
|
|
source: |
|
|
# PfSense sends /usr/sbin/cron as the appname, instead of cron
|
|
if string!(.appname) == "/usr/sbin/cron" {
|
|
.appname = "cron"
|
|
}
|
|
.service = .appname
|
|
|
|
sinks:
|
|
|
|
loki:
|
|
type: loki
|
|
inputs: [ "split-by-app._unmatched", "parse-*" ]
|
|
endpoint: http://127.0.0.1:3100
|
|
encoding:
|
|
codec: text
|
|
labels:
|
|
job: "{{ .service }}"
|
|
host: "{{ .host }}"
|
|
_*: "{{ .loki_labels }}"
|
|
buffer:
|
|
type: disk
|
|
max_size: 268435488
|
|
remove_label_fields: true
|
|
|
|
# Expose vector internal metrics
|
|
prometheus:
|
|
type: prometheus_exporter
|
|
inputs: ["vector_metrics"]
|
|
address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
|
|
tls:
|
|
enabled: true
|
|
crt_file: /secrets/metrics.bundle.pem
|
|
key_file: /secrets/metrics.bundle.pem
|
|
ca_file: /local/monitoring.ca.pem
|
|
verify_certificate: true
|
|
|
|
_EOT
|
|
destination = "local/vector.yml"
|
|
left_delimiter = "{{{"
|
|
right_delimiter = "}}}"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
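
      # The template above switches consul-template to {{{ / }}} delimiters so
      # Vector's own {{ .service }} style templating in the loki sink labels
      # is passed through untouched. Likewise, $${NOMAD_ALLOC_PORT_metrics}
      # escapes Nomad's HCL interpolation, leaving ${NOMAD_ALLOC_PORT_metrics}
      # for Vector to resolve from its environment at runtime.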

      resources {
        cpu = 100
        memory = 192
      }
    }
  }
group "grafana" {
|
|
|
|
shutdown_delay = "6s"
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
|
|
volume "data" {
|
|
source = "grafana-data"
|
|
type = "csi"
|
|
access_mode = "single-node-writer"
|
|
attachment_mode = "file-system"
|
|
}
|
|
|
|
|
|
service {
|
|
name = "grafana"
|
|
port = 3000
|
|
|
|
|
|
meta {
|
|
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
}
|
|
|
|
connect {
|
|
sidecar_service {
|
|
proxy {
|
|
upstreams {
|
|
destination_name = "postgres"
|
|
local_bind_port = 5432
|
|
# Work arround, see https://github.com/hashicorp/nomad/issues/18538
|
|
destination_type = "service"
|
|
}
|
|
upstreams {
|
|
destination_name = "loki"
|
|
local_bind_port = 3100
|
|
# Work arround, see https://github.com/hashicorp/nomad/issues/18538
|
|
destination_type = "service"
|
|
}
|
|
upstreams {
|
|
destination_name = "prometheus"
|
|
local_bind_port = 9090
|
|
# Work arround, see https://github.com/hashicorp/nomad/issues/18538
|
|
destination_type = "service"
|
|
}
|
|
}
|
|
}
|
|
sidecar_task {
|
|
config {
|
|
args = [
|
|
"-c",
|
|
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
|
"-l",
|
|
"${meta.connect.log_level}",
|
|
"--concurrency",
|
|
"${meta.connect.proxy_concurrency}",
|
|
"--disable-hot-restart"
|
|
]
|
|
}
|
|
|
|
resources {
|
|
cpu = 50
|
|
memory = 64
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
path = "/api/health"
|
|
expose = true
|
|
interval = "30s"
|
|
timeout = "8s"
|
|
}
|
|
|
|
tags = [
|
|
|
|
"traefik.enable=true",
|
|
"traefik.http.routers.monitoring-grafana.entrypoints=https",
|
|
"traefik.http.routers.monitoring-grafana.rule=Host(`grafana.example.org`)",
|
|
"traefik.http.middlewares.csp-monitoring-grafana.headers.contentsecuritypolicy=connect-src 'self' https://grafana.com;default-src 'self';font-src 'self' data:;img-src 'self' data: blob: https://grafana.com;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
|
|
"traefik.http.routers.monitoring-grafana.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-grafana",
|
|
|
|
]
|
|
}
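
    # Routing is handled by Traefik through the tags above: the
    # csp-monitoring-grafana middleware is declared inline (the
    # contentsecuritypolicy header) and then appended to the router's
    # middleware chain after the middlewares coming from the file provider
    # (the @file suffix).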

    # The prometheus metrics proxy, adding mTLS to the metrics endpoint
    task "metrics-proxy" {
      driver = "docker"
      user = 8995

      config {
        image = "nginxinc/nginx-unprivileged:alpine"
        force_pull = true
        volumes = [
          "local/default.conf:/etc/nginx/conf.d/default.conf:ro"
        ]
        pids_limit = 100
      }

      lifecycle {
        hook = "poststart"
        sidecar = true
      }

      vault {
        policies = ["metrics"]
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      template {
        data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;

  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;
  gzip on;
  gzip_types
    text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }
  location /metrics {
    proxy_pass http://127.0.0.1:3000/metrics;
  }
}
_EOT
        destination = "local/default.conf"
      }

      resources {
        cpu = 10
        memory = 10
        memory_max = 20
      }
    }

    # Local memcached instance
    task "memcached" {
      driver = "docker"
      user = 11211

      lifecycle {
        hook = "prestart"
        sidecar = true
      }

      config {
        image = "memcached:alpine"
        readonly_rootfs = true
        force_pull = true
        entrypoint = ["/local/memcached"]
      }

      template {
        data = <<_EOT
#!/bin/sh

set -eu
exec memcached -l 127.0.0.1 -p 11211 -m {{ env "NOMAD_MEMORY_LIMIT" | parseInt | subtract 5 }}
_EOT
        destination = "local/memcached"
        perms = 755
      }
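
      # Memory sizing in the launcher above: memcached is given the task
      # memory limit minus 5 (NOMAD_MEMORY_LIMIT is expressed in MB, as is
      # the -m flag), keeping a little headroom below the cgroup limit.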

      resources {
        cpu = 10
        memory = 20
      }
    }
task "grafana" {
|
|
|
|
driver = "docker"
|
|
leader = true
|
|
|
|
config {
|
|
image = "danielberteaud/grafana:10.4.1-1"
|
|
readonly_rootfs = true
|
|
pids_limit = 100
|
|
command = "grafana"
|
|
args = [
|
|
"server",
|
|
"--homepath=/opt/grafana",
|
|
"--config=/secrets/grafana.ini",
|
|
"--packaging=docker"
|
|
]
|
|
}
|
|
|
|
|
|
vault {
|
|
policies = ["grafana"]
|
|
env = false
|
|
disable_file = true
|
|
change_mode = "noop"
|
|
}
|
|
|
|
|
|
|
|
# Use a template block instead of env {} so we can fetch values from vault
|
|
template {
|
|
data = <<_EOT
|
|
LANG=fr_FR.utf8
|
|
TZ=Europe/Paris
|
|
_EOT
|
|
destination = "secrets/.env"
|
|
perms = 400
|
|
env = true
|
|
}
|
|
|
|
|
|
template {
|
|
data = <<_EOT
|
|
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
|
|
_EOT
|
|
destination = "secrets/.grafana.env"
|
|
perms = 400
|
|
env = true
|
|
}
|
|
|
|
# Basic grafana configuration file
|
|
template {
|
|
data = <<_EOT
|
|
[server]
|
|
http_addr = 127.0.0.1
|
|
http_port = 3000
|
|
root_url = https://grafana.example.org
|
|
|
|
[database]
|
|
type = postgres
|
|
name = grafana
|
|
host = 127.0.0.1:5432
|
|
user = {{ with secret "database/creds/grafana" }}{{ .Data.username }}{{ end }}
|
|
password = {{ with secret "database/creds/grafana" }}{{ .Data.password }}{{ end }}
|
|
|
|
|
|
[remote_cache]
|
|
type = memcached
|
|
connstr = 127.0.0.1:11211
|
|
|
|
[analytics]
|
|
reporting_enabled = false
|
|
check_for_updates = false
|
|
check_for_plugin_updates = false
|
|
|
|
[security]
|
|
cookie_secure = true
|
|
cookie_samesite = strict
|
|
x_xss_protection = true
|
|
secret_key = {{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.secret_key }}{{ end }}
|
|
|
|
[dataproxy]
|
|
timeout = 120
|
|
|
|
[feature_toggles]
|
|
|
|
_EOT
|
|
destination = "secrets/grafana.ini"
|
|
uid = 103000
|
|
perms = 400
|
|
}
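
      # The database credentials above come from Vault's database secrets
      # engine (database/creds/grafana), so they are dynamic. Both lookups
      # target the same path and should resolve to a single lease, and since
      # this template keeps the default change_mode, Grafana is restarted
      # whenever the credentials are rotated.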

      # Mount volume in /data for persistence
      volume_mount {
        volume = "data"
        destination = "/data"
      }

      resources {
        cpu = 100
        memory = 256
      }
    }
  }
}