Various cleanup

Daniel Berteaud 2024-03-25 22:23:31 +01:00
parent f954afc251
commit 2ae2a91002
18 changed files with 1281 additions and 196 deletions

View File

@ -1,7 +1,9 @@
job "[[ .instance ]]-agent" {
[[- $c := merge .monitoring.agent .monitoring . ]]
[[ template "common/job_start" $c ]]
type = "system"
# This group will collect logs from the allocations running on the node
@ -39,16 +41,16 @@ job "[[ .instance ]]-agent" {
user = 3987
config {
image = "[[ $n.image ]]"
image = "[[ $n.image ]]"
readonly_rootfs = true
pids_limit = 50
pids_limit = 50
# Nomad Vector Logger needs to run on the host's network namespace
# so it can reach the Nomad Agent API on localhost:4646
network_mode = "host"
# Host network namespace requires disabling user namespace
userns_mode = "host"
command = "nomad-vector-logger"
args = [
command = "nomad-vector-logger"
args = [
"--config",
"/local/nomad-vector-logger.toml"
]
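Note: as a quick sanity check for the host-network requirement above, a minimal sketch (port and endpoint are the Nomad defaults, not taken from this repo) to confirm the agent API answers on localhost:4646 from the host network namespace:

  # /v1/agent/self is part of the standard Nomad HTTP API; this only works from
  # the host network namespace (or wherever the agent actually listens).
  curl -s http://localhost:4646/v1/agent/self | head -c 300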
@ -85,9 +87,9 @@ _EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
# Disable the default nomad.toml template, as we provide our own nomad.yml template
template {
data = "# Disable the default toml template"
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
}
@ -143,11 +145,13 @@ _EOT
driver = "[[ $c.nomad.driver ]]"
config {
image = "busybox:latest"
command = "sh"
args = [
image = "busybox:latest"
readonly_rootfs = true
pids_limit = 20
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
]
}
@ -170,9 +174,11 @@ _EOT
leader = true
config {
image = "[[ $c.image ]]"
userns_mode = "host"
args = [
image = "[[ $c.image ]]"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 200
args = [
"--watch-config",
"--config", "/local/vector.yml",
"--config-dir", "/alloc/data/vector_conf"
@ -186,7 +192,9 @@ _EOT
}
[[ template "common/metrics_cert" $c ]]
[[ template "common/artifacts" $c ]]
# Main vector configuration
template {
data =<<_EOT
[[ template "monitoring/agent/vector.yml" $c ]]
@ -217,6 +225,8 @@ _EOT
}
}
[[- if .monitoring.agent.node_exporter.enabled ]]
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
group "node-exporter" {
@ -238,21 +248,25 @@ _EOT
driver = "[[ $c.nomad.driver ]]"
config {
image = "[[ $c.image ]]"
pid_mode = "host"
#network_mode = "host"
userns_mode = "host"
image = "[[ $c.image ]]"
pid_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
args = [
pids_limit = 50
command = "/usr/local/bin/node_exporter"
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
[[- range $arg := $c.args ]]
"[[ $arg ]]",
[[- end ]]
]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/metrics_cert" $c ]]
[[ template "common/artifacts" $c ]]
template {
data = <<_EOT
@ -271,4 +285,5 @@ _EOT
[[ template "common/resources" $c ]]
}
}
[[- end ]]
}

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "vector-aggregator[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,16 @@
[[- $c := merge .monitoring.aggregator .monitoring . -]]
Kind = "service-intentions"
Name = "vector-aggregator[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ $c.traefik.instance ]]"
Permissions = [
{
Action = "[[ $c.traefik.enabled | ternary "allow" "deny" ]]"
HTTP {
Methods = ["POST"]
}
}
]
}
]
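Note: a hedged sketch of how these config entries could be applied and verified with the Consul CLI (file names are hypothetical, and "[[ .consul.suffix ]]" is assumed to be rendered away beforehand):

  # Apply the service-defaults and service-intentions entries
  consul config write service-defaults.hcl
  consul config write service-intentions.hcl
  # Read back the intentions to confirm the allow/deny action
  consul config read -kind service-intentions -name vector-aggregator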

View File

Binary file not shown.

View File

@ -1,8 +1,11 @@
job "monitoring-agent" {
datacenters = ["dc1"]
region = "global"
node_pool = "all"
priority = 60
type = "system"
@ -161,7 +164,7 @@ _EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
# Disable the default nomad.toml template, as we provide our own nomad.yml template
template {
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
@ -184,6 +187,8 @@ sources:
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
ignore_older_secs: 1800
oldest_first: true
{{- end }}
@ -262,11 +267,13 @@ _EOT
driver = "docker"
config {
image = "busybox:latest"
command = "sh"
image = "busybox:latest"
readonly_rootfs = true
pids_limit = 20
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
]
}
@ -289,8 +296,10 @@ _EOT
leader = true
config {
image = "danielberteaud/vector:0.36.1-1"
userns_mode = "host"
image = "danielberteaud/vector:0.36.1-1"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 200
args = [
"--watch-config",
"--config", "/local/vector.yml",
@ -331,6 +340,9 @@ _EOT
}
# Main vector configuration
template {
data = <<_EOT
data_dir: /data
@ -398,8 +410,8 @@ _EOT
resources {
cpu = 100
memory = 192
memory_max = 384
memory = 384
memory_max = 512
}
}
@ -436,16 +448,17 @@ _EOT
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:latest"
pid_mode = "host"
#network_mode = "host"
image = "danielberteaud/node-exporter:1.7.0-1"
pid_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
command = "/usr/local/bin/node_exporter"
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)",
]
}
@ -477,6 +490,8 @@ _EOT
}
template {
data = <<_EOT
tls_server_config:

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "vector-aggregator"
Protocol = "http"

View File

@ -0,0 +1,15 @@
Kind = "service-intentions"
Name = "vector-aggregator"
Sources = [
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
Methods = ["POST"]
}
}
]
}
]

View File

@ -411,7 +411,7 @@ _EOT
resources {
cpu = 10
memory = 15
memory = 20
}
}

View File

@ -0,0 +1,24 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG EXPORTER_VERSION=1.7.0
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -euxo pipefail &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
CMD ["/usr/local/bin/node_exporter"]
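Note: a minimal smoke test of the resulting image, assuming a local build tag of node-exporter:1.7.0-1 (node_exporter listens on :9100 by default):

  docker build -t node-exporter:1.7.0-1 .
  docker run -d --rm --name ne-test -p 9100:9100 node-exporter:1.7.0-1
  sleep 2
  # Without --path.rootfs the exporter reports the container's own view, which is
  # enough to confirm the binary runs and serves metrics.
  curl -s http://localhost:9100/metrics | head
  docker stop ne-test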

View File

@ -5,7 +5,7 @@ job "monitoring-services" {
region = "global"
# Metrics is running prometheus and various exporters
# Metrics is running prometheus
group "metrics-server" {
shutdown_delay = "6s"
@ -67,7 +67,7 @@ job "monitoring-services" {
type = "http"
expose = true
path = "/-/healthy"
interval = "15s"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
@ -77,11 +77,6 @@ job "monitoring-services" {
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-prometheus.entrypoints=https",
"traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)",
"traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus",
]
}
@ -892,6 +887,410 @@ _EOT
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/loki.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostSwapIsFillingUp
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host swap is filling up (instance {{ $labels.instance }})
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/node.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
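Note: once these templates are rendered (the {{{ / }}} delimiters keep the {{ $labels }} / {{ $value }} placeholders intact for Prometheus), the rule files can be validated with promtool, assuming it is available in the task or on a workstation:

  promtool check rules local/rules/loki.yml local/rules/node.yml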
# A client cert, to connect to the AlertManager API
template {
@ -945,8 +1344,11 @@ _EOT
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
@ -1031,101 +1433,10 @@ _EOT
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-alertmanager.entrypoints=https",
"traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)",
"traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9093/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
@ -1166,10 +1477,11 @@ _EOT
template {
data = <<_EOT
# UnTLS for the web API
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
@ -1180,10 +1492,66 @@ server {
}
}
# Metrics proxy
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
}
}
_EOT
destination = "local/alertmanager.conf"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Certificate used by AlertManager
template {
data = <<_EOT
@ -1203,14 +1571,6 @@ _EOT
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
resources {
cpu = 10
memory = 18
@ -1300,7 +1660,7 @@ set -euo pipefail
exec alertmanager \
--config.file=/secrets/alertmanager.yml \
--storage.path=/data \
--web.external-url=https://alerte.example.org \
--web.external-url=https://alert.example.org \
--web.route-prefix=/ \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
@ -1430,11 +1790,6 @@ _EOT
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-loki.entrypoints=https",
"traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)",
"traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki",
]
}
@ -2048,7 +2403,7 @@ server {
return 405;
}
location /metrics {
proxy_pass http://localhost:3000/metrics;
proxy_pass http://127.0.0.1:3000/metrics;
}
}
_EOT
@ -2132,7 +2487,6 @@ _EOT
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
@ -2142,6 +2496,15 @@ _EOT
}
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
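Note: to confirm the secret this template reads actually exists, a hedged one-liner with the Vault CLI (assuming a KV v2 engine mounted at kv/ and a token with read access):

  vault kv get -field=initial_admin_pwd kv/service/monitoring/grafana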
# Basic grafana configuration file
template {
data = <<_EOT

View File

@ -0,0 +1,24 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG EXPORTER_VERSION=[[ .monitoring.agent.node_exporter.version ]]
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -euxo pipefail &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
CMD ["/usr/local/bin/node_exporter"]

View File

@ -2,7 +2,7 @@ job "[[ .instance ]]-services" {
[[ template "common/job_start" . ]]
# Metrics is running prometheus and various exporters
# Metrics is running prometheus
group "metrics-server" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]
@ -28,7 +28,7 @@ job "[[ .instance ]]-services" {
type = "http"
expose = true
path = "/-/healthy"
interval = "15s"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
@ -168,8 +168,11 @@ _EOT
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
@ -220,8 +223,6 @@ _EOT
]
}
[[ template "common/task.metrics_proxy" $c ]]
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
@ -253,6 +254,8 @@ _EOT
destination = "local/alertmanager.conf"
}
[[ template "common/metrics_cert" $c ]]
# Certificate used by AlertManager
template {
data = <<_EOT
@ -272,14 +275,6 @@ _EOT
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
resources {
cpu = 10
memory = 18
@ -617,6 +612,15 @@ _EOT
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
# Basic grafana configuration file
template {
data = <<_EOT

View File

@ -12,6 +12,8 @@ sources:
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
ignore_older_secs: 1800
oldest_first: true
{{- end }}
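Note: since this snippet is merged into the generated vector configuration, a sketch of how the final config could be checked before deployment (the paths are the ones used in the job above and may differ locally):

  vector validate /local/vector.yml /alloc/data/vector_conf/nomad.yml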

View File

@ -1,7 +1,8 @@
# UnTLS for the web API
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
@ -11,3 +12,39 @@ server {
deny all;
}
}
# Metrics proxy
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.[[ .instance ]].[[ .consul.domain ]];
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
}
}
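Note: because ssl_verify_client is on, the metrics endpoint only answers to clients presenting a certificate signed by the monitoring CA. A hedged curl sketch (certificate paths, host and port are placeholders):

  curl --cacert monitoring.ca.pem \
       --cert metrics-client.pem --key metrics-client.key \
       https://HOST:METRICS_PORT/metrics | head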

View File

@ -0,0 +1,41 @@
groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,347 @@
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostSwapIsFillingUp
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host swap is filling up (instance {{ $labels.instance }})
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -76,179 +76,284 @@ monitoring:
# - https://portal.acme.com
http_probes: []
# Consul exporter will expose consul metrics
    # Consul exporter will expose consul metrics (mainly the status of registered services)
consul:
# Version of the exporter
version: 0.11.0
# Docker image to use
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 32
vault:
# Vault policies to attach
policies:
- 'consul-exporter[[ .consul.suffix ]]'
# The cluster exporter is a simple nginx used as a proxy
# which handles TLS for the cluster services (vault, consul and nomad)
cluster:
# Docker image to use
image: nginxinc/nginx-unprivileged:alpine
# Custom env
env: {}
# Resource allocation
resources:
cpu: 10
memory: 15
memory: 20
vault:
# Vault policies to attach to the task
policies:
- 'cluster-exporter[[ .consul.suffix ]]'
- metrics
- metrics[[ .consul.suffix ]]
# The prometheus server
prometheus:
version: 2.51.0
    # Number of instances to run. Note that if you run several instances, they will be independent, and all of
    # them will scrape the same data. Queries to the prometheus API will then be load balanced between all instances.
    # This should work most of the time, but can give some odd results if, eg, one of the instances was down (queries
    # for data covering the downtime can return different results depending on which instance your query is routed to)
count: 1
# Version of prometheus
version: 2.51.0
# Docker image to use
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
# Custom env var to set
env: {}
# Resource allocation
resources:
cpu: 200
memory: 512
# Volumes used for data persistence
# You must create a prometheus-data[0] volume as it's a per_alloc volume
volumes:
data:
type: csi
source: 'prometheus-data'
per_alloc: true
vault:
# Vault policies to attach to the task
policies:
- 'prometheus[[ .consul.suffix ]]'
# A dict of custom jobs. Eg
# jobs:
# squid:
# targets:
# - 10.11.2.3:9305
# - 192.168.6.20:782
jobs: {}
alert_rules: {}
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
# alert_rules:
# postgres:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
# patroni:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
    # If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
# are in /local/rules/ inside the container
alert_rules: {}
# The public URL where prometheus will be reachable (if exposed with Traefik)
public_url: https://prometheus.example.org
# Traefik settings
traefik:
enabled: true
# Turn this on to expose prometheus with Traefik
# Caution : there's no builtin security, you should configure the appropriate middlewares
enabled: false
router: prometheus
# Metrics retention duration
retention: 30d
# always enable prometheus metrics (of course :-) )
prometheus:
enabled: true
      # This is the URL where metrics are exposed, and which the metrics proxy will point at (from the container PoV)
metrics_url: http://localhost:9090/metrics
# AlertManager can process and send alerts
alertmanager:
    # Number of instances to run. Set > 1 if you want HA
count: 1
# Version of alertmanager
version: 0.27.0
    # Docker image to use
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 64
memory_max: 80
public_url: https://alerte.example.org
# URL where the web interface is reachable (if exposed with Traefik)
public_url: https://alert.example.org
# Traefik settings
traefik:
enabled: true
# Turn this on to expose alertmanager with traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
enabled: false
router: alertmanager
# No need to strip prefix as alertmanager will be configured to handle it
strip_prefix: false
# Volumes used for data persistence. Note : it's a per_alloc volume
    # so you need to create eg alertmanager-data[0]. This volume should be writable by the user with ID 9093
volumes:
data:
source: 'alertmanager-data'
type: csi
per_alloc: true
prometheus:
metrics_url: http://127.0.0.1:9093/metrics
vault:
# List of vault policies to attach to the task
policies:
- metrics
- 'alertmanager[[ .consul.suffix ]]'
- metrics[[ .consul.suffix ]]
- alertmanager[[ .consul.suffix ]]
# Email settings
email:
from: alertmanager@[[ .consul.domain ]]
# You can merge your own custom config with the default provided one. Eg
# custom_config:
# receivers:
# - name: dani
# email_configs:
# - to: dani@example.org
# route:
# group_by: ['alertname', 'cluster', 'job']
# receiver: dani
custom_config: {}
# Loki is the log server
loki:
# Version of loki
version: 2.9.6
# Docker image to use
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 150
memory: 512
vault:
      # Vault policies to attach to the task
policies:
- 'loki[[ .consul.suffix ]]'
# URL where loki is exposed (if enabled)
public_url: https://loki.example.org
# Traefik settings
traefik:
# Turn it on to expose Loki with Traefik
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
enabled: false
router: loki
    # Retention for logs. Older logs will be deleted
retention: 720h # 1 month
# Custom configuration which will be merged on top of the default one
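    # Eg, a minimal sketch (assuming you want to raise the per-tenant ingestion limits; the keys
    # below follow Loki's own configuration format and are only an illustration):
    # custom_config:
    #   limits_config:
    #     ingestion_rate_mb: 8
    #     ingestion_burst_size_mb: 16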
custom_config: {}
prometheus:
# URL where metrics are available for the metrics proxy (from inside the container PoV)
metrics_url: http://localhost:3100/metrics
    # Volumes for data persistence. Should be writable by user ID 3100
volumes:
data:
type: csi
source: 'loki-data'
# Common vector settings
vector:
# Version of vector
version: 0.36.1
# Docker image to use
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
    # Vector aggregator can be used to ingest logs from external devices (using syslog or fluentd)
# Logs will then be forwarded to loki
aggregator:
# Number of instances
count: 1
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 192
consul:
connect:
upstreams:
# Connect to loki through the service mesh
- destination_name: 'loki[[ .consul.suffix ]]'
local_bind_port: 3100
vault:
# Vault policies to attach to the task.
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
# but we need to grant the metrics policy to the vector task instead
policies:
- metrics[[ .consul.suffix ]]
# Fluentd source settings
fluentd:
enabled: false
traefik:
router: fluentd
entrypoints:
- fluentd
# Syslog source settings
syslog_udp:
enabled: false
traefik:
router: syslog-udp
entrypoints:
- syslog
- syslog-udp
# Syslog (tcp) source settings
syslog_tcp:
enabled: false
traefik:
router: syslog-tcp
entrypoints:
- syslog-tcp
# Native vector (http) source settings
vector:
enabled: true
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
public_url: https://vector.example.org
traefik:
# Set to true if you want to expose the service with Traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
enabled: false
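      # As an illustration, an external Vector agent could then ship its logs to this aggregator
      # with a sink like the following in its own vector.yml (a sketch only: the sink name, inputs
      # and TLS settings are assumptions and depend on how the endpoint is actually exposed):
      # sinks:
      #   to_aggregator:
      #     type: vector
      #     inputs: ['my_logs']
      #     address: 'vector.example.org:443'
      #     tls:
      #       enabled: true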
# Grafana settings
grafana:
# Grafana version
version: 10.4.1
# Docker image to use
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
env:
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 256
# URL where Grafana is reachable
public_url: https://grafana.example.org
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
    # the image if you want to update them
plugins:
#- alexanderzobnin-zabbix-app
#- ddurieux-glpi-app
- grafana-clock-panel
- grafana-piechart-panel
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
# Example:
# feature_toggles:
# featureToggleAdminPage: true
# ssoSettingsApi: true
feature_toggles: {}
# Traefik settings
traefik:
enabled: true
router: grafana
# No need to strip prefix as Grafana will be configured to handle it correctly
strip_prefix: false
consul:
connect:
# Connect to postgres, loki and prometheus with the service mesh
upstreams:
- destination_name: postgres[[ .consul.suffix ]]
local_bind_port: 5432
@ -256,16 +361,20 @@ monitoring:
local_bind_port: 3100
- destination_name: prometheus[[ .consul.suffix ]]
local_bind_port: 9090
# Volumes for data persistence
volumes:
data:
type: csi
source: 'grafana-data'
vault:
# Vault policies to attach to the task
policies:
- 'grafana[[ .consul.suffix ]]'
# Postgres DB settings
database:
role: grafana
pgrole: grafana
# Override some default postgres handling
postgres:
database: grafana
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
@ -273,64 +382,131 @@ monitoring:
pooler:
mode: session
prometheus:
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
metrics_url: http://127.0.0.1:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
  # Agent runs as a system job, on all the nodes
agent:
consul:
meta:
        # Override the alloc service meta: the node hostname is more useful than the alloc index (always 0 for a system job)
alloc: '${node.unique.name}'
# Nomad settings
nomad:
# Run on all node pools
node_pool: all
# Run with an above average priority
priority: 60
    # Nomad vector logger is a small container which will query the Nomad API to discover running allocations on the current node,
    # then generate a vector configuration with log collection for all the discovered allocations.
nomad_vector_logger:
version: 24.3
image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2'
# Docker image to use
image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 24
memory_max: 50
vault:
# Vault policies to attach to the task
policies:
- nomad-vector-logger[[ .consul.suffix ]]
    # Vector is the main task. It'll read its config created by nomad-vector-logger and will read log files
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
vector:
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 192
memory_max: 384
memory: 384
memory_max: 512
vault:
        # Vault policies to attach to the container. As Vector can natively use mTLS on its metrics endpoint,
# there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
# a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
consul:
connect:
upstreams:
# Connect to loki with the service mesh
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
# Volumes for data persistence
volumes:
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
        # to read the logs. You should create a host volume in the Nomad client config of all your nodes. Eg
# client {
# enabled = true
# host_volume "nomad_alloc" {
# path = "/opt/nomad/data/alloc"
# read_only = "true"
# }
# }
nomad:
type: host
source: nomad_alloc
read_only: true
# The data volume will be used by vector for buffering (in case loki is unavailable)
# You can create a host volume in Nomad's client config, eg
# client {
# enabled = true
        #   host_volume "vector_data" {
# path = "/data/vector-agent"
# }
# }
data:
type: host
source: vector_data
# The node exporter can be used to expose the host metrics to prometheus
node_exporter:
image: quay.io/prometheus/node-exporter:latest
      # Is the node exporter enabled? (set to false if you don't want it, or if you
      # already manage the node-exporter separately)
enabled: true
# Version of the exporter
version: 1.7.0
# Docker image to use
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 24
memory_max: 32
vault:
        # Vault policies to attach to the task
        # This exporter can handle mTLS itself, so there's no need to create a metrics_proxy task. Instead, grant the metrics policy
        # so it can get a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
# Args to add to the exporter on start
args:
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
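      # More args can be appended here if needed. For example (hypothetical, assuming textfile
      # metrics are dropped on the host under /var/lib/node_exporter, visible as /host/... through
      # the host_root volume below), the textfile collector directory could be set with:
      # - '--collector.textfile.directory=/host/var/lib/node_exporter'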
# Volumes
volumes:
# The exporter should access the host root filesystem
# For this, you should create a host volume in Nomad's client config, eg
# client {
# enabled = true
# host_volume "host_root" {
# path = "/"
# read_only = true
# }
# }
host:
type: host
source: host_root
read_only: true
# Globally enable prometheus for this bundle :-)
prometheus:
enabled: true