Various cleanup
This commit is contained in:
parent
f954afc251
commit
2ae2a91002
|
@ -1,7 +1,9 @@
|
|||
job "[[ .instance ]]-agent" {
|
||||
|
||||
[[- $c := merge .monitoring.agent .monitoring . ]]
|
||||
|
||||
[[ template "common/job_start" $c ]]
|
||||
|
||||
type = "system"
|
||||
|
||||
# This group will collect logs from the allocation running on the node
|
||||
|
@ -39,16 +41,16 @@ job "[[ .instance ]]-agent" {
|
|||
user = 3987
|
||||
|
||||
config {
|
||||
image = "[[ $n.image ]]"
|
||||
image = "[[ $n.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
pids_limit = 50
|
||||
# Nomad Vector Logger needs to run on the host's network namespace
|
||||
# so it can reach the Nomad Agent API on localhost:4646
|
||||
network_mode = "host"
|
||||
# Host network namespace requires disabling user namespace
|
||||
userns_mode = "host"
|
||||
command = "nomad-vector-logger"
|
||||
args = [
|
||||
command = "nomad-vector-logger"
|
||||
args = [
|
||||
"--config",
|
||||
"/local/nomad-vector-logger.toml"
|
||||
]
|
||||
|
@ -85,9 +87,9 @@ _EOT
|
|||
destination = "local/nomad-vector-logger.toml"
|
||||
}
|
||||
|
||||
# Disable the default nomad.toml template
|
||||
# Disable the default nomad.toml template, as we provide our own nomad.yml template
|
||||
template {
|
||||
data = "# Disable the default toml template"
|
||||
data = "# Disable the default toml template"
|
||||
destination = "local/template/nomad.toml"
|
||||
}
|
||||
|
||||
|
@ -143,11 +145,13 @@ _EOT
|
|||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "busybox:latest"
|
||||
command = "sh"
|
||||
args = [
|
||||
image = "busybox:latest"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 20
|
||||
command = "sh"
|
||||
args = [
|
||||
"-c",
|
||||
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
|
||||
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -170,9 +174,11 @@ _EOT
|
|||
leader = true
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
userns_mode = "host"
|
||||
args = [
|
||||
image = "[[ $c.image ]]"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
args = [
|
||||
"--watch-config",
|
||||
"--config", "/local/vector.yml",
|
||||
"--config-dir", "/alloc/data/vector_conf"
|
||||
|
@ -186,7 +192,9 @@ _EOT
|
|||
}
|
||||
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
[[ template "common/artifacts" $c ]]
|
||||
|
||||
# Main vector configuration
|
||||
template {
|
||||
data =<<_EOT
|
||||
[[ template "monitoring/agent/vector.yml" $c ]]
|
||||
|
@ -217,6 +225,8 @@ _EOT
|
|||
}
|
||||
}
|
||||
|
||||
[[- if .monitoring.agent.node_exporter.enabled ]]
|
||||
|
||||
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
|
||||
group "node-exporter" {
|
||||
|
||||
|
@ -238,21 +248,25 @@ _EOT
|
|||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
pid_mode = "host"
|
||||
#network_mode = "host"
|
||||
userns_mode = "host"
|
||||
image = "[[ $c.image ]]"
|
||||
pid_mode = "host"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
args = [
|
||||
pids_limit = 50
|
||||
command = "/usr/local/bin/node_exporter"
|
||||
args = [
|
||||
"--path.rootfs=/host",
|
||||
"--web.config.file=/local/tls.yml",
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
|
||||
[[- range $arg := $c.args ]]
|
||||
"[[ $arg ]]",
|
||||
[[- end ]]
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
[[ template "common/artifacts" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
@ -271,4 +285,5 @@ _EOT
|
|||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
[[- end ]]
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "vector-aggregator[[ .consul.suffix ]]"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,16 @@
|
|||
[[- $c := merge .monitoring.aggregator .monitoring . -]]
|
||||
Kind = "service-intentions"
|
||||
Name = "vector-aggregator[[ .consul.suffix ]]"
|
||||
Sources = [
|
||||
{
|
||||
Name = "[[ $c.traefik.instance ]]"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "[[ $c.traefik.enabled | ternary "allow" "deny" ]]"
|
||||
HTTP {
|
||||
Methods = ["POST"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
Binary file not shown.
|
@ -1,8 +1,11 @@
|
|||
job "monitoring-agent" {
|
||||
|
||||
|
||||
datacenters = ["dc1"]
|
||||
region = "global"
|
||||
node_pool = "all"
|
||||
priority = 60
|
||||
|
||||
|
||||
type = "system"
|
||||
|
||||
|
@ -161,7 +164,7 @@ _EOT
|
|||
destination = "local/nomad-vector-logger.toml"
|
||||
}
|
||||
|
||||
# Disable the default nomad.toml template
|
||||
# Disable the default nomad.toml template, as we provide our own nomad.yml template
|
||||
template {
|
||||
data = "# Disable the default toml template"
|
||||
destination = "local/template/nomad.toml"
|
||||
|
@ -184,6 +187,8 @@ sources:
|
|||
mode: continue_through
|
||||
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
|
||||
timeout_ms: 1000
|
||||
ignore_older_secs: 1800
|
||||
oldest_first: true
|
||||
|
||||
{{- end }}
|
||||
|
||||
|
@ -262,11 +267,13 @@ _EOT
|
|||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "busybox:latest"
|
||||
command = "sh"
|
||||
image = "busybox:latest"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 20
|
||||
command = "sh"
|
||||
args = [
|
||||
"-c",
|
||||
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
|
||||
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -289,8 +296,10 @@ _EOT
|
|||
leader = true
|
||||
|
||||
config {
|
||||
image = "danielberteaud/vector:0.36.1-1"
|
||||
userns_mode = "host"
|
||||
image = "danielberteaud/vector:0.36.1-1"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
args = [
|
||||
"--watch-config",
|
||||
"--config", "/local/vector.yml",
|
||||
|
@ -331,6 +340,9 @@ _EOT
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
# Main vector configuration
|
||||
template {
|
||||
data = <<_EOT
|
||||
data_dir: /data
|
||||
|
@ -398,8 +410,8 @@ _EOT
|
|||
|
||||
resources {
|
||||
cpu = 100
|
||||
memory = 192
|
||||
memory_max = 384
|
||||
memory = 384
|
||||
memory_max = 512
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -436,16 +448,17 @@ _EOT
|
|||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:latest"
|
||||
pid_mode = "host"
|
||||
#network_mode = "host"
|
||||
image = "danielberteaud/node-exporter:1.7.0-1"
|
||||
pid_mode = "host"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
command = "/usr/local/bin/node_exporter"
|
||||
args = [
|
||||
"--path.rootfs=/host",
|
||||
"--web.config.file=/local/tls.yml",
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
|
||||
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)",
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -477,6 +490,8 @@ _EOT
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
tls_server_config:
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "vector-aggregator"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,15 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "vector-aggregator"
|
||||
Sources = [
|
||||
{
|
||||
Name = "traefik"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
Methods = ["POST"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -411,7 +411,7 @@ _EOT
|
|||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 15
|
||||
memory = 20
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG EXPORTER_VERSION=1.7.0
|
||||
|
||||
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
|
||||
RUN set -euxo pipefail &&\
|
||||
apk --no-cache add \
|
||||
curl \
|
||||
tar \
|
||||
ca-certificates \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
|
||||
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
|
||||
|
||||
CMD ["/usr/local/bin/node_exporter"]
|
|
@ -5,7 +5,7 @@ job "monitoring-services" {
|
|||
region = "global"
|
||||
|
||||
|
||||
# Metrics is running prometheus and various exporters
|
||||
# Metrics is running prometheus
|
||||
group "metrics-server" {
|
||||
|
||||
shutdown_delay = "6s"
|
||||
|
@ -67,7 +67,7 @@ job "monitoring-services" {
|
|||
type = "http"
|
||||
expose = true
|
||||
path = "/-/healthy"
|
||||
interval = "15s"
|
||||
interval = "20s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 10
|
||||
|
@ -77,11 +77,6 @@ job "monitoring-services" {
|
|||
|
||||
tags = [
|
||||
|
||||
"traefik.enable=true",
|
||||
"traefik.http.routers.monitoring-prometheus.entrypoints=https",
|
||||
"traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)",
|
||||
"traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
|
||||
"traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus",
|
||||
|
||||
]
|
||||
}
|
||||
|
@ -892,6 +887,410 @@ _EOT
|
|||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: LokiProcessTooManyRestarts
|
||||
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Loki process too many restarts (instance {{ $labels.instance }})
|
||||
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestErrors
|
||||
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request errors (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestPanic
|
||||
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request panic (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestLatency
|
||||
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request latency (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
_EOT
|
||||
destination = "local/rules/loki.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
groups:
|
||||
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: 'node_filesystem_device_error == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: HostCpuIsUnderutilized
|
||||
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# for: 1w
|
||||
# labels:
|
||||
# severity: info
|
||||
# annotations:
|
||||
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: HostSwapIsFillingUp
|
||||
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# for: 2m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
_EOT
|
||||
destination = "local/rules/node.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
template {
|
||||
|
@ -945,8 +1344,11 @@ _EOT
|
|||
|
||||
network {
|
||||
mode = "bridge"
|
||||
# Port exposing the web API, with mTLS
|
||||
port "web-tls" {}
|
||||
# Port used for gossip between the different alertmanager instance
|
||||
port "cluster" {}
|
||||
# Port to expose metrics to prometheus
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
|
@ -1031,101 +1433,10 @@ _EOT
|
|||
|
||||
tags = [
|
||||
|
||||
"traefik.enable=true",
|
||||
"traefik.http.routers.monitoring-alertmanager.entrypoints=https",
|
||||
"traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)",
|
||||
"traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
|
||||
"traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager",
|
||||
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
|
||||
task "metrics-proxy" {
|
||||
driver = "docker"
|
||||
user = 8995
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
force_pull = true
|
||||
volumes = [
|
||||
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
|
||||
]
|
||||
pids_limit = 100
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9093/metrics;
|
||||
}
|
||||
}
|
||||
_EOT
|
||||
destination = "local/default.conf"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
memory_max = 20
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
# This task will handle mTLS to the AlertManager API
|
||||
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
|
||||
task "untls-proxy" {
|
||||
|
@ -1166,10 +1477,11 @@ _EOT
|
|||
|
||||
template {
|
||||
data = <<_EOT
|
||||
# UnTLS for the web API
|
||||
server {
|
||||
listen 127.0.0.1:9093;
|
||||
location / {
|
||||
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
|
@ -1180,10 +1492,66 @@ server {
|
|||
}
|
||||
}
|
||||
|
||||
# Metrics proxy
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
location /metrics {
|
||||
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
|
||||
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
|
||||
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
_EOT
|
||||
destination = "local/alertmanager.conf"
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
# Certifiate used by AlertManager
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
@ -1203,14 +1571,6 @@ _EOT
|
|||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The trusted CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 18
|
||||
|
@ -1300,7 +1660,7 @@ set -euo pipefail
|
|||
exec alertmanager \
|
||||
--config.file=/secrets/alertmanager.yml \
|
||||
--storage.path=/data \
|
||||
--web.external-url=https://alerte.example.org \
|
||||
--web.external-url=https://alert.example.org \
|
||||
--web.route-prefix=/ \
|
||||
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
|
||||
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
|
||||
|
@ -1430,11 +1790,6 @@ _EOT
|
|||
|
||||
tags = [
|
||||
|
||||
"traefik.enable=true",
|
||||
"traefik.http.routers.monitoring-loki.entrypoints=https",
|
||||
"traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)",
|
||||
"traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
|
||||
"traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki",
|
||||
|
||||
]
|
||||
}
|
||||
|
@ -2048,7 +2403,7 @@ server {
|
|||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://localhost:3000/metrics;
|
||||
proxy_pass http://127.0.0.1:3000/metrics;
|
||||
}
|
||||
}
|
||||
_EOT
|
||||
|
@ -2132,7 +2487,6 @@ _EOT
|
|||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
data = <<_EOT
|
||||
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}
|
||||
LANG=fr_FR.utf8
|
||||
TZ=Europe/Paris
|
||||
_EOT
|
||||
|
@ -2142,6 +2496,15 @@ _EOT
|
|||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
|
||||
_EOT
|
||||
destination = "secrets/.grafana.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
# Basic grafana configuration file
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG EXPORTER_VERSION=[[ .monitoring.agent.node_exporter.version ]]
|
||||
|
||||
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
|
||||
RUN set -euxo pipefail &&\
|
||||
apk --no-cache add \
|
||||
curl \
|
||||
tar \
|
||||
ca-certificates \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
|
||||
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
|
||||
|
||||
CMD ["/usr/local/bin/node_exporter"]
|
|
@ -2,7 +2,7 @@ job "[[ .instance ]]-services" {
|
|||
|
||||
[[ template "common/job_start" . ]]
|
||||
|
||||
# Metrics is running prometheus and various exporters
|
||||
# Metrics is running prometheus
|
||||
group "metrics-server" {
|
||||
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
||||
|
||||
|
@ -28,7 +28,7 @@ job "[[ .instance ]]-services" {
|
|||
type = "http"
|
||||
expose = true
|
||||
path = "/-/healthy"
|
||||
interval = "15s"
|
||||
interval = "20s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 10
|
||||
|
@ -168,8 +168,11 @@ _EOT
|
|||
|
||||
network {
|
||||
mode = "bridge"
|
||||
# Port exposing the web API, with mTLS
|
||||
port "web-tls" {}
|
||||
# Port used for gossip between the different alertmanager instance
|
||||
port "cluster" {}
|
||||
# Port to expose metrics to prometheus
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
|
@ -220,8 +223,6 @@ _EOT
|
|||
]
|
||||
}
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
# This task will handle mTLS to the AlertManager API
|
||||
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
|
||||
task "untls-proxy" {
|
||||
|
@ -253,6 +254,8 @@ _EOT
|
|||
destination = "local/alertmanager.conf"
|
||||
}
|
||||
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
|
||||
# Certifiate used by AlertManager
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
@ -272,14 +275,6 @@ _EOT
|
|||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The trusted CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 18
|
||||
|
@ -617,6 +612,15 @@ _EOT
|
|||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/file_env" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
|
||||
_EOT
|
||||
destination = "secrets/.grafana.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
# Basic grafana configuration file
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
|
|
@ -12,6 +12,8 @@ sources:
|
|||
mode: continue_through
|
||||
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
|
||||
timeout_ms: 1000
|
||||
ignore_older_secs: 1800
|
||||
oldest_first: true
|
||||
|
||||
{{- end }}
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
# UnTLS for the web API
|
||||
server {
|
||||
listen 127.0.0.1:9093;
|
||||
location / {
|
||||
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
|
@ -11,3 +12,39 @@ server {
|
|||
deny all;
|
||||
}
|
||||
}
|
||||
|
||||
# Metrics proxy
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
location /metrics {
|
||||
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.[[ .instance ]].[[ .consul.domain ]];
|
||||
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
|
||||
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: LokiProcessTooManyRestarts
|
||||
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Loki process too many restarts (instance {{ $labels.instance }})
|
||||
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestErrors
|
||||
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request errors (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestPanic
|
||||
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request panic (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestLatency
|
||||
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request latency (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,347 @@
|
|||
groups:
|
||||
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: 'node_filesystem_device_error == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: HostCpuIsUnderutilized
|
||||
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# for: 1w
|
||||
# labels:
|
||||
# severity: info
|
||||
# annotations:
|
||||
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: HostSwapIsFillingUp
|
||||
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# for: 2m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
246
variables.yml
246
variables.yml
|
@ -76,179 +76,284 @@ monitoring:
|
|||
# - https://portal.acme.com
|
||||
http_probes: []
|
||||
|
||||
# Consul exporter will expose consul metrics
|
||||
# Consul exporter will expose consul metrics (mainly registered services status)
|
||||
consul:
|
||||
# Version of the exporter
|
||||
version: 0.11.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
|
||||
# Custom env var to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 20
|
||||
memory: 32
|
||||
vault:
|
||||
# Vault policies to attach
|
||||
policies:
|
||||
- 'consul-exporter[[ .consul.suffix ]]'
|
||||
|
||||
# The cluster exporter is a simple nginx used as a proxy
|
||||
# which handles TLS for the cluster services (vault, consul and nomad)
|
||||
cluster:
|
||||
# Docker image to use
|
||||
image: nginxinc/nginx-unprivileged:alpine
|
||||
# Custom env
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 15
|
||||
memory: 20
|
||||
vault:
|
||||
# Vault policies to attach to the task
|
||||
policies:
|
||||
- 'cluster-exporter[[ .consul.suffix ]]'
|
||||
- metrics
|
||||
- metrics[[ .consul.suffix ]]
|
||||
|
||||
# The prometheus server
|
||||
prometheus:
|
||||
|
||||
version: 2.51.0
|
||||
|
||||
# Number of instances to run. Note that if you run several instances, they will be independent, and all of
|
||||
# them will scrape the same data. Then queries to the prometheus API will be loadbalanced between all instances.
|
||||
# This should work most of the time, but can give some strange result if eg, one of the instances was down (queries
|
||||
# for data during the downtime can give some random result depending on the instance your query is routed to)
|
||||
count: 1
|
||||
|
||||
# Version of prometheus
|
||||
version: 2.51.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
|
||||
|
||||
# Custom env var to set
|
||||
env: {}
|
||||
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 200
|
||||
memory: 512
|
||||
|
||||
# Volumes used for data persistence
|
||||
# You must create a prometheus-data[0] volume as it's a per_alloc volume
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: 'prometheus-data'
|
||||
per_alloc: true
|
||||
|
||||
vault:
|
||||
# Vault policies to attach to the task
|
||||
policies:
|
||||
- 'prometheus[[ .consul.suffix ]]'
|
||||
|
||||
# A dict of custom jobs. Eg
|
||||
# jobs:
|
||||
# squid:
|
||||
# targets:
|
||||
# - 10.11.2.3:9305
|
||||
# - 192.168.6.20:782
|
||||
jobs: {}
|
||||
alert_rules: {}
|
||||
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
|
||||
# alert_rules:
|
||||
# postgres:
|
||||
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
|
||||
|
||||
# patroni:
|
||||
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
|
||||
# If you need something more flexible (like download an archive of rules and uncompress it, you should use artifacts instead. Just ensure your rules
|
||||
# are in /local/rules/ inside the container
|
||||
alert_rules: {}
|
||||
# The public URL where prometheus will be reachable (if exposed with Traefik)
|
||||
public_url: https://prometheus.example.org
|
||||
# Traefik settings
|
||||
traefik:
|
||||
enabled: true
|
||||
# Turn this on to expose prometheus with Traefik
|
||||
# Caution : there's no builtin security, you should configure the appropriate middlewares
|
||||
enabled: false
|
||||
router: prometheus
|
||||
|
||||
# Metrics retention duration
|
||||
retention: 30d
|
||||
|
||||
# always enable prometheus metrics (of course :-) )
|
||||
prometheus:
|
||||
enabled: true
|
||||
# This is the URL where metrics are exposed, where the metrics proxy will point at (from the container PoV)
|
||||
metrics_url: http://localhost:9090/metrics
|
||||
|
||||
# AlertManager can process and send alerts
|
||||
alertmanager:
|
||||
# Number of instances to run. Set > 1 if you want HA
|
||||
count: 1
|
||||
# Version of alertmanager
|
||||
version: 0.27.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1'
|
||||
# Custom env var to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 50
|
||||
memory: 64
|
||||
memory_max: 80
|
||||
public_url: https://alerte.example.org
|
||||
# URL where the web interface is reachable (if exposed with Traefik)
|
||||
public_url: https://alert.example.org
|
||||
# Traefik settings
|
||||
traefik:
|
||||
enabled: true
|
||||
# Turn this on to expose alertmanager with traefik
|
||||
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
|
||||
enabled: false
|
||||
router: alertmanager
|
||||
# No need to strip prefix as alertmanager will be configured to handle it
|
||||
strip_prefix: false
|
||||
# Volumes used for data persistence. Note : it's a per_alloc volume
|
||||
# so you need to create eg alertmanager-data[0]. This volume should be writeable by user with ID 9093
|
||||
volumes:
|
||||
data:
|
||||
source: 'alertmanager-data'
|
||||
type: csi
|
||||
per_alloc: true
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9093/metrics
|
||||
vault:
|
||||
# List of vault policies to attach to the task
|
||||
policies:
|
||||
- metrics
|
||||
- 'alertmanager[[ .consul.suffix ]]'
|
||||
- metrics[[ .consul.suffix ]]
|
||||
- alertmanager[[ .consul.suffix ]]
|
||||
# Email settings
|
||||
email:
|
||||
from: alertmanager@[[ .consul.domain ]]
|
||||
# You can merge your own custom config with the default provided one. Eg
|
||||
# custom_config:
|
||||
# receivers:
|
||||
# - name: dani
|
||||
# email_configs:
|
||||
# - to: dani@example.org
|
||||
# route:
|
||||
# group_by: ['alertname', 'cluster', 'job']
|
||||
# receiver: dani
|
||||
custom_config: {}
|
||||
|
||||
# Loki is the log server
|
||||
loki:
|
||||
# Version of loki
|
||||
version: 2.9.6
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
|
||||
# Custom env to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 150
|
||||
memory: 512
|
||||
vault:
|
||||
# Vault policies to attach in the container
|
||||
policies:
|
||||
- 'loki[[ .consul.suffix ]]'
|
||||
# URL where loki is exposed (if enabled)
|
||||
public_url: https://loki.example.org
|
||||
# Traefik settings
|
||||
traefik:
|
||||
# Turn it on to expose Loki with Traefik
|
||||
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
|
||||
enabled: false
|
||||
router: loki
|
||||
# Retention for logs. Older will be deleted
|
||||
retention: 720h # 1 month
|
||||
# Custom configuration which will be merged on top of the default one
|
||||
custom_config: {}
|
||||
prometheus:
|
||||
# URL where metrics are available for the metrics proxy (from inside the container PoV)
|
||||
metrics_url: http://localhost:3100/metrics
|
||||
# Volumes for data persistence. Should be writable for user id 3100
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: 'loki-data'
|
||||
|
||||
# Common vector settings
|
||||
vector:
|
||||
# Version of vector
|
||||
version: 0.36.1
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
|
||||
|
||||
# Vector aggregator can be used to ingest logs from external device (using syslog or fluentd)
|
||||
# Logs will then be forwarded to loki
|
||||
aggregator:
|
||||
# Number of instances
|
||||
count: 1
|
||||
# Docker image to use
|
||||
image: '[[ .monitoring.vector.image ]]'
|
||||
# Custom env to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 100
|
||||
memory: 192
|
||||
consul:
|
||||
connect:
|
||||
upstreams:
|
||||
# Connect to loki through the service mesh
|
||||
- destination_name: 'loki[[ .consul.suffix ]]'
|
||||
local_bind_port: 3100
|
||||
vault:
|
||||
# Vault policies to attach to the task.
|
||||
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
|
||||
# but we need to grant the metrics policy to the vector task instead
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
# Fluentd source settings
|
||||
fluentd:
|
||||
enabled: false
|
||||
traefik:
|
||||
router: fluentd
|
||||
entrypoints:
|
||||
- fluentd
|
||||
# Syslog source settings
|
||||
syslog_udp:
|
||||
enabled: false
|
||||
traefik:
|
||||
router: syslog-udp
|
||||
entrypoints:
|
||||
- syslog
|
||||
- syslog-udp
|
||||
# Syslog (tcp) source settings
|
||||
syslog_tcp:
|
||||
enabled: false
|
||||
traefik:
|
||||
router: syslog-tcp
|
||||
entrypoints:
|
||||
- syslog-tcp
|
||||
# Native vector (http) source settings
|
||||
vector:
|
||||
enabled: true
|
||||
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
|
||||
public_url: https://vector.example.org
|
||||
traefik:
|
||||
# Set to true if you want to expose the service with Traefik
|
||||
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
|
||||
enabled: false
|
||||
|
||||
# Grafana settings
|
||||
grafana:
|
||||
# Grafana version
|
||||
version: 10.4.1
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
|
||||
env:
|
||||
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
|
||||
# Custom env var to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 100
|
||||
memory: 256
|
||||
# URL where Grafana is reachable
|
||||
public_url: https://grafana.example.org
|
||||
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
|
||||
# the image if you want to update it
|
||||
plugins:
|
||||
#- alexanderzobnin-zabbix-app
|
||||
#- ddurieux-glpi-app
|
||||
- grafana-clock-panel
|
||||
- grafana-piechart-panel
|
||||
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
|
||||
# Example:
|
||||
# feature_toggles:
|
||||
# featureToggleAdminPage: true
|
||||
# ssoSettingsApi: true
|
||||
feature_toggles: {}
|
||||
# Traefik settings
|
||||
traefik:
|
||||
enabled: true
|
||||
router: grafana
|
||||
# No need to strip prefix as Grafana will be configured to handle it correctly
|
||||
strip_prefix: false
|
||||
consul:
|
||||
connect:
|
||||
# Connect to postgres, loki and prometheus with the service mesh
|
||||
upstreams:
|
||||
- destination_name: postgres[[ .consul.suffix ]]
|
||||
local_bind_port: 5432
|
||||
|
@ -256,16 +361,20 @@ monitoring:
|
|||
local_bind_port: 3100
|
||||
- destination_name: prometheus[[ .consul.suffix ]]
|
||||
local_bind_port: 9090
|
||||
# Volumes for data persistence
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: 'grafana-data'
|
||||
vault:
|
||||
# Vault policies to attach to the task
|
||||
policies:
|
||||
- 'grafana[[ .consul.suffix ]]'
|
||||
# Postgres DB settings
|
||||
database:
|
||||
role: grafana
|
||||
pgrole: grafana
|
||||
# Override some default postgres handling
|
||||
postgres:
|
||||
database: grafana
|
||||
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
|
||||
|
@ -273,64 +382,131 @@ monitoring:
|
|||
pooler:
|
||||
mode: session
|
||||
prometheus:
|
||||
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
|
||||
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
|
||||
metrics_url: http://127.0.0.1:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
|
||||
|
||||
# Agent runs as a system jobs, on all the nodes
|
||||
agent:
|
||||
consul:
|
||||
meta:
|
||||
# Override the alloc service meta, the hostname will be more useful than a 0)
|
||||
alloc: '${node.unique.name}'
|
||||
# Nomad settings
|
||||
nomad:
|
||||
# Run on all node pools
|
||||
node_pool: all
|
||||
# Run with an above average priority
|
||||
priority: 60
|
||||
|
||||
# Nomad vector logger is a small container which will query the Nomad API to discover running allocation on the current node
|
||||
# Then generate a vector configuration with scraping for all the discovered allocation.
|
||||
nomad_vector_logger:
|
||||
version: 24.3
|
||||
image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2'
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2'
|
||||
# Custom env to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 20
|
||||
memory: 24
|
||||
memory_max: 50
|
||||
vault:
|
||||
# Vault policies to attach to the task
|
||||
policies:
|
||||
- nomad-vector-logger[[ .consul.suffix ]]
|
||||
|
||||
# Vector is the main task. It'll read it's config created by nomad-vector-logger and will read log files
|
||||
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
|
||||
vector:
|
||||
# Docker image to use
|
||||
image: '[[ .monitoring.vector.image ]]'
|
||||
# Custom env to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 100
|
||||
memory: 192
|
||||
memory_max: 384
|
||||
memory: 384
|
||||
memory_max: 512
|
||||
vault:
|
||||
# Vault policies to attach to the container. Vector being able to use mTLS on the metrics endpoint
|
||||
# there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
|
||||
# a certificate from vault
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
consul:
|
||||
connect:
|
||||
upstreams:
|
||||
# Connect to loki with the service mesh
|
||||
- destination_name: loki[[ .consul.suffix ]]
|
||||
local_bind_port: 3100
|
||||
# Volumes for data persistence
|
||||
volumes:
|
||||
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
|
||||
# to read the logs. You should create a host volume in nomad client config of all your nodes. Eg
|
||||
# client {
|
||||
# enabled = true
|
||||
# host_volume "nomad_alloc" {
|
||||
# path = "/opt/nomad/data/alloc"
|
||||
# read_only = "true"
|
||||
# }
|
||||
# }
|
||||
nomad:
|
||||
type: host
|
||||
source: nomad_alloc
|
||||
read_only: true
|
||||
# The data volume will be used by vector for buffering (in case loki is unavailable)
|
||||
# You can create a host volume in Nomad's client config, eg
|
||||
# client {
|
||||
# enabled = true
|
||||
# host_volume "nomad_alloc" {
|
||||
# path = "/data/vector-agent"
|
||||
# }
|
||||
# }
|
||||
data:
|
||||
type: host
|
||||
source: vector_data
|
||||
|
||||
# The node exporter can be used to expose the host metrics to prometheus
|
||||
node_exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
# Is the node exporter enabled ? (set to false if you don't want it, or if you
|
||||
# already manage the node-exporter separately)
|
||||
enabled: true
|
||||
# Version of the exporter
|
||||
version: 1.7.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
|
||||
# Custom env to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 50
|
||||
memory: 24
|
||||
memory_max: 32
|
||||
vault:
|
||||
# Vault policies to attach to the task
|
||||
# This exporter can handle mTLS itself, so no need to create a metrics_proxy task, instead, grant the metrics policy
|
||||
# So it can get a certificate from vault
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
# Args to add to the exporter on start
|
||||
args:
|
||||
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
|
||||
# Volumes
|
||||
volumes:
|
||||
# The exporter should access the host root filesystem
|
||||
# For this, you should create a host volume in Nomad's client config, eg
|
||||
# client {
|
||||
# enabled = true
|
||||
# host_volume "host_root" {
|
||||
# path = "/"
|
||||
# read_only = true
|
||||
# }
|
||||
# }
|
||||
host:
|
||||
type: host
|
||||
source: host_root
|
||||
read_only: true
|
||||
|
||||
# Globally enable prometheus for this bundle :-)
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
|
Loading…
Reference in New Issue