Start: prometheus + alertmanager + exporters

Daniel Berteaud 2024-03-19 13:53:28 +01:00
parent b32fe575af
commit 65441a4a9e
67 changed files with 4446 additions and 0 deletions

43
TODO.md Normal file
View File

@ -0,0 +1,43 @@
- ~~Split exporters into a dedicated job (so they can run on a specific node_pool)~~
- Create a system-type monitoring-agent job with vector + node-exporter
- images
- ~~prometheus~~
- ~~ping-exporter~~
- ~~blackbox-exporter~~
- ~~consul-exporter~~
- vector
- loki
- grafana
- nomad-vector-logger
- pki roles:
- ~~monitoring -> prom~~
- ~~consul -> prom~~
- ~~monitoring -> am~~
- vault pol
- ~~prometheus~~
- ~~issue prom on monitoring~~
- ~~issue prom on consul~~
- ~~consul-exporter~~
- ~~issue consul-exporter on consul~~
- ~~alertmanager~~
- ~~issue alertmanager on monitoring~~
- consul defaults & intentions
- ~~prometheus~~
- ~~alertmanager~~
- loki
- tasks
- ~~alertmanager~~
- vector-aggregator
- vector-agent (in the agent job)
- loki (modularize, or keep it monolithic?)
- grafana
- ~~cluster-metrics (job exporters)~~
- questions
- prom rules: keep them here, or move them to a -conf bundle?
- ~~am alert config (recipients + routing)~~
- ~~http and tcp probes, as exporters are now in a dedicated job~~

4
bundles.yml Normal file
View File

@ -0,0 +1,4 @@
---
dependencies:
- url: ../common.git

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,16 @@
Kind = "service-intentions"
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.alertmanager .).traefik.instance ]]"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "[[ if eq "" (urlParse .monitoring.alertmanager.public_url).Path ]]/[[ else ]](urlParse .monitoring.alertmanager.public_url).Path[[ end ]]"
Methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
}
}
]
}
]
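These service-defaults and service-intentions entries are plain Consul config entries, so once rendered they can be loaded with the Consul CLI. A minimal sketch, assuming the rendered output is saved as alertmanager-defaults.hcl and alertmanager-intentions.hcl (hypothetical names) and that CONSUL_HTTP_TOKEN holds a token allowed to write config entries:

# Hypothetical file names; use the actual rendered output paths
consul config write alertmanager-defaults.hcl
consul config write alertmanager-intentions.hcl
# Verify what was stored (name shown for instance=monitoring)
consul config read -kind service-intentions -name monitoring-alertmanager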

View File

@ -0,0 +1,34 @@
Kind = "service-intentions"
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.prometheus .).traefik.instance ]]"
Permissions = [
{
Action = "allow"
HTTP {
Methods = ["GET", "HEAD", "POST"]
}
}
]
},
{
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
Permissions = [
{
# Deny access to the admin API from Grafana
Action = "deny"
HTTP {
PathPrefix = "/api/v1/admin"
}
},
{
Action = "allow"
HTTP {
PathPrefix = "/api/v1"
Methods = ["GET", "HEAD", "POST", "PUT"]
}
}
]
}
]

View File

@ -0,0 +1,9 @@
agent_prefix "" {
policy = "read"
}
node_prefix "" {
policy = "read"
}
service_prefix "" {
policy = "read"
}
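This read-only ACL policy is what the Prometheus Consul token relies on for service discovery. In this bundle, token issuance goes through Vault's Consul secrets engine (see init/consul), but the policy itself can be registered by hand. A small sketch, assuming the rules above are saved locally as monitoring-prometheus.hcl:

# Hypothetical local file name
consul acl policy create -name "monitoring-prometheus" -rules @monitoring-prometheus.hcl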

View File

BIN
example/.variables.yml.swp Normal file

Binary file not shown.

3
example/README.md Normal file
View File

@ -0,0 +1,3 @@
# monitoring
Monitoring stack

43
example/TODO.md Normal file
View File

@ -0,0 +1,43 @@
- ~~Split exporters into a dedicated job (so they can run on a specific node_pool)~~
- Create a system-type monitoring-agent job with vector + node-exporter
- images
- prometheus
- ping-exporter
- blackbox-exporter
- consul-exporter
- vector
- loki
- grafana
- nomad-vector-logger
- pki roles:
- ~~monitoring -> prom~~
- ~~consul -> prom~~
- ~~monitoring -> am~~
- vault pol
- prometheus
- ~~issue prom on monitoring~~
- ~~issue prom on consul~~
- consul-exporter
- issue consul-exporter on consul
- alertmanager
- ~~issue alertmanager on monitoring~~
- consul defaults & intentions
- ~~prometheus~~
- ~~alertmanager~~
- loki
- tasks
- ~~alertmanager~~
- vector-aggregator
- vector-agent (in the agent job)
- loki (modularize, or keep it monolithic?)
- grafana
- cluster-metrics (job exporters)
- questions
- prom rules: keep them here, or move them to a -conf bundle?
- ~~am alert config (recipients + routing)~~
- http and tcp probes, as exporters are now in a dedicated job

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "monitoring-alertmanager"
Protocol = "http"

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "monitoring-prometheus"
Protocol = "http"

View File

@ -0,0 +1,16 @@
Kind = "service-intentions"
Name = "monitoring-alertmanager"
Sources = [
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/"
Methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
}
}
]
}
]

View File

@ -0,0 +1,34 @@
Kind = "service-intentions"
Name = "monitoring-prometheus"
Sources = [
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
Methods = ["GET", "HEAD", "POST"]
}
}
]
},
{
Name = "monitoring-grafana"
Permissions = [
{
# Deny access to the admin API from Grafana
Action = "deny"
HTTP {
PathPrefix = "/api/v1/admin"
}
},
{
Action = "allow"
HTTP {
PathPrefix = "/api/v1"
Methods = ["GET", "HEAD", "POST", "PUT"]
}
}
]
}
]

View File

@ -0,0 +1,9 @@
agent_prefix "" {
policy = "read"
}
node_prefix "" {
policy = "read"
}
service_prefix "" {
policy = "read"
}

View File

@ -0,0 +1,41 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG AM_VERSION=0.27.0
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
&&\
cd /tmp &&\
grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\
mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
ENV PATH=/opt/alertmanager:$PATH
COPY --from=builder /opt/alertmanager /opt/alertmanager
RUN set -eux &&\
addgroup -g 9093 alertmanager &&\
adduser --system \
--disabled-password \
--uid 9093 \
--ingroup alertmanager \
--home /opt/alertmanager \
--no-create-home \
--shell /sbin/nologin \
alertmanager &&\
mkdir /data &&\
chown alertmanager:alertmanager /data &&\
chmod 700 /data
WORKDIR /opt/alertmanager
USER alertmanager
EXPOSE 9093
CMD [ "alertmanager", \
"--config.file=/opt/alertmanager/alertmanager.yml", \
"--storage.path=/data" ]

View File

@ -0,0 +1,29 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG BLACKBOX_EXPORTER_VERSION=0.24.0
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add tar gzip &&\
cd /tmp &&\
grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mkdir blackbox &&\
mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
ENV BLACKBOX_CONF=/etc/blackbox.yml
COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter
RUN set -eux &&\
apk --no-cache upgrade &&\
apk --no-cache add ca-certificates curl
COPY root/ /
EXPOSE 9115
CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"]

View File

@ -0,0 +1,65 @@
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
http_ssl_no_check:
prober: http
http:
preferred_ip_protocol: "ip4"
tls_config:
insecure_skip_verify: true
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
pop3s_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
grpc:
prober: grpc
grpc:
tls: true
preferred_ip_protocol: "ip4"
grpc_plain:
prober: grpc
grpc:
preferred_ip_protocol: "ip4"
tls: false
service: "service1"
ssh_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
icmp_ttl5:
prober: icmp
timeout: 5s
icmp:
ttl: 5
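Once the exporter is running, individual modules can be exercised directly against its /probe endpoint, which is exactly how Prometheus drives it later in prometheus.yml. A quick manual check, assuming the exporter is reachable locally on its default port 9115 and example.org is just a placeholder target:

curl 'http://127.0.0.1:9115/probe?module=http_2xx&target=https://example.org'
curl 'http://127.0.0.1:9115/probe?module=tcp_connect&target=example.org:443'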

View File

@ -0,0 +1,21 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG CONSUL_EXPORTER_VERSION=0.11.0
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add tar gzip &&\
cd /tmp &&\
grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter
USER 9107
EXPOSE 9107
CMD ["consul_exporter"]

View File

@ -0,0 +1,24 @@
FROM danielberteaud/alpine:24.3-1 AS builder
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
ARG PING_EXPORTER_VERSION=1.1.0
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
gzip \
&&\
cd /tmp &&\
grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\
tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\
mv ping_exporter /usr/local/bin/
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter
EXPOSE 9427
CMD ["ping_exporter", "--config.path=/config.yml"]

View File

@ -0,0 +1,4 @@
# targets:
# - foo.bar
# - acme.com
targets: []

View File

@ -0,0 +1,48 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG PROM_VERSION=2.50.1
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
ENV PATH=/opt/prometheus:$PATH
COPY --from=builder /opt/prometheus /opt/prometheus
RUN set -eux &&\
addgroup -g 9090 prometheus &&\
adduser --system \
--disabled-password \
--uid 9090 \
--ingroup prometheus \
--home /opt/prometheus \
--no-create-home \
--shell /sbin/nologin \
prometheus &&\
mkdir /data &&\
chown prometheus:prometheus /data &&\
chmod 700 /data
WORKDIR /opt/prometheus
USER prometheus
EXPOSE 9090
CMD [ "/opt/prometheus/prometheus", \
"--config.file=/opt/prometheus/prometheus.yml", \
"--storage.tsdb.path=/data", \
"--storage.tsdb.wal-compression", \
"--storage.tsdb.wal-compression-type=zstd", \
"--web.console.libraries=/opt/prometheus/console_libraries", \
"--web.console.templates=/opt/prometheus/consoles" ]

17
example/init/consul Executable file
View File

@ -0,0 +1,17 @@
#!/bin/sh
# vim: syntax=sh
vault write consul/roles/monitoring-prometheus \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
vault write consul/roles/monitoring-consul-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
vault write consul/roles/monitoring-cluster-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
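These roles only define how Vault mints Consul tokens; nothing is issued at this point. A quick way to check the wiring after running this script (the monitoring-prometheus policy must already exist in Consul):

# Each read returns a short-lived Consul token carrying the monitoring-prometheus policy
vault read consul/creds/monitoring-prometheus
vault read consul/creds/monitoring-consul-exporter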

156
example/init/pki Executable file
View File

@ -0,0 +1,156 @@
#!/bin/sh
# vim: syntax=sh
set -euo pipefail
TMP=$(mktemp -d)
INITIAL_SETUP=false
if [ "$(vault secrets list -format json | jq -r '.["pki/monitoring/"].type')" != "pki" ]; then
INITIAL_SETUP=true
fi
if [ "${INITIAL_SETUP}" = "true" ]; then
# Enable the secret engine
echo "Mounting new PKI secret engine at pki/monitoring"
vault secrets enable -path=pki/monitoring pki
else
echo "Secret engine already mounted at pki/monitoring"
fi
# Configure max-lease-ttl
echo "Tune PKI secret engine"
vault secrets tune -max-lease-ttl=131400h pki/monitoring
# Configure PKI URLs
echo "Configure URL endpoints"
vault write pki/monitoring/config/urls \
issuing_certificates="${VAULT_ADDR}/v1pki/monitoring/ca" \
crl_distribution_points="${VAULT_ADDR}/v1pki/monitoring/crl" \
ocsp_servers="${VAULT_ADDR}/v1pki/monitoring/ocsp"
vault write pki/monitoring/config/cluster \
path="${VAULT_ADDR}/v1pki/monitoring"
vault write pki/monitoring/config/crl \
auto_rebuild=true \
enable_delta=true
# Configure tidy
echo "Configure auto tidy for the PKI"
vault write pki/monitoring/config/auto-tidy \
enabled=true \
tidy_cert_store=true \
tidy_expired_issuers=true \
tidy_revocation_queue=true \
tidy_revoked_cert_issuer_associations=true \
tidy_revoked_certs=true \
tidy_acme=true \
tidy_cross_cluster_revoked_certs=true \
tidy_move_legacy_ca_bundle=true \
maintain_stored_certificate_counts=true
if [ "${INITIAL_SETUP}" = "true" ]; then
# Generate an internal CA
echo "Generating an internal CA"
vault write -format=json pki/monitoring/intermediate/generate/internal \
common_name="monitoring Certificate Authority" \
ttl="131400h" \
organization="ACME Corp" \
ou="Monitoring" \
locality="FooBar Ville" \
key_type=rsa \
key_bits=4096 \
| jq -r '.data.csr' > ${TMP}/monitoring.csr
# Sign this PKI with a root PKI
echo "Signing the new CA with the authority from pki/root"
vault write -format=json pki/root/root/sign-intermediate \
csr=@${TMP}/monitoring.csr \
format=pem_bundle \
ttl="131400h" \
| jq -r '.data.certificate' > ${TMP}/monitoring.crt
# Update the intermediate CA with the signed one
echo "Update the new CA with the signed version"
vault write pki/monitoring/intermediate/set-signed \
certificate=@${TMP}/monitoring.crt
fi
# Remove temp files
echo "Cleaning temp files"
rm -rf ${TMP}
# Create a role for alertmanager
vault write pki/monitoring/roles/monitoring-alertmanager \
allowed_domains="monitoring" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=true \
server_flag=true \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="Monitoring"
# Create a role for prometheus (which will only be a client, for AlertManager)
vault write pki/monitoring/roles/monitoring-prometheus \
allowed_domains="monitoring" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="Monitoring"
# Create a role for metrics exporters (server only)
vault write pki/monitoring/roles/metrics \
allowed_domains="monitoring" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=true \
server_flag=true \
client_flag=false \
allow_wildcard_certificates=false \
require_cn=false \
max_ttl=72h \
no_store=true \
ou="Monitoring"
# Create a role on the Nomad PKI for the cluster exporter
vault write pki/nomad/roles/monitoring-cluster-exporter \
allowed_domains='nomad.consul' \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"
# Create a role on the Consul PKI for the cluster exporter
vault write pki/consul/roles/monitoring-cluster-exporter \
allowed_domains="consul.consul" \
allow_bare_domains=false \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"

View File

@ -0,0 +1,419 @@
job "monitoring-exporters" {
datacenters = ["dc1"]
region = "global"
# Run exporters. Use a separate job so exporters can run in a distinct node_pool
group "exporters" {
count = 1
network {
mode = "bridge"
port "ping" {}
port "blackbox" {}
port "consul" {}
port "cluster" {}
}
service {
name = "monitoring-ping-exporter"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
metrics-port = "${NOMAD_HOST_PORT_ping}"
}
}
service {
name = "monitoring-blackbox-exporter"
port = "blackbox"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
service {
name = "monitoring-consul-exporter"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
metrics-port = "${NOMAD_HOST_PORT_consul}"
}
}
service {
name = "monitoring-cluster-exporter"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# Export consul services status to prometheus
task "consul-exporter" {
driver = "docker"
config {
image = "danielberteaud/consul-exporter:0.11.0-2"
readonly_rootfs = true
pids_limit = 30
command = "/local/consul-exporter"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
vault {
policies = ["monitoring-consul-exporter"]
env = false
disable_file = true
change_mode = "noop"
}
template {
data = <<_EOT
#!/bin/sh
set -euo pipefail
exec consul_exporter \
--web.listen-address=127.0.0.1:9107 \
--consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \
--consul.request-limit=20
_EOT
destination = "local/consul-exporter"
perms = 755
}
template {
data = <<_EOT
CONSUL_HTTP_TOKEN={{ with secret "consul/creds/monitoring-consul-exporter" }}{{ .Data.token }}{{ end }}
_EOT
destination = "secrets/.consul.env"
uid = 100000
gid = 100000
perms = 400
env = true
}
resources {
cpu = 20
memory = 64
}
}
# The cluster metrics proxy exposes Prometheus metrics from the various nodes of the cluster
# (Nomad, Consul and Vault)
# It also exposes the other exporters' metrics with mTLS
task "cluster-metrics-proxy" {
driver = "docker"
user = 8685
lifecycle {
hook = "poststart"
sidecar = true
}
config {
image = "nginxinc/nginx-unprivileged:alpine"
readonly_rootfs = true
pids_limit = 30
# Mount the config in nginx conf dir
volumes = [
"secrets/metrics.conf:/etc/nginx/conf.d/default.conf"
]
mount {
type = "tmpfs"
target = "/tmp"
tmpfs_options {
size = 3000000
}
}
}
vault {
policies = ["monitoring-cluster-exporter", "metrics"]
env = false
disable_file = true
change_mode = "noop"
}
# This is the main nginx configuration, which will proxypass requests to the real metrics endpoints
template {
data = <<_EOT
# Cluster exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
set $consul_token "{{ with secret "consul/creds/monitoring-cluster-exporter" }}{{ .Data.token }}{{ end }}";
{{- range service "nomad-client" }}
location /nomad-client/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad;
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
}
{{- end }}
{{- range service "nomad" }}
{{- if .Tags | contains "http" }}
location /nomad/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad;
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
}
{{- end }}
{{- end }}
{{- range service "consul" }}
location /consul/{{ .Node }} {
proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus;
proxy_set_header X-Consul-Token $consul_token;
proxy_ssl_certificate /secrets/consul_client_bundle.pem;
proxy_ssl_certificate_key /secrets/consul_client_bundle.pem;
proxy_ssl_verify off;
proxy_ssl_trusted_certificate /local/consul_ca.crt;
}
{{- end }}
{{- range service "vault" }}
location /vault/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus;
proxy_ssl_verify on;
proxy_ssl_trusted_certificate /etc/ssl/cert.pem;
proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
proxy_set_header X-Real-IP "$remote_addr";
proxy_set_header X-Forwarded-Proto "$scheme";
proxy_set_header X-Scheme "$scheme";
proxy_set_header X-Forwarded-Host "$host";
proxy_set_header X-Forwarded-Port "$server_port";
}
{{- end }}
location / {
root /usr/share/nginx/html;
index index.html;
}
}
# Ping exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9427;
}
}
# Blackbox exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location / {
proxy_pass http://127.0.0.1:9115;
}
}
# Consul exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9107;
}
}
_EOT
destination = "secrets/metrics.conf"
perms = "0440"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# Get certificate to add mTLS to metrics endpoints
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
change_mode = "signal"
change_signal = "SIGHUP"
}
# Get the CA for the monitoring PKI
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Get a Nomad client certificate
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.nomad.consul" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
_EOT
destination = "secrets/nomad_client_bundle.pem"
perms = "0400"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# The CA chain for Nomad
template {
data = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/nomad_ca.crt"
}
# Same for Consul
template {
data = <<_EOT
{{- with pkiCert "pki/consul/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.consul.consul" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
_EOT
destination = "secrets/consul_client_bundle.pem"
perms = "0400"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
template {
data = <<_EOT
{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/consul_ca.crt"
}
resources {
cpu = 10
memory = 18
}
}
}
}

1344
example/monitoring.nomad.hcl Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,3 @@
path "pki/monitoring/issue/metrics" {
capabilities = ["update"]
}

View File

@ -0,0 +1,8 @@
path "pki/monitoring/issue/monitoring-alertmanager" {
capabilities = ["update"]
}
path "kv/service/monitoring/alertmanager" {
capabilities = ["read"]
}

View File

@ -0,0 +1,20 @@
# Read vault metrics
path "sys/metrics" {
capabilities = ["read", "list"]
}
# Get a cert for Nomad
path "pki/nomad/issue/monitoring-cluster-exporter" {
capabilities = ["update"]
}
# Get a cert for Consul
path "pki/consul/issue/monitoring-cluster-exporter" {
capabilities = ["update"]
}
# Get a consul token
path "consul/creds/monitoring-cluster-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,4 @@
path "consul/creds/monitoring-consul-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,12 @@
path "pki/monitoring/issue/monitoring-prometheus" {
capabilities = ["update"]
}
path "kv/service/monitoring/prometheus" {
capabilities = ["read"]
}
path "consul/creds/monitoring-prometheus" {
capabilities = ["read"]
}
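These files are plain Vault ACL policies; once rendered they are registered with vault policy write and then referenced by name in the jobs' vault {} blocks. A minimal sketch, with hypothetical local file names:

vault policy write monitoring-prometheus ./vault/prometheus.hcl
vault policy write monitoring-consul-exporter ./vault/consul-exporter.hcl
vault policy write monitoring-cluster-exporter ./vault/cluster-exporter.hcl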

View File

@ -0,0 +1,41 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG AM_VERSION=[[ .monitoring.alertmanager.version ]]
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
&&\
cd /tmp &&\
grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\
mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
ENV PATH=/opt/alertmanager:$PATH
COPY --from=builder /opt/alertmanager /opt/alertmanager
RUN set -eux &&\
addgroup -g 9093 alertmanager &&\
adduser --system \
--disabled-password \
--uid 9093 \
--ingroup alertmanager \
--home /opt/alertmanager \
--no-create-home \
--shell /sbin/nologin \
alertmanager &&\
mkdir /data &&\
chown alertmanager:alertmanager /data &&\
chmod 700 /data
WORKDIR /opt/alertmanager
USER alertmanager
EXPOSE 9093
CMD [ "alertmanager", \
"--config.file=/opt/alertmanager/alertmanager.yml", \
"--storage.path=/data" ]

View File

@ -0,0 +1,29 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG BLACKBOX_EXPORTER_VERSION=[[ .monitoring.exporters.blackbox.version ]]
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add tar gzip &&\
cd /tmp &&\
grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mkdir blackbox &&\
mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
ENV BLACKBOX_CONF=/etc/blackbox.yml
COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter
RUN set -eux &&\
apk --no-cache upgrade &&\
apk --no-cache add ca-certificates curl
COPY root/ /
EXPOSE 9115
CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"]

View File

@ -0,0 +1,65 @@
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
http_ssl_no_check:
prober: http
http:
preferred_ip_protocol: "ip4"
tls_config:
insecure_skip_verify: true
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
pop3s_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
grpc:
prober: grpc
grpc:
tls: true
preferred_ip_protocol: "ip4"
grpc_plain:
prober: grpc
grpc:
preferred_ip_protocol: "ip4"
tls: false
service: "service1"
ssh_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
icmp_ttl5:
prober: icmp
timeout: 5s
icmp:
ttl: 5

View File

@ -0,0 +1,21 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG CONSUL_EXPORTER_VERSION=[[ .monitoring.exporters.consul.version ]]
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add tar gzip &&\
cd /tmp &&\
grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter
USER 9107
EXPOSE 9107
CMD ["consul_exporter"]

View File

@ -0,0 +1,24 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
MAINTAINER [[ .docker.maintainer ]]
ARG PING_EXPORTER_VERSION=[[ .monitoring.exporters.ping.version ]]
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
gzip \
&&\
cd /tmp &&\
grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\
tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\
mv ping_exporter /usr/local/bin/
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter
EXPOSE 9427
CMD ["ping_exporter", "--config.path=/config.yml"]

View File

@ -0,0 +1,4 @@
# targets:
# - foo.bar
# - acme.com
targets: []

View File

@ -0,0 +1,48 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG PROM_VERSION=[[ .monitoring.prometheus.version ]]
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp
RUN set -eux &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
ENV PATH=/opt/prometheus:$PATH
COPY --from=builder /opt/prometheus /opt/prometheus
RUN set -eux &&\
addgroup -g 9090 prometheus &&\
adduser --system \
--disabled-password \
--uid 9090 \
--ingroup prometheus \
--home /opt/prometheus \
--no-create-home \
--shell /sbin/nologin \
prometheus &&\
mkdir /data &&\
chown prometheus:prometheus /data &&\
chmod 700 /data
WORKDIR /opt/prometheus
USER prometheus
EXPOSE 9090
CMD [ "/opt/prometheus/prometheus", \
"--config.file=/opt/prometheus/prometheus.yml", \
"--storage.tsdb.path=/data", \
"--storage.tsdb.wal-compression", \
"--storage.tsdb.wal-compression-type=zstd", \
"--web.console.libraries=/opt/prometheus/console_libraries", \
"--web.console.templates=/opt/prometheus/consoles" ]

17
init/consul Executable file
View File

@ -0,0 +1,17 @@
#!/bin/sh
# vim: syntax=sh
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-prometheus \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-consul-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-cluster-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"

69
init/pki Executable file
View File

@ -0,0 +1,69 @@
#!/bin/sh
set -euo pipefail
[[ $c := merge .monitoring . ]]
[[ template "common/vault.mkpki.sh" $c ]]
# Create a role for alertmanager
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-alertmanager \
allowed_domains="[[ .instance ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=true \
server_flag=true \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for prometheus (which will only be a client, for AlertManager)
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
allowed_domains="[[ .instance ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for metrics exporters (server only)
vault write [[ $c.vault.pki.path ]]/roles/metrics \
allowed_domains="[[ .instance ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=true \
server_flag=true \
client_flag=false \
allow_wildcard_certificates=false \
require_cn=false \
max_ttl=72h \
no_store=true \
ou="[[ $c.vault.pki.ou ]]"
# Create a role on the Nomad PKI for the cluster exporter
vault write pki/nomad/roles/[[ .instance ]]-cluster-exporter \
allowed_domains='nomad.[[ .consul.domain ]]' \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"
# Create a role on the Consul PKI for the cluster exporter
vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \
allowed_domains="consul.[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"

View File

@ -0,0 +1,253 @@
job "[[ .instance ]]-exporters" {
[[- $c := merge .monitoring.exporters . ]]
[[ template "common/job_start" $c ]]
# Run exporters. Use a separate job so exporters can run in a distinct node_pool
group "exporters" {
count = [[ $c.count ]]
network {
mode = "bridge"
port "ping" {}
port "blackbox" {}
port "consul" {}
port "cluster" {}
}
service {
name = "[[ .instance ]]-ping-exporter[[ .consul.suffix ]]"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
metrics-port = "${NOMAD_HOST_PORT_ping}"
}
}
service {
name = "[[ .instance ]]-blackbox-exporter[[ .consul.suffix ]]"
port = "blackbox"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
service {
name = "[[ .instance ]]-consul-exporter[[ .consul.suffix ]]"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
metrics-port = "${NOMAD_HOST_PORT_consul}"
}
}
service {
name = "[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
[[- if gt (len $c.ping.probes) 0 ]]
[[- $e := merge $c.ping $c ]]
# Ping exporter will collect ICMP ping stats and expose them
# Note: we could do it with blackbox, but as pings require privileges, it's better to grant them
# to a smaller, more focused container. This one only handles ICMP ping checks, and only from the configuration file
task "ping-exporter" {
driver = "[[ $e.nomad.driver ]]"
config {
image = "[[ $e.image ]]"
readonly_rootfs = true
pids_limit = 30
# Pings require privileges
privileged = true
userns_mode = "host"
command = "ping_exporter"
args = [
"--web.listen-address=127.0.0.1:9427",
"--config.path=/local/config.yml"
]
}
[[ template "common/file_env" $e ]]
template {
data = <<_EOT
[[ template "monitoring/ping_exporter/config.yml" $e ]]
_EOT
destination = "local/config.yml"
}
[[ template "common/resources" $e ]]
}
[[- end ]]
[[- if or (gt (len $c.blackbox.tcp_probes) 0) (gt (len $c.blackbox.http_probes) 0) ]]
[[- $e := merge $c.blackbox $c ]]
# Blackbox exporter will probe http/tcp targets and expose them
# for prometheus
task "blackbox-exporter" {
driver = "[[ $e.nomad.driver ]]"
config {
image = "[[ $e.image ]]"
readonly_rootfs = true
pids_limit = 30
}
[[ template "common/file_env" $e ]]
[[ template "common/resources" $e ]]
}
[[- end ]]
# Export consul services status to prometheus
task "consul-exporter" {
[[- $e := merge $c.consul $c ]]
driver = "[[ $e.nomad.driver ]]"
config {
image = "[[ $e.image ]]"
readonly_rootfs = true
pids_limit = 30
command = "/local/consul-exporter"
}
[[ template "common/file_env" $e ]]
[[ template "common/vault.policies" $e ]]
template {
data = <<_EOT
[[ template "monitoring/consul-exporter/start.sh" $e ]]
_EOT
destination = "local/consul-exporter"
perms = 755
}
template {
data = <<_EOT
CONSUL_HTTP_TOKEN={{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" }}{{ .Data.token }}{{ end }}
_EOT
destination = "secrets/.consul.env"
uid = 100000
gid = 100000
perms = 400
env = true
}
[[ template "common/resources" $e ]]
}
# The cluster metrics proxy exposes Prometheus metrics from the various nodes of the cluster
# (Nomad, Consul and Vault)
# It also exposes the other exporters' metrics with mTLS
task "cluster-metrics-proxy" {
[[- $e := merge $c.cluster $c ]]
driver = "[[ $e.nomad.driver ]]"
user = 8685
lifecycle {
hook = "poststart"
sidecar = true
}
config {
image = "[[ $e.image ]]"
readonly_rootfs = true
pids_limit = 30
# Mount the config in nginx conf dir
volumes = [
"secrets/metrics.conf:/etc/nginx/conf.d/default.conf"
]
[[ template "common/tmpfs" "/tmp" ]]
}
[[ template "common/vault.policies" $e ]]
# This is the main nginx configuration, which will proxypass requests to the real metrics endpoints
template {
data =<<_EOT
[[ template "monitoring/cluster-exporter/nginx.conf" $e ]]
_EOT
destination = "secrets/metrics.conf"
perms = "0440"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# Get certificate to add mTLS to metrics endpoints
template {
data =<<_EOT
{{- with pkiCert "[[ .prometheus.vault_pki ]]/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
change_mode = "signal"
change_signal = "SIGHUP"
}
# Get the CA for the monitoring PKI
template {
data =<<_EOT
{{ with secret "[[ .vault.root ]]pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Get a Nomad client certificate
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.nomad.[[ .consul.domain ]]" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
_EOT
destination = "secrets/nomad_client_bundle.pem"
perms = "0400"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# The CA chain for Nomad
template {
data = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/nomad_ca.crt"
}
# Same for Consul
template {
data = <<_EOT
{{- with pkiCert "pki/consul/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.consul.[[ .consul.domain ]]" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
_EOT
destination = "secrets/consul_client_bundle.pem"
perms = "0400"
uid = 108685
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
template {
data = <<_EOT
{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/consul_ca.crt"
}
[[ template "common/resources" $e ]]
}
}
}

376
monitoring.nomad.hcl Normal file
View File

@ -0,0 +1,376 @@
job "[[ .instance ]]" {
[[ template "common/job_start" . ]]
# Metrics is running prometheus and various exporters
group "metrics" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]
shutdown_delay = "6s"
count = [[ $c.count ]]
network {
mode = "bridge"
port "metrics" {}
}
[[ template "common/volumes" $c ]]
service {
name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
port = 9090
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "15s"
timeout = "8s"
check_restart {
limit = 10
grace = "5m"
}
}
tags = [
[[ template "common/traefik_tags" $c ]]
]
}
[[ template "common/task.metrics_proxy" $c ]]
# The main prometheus task
task "prometheus" {
driver = "[[ $c.nomad.driver ]]"
leader = true
config {
image = "[[ $c.image ]]"
readonly_rootfs = true
pids_limit = 200
command = "prometheus"
args = [
"--config.file=/local/prometheus.yml",
"--log.level=debug",
"--web.listen-address=127.0.0.1:9090",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.time=[[ $c.retention ]]",
"--web.console.libraries=/opt/prometheus/console_libraries",
"--web.console.templates=/opt/prometheus/consoles",
"--web.external-url=[[ $c.public_url ]]",
"--web.route-prefix=[[ if eq "" (urlParse $c.public_url).Path ]]/[[ else ]](urlParse $c.public_url).Path[[ end ]]"
]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/artifacts" $c ]]
# Main configuration for prometheus
template {
data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
destination = "local/prometheus.yml"
uid = 100000
gid = 109090
perms = 640
change_mode = "signal"
change_signal = "SIGHUP"
}
# Alert rules
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
template {
data = <<_EOT
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
_EOT
destination = "local/rules/[[ . ]]"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
[[- if file.Exists "prometheus/rules" ]]
[[- range (file.ReadDir "prometheus/rules") ]]
template {
data = <<_EOT
[[ file.Read (printf "prometheus/rules/%s" .) ]]
_EOT
destination = "local/rules/[[ . ]]"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
[[- range $k, $v := $c.alert_rules ]]
artifact {
source = "[[ $v.url ]]"
destination = "local/rules/[[ $k ]].yml"
mode = "file"
}
[[- end ]]
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus"
(printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/prometheus.bundle.pem"
uid = 100000
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
# Persistent data
volume_mount {
volume = "data"
destination = "/data"
}
[[ template "common/resources" $c ]]
}
}
group "alerts" {
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
count = [[ $c.count ]]
network {
mode = "bridge"
port "web-tls" {}
port "cluster" {}
port "metrics" {}
}
[[ template "common/volumes" $c ]]
# This service is used for the different instances of alertmanager to communicate
service {
name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
service {
name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]"
port = "web-tls"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
}
}
# This service is exposed through the service mesh
# and can be used to reach the web interface through Traefik
service {
name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
port = 9093
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
check {
name = "health"
type = "http"
expose = true
path = "/-/healthy"
interval = "20s"
timeout = "8s"
check_restart {
limit = 12
grace = "30s"
}
}
tags = [
[[ template "common/traefik_tags" $c ]]
]
}
[[ template "common/task.metrics_proxy" $c ]]
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "tls-proxy" {
driver = "[[ $c.nomad.driver ]]"
user = 9093
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
readonly_rootfs = true
pids_limit = 30
volumes = [
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
]
[[ template "common/tmpfs" "/tmp" ]]
}
[[ template "common/vault.policies" $c ]]
lifecycle {
hook = "poststart"
sidecar = true
}
template {
data = <<_EOT
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
_EOT
destination = "local/alertmanager.conf"
}
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 100000
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
resources {
cpu = 10
memory = 18
}
}
# The main alertmanager task
task "alertmanager" {
driver = "[[ $c.nomad.driver ]]"
leader = true
config {
image = "[[ $c.image ]]"
readonly_rootfs = true
pids_limit = 200
command = "/local/alertmanager"
}
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
template {
data = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Invalid custom config, using template only
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
[[- end ]]
_EOT
destination = "secrets/alertmanager.yml"
}
template {
data = <<_EOT
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
_EOT
destination = "local/cluster_tls.yml"
}
template {
data = <<_EOT
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
_EOT
destination = "local/web_tls.yml"
}
template {
data = <<_EOT
[[ template "monitoring/alertmanager/start.sh" $c ]]
_EOT
destination = "local/alertmanager"
uid = 100000
gid = 100000
perms = "0755"
}
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/alertmanager.bundle.pem"
uid = 109093
gid = 109090
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
volume_mount {
volume = "data"
destination = "/data"
}
[[ template "common/resources" $c ]]
}
}
}
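This job file still contains [[ ]] template placeholders, so it has to be rendered by the bundle tooling before Nomad will accept it. Once rendered, the usual workflow applies; file and job names below assume instance=monitoring and are only illustrative:

# monitoring.rendered.nomad.hcl is a hypothetical name for the rendered output
nomad job plan monitoring.rendered.nomad.hcl
nomad job run monitoring.rendered.nomad.hcl
# Check that both groups (metrics and alerts) got placed
nomad job status monitoring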

View File

@ -0,0 +1,5 @@
global:
smtp_from: '[[ .email.from ]]'
smtp_smarthost: localhost:25
smtp_require_tls: false

View File

@ -0,0 +1,10 @@
tls_server_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
client_auth_type: RequireAndVerifyClientCert
client_ca_file: /local/monitoring.ca.pem
tls_client_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
ca_file: /local/monitoring.ca.pem

View File

@ -0,0 +1,13 @@
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
allow 127.0.0.1;
deny all;
}
}

View File

@ -0,0 +1,19 @@
#!/bin/sh
set -euo pipefail
exec alertmanager \
--config.file=/secrets/alertmanager.yml \
--storage.path=/data \
--web.external-url=[[ .public_url ]] \
--web.route-prefix=[[ if eq "" (urlParse .public_url).Path ]]/[[ else ]][[ (urlParse .public_url).Path ]][[ end ]] \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
--cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "[[ .instance ]]-am-gossip[[ .consul.suffix ]]" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
--cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}
{{- end -}}
--cluster.tls-config=/local/cluster_tls.yml \
--web.config.file=/local/web_tls.yml
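Before shipping changes to the alertmanager.yml template, the rendered result can be validated with amtool, which is included in the AlertManager release tarball used by the image above (the file path is a placeholder):

amtool check-config ./rendered/alertmanager.yml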

View File

@ -0,0 +1,5 @@
tls_server_config:
cert_file: /secrets/alertmanager.bundle.pem
key_file: /secrets/alertmanager.bundle.pem
client_auth_type: RequireAndVerifyClientCert
client_ca_file: /local/monitoring.ca.pem

View File

@ -0,0 +1,170 @@
# Cluster exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
set $consul_token "{{ with secret "consul/creds/[[ .instance ]]-cluster-exporter" }}{{ .Data.token }}{{ end }}";
{{- range service "nomad-client" }}
location /nomad-client/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad;
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
}
{{- end }}
{{- range service "nomad" }}
{{- if .Tags | contains "http" }}
location /nomad/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad;
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
}
{{- end }}
{{- end }}
{{- range service "consul" }}
location /consul/{{ .Node }} {
proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus;
proxy_set_header X-Consul-Token $consul_token;
proxy_ssl_certificate /secrets/consul_client_bundle.pem;
proxy_ssl_certificate_key /secrets/consul_client_bundle.pem;
proxy_ssl_verify off;
proxy_ssl_trusted_certificate /local/consul_ca.crt;
}
{{- end }}
{{- range service "vault" }}
location /vault/{{ .Node }} {
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus;
proxy_ssl_verify on;
proxy_ssl_trusted_certificate /etc/ssl/cert.pem;
proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
proxy_set_header X-Real-IP "$remote_addr";
proxy_set_header X-Forwarded-Proto "$scheme";
proxy_set_header X-Scheme "$scheme";
proxy_set_header X-Forwarded-Host "$host";
proxy_set_header X-Forwarded-Port "$server_port";
}
{{- end }}
location / {
root /usr/share/nginx/html;
index index.html;
}
}
# Ping exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9427;
}
}
# Blackbox exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location / {
proxy_pass http://127.0.0.1:9115;
}
}
# Consul exporter
server {
listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9107;
}
}

View File

@ -0,0 +1,8 @@
#!/bin/sh
set -euo pipefail
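# Listen on localhost only: the nginx task of the exporters job terminates mTLS
# on the public port and proxies /metrics to 127.0.0.1:9107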
exec consul_exporter \
--web.listen-address=127.0.0.1:9107 \
--consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \
--consul.request-limit=20

View File

@ -0,0 +1,4 @@
targets:
[[- range $idx, $probe := .probes ]]
- [[ $probe ]]
[[- end ]]

View File

@ -0,0 +1,237 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
#query_log_file: /dev/stdout
external_labels:
cluster: [[ .consul.domain ]]
env: [[ getenv "NOMAD_NAMESPACE" ]]
rule_files:
- /local/rules/*.yml
alerting:
alertmanagers:
- scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Only keep alertmanagers
- source_labels: [__meta_consul_service]
action: keep
regex: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
scrape_configs:
[[- range $k, $v := .jobs ]]
- job_name: [[ $k ]]
static_configs:
- targets:
[[- range $target := $v.targets ]]
- [[ $target ]]
[[- end ]]
[[- end ]]
[[- if gt (len .exporters.blackbox.http_probes) 0 ]]
# Blackbox Exporter HTTP targets
- job_name: http_probe
metrics_path: /probe
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
params:
module: ["http_2xx"]
static_configs:
- targets:
[[- range $http_probe := .exporters.blackbox.http_probes ]]
- [[ $http_probe ]]
[[- end ]]
relabel_configs:
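# Blackbox indirection: the probed URL becomes the "target" query parameter
# and the instance label, while the scrape itself is sent to the
# blackbox-exporter address discovered from Consul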
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
[[- end ]]
[[- if gt (len .exporters.blackbox.tcp_probes) 0 ]]
# Blackbox Exporter TCP targets
- job_name: tcp_probe
metrics_path: /probe
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
params:
module: ["tcp_connect"]
static_configs:
- targets:
[[- range $target := .exporters.blackbox.tcp_probes ]]
- [[ $target ]]
[[- end ]]
relabel_configs:
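# Same indirection as for the HTTP probes: the probed endpoint becomes the
# target parameter and the instance label, the scrape goes to the blackbox-exporter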
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
[[- end ]]
# Cluster services
- job_name: cluster-services
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Drop anything that is not Nomad, Consul or Vault
# Other services are monitored by another job
- source_labels: [__meta_consul_service]
action: keep
regex: (nomad(\-client)?|consul|vault)
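# Scrape through the cluster-exporter proxy, which exposes
# /nomad/<node>, /consul/<node> and /vault/<node> endpoints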
- source_labels: [__meta_consul_service,__meta_consul_node]
regex: (.+);(.+)
replacement: ${1}/${2}
target_label: __metrics_path__
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: {{ range $idx, $instance := service "[[ .instance ]]-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
target_label: __address__
# Rewrite the job label to the name of the service
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: ${1}
target_label: job
# Rewrite the instance label
- source_labels: [__meta_consul_node]
regex: (.+)
replacement: ${1}
target_label: instance
# regular services discovered from the Consul Catalog
- job_name: consul-services
scheme: https
tls_config:
ca_file: /local/monitoring.ca.pem
cert_file: /secrets/prometheus.bundle.pem
key_file: /secrets/prometheus.bundle.pem
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Drop sidecar services to prevent duplicates. Sidecars themselves are handled in another job
- source_labels: [__meta_consul_service]
action: drop
regex: (.+)-sidecar-proxy
# Drop Nomad, Consul and Vault, which are already handled by the cluster-services job
- source_labels: [__meta_consul_service]
action: drop
regex: (nomad(\-client)?|consul|vault)
# Only keep services having a metrics-port set
- source_labels: [__meta_consul_service_metadata_metrics_port]
regex: \d+
action: keep
# Get metrics path from metadata
- source_labels: [__meta_consul_service_metadata_metrics_path]
target_label: __metrics_path__
regex: (.+)
# Rewrite the scheme if needed
- source_labels: [__meta_consul_service_metadata_metrics_scheme]
regex: (https?)
replacement: ${1}
target_label: __scheme__
# Rewrite the address to use the metrics port
- source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: ${1}:${2}
target_label: __address__
# Rewrite the job label to the name of the service
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: ${1}
target_label: job
# Set the default alloc to 0 if not set
- source_labels: [__meta_consul_service_metadata_alloc]
regex: ^$
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: ${1}-${2}
target_label: instance
# Envoy sidecar proxies discovered from Consul
- job_name: consul-envoy-services
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Only keep sidecar services with an envoy-metrics-port defined
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
action: keep
regex: (.+)-sidecar-proxy;\d+
# Rewrite the address to use the envoy-metrics-port
- source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: ${1}:${2}
target_label: __address__
# Rewrite the job label
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: ${1}
target_label: job
# Set the default alloc to 0 if not set
- source_labels: [__meta_consul_service_metadata_alloc]
regex: ^$
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: ${1}-${2}
target_label: instance

View File

@ -0,0 +1,69 @@
# vi: syntax=yaml
groups:
- name: Blackbox
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,54 @@
# vi: syntax=yaml
groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed
# Note: don't check sidecar service health, as sidecars can report a critical state while the main task is pending (e.g. waiting for a volume to be available)
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
for: 2m
labels:
severity: critical
annotations:
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul missing master node (node {{ $labels.node }})
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: 'consul_health_node_status{status="critical"} == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul agent unhealthy (node {{ $labels.node }})
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceWarning
expr: 'consul_health_service_status{status="warning"} == 1'
for: 2m
labels:
severity: warning
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceCritical
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,16 @@
# vi: syntax=yaml
groups:
- name: JVM
rules:
- alert: JvmMemoryFillingUp
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
for: 2m
labels:
severity: warning
annotations:
summary: JVM memory filling up (instance {{ $labels.instance }})
description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,51 @@
# vi: syntax=yaml
groups:
- name: Nomad
rules:
- alert: NomadJobFailed
expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobLost
expr: 'nomad_nomad_job_summary_lost > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobQueued
expr: 'nomad_nomad_job_summary_queued > 0'
for: 3m
labels:
severity: warning
annotations:
summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadBlockedEvaluation
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadTaskOOM
expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
for: 0m
labels:
severity: warning
annotations:
summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,25 @@
# vi: syntax=yaml
groups:
- name: Ping
rules:
- alert: HostDown
expr: ping_loss_ratio == 1
for: 3m
labels:
severity: critical
annotations:
summary: Host down (host {{ $labels.target }})
description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PingLoss
expr: |
avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
for: 0m
labels:
severity: warning
annotations:
summary: High packet loss (host {{ $labels.target }})
description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,80 @@
# vi: syntax=yaml
groups:
- name: Postgres
rules:
- alert: PostgresqlDown
expr: 'pg_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresTooManyRestarts
expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
for: 1m
labels:
severity: warning
annotations:
summary: Postgres too many restarts (instance {{ $labels.instance }})
description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PostgresqlHighRollbackRate
# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Postgresql high rollback rate (instance {{ $labels.instance }})
# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,89 @@
# vi: syntax=yaml
groups:
# Prometheus
- name: Prometheus
rules:
- alert: PrometheusTargetMissing
expr: up{job!~"sftp-PR\\d+"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
for: 1m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 2m
labels:
severity: critical
annotations:
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,16 @@
# vi: syntax=yaml
groups:
- name: Traefik
rules:
- alert: TraefikHighHttp5xxErrorRateService
expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,16 @@
# vi: syntax=yaml
groups:
- name: HashicorpVault
rules:
- alert: VaultSealed
expr: 'vault_core_unsealed == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Vault sealed (instance {{ $labels.instance }})
description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

127
variables.yml Normal file
View File

@ -0,0 +1,127 @@
---
instance: monitoring
vault:
pki:
path: '[[ .prometheus.vault_pki ]]'
ou: Monitoring
monitoring:
exporters:
count: 1
ping:
version: 1.1.0
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
env: {}
resources:
cpu: 10
memory: 30
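# Hosts to ping, rendered into the ping-exporter target list (e.g. probes: ['192.0.2.1', 'gw.example.org'])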
probes: []
blackbox:
version: 0.24.0
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
env: {}
resources:
cpu: 10
memory: 50
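# Endpoints probed by the blackbox-exporter: http_probes are URLs checked with the
# http_2xx module, tcp_probes are host:port pairs checked with tcp_connect
# (e.g. http_probes: ['https://www.example.org'])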
tcp_probes: []
http_probes: []
consul:
version: 0.11.0
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
env: {}
resources:
cpu: 20
memory: 64
vault:
policies:
- '[[ .instance ]]-consul-exporter'
cluster:
image: nginxinc/nginx-unprivileged:alpine
env: {}
resources:
cpu: 10
memory: 18
vault:
policies:
- '[[ .instance ]]-cluster-exporter'
- metrics
prometheus:
version: 2.50.1
count: 1
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
env: {}
resources:
cpu: 200
memory: 768
volumes:
data:
type: csi
source: '[[ .instance ]]-prometheus-data'
per_alloc: true
vault:
policies:
- '[[ .instance ]]-prometheus'
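# Additional static scrape jobs rendered into prometheus.yml, for example:
# jobs:
#   node:
#     targets:
#       - 192.0.2.10:9100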
jobs: {}
alert_rules: {}
# alert_rules:
#   prometheus:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
public_url: https://prometheus.example.org
traefik:
enabled: true
router: prometheus
retention: 30d
prometheus:
enabled: true
metrics_url: http://localhost:9090/metrics
alertmanager:
count: 1
version: 0.27.0
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1'
env: {}
resources:
cpu: 50
memory: 80
public_url: https://alerte.example.org
traefik:
enabled: true
router: alertmanager
strip_prefix: false
volumes:
data:
source: '[[ .instance ]]-alertmanager-data'
type: csi
per_alloc: true
prometheus:
metrics_url: http://127.0.0.1:9093/metrics
vault:
policies:
- metrics
- '[[ .instance ]]-alertmanager'
email:
from: alertmanager@[[ .consul.domain ]]
custom_config: ""
prometheus:
enabled: true

View File

@ -0,0 +1,3 @@
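# Issue certificates from the metrics role of the monitoring PKI
# (used by tasks exposing their metrics endpoint over TLS)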
path "[[ .prometheus.vault_pki ]]/issue/metrics" {
capabilities = ["update"]
}

View File

@ -0,0 +1,8 @@
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
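# Issue the alertmanager certificate from the monitoring PKI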
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" {
capabilities = ["update"]
}
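# Read alertmanager secrets from the KV store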
path "[[ .vault.root ]]kv/service/[[ .instance ]]/alertmanager" {
capabilities = ["read"]
}

View File

@ -0,0 +1,20 @@
[[- $c := merge .monitoring.exporters.cluster .monitoring.exporters .monitoring . ]]
# Read vault metrics
path "sys/metrics" {
capabilities = ["read", "list"]
}
# Get a cert for Nomad
path "pki/nomad/issue/[[ .instance ]]-cluster-exporter" {
capabilities = ["update"]
}
# Get a cert for Consul
path "pki/consul/issue/[[ .instance ]]-cluster-exporter" {
capabilities = ["update"]
}
# Get a consul token
path "consul/creds/[[ .instance ]]-cluster-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,4 @@
[[- $c := merge .monitoring.exporters.consul .monitoring.exporters .monitoring . ]]
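# Get a Consul token for consul_exporter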
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,12 @@
[[- $c := merge .monitoring.prometheus .monitoring . ]]
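# Issue the prometheus certificate from the monitoring PKI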
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus" {
capabilities = ["update"]
}
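# Read prometheus secrets from the KV store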
path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/prometheus" {
capabilities = ["read"]
}
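# Get a Consul token, used by the consul_sd_configs in prometheus.yml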
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-prometheus" {
capabilities = ["read"]
}