From 65441a4a9edbe4b4f45246574af91cfd26be1945 Mon Sep 17 00:00:00 2001
From: Daniel Berteaud
Date: Tue, 19 Mar 2024 13:53:28 +0100
Subject: [PATCH] Start: prometheus + alertmanager + exporters

---
 TODO.md                                       |   43 +
 bundles.yml                                   |    4 +
 .../monitoring-alertmanager.hcl               |    3 +
 .../monitoring-prometheus.hcl                 |    3 +
 .../monitoring-alertmanager.hcl               |   16 +
 .../monitoring-prometheus.hcl                 |   34 +
 consul/policies/monitoring-prometheus.hcl     |    9 +
 example/README.md                             |    3 +
 example/TODO.md                               |   43 +
 .../monitoring-alertmanager.hcl               |    3 +
 .../monitoring-prometheus.hcl                 |    3 +
 .../monitoring-alertmanager.hcl               |   16 +
 .../monitoring-prometheus.hcl                 |   34 +
 .../consul/policies/monitoring-prometheus.hcl |    9 +
 example/images/alertmanager/Dockerfile        |   41 +
 example/images/blackbox-exporter/Dockerfile   |   29 +
 .../blackbox-exporter/root/etc/blackbox.yml   |   65 +
 example/images/consul-exporter/Dockerfile     |   21 +
 example/images/ping-exporter/Dockerfile       |   24 +
 example/images/ping-exporter/root/config.yml  |    4 +
 example/images/prometheus/Dockerfile          |   48 +
 example/init/consul                           |   17 +
 example/init/pki                              |  150 ++
 example/monitoring-exporters.nomad.hcl        |  419 +++++
 example/monitoring.nomad.hcl                  | 1344 +++++++++++++++++
 example/vault/policies/metrics.hcl            |    3 +
 .../policies/monitoring-alertmanager.hcl      |    8 +
 .../policies/monitoring-cluster-exporter.hcl  |   20 +
 .../policies/monitoring-consul-exporter.hcl   |    4 +
 .../vault/policies/monitoring-prometheus.hcl  |   12 +
 images/alertmanager/Dockerfile                |   41 +
 images/blackbox-exporter/Dockerfile           |   29 +
 .../blackbox-exporter/root/etc/blackbox.yml   |   65 +
 images/consul-exporter/Dockerfile             |   21 +
 images/ping-exporter/Dockerfile               |   24 +
 images/ping-exporter/root/config.yml          |    4 +
 images/prometheus/Dockerfile                  |   48 +
 init/consul                                   |   17 +
 init/pki                                      |   69 +
 monitoring-exporters.nomad.hcl                |  253 ++++
 monitoring.nomad.hcl                          |  376 +++++
 templates/alertmanager/alertmanager.yml       |    5 +
 templates/alertmanager/cluster_tls.yml        |   10 +
 templates/alertmanager/nginx.conf             |   13 +
 templates/alertmanager/start.sh               |   19 +
 templates/alertmanager/web_tls.yml            |    5 +
 templates/cluster-exporter/nginx.conf         |  170 +++
 templates/consul-exporter/start.sh            |    8 +
 templates/ping_exporter/config.yml            |    4 +
 templates/prometheus/prometheus.yml           |  237 +++
 templates/prometheus/rules/blackbox.yml       |   69 +
 templates/prometheus/rules/consul.yml         |   54 +
 templates/prometheus/rules/jvm.yml            |   16 +
 templates/prometheus/rules/nomad.yml          |   51 +
 templates/prometheus/rules/ping.yml           |   25 +
 templates/prometheus/rules/postgres.yml       |   80 +
 templates/prometheus/rules/prometheus.yml     |   89 ++
 templates/prometheus/rules/traefik.yml        |   16 +
 templates/prometheus/rules/vault.yml          |   16 +
 variables.yml                                 |  127 ++
 vault/policies/metrics.hcl                    |    3 +
 vault/policies/monitoring-alertmanager.hcl    |    8 +
 .../policies/monitoring-cluster-exporter.hcl  |   20 +
 vault/policies/monitoring-consul-exporter.hcl |    4 +
 vault/policies/monitoring-prometheus.hcl      |   12 +
 65 files changed, 4440 insertions(+)
 create mode 100644 TODO.md
 create mode 100644 bundles.yml
 create mode 100644 consul/config/service-defaults/monitoring-alertmanager.hcl
 create mode 100644 consul/config/service-defaults/monitoring-prometheus.hcl
 create mode 100644 consul/config/service-intentions/monitoring-alertmanager.hcl
 create mode 100644 consul/config/service-intentions/monitoring-prometheus.hcl
 create mode 100644 consul/policies/monitoring-prometheus.hcl
 create mode 100644 example/README.md
 create mode 100644 example/TODO.md
 create mode 100644 example/consul/config/service-defaults/monitoring-alertmanager.hcl
 create mode 100644 example/consul/config/service-defaults/monitoring-prometheus.hcl
 create mode 100644 example/consul/config/service-intentions/monitoring-alertmanager.hcl
 create mode 100644 example/consul/config/service-intentions/monitoring-prometheus.hcl
 create mode 100644 example/consul/policies/monitoring-prometheus.hcl
 create mode 100644 example/images/alertmanager/Dockerfile
 create mode 100644 example/images/blackbox-exporter/Dockerfile
 create mode 100644 example/images/blackbox-exporter/root/etc/blackbox.yml
 create mode 100644 example/images/consul-exporter/Dockerfile
 create mode 100644 example/images/ping-exporter/Dockerfile
 create mode 100644 example/images/ping-exporter/root/config.yml
 create mode 100644 example/images/prometheus/Dockerfile
 create mode 100755 example/init/consul
 create mode 100755 example/init/pki
 create mode 100644 example/monitoring-exporters.nomad.hcl
 create mode 100644 example/monitoring.nomad.hcl
 create mode 100644 example/vault/policies/metrics.hcl
 create mode 100644 example/vault/policies/monitoring-alertmanager.hcl
 create mode 100644 example/vault/policies/monitoring-cluster-exporter.hcl
 create mode 100644 example/vault/policies/monitoring-consul-exporter.hcl
 create mode 100644 example/vault/policies/monitoring-prometheus.hcl
 create mode 100644 images/alertmanager/Dockerfile
 create mode 100644 images/blackbox-exporter/Dockerfile
 create mode 100644 images/blackbox-exporter/root/etc/blackbox.yml
 create mode 100644 images/consul-exporter/Dockerfile
 create mode 100644 images/ping-exporter/Dockerfile
 create mode 100644 images/ping-exporter/root/config.yml
 create mode 100644 images/prometheus/Dockerfile
 create mode 100755 init/consul
 create mode 100755 init/pki
 create mode 100644 monitoring-exporters.nomad.hcl
 create mode 100644 monitoring.nomad.hcl
 create mode 100644 templates/alertmanager/alertmanager.yml
 create mode 100644 templates/alertmanager/cluster_tls.yml
 create mode 100644 templates/alertmanager/nginx.conf
 create mode 100644 templates/alertmanager/start.sh
 create mode 100644 templates/alertmanager/web_tls.yml
 create mode 100644 templates/cluster-exporter/nginx.conf
 create mode 100644 templates/consul-exporter/start.sh
 create mode 100644 templates/ping_exporter/config.yml
 create mode 100644 templates/prometheus/prometheus.yml
 create mode 100644 templates/prometheus/rules/blackbox.yml
 create mode 100644 templates/prometheus/rules/consul.yml
 create mode 100644 templates/prometheus/rules/jvm.yml
 create mode 100644 templates/prometheus/rules/nomad.yml
 create mode 100644 templates/prometheus/rules/ping.yml
 create mode 100644 templates/prometheus/rules/postgres.yml
 create mode 100644 templates/prometheus/rules/prometheus.yml
 create mode 100644 templates/prometheus/rules/traefik.yml
 create mode 100644 templates/prometheus/rules/vault.yml
 create mode 100644 variables.yml
 create mode 100644 vault/policies/metrics.hcl
 create mode 100644 vault/policies/monitoring-alertmanager.hcl
 create mode 100644 vault/policies/monitoring-cluster-exporter.hcl
 create mode 100644 vault/policies/monitoring-consul-exporter.hcl
 create mode 100644 vault/policies/monitoring-prometheus.hcl

diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..f2e1157
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,43 @@
+- ~~Split exporters into a dedicated job (so they can run on a specific node_pool)~~
+- Create a system-type monitoring-agent job with vector + node-exporter
+- images
+  - ~~prometheus~~
+  - ~~ping-exporter~~
+  - ~~blackbox-exporter~~
+  - ~~consul-exporter~~
+  - vector
+  - loki
+  - grafana
+  - nomad-vector-logger
+
+- pki roles:
+  - ~~monitoring -> prom~~
+  - ~~consul -> prom~~
+  - ~~monitoring -> am~~
+
+- vault pol
+  - ~~prometheus~~
+  - ~~issue prom on monitoring~~
+  - ~~issue prom on consul~~
+  - ~~consul-exporter~~
+  - ~~issue consul-exporter on consul~~
+  - ~~alertmanager~~
+  - ~~issue alertmanager on monitoring~~
+
+- consul defaults & intentions
+  - ~~prometheus~~
+  - ~~alertmanager~~
+  - loki
+
+- tasks
+  - ~~alertmanager~~
+  - vector-aggregator
+  - vector-agent (in the agent job)
+  - loki (modularize or keep it monolithic?)
+  - grafana
+  - ~~cluster-metrics (exporters job)~~
+
+- questions
+  - prom rules: keep or move to a -conf bundle ?
+  - ~~am alert config (recipient + routing)~~
+  - ~~http and tcp probes, as exporters are now in a dedicated job~~
diff --git a/bundles.yml b/bundles.yml
new file mode 100644
index 0000000..5b9120e
--- /dev/null
+++ b/bundles.yml
@@ -0,0 +1,4 @@
+---
+
+dependencies:
+  - url: ../common.git
diff --git a/consul/config/service-defaults/monitoring-alertmanager.hcl b/consul/config/service-defaults/monitoring-alertmanager.hcl
new file mode 100644
index 0000000..c25108a
--- /dev/null
+++ b/consul/config/service-defaults/monitoring-alertmanager.hcl
@@ -0,0 +1,3 @@
+Kind = "service-defaults"
+Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
+Protocol = "http"
diff --git a/consul/config/service-defaults/monitoring-prometheus.hcl b/consul/config/service-defaults/monitoring-prometheus.hcl
new file mode 100644
index 0000000..2b6d3cf
--- /dev/null
+++ b/consul/config/service-defaults/monitoring-prometheus.hcl
@@ -0,0 +1,3 @@
+Kind = "service-defaults"
+Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
+Protocol = "http"
diff --git a/consul/config/service-intentions/monitoring-alertmanager.hcl b/consul/config/service-intentions/monitoring-alertmanager.hcl
new file mode 100644
index 0000000..8a29522
--- /dev/null
+++ b/consul/config/service-intentions/monitoring-alertmanager.hcl
@@ -0,0 +1,16 @@
+Kind = "service-intentions"
+Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
+Sources = [
+  {
+    Name = "[[ (merge .monitoring.alertmanager .).traefik.instance ]]"
+    Permissions = [
+      {
+        Action = "allow"
+        HTTP {
+          PathPrefix = "[[ if eq "" (urlParse .monitoring.alertmanager.public_url).Path ]]/[[ else ]][[ (urlParse .monitoring.alertmanager.public_url).Path ]][[ end ]]"
+          Methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
+        }
+      }
+    ]
+  }
+]
diff --git a/consul/config/service-intentions/monitoring-prometheus.hcl b/consul/config/service-intentions/monitoring-prometheus.hcl
new file mode 100644
index 0000000..65f4f23
--- /dev/null
+++ b/consul/config/service-intentions/monitoring-prometheus.hcl
@@ -0,0 +1,34 @@
+Kind = "service-intentions"
+Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
+Sources = [
+  {
+    Name = "[[ (merge .monitoring.prometheus .).traefik.instance ]]"
+    Permissions = [
+      {
+        Action = "allow"
+        HTTP {
+          Methods = ["GET", "HEAD", "POST"]
+        }
+      }
+    ]
+  },
+  {
+    Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
+    Permissions = [
+      {
+        # Deny access to the admin API from Grafana
+        Action = "deny"
+        HTTP {
+          PathPrefix = "/api/v1/admin"
+        }
+      },
+      {
+        Action = "allow"
+        HTTP {
+          PathPrefix = "/api/v1"
+          Methods = ["GET", "HEAD", "POST", "PUT"]
+        }
+      }
+    ]
+  }
+]
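
Once rendered (the [[ ... ]] markers are per-instance template syntax; the example/ tree below shows the rendered form), these config entries can be loaded with the Consul CLI. A minimal sketch, assuming rendered files on disk (names are illustrative) and a token allowed to write config entries:

    consul config write monitoring-prometheus-defaults.hcl
    consul config write monitoring-prometheus-intentions.hcl
    consul config read -kind service-intentions -name monitoring-prometheus
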
diff --git a/consul/policies/monitoring-prometheus.hcl b/consul/policies/monitoring-prometheus.hcl
new file mode 100644
index 0000000..d96bc6d
--- /dev/null
+++ b/consul/policies/monitoring-prometheus.hcl
@@ -0,0 +1,9 @@
+agent_prefix "" {
+  policy = "read"
+}
+node_prefix "" {
+  policy = "read"
+}
+service_prefix "" {
+  policy = "read"
+}
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..31cbcec
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,3 @@
+# monitoring
+
+Monitoring stack
\ No newline at end of file
diff --git a/example/TODO.md b/example/TODO.md
new file mode 100644
index 0000000..e4cee54
--- /dev/null
+++ b/example/TODO.md
@@ -0,0 +1,43 @@
+- ~~Split exporters into a dedicated job (so they can run on a specific node_pool)~~
+- Create a system-type monitoring-agent job with vector + node-exporter
+- images
+  - prometheus
+  - ping-exporter
+  - blackbox-exporter
+  - consul-exporter
+  - vector
+  - loki
+  - grafana
+  - nomad-vector-logger
+
+- pki roles:
+  - ~~monitoring -> prom~~
+  - ~~consul -> prom~~
+  - ~~monitoring -> am~~
+
+- vault pol
+  - prometheus
+  - ~~issue prom on monitoring~~
+  - ~~issue prom on consul~~
+  - consul-exporter
+  - issue consul-exporter on consul
+  - alertmanager
+  - ~~issue alertmanager on monitoring~~
+
+- consul defaults & intentions
+  - ~~prometheus~~
+  - ~~alertmanager~~
+  - loki
+
+- tasks
+  - ~~alertmanager~~
+  - vector-aggregator
+  - vector-agent (in the agent job)
+  - loki (modularize or keep it monolithic?)
+  - grafana
+  - cluster-metrics (exporters job)
+
+- questions
+  - prom rules: keep or move to a -conf bundle ?
+  - ~~am alert config (recipient + routing)~~
+  - http and tcp probes, as exporters are now in a dedicated job
diff --git a/example/consul/config/service-defaults/monitoring-alertmanager.hcl b/example/consul/config/service-defaults/monitoring-alertmanager.hcl
new file mode 100644
index 0000000..0908bb8
--- /dev/null
+++ b/example/consul/config/service-defaults/monitoring-alertmanager.hcl
@@ -0,0 +1,3 @@
+Kind = "service-defaults"
+Name = "monitoring-alertmanager"
+Protocol = "http"
diff --git a/example/consul/config/service-defaults/monitoring-prometheus.hcl b/example/consul/config/service-defaults/monitoring-prometheus.hcl
new file mode 100644
index 0000000..e4820a0
--- /dev/null
+++ b/example/consul/config/service-defaults/monitoring-prometheus.hcl
@@ -0,0 +1,3 @@
+Kind = "service-defaults"
+Name = "monitoring-prometheus"
+Protocol = "http"
diff --git a/example/consul/config/service-intentions/monitoring-alertmanager.hcl b/example/consul/config/service-intentions/monitoring-alertmanager.hcl
new file mode 100644
index 0000000..fedec67
--- /dev/null
+++ b/example/consul/config/service-intentions/monitoring-alertmanager.hcl
@@ -0,0 +1,16 @@
+Kind = "service-intentions"
+Name = "monitoring-alertmanager"
+Sources = [
+  {
+    Name = "traefik"
+    Permissions = [
+      {
+        Action = "allow"
+        HTTP {
+          PathPrefix = "/"
+          Methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
+        }
+      }
+    ]
+  }
+]
diff --git a/example/consul/config/service-intentions/monitoring-prometheus.hcl b/example/consul/config/service-intentions/monitoring-prometheus.hcl
new file mode 100644
index 0000000..54c5e1f
--- /dev/null
+++ b/example/consul/config/service-intentions/monitoring-prometheus.hcl
@@ -0,0 +1,34 @@
+Kind = "service-intentions"
+Name = "monitoring-prometheus"
+Sources = [
+  {
+    Name = "traefik"
+    Permissions = [
+      {
+        Action = "allow"
+        HTTP {
+          Methods = ["GET", "HEAD", "POST"]
+        }
+      }
+    ]
+  },
+  {
+    Name = "monitoring-grafana"
+    Permissions = [
+      {
+        # Deny access to the admin API from Grafana
+        Action = "deny"
+        HTTP {
+          PathPrefix = "/api/v1/admin"
+        }
+      },
+      {
+        Action = "allow"
+        HTTP {
+          PathPrefix = "/api/v1"
+          Methods = ["GET", "HEAD", "POST", "PUT"]
+        }
+      }
+    ]
+  }
+]
diff --git a/example/consul/policies/monitoring-prometheus.hcl b/example/consul/policies/monitoring-prometheus.hcl
new file mode 100644
index 0000000..d96bc6d
--- /dev/null
+++ b/example/consul/policies/monitoring-prometheus.hcl
@@ -0,0 +1,9 @@
+agent_prefix "" {
+  policy = "read"
+}
+node_prefix "" {
+  policy = "read"
+}
+service_prefix "" {
+  policy = "read"
+}
diff --git a/example/images/alertmanager/Dockerfile b/example/images/alertmanager/Dockerfile
new file mode 100644
index 0000000..4e3fbed
--- /dev/null
+++ b/example/images/alertmanager/Dockerfile
@@ -0,0 +1,41 @@
+FROM danielberteaud/alpine:24.3-1 AS builder
+
+ARG AM_VERSION=0.27.0
+
+ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp
+ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp
+RUN set -eux &&\
+    apk --no-cache add \
+      tar \
+    &&\
+    cd /tmp &&\
+    grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
+    tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\
+    mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager
+
+FROM danielberteaud/alpine:24.3-1
+MAINTAINER Daniel Berteaud
+
+ENV PATH=/opt/alertmanager:$PATH
+
+COPY --from=builder /opt/alertmanager /opt/alertmanager
+RUN set -eux &&\
+    addgroup -g 9093 alertmanager &&\
+    adduser --system \
+      --disabled-password \
+      --uid 9093 \
+      --ingroup alertmanager \
+      --home /opt/alertmanager \
+      --no-create-home \
+      --shell /sbin/nologin \
+      alertmanager &&\
+    mkdir /data &&\
+    chown alertmanager:alertmanager /data &&\
+    chmod 700 /data
+
+WORKDIR /opt/alertmanager
+USER alertmanager
+EXPOSE 9093
+CMD [ "alertmanager", \
+      "--config.file=/opt/alertmanager/alertmanager.yml", \
+      "--storage.path=/data" ]
diff --git a/example/images/blackbox-exporter/Dockerfile b/example/images/blackbox-exporter/Dockerfile
new file mode 100644
index 0000000..3771a0f
--- /dev/null
+++ b/example/images/blackbox-exporter/Dockerfile
@@ -0,0 +1,29 @@
+FROM danielberteaud/alpine:24.3-1 AS builder
+
+ARG BLACKBOX_EXPORTER_VERSION=0.24.0
+
+ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
+ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp
+RUN set -eux &&\
+    apk --no-cache add tar gzip &&\
+    cd /tmp &&\
+    grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
+    tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
+    mkdir blackbox &&\
+    mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter
+
+FROM danielberteaud/alpine:24.3-1
+MAINTAINER Daniel Berteaud
+
+ENV BLACKBOX_CONF=/etc/blackbox.yml
+
+COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter
+
+RUN set -eux &&\
+    apk --no-cache upgrade &&\
+    apk --no-cache add ca-certificates curl
+
+COPY root/ /
+
+EXPOSE 9195
+CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"]
diff --git a/example/images/blackbox-exporter/root/etc/blackbox.yml b/example/images/blackbox-exporter/root/etc/blackbox.yml
new file mode 100644
index 0000000..51632ab
--- /dev/null
+++ b/example/images/blackbox-exporter/root/etc/blackbox.yml
@@ -0,0 +1,65 @@
+modules:
+  http_2xx:
+    prober: http
+    http:
+      preferred_ip_protocol: "ip4"
+  http_ssl_no_check:
+    prober: http
+    http:
+      preferred_ip_protocol: "ip4"
+      tls_config:
+        insecure_skip_verify: true
+  http_post_2xx:
+    prober: http
+    http:
+      method: POST
+      preferred_ip_protocol: "ip4"
+  tcp_connect:
+    prober: tcp
+    tcp:
+      preferred_ip_protocol: "ip4"
+  pop3s_banner:
+    prober: tcp
+    tcp:
+      preferred_ip_protocol: "ip4"
+      query_response:
+      - expect: "^+OK"
+      tls: true
+      tls_config:
+        insecure_skip_verify: false
+  grpc:
+    prober: grpc
+    grpc:
+      tls: true
+      preferred_ip_protocol: "ip4"
+  grpc_plain:
+    prober: grpc
+    grpc:
+      preferred_ip_protocol: "ip4"
+      tls: false
+      service: "service1"
+  ssh_banner:
+    prober: tcp
+    tcp:
+      preferred_ip_protocol: "ip4"
+      query_response:
+      - expect: "^SSH-2.0-"
+      - send: "SSH-2.0-blackbox-ssh-check"
+  irc_banner:
+    prober: tcp
+    tcp:
+      preferred_ip_protocol: "ip4"
+      query_response:
+      - send: "NICK prober"
+      - send: "USER prober prober prober :prober"
+      - expect: "PING :([^ ]+)"
+        send: "PONG ${1}"
+      - expect: "^:[^ ]+ 001"
+  icmp:
+    prober: icmp
+  icmp_ttl5:
+    prober: icmp
+    timeout: 5s
+    icmp:
+      ttl: 5
+
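
The modules above can be exercised directly once the exporter is running, which helps when tuning probe definitions. A quick smoke test, assuming the exporter listens on 127.0.0.1:9115 (the port the exporters job below proxies to; note the image's EXPOSE 9195 differs, so adjust to whichever port is actually in use):

    curl 'http://127.0.0.1:9115/probe?module=http_2xx&target=https://example.org'
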
diff --git a/example/images/consul-exporter/Dockerfile b/example/images/consul-exporter/Dockerfile
new file mode 100644
index 0000000..3b02080
--- /dev/null
+++ b/example/images/consul-exporter/Dockerfile
@@ -0,0 +1,21 @@
+FROM danielberteaud/alpine:24.3-1 AS builder
+
+ARG CONSUL_EXPORTER_VERSION=0.11.0
+
+ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
+ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp
+RUN set -eux &&\
+    apk --no-cache add tar gzip &&\
+    cd /tmp &&\
+    grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
+    tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
+    mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter
+
+FROM danielberteaud/alpine:24.3-1
+MAINTAINER Daniel Berteaud
+
+COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter
+
+USER 9107
+EXPOSE 9107
+CMD ["consul_exporter"]
diff --git a/example/images/ping-exporter/Dockerfile b/example/images/ping-exporter/Dockerfile
new file mode 100644
index 0000000..e59b3ef
--- /dev/null
+++ b/example/images/ping-exporter/Dockerfile
@@ -0,0 +1,24 @@
+FROM danielberteaud/alpine:24.3-1 AS builder
+MAINTAINER Daniel Berteaud
+
+ARG PING_EXPORTER_VERSION=1.1.0
+
+ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp
+ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp
+RUN set -eux &&\
+    apk --no-cache add \
+      tar \
+      gzip \
+    &&\
+    cd /tmp &&\
+    grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\
+    tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\
+    mv ping_exporter /usr/local/bin/
+
+FROM danielberteaud/alpine:24.3-1
+MAINTAINER Daniel Berteaud
+
+COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter
+
+EXPOSE 9427
+CMD ["ping_exporter", "--config.path=/config.yml"]
diff --git a/example/images/ping-exporter/root/config.yml b/example/images/ping-exporter/root/config.yml
new file mode 100644
index 0000000..95a4d57
--- /dev/null
+++ b/example/images/ping-exporter/root/config.yml
@@ -0,0 +1,4 @@
+# targets:
+#   - foo.bar
+#   - acme.com
+targets: []
diff --git a/example/images/prometheus/Dockerfile b/example/images/prometheus/Dockerfile
new file mode 100644
index 0000000..2b7ca51
--- /dev/null
+++ b/example/images/prometheus/Dockerfile
@@ -0,0 +1,48 @@
+FROM danielberteaud/alpine:24.3-1 AS builder
+
+ARG PROM_VERSION=2.50.1
+
+ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
+ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp
+RUN set -eux &&\
+    apk --no-cache add \
+      curl \
+      tar \
+      ca-certificates \
+    &&\
+    cd /tmp &&\
+    grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
+    tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
+    rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
+    mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus
+
+FROM danielberteaud/alpine:24.3-1
+MAINTAINER Daniel Berteaud
+
+ENV PATH=/opt/prometheus:$PATH
+
+COPY --from=builder /opt/prometheus /opt/prometheus
+RUN set -eux &&\
+    addgroup -g 9090 prometheus &&\
+    adduser --system \
+      --disabled-password \
+      --uid 9090 \
+      --ingroup prometheus \
+      --home /opt/prometheus \
+      --no-create-home \
+      --shell /sbin/nologin \
+      prometheus &&\
+    mkdir /data &&\
+    chown prometheus:prometheus /data &&\
+    chmod 700 /data
+
+WORKDIR /opt/prometheus
+USER prometheus
+EXPOSE 9090
+CMD [ "/opt/prometheus/prometheus", \
+      "--config.file=/opt/prometheus/prometheus.yml", \
+      "--storage.tsdb.path=/data", \
+      "--storage.tsdb.wal-compression", \
+      "--storage.tsdb.wal-compression-type=zstd", \
+      "--web.console.libraries=/opt/prometheus/console_libraries", \
+      "--web.console.templates=/opt/prometheus/consoles" ]
diff --git a/example/init/consul b/example/init/consul
new file mode 100755
index 0000000..bc2250b
--- /dev/null
+++ b/example/init/consul
@@ -0,0 +1,17 @@
+#!/bin/sh
+# vim: syntax=sh
+
+vault write consul/roles/monitoring-prometheus \
+  ttl=720h \
+  max_ttl=720h \
+  consul_policies="monitoring-prometheus"
+
+vault write consul/roles/monitoring-consul-exporter \
+  ttl=720h \
+  max_ttl=720h \
+  consul_policies="monitoring-prometheus"
+
+vault write consul/roles/monitoring-cluster-exporter \
+  ttl=720h \
+  max_ttl=720h \
+  consul_policies="monitoring-prometheus"
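
These three roles map Vault leases onto the read-only Consul policy defined earlier. A quick way to verify the mapping once applied, assuming the Consul secrets engine is mounted at consul/:

    vault read consul/creds/monitoring-prometheus
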
diff --git a/example/init/pki b/example/init/pki
new file mode 100755
index 0000000..9f25071
--- /dev/null
+++ b/example/init/pki
@@ -0,0 +1,150 @@
+#!/bin/sh
+# vim: syntax=sh
+
+set -euo pipefail
+
+TMP=$(mktemp -d)
+
+INITIAL_SETUP=false
+if [ "$(vault secrets list -format json | jq -r '.["pki/monitoring/"].type')" != "pki" ]; then
+  INITIAL_SETUP=true
+fi
+
+if [ "${INITIAL_SETUP}" = "true" ]; then
+  # Enable the secret engine
+  echo "Mounting new PKI secret engine at pki/monitoring"
+  vault secrets enable -path=pki/monitoring pki
+else
+  echo "Secret engine already mounted at pki/monitoring"
+fi
+
+# Configure max-lease-ttl
+echo "Tune PKI secret engine"
+vault secrets tune -max-lease-ttl=131400h pki/monitoring
+
+# Configure PKI URLs
+echo "Configure URL endpoints"
+vault write pki/monitoring/config/urls \
+  issuing_certificates="${VAULT_ADDR}/v1/pki/monitoring/ca" \
+  crl_distribution_points="${VAULT_ADDR}/v1/pki/monitoring/crl" \
+  ocsp_servers="${VAULT_ADDR}/v1/pki/monitoring/ocsp"
+
+vault write pki/monitoring/config/cluster \
+  path="${VAULT_ADDR}/v1/pki/monitoring"
+
+vault write pki/monitoring/config/crl \
+  auto_rebuild=true \
+  enable_delta=true
+
+# Configure tidy
+echo "Configure auto tidy for the PKI"
+vault write pki/monitoring/config/auto-tidy \
+  enabled=true \
+  tidy_cert_store=true \
+  tidy_expired_issuers=true \
+  tidy_revocation_queue=true \
+  tidy_revoked_cert_issuer_associations=true \
+  tidy_revoked_certs=true \
+  tidy_acme=true \
+  tidy_cross_cluster_revoked_certs=true \
+  tidy_move_legacy_ca_bundle=true \
+  maintain_stored_certificate_counts=true
+
+if [ "${INITIAL_SETUP}" = "true" ]; then
+  # Generate an internal CA
+  echo "Generating an internal CA"
+  vault write -format=json pki/monitoring/intermediate/generate/internal \
+    common_name="monitoring Certificate Authority" \
+    ttl="131400h" \
+    organization="ACME Corp" \
+    ou="Monitoring" \
+    locality="FooBar Ville" \
+    key_type=rsa \
+    key_bits=4096 \
+    | jq -r '.data.csr' > ${TMP}/monitoring.csr
+
+
+
+  # Sign this PKI with a root PKI
+  echo "Signing the new CA with the authority from pki/root"
+  vault write -format=json pki/root/root/sign-intermediate \
+    csr=@${TMP}/monitoring.csr \
+    format=pem_bundle \
+    ttl="131400h" \
+    | jq -r '.data.certificate' > ${TMP}/monitoring.crt
+
+  # Update the intermediate CA with the signed one
+  echo "Update the new CA with the signed version"
+  vault write pki/monitoring/intermediate/set-signed \
+    certificate=@${TMP}/monitoring.crt
+
+
+fi
+
+# Remove temp files
+echo "Cleaning temp files"
+rm -rf ${TMP}
+
+
+# Create a role for alertmanager
+vault write pki/monitoring/roles/monitoring-alertmanager \
+  allowed_domains="monitoring" \
+  allow_bare_domains=false \
+  allow_subdomains=true \
+  allow_localhost=false \
+  allow_ip_sans=true \
+  server_flag=true \
+  client_flag=true \
+  allow_wildcard_certificates=false \
+  max_ttl=100h \
+  ou="Monitoring"
+
+# Create a role for prometheus (which will only be a client, for AlertManager)
+vault write pki/monitoring/roles/monitoring-prometheus \
+  allowed_domains="monitoring" \
+  allow_bare_domains=false \
+  allow_subdomains=true \
+  allow_localhost=false \
+  allow_ip_sans=false \
+  server_flag=false \
+  client_flag=true \
+  allow_wildcard_certificates=false \
+  max_ttl=100h \
+  ou="Monitoring"
+
+# Create a role for metrics exporters (server only)
+vault write pki/monitoring/roles/metrics \
+  allowed_domains="monitoring" \
+  allow_bare_domains=false \
+  allow_subdomains=true \
+  allow_localhost=false \
+  allow_ip_sans=true \
+  server_flag=true \
+  client_flag=false \
+  allow_wildcard_certificates=false \
+  require_cn=false \
+  max_ttl=72h \
+  no_store=true \
+  ou="Monitoring"
+
+# Create a role on the Nomad PKI for the cluster exporter
+vault write pki/nomad/roles/monitoring-cluster-exporter \
+  allowed_domains='nomad.consul' \
+  allow_subdomains=true \
+  allow_wildcard_certificates=false \
+  max_ttl=168h \
+  allow_ip_sans=false \
+  server_flag=false \
+  client_flag=true \
+  ou="Cluster metrics exporter"
+
+# Create a role on the Consul PKI for the cluster exporter
+vault write pki/consul/roles/monitoring-cluster-exporter \
+  allowed_domains="consul.consul" \
+  allow_bare_domains=false \
+  allow_subdomains=true \
+  allow_wildcard_certificates=false \
+  max_ttl=168h \
+  server_flag=false \
+  client_flag=true \
+  ou="Cluster metrics exporter"
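
With the engine mounted and the roles created, issuance can be sanity-checked from the CLI before wiring up the Nomad templates; a sketch, assuming VAULT_ADDR and a token are set (the common_name must sit under the allowed "monitoring" domain):

    vault write -format=json pki/monitoring/issue/monitoring-prometheus \
        common_name=prometheus.monitoring ttl=24h \
        | jq -r '.data.certificate' \
        | openssl x509 -noout -subject -dates
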
diff --git a/example/monitoring-exporters.nomad.hcl b/example/monitoring-exporters.nomad.hcl
new file mode 100644
index 0000000..9fa0cdd
--- /dev/null
+++ b/example/monitoring-exporters.nomad.hcl
@@ -0,0 +1,419 @@
+job "monitoring-exporters" {
+
+  datacenters = ["dc1"]
+  region = "global"
+
+
+  # Run exporters. Use a separate job so exporters can run in a distinct node_pool
+  group "exporters" {
+
+    count = 1
+
+    network {
+      mode = "bridge"
+      port "ping" {}
+      port "blackbox" {}
+      port "consul" {}
+      port "cluster" {}
+    }
+
+    service {
+      name = "monitoring-ping-exporter"
+      port = "ping"
+      meta {
+        alloc = "${NOMAD_ALLOC_INDEX}"
+        metrics-port = "${NOMAD_HOST_PORT_ping}"
+      }
+    }
+
+    service {
+      name = "monitoring-blackbox-exporter"
+      port = "blackbox"
+      meta {
+        alloc = "${NOMAD_ALLOC_INDEX}"
+      }
+    }
+
+    service {
+      name = "monitoring-consul-exporter"
+      port = "consul"
+      meta {
+        alloc = "${NOMAD_ALLOC_INDEX}"
+        metrics-port = "${NOMAD_HOST_PORT_consul}"
+      }
+    }
+
+    service {
+      name = "monitoring-cluster-exporter"
+      port = "cluster"
+      meta {
+        alloc = "${NOMAD_ALLOC_INDEX}"
+      }
+    }
+
+    # Export consul services status to prometheus
+    task "consul-exporter" {
+      driver = "docker"
+
+      config {
+        image = "danielberteaud/consul-exporter:0.11.0-2"
+        readonly_rootfs = true
+        pids_limit = 30
+        command = "/local/consul-exporter"
+      }
+
+
+
+      # Use a template block instead of env {} so we can fetch values from vault
+      template {
+        data = <<_EOT
+LANG=fr_FR.utf8
+TZ=Europe/Paris
+_EOT
+        destination = "secrets/.env"
+        perms = 400
+        env = true
+      }
+
+
+      vault {
+        policies = ["monitoring-consul-exporter"]
+        env = false
+        disable_file = true
+        change_mode = "noop"
+      }
+
+
+      template {
+        data = <<_EOT
+#!/bin/sh
+
+set -euo pipefail
+
+exec consul_exporter \
+  --web.listen-address=127.0.0.1:9107 \
+  --consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \
+  --consul.request-limit=20
+
+_EOT
+        destination = "local/consul-exporter"
+        perms = 755
+      }
+
+      template {
+        data = <<_EOT
+CONSUL_HTTP_TOKEN={{ with secret "consul/creds/monitoring-consul-exporter" }}{{ .Data.token }}{{ end }}
+_EOT
+        destination = "secrets/.consul.env"
+        uid = 100000
+        gid = 100000
+        perms = 400
+        env = true
+      }
+
+
+      resources {
+        cpu = 20
+        memory = 64
+      }
+
+    }
+
+    # The cluster metrics proxy exposes prometheus metrics from the various nodes of the cluster
+    # Nomad, Consul and Vault
+    # It also exposes the other exporters metrics with mTLS
+    task "cluster-metrics-proxy" {
+      driver = "docker"
+      user = 8685
+
+      lifecycle {
+        hook = "poststart"
+        sidecar = true
+      }
+
+      config {
+        image = "nginxinc/nginx-unprivileged:alpine"
+        readonly_rootfs = true
+        pids_limit = 30
+        # Mount the config in nginx conf dir
+        volumes = [
+          "secrets/metrics.conf:/etc/nginx/conf.d/default.conf"
+        ]
+        mount {
+          type = "tmpfs"
+          target = "/tmp"
+          tmpfs_options {
+            size = 3000000
+          }
+        }
+
+      }
+
+
+      vault {
+        policies = ["monitoring-cluster-exporter", "metrics"]
+        env = false
+        disable_file = true
+        change_mode = "noop"
+      }
+
+
+      # This is the main nginx configuration, which will proxypass requests to the real metrics endpoints
+      template {
+        data = <<_EOT
+
+# Cluster exporter
+server {
+    listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl;
+    http2 on;
+
+    ssl_certificate /secrets/metrics.bundle.pem;
+    ssl_certificate_key /secrets/metrics.bundle.pem;
+    ssl_client_certificate /local/monitoring.ca.pem;
+    ssl_verify_client on;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 1h;
+    ssl_session_tickets off;
+    gzip on;
+    gzip_types
+        text/plain;
+    gzip_vary on;
+
+    server_tokens off;
+
+    if ($request_method !~ ^(GET|HEAD)$ ) {
+        return 405;
+    }
+
+    set $consul_token "{{ with secret "consul/creds/monitoring-cluster-exporter" }}{{ .Data.token }}{{ end }}";
+
+{{- range service "nomad-client" }}
+    location /nomad-client/{{ .Node }} {
+        proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
+        proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
+        proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
+        proxy_ssl_verify on;
+        proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad;
+        proxy_ssl_trusted_certificate /local/nomad_ca.crt;
+    }
+{{- end }}
+
+{{- range service "nomad" }}
+  {{- if .Tags | contains "http" }}
+    location /nomad/{{ .Node }} {
+        proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
+        proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
+        proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
+        proxy_ssl_verify on;
+        proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad;
+        proxy_ssl_trusted_certificate /local/nomad_ca.crt;
+    }
+  {{- end }}
+{{- end }}
+
+{{- range service "consul" }}
+    location /consul/{{ .Node }} {
+        proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus;
+        proxy_set_header X-Consul-Token $consul_token;
+        proxy_ssl_certificate /secrets/consul_client_bundle.pem;
+        proxy_ssl_certificate_key /secrets/consul_client_bundle.pem;
+        proxy_ssl_verify off;
+        proxy_ssl_trusted_certificate /local/consul_ca.crt;
+    }
+{{- end }}
+
+{{- range service "vault" }}
+    location /vault/{{ .Node }} {
+        proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus;
+        proxy_ssl_verify on;
+        proxy_ssl_trusted_certificate /etc/ssl/cert.pem;
+        proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
+        proxy_set_header X-Real-IP "$remote_addr";
+        proxy_set_header X-Forwarded-Proto "$scheme";
+        proxy_set_header X-Scheme "$scheme";
+        proxy_set_header X-Forwarded-Host "$host";
+        proxy_set_header X-Forwarded-Port "$server_port";
+    }
+{{- end }}
+
+    location / {
+        root /usr/share/nginx/html;
+        index index.html;
+    }
+}
+
+# Ping exporter
+server {
+    listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl;
+    http2 on;
+
+    ssl_certificate /secrets/metrics.bundle.pem;
+    ssl_certificate_key /secrets/metrics.bundle.pem;
+    ssl_client_certificate /local/monitoring.ca.pem;
+    ssl_verify_client on;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 1h;
+    ssl_session_tickets off;
+    gzip on;
+    gzip_types
+        text/plain;
+    gzip_vary on;
+
+    server_tokens off;
+
+    if ($request_method !~ ^(GET|HEAD)$ ) {
+        return 405;
+    }
+    location /metrics {
+        proxy_pass http://127.0.0.1:9427;
+    }
+}
+
+# Blackbox exporter
+server {
+    listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl;
+    http2 on;
+
+    ssl_certificate /secrets/metrics.bundle.pem;
+    ssl_certificate_key /secrets/metrics.bundle.pem;
+    ssl_client_certificate /local/monitoring.ca.pem;
+    ssl_verify_client on;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 1h;
+    ssl_session_tickets off;
+    gzip on;
+    gzip_types
+        text/plain;
+    gzip_vary on;
+
+    server_tokens off;
+
+    if ($request_method !~ ^(GET|HEAD)$ ) {
+        return 405;
+    }
+
+    location / {
+        proxy_pass http://127.0.0.1:9115;
+    }
+}
+
+# Consul exporter
+server {
+    listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl;
+    http2 on;
+
+    ssl_certificate /secrets/metrics.bundle.pem;
+    ssl_certificate_key /secrets/metrics.bundle.pem;
+    ssl_client_certificate /local/monitoring.ca.pem;
+    ssl_verify_client on;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 1h;
+    ssl_session_tickets off;
+    gzip on;
+    gzip_types
+        text/plain;
+    gzip_vary on;
+
+    server_tokens off;
+
+    if ($request_method !~ ^(GET|HEAD)$ ) {
+        return 405;
+    }
+    location /metrics {
+        proxy_pass http://127.0.0.1:9107;
+    }
+}
+
+_EOT
+        destination = "secrets/metrics.conf"
+        perms = "0440"
+        uid = 108685
+        gid = 100000
+        change_mode = "signal"
+        change_signal = "SIGHUP"
+      }
+
+      # Get certificate to add mTLS to metrics endpoints
+      template {
+        data = <<_EOT
+{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }}
+{{ .Cert }}
+{{ .Key }}
+{{- end }}
+_EOT
+        destination = "secrets/metrics.bundle.pem"
+        change_mode = "signal"
+        change_signal = "SIGHUP"
+      }
+
+      # Get the CA for the monitoring PKI
+      template {
+        data = <<_EOT
+{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
+_EOT
+        destination = "local/monitoring.ca.pem"
+      }
+
+      # Get a Nomad client certificate
+      template {
+        data = <<_EOT
+{{- with pkiCert "pki/nomad/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.nomad.consul" "ttl=24h" }}
+{{ .Cert }}
+{{ .Key }}
+{{- end }}
+_EOT
+        destination = "secrets/nomad_client_bundle.pem"
+        perms = "0400"
+        uid = 108685
+        gid = 100000
+        change_mode = "signal"
+        change_signal = "SIGHUP"
+      }
+
+      # The CA chain for Nomad
+      template {
+        data = <<_EOT
+{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
+_EOT
+        destination = "local/nomad_ca.crt"
+      }
+
+      # Same for Consul
+      template {
+        data = <<_EOT
+{{- with pkiCert "pki/consul/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.consul.consul" "ttl=24h" }}
+{{ .Cert }}
+{{ .Key }}
+{{- end }}
+_EOT
+        destination = "secrets/consul_client_bundle.pem"
+        perms = "0400"
+        uid = 108685
+        gid = 100000
+        change_mode = "signal"
+        change_signal = "SIGHUP"
+      }
+      template {
+        data = <<_EOT
+{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
+_EOT
+        destination = "local/consul_ca.crt"
+      }
+
+
+      resources {
+        cpu = 10
+        memory = 18
+      }
+
+    }
+  }
+}
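
The job can be checked locally before being registered; a minimal sketch, assuming a reachable Nomad API and the usual NOMAD_ADDR/NOMAD_TOKEN environment:

    nomad job validate example/monitoring-exporters.nomad.hcl
    nomad job plan example/monitoring-exporters.nomad.hcl
    nomad job run example/monitoring-exporters.nomad.hcl

(nomad job plan exits non-zero when it detects changes, which is expected on first submission.)
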
"${NOMAD_HOST_PORT_metrics}" + alloc = "${NOMAD_ALLOC_INDEX}" + job = "${NOMAD_JOB_NAME}" + } + + connect { + sidecar_service { + } + sidecar_task { + config { + args = [ + "-c", + "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", + "-l", + "${meta.connect.log_level}", + "--concurrency", + "${meta.connect.proxy_concurrency}", + "--disable-hot-restart" + ] + } + + resources { + cpu = 50 + memory = 64 + } + + } + } + + + check { + name = "health" + type = "http" + expose = true + path = "/-/healthy" + interval = "15s" + timeout = "8s" + check_restart { + limit = 10 + grace = "5m" + } + } + + tags = [ + + "traefik.enable=true", + "traefik.http.routers.monitoring-prometheus.entrypoints=https", + "traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)", + "traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", + "traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus", + + ] + } + + + # The prometheus metrics proxy, adding mTLS to the metrics endpoint + task "metrics-proxy" { + driver = "docker" + user = 8995 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + volumes = [ + "local/default.conf:/etc/nginx/conf.d/default.conf:ro" + ] + pids_limit = 100 + } + + lifecycle { + hook = "poststart" + sidecar = true + } + + vault { + policies = ["metrics"] + } + + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }}{{ end -}} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + template { + data = <<_EOT +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://localhost:9090/metrics; + } +} +_EOT + destination = "local/default.conf" + } + + resources { + cpu = 10 + memory = 10 + memory_max = 20 + } + } + + + + # The main prometheus task + task "prometheus" { + driver = "docker" + leader = true + + config { + image = "danielberteaud/prometheus:2.50.1-1" + readonly_rootfs = true + pids_limit = 200 + command = "prometheus" + args = [ + "--config.file=/local/prometheus.yml", + "--log.level=debug", + "--web.listen-address=127.0.0.1:9090", + "--storage.tsdb.path=/data", + "--storage.tsdb.retention.time=30d", + "--web.console.libraries=/opt/prometheus/console_libraries", + "--web.console.templates=/opt/prometheus/consoles", + "--web.external-url=https://prometheus.example.org", 
+ "--web.route-prefix=/" + ] + } + + + vault { + policies = ["monitoring-prometheus"] + env = false + disable_file = true + change_mode = "noop" + } + + + + + # Main configuration for prometheus + template { + data = <<_EOT +global: + scrape_interval: 15s + evaluation_interval: 15s + #query_log_file: /dev/stdout + external_labels: + cluster: consul + env: default + +rule_files: + - /local/rules/*.yml + +alerting: + alertmanagers: + - scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + datacenter: dc1 + relabel_configs: + # Only keep alertmanagers + - source_labels: [__meta_consul_service] + action: keep + regex: monitoring-alertmanager-tls + +scrape_configs: + + # Cluster services + - job_name: cluster-services + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }} + datacenter: dc1 + relabel_configs: + + # Drop anything which is not Nomad, Consul or Vault + # Other services will be monitored with another job + - source_labels: [__meta_consul_service] + action: keep + regex: (nomad(\-client)?|consul|vault) + + - source_labels: [__meta_consul_service,__meta_consul_node] + regex: (.+);(.+) + replacement: $${1}/$${2} + target_label: __metrics_path__ + + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: {{ range $idx, $instance := service "monitoring-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }} + target_label: __address__ + + # Rewrite the job labels to the name of the service + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: $${1} + target_label: job + + # Rewrite the instance labels + - source_labels: [__meta_consul_node] + regex: (.+) + replacement: $${1} + target_label: instance + + # regular services discovered from the Consul Catalog + - job_name: consul-services + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }} + datacenter: dc1 + + relabel_configs: + + # Drop sidecar's service to prevent duplicate. Sidecar themselves are treated in another job + - source_labels: [__meta_consul_service] + action: drop + regex: (.+)-sidecar-proxy + + # Drop Nomad, Consul and vault, already handled + - source_labels: [__meta_consul_service] + action: drop + regex: (nomad(\-client)?|consul|vault) + + # Only keep services having a metrics-port set + - source_labels: [__meta_consul_service_metadata_metrics_port] + regex: \d+ + action: keep + + # Get metrics path from metadata + - source_labels: [__meta_consul_service_metadata_metrics_path] + target_label: __metrics_path__ + regex: (.+) + + # Rewrite the scheme if needed + - source_labels: [__meta_consul_service_metadata_metrics_scheme] + regex: (https?) 
+        replacement: $${1}
+        target_label: __scheme__
+
+      # Rewrite the address to use the metrics port
+      - source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $${1}:$${2}
+        target_label: __address__
+
+      # Rewrite the job labels to the name of the service
+      - source_labels: [__meta_consul_service]
+        regex: (.+)
+        replacement: $${1}
+        target_label: job
+
+      # Set the default alloc to 0 if not set
+      - source_labels: [__meta_consul_service_metadata_alloc]
+        regex: ^$
+        replacement: 0
+        target_label: __meta_consul_service_metadata_alloc
+
+      # Rewrite the instance label to be service-alloc
+      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
+        regex: (.+);([a-zA-Z\d\-\.]+)
+        replacement: $${1}-$${2}
+        target_label: instance
+
+  # envoy sidecars from consul
+  - job_name: consul-envoy-services
+    consul_sd_configs:
+      - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
+        scheme: http
+        token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
+        datacenter: dc1
+
+    relabel_configs:
+
+      # Only keep sidecar-service with an envoy-metrics-port defined
+      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
+        action: keep
+        regex: (.+)-sidecar-proxy;\d+
+
+      # Rewrite the address to use the envoy-metrics-port
+      - source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $${1}:$${2}
+        target_label: __address__
+
+      # Rewrite the job label
+      - source_labels: [__meta_consul_service]
+        regex: (.+)
+        replacement: $${1}
+        target_label: job
+
+      # Set the default alloc to 0 if not set
+      - source_labels: [__meta_consul_service_metadata_alloc]
+        regex: ^$
+        replacement: 0
+        target_label: __meta_consul_service_metadata_alloc
+
+      # Rewrite the instance label to be service-alloc
+      - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
+        regex: (.+);([a-zA-Z\d\-\.]+)
+        replacement: $${1}-$${2}
+        target_label: instance
+
+_EOT
+        destination = "local/prometheus.yml"
+        uid = 100000
+        gid = 109090
+        perms = 640
+        change_mode = "signal"
+        change_signal = "SIGHUP"
+      }
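
The rendered configuration can be validated with promtool, which ships in the Prometheus archive used by the image above (and is on the PATH); a sketch, assuming it is run inside the running task (e.g. via nomad alloc exec) so the absolute /local paths in the config resolve:

    promtool check config /local/prometheus.yml
    promtool check rules /local/rules/*.yml
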
+
+      # Alert rules
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: Blackbox
+  rules:
+
+  - alert: BlackboxProbeFailed
+    expr: probe_success == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe failed (instance {{ $labels.instance }})
+      description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxSlowProbe
+    expr: avg_over_time(probe_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox slow probe (instance {{ $labels.instance }})
+      description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxProbeHttpFailure
+    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
+      description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
+      description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
+      description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxSslCertificateExpired
+    expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
+      description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: BlackboxProbeSlowHttp
+    expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
+      description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+_EOT
+        destination = "local/rules/blackbox.yml"
+        left_delimiter = "{{{"
+        right_delimiter = "}}}"
+      }
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: ConsulExporter
+
+  rules:
+
+  - alert: ConsulServiceHealthcheckFailed
+    # Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
+    expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Consul service healthcheck failed (service {{ $labels.service_name }})
+      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: ConsulMissingMasterNode
+    expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Consul missing master node (node {{ $labels.node }})
+      description: "The number of Consul raft peers should be 3 in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: ConsulAgentUnhealthy
+    expr: 'consul_health_node_status{status="critical"} == 1'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Consul agent unhealthy (node {{ $labels.node }})
+      description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: ConsulServiceWarning
+    expr: 'consul_health_service_status{status="warning"} == 1'
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
+      description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: ConsulServiceCritical
+    expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
+      description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+
+_EOT
+        destination = "local/rules/consul.yml"
+        left_delimiter = "{{{"
+        right_delimiter = "}}}"
+      }
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: JVM
+
+  rules:
+
+  - alert: JvmMemoryFillingUp
+    expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: JVM memory filling up (instance {{ $labels.instance }})
+      description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+_EOT
+        destination = "local/rules/jvm.yml"
+        left_delimiter = "{{{"
+        right_delimiter = "}}}"
+      }
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: Nomad
+  rules:
+
+  - alert: NomadJobFailed
+    expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
+      description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: NomadJobLost
+    expr: 'nomad_nomad_job_summary_lost > 0'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
+      description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: NomadJobQueued
+    expr: 'nomad_nomad_job_summary_queued > 0'
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
+      description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: NomadBlockedEvaluation
+    expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
+      description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: NomadTaskOOM
+    expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
+      description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+_EOT
+        destination = "local/rules/nomad.yml"
+        left_delimiter = "{{{"
+        right_delimiter = "}}}"
+      }
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: Ping
+  rules:
+
+  - alert: HostDown
+    expr: ping_loss_ratio == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host down (host {{ $labels.target }})
+      description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PingLoss
+    expr: |
+      avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: High packet loss (host {{ $labels.target }})
+      description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+_EOT
+        destination = "local/rules/ping.yml"
+        left_delimiter = "{{{"
+        right_delimiter = "}}}"
+      }
+      template {
+        data = <<_EOT
+# vi: syntax=yaml
+
+groups:
+
+- name: Postgres
+
+  rules:
+
+  - alert: PostgresqlDown
'pg_up == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresTooManyRestarts + expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3 + for: 1m + labels: + severity: warning + annotations: + summary: Postgres too many restarts (instance {{ $labels.instance }}) + description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyConnections + expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8' + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{ $labels.instance }}) + description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlDeadLocks + expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{ $labels.instance }}) + description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: PostgresqlHighRollbackRate +# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05' +# for: 0m +# labels: +# severity: warning +# annotations: +# summary: Postgresql high rollback rate (instance {{ $labels.instance }}) +# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateStatementTimeout + expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) + description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateDeadlock + expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) + description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyLocksAcquired + expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) + description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + +_EOT + destination = "local/rules/postgres.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + template { + data = <<_EOT +# vi: syntax=yaml + +groups: + +# Prometheus +- name: Prometheus + rules: + + - alert: PrometheusTargetMissing + expr: up{job!~"sftp-PR\\d+"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. 
An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3 + for: 1m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }}) + description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 2m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slow storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + +_EOT + destination = "local/rules/prometheus.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + template { + data = <<_EOT +# vi: syntax=yaml + +groups: + +- name: Traefik + + rules: + + - alert: TraefikHighHttp5xxErrorRateService + expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + for: 1m + labels: + severity: critical + annotations: + summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }}) + description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/traefik.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + template { + data = <<_EOT +# vi: syntax=yaml + +groups: + +- name: HashicorpVault + + rules: + + - alert: VaultSealed + expr: 'vault_core_unsealed == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Vault sealed (instance {{ $labels.instance }}) + description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/vault.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + + # A client cert, to connect to the AlertManager API + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/monitoring-prometheus" + (printf "common_name=prometheus-%s.monitoring" (env "NOMAD_ALLOC_INDEX")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} +{{ .Cert }} +{{ .Key }} +{{- end -}} +_EOT + destination = "secrets/prometheus.bundle.pem" + uid = 100000 + gid = 109090 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The monitoring CA chain, to validate AlertManager cert + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + uid = 100000 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + # Persistent data + volume_mount { + volume = "data" + destination = "/data" + } + + + resources { + cpu = 200 + memory = 768 + } + + } + } + + group "alerts" { + + count = 1 + + network { + mode = "bridge" + port "web-tls" {} + port "cluster" {} + port "metrics" {} + } + + + volume "data" { + source = "monitoring-alertmanager-data" + type = "csi" + access_mode = "single-node-writer" + attachment_mode = "file-system" + per_alloc = true + } + + + # This service is used for the different instances of alertmanager to communicate + service { + name = "monitoring-alertmanager-gossip" + port = "cluster" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + + # This service is used by prometheus. As it needs to be able to reach every instance, it cannot use + the service mesh. 
The exposed port uses mTLS, so it's safe to expose it outside of the mesh + service { + name = "monitoring-alertmanager-tls" + port = "web-tls" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + + # This service is exposed through the service mesh + # and can be used to reach the web interface through Traefik + service { + name = "monitoring-alertmanager" + port = 9093 + meta { + metrics-port = "${NOMAD_HOST_PORT_metrics}" + alloc = "${NOMAD_ALLOC_INDEX}" + job = "${NOMAD_JOB_NAME}" + } + + connect { + sidecar_service { + } + sidecar_task { + config { + args = [ + "-c", + "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", + "-l", + "${meta.connect.log_level}", + "--concurrency", + "${meta.connect.proxy_concurrency}", + "--disable-hot-restart" + ] + } + + resources { + cpu = 50 + memory = 64 + } + + } + } + + + check { + name = "health" + type = "http" + expose = true + path = "/-/healthy" + interval = "20s" + timeout = "8s" + check_restart { + limit = 12 + grace = "30s" + } + } + + tags = [ + + "traefik.enable=true", + "traefik.http.routers.monitoring-alertmanager.entrypoints=https", + "traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)", + "traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", + "traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager", + + ] + } + + + # The prometheus metrics proxy, adding mTLS to the metrics endpoint + task "metrics-proxy" { + driver = "docker" + user = 8995 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + volumes = [ + "local/default.conf:/etc/nginx/conf.d/default.conf:ro" + ] + pids_limit = 100 + } + + lifecycle { + hook = "poststart" + sidecar = true + } + + vault { + policies = ["metrics"] + } + + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }}{{ end -}} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + template { + data = <<_EOT +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://127.0.0.1:9093/metrics; + } +} +_EOT + destination = "local/default.conf" + } + + resources { + cpu = 10 + memory = 10 + memory_max = 20 + } + } + + + + # This task will handle mTLS to the AlertManager API + # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy + 
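+  # A hand-check sketch of this chain (assumptions: run from inside the allocation, + # paths relative to the task directory; any client certificate issued by the + # monitoring PKI should be accepted, anonymous clients rejected): + # curl --cacert local/monitoring.ca.pem \ + # --cert secrets/alertmanager.bundle.pem --key secrets/alertmanager.bundle.pem \ + # https://<alloc address>:<web-tls port>/-/healthy + 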
task "tls-proxy" { + driver = "docker" + user = 9093 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + readonly_rootfs = true + pids_limit = 30 + volumes = [ + "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro", + ] + mount { + type = "tmpfs" + target = "/tmp" + tmpfs_options { + size = 3000000 + } + } + + } + + + vault { + policies = ["metrics", "monitoring-alertmanager"] + env = false + disable_file = true + change_mode = "noop" + } + + + lifecycle { + hook = "poststart" + sidecar = true + } + + template { + data = <<_EOT +server { + listen 127.0.0.1:9093; + location / { + proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + allow 127.0.0.1; + deny all; + } +} + +_EOT + destination = "local/alertmanager.conf" + } + + # Certifiate used by AlertManager + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/monitoring-alertmanager" + (printf "common_name=alertmanager-%s.monitoring" (env "NOMAD_ALLOC_INDEX")) + (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/alertmanager.bundle.pem" + uid = 109093 + gid = 100000 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The trusted CA + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + resources { + cpu = 10 + memory = 18 + } + } + + # The main alertmanager task + task "alertmanager" { + driver = "docker" + leader = true + + config { + image = "danielberteaud/alertmanager:0.27.0-1" + readonly_rootfs = true + pids_limit = 200 + command = "/local/alertmanager" + } + + + vault { + policies = ["metrics", "monitoring-alertmanager"] + env = false + disable_file = true + change_mode = "noop" + } + + + + # Use a template block instead of env {} so we can fetch values from vault + template { + data = <<_EOT +LANG=fr_FR.utf8 +TZ=Europe/Paris +_EOT + destination = "secrets/.env" + perms = 400 + env = true + } + + + template { + data = <<_EOT +global: + smtp_from: alertmanager@consul + smtp_require_tls: false + smtp_smarthost: localhost:25 + +_EOT + destination = "secrets/alertmanager.yml" + } + + template { + data = <<_EOT +tls_server_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + client_auth_type: RequireAndVerifyClientCert + client_ca_file: /local/monitoring.ca.pem + +tls_client_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + ca_file: /local/monitoring.ca.pem + +_EOT + destination = "local/cluster_tls.yml" + } + + template { + data = <<_EOT +tls_server_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + client_auth_type: RequireAndVerifyClientCert + client_ca_file: /local/monitoring.ca.pem + +_EOT + destination = "local/web_tls.yml" + } + + template { + data = <<_EOT +#!/bin/sh + +set -euo pipefail + +exec alertmanager \ + --config.file=/secrets/alertmanager.yml \ + --storage.path=/data \ + --web.external-url=https://alerte.example.org \ + --web.route-prefix=/ \ + 
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \ + --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \ + --cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \ +{{- range service "monitoring-alertmanager-gossip" -}} +{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }} + --cluster.peer={{ .Address }}:{{ .Port }} \ +{{ end -}} +{{- end -}} + --cluster.tls-config=/local/cluster_tls.yml \ + --web.config.file=/local/web_tls.yml + +_EOT + destination = "local/alertmanager" + uid = 100000 + gid = 100000 + perms = "0755" + } + + # Certificate used by AlertManager + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/monitoring-alertmanager" + (printf "common_name=alertmanager-%s.monitoring" (env "NOMAD_ALLOC_INDEX")) + (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/alertmanager.bundle.pem" + uid = 109093 + gid = 109090 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The trusted CA + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + volume_mount { + volume = "data" + destination = "/data" + } + + + resources { + cpu = 50 + memory = 80 + } + + } + } +} diff --git a/example/vault/policies/metrics.hcl b/example/vault/policies/metrics.hcl new file mode 100644 index 0000000..849263d --- /dev/null +++ b/example/vault/policies/metrics.hcl @@ -0,0 +1,3 @@ +path "pki/monitoring/issue/metrics" { + capabilities = ["update"] +} diff --git a/example/vault/policies/monitoring-alertmanager.hcl b/example/vault/policies/monitoring-alertmanager.hcl new file mode 100644 index 0000000..9d1e050 --- /dev/null +++ b/example/vault/policies/monitoring-alertmanager.hcl @@ -0,0 +1,8 @@ + +path "pki/monitoring/issue/monitoring-alertmanager" { + capabilities = ["update"] +} + +path "kv/service/monitoring/alertmanager" { + capabilities = ["read"] +} diff --git a/example/vault/policies/monitoring-cluster-exporter.hcl b/example/vault/policies/monitoring-cluster-exporter.hcl new file mode 100644 index 0000000..9b98d67 --- /dev/null +++ b/example/vault/policies/monitoring-cluster-exporter.hcl @@ -0,0 +1,20 @@ + +# Read vault metrics +path "sys/metrics" { + capabilities = ["read", "list"] +} + +# Get a cert for Nomad +path "pki/nomad/issue/monitoring-cluster-exporter" { + capabilities = ["update"] +} + +# Get a cert for Consul +path "pki/consul/issue/monitoring-cluster-exporter" { + capabilities = ["update"] +} + +# Get a consul token +path "consul/creds/monitoring-cluster-exporter" { + capabilities = ["read"] +} diff --git a/example/vault/policies/monitoring-consul-exporter.hcl b/example/vault/policies/monitoring-consul-exporter.hcl new file mode 100644 index 0000000..aeadd60 --- /dev/null +++ b/example/vault/policies/monitoring-consul-exporter.hcl @@ -0,0 +1,4 @@ + +path "consul/creds/monitoring-consul-exporter" { + capabilities = ["read"] +} diff --git a/example/vault/policies/monitoring-prometheus.hcl b/example/vault/policies/monitoring-prometheus.hcl new file mode 100644 index 0000000..30e594f --- /dev/null +++ b/example/vault/policies/monitoring-prometheus.hcl @@ -0,0 +1,12 @@ + +path "pki/monitoring/issue/monitoring-prometheus" { + capabilities = ["update"] +} + +path "kv/service/monitoring/prometheus" { + capabilities = ["read"] +} + +path 
"consul/creds/monitoring-prometheus" { + capabilities = ["read"] +} diff --git a/images/alertmanager/Dockerfile b/images/alertmanager/Dockerfile new file mode 100644 index 0000000..885644c --- /dev/null +++ b/images/alertmanager/Dockerfile @@ -0,0 +1,41 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG AM_VERSION=[[ .monitoring.alertmanager.version ]] + +ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp +RUN set -eux &&\ + apk --no-cache add \ + tar \ + &&\ + cd /tmp &&\ + grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\ + tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\ + mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +ENV PATH=/opt/alertmanager:$PATH + +COPY --from=builder /opt/alertmanager /opt/alertmanager +RUN set -eux &&\ + addgroup -g 9093 alertmanager &&\ + adduser --system \ + --disabled-password \ + --uid 9093 \ + --ingroup alertmanager \ + --home /opt/alertmanager \ + --no-create-home \ + --shell /sbin/nologin \ + alertmanager &&\ + mkdir /data &&\ + chown alertmanager:alertmanager /data &&\ + chmod 700 data + +WORKDIR /opt/alertmanager +USER alertmanager +EXPOSE 9093 +CMD [ "alertmanager", \ + "--config.file=/opt/alertmanager/alertmanager.yml", \ + "--storage.path=/data" ] diff --git a/images/blackbox-exporter/Dockerfile b/images/blackbox-exporter/Dockerfile new file mode 100644 index 0000000..0013c3d --- /dev/null +++ b/images/blackbox-exporter/Dockerfile @@ -0,0 +1,29 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG BLACKBOX_EXPORTER_VERSION=[[ .monitoring.exporters.blackbox.version ]] + +ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp +RUN set -eux &&\ + apk --no-cache add tar gzip &&\ + cd /tmp &&\ + grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\ + tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mkdir blackbox &&\ + mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +ENV BLACKBOX_CONF=/etc/blackbox.yml + +COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter + +RUN set -eux &&\ + apk --no-cache upgrade &&\ + apk --no-cache add ca-certificates curl + +COPY root/ / + +EXPOSE 9195 +CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"] diff --git a/images/blackbox-exporter/root/etc/blackbox.yml b/images/blackbox-exporter/root/etc/blackbox.yml new file mode 100644 index 0000000..51632ab --- /dev/null +++ b/images/blackbox-exporter/root/etc/blackbox.yml @@ -0,0 +1,65 @@ +modules: + http_2xx: + prober: http + http: + preferred_ip_protocol: "ip4" + http_ssl_no_check: + prober: http + http: + preferred_ip_protocol: "ip4" + tls_config: + insecure_skip_verify: true + http_post_2xx: + prober: http + http: + method: POST + 
preferred_ip_protocol: "ip4" + tcp_connect: + prober: tcp + tcp: + preferred_ip_protocol: "ip4" + pop3s_banner: + prober: tcp + tcp: + preferred_ip_protocol: "ip4" + query_response: + - expect: "^+OK" + tls: true + tls_config: + insecure_skip_verify: false + grpc: + prober: grpc + grpc: + tls: true + preferred_ip_protocol: "ip4" + grpc_plain: + prober: grpc + grpc: + preferred_ip_protocol: "ip4" + tls: false + service: "service1" + ssh_banner: + prober: tcp + tcp: + preferred_ip_protocol: "ip4" + query_response: + - expect: "^SSH-2.0-" + - send: "SSH-2.0-blackbox-ssh-check" + irc_banner: + prober: tcp + tcp: + preferred_ip_protocol: "ip4" + query_response: + - send: "NICK prober" + - send: "USER prober prober prober :prober" + - expect: "PING :([^ ]+)" + send: "PONG ${1}" + - expect: "^:[^ ]+ 001" + icmp: + prober: icmp + icmp_ttl5: + prober: icmp + timeout: 5s + icmp: + ttl: 5 + diff --git a/images/consul-exporter/Dockerfile b/images/consul-exporter/Dockerfile new file mode 100644 index 0000000..1d9861e --- /dev/null +++ b/images/consul-exporter/Dockerfile @@ -0,0 +1,21 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG CONSUL_EXPORTER_VERSION=[[ .monitoring.exporters.consul.version ]] + +ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp +RUN set -eux &&\ + apk --no-cache add tar gzip &&\ + cd /tmp &&\ + grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\ + tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter + +USER 9107 +EXPOSE 9107 +CMD ["consul_exporter"] diff --git a/images/ping-exporter/Dockerfile b/images/ping-exporter/Dockerfile new file mode 100644 index 0000000..72205d9 --- /dev/null +++ b/images/ping-exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder +MAINTAINER [[ .docker.maintainer ]] + +ARG PING_EXPORTER_VERSION=[[ .monitoring.exporters.ping.version ]] + +ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp +ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp +RUN set -eux &&\ + apk --no-cache add \ + tar \ + gzip \ + &&\ + cd /tmp &&\ + grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\ + tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\ + mv ping_exporter /usr/local/bin/ + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter + +EXPOSE 9427 +CMD ["ping_exporter", "--config.path=/config.yml"] diff --git a/images/ping-exporter/root/config.yml b/images/ping-exporter/root/config.yml new file mode 100644 index 0000000..95a4d57 --- /dev/null +++ b/images/ping-exporter/root/config.yml 
@@ -0,0 +1,4 @@ +# targets: +# - foo.bar +# - acme.com +targets: [] diff --git a/images/prometheus/Dockerfile b/images/prometheus/Dockerfile new file mode 100644 index 0000000..43abaf8 --- /dev/null +++ b/images/prometheus/Dockerfile @@ -0,0 +1,48 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG PROM_VERSION=[[ .monitoring.prometheus.version ]] + +ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp +RUN set -eux &&\ + apk --no-cache add \ + curl \ + tar \ + ca-certificates \ + &&\ + cd /tmp &&\ + grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\ + tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\ + rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\ + mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +ENV PATH=/opt/prometheus:$PATH + +COPY --from=builder /opt/prometheus /opt/prometheus +RUN set -eux &&\ + addgroup -g 9090 prometheus &&\ + adduser --system \ + --disabled-password \ + --uid 9090 \ + --ingroup prometheus \ + --home /opt/prometheus \ + --no-create-home \ + --shell /sbin/nologin \ + prometheus &&\ + mkdir /data &&\ + chown prometheus.prometheus /data &&\ + chmod 700 /data + +WORKDIR /opt/prometheus +USER prometheus +EXPOSE 9090 +CMD [ "/opt/prometheus/prometheus", \ + "--config.file=/opt/prometheus/prometheus.yml", \ + "--storage.tsdb.path=/data", \ + "--storage.tsdb.wal-compression", \ + "--storage.tsdb.wal-compression-type=zstd", \ + "--web.console.libraries=/opt/prometheus/console_libraries", \ + "--web.console.templates=/opt/prometheus/consoles" ] diff --git a/init/consul b/init/consul new file mode 100755 index 0000000..f7ecc70 --- /dev/null +++ b/init/consul @@ -0,0 +1,17 @@ +#!/bin/sh +# vim: syntax=sh + +vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-prometheus \ + ttl=720h \ + max_ttl=720h \ + consul_policies="[[ .instance ]]-prometheus" + +vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-consul-exporter \ + ttl=720h \ + max_ttl=720h \ + consul_policies="[[ .instance ]]-prometheus" + +vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-cluster-exporter \ + ttl=720h \ + max_ttl=720h \ + consul_policies="[[ .instance ]]-prometheus" diff --git a/init/pki b/init/pki new file mode 100755 index 0000000..d8d69c2 --- /dev/null +++ b/init/pki @@ -0,0 +1,69 @@ +#!/bin/sh + +set -euo pipefail + +[[ $c := merge .monitoring . 
]] +[[ template "common/vault.mkpki.sh" $c ]] + +# Create a role for alertmanager +vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-alertmanager \ + allowed_domains="[[ .instance ]]" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_localhost=false \ + allow_ip_sans=true \ + server_flag=true \ + client_flag=true \ + allow_wildcard_certificates=false \ + max_ttl=100h \ + ou="[[ $c.vault.pki.ou ]]" + +# Create a role for prometheus (which will only be a client, for AlertManager) +vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \ + allowed_domains="[[ .instance ]]" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_localhost=false \ + allow_ip_sans=false \ + server_flag=false \ + client_flag=true \ + allow_wildcard_certificates=false \ + max_ttl=100h \ + ou="[[ $c.vault.pki.ou ]]" + +# Create a role for metrics exporters (server only) +vault write [[ $c.vault.pki.path ]]/roles/metrics \ + allowed_domains="[[ .instance ]]" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_localhost=false \ + allow_ip_sans=true \ + server_flag=true \ + client_flag=false \ + allow_wildcard_certificates=false \ + require_cn=false \ + max_ttl=72h \ + no_store=true \ + ou="[[ $c.vault.pki.ou ]]" + +# Create a role on the Nomad PKI for the cluster exporter +vault write pki/nomad/roles/[[ .instance ]]-cluster-exporter \ + allowed_domains='nomad.[[ .consul.domain ]]' \ + allow_subdomains=true \ + allow_wildcard_certificates=false \ + max_ttl=168h \ + allow_ip_sans=false \ + server_flag=false \ + client_flag=true \ + ou="Cluster metrics exporter" + +# Create a role on the Consul PKI for the cluster exporter +vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \ + allowed_domains="consul.[[ .consul.domain ]]" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_wildcard_certificates=false \ + max_ttl=168h \ + server_flags=false \ + client_flags=true \ + ou="Cluster metrics exporter" diff --git a/monitoring-exporters.nomad.hcl b/monitoring-exporters.nomad.hcl new file mode 100644 index 0000000..02564ca --- /dev/null +++ b/monitoring-exporters.nomad.hcl @@ -0,0 +1,253 @@ +job "[[ .instance ]]-exporters" { + +[[- $c := merge .monitoring.exporters . ]] +[[ template "common/job_start" $c ]] + + # Run exporters. Use a separated job so exporters can run in a distinct node_pool + group "exporters" { + + count = [[ $c.count ]] + + network { + mode = "bridge" + port "ping" {} + port "blackbox" {} + port "consul" {} + port "cluster" {} + } + + service { + name = "[[ .instance ]]-ping-exporter[[ .consul.suffix ]]" + port = "ping" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + metrics-port = "${NOMAD_HOST_PORT_ping}" + } + } + + service { + name = "[[ .instance ]]-blackbox-exporter[[ .consul.suffix ]]" + port = "blackbox" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + + service { + name = "[[ .instance ]]-consul-exporter[[ .consul.suffix ]]" + port = "ping" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + metrics-port = "${NOMAD_HOST_PORT_consul}" + } + } + + service { + name = "[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]" + port = "cluster" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + +[[- if gt (len $c.ping.probes) 0 ]] +[[- $e := merge $c.ping $c ]] + # Ping exporter will collect ICMP ping stats and expose them + # Note : we could do it with blackbox, but as pings require privileges, it's better to grant it + # to a smaller, more focused container. 
This one only handles ICMP ping checks, and only from the configuration file + task "ping-exporter" { + driver = "[[ $e.nomad.driver ]]" + + config { + image = "[[ $e.image ]]" + readonly_rootfs = true + pids_limit = 30 + # Pings require privileges + privileged = true + userns_mode = "host" + command = "ping_exporter" + args = [ + "--web.listen-address=127.0.0.1:9427", + "--config.path=/local/config.yml" + ] + } + +[[ template "common/file_env" $e ]] + + template { + data = <<_EOT +[[ template "monitoring/ping_exporter/config.yml" $e ]] +_EOT + destination = "local/config.yml" + } + +[[ template "common/resources" $e ]] + } +[[- end ]] + +[[- if or (gt (len $c.blackbox.tcp_probes) 0) (gt (len $c.blackbox.http_probes) 0) ]] +[[- $e := merge $c.blackbox $c ]] + # Blackbox exporter will probe http/tcp targets and expose them + # for prometheus + task "blackbox-exporter" { + driver = "[[ $e.nomad.driver ]]" + + config { + image = "[[ $e.image ]]" + readonly_rootfs = true + pids_limit = 30 + } + +[[ template "common/file_env" $e ]] +[[ template "common/resources" $e ]] + + } +[[- end ]] + + # Export consul services status to prometheus + task "consul-exporter" { +[[- $e := merge $c.consul $c ]] + driver = "[[ $e.nomad.driver ]]" + + config { + image = "[[ $e.image ]]" + readonly_rootfs = true + pids_limit = 30 + command = "/local/consul-exporter" + } + +[[ template "common/file_env" $e ]] +[[ template "common/vault.policies" $e ]] + + template { + data = <<_EOT +[[ template "monitoring/consul-exporter/start.sh" $e ]] +_EOT + destination = "local/consul-exporter" + perms = 755 + } + + template { + data = <<_EOT +CONSUL_HTTP_TOKEN={{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" }}{{ .Data.token }}{{ end }} +_EOT + destination = "secrets/.consul.env" + uid = 100000 + gid = 100000 + perms = 400 + env = true + } + +[[ template "common/resources" $e ]] + } + + # The cluster metrics proxy exposes prometheus metrics from the various nodes of the cluster + # (Nomad, Consul and Vault) + # It also exposes the other exporters' metrics with mTLS + task "cluster-metrics-proxy" { +[[- $e := merge $c.cluster $c ]] + driver = "[[ $e.nomad.driver ]]" + user = 8685 + + lifecycle { + hook = "poststart" + sidecar = true + } + + config { + image = "[[ $e.image ]]" + readonly_rootfs = true + pids_limit = 30 + # Mount the config in nginx conf dir + volumes = [ + "secrets/metrics.conf:/etc/nginx/conf.d/default.conf" + ] +[[ template "common/tmpfs" "/tmp" ]] + } + +[[ template "common/vault.policies" $e ]] + + # This is the main nginx configuration, which will proxy requests to the real metrics endpoints + template { + data =<<_EOT +[[ template "monitoring/cluster-exporter/nginx.conf" $e ]] +_EOT + destination = "secrets/metrics.conf" + perms = "0440" + uid = 108685 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + # Get certificate to add mTLS to metrics endpoints + template { + data =<<_EOT +{{- with pkiCert "[[ .prometheus.vault_pki ]]/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/metrics.bundle.pem" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # Get the CA for the monitoring PKI + template { + data =<<_EOT +{{ with secret "[[ .vault.root ]]pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + # Get a Nomad client certificate + template { + data = <<_EOT +{{- with pkiCert "pki/nomad/issue/[[ 
.instance ]]-cluster-exporter" "common_name=metrics-proxy.nomad.[[ .consul.domain ]]" "ttl=24h" }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/nomad_client_bundle.pem" + perms = "0400" + uid = 108685 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The CA chain for Nomad + template { + data = <<_EOT +{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/nomad_ca.crt" + } + + # Same for Consul + template { + data = <<_EOT +{{- with pkiCert "pki/consul/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.consul.[[ .consul.domain ]]" "ttl=24h" }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/consul_client_bundle.pem" + perms = "0400" + uid = 108685 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + template { + data = <<_EOT +{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/consul_ca.crt" + } + +[[ template "common/resources" $e ]] + } + } +} diff --git a/monitoring.nomad.hcl b/monitoring.nomad.hcl new file mode 100644 index 0000000..8ffb256 --- /dev/null +++ b/monitoring.nomad.hcl @@ -0,0 +1,376 @@ +job "[[ .instance ]]" { + +[[ template "common/job_start" . ]] + + # The metrics group runs prometheus and various exporters + group "metrics" { +[[- $c := merge .monitoring.prometheus .monitoring . ]] + + shutdown_delay = "6s" + count = [[ $c.count ]] + + network { + mode = "bridge" + port "metrics" {} + } + +[[ template "common/volumes" $c ]] + + service { + name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]" + port = 9090 + +[[ template "common/service_meta" $c ]] +[[ template "common/connect" $c ]] + + check { + name = "health" + type = "http" + expose = true + path = "/-/healthy" + interval = "15s" + timeout = "8s" + check_restart { + limit = 10 + grace = "5m" + } + } + + tags = [ +[[ template "common/traefik_tags" $c ]] + ] + } + +[[ template "common/task.metrics_proxy" $c ]] + + # The main prometheus task + task "prometheus" { + driver = "[[ $c.nomad.driver ]]" + leader = true + + config { + image = "[[ $c.image ]]" + readonly_rootfs = true + pids_limit = 200 + command = "prometheus" + args = [ + "--config.file=/local/prometheus.yml", + "--log.level=debug", + "--web.listen-address=127.0.0.1:9090", + "--storage.tsdb.path=/data", + "--storage.tsdb.retention.time=[[ $c.retention ]]", + "--web.console.libraries=/opt/prometheus/console_libraries", + "--web.console.templates=/opt/prometheus/consoles", + "--web.external-url=[[ $c.public_url ]]", + "--web.route-prefix=[[ if eq "" (urlParse $c.public_url).Path ]]/[[ else ]][[ (urlParse $c.public_url).Path ]][[ end ]]" + ] + } + +[[ template "common/vault.policies" $c ]] +[[ template "common/artifacts" $c ]] + + # Main configuration for prometheus + template { + data = <<_EOT +[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]] +_EOT + destination = "local/prometheus.yml" + uid = 100000 + gid = 109090 + perms = 640 + change_mode = "signal" + change_signal = "SIGHUP" + } + + # Alert rules +[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]] + [[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]] + template { + data = <<_EOT +[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]] +_EOT + destination = "local/rules/[[ . 
]]" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + [[- end ]] +[[- end ]] + +[[- if file.Exists "prometheus/rules" ]] + [[- range (file.ReadDir "prometheus/rules") ]] + + template { + data = <<_EOT +[[ file.Read (printf "prometheus/rules/%s" .) ]] +_EOT + destination = "local/rules/[[ . ]]" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + [[- end ]] +[[- end ]] + +[[- range $k, $v := $c.alert_rules ]] + + artifact { + source = "[[ $v.url ]]" + destination = "local/rules/[[ $k ]].yml" + mode = "file" + } +[[- end ]] + + # A client cert, to connect to the AlertManager API + template { + data = <<_EOT +{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus" + (printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} +{{ .Cert }} +{{ .Key }} +{{- end -}} +_EOT + destination = "secrets/prometheus.bundle.pem" + uid = 100000 + gid = 109090 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The monitoring CA chain, to validate AlertManager cert + template { + data = <<_EOT +{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + uid = 100000 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + # Persistent data + volume_mount { + volume = "data" + destination = "/data" + } + +[[ template "common/resources" $c ]] + } + } + + group "alerts" { + +[[- $c := merge .monitoring.alertmanager .monitoring . ]] + + count = [[ $c.count ]] + + network { + mode = "bridge" + port "web-tls" {} + port "cluster" {} + port "metrics" {} + } + +[[ template "common/volumes" $c ]] + + # This service is used for the different instances of alertmanager to communicate + service { + name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]" + port = "cluster" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + + # This service is used by prometheus. As it needs to be able to reach every instances, it cannot use + # the service mesh. 
The exposed port uses mTLS, so it's safe to expose it outside of the mesh + service { + name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]" + port = "web-tls" + meta { + alloc = "${NOMAD_ALLOC_INDEX}" + } + } + + # This service is exposed through the service mesh + # and can be used to reach the web interface through Traefik + service { + name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]" + port = 9093 +[[ template "common/service_meta" $c ]] +[[ template "common/connect" $c ]] + + check { + name = "health" + type = "http" + expose = true + path = "/-/healthy" + interval = "20s" + timeout = "8s" + check_restart { + limit = 12 + grace = "30s" + } + } + + tags = [ +[[ template "common/traefik_tags" $c ]] + ] + } + +[[ template "common/task.metrics_proxy" $c ]] + + # This task will handle mTLS to the AlertManager API + # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy + task "tls-proxy" { + driver = "[[ $c.nomad.driver ]]" + user = 9093 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + readonly_rootfs = true + pids_limit = 30 + volumes = [ + "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro", + ] +[[ template "common/tmpfs" "/tmp" ]] + } + +[[ template "common/vault.policies" $c ]] + + lifecycle { + hook = "poststart" + sidecar = true + } + + template { + data = <<_EOT +[[ template "monitoring/alertmanager/nginx.conf" $c ]] +_EOT + destination = "local/alertmanager.conf" + } + + # Certificate used by AlertManager + template { + data = <<_EOT +{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" + (printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX")) + (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/alertmanager.bundle.pem" + uid = 109093 + gid = 100000 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The trusted CA + template { + data = <<_EOT +{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + resources { + cpu = 10 + memory = 18 + } + } + + # The main alertmanager task + task "alertmanager" { + driver = "[[ $c.nomad.driver ]]" + leader = true + + config { + image = "[[ $c.image ]]" + readonly_rootfs = true + pids_limit = 200 + command = "/local/alertmanager" + } + +[[ template "common/vault.policies" $c ]] +[[ template "common/file_env" $c ]] + + template { + data = <<_EOT +[[- if isKind "map" $c.custom_config ]] +[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]] +[[- else if isKind "string" $c.custom_config ]] +[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]] +[[- else ]] +# Invalid custom config, using template only +[[ template "monitoring/alertmanager/alertmanager.yml" $c ]] +[[- end ]] +_EOT + destination = "secrets/alertmanager.yml" + } + + template { + data = <<_EOT +[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]] +_EOT + destination = "local/cluster_tls.yml" + } + + template { + data = <<_EOT +[[ template "monitoring/alertmanager/web_tls.yml" $c ]] +_EOT + destination = "local/web_tls.yml" + } + + template { + data = <<_EOT +[[ template "monitoring/alertmanager/start.sh" $c ]] +_EOT + destination = "local/alertmanager" + 
uid = 100000 + gid = 100000 + perms = "0755" + } + + # Certificate used by AlertManager + template { + data = <<_EOT +{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" + (printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX")) + (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/alertmanager.bundle.pem" + uid = 109093 + gid = 109090 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The trusted CA + template { + data = <<_EOT +{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + volume_mount { + volume = "data" + destination = "/data" + } + +[[ template "common/resources" $c ]] + } + } +} diff --git a/templates/alertmanager/alertmanager.yml b/templates/alertmanager/alertmanager.yml new file mode 100644 index 0000000..6b49d24 --- /dev/null +++ b/templates/alertmanager/alertmanager.yml @@ -0,0 +1,5 @@ +global: + smtp_from: '[[ .email.from ]]' + smtp_smarthost: localhost:25 + smtp_require_tls: false + diff --git a/templates/alertmanager/cluster_tls.yml b/templates/alertmanager/cluster_tls.yml new file mode 100644 index 0000000..082a159 --- /dev/null +++ b/templates/alertmanager/cluster_tls.yml @@ -0,0 +1,10 @@ +tls_server_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + client_auth_type: RequireAndVerifyClientCert + client_ca_file: /local/monitoring.ca.pem + +tls_client_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + ca_file: /local/monitoring.ca.pem diff --git a/templates/alertmanager/nginx.conf b/templates/alertmanager/nginx.conf new file mode 100644 index 0000000..493e743 --- /dev/null +++ b/templates/alertmanager/nginx.conf @@ -0,0 +1,13 @@ +server { + listen 127.0.0.1:9093; + location / { + proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + allow 127.0.0.1; + deny all; + } +} diff --git a/templates/alertmanager/start.sh b/templates/alertmanager/start.sh new file mode 100644 index 0000000..88345bc --- /dev/null +++ b/templates/alertmanager/start.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +set -euo pipefail + +exec alertmanager \ + --config.file=/secrets/alertmanager.yml \ + --storage.path=/data \ + --web.external-url=[[ .public_url ]] \ + --web.route-prefix=[[ if eq "" (urlParse .public_url).Path ]]/[[ else ]][[ (urlParse .public_url).Path ]][[ end ]] \ + --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \ + --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \ + --cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \ +{{- range service "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]" -}} +{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }} + --cluster.peer={{ .Address }}:{{ .Port }} \ +{{ end -}} +{{- end -}} + --cluster.tls-config=/local/cluster_tls.yml \ + --web.config.file=/local/web_tls.yml diff --git a/templates/alertmanager/web_tls.yml b/templates/alertmanager/web_tls.yml new file mode 100644 index 0000000..4ec3e59 --- /dev/null 
+++ b/templates/alertmanager/web_tls.yml @@ -0,0 +1,5 @@ +tls_server_config: + cert_file: /secrets/alertmanager.bundle.pem + key_file: /secrets/alertmanager.bundle.pem + client_auth_type: RequireAndVerifyClientCert + client_ca_file: /local/monitoring.ca.pem diff --git a/templates/cluster-exporter/nginx.conf b/templates/cluster-exporter/nginx.conf new file mode 100644 index 0000000..ceef2a1 --- /dev/null +++ b/templates/cluster-exporter/nginx.conf @@ -0,0 +1,170 @@ + +# Cluster exporter +server { + listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + set $consul_token "{{ with secret "consul/creds/[[ .instance ]]-cluster-exporter" }}{{ .Data.token }}{{ end }}"; + +{{- range service "nomad-client" }} + location /nomad-client/{{ .Node }} { + proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus; + proxy_ssl_certificate /secrets/nomad_client_bundle.pem; + proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad; + proxy_ssl_trusted_certificate /local/nomad_ca.crt; + } +{{- end }} + +{{- range service "nomad" }} + {{- if .Tags | contains "http" }} + location /nomad/{{ .Node }} { + proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus; + proxy_ssl_certificate /secrets/nomad_client_bundle.pem; + proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad; + proxy_ssl_trusted_certificate /local/nomad_ca.crt; + } + {{- end }} +{{- end }} + +{{- range service "consul" }} + location /consul/{{ .Node }} { + proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus; + proxy_set_header X-Consul-Token $consul_token; + proxy_ssl_certificate /secrets/consul_client_bundle.pem; + proxy_ssl_certificate_key /secrets/consul_client_bundle.pem; + proxy_ssl_verify off; + proxy_ssl_trusted_certificate /local/consul_ca.crt; + } +{{- end }} + +{{- range service "vault" }} + location /vault/{{ .Node }} { + proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus; + proxy_ssl_verify on; + proxy_ssl_trusted_certificate /etc/ssl/cert.pem; + proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for"; + proxy_set_header X-Real-IP "$remote_addr"; + proxy_set_header X-Forwarded-Proto "$scheme"; + proxy_set_header X-Scheme "$scheme"; + proxy_set_header X-Forwarded-Host "$host"; + proxy_set_header X-Forwarded-Port "$server_port"; + } +{{- end }} + + location / { + root /usr/share/nginx/html; + index index.html; + } +} + +# Ping exporter +server { + listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + 
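+  # Scrapers must present a client certificate signed by the monitoring CA; a + # hedged probe sketch (hypothetical local paths, any monitoring PKI client cert): + # curl --cacert monitoring.ca.pem --cert client.bundle.pem --key client.bundle.pem \ + # https://<host>:<ping port>/metrics + 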
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://127.0.0.1:9427; + } +} + +# Blackbox exporter +server { + listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + location / { + proxy_pass http://127.0.0.1:9115; + } +} + +# Consul exporter +server { + listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://127.0.0.1:9107; + } +} diff --git a/templates/consul-exporter/start.sh b/templates/consul-exporter/start.sh new file mode 100644 index 0000000..4acf091 --- /dev/null +++ b/templates/consul-exporter/start.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +set -euo pipefail + +exec consul_exporter \ + --web.listen-address=127.0.0.1:9107 \ + --consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \ + --consul.request-limit=20 diff --git a/templates/ping_exporter/config.yml b/templates/ping_exporter/config.yml new file mode 100644 index 0000000..ccb3516 --- /dev/null +++ b/templates/ping_exporter/config.yml @@ -0,0 +1,4 @@ +targets: +[[- range $idx, $probe := .probes ]] + - [[ $probe ]] +[[- end ]] diff --git a/templates/prometheus/prometheus.yml b/templates/prometheus/prometheus.yml new file mode 100644 index 0000000..95502e9 --- /dev/null +++ b/templates/prometheus/prometheus.yml @@ -0,0 +1,237 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + #query_log_file: /dev/stdout + external_labels: + cluster: [[ .consul.domain ]] + env: [[ getenv "NOMAD_NAMESPACE" ]] + +rule_files: + - /local/rules/*.yml + +alerting: + alertmanagers: + - scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP 
\"nomad\"" }}:8500 + scheme: http + datacenter: [[ .consul.datacenter ]] + relabel_configs: + # Only keep alertmanagers + - source_labels: [__meta_consul_service] + action: keep + regex: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]] + +scrape_configs: + +[[- range $k, $v := .jobs ]] + + - job_name: [[ $k ]] + static_configs: + - targets: + [[- range $target := $v.targets ]] + - [[ $target ]] + [[- end ]] +[[- end ]] + +[[- if gt (len .exporters.blackbox.http_probes) 0 ]] + + # Blackbox Exporter HTTP targets + - job_name: http_probe + metrics_path: /probe + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + params: + module: ["http_2xx"] + static_configs: + - targets: + [[- range $http_probe := .exporters.blackbox.http_probes ]] + - [[ $http_probe ]] + [[- end ]] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }} +[[- end ]] + +[[- if gt (len .exporters.blackbox.tcp_probes) 0 ]] + + # Blackbox Exporter TCP targets + - job_name: tcp_probe + metrics_path: /probe + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + params: + module: ["tcp_connect"] + static_configs: + [[- range $target := .exporters.blackbox.tcp_probes ]] + - [[ $target ]] + [[- end ]] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }} +[[- end ]] + + # Cluster services + - job_name: cluster-services + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: /secrets/prometheus.bundle.pem + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }} + datacenter: [[ .consul.datacenter ]] + relabel_configs: + + # Drop anything which is not Nomad, Consul or Vault + # Other services will be monitored with another job + - source_labels: [__meta_consul_service] + action: keep + regex: (nomad(\-client)?|consul|vault) + + - source_labels: [__meta_consul_service,__meta_consul_node] + regex: (.+);(.+) + replacement: ${1}/${2} + target_label: __metrics_path__ + + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: {{ range $idx, $instance := service "[[ .instance ]]-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }} + target_label: __address__ + + # Rewrite the job labels to the name of the service + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: ${1} + target_label: job + + # Rewrite the instance labels + - source_labels: [__meta_consul_node] + regex: (.+) + replacement: ${1} + target_label: instance + + # regular services discovered from the Consul Catalog + - job_name: consul-services + scheme: https + tls_config: + ca_file: /local/monitoring.ca.pem + cert_file: /secrets/prometheus.bundle.pem + key_file: 
/secrets/prometheus.bundle.pem + + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }} + datacenter: [[ .consul.datacenter ]] + + relabel_configs: + + # Drop sidecar's service to prevent duplicate. Sidecar themselves are treated in another job + - source_labels: [__meta_consul_service] + action: drop + regex: (.+)-sidecar-proxy + + # Drop Nomad, Consul and vault, already handled + - source_labels: [__meta_consul_service] + action: drop + regex: (nomad(\-client)?|consul|vault) + + # Only keep services having a metrics-port set + - source_labels: [__meta_consul_service_metadata_metrics_port] + regex: \d+ + action: keep + + # Get metrics path from metadata + - source_labels: [__meta_consul_service_metadata_metrics_path] + target_label: __metrics_path__ + regex: (.+) + + # Rewrite the scheme if needed + - source_labels: [__meta_consul_service_metadata_metrics_scheme] + regex: (https?) + replacement: ${1} + target_label: __scheme__ + + # Rewrite the address to use the metrics port + - source_labels: [__address__, __meta_consul_service_metadata_metrics_port] + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: ${1}:${2} + target_label: __address__ + + # Rewrite the job labels to the name of the service + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: ${1} + target_label: job + + # Set the default alloc to 0 if not set + - source_labels: [__meta_consul_service_metadata_alloc] + regex: ^$ + replacement: 0 + target_label: __meta_consul_service_metadata_alloc + + # Rewerite the instance label to be service-alloc + - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc] + regex: (.+);([a-zA-Z\d\-\.]+) + replacement: ${1}-${2} + target_label: instance + + # envoy sidecars from consul + - job_name: consul-envoy-services + consul_sd_configs: + - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 + scheme: http + token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }} + datacenter: [[ .consul.datacenter ]] + + relabel_configs: + + # Only keep sidecar-service with a envoy-metrics-port defined + - source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port] + action: keep + regex: (.+)-sidecar-proxy;\d+ + + # Rewrite the address to use the envoy-metrics-port + - source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port] + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: ${1}:${2} + target_label: __address__ + + # Rewrite the job label + - source_labels: [__meta_consul_service] + regex: (.+) + replacement: ${1} + target_label: job + + # Set the default alloc to 0 if not set + - source_labels: [__meta_consul_service_metadata_alloc] + regex: ^$ + replacement: 0 + target_label: __meta_consul_service_metadata_alloc + + # Rewerite the instance label to be service-alloc + - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc] + regex: (.+);([a-zA-Z\d\-\.]+) + replacement: ${1}-${2} + target_label: instance diff --git a/templates/prometheus/rules/blackbox.yml b/templates/prometheus/rules/blackbox.yml new file mode 100644 index 0000000..a38b34c --- /dev/null +++ b/templates/prometheus/rules/blackbox.yml @@ -0,0 +1,69 @@ +# vi: syntax=yaml + +groups: + +- name: Blackbox + rules: + + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + 
summary: Blackbox probe failed (instance {{ $labels.instance }}) + description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ $labels.instance }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) + description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateExpired + expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) + description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowHttp + expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) + description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/consul.yml b/templates/prometheus/rules/consul.yml new file mode 100644 index 0000000..c2ddecf --- /dev/null +++ b/templates/prometheus/rules/consul.yml @@ -0,0 +1,54 @@ +# vi: syntax=yaml + +groups: + +- name: ConsulExporter + + rules: + + - alert: ConsulServiceHealthcheckFailed + # Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available) + expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0' + for: 2m + labels: + severity: critical + annotations: + summary: Consul service healthcheck failed (service {{ $labels.service_name }}) + description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ConsulMissingMasterNode + expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1' + for: 0m + labels: + severity: critical + annotations: + summary: Consul missing master node (node {{ $labels.node }}) + description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ConsulAgentUnhealthy + expr: 
'consul_health_node_status{status="critical"} == 1' + for: 0m + labels: + severity: critical + annotations: + summary: Consul agent unhealthy (node {{ $labels.node }}) + description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ConsulServiceWarning + expr: 'consul_health_service_status{status="warning"} == 1' + for: 2m + labels: + severity: warning + annotations: + summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state + description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ConsulServiceCritical + expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1' + for: 2m + labels: + severity: critical + annotations: + summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state + description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + diff --git a/templates/prometheus/rules/jvm.yml b/templates/prometheus/rules/jvm.yml new file mode 100644 index 0000000..2fdde2b --- /dev/null +++ b/templates/prometheus/rules/jvm.yml @@ -0,0 +1,16 @@ +# vi: syntax=yaml + +groups: + +- name: JVM + + rules: + + - alert: JvmMemoryFillingUp + expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90' + for: 2m + labels: + severity: warning + annotations: + summary: JVM memory filling up (instance {{ $labels.instance }}) + description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/nomad.yml b/templates/prometheus/rules/nomad.yml new file mode 100644 index 0000000..99ea97e --- /dev/null +++ b/templates/prometheus/rules/nomad.yml @@ -0,0 +1,51 @@ +# vi: syntax=yaml + +groups: + +- name: Nomad + rules: + + - alert: NomadJobFailed + expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) + description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NomadJobLost + expr: 'nomad_nomad_job_summary_lost > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) + description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NomadJobQueued + expr: 'nomad_nomad_job_summary_queued > 0' + for: 3m + labels: + severity: warning + annotations: + summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) + description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NomadBlockedEvaluation + expr: 'nomad_nomad_blocked_evals_total_blocked > 0' + for: 2m + labels: + severity: warning + annotations: + summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) + description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NomadTaskOOM + expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1' 
+ for: 0m + labels: + severity: warning + annotations: + summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) + description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/ping.yml b/templates/prometheus/rules/ping.yml new file mode 100644 index 0000000..5ccde4b --- /dev/null +++ b/templates/prometheus/rules/ping.yml @@ -0,0 +1,25 @@ +# vi: syntax=yaml + +groups: + +- name: Ping + rules: + + - alert: HostDown + expr: ping_loss_ratio == 1 + for: 3m + labels: + severity: critical + annotations: + summary: Host down (host {{ $labels.target }}) + description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PingLoss + expr: | + avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1 + for: 0m + labels: + severity: warning + annotations: + summary: High packet loss (host {{ $labels.target }}) + description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/postgres.yml b/templates/prometheus/rules/postgres.yml new file mode 100644 index 0000000..4cac3c8 --- /dev/null +++ b/templates/prometheus/rules/postgres.yml @@ -0,0 +1,80 @@ +# vi: syntax=yaml + +groups: + +- name: Postgres + + rules: + + - alert: PostgresqlDown + expr: 'pg_up == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresTooManyRestarts + expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3 + for: 1m + labels: + severity: warning + annotations: + summary: Postgres too many restarts (instance {{ $labels.instance }}) + description: "Postgres server has restarted more than 3 times in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PostgresqlTooManyConnections
+    # Aggregate per instance so the labels match those of pg_settings_max_connections
+    expr: 'sum by (instance) (pg_stat_activity_count{datname!~"template.*|postgres"}) > min by (instance) (pg_settings_max_connections) * 0.8'
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql too many connections (instance {{ $labels.instance }})
+      description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PostgresqlDeadLocks
+    expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql dead locks (instance {{ $labels.instance }})
+      description: "PostgreSQL has deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+#  - alert: PostgresqlHighRollbackRate
+#    expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
+#    for: 0m
+#    labels:
+#      severity: warning
+#    annotations:
+#      summary: Postgresql high rollback rate (instance {{ $labels.instance }})
+#      description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PostgresqlHighRateStatementTimeout
+    expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
+      description: "Postgres transactions are showing a high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PostgresqlHighRateDeadlock
+    expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
+      description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PostgresqlTooManyLocksAcquired
+    expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
+      description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
diff --git a/templates/prometheus/rules/prometheus.yml b/templates/prometheus/rules/prometheus.yml
new file mode 100644
index 0000000..df5c5cd
--- /dev/null
+++ b/templates/prometheus/rules/prometheus.yml
@@ -0,0 +1,89 @@
+# vi: syntax=yaml
+
+groups:
+
+# Prometheus
+- name: Prometheus
+  rules:
+
+  - alert: PrometheusTargetMissing
+    expr: up{job!~"sftp-PR\\d+"} == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
+      description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
+      description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+      description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slower storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+      description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly because it exceeded the requested interval time.
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + diff --git a/templates/prometheus/rules/traefik.yml b/templates/prometheus/rules/traefik.yml new file mode 100644 index 0000000..9e27037 --- /dev/null +++ b/templates/prometheus/rules/traefik.yml @@ -0,0 +1,16 @@ +# vi: syntax=yaml + +groups: + +- name: Traefik + + rules: + + - alert: TraefikHighHttp5xxErrorRateService + expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + for: 1m + labels: + severity: critical + annotations: + summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }}) + description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/vault.yml b/templates/prometheus/rules/vault.yml new file mode 100644 index 0000000..6f6b9cf --- /dev/null +++ b/templates/prometheus/rules/vault.yml @@ -0,0 +1,16 @@ +# vi: syntax=yaml + +groups: + +- name: HashicorpVault + + rules: + + - alert: VaultSealed + expr: 'vault_core_unsealed == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Vault sealed (instance {{ $labels.instance }}) + description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/variables.yml b/variables.yml new file mode 100644 index 0000000..599d5ca --- /dev/null +++ b/variables.yml @@ -0,0 +1,127 @@ +--- + +instance: monitoring + +vault: + pki: + path: '[[ .prometheus.vault_pki ]]' + ou: Monitoring + +monitoring: + + exporters: + count: 1 + + ping: + version: 1.1.0 + image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1' + env: {} + resources: + cpu: 10 + memory: 30 + probes: [] + + blackbox: + version: 0.24.0 + image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1' + env: {} + resources: + cpu: 10 + memory: 50 + tcp_probes: [] + http_probes: [] + + consul: + version: 0.11.0 + image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2' + env: {} + resources: + cpu: 20 + memory: 64 + vault: + policies: + - '[[ .instance ]]-consul-exporter' + + cluster: + image: nginxinc/nginx-unprivileged:alpine + env: {} + resources: + cpu: 10 + memory: 18 + vault: + policies: + - '[[ .instance ]]-cluster-exporter' + - metrics + + prometheus: + + version: 2.50.1 + + count: 1 + + image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1' + + env: {} + + resources: + cpu: 200 + memory: 768 + + volumes: + data: + type: csi + source: '[[ .instance ]]-prometheus-data' + per_alloc: true + + vault: + policies: + - '[[ .instance ]]-prometheus' + + jobs: {} + alert_rules: {} + # alert_rules: + # postgres: + # url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml + + public_url: https://prometheus.example.org + traefik: + enabled: true + router: prometheus + + retention: 30d + + prometheus: + enabled: true + metrics_url: http://localhost:9090/metrics + + alertmanager: + 
count: 1 + version: 0.27.0 + image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1' + env: {} + resources: + cpu: 50 + memory: 80 + public_url: https://alerte.example.org + traefik: + enabled: true + router: alertmanager + strip_prefix: false + volumes: + data: + source: '[[ .instance ]]-alertmanager-data' + type: csi + per_alloc: true + prometheus: + metrics_url: http://127.0.0.1:9093/metrics + vault: + policies: + - metrics + - '[[ .instance ]]-alertmanager' + email: + from: alertmanager@[[ .consul.domain ]] + custom_config: "" + + +prometheus: + enabled: true diff --git a/vault/policies/metrics.hcl b/vault/policies/metrics.hcl new file mode 100644 index 0000000..ea40513 --- /dev/null +++ b/vault/policies/metrics.hcl @@ -0,0 +1,3 @@ +path "[[ .prometheus.vault_pki ]]/issue/metrics" { + capabilities = ["update"] +} diff --git a/vault/policies/monitoring-alertmanager.hcl b/vault/policies/monitoring-alertmanager.hcl new file mode 100644 index 0000000..82970a3 --- /dev/null +++ b/vault/policies/monitoring-alertmanager.hcl @@ -0,0 +1,8 @@ +[[- $c := merge .monitoring.alertmanager .monitoring . ]] +path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" { + capabilities = ["update"] +} + +path "[[ .vault.root ]]kv/service/[[ .instance ]]/alertmanager" { + capabilities = ["read"] +} diff --git a/vault/policies/monitoring-cluster-exporter.hcl b/vault/policies/monitoring-cluster-exporter.hcl new file mode 100644 index 0000000..7a4eb51 --- /dev/null +++ b/vault/policies/monitoring-cluster-exporter.hcl @@ -0,0 +1,20 @@ +[[- $c := merge .monitoring.exporters.cluster .monitoring.exporters .monitoring . ]] +# Read vault metrics +path "sys/metrics" { + capabilities = ["read", "list"] +} + +# Get a cert for Nomad +path "pki/nomad/issue/[[ .instance ]]-cluster-exporter" { + capabilities = ["update"] +} + +# Get a cert for Consul +path "pki/consul/issue/[[ .instance ]]-cluster-exporter" { + capabilities = ["update"] +} + +# Get a consul token +path "consul/creds/[[ .instance ]]-cluster-exporter" { + capabilities = ["read"] +} diff --git a/vault/policies/monitoring-consul-exporter.hcl b/vault/policies/monitoring-consul-exporter.hcl new file mode 100644 index 0000000..dfd6ce6 --- /dev/null +++ b/vault/policies/monitoring-consul-exporter.hcl @@ -0,0 +1,4 @@ +[[- $c := merge .monitoring.exporters.consul .monitoring.exporters .monitoring . ]] +path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" { + capabilities = ["read"] +} diff --git a/vault/policies/monitoring-prometheus.hcl b/vault/policies/monitoring-prometheus.hcl new file mode 100644 index 0000000..0f8e595 --- /dev/null +++ b/vault/policies/monitoring-prometheus.hcl @@ -0,0 +1,12 @@ +[[- $c := merge .monitoring.prometheus .monitoring . ]] +path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus" { + capabilities = ["update"] +} + +path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/prometheus" { + capabilities = ["read"] +} + +path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-prometheus" { + capabilities = ["read"] +}
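
Note: the consul-services scrape job in templates/prometheus/prometheus.yml only keeps Consul services that expose a metrics-port metadata key, and derives the scrape path, scheme and instance label from the other metadata keys. A minimal sketch of such a registration from a Nomad job spec (the service name, port label and values are illustrative, not part of this patch):

service {
  name = "myapp"
  port = "metrics"
  meta {
    # Required: port to scrape, matched by the keep rule on
    # __meta_consul_service_metadata_metrics_port
    metrics-port = "${NOMAD_HOST_PORT_metrics}"
    # Optional: path and scheme, defaulting to /metrics over https for this job
    metrics-path   = "/metrics"
    metrics-scheme = "https"
    # Optional: allocation index, used to build the instance label
    # as <service>-<alloc>; defaults to 0 when unset
    alloc = "${NOMAD_ALLOC_INDEX}"
  }
}

Likewise, the blackbox and ping probe targets are purely data-driven; a hypothetical override of the empty defaults from variables.yml could look like:

monitoring:
  exporters:
    blackbox:
      http_probes:
        - https://example.org
      tcp_probes:
        - ssh.example.org:22
    ping:
      probes:
        - 192.0.2.1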