Start: prometheus + alertmanager + exporters
This commit is contained in:
parent
b32fe575af
commit
65441a4a9e
|
@ -0,0 +1,43 @@
|
|||
- ~~Split exporters dans un job dédié (pour pouvoir tourner sur node_pool spécifique)~~
|
||||
- Créer monitoring-agent type system avec vector + node-exporter
|
||||
- images
|
||||
- ~~prometheus~~
|
||||
- ~~ping-exporter~~
|
||||
- ~~blackbox-exporter~~
|
||||
- ~~consul-exporter~~
|
||||
- vector
|
||||
- loki
|
||||
- grafana
|
||||
- nomad-vector-logger
|
||||
|
||||
- pki roles:
|
||||
- ~~monitoring -> prom~~
|
||||
- ~~consul -> prom~~
|
||||
- ~~monitoring -> am~~
|
||||
|
||||
- vault pol
|
||||
- ~~prometheus~~
|
||||
- ~~issue prom on monitoring~~
|
||||
- ~~issue prom on consul~~
|
||||
- ~~consul-exporter~~
|
||||
- ~~issue consul-exporter on consul~~
|
||||
- ~~alertmanager~~
|
||||
- ~~issue alertmanager on monitoring~~
|
||||
|
||||
- consul defaults & intentions
|
||||
- ~~prometheus~~
|
||||
- ~~alertmanager~~
|
||||
- loki
|
||||
|
||||
- tasks
|
||||
- ~~alertmanager~~
|
||||
- vector-aggregator
|
||||
- vector-agent (dans job agent)
|
||||
- loki (modulariser ou laisser en monolithique ?)
|
||||
- grafana
|
||||
- ~~cluster-metrics (job exporters)~~
|
||||
|
||||
- questions
|
||||
- prom rules: keep or move to a -conf bundle ?
|
||||
- ~~config alertes am (recipient + routing)~~
|
||||
- ~~http and tcp probes, as exporters are now in a dedicated job~~
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
|
||||
dependencies:
|
||||
- url: ../common.git
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,16 @@
|
|||
Kind = "service-intentions"
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
Sources = [
  {
    Name = "[[ (merge .monitoring.alertmanager .).traefik.instance ]]"
    Permissions = [
      {
        Action = "allow"
        HTTP {
          # Expose alertmanager under its public URL path (fall back to / when
          # the public URL has no path component).
          # Note: the else branch must be a template expression, not literal text.
          PathPrefix = "[[ if eq "" (urlParse .monitoring.alertmanager.public_url).Path ]]/[[ else ]][[ (urlParse .monitoring.alertmanager.public_url).Path ]][[ end ]]"
          Methods    = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
        }
      }
    ]
  }
]
|
|
@ -0,0 +1,34 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
|
||||
Sources = [
|
||||
{
|
||||
Name = "[[ (merge .monitoring.prometheus .).traefik.instance ]]"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
Methods = ["GET", "HEAD", "POST"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
|
||||
Permissions = [
|
||||
{
|
||||
# Deny access to the admin API from Grafana
|
||||
Action = "deny"
|
||||
HTTP {
|
||||
PathPrefix = "/api/v1/admin"
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/api/v1"
|
||||
Methods = ["GET", "HEAD", "POST", "PUT"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,9 @@
|
|||
agent_prefix "" {
|
||||
policy = "read"
|
||||
}
|
||||
node_prefix "" {
|
||||
policy = "read"
|
||||
}
|
||||
service_prefix "" {
|
||||
policy = "read"
|
||||
}
|
Binary file not shown.
|
@ -0,0 +1,3 @@
|
|||
# monitoring
|
||||
|
||||
Monitoring stack
|
|
@ -0,0 +1,43 @@
|
|||
- ~~Split exporters dans un job dédié (pour pouvoir tourner sur node_pool spécifique)~~
|
||||
- Créer monitoring-agent type system avec vector + node-exporter
|
||||
- images
|
||||
- prometheus
|
||||
- ping-exporter
|
||||
- blackbox-exporter
|
||||
- consul-exporter
|
||||
- vector
|
||||
- loki
|
||||
- grafana
|
||||
- nomad-vector-logger
|
||||
|
||||
- pki roles:
|
||||
- ~~monitoring -> prom~~
|
||||
- ~~consul -> prom~~
|
||||
- ~~monitoring -> am~~
|
||||
|
||||
- vault pol
|
||||
- prometheus
|
||||
- ~~issue prom on monitoring~~
|
||||
- ~~issue prom on consul~~
|
||||
- consul-exporter
|
||||
- issue consul-exporter on consul
|
||||
- alertmanager
|
||||
- ~~issue alertmanager on monitoring~~
|
||||
|
||||
- consul defaults & intentions
|
||||
- ~~prometheus~~
|
||||
- ~~alertmanager~~
|
||||
- loki
|
||||
|
||||
- tasks
|
||||
- ~~alertmanager~~
|
||||
- vector-aggregator
|
||||
- vector-agent (dans job agent)
|
||||
- loki (modulariser ou laisser en monolithique ?)
|
||||
- grafana
|
||||
- cluster-metrics (job exporters)
|
||||
|
||||
- questions
|
||||
- prom rules: keep or move to a -conf bundle ?
|
||||
- ~~config alertes am (recipient + routing)~~
|
||||
- http and tcp probes, as exporters are now in a dedicated job
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "monitoring-alertmanager"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "monitoring-prometheus"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,16 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "monitoring-alertmanager"
|
||||
Sources = [
|
||||
{
|
||||
Name = "traefik"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/"
|
||||
Methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "PATCH"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,34 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "monitoring-prometheus"
|
||||
Sources = [
|
||||
{
|
||||
Name = "traefik"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
Methods = ["GET", "HEAD", "POST"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "monitoring-grafana"
|
||||
Permissions = [
|
||||
{
|
||||
# Deny access to the admin API from Grafana
|
||||
Action = "deny"
|
||||
HTTP {
|
||||
PathPrefix = "/api/v1/admin"
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/api/v1"
|
||||
Methods = ["GET", "HEAD", "POST", "PUT"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,9 @@
|
|||
agent_prefix "" {
|
||||
policy = "read"
|
||||
}
|
||||
node_prefix "" {
|
||||
policy = "read"
|
||||
}
|
||||
service_prefix "" {
|
||||
policy = "read"
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG AM_VERSION=0.27.0
|
||||
|
||||
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
tar \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
ENV PATH=/opt/alertmanager:$PATH
|
||||
|
||||
COPY --from=builder /opt/alertmanager /opt/alertmanager
|
||||
# Create a dedicated unprivileged user and a private /data directory
# for alertmanager state (silences, notification log).
RUN set -eux &&\
    addgroup -g 9093 alertmanager &&\
    adduser --system \
            --disabled-password \
            --uid 9093 \
            --ingroup alertmanager \
            --home /opt/alertmanager \
            --no-create-home \
            --shell /sbin/nologin \
            alertmanager &&\
    mkdir /data &&\
    chown alertmanager:alertmanager /data &&\
    chmod 700 /data
|
||||
|
||||
WORKDIR /opt/alertmanager
|
||||
USER alertmanager
|
||||
EXPOSE 9093
|
||||
CMD [ "alertmanager", \
|
||||
"--config.file=/opt/alertmanager/alertmanager.yml", \
|
||||
"--storage.path=/data" ]
|
|
@ -0,0 +1,29 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG BLACKBOX_EXPORTER_VERSION=0.24.0
|
||||
|
||||
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add tar gzip &&\
|
||||
cd /tmp &&\
|
||||
grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mkdir blackbox &&\
|
||||
mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
ENV BLACKBOX_CONF=/etc/blackbox.yml
|
||||
|
||||
COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter
|
||||
|
||||
RUN set -eux &&\
|
||||
apk --no-cache upgrade &&\
|
||||
apk --no-cache add ca-certificates curl
|
||||
|
||||
COPY root/ /
|
||||
|
||||
EXPOSE 9195
|
||||
CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"]
|
|
@ -0,0 +1,65 @@
|
|||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: "ip4"
|
||||
http_ssl_no_check:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: "ip4"
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
http_post_2xx:
|
||||
prober: http
|
||||
http:
|
||||
method: POST
|
||||
preferred_ip_protocol: "ip4"
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
pop3s_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- expect: "^+OK"
|
||||
tls: true
|
||||
tls_config:
|
||||
insecure_skip_verify: false
|
||||
grpc:
|
||||
prober: grpc
|
||||
grpc:
|
||||
tls: true
|
||||
preferred_ip_protocol: "ip4"
|
||||
grpc_plain:
|
||||
prober: grpc
|
||||
grpc:
|
||||
preferred_ip_protocol: "ip4"
|
||||
tls: false
|
||||
service: "service1"
|
||||
ssh_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- expect: "^SSH-2.0-"
|
||||
- send: "SSH-2.0-blackbox-ssh-check"
|
||||
irc_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- send: "NICK prober"
|
||||
- send: "USER prober prober prober :prober"
|
||||
- expect: "PING :([^ ]+)"
|
||||
send: "PONG ${1}"
|
||||
- expect: "^:[^ ]+ 001"
|
||||
icmp:
|
||||
prober: icmp
|
||||
icmp_ttl5:
|
||||
prober: icmp
|
||||
timeout: 5s
|
||||
icmp:
|
||||
ttl: 5
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG CONSUL_EXPORTER_VERSION=0.11.0
|
||||
|
||||
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add tar gzip &&\
|
||||
cd /tmp &&\
|
||||
grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter
|
||||
|
||||
USER 9107
|
||||
EXPOSE 9107
|
||||
CMD ["consul_exporter"]
|
|
@ -0,0 +1,24 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
ARG PING_EXPORTER_VERSION=1.1.0
|
||||
|
||||
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp
|
||||
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
tar \
|
||||
gzip \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\
|
||||
tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\
|
||||
mv ping_exporter /usr/local/bin/
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter
|
||||
|
||||
EXPOSE 9427
|
||||
CMD ["ping_exporter", "--config.path=/config.yml"]
|
|
@ -0,0 +1,4 @@
|
|||
# targets:
|
||||
# - foo.bar
|
||||
# - acme.com
|
||||
targets: []
|
|
@ -0,0 +1,48 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG PROM_VERSION=2.50.1
|
||||
|
||||
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
curl \
|
||||
tar \
|
||||
ca-certificates \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
|
||||
rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
ENV PATH=/opt/prometheus:$PATH
|
||||
|
||||
COPY --from=builder /opt/prometheus /opt/prometheus
|
||||
# Create a dedicated unprivileged user and a private /data directory
# for the prometheus TSDB.
RUN set -eux &&\
    addgroup -g 9090 prometheus &&\
    adduser --system \
            --disabled-password \
            --uid 9090 \
            --ingroup prometheus \
            --home /opt/prometheus \
            --no-create-home \
            --shell /sbin/nologin \
            prometheus &&\
    mkdir /data &&\
    chown prometheus:prometheus /data &&\
    chmod 700 /data
|
||||
|
||||
WORKDIR /opt/prometheus
|
||||
USER prometheus
|
||||
EXPOSE 9090
|
||||
CMD [ "/opt/prometheus/prometheus", \
|
||||
"--config.file=/opt/prometheus/prometheus.yml", \
|
||||
"--storage.tsdb.path=/data", \
|
||||
"--storage.tsdb.wal-compression", \
|
||||
"--storage.tsdb.wal-compression-type=zstd", \
|
||||
"--web.console.libraries=/opt/prometheus/console_libraries", \
|
||||
"--web.console.templates=/opt/prometheus/consoles" ]
|
|
@ -0,0 +1,17 @@
|
|||
#!/bin/sh
|
||||
# vim: syntax=sh
|
||||
|
||||
vault write consul/roles/monitoring-prometheus \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="monitoring-prometheus"
|
||||
|
||||
vault write consul/roles/monitoring-consul-exporter \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="monitoring-prometheus"
|
||||
|
||||
vault write consul/roles/monitoring-cluster-exporter \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="monitoring-prometheus"
|
|
@ -0,0 +1,156 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
#!/bin/sh
|
||||
|
||||
# vim: syntax=sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
TMP=$(mktemp -d)
|
||||
|
||||
INITIAL_SETUP=false
|
||||
if [ "$(vault secrets list -format json | jq -r '.["pki/monitoring/"].type')" != "pki" ]; then
|
||||
INITIAL_SETUP=true
|
||||
fi
|
||||
|
||||
if [ "${INITIAL_SETUP}" = "true" ]; then
|
||||
# Enable the secret engine
|
||||
echo "Mounting new PKI secret engine at pki/monitoring"
|
||||
vault secrets enable -path=pki/monitoring pki
|
||||
else
|
||||
echo "Secret engine already mounted at pki/monitoring"
|
||||
fi
|
||||
|
||||
# Configure max-lease-ttl
|
||||
echo "Tune PKI secret engine"
|
||||
vault secrets tune -max-lease-ttl=131400h pki/monitoring
|
||||
|
||||
# Configure PKI URLs
|
||||
echo "Configure URL endpoints"
|
||||
# Vault API endpoints live under /v1/<mount-path>, so the separator between
# "v1" and the mount path "pki/monitoring" is required.
vault write pki/monitoring/config/urls \
    issuing_certificates="${VAULT_ADDR}/v1/pki/monitoring/ca" \
    crl_distribution_points="${VAULT_ADDR}/v1/pki/monitoring/crl" \
    ocsp_servers="${VAULT_ADDR}/v1/pki/monitoring/ocsp"
|
||||
|
||||
# Same as above: the cluster path must point at /v1/pki/monitoring.
vault write pki/monitoring/config/cluster \
    path="${VAULT_ADDR}/v1/pki/monitoring"
|
||||
|
||||
vault write pki/monitoring/config/crl \
|
||||
auto_rebuild=true \
|
||||
enable_delta=true
|
||||
|
||||
# Configure tidy
|
||||
echo "Configure auto tidy for the PKI"
|
||||
vault write pki/monitoring/config/auto-tidy \
|
||||
enabled=true \
|
||||
tidy_cert_store=true \
|
||||
tidy_expired_issuers=true \
|
||||
tidy_revocation_queue=true \
|
||||
tidy_revoked_cert_issuer_associations=true \
|
||||
tidy_revoked_certs=true \
|
||||
tidy_acme=true \
|
||||
tidy_cross_cluster_revoked_certs=true \
|
||||
tidy_move_legacy_ca_bundle=true \
|
||||
maintain_stored_certificate_counts=true
|
||||
|
||||
if [ "${INITIAL_SETUP}" = "true" ]; then
|
||||
# Generate an internal CA
|
||||
echo "Generating an internal CA"
|
||||
vault write -format=json pki/monitoring/intermediate/generate/internal \
|
||||
common_name="monitoring Certificate Authority" \
|
||||
ttl="131400h" \
|
||||
organization="ACME Corp" \
|
||||
ou="Monitoring" \
|
||||
locality="FooBar Ville" \
|
||||
key_type=rsa \
|
||||
key_bits=4096 \
|
||||
| jq -r '.data.csr' > ${TMP}/monitoring.csr
|
||||
|
||||
|
||||
|
||||
# Sign this PKI with a root PKI
|
||||
echo "Signing the new CA with the authority from pki/root"
|
||||
vault write -format=json pki/root/root/sign-intermediate \
|
||||
csr=@${TMP}/monitoring.csr \
|
||||
format=pem_bundle \
|
||||
ttl="131400h" \
|
||||
| jq -r '.data.certificate' > ${TMP}/monitoring.crt
|
||||
|
||||
# Update the intermediate CA with the signed one
|
||||
echo "Update the new CA with the signed version"
|
||||
vault write pki/monitoring/intermediate/set-signed \
|
||||
certificate=@${TMP}/monitoring.crt
|
||||
|
||||
|
||||
fi
|
||||
|
||||
# Remove temp files
|
||||
echo "Cleaning temp files"
|
||||
rm -rf ${TMP}
|
||||
|
||||
|
||||
# Create a role for alertmanager
|
||||
vault write pki/monitoring/roles/monitoring-alertmanager \
|
||||
allowed_domains="monitoring" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=true \
|
||||
server_flag=true \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="Monitoring"
|
||||
|
||||
# Create a role for prometheus (which will only be a client, for AlertManager)
|
||||
vault write pki/monitoring/roles/monitoring-prometheus \
|
||||
allowed_domains="monitoring" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="Monitoring"
|
||||
|
||||
# Create a role for metrics exporters (server only)
|
||||
vault write pki/monitoring/roles/metrics \
|
||||
allowed_domains="monitoring" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=true \
|
||||
server_flag=true \
|
||||
client_flag=false \
|
||||
allow_wildcard_certificates=false \
|
||||
require_cn=false \
|
||||
max_ttl=72h \
|
||||
no_store=true \
|
||||
ou="Monitoring"
|
||||
|
||||
# Create a role on the Nomad PKI for the cluster exporter
|
||||
vault write pki/nomad/roles/monitoring-cluster-exporter \
|
||||
allowed_domains='nomad.consul' \
|
||||
allow_subdomains=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
||||
# Create a role on the Consul PKI for the cluster exporter
|
||||
# Create a role on the Consul PKI for the cluster exporter.
# Vault PKI role parameters are server_flag / client_flag (singular),
# matching the other role definitions in this script.
vault write pki/consul/roles/monitoring-cluster-exporter \
    allowed_domains="consul.consul" \
    allow_bare_domains=false \
    allow_subdomains=true \
    allow_wildcard_certificates=false \
    max_ttl=168h \
    server_flag=false \
    client_flag=true \
    ou="Cluster metrics exporter"
|
|
@ -0,0 +1,419 @@
|
|||
job "monitoring-exporters" {
|
||||
|
||||
datacenters = ["dc1"]
|
||||
region = "global"
|
||||
|
||||
|
||||
# Run exporters. Use a separated job so exporters can run in a distinct node_pool
|
||||
group "exporters" {
|
||||
|
||||
count = 1
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "ping" {}
|
||||
port "blackbox" {}
|
||||
port "consul" {}
|
||||
port "cluster" {}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "monitoring-ping-exporter"
|
||||
port = "ping"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
metrics-port = "${NOMAD_HOST_PORT_ping}"
|
||||
}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "monitoring-blackbox-exporter"
|
||||
port = "blackbox"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
service {
  name = "monitoring-consul-exporter"
  # Must register on the "consul" port (the meta below already advertises
  # NOMAD_HOST_PORT_consul); "ping" belongs to the ping-exporter service.
  port = "consul"
  meta {
    alloc        = "${NOMAD_ALLOC_INDEX}"
    metrics-port = "${NOMAD_HOST_PORT_consul}"
  }
}
|
||||
|
||||
service {
|
||||
name = "monitoring-cluster-exporter"
|
||||
port = "cluster"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
# Export consul services status to prometheus
|
||||
task "consul-exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "danielberteaud/consul-exporter:0.11.0-2"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
command = "/local/consul-exporter"
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
data = <<_EOT
|
||||
LANG=fr_FR.utf8
|
||||
TZ=Europe/Paris
|
||||
_EOT
|
||||
destination = "secrets/.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["monitoring-consul-exporter"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
exec consul_exporter \
|
||||
--web.listen-address=127.0.0.1:9107 \
|
||||
--consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \
|
||||
--consul.request-limit=20
|
||||
|
||||
_EOT
|
||||
destination = "local/consul-exporter"
|
||||
perms = 755
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
CONSUL_HTTP_TOKEN={{ with secret "consul/creds/monitoring-consul-exporter" }}{{ .Data.token }}{{ end }}
|
||||
_EOT
|
||||
destination = "secrets/.consul.env"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 20
|
||||
memory = 64
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
# The cluster metrics exposes prometheus metrics from the various nodes of the cluster
|
||||
# Nomad, Consul and Vault
|
||||
# It also exposes the other exporters metrics with mTLS
|
||||
task "cluster-metrics-proxy" {
|
||||
driver = "docker"
|
||||
user = 8685
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
# Mount the config in nginx conf dir
|
||||
volumes = [
|
||||
"secrets/metrics.conf:/etc/nginx/conf.d/default.conf"
|
||||
]
|
||||
mount {
|
||||
type = "tmpfs"
|
||||
target = "/tmp"
|
||||
tmpfs_options {
|
||||
size = 3000000
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["monitoring-cluster-exporter", "metrics"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
# This is the main nginx configuration, which will proxypass requests to the real metrics endpoints
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
||||
# Cluster exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
set $consul_token "{{ with secret "consul/creds/monitoring-cluster-exporter" }}{{ .Data.token }}{{ end }}";
|
||||
|
||||
{{- range service "nomad-client" }}
|
||||
location /nomad-client/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
|
||||
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad;
|
||||
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "nomad" }}
|
||||
{{- if .Tags | contains "http" }}
|
||||
location /nomad/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
|
||||
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad;
|
||||
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "consul" }}
|
||||
location /consul/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus;
|
||||
proxy_set_header X-Consul-Token $consul_token;
|
||||
proxy_ssl_certificate /secrets/consul_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/consul_client_bundle.pem;
|
||||
proxy_ssl_verify off;
|
||||
proxy_ssl_trusted_certificate /local/consul_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "vault" }}
|
||||
location /vault/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_trusted_certificate /etc/ssl/cert.pem;
|
||||
proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
|
||||
proxy_set_header X-Real-IP "$remote_addr";
|
||||
proxy_set_header X-Forwarded-Proto "$scheme";
|
||||
proxy_set_header X-Scheme "$scheme";
|
||||
proxy_set_header X-Forwarded-Host "$host";
|
||||
proxy_set_header X-Forwarded-Port "$server_port";
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
location / {
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
}
|
||||
}
|
||||
|
||||
# Ping exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9427;
|
||||
}
|
||||
}
|
||||
|
||||
# Blackbox exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:9115;
|
||||
}
|
||||
}
|
||||
|
||||
# Consul exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9107;
|
||||
}
|
||||
}
|
||||
|
||||
_EOT
|
||||
destination = "secrets/metrics.conf"
|
||||
perms = "0440"
|
||||
uid = 108685
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Get certificate to add mTLS to metrics endpoints
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Get the CA for the monitoring PKI
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
# Get a Nomad client certificate
|
||||
# Get a Nomad client certificate.
# consul-template's pkiCert returns the certificate fields directly
# (.Cert / .Key), not wrapped in .Data — same usage as the metrics
# bundle template above.
template {
  data            = <<_EOT
{{- with pkiCert "pki/nomad/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.nomad.consul" "ttl=24h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
  destination   = "secrets/nomad_client_bundle.pem"
  perms         = "0400"
  uid           = 108685
  gid           = 100000
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
|
||||
|
||||
# The CA chain for Nomad
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/nomad_ca.crt"
|
||||
}
|
||||
|
||||
# Same for Consul
|
||||
# Same for Consul: pkiCert exposes .Cert / .Key directly (no .Data wrapper).
template {
  data            = <<_EOT
{{- with pkiCert "pki/consul/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.consul.consul" "ttl=24h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
  destination   = "secrets/consul_client_bundle.pem"
  perms         = "0400"
  uid           = 108685
  gid           = 100000
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/consul_ca.crt"
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 18
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
path "pki/monitoring/issue/metrics" {
|
||||
capabilities = ["update"]
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
|
||||
path "pki/monitoring/issue/monitoring-alertmanager" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
path "kv/service/monitoring/alertmanager" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
|
||||
# Read vault metrics
|
||||
path "sys/metrics" {
|
||||
capabilities = ["read", "list"]
|
||||
}
|
||||
|
||||
# Get a cert for Nomad
|
||||
path "pki/nomad/issue/monitoring-cluster-exporter" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
# Get a cert for Consul
|
||||
path "pki/consul/issue/monitoring-cluster-exporter" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
# Get a consul token
|
||||
path "consul/creds/monitoring-cluster-exporter" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
|
||||
path "consul/creds/monitoring-consul-exporter" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
path "pki/monitoring/issue/monitoring-prometheus" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
path "kv/service/monitoring/prometheus" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "consul/creds/monitoring-prometheus" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG AM_VERSION=[[ .monitoring.alertmanager.version ]]
|
||||
|
||||
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
tar \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "alertmanager-${AM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xzf alertmanager-${AM_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv alertmanager-${AM_VERSION}.linux-amd64 /opt/alertmanager
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
ENV PATH=/opt/alertmanager:$PATH
|
||||
|
||||
COPY --from=builder /opt/alertmanager /opt/alertmanager
|
||||
RUN set -eux &&\
|
||||
addgroup -g 9093 alertmanager &&\
|
||||
adduser --system \
|
||||
--disabled-password \
|
||||
--uid 9093 \
|
||||
--ingroup alertmanager \
|
||||
--home /opt/alertmanager \
|
||||
--no-create-home \
|
||||
--shell /sbin/nologin \
|
||||
alertmanager &&\
|
||||
mkdir /data &&\
|
||||
chown alertmanager:alertmanager /data &&\
|
||||
chmod 700 data
|
||||
|
||||
WORKDIR /opt/alertmanager
|
||||
USER alertmanager
|
||||
EXPOSE 9093
|
||||
CMD [ "alertmanager", \
|
||||
"--config.file=/opt/alertmanager/alertmanager.yml", \
|
||||
"--storage.path=/data" ]
|
|
@ -0,0 +1,29 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG BLACKBOX_EXPORTER_VERSION=[[ .monitoring.exporters.blackbox.version ]]
|
||||
|
||||
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/blackbox_exporter/releases/download/v${BLACKBOX_EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add tar gzip &&\
|
||||
cd /tmp &&\
|
||||
grep "blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvf blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mkdir blackbox &&\
|
||||
mv blackbox_exporter-${BLACKBOX_EXPORTER_VERSION}.linux-amd64/blackbox_exporter /usr/local/bin/blackbox_exporter
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
ENV BLACKBOX_CONF=/etc/blackbox.yml
|
||||
|
||||
COPY --from=builder /usr/local/bin/blackbox_exporter /usr/local/bin/blackbox_exporter
|
||||
|
||||
RUN set -eux &&\
|
||||
apk --no-cache upgrade &&\
|
||||
apk --no-cache add ca-certificates curl
|
||||
|
||||
COPY root/ /
|
||||
|
||||
EXPOSE 9195
|
||||
CMD ["sh", "-c", "exec blackbox_exporter --config.file=${BLACKBOX_CONF}"]
|
|
@ -0,0 +1,65 @@
|
|||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: "ip4"
|
||||
http_ssl_no_check:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: "ip4"
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
http_post_2xx:
|
||||
prober: http
|
||||
http:
|
||||
method: POST
|
||||
preferred_ip_protocol: "ip4"
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
pop3s_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- expect: "^+OK"
|
||||
tls: true
|
||||
tls_config:
|
||||
insecure_skip_verify: false
|
||||
grpc:
|
||||
prober: grpc
|
||||
grpc:
|
||||
tls: true
|
||||
preferred_ip_protocol: "ip4"
|
||||
grpc_plain:
|
||||
prober: grpc
|
||||
grpc:
|
||||
preferred_ip_protocol: "ip4"
|
||||
tls: false
|
||||
service: "service1"
|
||||
ssh_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- expect: "^SSH-2.0-"
|
||||
- send: "SSH-2.0-blackbox-ssh-check"
|
||||
irc_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
query_response:
|
||||
- send: "NICK prober"
|
||||
- send: "USER prober prober prober :prober"
|
||||
- expect: "PING :([^ ]+)"
|
||||
send: "PONG ${1}"
|
||||
- expect: "^:[^ ]+ 001"
|
||||
icmp:
|
||||
prober: icmp
|
||||
icmp_ttl5:
|
||||
prober: icmp
|
||||
timeout: 5s
|
||||
icmp:
|
||||
ttl: 5
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG CONSUL_EXPORTER_VERSION=[[ .monitoring.exporters.consul.version ]]
|
||||
|
||||
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/consul_exporter/releases/download/v${CONSUL_EXPORTER_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add tar gzip &&\
|
||||
cd /tmp &&\
|
||||
grep "consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvf consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv consul_exporter-${CONSUL_EXPORTER_VERSION}.linux-amd64/consul_exporter /usr/local/bin/consul_exporter
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
COPY --from=builder /usr/local/bin/consul_exporter /usr/local/bin/consul_exporter
|
||||
|
||||
USER 9107
|
||||
EXPOSE 9107
|
||||
CMD ["consul_exporter"]
|
|
@ -0,0 +1,24 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
ARG PING_EXPORTER_VERSION=[[ .monitoring.exporters.ping.version ]]
|
||||
|
||||
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz /tmp
|
||||
ADD https://github.com/czerwonk/ping_exporter/releases/download/${PING_EXPORTER_VERSION}/ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
tar \
|
||||
gzip \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz" ping_exporter_${PING_EXPORTER_VERSION}_checksums.txt | sha256sum -c &&\
|
||||
tar xvf ping_exporter_${PING_EXPORTER_VERSION}_linux_amd64.tar.gz &&\
|
||||
mv ping_exporter /usr/local/bin/
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
COPY --from=builder /usr/local/bin/ping_exporter /usr/local/bin/ping_exporter
|
||||
|
||||
EXPOSE 9427
|
||||
CMD ["ping_exporter", "--config.path=/config.yml"]
|
|
@ -0,0 +1,4 @@
|
|||
# targets:
|
||||
# - foo.bar
|
||||
# - acme.com
|
||||
targets: []
|
|
@ -0,0 +1,48 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG PROM_VERSION=[[ .monitoring.prometheus.version ]]
|
||||
|
||||
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
|
||||
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add \
|
||||
curl \
|
||||
tar \
|
||||
ca-certificates \
|
||||
&&\
|
||||
cd /tmp &&\
|
||||
grep "prometheus-${PROM_VERSION}.linux-amd64.tar.gz" sha256sums.txt | sha256sum -c &&\
|
||||
tar xvzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
|
||||
rm -f prometheus-${PROM_VERSION}.linux-amd64.tar.gz &&\
|
||||
mv prometheus-${PROM_VERSION}.linux-amd64 /opt/prometheus
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
ENV PATH=/opt/prometheus:$PATH
|
||||
|
||||
COPY --from=builder /opt/prometheus /opt/prometheus
|
||||
RUN set -eux &&\
|
||||
addgroup -g 9090 prometheus &&\
|
||||
adduser --system \
|
||||
--disabled-password \
|
||||
--uid 9090 \
|
||||
--ingroup prometheus \
|
||||
--home /opt/prometheus \
|
||||
--no-create-home \
|
||||
--shell /sbin/nologin \
|
||||
prometheus &&\
|
||||
mkdir /data &&\
|
||||
chown prometheus.prometheus /data &&\
|
||||
chmod 700 /data
|
||||
|
||||
WORKDIR /opt/prometheus
|
||||
USER prometheus
|
||||
EXPOSE 9090
|
||||
CMD [ "/opt/prometheus/prometheus", \
|
||||
"--config.file=/opt/prometheus/prometheus.yml", \
|
||||
"--storage.tsdb.path=/data", \
|
||||
"--storage.tsdb.wal-compression", \
|
||||
"--storage.tsdb.wal-compression-type=zstd", \
|
||||
"--web.console.libraries=/opt/prometheus/console_libraries", \
|
||||
"--web.console.templates=/opt/prometheus/consoles" ]
|
|
@ -0,0 +1,17 @@
|
|||
#!/bin/sh
|
||||
# vim: syntax=sh
|
||||
|
||||
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-prometheus \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="[[ .instance ]]-prometheus"
|
||||
|
||||
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-consul-exporter \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="[[ .instance ]]-prometheus"
|
||||
|
||||
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-cluster-exporter \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
consul_policies="[[ .instance ]]-prometheus"
|
|
@ -0,0 +1,69 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
[[ $c := merge .monitoring . ]]
|
||||
[[ template "common/vault.mkpki.sh" $c ]]
|
||||
|
||||
# Create a role for alertmanager
|
||||
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-alertmanager \
|
||||
allowed_domains="[[ .instance ]]" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=true \
|
||||
server_flag=true \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="[[ $c.vault.pki.ou ]]"
|
||||
|
||||
# Create a role for prometheus (which will only be a client, for AlertManager)
|
||||
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
|
||||
allowed_domains="[[ .instance ]]" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="[[ $c.vault.pki.ou ]]"
|
||||
|
||||
# Create a role for metrics exporters (server only)
|
||||
vault write [[ $c.vault.pki.path ]]/roles/metrics \
|
||||
allowed_domains="[[ .instance ]]" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=true \
|
||||
server_flag=true \
|
||||
client_flag=false \
|
||||
allow_wildcard_certificates=false \
|
||||
require_cn=false \
|
||||
max_ttl=72h \
|
||||
no_store=true \
|
||||
ou="[[ $c.vault.pki.ou ]]"
|
||||
|
||||
# Create a role on the Nomad PKI for the cluster exporter
|
||||
vault write pki/nomad/roles/[[ .instance ]]-cluster-exporter \
|
||||
allowed_domains='nomad.[[ .consul.domain ]]' \
|
||||
allow_subdomains=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
||||
# Create a role on the Consul PKI for the cluster exporter
|
||||
vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \
|
||||
allowed_domains="consul.[[ .consul.domain ]]" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
server_flags=false \
|
||||
client_flags=true \
|
||||
ou="Cluster metrics exporter"
|
|
@ -0,0 +1,253 @@
|
|||
job "[[ .instance ]]-exporters" {
|
||||
|
||||
[[- $c := merge .monitoring.exporters . ]]
|
||||
[[ template "common/job_start" $c ]]
|
||||
|
||||
# Run exporters. Use a separated job so exporters can run in a distinct node_pool
|
||||
group "exporters" {
|
||||
|
||||
count = [[ $c.count ]]
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "ping" {}
|
||||
port "blackbox" {}
|
||||
port "consul" {}
|
||||
port "cluster" {}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-ping-exporter[[ .consul.suffix ]]"
|
||||
port = "ping"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
metrics-port = "${NOMAD_HOST_PORT_ping}"
|
||||
}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-blackbox-exporter[[ .consul.suffix ]]"
|
||||
port = "blackbox"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-consul-exporter[[ .consul.suffix ]]"
|
||||
port = "ping"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
metrics-port = "${NOMAD_HOST_PORT_consul}"
|
||||
}
|
||||
}
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]"
|
||||
port = "cluster"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
[[- if gt (len $c.ping.probes) 0 ]]
|
||||
[[- $e := merge $c.ping $c ]]
|
||||
# Ping exporter will collect ICMP ping stats and expose them
|
||||
# Note : we could do it with blackbox, but as pings require privileges, it's better to grant it
|
||||
# to a smaller, more focused container. This one only handle icmp pings check, and only from the configuration file
|
||||
task "ping-exporter" {
|
||||
driver = "[[ $e.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $e.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
# Pings require privileges
|
||||
privileged = true
|
||||
userns_mode = "host"
|
||||
command = "ping_exporter"
|
||||
args = [
|
||||
"--web.listen-address=127.0.0.1:9427",
|
||||
"--config.path=/local/config.yml"
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/file_env" $e ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/ping_exporter/config.yml" $e ]]
|
||||
_EOT
|
||||
destination = "local/config.yml"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $e ]]
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
[[- if or (gt (len $c.blackbox.tcp_probes) 0) (gt (len $c.blackbox.http_probes) 0) ]]
|
||||
[[- $e := merge $c.blackbox $c ]]
|
||||
# Blackbox exporter will probe http/tcp targets and expose them
|
||||
# for prometheus
|
||||
task "blackbox-exporter" {
|
||||
driver = "[[ $e.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $e.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
}
|
||||
|
||||
[[ template "common/file_env" $e ]]
|
||||
[[ template "common/resources" $e ]]
|
||||
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
# Export consul services status to prometheus
|
||||
task "consul-exporter" {
|
||||
[[- $e := merge $c.consul $c ]]
|
||||
driver = "[[ $e.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $e.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
command = "/local/consul-exporter"
|
||||
}
|
||||
|
||||
[[ template "common/file_env" $e ]]
|
||||
[[ template "common/vault.policies" $e ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/consul-exporter/start.sh" $e ]]
|
||||
_EOT
|
||||
destination = "local/consul-exporter"
|
||||
perms = 755
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
CONSUL_HTTP_TOKEN={{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" }}{{ .Data.token }}{{ end }}
|
||||
_EOT
|
||||
destination = "secrets/.consul.env"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
[[ template "common/resources" $e ]]
|
||||
}
|
||||
|
||||
# The cluster metrics exposes prometheus metrics from the various nodes of the cluster
|
||||
# Nomad, Consul and Vault
|
||||
# It also exposes the other exporters metrics with mTLS
|
||||
task "cluster-metrics-proxy" {
|
||||
[[- $e := merge $c.cluster $c ]]
|
||||
driver = "[[ $e.nomad.driver ]]"
|
||||
user = 8685
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
config {
|
||||
image = "[[ $e.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
# Mount the config in nginx conf dir
|
||||
volumes = [
|
||||
"secrets/metrics.conf:/etc/nginx/conf.d/default.conf"
|
||||
]
|
||||
[[ template "common/tmpfs" "/tmp" ]]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $e ]]
|
||||
|
||||
# This is the main nginx configuration, which will proxypass requests to the real metrics endpoints
|
||||
template {
|
||||
data =<<_EOT
|
||||
[[ template "monitoring/cluster-exporter/nginx.conf" $e ]]
|
||||
_EOT
|
||||
destination = "secrets/metrics.conf"
|
||||
perms = "0440"
|
||||
uid = 108685
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Get certificate to add mTLS to metrics endpoints
|
||||
template {
|
||||
data =<<_EOT
|
||||
{{- with pkiCert "[[ .prometheus.vault_pki ]]/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Get the CA for the monitoring PKI
|
||||
template {
|
||||
data =<<_EOT
|
||||
{{ with secret "[[ .vault.root ]]pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
# Get a Nomad client certificate
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/nomad/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.nomad.[[ .consul.domain ]]" "ttl=24h" }}
|
||||
{{ .Data.Cert }}
|
||||
{{ .Data.Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/nomad_client_bundle.pem"
|
||||
perms = "0400"
|
||||
uid = 108685
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The CA chain for Nomad
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/nomad_ca.crt"
|
||||
}
|
||||
|
||||
# Same for Consul
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/consul/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.consul.[[ .consul.domain ]]" "ttl=24h" }}
|
||||
{{ .Data.Cert }}
|
||||
{{ .Data.Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/consul_client_bundle.pem"
|
||||
perms = "0400"
|
||||
uid = 108685
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/consul/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/consul_ca.crt"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $e ]]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,376 @@
|
|||
job "[[ .instance ]]" {
|
||||
|
||||
[[ template "common/job_start" . ]]
|
||||
|
||||
# Metrics is running prometheus and various exporters
|
||||
group "metrics" {
|
||||
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
||||
|
||||
shutdown_delay = "6s"
|
||||
count = [[ $c.count ]]
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
[[ template "common/volumes" $c ]]
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
|
||||
port = 9090
|
||||
|
||||
[[ template "common/service_meta" $c ]]
|
||||
[[ template "common/connect" $c ]]
|
||||
|
||||
check {
|
||||
name = "health"
|
||||
type = "http"
|
||||
expose = true
|
||||
path = "/-/healthy"
|
||||
interval = "15s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 10
|
||||
grace = "5m"
|
||||
}
|
||||
}
|
||||
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" $c ]]
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
# The main prometheus task
|
||||
task "prometheus" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
leader = true
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
command = "prometheus"
|
||||
args = [
|
||||
"--config.file=/local/prometheus.yml",
|
||||
"--log.level=debug",
|
||||
"--web.listen-address=127.0.0.1:9090",
|
||||
"--storage.tsdb.path=/data",
|
||||
"--storage.tsdb.retention.time=[[ $c.retention ]]",
|
||||
"--web.console.libraries=/opt/prometheus/console_libraries",
|
||||
"--web.console.templates=/opt/prometheus/consoles",
|
||||
"--web.external-url=[[ $c.public_url ]]",
|
||||
"--web.route-prefix=[[ if eq "" (urlParse $c.public_url).Path ]]/[[ else ]](urlParse $c.public_url).Path[[ end ]]"
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/artifacts" $c ]]
|
||||
|
||||
# Main configuration for prometheus
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
|
||||
_EOT
|
||||
destination = "local/prometheus.yml"
|
||||
uid = 100000
|
||||
gid = 109090
|
||||
perms = 640
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Alert rules
|
||||
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
|
||||
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
|
||||
_EOT
|
||||
destination = "local/rules/[[ . ]]"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
[[- if file.Exists "prometheus/rules" ]]
|
||||
[[- range (file.ReadDir "prometheus/rules") ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ file.Read (printf "prometheus/rules/%s" .) ]]
|
||||
_EOT
|
||||
destination = "local/rules/[[ . ]]"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
[[- range $k, $v := $c.alert_rules ]]
|
||||
|
||||
artifact {
|
||||
source = "[[ $v.url ]]"
|
||||
destination = "local/rules/[[ $k ]].yml"
|
||||
mode = "file"
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus"
|
||||
(printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
||||
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end -}}
|
||||
_EOT
|
||||
destination = "secrets/prometheus.bundle.pem"
|
||||
uid = 100000
|
||||
gid = 109090
|
||||
perms = "0440"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The monitoring CA chain, to validate AlertManager cert
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# Persistent data
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
|
||||
group "alerts" {
|
||||
|
||||
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
|
||||
|
||||
count = [[ $c.count ]]
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "web-tls" {}
|
||||
port "cluster" {}
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
[[ template "common/volumes" $c ]]
|
||||
|
||||
# This service is used for the different instances of alertmanager to communicate
|
||||
service {
|
||||
name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]"
|
||||
port = "cluster"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
# This service is used by prometheus. As it needs to be able to reach every instances, it cannot use
|
||||
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
|
||||
service {
|
||||
name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]"
|
||||
port = "web-tls"
|
||||
meta {
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
}
|
||||
}
|
||||
|
||||
# This service is exposed through the service mesh
|
||||
# and can be used to reach the web interface through Traefik
|
||||
service {
|
||||
name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
|
||||
port = 9093
|
||||
[[ template "common/service_meta" $c ]]
|
||||
[[ template "common/connect" $c ]]
|
||||
|
||||
check {
|
||||
name = "health"
|
||||
type = "http"
|
||||
expose = true
|
||||
path = "/-/healthy"
|
||||
interval = "20s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 12
|
||||
grace = "30s"
|
||||
}
|
||||
}
|
||||
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" $c ]]
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
# This task will handle mTLS to the AlertManager API
|
||||
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
|
||||
task "tls-proxy" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
user = 9093
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
force_pull = true
|
||||
readonly_rootfs = true
|
||||
pids_limit = 30
|
||||
volumes = [
|
||||
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
|
||||
]
|
||||
[[ template "common/tmpfs" "/tmp" ]]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
|
||||
_EOT
|
||||
destination = "local/alertmanager.conf"
|
||||
}
|
||||
|
||||
# Certifiate used by AlertManager
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
|
||||
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
||||
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
||||
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/alertmanager.bundle.pem"
|
||||
uid = 109093
|
||||
gid = 100000
|
||||
perms = "0440"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The trusted CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 18
|
||||
}
|
||||
}
|
||||
|
||||
# The main alertmanager task
|
||||
task "alertmanager" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
leader = true
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
command = "/local/alertmanager"
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/file_env" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[- if isKind "map" $c.custom_config ]]
|
||||
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
||||
[[- else if isKind "string" $c.custom_config ]]
|
||||
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
||||
[[- else ]]
|
||||
# Invalid custom config, using template only
|
||||
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
|
||||
[[- end ]]
|
||||
_EOT
|
||||
destination = "secrets/alertmanager.yml"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
|
||||
_EOT
|
||||
destination = "local/cluster_tls.yml"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
|
||||
_EOT
|
||||
destination = "local/web_tls.yml"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/alertmanager/start.sh" $c ]]
|
||||
_EOT
|
||||
destination = "local/alertmanager"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
perms = "0755"
|
||||
}
|
||||
|
||||
# Certifiate used by AlertManager
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
|
||||
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
||||
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
||||
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/alertmanager.bundle.pem"
|
||||
uid = 109093
|
||||
gid = 109090
|
||||
perms = "0440"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The trusted CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
global:
|
||||
smtp_from: '[[ .email.from ]]'
|
||||
smtp_smarthost: localhost:25
|
||||
smtp_require_tls: false
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
tls_server_config:
|
||||
cert_file: /secrets/alertmanager.bundle.pem
|
||||
key_file: /secrets/alertmanager.bundle.pem
|
||||
client_auth_type: RequireAndVerifyClientCert
|
||||
client_ca_file: /local/monitoring.ca.pem
|
||||
|
||||
tls_client_config:
|
||||
cert_file: /secrets/alertmanager.bundle.pem
|
||||
key_file: /secrets/alertmanager.bundle.pem
|
||||
ca_file: /local/monitoring.ca.pem
|
|
@ -0,0 +1,13 @@
|
|||
server {
|
||||
listen 127.0.0.1:9093;
|
||||
location / {
|
||||
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
|
||||
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring;
|
||||
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
|
||||
allow 127.0.0.1;
|
||||
deny all;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
exec alertmanager \
|
||||
--config.file=/secrets/alertmanager.yml \
|
||||
--storage.path=/data \
|
||||
--web.external-url=[[ .public_url ]] \
|
||||
--web.route-prefix=[[ if eq "" (urlParse .public_url).Path ]]/[[ else ]][[ (urlParse .public_url).Path ]][[ end ]] \
|
||||
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
|
||||
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
|
||||
--cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
|
||||
{{- range service "[[ .instance ]]-am-gossip[[ .consul.suffix ]]" -}}
|
||||
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
|
||||
--cluster.peer={{ .Address }}:{{ .Port }} \
|
||||
{{ end -}}
|
||||
{{- end -}}
|
||||
--cluster.tls-config=/local/cluster_tls.yml \
|
||||
--web.config.file=/local/web_tls.yml
|
|
@ -0,0 +1,5 @@
|
|||
tls_server_config:
|
||||
cert_file: /secrets/alertmanager.bundle.pem
|
||||
key_file: /secrets/alertmanager.bundle.pem
|
||||
client_auth_type: RequireAndVerifyClientCert
|
||||
client_ca_file: /local/monitoring.ca.pem
|
|
@ -0,0 +1,170 @@
|
|||
|
||||
# Cluster exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_cluster" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
set $consul_token "{{ with secret "consul/creds/[[ .instance ]]-cluster-exporter" }}{{ .Data.token }}{{ end }}";
|
||||
|
||||
{{- range service "nomad-client" }}
|
||||
location /nomad-client/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
|
||||
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name client.{{ env "NOMAD_REGION" }}.nomad;
|
||||
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "nomad" }}
|
||||
{{- if .Tags | contains "http" }}
|
||||
location /nomad/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/metrics?format=prometheus;
|
||||
proxy_ssl_certificate /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/nomad_client_bundle.pem;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_name server.{{ env "NOMAD_REGION" }}.nomad;
|
||||
proxy_ssl_trusted_certificate /local/nomad_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "consul" }}
|
||||
location /consul/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:8501/v1/agent/metrics?format=prometheus;
|
||||
proxy_set_header X-Consul-Token $consul_token;
|
||||
proxy_ssl_certificate /secrets/consul_client_bundle.pem;
|
||||
proxy_ssl_certificate_key /secrets/consul_client_bundle.pem;
|
||||
proxy_ssl_verify off;
|
||||
proxy_ssl_trusted_certificate /local/consul_ca.crt;
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- range service "vault" }}
|
||||
location /vault/{{ .Node }} {
|
||||
proxy_pass https://{{ .Address }}:{{ .Port }}/v1/sys/metrics?format=prometheus;
|
||||
proxy_ssl_verify on;
|
||||
proxy_ssl_trusted_certificate /etc/ssl/cert.pem;
|
||||
proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
|
||||
proxy_set_header X-Real-IP "$remote_addr";
|
||||
proxy_set_header X-Forwarded-Proto "$scheme";
|
||||
proxy_set_header X-Scheme "$scheme";
|
||||
proxy_set_header X-Forwarded-Host "$host";
|
||||
proxy_set_header X-Forwarded-Port "$server_port";
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
location / {
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
}
|
||||
}
|
||||
|
||||
# Ping exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_ping" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9427;
|
||||
}
|
||||
}
|
||||
|
||||
# Blackbox exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_blackbox" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:9115;
|
||||
}
|
||||
}
|
||||
|
||||
# Consul exporter
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_consul" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9107;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
exec consul_exporter \
|
||||
--web.listen-address=127.0.0.1:9107 \
|
||||
--consul.server=http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 \
|
||||
--consul.request-limit=20
|
|
@ -0,0 +1,4 @@
|
|||
targets:
|
||||
[[- range $idx, $probe := .probes ]]
|
||||
- [[ $probe ]]
|
||||
[[- end ]]
|
|
@ -0,0 +1,237 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
#query_log_file: /dev/stdout
|
||||
external_labels:
|
||||
cluster: [[ .consul.domain ]]
|
||||
env: [[ getenv "NOMAD_NAMESPACE" ]]
|
||||
|
||||
rule_files:
|
||||
- /local/rules/*.yml
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- scheme: https
|
||||
tls_config:
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
cert_file: /secrets/prometheus.bundle.pem
|
||||
key_file: /secrets/prometheus.bundle.pem
|
||||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
datacenter: [[ .consul.datacenter ]]
|
||||
relabel_configs:
|
||||
# Only keep alertmanagers
|
||||
- source_labels: [__meta_consul_service]
|
||||
action: keep
|
||||
regex: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
|
||||
|
||||
scrape_configs:
|
||||
|
||||
[[- range $k, $v := .jobs ]]
|
||||
|
||||
- job_name: [[ $k ]]
|
||||
static_configs:
|
||||
- targets:
|
||||
[[- range $target := $v.targets ]]
|
||||
- [[ $target ]]
|
||||
[[- end ]]
|
||||
[[- end ]]
|
||||
|
||||
[[- if gt (len .exporters.blackbox.http_probes) 0 ]]
|
||||
|
||||
# Blackbox Exporter HTTP targets
|
||||
- job_name: http_probe
|
||||
metrics_path: /probe
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
cert_file: /secrets/prometheus.bundle.pem
|
||||
key_file: /secrets/prometheus.bundle.pem
|
||||
params:
|
||||
module: ["http_2xx"]
|
||||
static_configs:
|
||||
- targets:
|
||||
[[- range $http_probe := .exporters.blackbox.http_probes ]]
|
||||
- [[ $http_probe ]]
|
||||
[[- end ]]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
|
||||
[[- end ]]
|
||||
|
||||
[[- if gt (len .exporters.blackbox.tcp_probes) 0 ]]
|
||||
|
||||
# Blackbox Exporter TCP targets
|
||||
- job_name: tcp_probe
|
||||
metrics_path: /probe
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
cert_file: /secrets/prometheus.bundle.pem
|
||||
key_file: /secrets/prometheus.bundle.pem
|
||||
params:
|
||||
module: ["tcp_connect"]
|
||||
static_configs:
|
||||
- targets:
|
||||
[[- range $target := .exporters.blackbox.tcp_probes ]]
|
||||
- [[ $target ]]
|
||||
[[- end ]]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
|
||||
[[- end ]]
|
||||
|
||||
# Cluster services
|
||||
- job_name: cluster-services
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
cert_file: /secrets/prometheus.bundle.pem
|
||||
key_file: /secrets/prometheus.bundle.pem
|
||||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
|
||||
datacenter: [[ .consul.datacenter ]]
|
||||
relabel_configs:
|
||||
|
||||
# Drop anything which is not Nomad, Consul or Vault
|
||||
# Other services will be monitored with another job
|
||||
- source_labels: [__meta_consul_service]
|
||||
action: keep
|
||||
regex: (nomad(\-client)?|consul|vault)
|
||||
|
||||
- source_labels: [__meta_consul_service,__meta_consul_node]
|
||||
regex: (.+);(.+)
|
||||
replacement: ${1}/${2}
|
||||
target_label: __metrics_path__
|
||||
|
||||
- source_labels: [__meta_consul_service]
|
||||
regex: (.+)
|
||||
replacement: {{ range $idx, $instance := service "[[ .instance ]]-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
|
||||
target_label: __address__
|
||||
|
||||
# Rewrite the job labels to the name of the service
|
||||
- source_labels: [__meta_consul_service]
|
||||
regex: (.+)
|
||||
replacement: ${1}
|
||||
target_label: job
|
||||
|
||||
# Rewrite the instance labels
|
||||
- source_labels: [__meta_consul_node]
|
||||
regex: (.+)
|
||||
replacement: ${1}
|
||||
target_label: instance
|
||||
|
||||
# regular services discovered from the Consul Catalog
|
||||
- job_name: consul-services
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
cert_file: /secrets/prometheus.bundle.pem
|
||||
key_file: /secrets/prometheus.bundle.pem
|
||||
|
||||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
|
||||
datacenter: [[ .consul.datacenter ]]
|
||||
|
||||
relabel_configs:
|
||||
|
||||
# Drop sidecar's service to prevent duplicate. Sidecar themselves are treated in another job
|
||||
- source_labels: [__meta_consul_service]
|
||||
action: drop
|
||||
regex: (.+)-sidecar-proxy
|
||||
|
||||
# Drop Nomad, Consul and vault, already handled
|
||||
- source_labels: [__meta_consul_service]
|
||||
action: drop
|
||||
regex: (nomad(\-client)?|consul|vault)
|
||||
|
||||
# Only keep services having a metrics-port set
|
||||
- source_labels: [__meta_consul_service_metadata_metrics_port]
|
||||
regex: \d+
|
||||
action: keep
|
||||
|
||||
# Get metrics path from metadata
|
||||
- source_labels: [__meta_consul_service_metadata_metrics_path]
|
||||
target_label: __metrics_path__
|
||||
regex: (.+)
|
||||
|
||||
# Rewrite the scheme if needed
|
||||
- source_labels: [__meta_consul_service_metadata_metrics_scheme]
|
||||
regex: (https?)
|
||||
replacement: ${1}
|
||||
target_label: __scheme__
|
||||
|
||||
# Rewrite the address to use the metrics port
|
||||
- source_labels: [__address__, __meta_consul_service_metadata_metrics_port]
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: ${1}:${2}
|
||||
target_label: __address__
|
||||
|
||||
# Rewrite the job labels to the name of the service
|
||||
- source_labels: [__meta_consul_service]
|
||||
regex: (.+)
|
||||
replacement: ${1}
|
||||
target_label: job
|
||||
|
||||
# Set the default alloc to 0 if not set
|
||||
- source_labels: [__meta_consul_service_metadata_alloc]
|
||||
regex: ^$
|
||||
replacement: 0
|
||||
target_label: __meta_consul_service_metadata_alloc
|
||||
|
||||
# Rewrite the instance label to be service-alloc
|
||||
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
|
||||
regex: (.+);([a-zA-Z\d\-\.]+)
|
||||
replacement: ${1}-${2}
|
||||
target_label: instance
|
||||
|
||||
# envoy sidecars from consul
|
||||
- job_name: consul-envoy-services
|
||||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
|
||||
datacenter: [[ .consul.datacenter ]]
|
||||
|
||||
relabel_configs:
|
||||
|
||||
# Only keep sidecar-service with a envoy-metrics-port defined
|
||||
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port]
|
||||
action: keep
|
||||
regex: (.+)-sidecar-proxy;\d+
|
||||
|
||||
# Rewrite the address to use the envoy-metrics-port
|
||||
- source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port]
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: ${1}:${2}
|
||||
target_label: __address__
|
||||
|
||||
# Rewrite the job label
|
||||
- source_labels: [__meta_consul_service]
|
||||
regex: (.+)
|
||||
replacement: ${1}
|
||||
target_label: job
|
||||
|
||||
# Set the default alloc to 0 if not set
|
||||
- source_labels: [__meta_consul_service_metadata_alloc]
|
||||
regex: ^$
|
||||
replacement: 0
|
||||
target_label: __meta_consul_service_metadata_alloc
|
||||
|
||||
# Rewrite the instance label to be service-alloc
|
||||
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
|
||||
regex: (.+);([a-zA-Z\d\-\.]+)
|
||||
replacement: ${1}-${2}
|
||||
target_label: instance
|
|
@ -0,0 +1,69 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Blackbox
|
||||
rules:
|
||||
|
||||
- alert: BlackboxProbeFailed
|
||||
expr: probe_success == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
||||
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSlowProbe
|
||||
expr: avg_over_time(probe_duration_seconds[1m]) > 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox slow probe (instance {{ $labels.instance }})
|
||||
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeHttpFailure
|
||||
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
|
||||
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateExpired
|
||||
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
|
||||
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowHttp
|
||||
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
|
||||
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,54 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: ConsulExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ConsulServiceHealthcheckFailed
|
||||
# Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
|
||||
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
|
||||
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulMissingMasterNode
|
||||
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul missing master node (node {{ $labels.node }})
|
||||
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulAgentUnhealthy
|
||||
expr: 'consul_health_node_status{status="critical"} == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul agent unhealthy (node {{ $labels.node }})
|
||||
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceWarning
|
||||
expr: 'consul_health_service_status{status="warning"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceCritical
|
||||
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: JVM
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JvmMemoryFillingUp
|
||||
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM memory filling up (instance {{ $labels.instance }})
|
||||
description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,51 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Nomad
|
||||
rules:
|
||||
|
||||
- alert: NomadJobFailed
|
||||
expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
|
||||
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadJobLost
|
||||
expr: 'nomad_nomad_job_summary_lost > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
|
||||
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadJobQueued
|
||||
expr: 'nomad_nomad_job_summary_queued > 0'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
|
||||
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadBlockedEvaluation
|
||||
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
|
||||
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadTaskOOM
|
||||
expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }})
|
||||
description: "Nomad task killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,25 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Ping
|
||||
rules:
|
||||
|
||||
- alert: HostDown
|
||||
expr: ping_loss_ratio == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host down (host {{ $labels.target }})
|
||||
description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PingLoss
|
||||
expr: |
|
||||
avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High packet loss (host {{ $labels.target }})
|
||||
description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,80 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Postgres
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PostgresqlDown
|
||||
expr: 'pg_up == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql down (instance {{ $labels.instance }})
|
||||
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresTooManyRestarts
|
||||
expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgres too many restarts (instance {{ $labels.instance }})
|
||||
description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyConnections
|
||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# - alert: PostgresqlHighRollbackRate
|
||||
# expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05'
|
||||
# for: 0m
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||
# description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
# Prometheus
|
||||
- name: Prometheus
|
||||
rules:
|
||||
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: up{job!~"sftp-PR\\d+"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }})
|
||||
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTooManyRestarts
|
||||
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }})
|
||||
description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusNotConnectedToAlertmanager
|
||||
expr: prometheus_notifications_alertmanagers_discovered < 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
||||
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusRuleEvaluationFailures
|
||||
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
||||
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusRuleEvaluationSlow
|
||||
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
||||
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusNotificationsBacklog
|
||||
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
|
||||
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerNotificationFailing
|
||||
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
||||
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetScrapingSlow
|
||||
expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
|
||||
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTsdbWalCorruptions
|
||||
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
|
||||
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: Traefik
|
||||
|
||||
rules:
|
||||
|
||||
- alert: TraefikHighHttp5xxErrorRateService
|
||||
expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
|
||||
description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,16 @@
|
|||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: HashicorpVault
|
||||
|
||||
rules:
|
||||
|
||||
- alert: VaultSealed
|
||||
expr: 'vault_core_unsealed == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Vault sealed (instance {{ $labels.instance }})
|
||||
description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
@ -0,0 +1,127 @@
|
|||
---
|
||||
|
||||
instance: monitoring
|
||||
|
||||
vault:
|
||||
pki:
|
||||
path: '[[ .prometheus.vault_pki ]]'
|
||||
ou: Monitoring
|
||||
|
||||
monitoring:
|
||||
|
||||
exporters:
|
||||
count: 1
|
||||
|
||||
ping:
|
||||
version: 1.1.0
|
||||
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 30
|
||||
probes: []
|
||||
|
||||
blackbox:
|
||||
version: 0.24.0
|
||||
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 50
|
||||
tcp_probes: []
|
||||
http_probes: []
|
||||
|
||||
consul:
|
||||
version: 0.11.0
|
||||
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 20
|
||||
memory: 64
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-consul-exporter'
|
||||
|
||||
cluster:
|
||||
image: nginxinc/nginx-unprivileged:alpine
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 18
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-cluster-exporter'
|
||||
- metrics
|
||||
|
||||
prometheus:
|
||||
|
||||
version: 2.50.1
|
||||
|
||||
count: 1
|
||||
|
||||
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
|
||||
|
||||
env: {}
|
||||
|
||||
resources:
|
||||
cpu: 200
|
||||
memory: 768
|
||||
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: '[[ .instance ]]-prometheus-data'
|
||||
per_alloc: true
|
||||
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-prometheus'
|
||||
|
||||
jobs: {}
|
||||
alert_rules: {}
|
||||
# alert_rules:
|
||||
# postgres:
|
||||
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
|
||||
|
||||
public_url: https://prometheus.example.org
|
||||
traefik:
|
||||
enabled: true
|
||||
router: prometheus
|
||||
|
||||
retention: 30d
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
||||
metrics_url: http://localhost:9090/metrics
|
||||
|
||||
alertmanager:
|
||||
count: 1
|
||||
version: 0.27.0
|
||||
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 50
|
||||
memory: 80
|
||||
public_url: https://alerte.example.org
|
||||
traefik:
|
||||
enabled: true
|
||||
router: alertmanager
|
||||
strip_prefix: false
|
||||
volumes:
|
||||
data:
|
||||
source: '[[ .instance ]]-alertmanager-data'
|
||||
type: csi
|
||||
per_alloc: true
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9093/metrics
|
||||
vault:
|
||||
policies:
|
||||
- metrics
|
||||
- '[[ .instance ]]-alertmanager'
|
||||
email:
|
||||
from: alertmanager@[[ .consul.domain ]]
|
||||
custom_config: ""
|
||||
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
|
@ -0,0 +1,3 @@
|
|||
path "[[ .prometheus.vault_pki ]]/issue/metrics" {
|
||||
capabilities = ["update"]
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
|
||||
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
path "[[ .vault.root ]]kv/service/[[ .instance ]]/alertmanager" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
[[- $c := merge .monitoring.exporters.cluster .monitoring.exporters .monitoring . ]]
|
||||
# Read vault metrics
|
||||
path "sys/metrics" {
|
||||
capabilities = ["read", "list"]
|
||||
}
|
||||
|
||||
# Get a cert for Nomad
|
||||
path "pki/nomad/issue/[[ .instance ]]-cluster-exporter" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
# Get a cert for Consul
|
||||
path "pki/consul/issue/[[ .instance ]]-cluster-exporter" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
# Get a consul token
|
||||
path "consul/creds/[[ .instance ]]-cluster-exporter" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
[[- $c := merge .monitoring.exporters.consul .monitoring.exporters .monitoring . ]]
|
||||
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
||||
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/prometheus" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-prometheus" {
|
||||
capabilities = ["read"]
|
||||
}
|
Loading…
Reference in New Issue