Some refactoring, add Grafana

This commit is contained in:
Daniel Berteaud 2024-03-22 16:10:10 +01:00
parent 7ed40afe9c
commit a4d66759e0
59 changed files with 860 additions and 191 deletions

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "alertmanager[[ .consul.suffix ]]"
Protocol = "http"
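A minimal sketch of how a rendered entry like this can be applied and verified with the Consul CLI, assuming it is saved as alertmanager-defaults.hcl (hypothetical filename):

# Register the rendered service-defaults entry (filename is an assumption)
consul config write alertmanager-defaults.hcl
# Read it back to confirm the protocol is set to http
consul config read -kind service-defaults -name alertmanager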

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "grafana[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "loki[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -1,3 +0,0 @@
Kind = "service-defaults"
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -1,3 +0,0 @@
Kind = "service-defaults"
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -1,3 +0,0 @@
Kind = "service-defaults"
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "prometheus[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
Name = "alertmanager[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.alertmanager .).traefik.instance ]]"

View File

@ -0,0 +1,15 @@
Kind = "service-intentions"
Name = "grafana[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.grafana .monitoring .).traefik.instance ]]"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "[[ if eq (urlParse .monitoring.grafana.public_url).Path "" ]]/[[ else ]][[ (urlParse .monitoring.grafana.public_url).Path ]][[ end ]]"
}
}
]
}
]
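As a quick sanity check, the resulting intention can be read back once applied; a sketch assuming the usual Consul ACL token is available in the environment:

# Inspect the stored intention and its HTTP permissions
consul config read -kind service-intentions -name grafana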

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
Name = "loki[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.loki .monitoring .).traefik.instance ]]"
@ -13,7 +13,7 @@ Sources = [
]
},
{
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
Name = "grafana[[ .consul.suffix ]]"
Permissions = [
{
Action = "allow"
@ -33,7 +33,7 @@ Sources = [
},
[[- range $idx, $service := coll.Slice "vector-aggregator" "vector-agent" ]]
{
Name = "[[ $.instance ]]-[[ $service ]][[ $.consul.suffix ]]"
Name = "[[ $service ]][[ $.consul.suffix ]]"
Permissions = [
{
Action = "allow"

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
Name = "prometheus[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ (merge .monitoring.prometheus .).traefik.instance ]]"
@ -13,7 +13,7 @@ Sources = [
]
},
{
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
Name = "grafana[[ .consul.suffix ]]"
Permissions = [
{
# Deny access to the admin API from Grafana

View File

@ -1,3 +1,3 @@
Kind = "service-defaults"
Name = "monitoring-loki"
Name = "alertmanager"
Protocol = "http"

View File

@ -1,3 +1,3 @@
Kind = "service-defaults"
Name = "monitoring-prometheus"
Name = "grafana"
Protocol = "http"

View File

@ -1,3 +1,3 @@
Kind = "service-defaults"
Name = "monitoring-alertmanager"
Name = "loki"
Protocol = "http"

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "prometheus"
Protocol = "http"

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "monitoring-alertmanager"
Name = "alertmanager"
Sources = [
{
Name = "traefik"

View File

@ -0,0 +1,15 @@
Kind = "service-intentions"
Name = "grafana"
Sources = [
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/"
}
}
]
}
]

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "monitoring-loki"
Name = "loki"
Sources = [
{
Name = "traefik"
@ -13,7 +13,7 @@ Sources = [
]
},
{
Name = "monitoring-grafana"
Name = "grafana"
Permissions = [
{
Action = "allow"
@ -32,7 +32,7 @@ Sources = [
]
},
{
Name = "monitoring-vector-aggregator"
Name = "vector-aggregator"
Permissions = [
{
Action = "allow"
@ -51,7 +51,7 @@ Sources = [
]
},
{
Name = "monitoring-vector-agent"
Name = "vector-agent"
Permissions = [
{
Action = "allow"

View File

@ -1,5 +1,5 @@
Kind = "service-intentions"
Name = "monitoring-prometheus"
Name = "prometheus"
Sources = [
{
Name = "traefik"
@ -13,7 +13,7 @@ Sources = [
]
},
{
Name = "monitoring-grafana"
Name = "grafana"
Permissions = [
{
# Deny access to the admin API from Grafana

View File

@ -18,7 +18,7 @@ job "monitoring-exporters" {
}
service {
name = "monitoring-ping-exporter"
name = "ping-exporter"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -27,7 +27,7 @@ job "monitoring-exporters" {
}
service {
name = "monitoring-blackbox-exporter"
name = "blackbox-exporter"
port = "blackbox"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -35,7 +35,7 @@ job "monitoring-exporters" {
}
service {
name = "monitoring-consul-exporter"
name = "consul-exporter"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -44,7 +44,7 @@ job "monitoring-exporters" {
}
service {
name = "monitoring-cluster-exporter"
name = "cluster-exporter"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -77,7 +77,7 @@ _EOT
vault {
policies = ["monitoring-consul-exporter"]
policies = ["consul-exporter"]
env = false
disable_file = true
change_mode = "noop"
@ -102,7 +102,7 @@ _EOT
template {
data = <<_EOT
CONSUL_HTTP_TOKEN={{ with secret "consul/creds/monitoring-consul-exporter" }}{{ .Data.token }}{{ end }}
CONSUL_HTTP_TOKEN={{ with secret "consul/creds/consul-exporter" }}{{ .Data.token }}{{ end }}
_EOT
destination = "secrets/.consul.env"
uid = 100000
@ -151,7 +151,7 @@ _EOT
vault {
policies = ["monitoring-cluster-exporter", "metrics"]
policies = ["cluster-exporter", "metrics"]
env = false
disable_file = true
change_mode = "noop"
@ -187,7 +187,7 @@ server {
return 405;
}
set $consul_token "{{ with secret "consul/creds/monitoring-cluster-exporter" }}{{ .Data.token }}{{ end }}";
set $consul_token "{{ with secret "consul/creds/cluster-exporter" }}{{ .Data.token }}{{ end }}";
{{- range service "nomad-client" }}
location /nomad-client/{{ .Node }} {
@ -365,7 +365,7 @@ _EOT
# Get a Nomad client certificate
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.nomad.consul" "ttl=24h" }}
{{- with pkiCert "pki/nomad/issue/cluster-exporter" "common_name=metrics-proxy.nomad.consul" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
@ -389,7 +389,7 @@ _EOT
# Same for Consul
template {
data = <<_EOT
{{- with pkiCert "pki/consul/issue/monitoring-cluster-exporter" "common_name=metrics-proxy.consul.consul" "ttl=24h" }}
{{- with pkiCert "pki/consul/issue/cluster-exporter" "common_name=metrics-proxy.consul.consul" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}

View File

@ -0,0 +1,59 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG GRAFANA_VERSION=10.4.1 \
GRAFANA_PLUGINS=grafana-clock-panel,grafana-piechart-panel
ADD https://dl.grafana.com/oss/release/grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz /tmp
ADD https://dl.grafana.com/oss/release/grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz.sha256 /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
curl \
ca-certificates \
bash \
gcompat \
libc6-compat \
&&\
ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 &&\
cd /tmp &&\
echo "$(cat grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz.sha256) grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz" | sha256sum -c &&\
tar xzf grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz &&\
mv grafana-v${GRAFANA_VERSION} /opt/grafana &&\
mkdir /opt/grafana/plugins &&\
IFS=',' &&\
for PLUGIN in ${GRAFANA_PLUGINS}; do /opt/grafana/bin/grafana cli --pluginsDir /opt/grafana/plugins plugins install ${PLUGIN}; done
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
ENV PATH=/opt/grafana/bin/:${PATH} \
GF_PATHS_DATA=/data \
GF_PATHS_PLUGINS=/opt/grafana/plugins \
GF_LOG_MODE=console
COPY --from=builder /opt/grafana /opt/grafana
RUN set -eux &&\
apk --no-cache add \
gcompat \
libc6-compat \
&&\
ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 &&\
addgroup -g 3000 grafana &&\
adduser --system \
--ingroup grafana \
--disabled-password \
--uid 3000 \
--home /opt/grafana \
--no-create-home \
--shell /sbin/nologin \
grafana &&\
mkdir /data &&\
chown -R grafana:grafana /data /opt/grafana/plugins &&\
chmod 700 /data
WORKDIR /opt/grafana
USER grafana
CMD ["grafana", \
"server", \
"--homepath=/opt/grafana", \
"--packaging=docker"]

View File

@ -1,6 +1,6 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG LOKI_VERSION=2.9.5
ARG LOKI_VERSION=2.9.6
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp

View File

@ -1,6 +1,6 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG PROM_VERSION=2.50.1
ARG PROM_VERSION=2.51.0
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/sha256sums.txt /tmp

View File

@ -1,17 +1,17 @@
#!/bin/sh
# vim: syntax=sh
vault write consul/roles/monitoring-prometheus \
vault write consul/roles/prometheus \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
consul_policies="monitoring"
vault write consul/roles/monitoring-consul-exporter \
vault write consul/roles/consul-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
consul_policies="monitoring"
vault write consul/roles/monitoring-cluster-exporter \
vault write consul/roles/cluster-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="monitoring-prometheus"
consul_policies="monitoring"
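To confirm the renamed roles work, a Consul token can be requested from Vault by hand; this is the same path the job templates consume via {{ with secret ... }}:

# Request a token from the renamed role (returns token + lease info)
vault read consul/creds/prometheus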

View File

@ -0,0 +1,12 @@
#!/bin/sh
set -euo pipefail
vault write database/roles/grafana \
db_name="postgres" \
creation_statements="CREATE ROLE \"{{name}}\" WITH LOGIN PASSWORD '{{password}}' VALID UNTIL '{{expiration}}'; \
GRANT \"grafana\" TO \"{{name}}\"; \
ALTER ROLE \"{{name}}\" SET role = \"grafana\"" \
default_ttl="12h" \
max_ttl="720h"
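Once the role exists, dynamic PostgreSQL credentials can be pulled manually to verify the grants; a minimal sketch:

# Fetch short-lived credentials mapped to the grafana database role
vault read database/creds/grafana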

View File

@ -93,8 +93,8 @@ rm -rf ${TMP}
# Create a role for alertmanager
vault write pki/monitoring/roles/monitoring-alertmanager \
allowed_domains="monitoring" \
vault write pki/monitoring/roles/alertmanager \
allowed_domains="monitoring.consul" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -106,8 +106,8 @@ vault write pki/monitoring/roles/monitoring-alertmanager \
ou="Monitoring"
# Create a role for prometheus (which will only be a client, for AlertManager)
vault write pki/monitoring/roles/monitoring-prometheus \
allowed_domains="monitoring" \
vault write pki/monitoring/roles/prometheus \
allowed_domains="monitoring.consul" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -119,8 +119,8 @@ vault write pki/monitoring/roles/monitoring-prometheus \
ou="Monitoring"
# Create a role for loki (which will only be a client, for AlertManager)
vault write pki/monitoring/roles/monitoring-loki \
allowed_domains="monitoring" \
vault write pki/monitoring/roles/loki \
allowed_domains="monitoring.consul" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -133,7 +133,7 @@ vault write pki/monitoring/roles/monitoring-loki \
# Create a role for metrics exporters (server only)
vault write pki/monitoring/roles/metrics \
allowed_domains="monitoring" \
allowed_domains="monitoring.consul" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -147,7 +147,7 @@ vault write pki/monitoring/roles/metrics \
ou="Monitoring"
# Create a role on the Nomad PKI for the cluster exporter
vault write pki/nomad/roles/monitoring-cluster-exporter \
vault write pki/nomad/roles/cluster-exporter \
allowed_domains='nomad.consul' \
allow_subdomains=true \
allow_wildcard_certificates=false \
@ -158,7 +158,7 @@ vault write pki/nomad/roles/monitoring-cluster-exporter \
ou="Cluster metrics exporter"
# Create a role on the Consul PKI for the cluster exporter
vault write pki/consul/roles/monitoring-cluster-exporter \
vault write pki/consul/roles/cluster-exporter \
allowed_domains="consul.consul" \
allow_bare_domains=false \
allow_subdomains=true \
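A quick way to exercise one of the renamed PKI roles, assuming the common_name pattern used by the job templates (alertmanager-<alloc>.monitoring.consul):

# Issue a short-lived test certificate from the alertmanager role
vault write pki/monitoring/issue/alertmanager \
common_name=alertmanager-0.monitoring.consul \
ttl=72h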

View File

@ -0,0 +1,22 @@
#!/bin/sh
set -euo pipefail
# vim: syntax=sh
export LC_ALL=C
VAULT_KV_PATH=kv/service/monitoring/grafana
RAND_CMD="tr -dc A-Za-z0-9\-_\/=~\.+ < /dev/urandom | head -c 50"
if ! vault kv list $(dirname ${VAULT_KV_PATH}) 2>/dev/null | grep -q -E "^$(basename ${VAULT_KV_PATH})\$"; then
vault kv put ${VAULT_KV_PATH} \
secret_key="$(sh -c "${RAND_CMD}")"
fi
for SECRET in secret_key; do
if ! vault kv get -field ${SECRET} ${VAULT_KV_PATH} >/dev/null 2>&1; then
vault kv patch ${VAULT_KV_PATH} \
${SECRET}=$(sh -c "${RAND_CMD}")
fi
done
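The generated secret can be read back the same way the script checks for it; a sketch:

# Confirm the secret_key field was written
vault kv get -field secret_key kv/service/monitoring/grafana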

View File

@ -18,7 +18,7 @@ job "monitoring-services" {
volume "data" {
source = "monitoring-prometheus-data"
source = "prometheus-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
@ -27,7 +27,7 @@ job "monitoring-services" {
service {
name = "monitoring-prometheus"
name = "prometheus"
port = 9090
meta {
@ -173,7 +173,7 @@ _EOT
leader = true
config {
image = "danielberteaud/prometheus:2.50.1-1"
image = "danielberteaud/prometheus:2.51.0-1"
readonly_rootfs = true
pids_limit = 200
command = "prometheus"
@ -192,7 +192,7 @@ _EOT
vault {
policies = ["monitoring-prometheus"]
policies = ["prometheus"]
env = false
disable_file = true
change_mode = "noop"
@ -225,13 +225,13 @@ alerting:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
# Only keep alertmanagers
- source_labels: [__meta_consul_service]
action: keep
regex: monitoring-alertmanager-tls
regex: alertmanager-tls
scrape_configs:
@ -245,7 +245,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
@ -262,7 +262,7 @@ scrape_configs:
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: {{ range $idx, $instance := service "monitoring-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
replacement: {{ range $idx, $instance := service "cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
target_label: __address__
# Rewrite the job labels to the name of the service
@ -288,7 +288,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
@ -348,7 +348,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
@ -884,8 +884,8 @@ _EOT
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/monitoring-prometheus"
(printf "common_name=prometheus-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "pki/monitoring/issue/prometheus"
(printf "common_name=prometheus-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
@ -940,7 +940,7 @@ _EOT
volume "data" {
source = "monitoring-alertmanager-data"
source = "alertmanager-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
@ -950,7 +950,7 @@ _EOT
# This service is used for the different instances of alertmanager to communicate
service {
name = "monitoring-alertmanager-gossip"
name = "alertmanager-gossip"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -960,7 +960,7 @@ _EOT
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
service {
name = "monitoring-alertmanager-tls"
name = "alertmanager-tls"
port = "web-tls"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -970,7 +970,7 @@ _EOT
# This service is exposed through the service mesh
# and can be used to reach the web interface through Traefik
service {
name = "monitoring-alertmanager"
name = "alertmanager"
port = 9093
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
@ -1111,7 +1111,7 @@ _EOT
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "tls-proxy" {
task "untls-proxy" {
driver = "docker"
user = 9093
@ -1135,7 +1135,7 @@ _EOT
vault {
policies = ["metrics", "monitoring-alertmanager"]
policies = ["metrics", "alertmanager"]
env = false
disable_file = true
change_mode = "noop"
@ -1156,7 +1156,7 @@ server {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
allow 127.0.0.1;
deny all;
@ -1170,8 +1170,8 @@ _EOT
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/monitoring-alertmanager"
(printf "common_name=alertmanager-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "pki/monitoring/issue/alertmanager"
(printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
@ -1214,7 +1214,7 @@ _EOT
vault {
policies = ["metrics", "monitoring-alertmanager"]
policies = ["metrics", "alertmanager"]
env = false
disable_file = true
change_mode = "noop"
@ -1288,7 +1288,7 @@ exec alertmanager \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
--cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "monitoring-am-gossip" -}}
{{- range service "alertmanager-gossip" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
--cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}
@ -1306,8 +1306,8 @@ _EOT
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/monitoring-alertmanager"
(printf "common_name=alertmanager-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "pki/monitoring/issue/alertmanager"
(printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
@ -1356,7 +1356,7 @@ _EOT
volume "data" {
source = "monitoring-loki-data"
source = "loki-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
@ -1364,7 +1364,7 @@ _EOT
service {
name = "monitoring-loki"
name = "loki"
port = 3100
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
@ -1507,14 +1507,14 @@ _EOT
driver = "docker"
config {
image = "danielberteaud/loki:2.9.5-1"
image = "danielberteaud/loki:2.9.6-1"
command = "loki"
args = ["--config.file=/local/loki.yml"]
}
vault {
policies = ["monitoring-loki"]
policies = ["loki"]
env = false
disable_file = true
change_mode = "noop"
@ -1570,7 +1570,7 @@ ruler:
tls_cert_path: /secrets/loki.bundle.pem
tls_key_path: /secrets/loki.bundle.pem
tls_server_name: alertmanager.monitoring
alertmanager_url: monitoring-alertmanager-tls
alertmanager_url: alertmanager-tls
enable_alertmanager_discovery: true
enable_alertmanager_v2: true
enable_api: true
@ -1609,8 +1609,8 @@ _EOT
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/monitoring-loki"
(printf "common_name=loki-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "pki/monitoring/issue/loki"
(printf "common_name=loki-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
@ -1666,7 +1666,7 @@ _EOT
# The main service is the vector source
# It will provide access to other services through the mesh (like loki)
service {
name = "monitoring-vector-aggregator"
name = "vector-aggregator"
port = 9000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
@ -1678,7 +1678,7 @@ _EOT
sidecar_service {
proxy {
upstreams {
destination_name = "monitoring-loki"
destination_name = "loki"
local_bind_port = 3100
# Work around, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
@ -1909,4 +1909,300 @@ _EOT
}
}
group "interface" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
volume "data" {
source = "grafana-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "grafana"
port = 3000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "postgres"
local_bind_port = 5432
# Work around, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "health"
type = "http"
path = "/api/health"
expose = true
interval = "30s"
timeout = "8s"
}
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-grafana.entrypoints=https",
"traefik.http.routers.monitoring-grafana.rule=Host(`grafana.example.org`)",
"traefik.http.middlewares.csp-monitoring-grafana.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-grafana.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-grafana",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
_EOT
destination = "secrets/metrics.bundle.pem"
}
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://localhost:3000/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# Local memcached instance
task "memcached" {
driver = "docker"
user = 11211
lifecycle {
hook = "prestart"
sidecar = true
}
config {
image = "memcached:alpine"
readonly_rootfs = true
force_pull = true
entrypoint = ["/local/memcached"]
}
template {
data = <<_EOT
#!/bin/sh
set -eu
exec memcached -l 127.0.0.1 -p 11211 -m {{ env "NOMAD_MEMORY_LIMIT" | parseInt | subtract 5 }}
_EOT
destination = "local/memcached"
perms = 755
}
resources {
cpu = 10
memory = 20
}
}
task "grafana" {
driver = "docker"
leader = true
config {
image = "danielberteaud/grafana:10.4.1-1"
readonly_rootfs = true
pids_limit = 100
command = "grafana"
args = [
"server",
"--homepath=/opt/grafana",
"--config=/secrets/grafana.ini",
"--packaging=docker"
]
}
vault {
policies = ["grafana"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Basic grafana configuration file
template {
data = <<_EOT
[server]
http_addr = 127.0.0.1
http_port = 3000
root_url = https://grafana.example.org
serve_from_sub_path = false
[database]
type = postgres
name = grafana
host = 127.0.0.1:5432
user = {{ with secret "database/creds/grafana" }}{{ .Data.username }}{{ end }}
password = {{ with secret "database/creds/grafana" }}{{ .Data.password }}{{ end }}
[remote_cache]
type = memcached
connstr = 127.0.0.1:11211
[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false
[security]
cookie_secure = true
cookie_samesite = strict
x_xss_protection = true
secret_key = {{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.secret_key }}{{ end }}
[dataproxy]
timeout = 120
_EOT
destination = "secrets/grafana.ini"
uid = 103000
perms = 400
}
# Mount volume in /data for persistence
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 100
memory = 256
}
}
}
}
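Deploying the updated job follows the usual Nomad workflow; the job file name is an assumption, and the renamed grafana-data CSI volume must exist beforehand:

# Check the CSI volume referenced by the new interface group
nomad volume status grafana-data
# Review then apply the job
nomad job plan monitoring-services.nomad.hcl
nomad job run monitoring-services.nomad.hcl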

View File

@ -1,5 +1,5 @@
path "pki/monitoring/issue/monitoring-alertmanager" {
path "pki/monitoring/issue/alertmanager" {
capabilities = ["update"]
}

View File

@ -1,20 +1,19 @@
# Read vault metrics
path "sys/metrics" {
capabilities = ["read", "list"]
}
# Get a cert for Nomad
path "pki/nomad/issue/monitoring-cluster-exporter" {
path "pki/nomad/issue/cluster-exporter" {
capabilities = ["update"]
}
# Get a cert for Consul
path "pki/consul/issue/monitoring-cluster-exporter" {
path "pki/consul/issue/cluster-exporter" {
capabilities = ["update"]
}
# Get a consul token
path "consul/creds/monitoring-cluster-exporter" {
path "consul/creds/cluster-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,3 @@
path "consul/creds/consul-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,7 @@
path "database/creds/grafana" {
capabilities = ["read"]
}
path "kv/data/service/monitoring/grafana" {
capabilities = ["read"]
}
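Loading and inspecting the policy is straightforward; the local file name is an assumption:

# Install the policy referenced by the grafana task's vault block
vault policy write grafana grafana.hcl
vault policy read grafana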

View File

@ -1,5 +1,5 @@
path "pki/monitoring/issue/monitoring-loki" {
path "pki/monitoring/issue/loki" {
capabilities = ["update"]
}

View File

@ -1,4 +0,0 @@
path "consul/creds/monitoring-consul-exporter" {
capabilities = ["read"]
}

View File

@ -1,5 +1,5 @@
path "pki/monitoring/issue/monitoring-prometheus" {
path "pki/monitoring/issue/prometheus" {
capabilities = ["update"]
}
@ -7,6 +7,6 @@ path "kv/service/monitoring/prometheus" {
capabilities = ["read"]
}
path "consul/creds/monitoring-prometheus" {
path "consul/creds/prometheus" {
capabilities = ["read"]
}

View File

@ -17,7 +17,7 @@ job "[[ .instance ]]-exporters" {
}
service {
name = "[[ .instance ]]-ping-exporter[[ .consul.suffix ]]"
name = "ping-exporter[[ .consul.suffix ]]"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -26,7 +26,7 @@ job "[[ .instance ]]-exporters" {
}
service {
name = "[[ .instance ]]-blackbox-exporter[[ .consul.suffix ]]"
name = "blackbox-exporter[[ .consul.suffix ]]"
port = "blackbox"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -34,7 +34,7 @@ job "[[ .instance ]]-exporters" {
}
service {
name = "[[ .instance ]]-consul-exporter[[ .consul.suffix ]]"
name = "consul-exporter[[ .consul.suffix ]]"
port = "ping"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -43,7 +43,7 @@ job "[[ .instance ]]-exporters" {
}
service {
name = "[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]"
name = "cluster-exporter[[ .consul.suffix ]]"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -129,7 +129,7 @@ _EOT
template {
data = <<_EOT
CONSUL_HTTP_TOKEN={{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" }}{{ .Data.token }}{{ end }}
CONSUL_HTTP_TOKEN={{ with secret "consul/creds/consul-exporter[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}
_EOT
destination = "secrets/.consul.env"
uid = 100000
@ -204,7 +204,7 @@ _EOT
# Get a Nomad client certificate
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.nomad.[[ .consul.domain ]]" "ttl=24h" }}
{{- with pkiCert "pki/nomad/issue/cluster-exporter[[ .consul.suffix ]]" "common_name=metrics-proxy.nomad.[[ .consul.domain ]]" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}
@ -228,7 +228,7 @@ _EOT
# Same for Consul
template {
data = <<_EOT
{{- with pkiCert "pki/consul/issue/[[ .instance ]]-cluster-exporter" "common_name=metrics-proxy.consul.[[ .consul.domain ]]" "ttl=24h" }}
{{- with pkiCert "pki/consul/issue/cluster-exporter[[ .consul.suffix ]]" "common_name=metrics-proxy.consul.[[ .consul.domain ]]" "ttl=24h" }}
{{ .Data.Cert }}
{{ .Data.Key }}
{{- end }}

images/grafana/Dockerfile (Normal file, 59 lines)
View File

@ -0,0 +1,59 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG GRAFANA_VERSION=[[ .monitoring.grafana.version ]] \
GRAFANA_PLUGINS=[[ join .monitoring.grafana.plugins "," ]]
ADD https://dl.grafana.com/oss/release/grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz /tmp
ADD https://dl.grafana.com/oss/release/grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz.sha256 /tmp
RUN set -eux &&\
apk --no-cache add \
tar \
curl \
ca-certificates \
bash \
gcompat \
libc6-compat \
&&\
ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 &&\
cd /tmp &&\
echo "$(cat grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz.sha256) grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz" | sha256sum -c &&\
tar xzf grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz &&\
mv grafana-v${GRAFANA_VERSION} /opt/grafana &&\
mkdir /opt/grafana/plugins &&\
IFS=',' &&\
for PLUGIN in ${GRAFANA_PLUGINS}; do /opt/grafana/bin/grafana cli --pluginsDir /opt/grafana/plugins plugins install ${PLUGIN}; done
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
ENV PATH=/opt/grafana/bin/:${PATH} \
GF_PATHS_DATA=/data \
GF_PATHS_PLUGINS=/opt/grafana/plugins \
GF_LOG_MODE=console
COPY --from=builder /opt/grafana /opt/grafana
RUN set -eux &&\
apk --no-cache add \
gcompat \
libc6-compat \
&&\
ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 &&\
addgroup -g 3000 grafana &&\
adduser --system \
--ingroup grafana \
--disabled-password \
--uid 3000 \
--home /opt/grafana \
--no-create-home \
--shell /sbin/nologin \
grafana &&\
mkdir /data &&\
chown -R grafana:grafana /data /opt/grafana/plugins &&\
chmod 700 /data
WORKDIR /opt/grafana
USER grafana
CMD ["grafana", \
"server", \
"--homepath=/opt/grafana", \
"--packaging=docker"]

View File

@ -1,17 +1,17 @@
#!/bin/sh
# vim: syntax=sh
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-prometheus \
vault write consul/roles/prometheus[[ .consul.suffix ]] \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"
consul_policies="[[ .instance ]]"
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-consul-exporter \
vault write consul/roles/consul-exporter[[ .consul.suffix ]] \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"
consul_policies="[[ .instance ]]"
vault write [[ .vault.root ]]consul/roles/[[ .instance ]]-cluster-exporter \
vault write consul/roles/cluster-exporter \
ttl=720h \
max_ttl=720h \
consul_policies="[[ .instance ]]-prometheus"
consul_policies="[[ .instance ]]"

init/grafana-vault-database (Executable file, 5 lines)
View File

@ -0,0 +1,5 @@
#!/bin/sh
set -euo pipefail
[[ template "common/vault.mkpgrole.sh" merge .monitoring.grafana . ]]

View File

@ -6,8 +6,8 @@ set -euo pipefail
[[ template "common/vault.mkpki.sh" $c ]]
# Create a role for alertmanager
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-alertmanager \
allowed_domains="[[ .instance ]]" \
vault write [[ $c.vault.pki.path ]]/roles/alertmanager[[ .consul.suffix ]] \
allowed_domains="[[ .instance ]].[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -19,8 +19,8 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-alertmanager \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for prometheus (which will only be a client, for AlertManager)
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
allowed_domains="[[ .instance ]]" \
vault write [[ $c.vault.pki.path ]]/roles/prometheus[[ .consul.suffix ]] \
allowed_domains="[[ .instance ]].[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -32,8 +32,8 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for loki (which will only be a client, for AlertManager)
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-loki \
allowed_domains="[[ .instance ]]" \
vault write [[ $c.vault.pki.path ]]/roles/loki[[ .consul.suffix ]] \
allowed_domains="[[ .instance ]].[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -45,8 +45,8 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-loki \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for metrics exporters (server only)
vault write [[ $c.vault.pki.path ]]/roles/metrics \
allowed_domains="[[ .instance ]]" \
vault write [[ $c.vault.pki.path ]]/roles/metrics[[ .consul.suffix ]] \
allowed_domains="[[ .instance ]].[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
@ -60,7 +60,7 @@ vault write [[ $c.vault.pki.path ]]/roles/metrics \
ou="[[ $c.vault.pki.ou ]]"
# Create a role on the Nomad PKI for the cluster exporter
vault write pki/nomad/roles/[[ .instance ]]-cluster-exporter \
vault write pki/nomad/roles/cluster-exporter[[ .consul.suffix ]] \
allowed_domains='nomad.[[ .consul.domain ]]' \
allow_subdomains=true \
allow_wildcard_certificates=false \
@ -71,7 +71,7 @@ vault write pki/nomad/roles/[[ .instance ]]-cluster-exporter \
ou="Cluster metrics exporter"
# Create a role on the Consul PKI for the cluster exporter
vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \
vault write pki/consul/roles/cluster-exporter[[ .consul.suffix ]] \
allowed_domains="consul.[[ .consul.domain ]]" \
allow_bare_domains=false \
allow_subdomains=true \

View File

@ -0,0 +1,5 @@
#!/bin/sh
set -euo pipefail
[[ template "common/vault.rand_secrets" merge .monitoring . ]]

View File

@ -17,7 +17,7 @@ job "[[ .instance ]]-services" {
[[ template "common/volumes" $c ]]
service {
name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
name = "prometheus[[ .consul.suffix ]]"
port = 9090
[[ template "common/service_meta" $c ]]
@ -122,8 +122,8 @@ _EOT
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus"
(printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/prometheus"
(printf "common_name=prometheus-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
@ -177,7 +177,7 @@ _EOT
# This service is used for the different instances of alertmanager to communicate
service {
name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]"
name = "alertmanager-gossip[[ .consul.suffix ]]"
port = "cluster"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -187,7 +187,7 @@ _EOT
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
service {
name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]"
name = "alertmanager-tls[[ .consul.suffix ]]"
port = "web-tls"
meta {
alloc = "${NOMAD_ALLOC_INDEX}"
@ -197,7 +197,7 @@ _EOT
# This service is exposed through the service mesh
# and can be used to reach the web interface through Traefik
service {
name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
name = "alertmanager[[ .consul.suffix ]]"
port = 9093
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
@ -224,7 +224,7 @@ _EOT
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "tls-proxy" {
task "untls-proxy" {
driver = "[[ $c.nomad.driver ]]"
user = 9093
@ -256,8 +256,8 @@ _EOT
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
@ -342,8 +342,8 @@ _EOT
# Certificate used by AlertManager
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
@ -389,7 +389,7 @@ _EOT
[[ template "common/volumes" $c ]]
service {
name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
name = "loki[[ .consul.suffix ]]"
port = 3100
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
@ -443,8 +443,8 @@ _EOT
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki"
(printf "common_name=loki-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/loki"
(printf "common_name=loki-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
@ -496,7 +496,7 @@ _EOT
# The main service is the vector source
# It will provide access to other services through the mesh (like loki)
service {
name = "[[ .instance ]]-vector-aggregator[[ .consul.suffix ]]"
name = "vector-aggregator[[ .consul.suffix ]]"
port = 9000
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
@ -509,7 +509,7 @@ _EOT
# The syslog UDP service can be used to ingest standard syslog logs from other
# devices, and can be exposed by Traefik for this
service {
name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]"
name = "syslog-udp[[ .consul.suffix ]]"
port = "syslog-udp"
tags = [
[[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
@ -522,7 +522,7 @@ _EOT
[[- if $c.fluentd.enabled ]]
# The fluentd service can be used to ingest fluentd logs
service {
name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]"
name = "syslog-udp[[ .consul.suffix ]]"
port = 24224
tags = [
[[ template "common/traefik_tags" merge $c.fluentd $c ]]
@ -555,6 +555,84 @@ _EOT
change_signal = "SIGHUP"
}
[[ template "common/resources" $c ]]
}
}
group "interface" {
[[- $c := merge .monitoring.grafana .monitoring . ]]
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
[[ template "common/volumes" $c ]]
service {
name = "grafana[[ .consul.suffix ]]"
port = 3000
[[ template "common/metrics_meta" $c ]]
[[ template "common/connect" $c ]]
check {
name = "health"
type = "http"
path = "/api/health"
expose = true
interval = "30s"
timeout = "8s"
}
tags = [
[[ template "common/traefik_tags" $c ]]
]
}
[[ template "common/task.metrics_proxy" $c ]]
[[ template "common/task.pgpooler" $c ]]
[[ template "common/task.memcached" ]]
task "grafana" {
driver = "[[ $c.nomad.driver ]]"
leader = true
config {
image = "[[ $c.image ]]"
readonly_rootfs = true
pids_limit = 100
command = "grafana"
args = [
"server",
"--homepath=/opt/grafana",
"--config=/secrets/grafana.ini",
"--packaging=docker"
]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
# Basic grafana configuration file
template {
data = <<_EOT
[[ template "monitoring/grafana/grafana.ini" $c ]]
_EOT
destination = "secrets/grafana.ini"
uid = 103000
perms = 400
}
# Mount volume in /data for persistence
volume_mount {
volume = "data"
destination = "/data"
}
[[ template "common/resources" $c ]]
}
}

View File

@ -5,7 +5,7 @@ server {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.[[ .instance ]].[[ .consul.domain ]];
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
allow 127.0.0.1;
deny all;

View File

@ -10,7 +10,7 @@ exec alertmanager \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
--cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \
{{- range service "[[ .instance ]]-am-gossip[[ .consul.suffix ]]" -}}
{{- range service "alertmanager-gossip[[ .consul.suffix ]]" -}}
{{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }}
--cluster.peer={{ .Address }}:{{ .Port }} \
{{ end -}}

View File

@ -24,7 +24,7 @@ server {
return 405;
}
set $consul_token "{{ with secret "consul/creds/[[ .instance ]]-cluster-exporter" }}{{ .Data.token }}{{ end }}";
set $consul_token "{{ with secret "consul/creds/cluster-exporter[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}";
{{- range service "nomad-client" }}
location /nomad-client/{{ .Node }} {

View File

@ -0,0 +1,37 @@
[server]
http_addr = 127.0.0.1
http_port = 3000
root_url = [[ .monitoring.grafana.public_url ]]
serve_from_sub_path = [[ if eq (urlParse .monitoring.grafana.public_url).Path "" ]]false[[ else ]]true[[ end ]]
[database]
type = postgres
name = [[ .postgres.database ]]
[[- if ne .postgres.pooler.engine "none" ]]
host = 127.0.0.1:[[ .postgres.pooler.port ]]
user = [[ .instance ]]
password = {{ env "NOMAD_ALLOC_ID" }}
ssl_mode = disable
[[- else ]]
host = [[ .postgres.host ]]:[[ .postgres.port ]]
user = [[ .postgres.user ]]
password = [[ .postgres.password ]]
[[ end ]]
[remote_cache]
type = memcached
connstr = 127.0.0.1:11211
[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false
[security]
cookie_secure = true
cookie_samesite = strict
x_xss_protection = true
secret_key = {{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.secret_key }}{{ end }}
[dataproxy]
timeout = 120
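Once rendered and deployed behind Traefik, the same /api/health path used by the Consul check can be probed through the public URL (the example hostname from this config):

curl -s https://grafana.example.org/api/health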

View File

@ -53,7 +53,7 @@ limits_config:
max_query_parallelism: 128
ruler:
alertmanager_url: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
alertmanager_url: alertmanager-tls[[ .consul.suffix ]]
enable_alertmanager_discovery: true
alertmanager_client:
tls_cert_path: /secrets/loki.bundle.pem

View File

@ -19,13 +19,13 @@ alerting:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Only keep alertmanagers
- source_labels: [__meta_consul_service]
action: keep
regex: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
regex: alertmanager-tls[[ .consul.suffix ]]
scrape_configs:
@ -40,7 +40,7 @@ scrape_configs:
[[- end ]]
[[- if gt (len .exporters.blackbox.http_probes) 0 ]]
{{- if gt (len (service "blackbox-exporter[[ .consul.suffix ]]")) 0 }}
# Blackbox Exporter HTTP targets
- job_name: http_probe
metrics_path: /probe
@ -52,6 +52,7 @@ scrape_configs:
params:
module: ["http_2xx"]
static_configs:
{{ range $idx, $instance := service "blackbox-exporter[[ .consul.suffix ]]" }}
- targets:
[[- range $http_probe := .exporters.blackbox.http_probes ]]
- [[ $http_probe ]]
@ -62,11 +63,13 @@ scrape_configs:
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
replacement: {{ .Address }}:{{ .Port }}
{{ end }}
{{- end }}
[[- end ]]
[[- if gt (len .exporters.blackbox.tcp_probes) 0 ]]
{{ if gt (len (service "blackbox-exporter[[ .consul.suffix ]]")) 0 }}
# Blackbox Exporter TCP targets
- job_name: tcp_probe
metrics_path: /probe
@ -78,6 +81,7 @@ scrape_configs:
params:
module: ["tcp_connect"]
static_configs:
{{ range $idx, $instance := service "blackbox-exporter[[ .consul.suffix ]]" }}
[[- range $target := .exporters.blackbox.tcp_probes ]]
- [[ $target ]]
[[- end ]]
@ -87,7 +91,9 @@ scrape_configs:
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ range $idx, $instance := service "[[ .instance ]]-blackbox-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
replacement: {{ .Address }}:{{ .Port }}
{{ end }}
{{- end }}
[[- end ]]
# Cluster services
@ -100,7 +106,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
@ -117,7 +123,7 @@ scrape_configs:
- source_labels: [__meta_consul_service]
regex: (.+)
replacement: {{ range $idx, $instance := service "[[ .instance ]]-cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
replacement: {{ range $idx, $instance := service "cluster-exporter[[ .consul.suffix ]]" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}
target_label: __address__
# Rewrite the job labels to the name of the service
@ -143,7 +149,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
@ -203,7 +209,7 @@ scrape_configs:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
token: {{ with secret "consul/creds/prometheus[[ .consul.suffix ]]" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:

View File

@ -6,6 +6,10 @@ vault:
pki:
path: '[[ .prometheus.vault_pki ]]'
ou: Monitoring
rand_secrets:
- path: grafana
fields:
- secret_key
monitoring:
@ -40,7 +44,7 @@ monitoring:
memory: 32
vault:
policies:
- '[[ .instance ]]-consul-exporter[[ .consul.suffix ]]'
- 'consul-exporter[[ .consul.suffix ]]'
cluster:
image: nginxinc/nginx-unprivileged:alpine
@ -50,12 +54,12 @@ monitoring:
memory: 15
vault:
policies:
- '[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]'
- 'cluster-exporter[[ .consul.suffix ]]'
- metrics
prometheus:
version: 2.50.1
version: 2.51.0
count: 1
@ -70,12 +74,12 @@ monitoring:
volumes:
data:
type: csi
source: '[[ .instance ]]-prometheus-data[[ .consul.suffix ]]'
source: 'prometheus-data'
per_alloc: true
vault:
policies:
- '[[ .instance ]]-prometheus[[ .consul.suffix ]]'
- 'prometheus[[ .consul.suffix ]]'
jobs: {}
alert_rules: {}
@ -110,7 +114,7 @@ monitoring:
strip_prefix: false
volumes:
data:
source: '[[ .instance ]]-alertmanager-data[[ .consul.suffix ]]'
source: 'alertmanager-data'
type: csi
per_alloc: true
prometheus:
@ -118,13 +122,13 @@ monitoring:
vault:
policies:
- metrics
- '[[ .instance ]]-alertmanager[[ .consul.suffix ]]'
- 'alertmanager[[ .consul.suffix ]]'
email:
from: alertmanager@[[ .consul.domain ]]
custom_config: {}
loki:
version: 2.9.5
version: 2.9.6
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
env: {}
resources:
@ -132,7 +136,7 @@ monitoring:
memory: 512
vault:
policies:
- '[[ .instance ]]-loki[[ .consul.suffix ]]'
- 'loki[[ .consul.suffix ]]'
public_url: https://loki.example.org
traefik:
router: loki
@ -143,7 +147,7 @@ monitoring:
volumes:
data:
type: csi
source: '[[ .instance ]]-loki-data[[ .consul.suffix ]]'
source: 'loki-data'
vector:
version: 0.36.1
@ -159,7 +163,7 @@ monitoring:
consul:
connect:
upstreams:
- destination_name: '[[ .instance ]]-loki[[ .consul.suffix ]]'
- destination_name: 'loki[[ .consul.suffix ]]'
local_bind_port: 3100
fluentd:
enabled: false
@ -181,6 +185,46 @@ monitoring:
prometheus:
metrics_url: http://127.0.0.1:9001/metrics
grafana:
version: 10.4.1
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
env: {}
resources:
cpu: 100
memory: 256
public_url: https://grafana.example.org
plugins:
#- alexanderzobnin-zabbix-app
#- ddurieux-glpi-app
- grafana-clock-panel
- grafana-piechart-panel
traefik:
enabled: true
router: grafana
strip_prefix: false
consul:
connect:
upstreams:
- destination_name: postgres[[ .consul.suffix ]]
local_bind_port: 5432
volumes:
data:
type: csi
source: 'grafana-data'
vault:
policies:
- 'grafana[[ .consul.suffix ]]'
database:
role: grafana
pgrole: grafana
postgres:
database: grafana
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
password: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.password }}{{ end }}'
pooler:
mode: session
prometheus:
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
prometheus:
enabled: true

View File

@ -1,5 +1,5 @@
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager" {
path "[[ $c.vault.pki.path ]]/issue/alertmanager" {
capabilities = ["update"]
}

View File

@ -0,0 +1,19 @@
# Read vault metrics
path "sys/metrics" {
capabilities = ["read", "list"]
}
# Get a cert for Nomad
path "pki/nomad/issue/cluster-exporter" {
capabilities = ["update"]
}
# Get a cert for Consul
path "pki/consul/issue/cluster-exporter" {
capabilities = ["update"]
}
# Get a consul token
path "consul/creds/cluster-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,3 @@
path "consul/creds/consul-exporter" {
capabilities = ["read"]
}

View File

@ -0,0 +1,7 @@
path "[[ .vault.root ]]database/creds/[[ .monitoring.grafana.vault.database.role ]]" {
capabilities = ["read"]
}
path "[[ .vault.root ]]kv/data/service/[[ .instance ]]/grafana" {
capabilities = ["read"]
}

View File

@ -1,5 +1,5 @@
[[- $c := merge .monitoring.loki .monitoring . ]]
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki" {
path "[[ $c.vault.pki.path ]]/issue/loki" {
capabilities = ["update"]
}

View File

@ -1,20 +0,0 @@
[[- $c := merge .monitoring.exporters.cluster .monitoring.exporters .monitoring . ]]
# Read vault metrics
path "sys/metrics" {
capabilities = ["read", "list"]
}
# Get a cert for Nomad
path "pki/nomad/issue/[[ .instance ]]-cluster-exporter" {
capabilities = ["update"]
}
# Get a cert for Consul
path "pki/consul/issue/[[ .instance ]]-cluster-exporter" {
capabilities = ["update"]
}
# Get a consul token
path "consul/creds/[[ .instance ]]-cluster-exporter" {
capabilities = ["read"]
}

View File

@ -1,4 +0,0 @@
[[- $c := merge .monitoring.exporters.consul .monitoring.exporters .monitoring . ]]
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-consul-exporter" {
capabilities = ["read"]
}

View File

@ -1,5 +1,5 @@
[[- $c := merge .monitoring.prometheus .monitoring . ]]
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus" {
path "[[ $c.vault.pki.path ]]/issue/prometheus" {
capabilities = ["update"]
}
@ -7,6 +7,6 @@ path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/prometheus" {
capabilities = ["read"]
}
path "[[ $c.vault.root ]]consul/creds/[[ .instance ]]-prometheus" {
path "[[ $c.vault.root ]]consul/creds/prometheus" {
capabilities = ["read"]
}