monitoring/monitoring.nomad.hcl

377 lines
9.9 KiB
HCL

job "[[ .instance ]]" {
[[ template "common/job_start" . ]]
# Metrics is running prometheus and various exporters
group "metrics" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]
shutdown_delay = "6s"
count = [[ $c.count ]]
# Bridge-mode networking for the group, with one "metrics" port that has no
# static value assigned.
network {
  port "metrics" {}
  mode = "bridge"
}
[[ template "common/volumes" $c ]]
# Service registration for the Prometheus web endpoint (port 9090).
# Service metadata, the connect stanza and the Traefik tags all come from the
# shared "common/*" templates.
service {
  name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
  port = 9090

  [[ template "common/service_meta" $c ]]
  [[ template "common/connect" $c ]]

  tags = [
    [[ template "common/traefik_tags" $c ]]
  ]

  # HTTP probe on /-/healthy every 15s (8s timeout).
  # check_restart: limit of 10 failing checks, with a 5 minute grace period.
  check {
    name     = "health"
    type     = "http"
    path     = "/-/healthy"
    expose   = true
    interval = "15s"
    timeout  = "8s"

    check_restart {
      grace = "5m"
      limit = 10
    }
  }
}
[[ template "common/task.metrics_proxy" $c ]]
# The main prometheus task
task "prometheus" {
driver = "[[ $c.nomad.driver ]]"
leader = true
# Container settings for the Prometheus server.
# The root filesystem is read-only: the configuration is read from
# /local/prometheus.yml and the TSDB is stored under /data.
config {
  image           = "[[ $c.image ]]"
  readonly_rootfs = true
  pids_limit      = 200
  command         = "prometheus"

  # Prometheus only listens on 127.0.0.1:9090.
  # --web.route-prefix is the path component of public_url, or "/" when that
  # path is empty. The path must be evaluated by the template engine in the
  # else branch too, otherwise the expression text itself ends up in the flag.
  args = [
    "--config.file=/local/prometheus.yml",
    "--log.level=debug",
    "--web.listen-address=127.0.0.1:9090",
    "--storage.tsdb.path=/data",
    "--storage.tsdb.retention.time=[[ $c.retention ]]",
    "--web.console.libraries=/opt/prometheus/console_libraries",
    "--web.console.templates=/opt/prometheus/consoles",
    "--web.external-url=[[ $c.public_url ]]",
    "--web.route-prefix=[[ if eq "" (urlParse $c.public_url).Path ]]/[[ else ]][[ (urlParse $c.public_url).Path ]][[ end ]]"
  ]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/artifacts" $c ]]
# Main configuration for prometheus.
# The bundled prometheus.yml template is rendered at job-generation time; every
# "${" in its output is escaped as "$${" so HCL does not treat it as an
# interpolation inside the heredoc.
# When the rendered file changes, the task receives SIGHUP instead of being
# restarted.
template {
  data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
  destination   = "local/prometheus.yml"
  uid           = 100000
  gid           = 109090
  # Quoted octal string, consistent with the other templates of this job
  perms         = "0640"
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
# Alert rules
# One template stanza is generated per rule file, each written to local/rules/<file name>.
# First pass: every file shipped in bundles/monitoring/templates/prometheus/rules,
# skipped when a file with the same name exists under prometheus/rules
# (the local copy is rendered by the second pass instead).
# The rule files use {{{ }}} as template delimiters - presumably so that the {{ }}
# expressions of Prometheus alert templates are left untouched (to be confirmed).
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
template {
data = <<_EOT
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
_EOT
destination = "local/rules/[[ . ]]"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
# Second pass: every file found in prometheus/rules, only if that directory exists
[[- if file.Exists "prometheus/rules" ]]
[[- range (file.ReadDir "prometheus/rules") ]]
template {
data = <<_EOT
[[ file.Read (printf "prometheus/rules/%s" .) ]]
_EOT
destination = "local/rules/[[ . ]]"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
[[- range $name, $rule := $c.alert_rules ]]
# Rule file fetched from the url of this alert_rules entry, stored as local/rules/<key>.yml
artifact {
  mode        = "file"
  source      = "[[ $rule.url ]]"
  destination = "local/rules/[[ $name ]].yml"
}
[[- end ]]
# Client certificate and key used to connect to the AlertManager API.
# Issued through pkiCert with the <instance>-prometheus role; the common name
# carries the allocation index and the requested TTL is 72h plus 24h per
# allocation index. A change of the file triggers SIGHUP, not a restart.
template {
  destination   = "secrets/prometheus.bundle.pem"
  uid           = 100000
  gid           = 109090
  perms         = "0440"
  change_mode   = "signal"
  change_signal = "SIGHUP"
  data          = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus"
(printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
}
# CA chain of the monitoring PKI, read from Vault, used to validate the
# AlertManager certificate. Prometheus gets SIGHUP when the chain changes.
template {
  destination   = "local/monitoring.ca.pem"
  uid           = 100000
  gid           = 100000
  change_mode   = "signal"
  change_signal = "SIGHUP"
  data          = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
}
# Persistent storage: the "data" volume is mounted on /data, which is the
# TSDB path passed to prometheus.
volume_mount {
  destination = "/data"
  volume      = "data"
}
[[ template "common/resources" $c ]]
}
}
group "alerts" {
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
count = [[ $c.count ]]
# Bridge-mode networking with three ports and no static values:
# "web-tls" and "cluster" are referenced by the services of this group,
# "metrics" is declared alongside them.
network {
  mode = "bridge"

  port "web-tls" {}
  port "cluster" {}
  port "metrics" {}
}
[[ template "common/volumes" $c ]]
# Gossip service: lets the different alertmanager instances communicate with
# each other over the "cluster" port. The allocation index is published as metadata.
service {
  port = "cluster"
  name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]"

  meta {
    alloc = "${NOMAD_ALLOC_INDEX}"
  }
}
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
# the service mesh. The exposed port uses mTLS, so it is safe to expose it outside of the mesh.
service {
  port = "web-tls"
  name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]"

  meta {
    alloc = "${NOMAD_ALLOC_INDEX}"
  }
}
# Web interface service (port 9093), exposed through the service mesh so that
# it can be reached through Traefik.
service {
  name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
  port = 9093

  [[ template "common/service_meta" $c ]]
  [[ template "common/connect" $c ]]

  tags = [
    [[ template "common/traefik_tags" $c ]]
  ]

  # HTTP probe on /-/healthy every 20s (8s timeout).
  # check_restart: limit of 12 failing checks, with a 30 second grace period.
  check {
    name     = "health"
    type     = "http"
    path     = "/-/healthy"
    expose   = true
    interval = "20s"
    timeout  = "8s"

    check_restart {
      grace = "30s"
      limit = 12
    }
  }
}
[[ template "common/task.metrics_proxy" $c ]]
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "tls-proxy" {
driver = "[[ $c.nomad.driver ]]"
user = 9093
# Unprivileged nginx container with a read-only root filesystem.
# The image is pulled again on every start (force_pull), and the rendered
# local/alertmanager.conf is mounted read-only as nginx's default.conf.
config {
  image           = "nginxinc/nginx-unprivileged:alpine"
  force_pull      = true
  pids_limit      = 30
  readonly_rootfs = true

  volumes = [
    "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
  ]

  [[ template "common/tmpfs" "/tmp" ]]
}
[[ template "common/vault.policies" $c ]]
# Runs as a sidecar attached to the "poststart" lifecycle hook
lifecycle {
  sidecar = true
  hook    = "poststart"
}
# nginx configuration, rendered from the bundled template. This is the file
# mounted as /etc/nginx/conf.d/default.conf by the volumes setting of the task.
template {
  destination = "local/alertmanager.conf"
  data        = <<_EOT
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
_EOT
}
# Certificate used by AlertManager (copy made available to the TLS proxy task).
# Issued through pkiCert with the <instance>-alertmanager role; the host IP of
# the "cluster" port is added as an IP SAN, and the requested TTL is 72h plus
# 24h per allocation index. A change of the file triggers SIGHUP, not a restart.
template {
  destination   = "secrets/alertmanager.bundle.pem"
  uid           = 109093
  gid           = 100000
  perms         = "0440"
  change_mode   = "signal"
  change_signal = "SIGHUP"
  data          = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
}
# The trusted CA: chain of the monitoring PKI, read from Vault
template {
  destination = "local/monitoring.ca.pem"
  data        = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
}
# Fixed, small allocation for the proxy (Nomad units: cpu in MHz, memory in MB)
resources {
  memory = 18
  cpu    = 10
}
}
# The main alertmanager task
task "alertmanager" {
driver = "[[ $c.nomad.driver ]]"
leader = true
# Container settings for AlertManager, with a read-only root filesystem.
# The command is the start script rendered to local/alertmanager by one of
# the templates of this task.
config {
  command         = "/local/alertmanager"
  image           = "[[ $c.image ]]"
  pids_limit      = 200
  readonly_rootfs = true
}
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
# AlertManager configuration, built at job-generation time from custom_config
# and the bundled alertmanager.yml template:
#  - custom_config is a map: it is merged with the parsed template output
#  - custom_config is a string: it is parsed as YAML first, then merged the same way
#  - anything else: the bundled template is used alone
# custom_config is the first argument of merge, so its values should take
# precedence over the template's - confirm against the gomplate merge documentation.
template {
  destination = "secrets/alertmanager.yml"
  data        = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Invalid custom config, using template only
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
[[- end ]]
_EOT
}
# TLS settings for the cluster side, rendered from the bundled template
template {
  destination = "local/cluster_tls.yml"
  data        = <<_EOT
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
_EOT
}
# TLS settings for the web side, rendered from the bundled template
template {
  destination = "local/web_tls.yml"
  data        = <<_EOT
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
_EOT
}
# Start script, rendered as an executable file (mode 0755).
# It is the command run by the container (/local/alertmanager).
template {
  destination = "local/alertmanager"
  uid         = 100000
  gid         = 100000
  perms       = "0755"
  data        = <<_EOT
[[ template "monitoring/alertmanager/start.sh" $c ]]
_EOT
}
# Certificate used by AlertManager.
# Issued through pkiCert with the <instance>-alertmanager role; the host IP of
# the "cluster" port is added as an IP SAN, and the requested TTL is 72h plus
# 24h per allocation index. A change of the file triggers SIGHUP, not a restart.
# NOTE(review): gid 109090 is the group used for the Prometheus files of the
# metrics group, while the same certificate in the tls-proxy task uses gid
# 100000 - confirm this value is intended here.
template {
  destination   = "secrets/alertmanager.bundle.pem"
  uid           = 109093
  gid           = 109090
  perms         = "0440"
  change_mode   = "signal"
  change_signal = "SIGHUP"
  data          = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
}
# The trusted CA: chain of the monitoring PKI, read from Vault
template {
  destination = "local/monitoring.ca.pem"
  data        = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
}
# The "data" volume is mounted on /data inside the container
volume_mount {
  destination = "/data"
  volume      = "data"
}
[[ template "common/resources" $c ]]
}
}
}