377 lines
9.9 KiB
HCL
377 lines
9.9 KiB
HCL
job "[[ .instance ]]" {
|
|
|
|
[[ template "common/job_start" . ]]
|
|
|
|
# The metrics group runs Prometheus and various exporters
|
|
group "metrics" {
|
|
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
|
|
|
shutdown_delay = "6s"
|
|
count = [[ $c.count ]]
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
service {
|
|
name = "[[ .instance ]]-prometheus[[ .consul.suffix ]]"
|
|
port = 9090
|
|
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
expose = true
|
|
path = "/-/healthy"
|
|
interval = "15s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 10
|
|
grace = "5m"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
[[ template "common/task.metrics_proxy" $c ]]
|
|
|
|
# The main prometheus task
|
|
task "prometheus" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
# Container settings and command line for the Prometheus server.
config {
  image           = "[[ $c.image ]]"
  readonly_rootfs = true
  pids_limit      = 200
  command         = "prometheus"

  # Prometheus only listens on loopback (127.0.0.1:9090).
  # The route prefix is derived from public_url: "/" when the URL has no path component,
  # otherwise the URL's path. Both branches must be template actions, or the expression text
  # itself ends up on the command line.
  args = [
    "--config.file=/local/prometheus.yml",
    "--log.level=debug",
    "--web.listen-address=127.0.0.1:9090",
    "--storage.tsdb.path=/data",
    "--storage.tsdb.retention.time=[[ $c.retention ]]",
    "--web.console.libraries=/opt/prometheus/console_libraries",
    "--web.console.templates=/opt/prometheus/consoles",
    "--web.external-url=[[ $c.public_url ]]",
    "--web.route-prefix=[[ if eq "" (urlParse $c.public_url).Path ]]/[[ else ]][[ (urlParse $c.public_url).Path ]][[ end ]]"
  ]
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/artifacts" $c ]]
|
|
|
|
# Main configuration for prometheus
|
|
# Renders the bundled prometheus.yml template into the task's local directory.
# Every "${" in the rendered output is rewritten to "$${" so that HCL treats it as a literal
# "${" instead of attempting interpolation inside the heredoc.
template {
  data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
  destination = "local/prometheus.yml"
  uid         = 100000
  gid         = 109090
  # Quoted octal string, matching the other template blocks of this job ("0440", "0755").
  perms         = "0640"
  # Prometheus is sent SIGHUP when the rendered file changes.
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
|
|
|
|
# Alert rules
|
|
[[- range (file.ReadDir "bundles/monitoring/templates/prometheus/rules") ]]
|
|
[[- if not (file.Exists (printf "prometheus/rules/%s" .)) ]]
|
|
template {
|
|
data = <<_EOT
|
|
[[ file.Read (printf "bundles/monitoring/templates/prometheus/rules/%s" .) ]]
|
|
_EOT
|
|
destination = "local/rules/[[ . ]]"
|
|
left_delimiter = "{{{"
|
|
right_delimiter = "}}}"
|
|
}
|
|
[[- end ]]
|
|
[[- end ]]
|
|
|
|
[[- if file.Exists "prometheus/rules" ]]
|
|
[[- range (file.ReadDir "prometheus/rules") ]]
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ file.Read (printf "prometheus/rules/%s" .) ]]
|
|
_EOT
|
|
destination = "local/rules/[[ . ]]"
|
|
left_delimiter = "{{{"
|
|
right_delimiter = "}}}"
|
|
}
|
|
[[- end ]]
|
|
[[- end ]]
|
|
|
|
[[- range $k, $v := $c.alert_rules ]]
|
|
|
|
artifact {
|
|
source = "[[ $v.url ]]"
|
|
destination = "local/rules/[[ $k ]].yml"
|
|
mode = "file"
|
|
}
|
|
[[- end ]]
|
|
|
|
# A client cert, to connect to the AlertManager API
# Requested through pkiCert on the [[ .instance ]]-prometheus role; the rendered bundle holds the
# certificate followed by its private key. The common name embeds the allocation index.
# The requested TTL is (NOMAD_ALLOC_INDEX * 24 + 72) hours, so each allocation asks for a
# different lifetime (presumably to stagger renewals between instances - TODO confirm).
# The task is sent SIGHUP whenever the bundle is re-rendered.
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-prometheus"
|
|
(printf "common_name=prometheus-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end -}}
|
|
_EOT
|
|
destination = "secrets/prometheus.bundle.pem"
|
|
uid = 100000
|
|
gid = 109090
|
|
perms = "0440"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The monitoring CA chain, to validate AlertManager cert
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
uid = 100000
|
|
gid = 100000
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# Persistent data
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
|
|
group "alerts" {
|
|
|
|
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
|
|
|
|
count = [[ $c.count ]]
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "web-tls" {}
|
|
port "cluster" {}
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
# This service is used for the different instances of alertmanager to communicate
|
|
service {
|
|
name = "[[ .instance ]]-alertmanager-gossip[[ .consul.suffix ]]"
|
|
port = "cluster"
|
|
meta {
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
}
|
|
}
|
|
|
|
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
|
|
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
|
|
service {
|
|
name = "[[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]"
|
|
port = "web-tls"
|
|
meta {
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
}
|
|
}
|
|
|
|
# This service is exposed through the service mesh
|
|
# and can be used to reach the web interface through Traefik
|
|
service {
|
|
name = "[[ .instance ]]-alertmanager[[ .consul.suffix ]]"
|
|
port = 9093
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
expose = true
|
|
path = "/-/healthy"
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 12
|
|
grace = "30s"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
[[ template "common/task.metrics_proxy" $c ]]
|
|
|
|
# This task will handle mTLS to the AlertManager API
|
|
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
|
|
task "tls-proxy" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
user = 9093
|
|
|
|
config {
|
|
image = "nginxinc/nginx-unprivileged:alpine"
|
|
force_pull = true
|
|
readonly_rootfs = true
|
|
pids_limit = 30
|
|
volumes = [
|
|
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
|
|
]
|
|
[[ template "common/tmpfs" "/tmp" ]]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
|
|
lifecycle {
|
|
hook = "poststart"
|
|
sidecar = true
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
|
|
_EOT
|
|
destination = "local/alertmanager.conf"
|
|
}
|
|
|
|
# Certificate used by AlertManager
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
|
|
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end }}
|
|
_EOT
|
|
destination = "secrets/alertmanager.bundle.pem"
|
|
uid = 109093
|
|
gid = 100000
|
|
perms = "0440"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The trusted CA
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
}
|
|
|
|
resources {
|
|
cpu = 10
|
|
memory = 18
|
|
}
|
|
}
|
|
|
|
# The main alertmanager task
|
|
task "alertmanager" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
readonly_rootfs = true
|
|
pids_limit = 200
|
|
command = "/local/alertmanager"
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/file_env" $c ]]
|
|
|
|
# Main AlertManager configuration, written to secrets/alertmanager.yml.
# Three cases, depending on the kind of custom_config:
#  - map: merged with the rendered bundled alertmanager.yml template, then serialised as YAML
#  - string: parsed as YAML first, then merged and serialised the same way
#  - anything else: the bundled template is used on its own
# NOTE(review): custom_config is the first argument to merge, which should make its values win
# over the template on conflicting keys - confirm against the gomplate version in use.
template {
|
|
data = <<_EOT
|
|
[[- if isKind "map" $c.custom_config ]]
|
|
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
|
[[- else if isKind "string" $c.custom_config ]]
|
|
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
|
[[- else ]]
|
|
# Invalid custom config, using template only
|
|
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
|
|
[[- end ]]
|
|
_EOT
|
|
destination = "secrets/alertmanager.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
|
|
_EOT
|
|
destination = "local/cluster_tls.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
|
|
_EOT
|
|
destination = "local/web_tls.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/start.sh" $c ]]
|
|
_EOT
|
|
destination = "local/alertmanager"
|
|
uid = 100000
|
|
gid = 100000
|
|
perms = "0755"
|
|
}
|
|
|
|
# Certificate used by AlertManager
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-alertmanager"
|
|
(printf "common_name=alertmanager-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end }}
|
|
_EOT
|
|
destination = "secrets/alertmanager.bundle.pem"
|
|
uid = 109093
|
|
gid = 109090
|
|
perms = "0440"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The trusted CA
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
}
|
|
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
}
|