monitoring/monitoring-services.nomad.hcl

673 lines
17 KiB
HCL

job "[[ .instance ]]-services" {
[[ template "common/job_start" . ]]
# The metrics-server group runs Prometheus
group "metrics-server" {
  [[- $c := merge .monitoring.prometheus .monitoring . ]]
  # Prometheus server group.
  # $c is the merged configuration passed to every template rendered in this group.
  shutdown_delay = "6s"
  count          = [[ $c.count ]]

  network {
    mode = "bridge"
    port "metrics" {}
  }

  [[ template "common/volumes" $c ]]

  # Prometheus web/API endpoint, health-checked over HTTP on /-/healthy
  service {
    name = "prometheus[[ .consul.suffix ]]"
    port = 9090
    [[ template "common/service_meta" $c ]]
    [[ template "common/connect" $c ]]

    check {
      name     = "health"
      type     = "http"
      expose   = true
      path     = "/-/healthy"
      interval = "20s"
      timeout  = "8s"

      check_restart {
        limit = 10
        grace = "5m"
      }
    }

    tags = [
      [[ template "common/traefik_tags" $c ]]
    ]
  }

  [[ template "common/task.metrics_proxy" $c ]]

  # The main prometheus task
  task "prometheus" {
    driver = "[[ $c.nomad.driver ]]"
    leader = true

    config {
      image           = "[[ $c.image ]]"
      readonly_rootfs = true
      pids_limit      = 200
      command         = "prometheus"
      args = [
        "--config.file=/local/prometheus.yml",
        "--log.level=info",
        "--web.listen-address=127.0.0.1:9090",
        "--storage.tsdb.path=/data",
        "--storage.tsdb.retention.time=[[ $c.retention ]]",
        "--web.console.libraries=/opt/prometheus/console_libraries",
        "--web.console.templates=/opt/prometheus/consoles",
        "--web.external-url=[[ $c.public_url ]]",
        "--web.route-prefix=/"
      ]
    }

    [[ template "common/vault.policies" $c ]]
    [[ template "common/artifacts" $c ]]

    # Main configuration for prometheus.
    # Every "${" in the rendered file is doubled to "$${" so HCL does not treat it as an interpolation.
    template {
      data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
      destination   = "local/prometheus.yml"
      uid           = 100000
      gid           = 109090
      # Quoted octal string, like the other templates of this job
      perms         = "0640"
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # Alert rules shipped by bundles. A bundle rule file is skipped when a file with the
    # same name exists in the local prometheus/rules directory (rendered further down).
    # Custom delimiters are set so the double-curly-brace expressions used by Prometheus
    # rules are not evaluated by Nomad's template engine.
    [[- range $bundle := file.ReadDir "bundles" ]]
    [[- if file.Exists (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
    [[- range $tpl := file.ReadDir (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
    [[- if not (file.Exists (printf "prometheus/rules/%s" $tpl)) ]]
    template {
      data = <<_EOT
[[ tmpl.Inline (file.Read (printf "bundles/%s/templates/prometheus/rules/%s" $bundle $tpl)) $c ]]
_EOT
      destination     = "local/rules/[[ $tpl ]]"
      left_delimiter  = "{{{"
      right_delimiter = "}}}"
    }
    [[- end ]]
    [[- end ]]
    [[- end ]]
    [[- end ]]

    # Alert rules from the local prometheus/rules directory
    [[- if file.Exists "prometheus/rules" ]]
    [[- range $tpl := file.ReadDir "prometheus/rules" ]]
    template {
      data = <<_EOT
[[ tmpl.Inline (file.Read (printf "prometheus/rules/%s" $tpl)) $c ]]
_EOT
      # $tpl is the current file name (same value as the range dot), named explicitly
      # to match the bundle rules template
      destination     = "local/rules/[[ $tpl ]]"
      left_delimiter  = "{{{"
      right_delimiter = "}}}"
    }
    [[- end ]]
    [[- end ]]

    [[- /* Support prometheus rules as artifacts or as raw content */]]
    [[- range $k, $v := $c.alert_rules ]]
    [[- if has $v "url" ]]
    artifact {
      source      = "[[ $v.url ]]"
      destination = "local/rules/[[ $k ]].yml"
      mode        = "file"
    }
    [[- else if has $v "content" ]]
    template {
      data = <<_EOT
[[ $v.content ]]
_EOT
      destination     = "local/rules/[[ $k ]].yml"
      # Same custom delimiters as the file-based rules above, so double-curly-brace
      # expressions in the raw rule content are written out untouched
      left_delimiter  = "{{{"
      right_delimiter = "}}}"
    }
    [[- end ]]
    [[- end ]]

    # A client cert, to connect to the AlertManager API.
    # The requested TTL is 72h plus 24h per allocation index.
    template {
      data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/prometheus"
(printf "common_name=prometheus-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
      destination   = "secrets/prometheus.bundle.pem"
      uid           = 100000
      gid           = 109090
      perms         = "0440"
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # The monitoring CA chain, to validate AlertManager cert
    template {
      data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
      destination   = "local/monitoring.ca.pem"
      uid           = 100000
      gid           = 100000
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # Persistent data
    volume_mount {
      volume      = "data"
      destination = "/data"
    }

    [[ template "common/resources" $c ]]
  }
}
group "alerts" {
  [[- $c := merge .monitoring.alertmanager .monitoring . ]]
  # AlertManager group.
  # $c is the merged configuration passed to every template rendered in this group.
  count          = [[ $c.count ]]
  shutdown_delay = "6s"

  network {
    mode = "bridge"

    # Web API port, protected with mTLS
    port "web-tls" {}

    # Gossip port between the alertmanager instances
    port "cluster" {}

    # Metrics port, scraped by prometheus
    port "metrics" {}
  }

  [[ template "common/volumes" $c ]]

  # Service used by the alertmanager instances to talk to each other
  service {
    name = "alertmanager-gossip[[ .consul.suffix ]]"
    port = "cluster"

    meta {
      alloc = "${NOMAD_ALLOC_INDEX}"
    }
  }

  # Service used by prometheus. Prometheus must reach every instance, so it cannot go
  # through the service mesh. The port is protected by mTLS, which makes it safe to
  # expose outside of the mesh.
  service {
    name = "alertmanager-tls[[ .consul.suffix ]]"
    port = "web-tls"

    meta {
      alloc = "${NOMAD_ALLOC_INDEX}"
    }
  }

  # Service exposed through the service mesh, used to reach the web interface through Traefik
  service {
    name = "alertmanager[[ .consul.suffix ]]"
    port = 9093
    [[ template "common/service_meta" $c ]]
    [[ template "common/connect" $c ]]

    check {
      name     = "health"
      type     = "http"
      path     = "/-/healthy"
      expose   = true
      interval = "20s"
      timeout  = "8s"

      check_restart {
        limit = 12
        grace = "30s"
      }
    }

    tags = [
      [[ template "common/traefik_tags" $c ]]
    ]
  }

  # This task handles mTLS to the AlertManager API and exposes it as plain http
  # on 127.0.0.1, for Traefik (through the service mesh) and for the metrics proxy
  task "untls-proxy" {
    driver = "[[ $c.nomad.driver ]]"
    user   = 9093

    config {
      image           = "nginxinc/nginx-unprivileged:alpine"
      force_pull      = true
      readonly_rootfs = true
      pids_limit      = 30
      volumes = [
        "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
      ]
      [[ template "common/tmpfs" "/tmp" ]]
    }

    [[ template "common/vault.policies" $c ]]

    # Runs as a sidecar, started after the main task
    lifecycle {
      hook    = "poststart"
      sidecar = true
    }

    # nginx configuration, mounted as the default vhost (see volumes above)
    template {
      data = <<_EOT
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
_EOT
      destination = "local/alertmanager.conf"
    }

    [[ template "common/metrics_cert" $c ]]

    # Certificate used by AlertManager.
    # The requested TTL is 72h plus 24h per allocation index.
    template {
      data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
      destination   = "secrets/alertmanager.bundle.pem"
      uid           = 109093
      gid           = 100000
      perms         = "0440"
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    resources {
      cpu    = 10
      memory = 18
    }
  }

  # The main alertmanager task
  task "alertmanager" {
    driver = "[[ $c.nomad.driver ]]"
    leader = true

    config {
      image           = "[[ $c.image ]]"
      readonly_rootfs = true
      pids_limit      = 200
      # Start script rendered by one of the templates below
      command         = "/local/alertmanager"
    }

    [[ template "common/vault.policies" $c ]]
    [[ template "common/file_env" $c ]]

    # AlertManager configuration: custom_config (a map, or a YAML string) is merged with
    # the bundled template. Any other kind falls back to the bundled template alone.
    template {
      data = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Invalid custom config, using template only
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
[[- end ]]
_EOT
      destination = "secrets/alertmanager.yml"
    }

    # TLS settings for the cluster (gossip) listener
    template {
      data = <<_EOT
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
_EOT
      destination = "local/cluster_tls.yml"
    }

    # TLS settings for the web listener
    template {
      data = <<_EOT
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
_EOT
      destination = "local/web_tls.yml"
    }

    # Start script, used as the task command
    template {
      data = <<_EOT
[[ template "monitoring/alertmanager/start.sh" $c ]]
_EOT
      destination = "local/alertmanager"
      uid         = 100000
      gid         = 100000
      perms       = "0755"
    }

    # Certificate used by AlertManager.
    # NOTE(review): gid is 109090 here but 100000 for the same bundle in the untls-proxy
    # task; confirm this difference is intended.
    template {
      data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
      destination   = "secrets/alertmanager.bundle.pem"
      uid           = 109093
      gid           = 109090
      perms         = "0440"
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # The trusted CA
    template {
      data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
      destination = "local/monitoring.ca.pem"
    }

    # Persistent data
    volume_mount {
      volume      = "data"
      destination = "/data"
    }

    [[ template "common/resources" $c ]]
  }
}
group "logs-server" {
  [[- $c := merge .monitoring.loki .monitoring . ]]
  # Loki group.
  # $c is the merged configuration passed to every template rendered in this group.
  shutdown_delay = "6s"

  network {
    mode = "bridge"
    port "metrics" {}
  }

  [[ template "common/volumes" $c ]]

  # Loki HTTP endpoint, with a readiness check on /ready
  service {
    name = "loki[[ .consul.suffix ]]"
    port = 3100
    [[ template "common/service_meta" $c ]]
    [[ template "common/connect" $c ]]

    check {
      name     = "ready"
      type     = "http"
      expose   = true
      path     = "/ready"
      interval = "20s"
      timeout  = "8s"

      check_restart {
        limit = 6
        grace = "5m"
      }
    }

    tags = [
      [[ template "common/traefik_tags" $c ]]
    ]
  }

  [[ template "common/task.metrics_proxy" $c ]]

  # The loki task
  task "loki" {
    driver = "[[ $c.nomad.driver ]]"

    config {
      image   = "[[ $c.image ]]"
      command = "loki"
      args    = ["--config.file=/local/loki.yml", "--pattern-ingester.enabled=true"]
    }

    [[ template "common/vault.policies" $c ]]
    [[ template "common/file_env" $c ]]

    # Loki configuration: custom_config (a map, or a YAML string) is merged with the
    # bundled template. Any other kind falls back to the bundled template alone.
    template {
      data = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Not using custom_config as it's invalid
[[ template "monitoring/loki/loki.yml" $c ]]
[[- end ]]
_EOT
      destination = "local/loki.yml"
    }

    # A client cert, to connect to the AlertManager API.
    # The requested TTL is 72h plus 24h per allocation index.
    template {
      data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/loki"
(printf "common_name=loki-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
      destination   = "secrets/loki.bundle.pem"
      uid           = 100000
      gid           = 103100
      perms         = "0440"
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # The monitoring CA chain, to validate AlertManager cert
    template {
      data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
      destination   = "local/monitoring.ca.pem"
      uid           = 100000
      gid           = 100000
      change_mode   = "signal"
      change_signal = "SIGHUP"
    }

    # Persistent data
    volume_mount {
      volume      = "data"
      destination = "/data"
    }

    [[ template "common/resources" $c ]]
  }
}
# The aggregator group runs Vector with several source connectors (syslog, fluentd, vector, etc.)
# and a Loki sink. The goal is to be able to collect logs from various sources.
group "logs-aggregator" {
  [[- $c := merge .monitoring.aggregator .monitoring . ]]
  # Vector aggregator group.
  # $c is the merged configuration passed to every template rendered in this group.
  shutdown_delay = "6s"
  count          = [[ $c.count ]]

  network {
    mode = "bridge"
    # The syslog ports are only declared when the matching listener is enabled
    [[- if $c.syslog_udp.enabled ]]
    port "syslog_udp" {}
    [[- end ]]
    [[- if $c.syslog_tcp.enabled ]]
    port "syslog_tcp" {}
    [[- end ]]
    port "metrics" {}
  }

  # The main service is the vector source.
  # It will provide access to other services through the mesh (like loki)
  service {
    name = "vector-aggregator[[ .consul.suffix ]]"
    port = 9000
    [[ template "common/service_meta" $c ]]
    [[ template "common/connect" $c ]]

    tags = [
      [[ template "common/traefik_tags" merge $c.vector $c ]]
    ]
  }

  [[- if $c.syslog_udp.enabled ]]
  # The syslog UDP service can be used to ingest standard syslog logs from other
  # devices, and can be exposed by Traefik for this
  service {
    name = "syslog-udp[[ .consul.suffix ]]"
    port = "syslog_udp"

    tags = [
      [[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
    ]
  }
  [[- end ]]

  [[- if $c.syslog_tcp.enabled ]]
  # The syslog TCP service can be used to ingest standard syslog logs from other
  # devices, and can be exposed by Traefik for this
  service {
    name = "syslog-tcp[[ .consul.suffix ]]"
    port = "syslog_tcp"

    tags = [
      [[ template "common/traefik_tags" merge $c.syslog_tcp $c ]]
    ]
  }
  [[- end ]]

  [[- if $c.fluentd.enabled ]]
  # The fluentd service can be used to ingest fluentd logs
  service {
    name = "fluent[[ .consul.suffix ]]"
    port = 24224

    tags = [
      [[ template "common/traefik_tags" merge $c.fluentd $c ]]
    ]
  }
  [[- end ]]

  # The vector task
  task "vector" {
    driver = "[[ $c.nomad.driver ]]"
    leader = true

    config {
      image           = "[[ $c.image ]]"
      readonly_rootfs = true
      pids_limit      = 200
      args            = [ "--config=/local/vector.yml" ]
    }

    [[ template "common/vault.policies" $c ]]
    [[ template "common/file_env" $c ]]
    [[ template "common/metrics_cert" $c ]]

    # Vector configuration. In the rendered file, "%{" and "${" are doubled so HCL does not
    # interpret them, and custom delimiters are set for Nomad's template engine.
    template {
      data = <<_EOT
[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]]
_EOT
      destination     = "local/vector.yml"
      left_delimiter  = "{{{"
      right_delimiter = "}}}"
      change_mode     = "signal"
      change_signal   = "SIGHUP"
    }

    [[ template "common/resources" $c ]]
  }
}
group "grafana" {
  [[- $c := merge .monitoring.grafana .monitoring . ]]
  # Grafana group.
  # $c is the merged configuration passed to every template rendered in this group.
  shutdown_delay = "6s"

  network {
    mode = "bridge"
    port "metrics" {}
  }

  [[ template "common/volumes" $c ]]

  # Grafana web interface, health-checked over HTTP on /api/health
  service {
    name = "grafana[[ .consul.suffix ]]"
    port = 3000
    # NOTE(review): the other services of this job include "common/service_meta" here;
    # confirm "common/metrics_meta" is the intended template
    [[ template "common/metrics_meta" $c ]]
    [[ template "common/connect" $c ]]

    check {
      name     = "health"
      type     = "http"
      path     = "/api/health"
      expose   = true
      interval = "30s"
      timeout  = "8s"
    }

    tags = [
      [[ template "common/traefik_tags" $c ]]
    ]
  }

  [[ template "common/task.metrics_proxy" $c ]]
  [[ template "common/task.pgpooler" $c ]]
  # Pass the merged config like the other task templates, so the memcached
  # template is not rendered with nil data
  [[ template "common/task.memcached" $c ]]

  # The main grafana task
  task "grafana" {
    driver = "[[ $c.nomad.driver ]]"
    leader = true

    config {
      image           = "[[ $c.image ]]"
      readonly_rootfs = true
      pids_limit      = 100
      command         = "grafana"
      args = [
        "server",
        "--homepath=/opt/grafana",
        "--config=/secrets/grafana.ini",
        "--packaging=docker"
      ]
    }

    [[ template "common/vault.policies" $c ]]
    [[ template "common/file_env" $c ]]

    # Admin password, read from Vault and exposed to the task as an environment variable
    template {
      data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
      destination = "secrets/.grafana.env"
      # Quoted octal string, like the certificate templates of this job
      perms       = "0400"
      env         = true
    }

    # Basic grafana configuration file
    template {
      data = <<_EOT
[[ template "monitoring/grafana/grafana.ini" $c ]]
_EOT
      destination = "secrets/grafana.ini"
      uid         = 103000
      perms       = "0400"
    }

    # Mount volume in /data for persistence
    volume_mount {
      volume      = "data"
      destination = "/data"
    }

    [[ template "common/resources" $c ]]
  }
}
}