673 lines
17 KiB
HCL
673 lines
17 KiB
HCL
job "[[ .instance ]]-services" {
|
|
|
|
[[ template "common/job_start" . ]]
|
|
|
|
# The metrics-server group runs Prometheus
|
|
group "metrics-server" {
|
|
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
|
|
|
shutdown_delay = "6s"
|
|
count = [[ $c.count ]]
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
service {
|
|
name = "prometheus[[ .consul.suffix ]]"
|
|
port = 9090
|
|
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
expose = true
|
|
path = "/-/healthy"
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 10
|
|
grace = "5m"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
[[ template "common/task.metrics_proxy" $c ]]
|
|
|
|
# The main prometheus task
|
|
task "prometheus" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
readonly_rootfs = true
|
|
pids_limit = 200
|
|
command = "prometheus"
|
|
args = [
|
|
"--config.file=/local/prometheus.yml",
|
|
"--log.level=info",
|
|
"--web.listen-address=127.0.0.1:9090",
|
|
"--storage.tsdb.path=/data",
|
|
"--storage.tsdb.retention.time=[[ $c.retention ]]",
|
|
"--web.console.libraries=/opt/prometheus/console_libraries",
|
|
"--web.console.templates=/opt/prometheus/consoles",
|
|
"--web.external-url=[[ $c.public_url ]]",
|
|
"--web.route-prefix=/"
|
|
]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/artifacts" $c ]]
|
|
|
|
# Main configuration for prometheus
|
|
template {
  # The prometheus.yml template is rendered when the job file is generated.
  # Every "${" in its output is rewritten to "$${" (the HCL escape) so that
  # HCL does not try to interpolate it inside this heredoc.
  data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
  destination   = "local/prometheus.yml"
  uid           = 100000
  gid           = 109090
  # perms is a string holding an octal mode; quoted like the other
  # templates of this job ("0440", "0755") instead of a bare number.
  perms         = "0640"
  # Send SIGHUP to the task when the rendered file changes, instead of
  # restarting it.
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
|
|
|
|
# Alert rules
|
|
[[- range $bundle := file.ReadDir "bundles" ]]
|
|
[[- if file.Exists (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
|
|
[[- range $tpl := file.ReadDir (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
|
|
[[- if not (file.Exists (printf "prometheus/rules/%s" $tpl)) ]]
|
|
template {
|
|
data = <<_EOT
|
|
[[ tmpl.Inline (file.Read (printf "bundles/%s/templates/prometheus/rules/%s" $bundle $tpl)) $c ]]
|
|
_EOT
|
|
destination = "local/rules/[[ $tpl ]]"
|
|
left_delimiter = "{{{"
|
|
right_delimiter = "}}}"
|
|
}
|
|
[[- end ]]
|
|
[[- end ]]
|
|
[[- end ]]
|
|
[[- end ]]
|
|
|
|
[[- if file.Exists "prometheus/rules" ]]
[[- range $tpl := file.ReadDir "prometheus/rules" ]]

template {
  # One template stanza is emitted per entry found in prometheus/rules.
  # The file is rendered inline with $c as its context.
  data = <<_EOT
[[ tmpl.Inline (file.Read (printf "prometheus/rules/%s" $tpl)) $c ]]
_EOT
  # Use the named loop variable, as the data line above and the bundle rules
  # loop do, rather than the implicit dot (both hold the same value here).
  destination     = "local/rules/[[ $tpl ]]"
  # Triple-brace delimiters: double-brace expressions in the rule file are
  # not evaluated by Nomad's template engine and reach the rendered file as-is.
  left_delimiter  = "{{{"
  right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
|
|
|
|
[[- /* Support prometheus rules as artifacts or as raw content */]]
[[- range $k, $v := $c.alert_rules ]]
[[- if has $v "url" ]]
# Rule file downloaded as a single file from the configured URL
artifact {
  source      = "[[ $v.url ]]"
  destination = "local/rules/[[ $k ]].yml"
  mode        = "file"
}
[[- else if has $v "content" ]]
# Rule file provided inline in the configuration
template {
  data = <<_EOT
[[ $v.content ]]
_EOT
  destination     = "local/rules/[[ $k ]].yml"
  # Same triple-brace delimiters as the file-based rule templates of this
  # task, so that double-brace expressions in the raw rule content are not
  # evaluated by Nomad's template engine.
  left_delimiter  = "{{{"
  right_delimiter = "}}}"
}
[[- end ]]
[[- end ]]
|
|
|
|
# A client cert, to connect to the AlertManager API
# The certificate and its private key are requested from the "prometheus" role
# of the PKI mount and written, one after the other, to a single bundle file.
# The common name embeds the allocation index, and the requested TTL is
# 72h plus 24h per allocation index (72h for index 0, 96h for index 1, ...).
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/prometheus"
|
|
(printf "common_name=prometheus-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end -}}
|
|
_EOT
|
|
destination = "secrets/prometheus.bundle.pem"
|
|
uid = 100000
|
|
gid = 109090
|
|
perms = "0440"
|
|
# Send SIGHUP to the task when the bundle is re-rendered, instead of restarting it
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The monitoring CA chain, to validate AlertManager cert
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
uid = 100000
|
|
gid = 100000
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# Persistent data
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
|
|
group "alerts" {
|
|
|
|
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
|
|
|
|
shutdown_delay = "6s"
|
|
count = [[ $c.count ]]
|
|
|
|
network {
|
|
mode = "bridge"
|
|
# Port exposing the web API, with mTLS
|
|
port "web-tls" {}
|
|
# Port used for gossip between the different alertmanager instances
|
|
port "cluster" {}
|
|
# Port to expose metrics to prometheus
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
# This service is used for the different instances of alertmanager to communicate
|
|
service {
|
|
name = "alertmanager-gossip[[ .consul.suffix ]]"
|
|
port = "cluster"
|
|
meta {
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
}
|
|
}
|
|
|
|
# This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
|
|
# the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
|
|
service {
|
|
name = "alertmanager-tls[[ .consul.suffix ]]"
|
|
port = "web-tls"
|
|
meta {
|
|
alloc = "${NOMAD_ALLOC_INDEX}"
|
|
}
|
|
}
|
|
|
|
# This service is exposed through the service mesh
|
|
# and can be used to reach the web interface through Traefik
|
|
service {
|
|
name = "alertmanager[[ .consul.suffix ]]"
|
|
port = 9093
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
expose = true
|
|
path = "/-/healthy"
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 12
|
|
grace = "30s"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
# This task will handle mTLS to the AlertManager API
|
|
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
|
|
task "untls-proxy" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
user = 9093
|
|
|
|
config {
|
|
image = "nginxinc/nginx-unprivileged:alpine"
|
|
force_pull = true
|
|
readonly_rootfs = true
|
|
pids_limit = 30
|
|
volumes = [
|
|
"local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
|
|
]
|
|
[[ template "common/tmpfs" "/tmp" ]]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
|
|
lifecycle {
|
|
hook = "poststart"
|
|
sidecar = true
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
|
|
_EOT
|
|
destination = "local/alertmanager.conf"
|
|
}
|
|
|
|
[[ template "common/metrics_cert" $c ]]
|
|
|
|
# Certificate used by AlertManager
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
|
|
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end }}
|
|
_EOT
|
|
destination = "secrets/alertmanager.bundle.pem"
|
|
uid = 109093
|
|
gid = 100000
|
|
perms = "0440"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
resources {
|
|
cpu = 10
|
|
memory = 18
|
|
}
|
|
}
|
|
|
|
# The main alertmanager task
|
|
task "alertmanager" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
readonly_rootfs = true
|
|
pids_limit = 200
|
|
command = "/local/alertmanager"
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/file_env" $c ]]
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[- if isKind "map" $c.custom_config ]]
|
|
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
|
[[- else if isKind "string" $c.custom_config ]]
|
|
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
|
|
[[- else ]]
|
|
# Invalid custom config, using template only
|
|
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
|
|
[[- end ]]
|
|
_EOT
|
|
destination = "secrets/alertmanager.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
|
|
_EOT
|
|
destination = "local/cluster_tls.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
|
|
_EOT
|
|
destination = "local/web_tls.yml"
|
|
}
|
|
|
|
template {
|
|
data = <<_EOT
|
|
[[ template "monitoring/alertmanager/start.sh" $c ]]
|
|
_EOT
|
|
destination = "local/alertmanager"
|
|
uid = 100000
|
|
gid = 100000
|
|
perms = "0755"
|
|
}
|
|
|
|
# Certificate used by AlertManager
# The certificate and its private key are requested from the "alertmanager"
# role of the PKI mount and written to a single bundle file. The host IP of
# the "cluster" port is added as an IP SAN, and the requested TTL is 72h plus
# 24h per allocation index.
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
|
|
(printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end }}
|
|
_EOT
|
|
destination = "secrets/alertmanager.bundle.pem"
|
|
uid = 109093
|
|
# NOTE(review): 109090 is the gid used for the prometheus task's files, while
# the untls-proxy copy of this bundle uses gid 100000. Confirm this value is
# intended for alertmanager and is not a copy-paste leftover.
gid = 109090
|
|
perms = "0440"
|
|
# Send SIGHUP to the task when the bundle is re-rendered, instead of restarting it
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The trusted CA
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
}
|
|
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
|
|
group "logs-server" {
|
|
|
|
[[- $c := merge .monitoring.loki .monitoring . ]]
|
|
|
|
shutdown_delay = "6s"
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
service {
|
|
name = "loki[[ .consul.suffix ]]"
|
|
port = 3100
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "ready"
|
|
type = "http"
|
|
path = "/ready"
|
|
expose = true
|
|
interval = "20s"
|
|
timeout = "8s"
|
|
check_restart {
|
|
limit = 6
|
|
grace = "5m"
|
|
}
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
[[ template "common/task.metrics_proxy" $c ]]
|
|
|
|
task "loki" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
command = "loki"
|
|
args = ["--config.file=/local/loki.yml", "--pattern-ingester.enabled=true"]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/file_env" $c ]]
|
|
|
|
template {
|
|
data =<<_EOT
|
|
[[- if isKind "map" $c.custom_config ]]
|
|
[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
|
|
[[- else if isKind "string" $c.custom_config ]]
|
|
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
|
|
[[- else ]]
|
|
# Not using custom_config as it's invalid
|
|
[[ template "monitoring/loki/loki.yml" $c ]]
|
|
[[- end ]]
|
|
_EOT
|
|
destination = "local/loki.yml"
|
|
}
|
|
|
|
# A client cert, to connect to the AlertManager API
|
|
template {
|
|
data = <<_EOT
|
|
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/loki"
|
|
(printf "common_name=loki-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
|
|
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
|
{{ .Cert }}
|
|
{{ .Key }}
|
|
{{- end -}}
|
|
_EOT
|
|
destination = "secrets/loki.bundle.pem"
|
|
uid = 100000
|
|
gid = 103100
|
|
perms = "0440"
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
# The monitoring CA chain, to validate AlertManager cert
|
|
template {
|
|
data = <<_EOT
|
|
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
|
_EOT
|
|
destination = "local/monitoring.ca.pem"
|
|
uid = 100000
|
|
gid = 100000
|
|
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
|
|
# The aggregator group runs vector with several source connectors (syslog, fluentd, vector, etc.)
|
|
# And with a loki sink. The goal is to be able to collect logs from various sources
|
|
group "logs-aggregator" {
|
|
[[- $c := merge .monitoring.aggregator .monitoring . ]]
|
|
|
|
count = [[ $c.count ]]
|
|
shutdown_delay = "6s"
|
|
|
|
network {
|
|
mode = "bridge"
|
|
[[- if $c.syslog_udp.enabled ]]
|
|
port "syslog_udp" {}
|
|
[[- end ]]
|
|
[[- if $c.syslog_tcp.enabled ]]
|
|
port "syslog_tcp" {}
|
|
[[- end ]]
|
|
port "metrics" {}
|
|
}
|
|
|
|
# The main service is the vector source
|
|
# It will provide access to other services through the mesh (like loki)
|
|
service {
|
|
name = "vector-aggregator[[ .consul.suffix ]]"
|
|
port = 9000
|
|
[[ template "common/service_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
tags = [
|
|
[[ template "common/traefik_tags" merge $c.vector $c ]]
|
|
]
|
|
}
|
|
|
|
[[- if $c.syslog_udp.enabled ]]
|
|
# The syslog UDP service can be used to ingest standard syslog logs from other
|
|
# devices, and can be exposed by Traefik for this
|
|
service {
|
|
name = "syslog-udp[[ .consul.suffix ]]"
|
|
port = "syslog_udp"
|
|
tags = [
|
|
[[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
|
|
]
|
|
}
|
|
[[- end ]]
|
|
|
|
[[- if $c.syslog_tcp.enabled ]]
|
|
# The syslog TCP service can be used to ingest standard syslog logs from other
|
|
# devices, and can be exposed by Traefik for this
|
|
service {
|
|
name = "syslog-tcp[[ .consul.suffix ]]"
|
|
port = "syslog_tcp"
|
|
tags = [
|
|
[[ template "common/traefik_tags" merge $c.syslog_tcp $c ]]
|
|
]
|
|
}
|
|
[[- end ]]
|
|
|
|
[[- if $c.fluentd.enabled ]]
|
|
# The fluentd service can be used to ingest fluentd logs
|
|
service {
|
|
name = "fluent[[ .consul.suffix ]]"
|
|
port = 24224
|
|
tags = [
|
|
[[ template "common/traefik_tags" merge $c.fluentd $c ]]
|
|
]
|
|
}
|
|
[[- end ]]
|
|
|
|
task "vector" {
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
readonly_rootfs = true
|
|
pids_limit = 200
|
|
args = [ "--config=/local/vector.yml" ]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/file_env" $c ]]
|
|
[[ template "common/metrics_cert" $c ]]
|
|
|
|
# Vector configuration, rendered from monitoring/aggregator/vector.yml when
# the job file is generated.
template {
|
|
# The two replaceAll calls rewrite "%{" to "%%{" and "${" to "$${", the HCL
# escapes for its template sequences, so HCL does not interpret them inside
# this heredoc.
data = <<_EOT
|
|
[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]]
|
|
_EOT
|
|
destination = "local/vector.yml"
|
|
# Triple-brace delimiters: double-brace sequences in the vector configuration
# are not evaluated by Nomad's template engine.
left_delimiter = "{{{"
|
|
right_delimiter = "}}}"
|
|
# Send SIGHUP to the task when the rendered file changes, instead of restarting it
change_mode = "signal"
|
|
change_signal = "SIGHUP"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
|
|
group "grafana" {
|
|
[[- $c := merge .monitoring.grafana .monitoring . ]]
|
|
|
|
shutdown_delay = "6s"
|
|
|
|
network {
|
|
mode = "bridge"
|
|
port "metrics" {}
|
|
}
|
|
|
|
[[ template "common/volumes" $c ]]
|
|
|
|
service {
|
|
name = "grafana[[ .consul.suffix ]]"
|
|
port = 3000
|
|
|
|
[[ template "common/metrics_meta" $c ]]
|
|
[[ template "common/connect" $c ]]
|
|
|
|
check {
|
|
name = "health"
|
|
type = "http"
|
|
path = "/api/health"
|
|
expose = true
|
|
interval = "30s"
|
|
timeout = "8s"
|
|
}
|
|
|
|
tags = [
|
|
[[ template "common/traefik_tags" $c ]]
|
|
]
|
|
}
|
|
|
|
[[ template "common/task.metrics_proxy" $c ]]
|
|
[[ template "common/task.pgpooler" $c ]]
|
|
[[ template "common/task.memcached" ]]
|
|
|
|
task "grafana" {
|
|
|
|
driver = "[[ $c.nomad.driver ]]"
|
|
leader = true
|
|
|
|
config {
|
|
image = "[[ $c.image ]]"
|
|
readonly_rootfs = true
|
|
pids_limit = 100
|
|
command = "grafana"
|
|
args = [
|
|
"server",
|
|
"--homepath=/opt/grafana",
|
|
"--config=/secrets/grafana.ini",
|
|
"--packaging=docker"
|
|
]
|
|
}
|
|
|
|
[[ template "common/vault.policies" $c ]]
|
|
[[ template "common/file_env" $c ]]
|
|
|
|
# Environment file holding the initial Grafana admin password, read from the
# "grafana" KV secret of this instance and single-quoted with sprig_squote.
template {
  data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
  destination = "secrets/.grafana.env"
  # perms is a string holding an octal mode; quoted like the certificate
  # templates of this job instead of a bare number.
  perms       = "0400"
  # Expose the rendered key=value lines as environment variables of the task
  env         = true
}
|
|
|
|
# Basic grafana configuration file
|
|
template {
  # Rendered from the monitoring/grafana/grafana.ini template with $c as context
  data = <<_EOT
[[ template "monitoring/grafana/grafana.ini" $c ]]
_EOT
  destination = "secrets/grafana.ini"
  uid         = 103000
  # perms is a string holding an octal mode; quoted like the certificate
  # templates of this job instead of a bare number.
  perms       = "0400"
}
|
|
|
|
# Mount volume in /data for persistence
|
|
volume_mount {
|
|
volume = "data"
|
|
destination = "/data"
|
|
}
|
|
|
|
[[ template "common/resources" $c ]]
|
|
}
|
|
}
|
|
}
|