# Monitoring services job. It declares five groups:
#  - metrics-server:  prometheus
#  - alerts:          alertmanager, plus an nginx proxy terminating mTLS
#  - logs-server:     loki
#  - logs-aggregator: vector
#  - grafana:         grafana
# Each group builds its own settings by merging its dedicated section over the common monitoring ones.
job "[[ .instance ]]-services" {

[[ template "common/job_start" . ]]

  # The metrics-server group runs prometheus
  group "metrics-server" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]

    shutdown_delay = "6s"
    count          = [[ $c.count ]]

    network {
      mode = "bridge"
      port "metrics" {}
    }

[[ template "common/volumes" $c ]]

    service {
      name = "prometheus[[ .consul.suffix ]]"
      port = 9090

[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]

      check {
        name     = "health"
        type     = "http"
        expose   = true
        path     = "/-/healthy"
        interval = "20s"
        timeout  = "8s"
        check_restart {
          limit = 10
          grace = "5m"
        }
      }

      tags = [
[[ template "common/traefik_tags" $c ]]
      ]
    }

[[ template "common/task.metrics_proxy" $c ]]

    # The main prometheus task
    task "prometheus" {
      driver = "[[ $c.nomad.driver ]]"
      leader = true

      config {
        image           = "[[ $c.image ]]"
        readonly_rootfs = true
        pids_limit      = 200
        command         = "prometheus"
        args = [
          "--config.file=/local/prometheus.yml",
          "--log.level=info",
          "--web.listen-address=127.0.0.1:9090",
          "--storage.tsdb.path=/data",
          "--storage.tsdb.retention.time=[[ $c.retention ]]",
          "--web.console.libraries=/opt/prometheus/console_libraries",
          "--web.console.templates=/opt/prometheus/consoles",
          "--web.external-url=[[ $c.public_url ]]",
          "--web.route-prefix=/"
        ]
      }

[[ template "common/vault.policies" $c ]]
[[ template "common/artifacts" $c ]]

      # Main configuration for prometheus
      # The dollar-brace sequences are escaped so that HCL does not try to interpolate them
      template {
        data = <<_EOT
[[ tmpl.Exec "monitoring/prometheus/prometheus.yml" $c | replaceAll "${" "$${" ]]
_EOT
        destination   = "local/prometheus.yml"
        uid           = 100000
        gid           = 109090
        perms         = "0640"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # Alert rules
      # Rules shipped by bundles are rendered first, unless a file with the same name
      # exists in the local prometheus/rules directory (which is rendered just after).
      # Custom delimiters are used so the double curly braces of the rules are left untouched.
[[- range $bundle := file.ReadDir "bundles" ]]
[[- if file.Exists (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
[[- range $tpl := file.ReadDir (printf "bundles/%s/templates/prometheus/rules" $bundle) ]]
[[- if not (file.Exists (printf "prometheus/rules/%s" $tpl)) ]]
      template {
        data = <<_EOT
[[ tmpl.Inline (file.Read (printf "bundles/%s/templates/prometheus/rules/%s" $bundle $tpl)) $c ]]
_EOT
        destination     = "local/rules/[[ $tpl ]]"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }
[[- end ]]
[[- end ]]
[[- end ]]
[[- end ]]

[[- if file.Exists "prometheus/rules" ]]
[[- range $tpl := file.ReadDir "prometheus/rules" ]]
      template {
        data = <<_EOT
[[ tmpl.Inline (file.Read (printf "prometheus/rules/%s" $tpl)) $c ]]
_EOT
        destination     = "local/rules/[[ $tpl ]]"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
      }
[[- end ]]
[[- end ]]

[[- /* Support prometheus rules as artifacts or as raw content */]]
[[- range $k, $v := $c.alert_rules ]]
[[- if has $v "url" ]]
      artifact {
        source      = "[[ $v.url ]]"
        destination = "local/rules/[[ $k ]].yml"
        mode        = "file"
      }
[[- else if has $v "content" ]]
      # NOTE(review): unlike the file based rules above, this template keeps the default
      # delimiters, so the raw content goes through the Nomad template engine. Confirm this is intended.
      template {
        data = <<_EOT
[[ $v.content ]]
_EOT
        destination = "local/rules/[[ $k ]].yml"
      }
[[- end ]]
[[- end ]]

      # A client cert, to connect to the AlertManager API
      # The TTL is 72h, plus 24h per alloc index, so it differs from one instance to the other
      template {
        data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/prometheus"
  (printf "common_name=prometheus-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
        destination   = "secrets/prometheus.bundle.pem"
        uid           = 100000
        gid           = 109090
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # The monitoring CA chain, to validate AlertManager cert
      template {
        data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination   = "local/monitoring.ca.pem"
        uid           = 100000
        gid           = 100000
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # Persistent data
      volume_mount {
        volume      = "data"
        destination = "/data"
      }

[[ template "common/resources" $c ]]
    }
  }

  # The alerts group runs alertmanager
  group "alerts" {
[[- $c := merge .monitoring.alertmanager .monitoring . ]]

    shutdown_delay = "6s"
    count          = [[ $c.count ]]

    network {
      mode = "bridge"
      # Port exposing the web API, with mTLS
      port "web-tls" {}
      # Port used for gossip between the different alertmanager instances
      port "cluster" {}
      # Port to expose metrics to prometheus
      port "metrics" {}
    }

[[ template "common/volumes" $c ]]

    # This service is used for the different instances of alertmanager to communicate
    service {
      name = "alertmanager-gossip[[ .consul.suffix ]]"
      port = "cluster"
      meta {
        alloc = "${NOMAD_ALLOC_INDEX}"
      }
    }

    # This service is used by prometheus. As it needs to be able to reach every instance, it cannot use
    # the service mesh. The exposed port uses mTLS, so it's safe to expose it outside of the mesh
    service {
      name = "alertmanager-tls[[ .consul.suffix ]]"
      port = "web-tls"
      meta {
        alloc = "${NOMAD_ALLOC_INDEX}"
      }
    }

    # This service is exposed through the service mesh
    # and can be used to reach the web interface through Traefik
    service {
      name = "alertmanager[[ .consul.suffix ]]"
      port = 9093

[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]

      check {
        name     = "health"
        type     = "http"
        expose   = true
        path     = "/-/healthy"
        interval = "20s"
        timeout  = "8s"
        check_restart {
          limit = 12
          grace = "30s"
        }
      }

      tags = [
[[ template "common/traefik_tags" $c ]]
      ]
    }

    # This task will handle mTLS to the AlertManager API
    # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
    task "untls-proxy" {
      driver = "[[ $c.nomad.driver ]]"
      user   = 9093

      config {
        image           = "nginxinc/nginx-unprivileged:alpine"
        force_pull      = true
        readonly_rootfs = true
        pids_limit      = 30
        volumes = [
          "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro",
        ]
[[ template "common/tmpfs" "/tmp" ]]
      }

[[ template "common/vault.policies" $c ]]

      # Started once the main task is running, and kept running alongside it
      lifecycle {
        hook    = "poststart"
        sidecar = true
      }

      template {
        data = <<_EOT
[[ template "monitoring/alertmanager/nginx.conf" $c ]]
_EOT
        destination = "local/alertmanager.conf"
      }

[[ template "common/metrics_cert" $c ]]

      # Certificate used by AlertManager
      template {
        data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
  (printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
  (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination   = "secrets/alertmanager.bundle.pem"
        uid           = 109093
        gid           = 100000
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      resources {
        cpu    = 10
        memory = 18
      }
    }

    # The main alertmanager task
    task "alertmanager" {
      driver = "[[ $c.nomad.driver ]]"
      leader = true

      config {
        image           = "[[ $c.image ]]"
        readonly_rootfs = true
        pids_limit      = 200
        # Rendered below from the start.sh template
        command         = "/local/alertmanager"
      }

[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]

      # Main configuration: custom_config (either a map or a YAML string) is merged with the
      # configuration rendered from the template. Any other kind falls back to the template alone.
      template {
        data = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/alertmanager/alertmanager.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Invalid custom config, using template only
[[ template "monitoring/alertmanager/alertmanager.yml" $c ]]
[[- end ]]
_EOT
        destination = "secrets/alertmanager.yml"
      }

      template {
        data = <<_EOT
[[ template "monitoring/alertmanager/cluster_tls.yml" $c ]]
_EOT
        destination = "local/cluster_tls.yml"
      }

      template {
        data = <<_EOT
[[ template "monitoring/alertmanager/web_tls.yml" $c ]]
_EOT
        destination = "local/web_tls.yml"
      }

      # Startup script, used as the command of this task
      template {
        data = <<_EOT
[[ template "monitoring/alertmanager/start.sh" $c ]]
_EOT
        destination = "local/alertmanager"
        uid         = 100000
        gid         = 100000
        perms       = "0755"
      }

      # Certificate used by AlertManager
      # NOTE(review): the gid differs from the one used for the same certificate in the
      # untls-proxy task (100000). Confirm which one is intended.
      template {
        data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/alertmanager"
  (printf "common_name=alertmanager-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
  (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination   = "secrets/alertmanager.bundle.pem"
        uid           = 109093
        gid           = 109090
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # The trusted CA
      template {
        data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      volume_mount {
        volume      = "data"
        destination = "/data"
      }

[[ template "common/resources" $c ]]
    }
  }

  # The logs-server group runs loki
  group "logs-server" {
[[- $c := merge .monitoring.loki .monitoring . ]]

    shutdown_delay = "6s"

    network {
      mode = "bridge"
      port "metrics" {}
    }

[[ template "common/volumes" $c ]]

    service {
      name = "loki[[ .consul.suffix ]]"
      port = 3100

[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]

      check {
        name     = "ready"
        type     = "http"
        path     = "/ready"
        expose   = true
        interval = "20s"
        timeout  = "8s"
        check_restart {
          limit = 6
          grace = "5m"
        }
      }

      tags = [
[[ template "common/traefik_tags" $c ]]
      ]
    }

[[ template "common/task.metrics_proxy" $c ]]

    task "loki" {
      driver = "[[ $c.nomad.driver ]]"

      config {
        image   = "[[ $c.image ]]"
        command = "loki"
        args    = ["--config.file=/local/loki.yml", "--pattern-ingester.enabled=true"]
      }

[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]

      # Main configuration: custom_config (either a map or a YAML string) is merged with the
      # configuration rendered from the template. Any other kind falls back to the template alone.
      template {
        data = <<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Not using custom_config as it's invalid
[[ template "monitoring/loki/loki.yml" $c ]]
[[- end ]]
_EOT
        destination = "local/loki.yml"
      }

      # A client cert, to connect to the AlertManager API
      template {
        data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/loki"
  (printf "common_name=loki-%s.[[ .instance ]].[[ .consul.domain ]]" (env "NOMAD_ALLOC_INDEX"))
  (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
        destination   = "secrets/loki.bundle.pem"
        uid           = 100000
        gid           = 103100
        perms         = "0440"
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      # The monitoring CA chain, to validate AlertManager cert
      template {
        data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination   = "local/monitoring.ca.pem"
        uid           = 100000
        gid           = 100000
        change_mode   = "signal"
        change_signal = "SIGHUP"
      }

      volume_mount {
        volume      = "data"
        destination = "/data"
      }

[[ template "common/resources" $c ]]
    }
  }

  # The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.)
  # And with a loki sink. The goal is to be able to collect logs from various sources
  group "logs-aggregator" {
[[- $c := merge .monitoring.aggregator .monitoring . ]]

    count          = [[ $c.count ]]
    shutdown_delay = "6s"

    network {
      mode = "bridge"
[[- if $c.syslog_udp.enabled ]]
      port "syslog_udp" {}
[[- end ]]
[[- if $c.syslog_tcp.enabled ]]
      port "syslog_tcp" {}
[[- end ]]
      port "metrics" {}
    }

    # The main service is the vector source
    # It will provide access to other services through the mesh (like loki)
    service {
      name = "vector-aggregator[[ .consul.suffix ]]"
      port = 9000

[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]

      tags = [
[[ template "common/traefik_tags" merge $c.vector $c ]]
      ]
    }

[[- if $c.syslog_udp.enabled ]]

    # The syslog UDP service can be used to ingest standard syslog logs from other
    # devices, and can be exposed by Traefik for this
    service {
      name = "syslog-udp[[ .consul.suffix ]]"
      port = "syslog_udp"

      tags = [
[[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
      ]
    }
[[- end ]]

[[- if $c.syslog_tcp.enabled ]]

    # The syslog TCP service can be used to ingest standard syslog logs from other
    # devices, and can be exposed by Traefik for this
    service {
      name = "syslog-tcp[[ .consul.suffix ]]"
      port = "syslog_tcp"

      tags = [
[[ template "common/traefik_tags" merge $c.syslog_tcp $c ]]
      ]
    }
[[- end ]]

[[- if $c.fluentd.enabled ]]

    # The fluentd service can be used to ingest fluentd logs
    service {
      name = "fluent[[ .consul.suffix ]]"
      port = 24224

      tags = [
[[ template "common/traefik_tags" merge $c.fluentd $c ]]
      ]
    }
[[- end ]]

    task "vector" {
      driver = "[[ $c.nomad.driver ]]"
      leader = true

      config {
        image           = "[[ $c.image ]]"
        readonly_rootfs = true
        pids_limit      = 200
        args            = [ "--config=/local/vector.yml" ]
      }

[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
[[ template "common/metrics_cert" $c ]]

      # Vector configuration. The percent-brace and dollar-brace sequences are escaped so that
      # HCL leaves them untouched, and custom delimiters are used for the Nomad template
      template {
        data = <<_EOT
[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]]
_EOT
        destination     = "local/vector.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        change_mode     = "signal"
        change_signal   = "SIGHUP"
      }

[[ template "common/resources" $c ]]
    }
  }

  # The grafana group runs grafana
  group "grafana" {
[[- $c := merge .monitoring.grafana .monitoring . ]]

    shutdown_delay = "6s"

    network {
      mode = "bridge"
      port "metrics" {}
    }

[[ template "common/volumes" $c ]]

    service {
      name = "grafana[[ .consul.suffix ]]"
      port = 3000

[[ template "common/metrics_meta" $c ]]
[[ template "common/connect" $c ]]

      check {
        name     = "health"
        type     = "http"
        path     = "/api/health"
        expose   = true
        interval = "30s"
        timeout  = "8s"
      }

      tags = [
[[ template "common/traefik_tags" $c ]]
      ]
    }

[[ template "common/task.metrics_proxy" $c ]]
[[ template "common/task.pgpooler" $c ]]
[[ template "common/task.memcached" ]]

    task "grafana" {
      driver = "[[ $c.nomad.driver ]]"
      leader = true

      config {
        image           = "[[ $c.image ]]"
        readonly_rootfs = true
        pids_limit      = 100
        command         = "grafana"
        args = [
          "server",
          "--homepath=/opt/grafana",
          "--config=/secrets/grafana.ini",
          "--packaging=docker"
        ]
      }

[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]

      # Admin password, read from vault and exposed to the task as an environment variable
      template {
        data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
        destination = "secrets/.grafana.env"
        perms       = "0400"
        env         = true
      }

      # Basic grafana configuration file
      template {
        data = <<_EOT
[[ template "monitoring/grafana/grafana.ini" $c ]]
_EOT
        destination = "secrets/grafana.ini"
        uid         = 103000
        perms       = "0400"
      }

      # Mount volume in /data for persistence
      volume_mount {
        volume      = "data"
        destination = "/data"
      }

[[ template "common/resources" $c ]]
    }
  }
}