# System job deploying the monitoring agents (log collector, node-exporter,
# consul agent metrics proxy) on every eligible node of the cluster.
job "monitoring-agent" {

  datacenters = ["dc1"]
  region      = "global"
  node_pool   = "all"

  # Slightly elevated priority: monitoring should be scheduled before regular workloads
  priority = 60

  # system job: run one instance on every node of the pool
  type = "system"
2024-03-25 14:54:13 +01:00
# This group will collect logs from the allocation running on the node
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
# and then vector to read logs from all the discovered allocations. Logs are fowarded to loki through the service mesh
2024-03-25 12:27:46 +01:00
group "logs-collector" {
network {
mode = "bridge"
port "metrics" {}
}
# Try harder to restart tasks if they fail
restart {
attempts = 20
interval = "5m"
mode = "delay"
}
2024-04-08 22:44:10 +02:00
2024-03-25 12:27:46 +01:00
volume "data" {
source = "vector_data"
type = "host"
}
2024-04-08 22:44:10 +02:00
2024-03-25 12:27:46 +01:00
volume "nomad" {
source = "nomad_alloc"
type = "host"
read_only = true
}
service {
name = "vector-agent"
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${node.unique.name}"
2024-03-28 09:39:59 +01:00
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
2024-03-25 12:27:46 +01:00
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
2024-03-28 09:39:59 +01:00
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
2024-03-25 12:27:46 +01:00
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Work arround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
2024-04-17 10:13:09 +02:00
config {
protocol = "http"
}
2024-03-25 12:27:46 +01:00
}
}
}
sidecar_task {
config {
args = [
"-c" ,
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json" ,
"-l" ,
"${meta.connect.log_level}" ,
"--concurrency" ,
"${meta.connect.proxy_concurrency}" ,
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
}
task "nomad-vector-logger" {
driver = "docker"
# Use a random user instead of root
user = 3987
config {
image = "danielberteaud/nomad-vector-logger:24.3-2"
readonly_rootfs = true
pids_limit = 50
# Nomad Vector Logger needs to run on the host's network namespace
# so it can reach the Nomad Agent API on localhost:4646
network_mode = "host"
# Host network namespace requires disabling user namespace
userns_mode = "host"
command = "nomad-vector-logger"
args = [
"--config" ,
"/local/nomad-vector-logger.toml"
]
}
# We want to run Nomad Vector Logger before vector agent
lifecycle {
hook = "prestart"
sidecar = true
}
vault {
policies = [ "nomad-vector-logger" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = < < _EOT
LANG = fr_FR . utf8
TZ = Europe / Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Env to access Nomad API
template {
data = < < _EOT
NOMAD_TOKEN = {{ with secret "nomad/creds/nomad-vector-logger" }}{{ . Data . secret_id }}{{ end }}
NOMAD_ADDR = https : / / localhost : 4646
NOMAD_CLIENT_CERT = / secrets / nomad . bundle . pem
NOMAD_CLIENT_KEY = / secrets / nomad . bundle . pem
NOMAD_CACERT = / local / nomad . ca . pem
_EOT
destination = "secrets/.nomad-vector-logger.env"
perms = 400
env = true
}
# The main configuration file for nomad-vector-logger
template {
data = < < _EOT
[ app ]
log_level = "info"
env = "prod"
refresh_interval = "10s"
remove_alloc_interval = "30s"
nomad_data_dir = "/nomad"
vector_config_dir = "/alloc/data/vector_conf"
extra_templates_dir = "/local/template/"
custom_logs_dir = "alloc/custom"
_EOT
destination = "local/nomad-vector-logger.toml"
}
2024-03-25 22:23:31 +01:00
# Disable the default nomad.toml template, as we provide our own nomad.yml template
2024-03-25 12:27:46 +01:00
template {
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
}
# The vector configuration template used to generate the vector conf
template {
data = < < _EOT
sources :
{{- range $value : = . }}
source_ {{ $ value . Key }}:
type : file
include : [ "{{ $value.LogDir }}" ]
line_delimiter : "\n"
read_from : beginning
# Handle multi-line Java stacktraces
multiline :
start_pattern : "^[^\\s]"
mode : continue_through
condition_pattern : "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms : 1000
2024-03-25 22:23:31 +01:00
ignore_older_secs : 1800
oldest_first : true
2024-03-25 12:27:46 +01:00
{{- end }}
transforms :
{{- range $value : = . }}
transform_ {{ $ value . Key }}:
type : remap
inputs : [ "source_{{ $value.Key }}" ]
source : |
# Store Nomad metadata.
.nomad.namespace = "{{ $value.Namespace }}"
.nomad.node_name = "{{ $value.Node }}"
.nomad.job_name = "{{ $value.Job }}"
.nomad.group_name = "{{ $value.Group }}"
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
2024-03-25 14:54:13 +01:00
.nomad.alloc = replace ( "{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index" )
# Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.instance = join ! ( [ "{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-" )
2024-03-25 12:27:46 +01:00
{{- end }}
_EOT
destination = "local/template/nomad.yml"
# {{ }} is used by the template, so prevent consul-template to interprete it
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# Get a client cert for the Nomad API
template {
data = < < _EOT
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger"
"common_name = nomad - vector - logger . nomad . consul "
"ttl = 72 h " }}
{{ . Cert }}
{{ . Key }}
{{- end }}
_EOT
destination = "secrets/nomad.bundle.pem"
uid = 3987
perms = "0400"
}
# The CA chain to validate Nomad certificates
template {
data = < < _EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ . Data . ca_chain }}{{ end }}
_EOT
destination = "local/nomad.ca.pem"
}
# The main config file
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
resources {
cpu = 20
memory = 24
memory_max = 50
}
}
# Nomad Vector Logger can take a few seconds to generate the initial configuration file
# This task ensure the file exists before vector is started (to prevent an error as the
# transform_nomad_alloc_* sources won't have anything before the file exists)
task "wait-for-vector-conf" {
driver = "docker"
config {
2024-03-25 22:23:31 +01:00
image = "busybox:latest"
readonly_rootfs = true
pids_limit = 20
command = "sh"
2024-03-25 12:27:46 +01:00
args = [
"-c" ,
2024-03-25 22:23:31 +01:00
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
2024-03-25 12:27:46 +01:00
]
}
lifecycle {
hook = "prestart"
}
# The task will shutdown once the config is available, so just
# allocate very few resources
resources {
cpu = 10
memory = 10
}
}
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
task "vector" {
driver = "docker"
leader = true
config {
2024-04-11 22:44:15 +02:00
image = "danielberteaud/vector:0.37.1-1"
2024-03-25 22:23:31 +01:00
userns_mode = "host"
readonly_rootfs = true
2024-03-28 09:39:59 +01:00
pids_limit = 1000
2024-03-25 12:27:46 +01:00
args = [
"--watch-config" ,
"--config", "/local/vector.yml" ,
"--config-dir", "/alloc/data/vector_conf"
]
}
vault {
policies = [ "metrics" ]
env = false
disable_file = true
change_mode = "noop"
}
env {
NODE_UNIQUE_NAME = "${node.unique.name}"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = < < _EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans = % s " (env "NOMAD_HOST_IP_metrics" ) ) }}
{{ . Cert }}
{{ . Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = < < _EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ . Data . ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
2024-03-25 22:23:31 +01:00
# Main vector configuration
2024-03-25 12:27:46 +01:00
template {
data = < < _EOT
data_dir : / data
# Don't keep metrics indefinitly if they are not updated anymore
expire_metrics_secs : 60
sources :
metrics - vector :
type : internal_metrics
sinks :
loki :
type : loki
inputs : [ "transform_nomad_alloc_*" ]
endpoint : http : / / 127 . 0 . 0 . 1 : 3100
encoding :
codec : text
labels :
namespace : "{{ .nomad.namespace }}"
job : "{{ .nomad.job_name }}"
group : "{{ .nomad.group_name }}"
task : "{{ .nomad.task_name }}"
host : "{{ .nomad.node_name }}"
alloc : "{{ .nomad.alloc }}"
buffer :
type : disk
max_size : 268435488
remove_label_fields : true
prometheus :
type : prometheus_exporter
inputs : [ "metrics-vector" ]
address : 0 . 0 . 0 . 0 : ${ N O M A D _ A L L O C _ P O R T _ m e t r i c s }
tls :
enabled : true
crt_file : / secrets / metrics . bundle . pem
key_file : / secrets / metrics . bundle . pem
ca_file : / local / monitoring . ca . pem
verify_certificate : true
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
wait {
min = "5s"
max = "30s"
}
}
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
volume_mount {
volume = "data"
destination = "/data"
read_only = false
}
resources {
cpu = 100
2024-03-25 22:23:31 +01:00
memory = 384
memory_max = 512
2024-03-25 12:27:46 +01:00
}
}
}
2024-03-25 14:54:13 +01:00
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
group "node-exporter" {
network {
mode = "bridge"
port "metrics" {}
}
2024-04-08 22:44:10 +02:00
2024-03-25 14:54:13 +01:00
volume "host" {
source = "host_root"
type = "host"
read_only = true
}
service {
name = "node-exporter"
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${node.unique.name}"
2024-03-28 09:39:59 +01:00
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
2024-03-25 14:54:13 +01:00
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
2024-03-28 09:39:59 +01:00
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
2024-03-25 14:54:13 +01:00
}
}
task "node-exporter" {
driver = "docker"
2024-04-24 15:49:58 +02:00
user = 100320
2024-03-25 14:54:13 +01:00
config {
2024-04-24 15:49:58 +02:00
image = "danielberteaud/node-exporter:1.8.0-1"
2024-03-25 22:23:31 +01:00
pid_mode = "host"
2024-03-25 14:54:13 +01:00
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
2024-03-25 22:23:31 +01:00
command = "/usr/local/bin/node_exporter"
2024-03-25 14:54:13 +01:00
args = [
"--path.rootfs = / host " ,
"--web.config.file = / local / tls . yml " ,
2024-03-25 22:23:31 +01:00
"--web.listen-address = : ${ N O M A D _ A L L O C _ P O R T _ m e t r i c s } " ,
"--collector.filesystem.mount-points-exclude = ^ / ( dev | proc | sys | var / lib / ( docker | containers ) / . + | opt / nomad / data / ( alloc | client ) ) ( $ | / ) " ,
2024-03-25 14:54:13 +01:00
]
}
vault {
policies = [ "metrics" ]
env = false
disable_file = true
change_mode = "noop"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = < < _EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans = % s " (env "NOMAD_HOST_IP_metrics" ) ) }}
{{ . Cert }}
{{ . Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = < < _EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ . Data . ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
2024-03-25 22:23:31 +01:00
2024-03-25 14:54:13 +01:00
template {
data = < < _EOT
tls_server_config :
cert_file : / secrets / metrics . bundle . pem
key_file : / secrets / metrics . bundle . pem
client_ca_file : / local / monitoring . ca . pem
client_auth_type : RequireAndVerifyClientCert
_EOT
destination = "local/tls.yml"
}
volume_mount {
volume = "host"
destination = "/host"
read_only = true
propagation_mode = "host-to-task"
}
resources {
cpu = 50
2024-03-26 15:43:00 +01:00
memory = 32
memory_max = 56
2024-03-25 14:54:13 +01:00
}
}
}
2024-04-21 22:03:24 +02:00
group "consul-agent-exporter" {
shutdown_delay = "6s"
ephemeral_disk {
# Use minimal ephemeral disk
size = 101
}
network {
mode = "bridge"
port "metrics" {}
}
service {
name = "consul-agent"
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${node.unique.name}"
datacenter = "${NOMAD_DC}"
group = "${NOMAD_GROUP_NAME}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
node = "${node.unique.name}"
region = "${NOMAD_REGION}"
}
}
task "consul-agent-metrics-proxy" {
driver = "docker"
config {
image = "nginxinc/nginx-unprivileged:alpine"
readonly_rootfs = true
mount {
type = "tmpfs"
target = "/tmp"
tmpfs_options {
size = 3000000
}
}
volumes = [
"secrets/nginx.conf:/etc/nginx/conf.d/default.conf:ro"
]
}
vault {
policies = [ "metrics", "cluster-exporter" ]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = < < _EOT
LANG = fr_FR . utf8
TZ = Europe / Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = < < _EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans = % s " (env "NOMAD_HOST_IP_metrics" ) ) }}
{{ . Cert }}
{{ . Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = < < _EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ . Data . ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = < < _EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl ;
http2 on ;
ssl_certificate / secrets / metrics . bundle . pem ;
ssl_certificate_key / secrets / metrics . bundle . pem ;
ssl_client_certificate / local / monitoring . ca . pem ;
ssl_verify_client on ;
ssl_protocols TLSv1 . 2 TLSv1 . 3 ;
ssl_ciphers ECDHE - ECDSA - AES128 - GCM - SHA256 : ECDHE - RSA - AES128 - GCM - SHA256 : ECDHE - ECDSA - AES256 - GCM - SHA384 : ECDHE - RSA - AES256 - GCM - SHA384 : ECDHE - ECDSA - CHACHA20 - POLY1305 : ECDHE - RSA - CHACHA20 - POLY1305 : DHE - RSA - AES128 - GCM - SHA256 : DHE - RSA - AES256 - GCM - SHA384 ;
ssl_session_cache shared : SSL : 10 m ;
ssl_session_timeout 1 h ;
ssl_session_tickets off ;
gzip on ;
gzip_types
text / plain ;
gzip_vary on ;
server_tokens off ;
if ( $ request_method ! ~ ^ ( GET | HEAD ) $ ) {
return 405 ;
}
set $ consul_token "{{ with secret "consul/creds/cluster-exporter" }}{{ .Data.token }}{{ end }}" ;
location / metrics {
proxy_pass http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/agent/metrics?format = prometheus ;
proxy_set_header X - Consul - Token $ consul_token ;
}
}
_EOT
destination = "secrets/nginx.conf"
}
resources {
cpu = 10
memory = 15
memory_max = 24
}
}
}
2024-03-25 12:27:46 +01:00
}