684 lines
16 KiB
HCL
684 lines
16 KiB
HCL
job "monitoring-agent" {
|
|
|
|
|
|
datacenters = ["dc1"]
|
|
region = "global"
|
|
node_pool = "all"
|
|
priority = 60
|
|
|
|
|
|
type = "system"
|
|
|
|
# This group collects logs from the allocations running on the node
|
|
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
|
|
# and then vector to read logs from all the discovered allocations. Logs are forwarded to loki through the service mesh
|
|
group "logs-collector" {
|
|
|
|
|
|
|
|
network {
  # Port labelled "metrics"; no static value is set, so Nomad picks it
  port "metrics" {}

  mode = "bridge"
}
|
|
|
|
# Try harder to restart tasks if they fail
|
|
restart {
  # Up to 20 restart attempts per 5 minute window; "delay" mode instead of failing
  mode     = "delay"
  interval = "5m"
  attempts = 20
}
|
|
|
|
|
|
|
|
# Host volume "vector_data", mounted by the vector task as its data directory
volume "data" {
  type   = "host"
  source = "vector_data"
}
|
|
|
|
# Host volume "nomad_alloc", mounted read-only under /nomad by the tasks of this group
volume "nomad" {
  type   = "host"
  source = "nomad_alloc"
}
|
|
|
|
|
|
# Service registration for the vector agent, with a Connect sidecar
# providing an upstream to loki on local port 3100
service {
  name = "vector-agent"

  # Metadata attached to the registered service
  meta {
    metrics-port = "${NOMAD_HOST_PORT_metrics}"
    alloc        = "${node.unique.name}"
    node         = "${node.unique.name}"
    job          = "${NOMAD_JOB_NAME}"
    group        = "${NOMAD_GROUP_NAME}"
    namespace    = "${NOMAD_NAMESPACE}"
    datacenter   = "${NOMAD_DC}"
    region       = "${NOMAD_REGION}"
  }

  connect {
    sidecar_service {
      proxy {
        upstreams {
          destination_name = "loki"
          local_bind_port  = 3100
          # Workaround, see https://github.com/hashicorp/nomad/issues/18538
          destination_type = "service"
          config {
            protocol = "http"
          }
        }
      }
    }

    # Settings of the sidecar proxy task itself
    sidecar_task {
      logs {
        disabled = false
      }

      config {
        args = [
          "-c",
          "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
          "-l",
          "${meta.connect.log_level}",
          "--concurrency",
          "${meta.connect.proxy_concurrency}",
          "--disable-hot-restart"
        ]
      }

      resources {
        memory = 64
        cpu    = 50
      }
    }
  }
}
|
|
# Runs nomad-vector-logger, which queries the Nomad API and writes a vector
# configuration under /alloc/data/vector_conf for the allocations it discovers.
task "nomad-vector-logger" {
  driver = "docker"

  # Use a random user instead of root
  user = 3987

  # We want to run Nomad Vector Logger before the vector agent, and keep it running
  lifecycle {
    hook    = "prestart"
    sidecar = true
  }

  config {
    image           = "danielberteaud/nomad-vector-logger:24.5-1"
    readonly_rootfs = true
    pids_limit      = 50
    # Nomad Vector Logger needs to run in the host's network namespace
    # so it can reach the Nomad Agent API on localhost:4646
    network_mode = "host"
    # Host network namespace requires disabling user namespace
    userns_mode = "host"
    command     = "nomad-vector-logger"
    args = [
      "--config",
      "/local/nomad-vector-logger.toml"
    ]
  }

  vault {
    policies     = ["nomad-vector-logger"]
    env          = false
    disable_file = true
    change_mode  = "noop"
  }

  # Use a template block instead of env {} so we can fetch values from vault
  template {
    data        = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
    destination = "secrets/.env"
    # Octal string form, as documented for `perms` and as used by the
    # client certificate template below
    perms = "0400"
    env   = true
  }

  # Env to access the Nomad API
  template {
    data        = <<_EOT
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger" }}{{ .Data.secret_id }}{{ end }}
NOMAD_ADDR=https://localhost:4646
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
NOMAD_CACERT=/local/nomad.ca.pem
_EOT
    destination = "secrets/.nomad-vector-logger.env"
    perms       = "0400"
    env         = true
  }

  # The main configuration file for nomad-vector-logger
  template {
    data        = <<_EOT
[app]
log_level = "info"
env = "prod"
refresh_interval = "10s"
remove_alloc_interval = "30s"
nomad_data_dir = "/nomad"
vector_config_dir = "/alloc/data/vector_conf"
extra_templates_dir = "/local/template/"
custom_logs_dir = "alloc/custom"

_EOT
    destination = "local/nomad-vector-logger.toml"
  }

  # Disable the default nomad.toml template, as we provide our own nomad.yml template
  template {
    data        = "# Disable the default toml template"
    destination = "local/template/nomad.toml"
  }

  # The vector configuration template used to generate the vector conf
  template {
    data        = <<_EOT
sources:
{{- range $value := . }}
  source_{{ $value.Key }}:
    type: file
    include: ["{{ $value.LogDir }}"]
    line_delimiter: "\n"
    read_from: beginning
    # Handle multi-line Java stacktraces
    multiline:
      start_pattern: "^[^\\s]"
      mode: continue_through
      condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
      timeout_ms: 1000
    ignore_older_secs: 1800
    oldest_first: true

{{- end }}

transforms:
{{- range $value := . }}

  transform_{{ $value.Key }}:
    type: remap
    inputs: ["source_{{ $value.Key }}"]
    source: |
      # Store Nomad metadata.
      .nomad.namespace = "{{ $value.Namespace }}"
      .nomad.node_name = "{{ $value.Node }}"
      .nomad.job_name = "{{ $value.Job }}"
      .nomad.group_name = "{{ $value.Group }}"
      .nomad.task_name = "{{ $value.Task }}"
      .nomad.alloc_name = "{{ $value.Name }}"
      .nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")
      # Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
      .nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")

{{- end }}

_EOT
    destination = "local/template/nomad.yml"
    # {{ }} is used by the nomad-vector-logger template itself, so use other
    # delimiters to prevent consul-template from interpreting it
    left_delimiter  = "{{{"
    right_delimiter = "}}}"
  }

  # Get a client cert for the Nomad API
  template {
    data        = <<_EOT
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger"
  "common_name=nomad-vector-logger.nomad.consul"
  "ttl=72h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
    destination = "secrets/nomad.bundle.pem"
    # Owned by the UID the task runs as (see `user` above)
    uid   = 3987
    perms = "0400"
  }

  # The CA chain to validate Nomad certificates
  template {
    data        = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
    destination = "local/nomad.ca.pem"
  }

  # Nomad vector logger needs read only access to the log dir
  volume_mount {
    volume      = "nomad"
    destination = "/nomad"
    read_only   = true
  }

  resources {
    cpu        = 20
    memory     = 24
    memory_max = 50
  }
}
|
|
|
|
# Nomad Vector Logger can take a few seconds to generate the initial configuration file
|
|
# This task ensures the file exists before vector is started (to prevent an error as the
|
|
# transform_nomad_alloc_* sources won't have anything before the file exists)
|
|
# Prestart task that polls once per second until
# /alloc/data/vector_conf/nomad.yml exists, then exits.
task "wait-for-vector-conf" {
  driver = "docker"

  # Non-sidecar prestart hook
  lifecycle {
    hook = "prestart"
  }

  # The task exits as soon as the config is available, so it only
  # needs a very small allocation
  resources {
    memory = 10
    cpu    = 10
  }

  config {
    image           = "busybox:latest"
    pids_limit      = 20
    readonly_rootfs = true
    command         = "sh"
    args = [
      "-c",
      "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
    ]
  }
}
|
|
|
|
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
|
|
# Leader task of the group: runs vector with /local/vector.yml plus the
# configuration directory /alloc/data/vector_conf, watching for config changes.
task "vector" {
  leader = true
  driver = "docker"

  config {
    image           = "danielberteaud/vector:0.38.0-1"
    pids_limit      = 1000
    readonly_rootfs = true
    userns_mode     = "host"
    args = [
      "--config", "/local/vector.yml",
      "--config-dir", "/alloc/data/vector_conf",
      "--watch-config"
    ]
  }

  vault {
    policies     = ["metrics"]
    change_mode  = "noop"
    disable_file = true
    env          = false
  }

  env {
    NODE_UNIQUE_NAME = "${node.unique.name}"
  }

  # Certificate and key issued by vault, used to serve the metrics endpoint over TLS
  template {
    destination = "secrets/metrics.bundle.pem"
    data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
  }

  # CA chain of the monitoring PKI
  template {
    destination = "local/monitoring.ca.pem"
    data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
  }

  # Main vector configuration. Custom delimiters are set so that the
  # {{ ... }} label templates below are written to the file unchanged.
  template {
    destination     = "local/vector.yml"
    left_delimiter  = "{{{"
    right_delimiter = "}}}"

    wait {
      min = "5s"
      max = "30s"
    }

    data = <<_EOT
data_dir: /data
# Don't keep metrics indefinitly if they are not updated anymore
expire_metrics_secs: 60

sources:
  metrics-vector:
    type: internal_metrics

sinks:

  # Read Nomad's logs and send them to Loki
  out_files_loki:
    type: loki
    endpoint: http://127.0.0.1:3100
    inputs: ["transform_nomad_alloc_*"]
    encoding:
      codec: text
    out_of_order_action: accept
    labels:
      namespace: "{{ .nomad.namespace }}"
      job: "{{ .nomad.job_name }}"
      group: "{{ .nomad.group_name }}"
      task: "{{ .nomad.task_name }}"
      host: "{{ .nomad.node_name }}"
      alloc: "{{ .nomad.alloc }}"
    buffer:
      type: disk
      max_size: 268435488
    remove_label_fields: true

  prometheus:
    type: prometheus_exporter
    inputs: ["metrics-vector"]
    address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
    tls:
      enabled: true
      crt_file: /secrets/metrics.bundle.pem
      key_file: /secrets/metrics.bundle.pem
      ca_file: /local/monitoring.ca.pem
      verify_certificate: true


_EOT
  }

  # Read-only mount of the "nomad" group volume
  volume_mount {
    volume      = "nomad"
    read_only   = true
    destination = "/nomad"
  }

  # Writable mount backing vector's data_dir (/data)
  volume_mount {
    volume      = "data"
    read_only   = false
    destination = "/data"
  }

  resources {
    memory_max = 512
    memory     = 384
    cpu        = 100
  }
}
|
|
}
|
|
|
|
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
|
|
group "node-exporter" {
|
|
|
|
network {
  # Port labelled "metrics"; no static value is set, so Nomad picks it
  port "metrics" {}

  mode = "bridge"
}
|
|
|
|
|
|
|
|
# Read-only host volume "host_root", mounted under /host by the node-exporter task
volume "host" {
  type      = "host"
  source    = "host_root"
  read_only = true
}
|
|
|
|
|
|
# Service registration for the node exporter
service {
  name = "node-exporter"

  # Metadata attached to the registered service
  meta {
    metrics-port = "${NOMAD_HOST_PORT_metrics}"
    alloc        = "${node.unique.name}"
    node         = "${node.unique.name}"
    job          = "${NOMAD_JOB_NAME}"
    group        = "${NOMAD_GROUP_NAME}"
    namespace    = "${NOMAD_NAMESPACE}"
    datacenter   = "${NOMAD_DC}"
    region       = "${NOMAD_REGION}"
  }
}
|
|
|
|
# Runs node_exporter against the host root mounted at /host, serving on the
# "metrics" port with the TLS settings from /local/tls.yml.
task "node-exporter" {
  user   = 100320
  driver = "docker"

  config {
    image           = "danielberteaud/node-exporter:1.8.0-1"
    readonly_rootfs = true
    pids_limit      = 50
    userns_mode     = "host"
    pid_mode        = "host"
    command         = "/usr/local/bin/node_exporter"
    args = [
      "--path.rootfs=/host",
      "--web.config.file=/local/tls.yml",
      "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
      "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)",
    ]
  }

  vault {
    policies     = ["metrics"]
    change_mode  = "noop"
    disable_file = true
    env          = false
  }

  # Certificate and key issued by vault, used to serve the metrics endpoint over TLS
  template {
    destination = "secrets/metrics.bundle.pem"
    data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
  }

  # CA chain of the monitoring PKI
  template {
    destination = "local/monitoring.ca.pem"
    data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
  }

  # Web config passed to node_exporter via --web.config.file:
  # serve TLS and require a client certificate signed by the monitoring CA
  template {
    destination = "local/tls.yml"
    data        = <<_EOT
tls_server_config:
  cert_file: /secrets/metrics.bundle.pem
  key_file: /secrets/metrics.bundle.pem
  client_ca_file: /local/monitoring.ca.pem
  client_auth_type: RequireAndVerifyClientCert


_EOT
  }

  # Read-only view of the "host" group volume
  volume_mount {
    volume           = "host"
    destination      = "/host"
    propagation_mode = "host-to-task"
    read_only        = true
  }

  resources {
    memory_max = 56
    memory     = 32
    cpu        = 50
  }
}
|
|
}
|
|
|
|
group "consul-agent-exporter" {
|
|
|
|
|
|
shutdown_delay = "6s"
|
|
|
|
|
|
|
|
# Keep the ephemeral disk reservation minimal
ephemeral_disk {
  size = 101
}
|
|
|
|
|
|
network {
  # Port labelled "metrics"; no static value is set, so Nomad picks it
  port "metrics" {}

  mode = "bridge"
}
|
|
|
|
# Service registration for the consul agent metrics proxy
service {
  name = "consul-agent"

  # Metadata attached to the registered service
  meta {
    metrics-port = "${NOMAD_HOST_PORT_metrics}"
    alloc        = "${node.unique.name}"
    node         = "${node.unique.name}"
    job          = "${NOMAD_JOB_NAME}"
    group        = "${NOMAD_GROUP_NAME}"
    namespace    = "${NOMAD_NAMESPACE}"
    datacenter   = "${NOMAD_DC}"
    region       = "${NOMAD_REGION}"
  }
}
|
|
|
|
# nginx reverse proxy exposing the consul agent's prometheus metrics
# (/v1/agent/metrics?format=prometheus) on the "metrics" port, over TLS with
# client certificate verification, adding a consul token fetched from vault.
task "consul-agent-metrics-proxy" {
  driver = "docker"

  config {
    image           = "nginxinc/nginx-unprivileged:alpine"
    readonly_rootfs = true

    # The root filesystem is read-only, so give nginx a tmpfs on /tmp
    mount {
      type   = "tmpfs"
      target = "/tmp"
      tmpfs_options {
        size = 3000000
      }
    }

    # Use the rendered template as nginx's default server configuration
    volumes = [
      "secrets/nginx.conf:/etc/nginx/conf.d/default.conf:ro"
    ]
  }

  vault {
    policies     = ["metrics", "cluster-exporter"]
    env          = false
    disable_file = true
    change_mode  = "noop"
  }

  # Use a template block instead of env {} so we can fetch values from vault
  template {
    data        = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
    destination = "secrets/.env"
    # Octal string form, as documented for the template `perms` parameter
    perms = "0400"
    env   = true
  }

  # Get a certificate from vault to protect the metrics endpoint
  template {
    data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
    destination = "secrets/metrics.bundle.pem"
  }

  # Get the root CA
  template {
    data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
    destination = "local/monitoring.ca.pem"
  }

  # nginx server configuration. It embeds the consul token, hence it is
  # rendered in the secrets directory.
  template {
    data        = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;

  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;
  gzip on;
  gzip_types
    text/plain;
  gzip_vary on;

  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }

  set $consul_token "{{ with secret "consul/creds/cluster-exporter" }}{{ .Data.token }}{{ end }}";

  location /metrics {
    proxy_pass http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/agent/metrics?format=prometheus;
    proxy_set_header X-Consul-Token $consul_token;
  }
}

_EOT
    destination = "secrets/nginx.conf"
  }

  resources {
    cpu        = 10
    memory     = 15
    memory_max = 24
  }
}
|
|
}
|
|
}
|