monitoring/example/agent.nomad.hcl

404 lines
9.5 KiB
HCL

job "monitoring-agent" {
# System job; placement is limited to datacenter dc1 in the global region,
# with the node pool set to "all".
type        = "system"
region      = "global"
datacenters = ["dc1"]
node_pool   = "all"
group "logs-collector" {
# Bridge networking for the group. The "metrics" port has no static value;
# it is referenced below through NOMAD_HOST_PORT_metrics and
# NOMAD_ALLOC_PORT_metrics.
network {
  mode = "bridge"

  port "metrics" {}
}
# Be persistent when tasks fail: allow up to 20 restarts per 5 minute
# interval, and use the "delay" mode once the attempts are exhausted.
restart {
  mode     = "delay"
  attempts = 20
  interval = "5m"
}
# Host volume "vector_data"; mounted read-write at /data by the vector task.
volume "data" {
  type   = "host"
  source = "vector_data"
}
# Host volume "nomad_alloc", exposed read-only; tasks mount it at /nomad.
volume "nomad" {
  type      = "host"
  source    = "nomad_alloc"
  read_only = true
}
# Service registration for the agent, with a Connect sidecar providing
# an upstream to Loki.
service {
  name = "vector-agent"

  # Metadata attached to the service registration.
  # NOTE(review): "alloc" is set to the node name, presumably because a
  # system job runs a single allocation per node - confirm.
  meta {
    metrics-port = "${NOMAD_HOST_PORT_metrics}"
    alloc        = "${node.unique.name}"
    job          = "${NOMAD_JOB_NAME}"
    namespace    = "${NOMAD_NAMESPACE}"
  }

  connect {
    sidecar_service {
      proxy {
        # Loki is made available locally on port 3100
        upstreams {
          destination_name = "loki"
          local_bind_port  = 3100
          # Workaround, see https://github.com/hashicorp/nomad/issues/18538
          destination_type = "service"
        }
      }
    }

    # Settings for the Envoy sidecar task
    sidecar_task {
      resources {
        cpu    = 50
        memory = 64
      }

      config {
        args = [
          "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
          "-l", "${meta.connect.log_level}",
          "--concurrency", "${meta.connect.proxy_concurrency}",
          "--disable-hot-restart"
        ]
      }
    }
  }
}
# Nomad Vector Logger: writes the vector configuration used to collect the
# logs of the allocations (see vector_config_dir in its config file below).
task "nomad-vector-logger" {
  driver = "docker"

  # Use a random user instead of root (string form, as documented for "user")
  user = "3987"

  config {
    image           = "danielberteaud/nomad-vector-logger:24.3-2"
    readonly_rootfs = true
    pids_limit      = 50

    # Nomad Vector Logger needs to run on the host's network namespace
    # so it can reach the Nomad Agent API on localhost:4646
    network_mode = "host"

    # Host network namespace requires disabling user namespace
    userns_mode = "host"

    command = "nomad-vector-logger"
    args = [
      "--config",
      "/local/nomad-vector-logger.toml"
    ]
  }

  # We want to run Nomad Vector Logger before vector agent, and keep it
  # running alongside it (prestart sidecar)
  lifecycle {
    hook    = "prestart"
    sidecar = true
  }

  vault {
    policies     = ["nomad-vector-logger"]
    env          = false
    disable_file = true
    change_mode  = "noop"
  }

  # Use a template block instead of env {} so we can fetch values from vault
  template {
    data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
    destination = "secrets/.env"
    # Quoted octal string, consistent with the certificate bundle template below
    perms = "0400"
    env   = true
  }

  # Env to access Nomad API
  template {
    data = <<_EOT
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger" }}{{ .Data.secret_id }}{{ end }}
NOMAD_ADDR=https://localhost:4646
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
NOMAD_CACERT=/local/nomad.ca.pem
_EOT
    destination = "secrets/.nomad-vector-logger.env"
    # Quoted octal string, consistent with the certificate bundle template below
    perms = "0400"
    env   = true
  }

  # The main configuration file for nomad-vector-logger
  template {
    data = <<_EOT
[app]
log_level = "info"
env = "prod"
refresh_interval = "10s"
remove_alloc_interval = "30s"
nomad_data_dir = "/nomad"
vector_config_dir = "/alloc/data/vector_conf"
extra_templates_dir = "/local/template/"
custom_logs_dir = "alloc/custom"
_EOT
    destination = "local/nomad-vector-logger.toml"
  }

  # Disable the default nomad.toml template
  template {
    data        = "# Disable the default toml template"
    destination = "local/template/nomad.toml"
  }

  # The vector configuration template used to generate the vector conf
  template {
    data = <<_EOT
sources:
{{- range $value := . }}
source_{{ $value.Key }}:
type: file
include: ["{{ $value.LogDir }}"]
line_delimiter: "\n"
read_from: beginning
# Handle multi-line Java stacktraces
multiline:
start_pattern: "^[^\\s]"
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
{{- end }}
transforms:
{{- range $value := . }}
transform_{{ $value.Key }}:
type: remap
inputs: ["source_{{ $value.Key }}"]
source: |
# Store Nomad metadata.
.nomad.namespace = "{{ $value.Namespace }}"
.nomad.node_name = "{{ $value.Node }}"
.nomad.job_name = "{{ $value.Job }}"
.nomad.group_name = "{{ $value.Group }}"
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
{{- end }}
_EOT
    destination = "local/template/nomad.yml"
    # {{ }} is used by the template itself, so use other delimiters to
    # prevent consul-template from interpreting it
    left_delimiter  = "{{{"
    right_delimiter = "}}}"
  }

  # Get a client cert for the Nomad API
  template {
    data = <<_EOT
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger"
"common_name=nomad-vector-logger.nomad.consul"
"ttl=72h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
    destination = "secrets/nomad.bundle.pem"
    # The bundle is read by the task itself, so it is owned by the task user
    uid   = 3987
    perms = "0400"
  }

  # The CA chain to validate Nomad certificates
  template {
    data = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
    destination = "local/nomad.ca.pem"
  }

  # Mount the "nomad" host volume read-only on /nomad, which is the
  # nomad_data_dir set in nomad-vector-logger.toml
  volume_mount {
    volume      = "nomad"
    destination = "/nomad"
    read_only   = true
  }

  resources {
    cpu        = 20
    memory     = 24
    memory_max = 50
  }
}
# Nomad Vector Logger can take a few seconds to generate the initial configuration file.
# This task ensures the file exists before vector is started (to prevent an error, as the
# transform_nomad_alloc_* sources won't have anything before the file exists)
task "wait-for-vector-conf" {
  driver = "docker"

  config {
    # NOTE(review): the "latest" tag is not pinned - consider pinning a busybox version
    image   = "busybox:latest"
    command = "sh"
    # Poll every 2 seconds until the generated config exists, then exit
    args = [
      "-c",
      "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until [ -e /alloc/data/vector_conf/nomad.yml ]; do echo '.'; sleep 2; done"
    ]
  }

  # Plain prestart hook (no sidecar)
  lifecycle {
    hook = "prestart"
  }

  # The task will shutdown once the config is available, so just
  # allocate very few resources
  resources {
    cpu    = 10
    memory = 10
  }
}
# The main vector task. It loads /local/vector.yml plus everything in
# /alloc/data/vector_conf, the directory nomad-vector-logger writes to.
task "vector" {
  driver = "docker"
  leader = true

  config {
    image = "danielberteaud/vector:0.36.1-1"
    # NOTE(review): presumably required to read the host volumes - confirm
    userns_mode = "host"
    args = [
      "--watch-config",
      "--config",
      "/local/vector.yml",
      "--config-dir",
      "/alloc/data/vector_conf"
    ]
  }

  vault {
    policies     = ["metrics"]
    env          = false
    disable_file = true
    change_mode  = "noop"
  }

  env {
    NODE_UNIQUE_NAME = "${node.unique.name}"
  }

  # Certificate and key (issued by vault) protecting the metrics endpoint
  template {
    destination = "secrets/metrics.bundle.pem"
    data        = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
  }

  # Root CA chain of the monitoring PKI
  template {
    destination = "local/monitoring.ca.pem"
    data        = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
  }

  # Main vector configuration: internal metrics source, Loki sink and
  # Prometheus exporter. The label values use {{ }} themselves, hence the
  # alternate consul-template delimiters.
  template {
    destination     = "local/vector.yml"
    left_delimiter  = "{{{"
    right_delimiter = "}}}"

    wait {
      min = "5s"
      max = "30s"
    }

    data = <<_EOT
data_dir: /data
# Don't keep metrics indefinitly if they are not updated anymore
expire_metrics_secs: 60
sources:
metrics-vector:
type: internal_metrics
sinks:
loki:
type: loki
inputs: ["transform_nomad_alloc_*"]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
namespace: "{{ .nomad.namespace }}"
job: "{{ .nomad.job_name }}"
group: "{{ .nomad.group_name }}"
task: "{{ .nomad.task_name }}"
host: "{{ .nomad.node_name }}"
alloc: "{{ .nomad.alloc }}"
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
prometheus:
type: prometheus_exporter
inputs: ["metrics-vector"]
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true
_EOT
  }

  # "nomad" host volume, read-only on /nomad
  volume_mount {
    volume      = "nomad"
    destination = "/nomad"
    read_only   = true
  }

  # Writable volume backing vector's data_dir (/data)
  volume_mount {
    volume      = "data"
    destination = "/data"
    read_only   = false
  }

  resources {
    cpu        = 100
    memory     = 192
    memory_max = 384
  }
}
}
}