Add vector-agent and nomad-vector-logger

Daniel Berteaud 2024-03-25 12:27:46 +01:00
parent 210264b4aa
commit defebffc50
22 changed files with 1031 additions and 160 deletions

216
agent.nomad.hcl Normal file

@@ -0,0 +1,216 @@
job "[[ .instance ]]-agent" {
[[- $c := merge .monitoring.agent .monitoring . ]]
[[ template "common/job_start" $c ]]
type = "system"
group "logs-collector" {
[[ $c := merge $c.vector $c ]]
network {
mode = "bridge"
port "metrics" {}
}
# Try harder to restart tasks if they fail
restart {
attempts = 20
interval = "5m"
mode = "delay"
}
[[ template "common/volumes" $c ]]
service {
name = "vector-agent[[ .consul.suffix ]]"
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
}
task "nomad-vector-logger" {
[[- $n := merge $c.nomad_vector_logger $c ]]
driver = "[[ $n.nomad.driver ]]"
# Use a random user instead of root
user = 3987
config {
image = "[[ $n.image ]]"
readonly_rootfs = true
pids_limit = 50
# Nomad Vector Logger needs to run in the host's network namespace
# so it can reach the Nomad agent API on localhost:4646
network_mode = "host"
# Host network namespace requires disabling user namespace
userns_mode = "host"
command = "nomad-vector-logger"
args = [
"--config",
"/local/nomad-vector-logger.toml"
]
}
# We want to run Nomad Vector Logger before the vector agent
lifecycle {
hook = "prestart"
sidecar = true
}
[[ template "common/vault.policies" $n ]]
[[ template "common/file_env" $n ]]
# Env to access Nomad API
template {
data = <<_EOT
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger[[ .consul.suffix ]]" }}{{ .Data.secret_id }}{{ end }}
NOMAD_ADDR=https://localhost:4646
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
NOMAD_CACERT=/local/nomad.ca.pem
_EOT
destination = "secrets/.nomad-vector-logger.env"
perms = 400
env = true
}
# The main configuration file for nomad-vector-logger
template {
data = <<_EOT
[[ template "monitoring/agent/nomad-vector-logger.toml" $n ]]
_EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
template {
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
}
# The vector configuration template used to generate the vector conf
template {
data = <<_EOT
[[ template "monitoring/agent/vector-template.yml" $n ]]
_EOT
destination = "local/template/nomad.yml"
# {{ }} is used by the template itself, so prevent consul-template from interpreting it
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# Get a client cert for the Nomad API
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger[[ .consul.suffix ]]"
"common_name=nomad-vector-logger[[ .consul.suffix ]].nomad.[[ .consul.domain ]]"
"ttl=72h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/nomad.bundle.pem"
uid = 3987
perms = "0400"
}
# The CA chain to validate Nomad certificates
template {
data = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/nomad.ca.pem"
}
# Mount the Nomad data dir, where allocation logs live
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
[[ template "common/resources" $n ]]
}
# Nomad Vector Logger can take a few seconds to generate the initial configuration file.
# This task ensures the file exists before vector is started (preventing an error, as the
# transform_nomad_alloc_* sources won't have anything to read before the file exists)
task "wait-for-vector-conf" {
driver = "[[ $c.nomad.driver ]]"
config {
image = "busybox:latest"
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
]
}
lifecycle {
hook = "prestart"
}
# The task will shut down once the config is available, so just
# allocate very few resources
resources {
cpu = 10
memory = 10
}
}
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
task "vector" {
driver = "[[ $c.nomad.driver ]]"
leader = true
config {
image = "[[ $c.image ]]"
userns_mode = "host"
args = [
"--watch-config",
"--config", "/local/vector.yml",
"--config-dir", "/alloc/data/vector_conf"
]
}
[[ template "common/vault.policies" $c ]]
env {
NODE_UNIQUE_NAME = "${node.unique.name}"
}
[[ template "common/metrics_cert" $c ]]
template {
data = <<_EOT
[[ template "monitoring/agent/vector.yml" $c ]]
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
wait {
min = "5s"
max = "30s"
}
}
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
volume_mount {
volume = "data"
destination = "/data"
read_only = false
}
[[ template "common/resources" $c ]]
}
}
}
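
The env and PKI templates above give nomad-vector-logger everything it needs to reach the local Nomad agent API over mTLS. As a sanity-check sketch (paths and the default 4646 port taken from the templates above, run from inside the task):

# List allocations on the local agent using the issued token and client cert
curl --cacert /local/nomad.ca.pem \
     --cert /secrets/nomad.bundle.pem --key /secrets/nomad.bundle.pem \
     -H "X-Nomad-Token: ${NOMAD_TOKEN}" \
     https://localhost:4646/v1/allocations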

403
example/agent.nomad.hcl Normal file

@@ -0,0 +1,403 @@
job "monitoring-agent" {
datacenters = ["dc1"]
region = "global"
node_pool = "all"
type = "system"
group "logs-collector" {
network {
mode = "bridge"
port "metrics" {}
}
# Try harder to restart tasks if they fail
restart {
attempts = 20
interval = "5m"
mode = "delay"
}
volume "data" {
source = "vector_data"
type = "host"
}
volume "nomad" {
source = "nomad_alloc"
type = "host"
read_only = true
}
service {
name = "vector-agent"
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${node.unique.name}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
}
task "nomad-vector-logger" {
driver = "docker"
# Use a random user instead of root
user = 3987
config {
image = "danielberteaud/nomad-vector-logger:24.3-2"
readonly_rootfs = true
pids_limit = 50
# Nomad Vector Logger needs to run in the host's network namespace
# so it can reach the Nomad agent API on localhost:4646
network_mode = "host"
# Host network namespace requires disabling user namespace
userns_mode = "host"
command = "nomad-vector-logger"
args = [
"--config",
"/local/nomad-vector-logger.toml"
]
}
# We want to run Nomad Vector Logger before the vector agent
lifecycle {
hook = "prestart"
sidecar = true
}
vault {
policies = ["nomad-vector-logger"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
# Env to access Nomad API
template {
data = <<_EOT
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger" }}{{ .Data.secret_id }}{{ end }}
NOMAD_ADDR=https://localhost:4646
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
NOMAD_CACERT=/local/nomad.ca.pem
_EOT
destination = "secrets/.nomad-vector-logger.env"
perms = 400
env = true
}
# The main configuration file for nomad-vector-logger
template {
data = <<_EOT
[app]
log_level = "info"
env = "prod"
refresh_interval = "10s"
remove_alloc_interval = "30s"
nomad_data_dir = "/nomad"
vector_config_dir = "/alloc/data/vector_conf"
extra_templates_dir = "/local/template/"
custom_logs_dir = "alloc/custom"
_EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
template {
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
}
# The vector configuration template used to generate the vector conf
template {
data = <<_EOT
sources:
{{- range $value := . }}
source_{{ $value.Key }}:
type: file
include: ["{{ $value.LogDir }}"]
line_delimiter: "\n"
read_from: beginning
# Handle multi-line Java stacktraces
multiline:
start_pattern: "^[^\\s]"
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
{{- end }}
transforms:
{{- range $value := . }}
transform_{{ $value.Key }}:
type: remap
inputs: ["source_{{ $value.Key }}"]
source: |
# Store Nomad metadata.
.nomad.namespace = "{{ $value.Namespace }}"
.nomad.node_name = "{{ $value.Node }}"
.nomad.job_name = "{{ $value.Job }}"
.nomad.group_name = "{{ $value.Group }}"
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
{{- end }}
_EOT
destination = "local/template/nomad.yml"
# {{ }} is used by the template itself, so prevent consul-template from interpreting it
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# Get a client cert for the Nomad API
template {
data = <<_EOT
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger"
"common_name=nomad-vector-logger.nomad.consul"
"ttl=72h" }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/nomad.bundle.pem"
uid = 3987
perms = "0400"
}
# The CA chain to validate Nomad certificates
template {
data = <<_EOT
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/nomad.ca.pem"
}
# Mount the Nomad data dir, where allocation logs live
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
resources {
cpu = 20
memory = 24
memory_max = 50
}
}
# Nomad Vector Logger can take a few seconds to generate the initial configuration file.
# This task ensures the file exists before vector is started (preventing an error, as the
# transform_nomad_alloc_* sources won't have anything to read before the file exists)
task "wait-for-vector-conf" {
driver = "docker"
config {
image = "busybox:latest"
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
]
}
lifecycle {
hook = "prestart"
}
# The task will shut down once the config is available, so just
# allocate very few resources
resources {
cpu = 10
memory = 10
}
}
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
task "vector" {
driver = "docker"
leader = true
config {
image = "danielberteaud/vector:0.36.1-1"
userns_mode = "host"
args = [
"--watch-config",
"--config", "/local/vector.yml",
"--config-dir", "/alloc/data/vector_conf"
]
}
vault {
policies = ["metrics"]
env = false
disable_file = true
change_mode = "noop"
}
env {
NODE_UNIQUE_NAME = "${node.unique.name}"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
data_dir: /data
# Don't keep metrics indefinitely if they are not updated anymore
expire_metrics_secs: 60
sources:
metrics-vector:
type: internal_metrics
sinks:
loki:
type: loki
inputs: ["transform_nomad_alloc_*"]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
namespace: "{{ .nomad.namespace }}"
job: "{{ .nomad.job_name }}"
group: "{{ .nomad.group_name }}"
task: "{{ .nomad.task_name }}"
host: "{{ .nomad.node_name }}"
alloc: "{{ .nomad.alloc }}"
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
prometheus:
type: prometheus_exporter
inputs: ["metrics-vector"]
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
wait {
min = "5s"
max = "30s"
}
}
volume_mount {
volume = "nomad"
destination = "/nomad"
read_only = true
}
volume_mount {
volume = "data"
destination = "/data"
read_only = false
}
resources {
cpu = 100
memory = 192
memory_max = 384
}
}
}
}
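
Deploying the rendered example is a plain system-job submission: every eligible client in the "all" node pool gets one allocation. A minimal sketch, assuming the vector_data and nomad_alloc host volumes are declared on each client:

nomad job plan example/agent.nomad.hcl   # dry run, shows per-node placement
nomad job run example/agent.nomad.hcl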


@@ -0,0 +1,19 @@
FROM golang:alpine AS builder
RUN set -eux &&\
apk --no-cache add tar git ca-certificates &&\
cd /tmp &&\
git clone --depth=1 --branch=feat/name https://github.com/mr-karan/nomad-vector-logger.git &&\
cd nomad-vector-logger &&\
CGO_ENABLED=0 go build -ldflags="-s -w" -o /nomad-vector-logger
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
RUN set -euxo pipefail &&\
mkdir -p /etc/nomad-vector-logger
COPY --from=builder --chown=root:root --chmod=755 /nomad-vector-logger /usr/local/bin/nomad-vector-logger
COPY --from=builder /tmp/nomad-vector-logger/config.sample.toml /etc/nomad-vector-logger/
WORKDIR /etc/nomad-vector-logger/
CMD ["nomad-vector-logger"]
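
A build sketch for this image; the tag is an assumption, mirroring the one the example job above pulls:

docker build -t danielberteaud/nomad-vector-logger:24.3-2 .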

8
example/init/nomad Executable file

@@ -0,0 +1,8 @@
#!/bin/sh
set -euo pipefail
vault write nomad/role/nomad-vector-logger \
ttl=720h \
max_ttl=720h \
policies="nomad-vector-logger"
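
The role only works once the Vault Nomad secrets engine is mounted at nomad/ and the nomad-vector-logger ACL policy exists in Nomad. A quick sketch to check that the role hands out tokens:

vault read nomad/creds/nomad-vector-logger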


@@ -167,3 +167,15 @@ vault write pki/consul/roles/cluster-exporter \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"
# Create a role on the Nomad PKI for nomad-vector-logger
vault write pki/nomad/roles/nomad-vector-logger \
allowed_domains='nomad-vector-logger.nomad.consul' \
allow_bare_domains=true \
allow_subdomains=false \
allow_wildcard_certificates=false \
max_ttl=168h \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
ou="Nomad Vector Logger"
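
With the role in place, issuance can be tested directly, mirroring what the pkiCert template in the agent job requests (a sketch):

vault write pki/nomad/issue/nomad-vector-logger \
    common_name=nomad-vector-logger.nomad.consul ttl=72h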


@@ -0,0 +1,13 @@
namespace "*" {
capabilities = ["list-jobs", "read-job"]
}
node {
policy = "read"
}
agent {
policy = "read"
}
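
Vault can only attach this policy to the tokens it generates if the policy has been loaded into Nomad first, e.g. (a sketch; the file name is an assumption):

nomad acl policy apply -description "Read-only access for nomad-vector-logger" \
    nomad-vector-logger ./nomad-vector-logger.policy.hcl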


@@ -6,7 +6,7 @@ job "monitoring-services" {
# The metrics group runs prometheus and various exporters
group "metrics" {
group "metrics-server" {
shutdown_delay = "6s"
count = 1
@@ -34,6 +34,7 @@ job "monitoring-services" {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
connect {
@@ -109,15 +110,18 @@ job "monitoring-services" {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
@@ -125,6 +129,7 @@ _EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
@@ -475,68 +480,6 @@ _EOT
groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed
# Note: don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
for: 2m
labels:
severity: critical
annotations:
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul missing master node (node {{ $labels.node }})
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: 'consul_health_node_status{status="critical"} == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul agent unhealthy (node {{ $labels.node }})
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceWarning
expr: 'consul_health_service_status{status="warning"} == 1'
for: 2m
labels:
severity: warning
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceCritical
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/consul.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: JVM
rules:
@@ -880,6 +823,68 @@ _EOT
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
# vi: syntax=yaml
groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed
# Note: don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
for: 2m
labels:
severity: critical
annotations:
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: 'consul_raft_leader != 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul missing master node (node {{ $labels.node }})
description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: 'consul_health_node_status{status="critical"} == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul agent unhealthy (node {{ $labels.node }})
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceWarning
expr: 'consul_health_service_status{status="warning"} == 1'
for: 2m
labels:
severity: warning
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulServiceCritical
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/consul.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
# A client cert, to connect to the AlertManager API
template {
@@ -976,6 +981,7 @@ _EOT
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
connect {
@@ -1051,15 +1057,18 @@ _EOT
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
@@ -1067,6 +1076,7 @@ _EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
@@ -1345,7 +1355,7 @@ _EOT
}
}
group "logs" {
group "logs-server" {
shutdown_delay = "6s"
@@ -1370,6 +1380,7 @@ _EOT
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
connect {
@@ -1445,15 +1456,18 @@ _EOT
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
@@ -1461,6 +1475,7 @@ _EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
@@ -1564,6 +1579,7 @@ limits_config:
max_entries_limit_per_query: 20000
max_query_parallelism: 128
retention_period: 720h
split_queries_by_interval: 0
ruler:
alertmanager_client:
tls_ca_path: /secrets/monitoring.ca.pem
@@ -1652,7 +1668,7 @@ _EOT
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector, etc.)
# and with a loki sink. The goal is to be able to collect logs from various sources
group "aggregator" {
group "logs-aggregator" {
count = 1
shutdown_delay = "6s"
@@ -1672,6 +1688,7 @@ _EOT
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
connect {
@@ -1712,88 +1729,6 @@ _EOT
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
_EOT
destination = "secrets/metrics.bundle.pem"
}
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9001/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
task "vector" {
driver = "docker"
@@ -1805,6 +1740,14 @@ _EOT
}
vault {
policies = ["metrics"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
@@ -1817,6 +1760,25 @@ _EOT
env = true
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
@@ -1891,7 +1853,13 @@ sinks:
prometheus:
type: prometheus_exporter
inputs: ["vector_metrics"]
address: "127.0.0.1:9001"
address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true
_EOT
destination = "local/vector.yml"
@@ -1947,6 +1915,18 @@ _EOT
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
upstreams {
destination_name = "prometheus"
local_bind_port = 9090
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
@@ -2015,15 +1995,18 @@ _EOT
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
@@ -2031,6 +2014,7 @@ _EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {


@@ -0,0 +1,6 @@
path "nomad/creds/nomad-vector-logger" {
capabilities = ["read"]
}
path "pki/nomad/issue/nomad-vector-logger" {
capabilities = ["update"]
}
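
Like its Nomad ACL counterpart, this policy must be loaded into Vault before the job can use it (a sketch; the file name is an assumption):

vault policy write nomad-vector-logger ./nomad-vector-logger.vault.hcl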


@@ -0,0 +1,19 @@
FROM golang:alpine AS builder
RUN set -eux &&\
apk --no-cache add tar git ca-certificates &&\
cd /tmp &&\
git clone --depth=1 --branch=feat/name https://github.com/mr-karan/nomad-vector-logger.git &&\
cd nomad-vector-logger &&\
CGO_ENABLED=0 go build -ldflags="-s -w" -o /nomad-vector-logger
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
RUN set -euxo pipefail &&\
mkdir -p /etc/nomad-vector-logger
COPY --from=builder --chown=root:root --chmod=755 /nomad-vector-logger /usr/local/bin/nomad-vector-logger
COPY --from=builder /tmp/nomad-vector-logger/config.sample.toml /etc/nomad-vector-logger/
WORKDIR /etc/nomad-vector-logger/
CMD ["nomad-vector-logger"]

8
init/nomad Executable file

@@ -0,0 +1,8 @@
#!/bin/sh
set -euo pipefail
vault write nomad/role/nomad-vector-logger[[ .consul.suffix ]] \
ttl=720h \
max_ttl=720h \
policies="nomad-vector-logger[[ .consul.suffix ]]"


@@ -80,3 +80,15 @@ vault write pki/consul/roles/cluster-exporter[[ .consul.suffix ]] \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"
# Create a role on the Nomad PKI for nomad-vector-logger
vault write pki/nomad/roles/nomad-vector-logger[[ .consul.suffix ]] \
allowed_domains='nomad-vector-logger[[ .consul.suffix ]].nomad.[[ .consul.domain ]]' \
allow_bare_domains=true \
allow_subdomains=false \
allow_wildcard_certificates=false \
max_ttl=168h \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
ou="Nomad Vector Logger"


@@ -0,0 +1,14 @@
[[- range $ns := .monitoring.namespaces ]]
namespace "[[ $ns ]]" {
capabilities = ["list-jobs", "read-job"]
}
[[- end ]]
node {
policy = "read"
}
agent {
policy = "read"
}


@@ -3,7 +3,7 @@ job "[[ .instance ]]-services" {
[[ template "common/job_start" . ]]
# The metrics group runs prometheus and various exporters
group "metrics" {
group "metrics-server" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]
shutdown_delay = "6s"
@@ -375,7 +375,7 @@ _EOT
}
}
group "logs" {
group "logs-server" {
[[- $c := merge .monitoring.loki .monitoring . ]]
@@ -481,7 +481,7 @@ _EOT
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector, etc.)
# and with a loki sink. The goal is to be able to collect logs from various sources
group "aggregator" {
group "logs-aggregator" {
[[- $c := merge .monitoring.aggregator .monitoring . ]]
count = [[ $c.count ]]
@@ -530,8 +530,6 @@ _EOT
}
[[- end ]]
[[ template "common/task.metrics_proxy" $c ]]
task "vector" {
driver = "[[ $c.nomad.driver ]]"
@@ -542,7 +540,9 @@ _EOT
args = [ "--config=/local/vector.yml" ]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
[[ template "common/metrics_cert" $c ]]
template {
data = <<_EOT


@@ -0,0 +1,9 @@
[app]
log_level = "info"
env = "prod"
refresh_interval = "10s"
remove_alloc_interval = "30s"
nomad_data_dir = "/nomad"
vector_config_dir = "/alloc/data/vector_conf"
extra_templates_dir = "/local/template/"
custom_logs_dir = "alloc/custom"


@@ -0,0 +1,36 @@
sources:
{{- range $value := . }}
source_{{ $value.Key }}:
type: file
include: ["{{ $value.LogDir }}"]
line_delimiter: "\n"
read_from: beginning
# Handle multi-line Java stacktraces
multiline:
start_pattern: "^[^\\s]"
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
{{- end }}
transforms:
{{- range $value := . }}
transform_{{ $value.Key }}:
type: remap
inputs: ["source_{{ $value.Key }}"]
source: |
# Store Nomad metadata.
.nomad.namespace = "{{ $value.Namespace }}"
.nomad.node_name = "{{ $value.Node }}"
.nomad.job_name = "{{ $value.Job }}"
.nomad.group_name = "{{ $value.Group }}"
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
{{- end }}
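
nomad-vector-logger renders this template into /alloc/data/vector_conf/nomad.yml, which vector picks up through --watch-config. As a sketch, the generated file can be checked with vector's built-in validator from inside the vector task (paths as mounted above):

vector validate /local/vector.yml /alloc/data/vector_conf/nomad.yml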


@@ -0,0 +1,38 @@
data_dir: /data
# Don't keep metrics indefinitely if they are not updated anymore
expire_metrics_secs: 60
sources:
metrics-vector:
type: internal_metrics
sinks:
loki:
type: loki
inputs: ["transform_nomad_alloc_*"]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
namespace: "{{ .nomad.namespace }}"
job: "{{ .nomad.job_name }}"
group: "{{ .nomad.group_name }}"
task: "{{ .nomad.task_name }}"
host: "{{ .nomad.node_name }}"
alloc: "{{ .nomad.alloc }}"
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
prometheus:
type: prometheus_exporter
inputs: ["metrics-vector"]
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true
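
With verify_certificate: true the exporter rejects scrapers that don't present a certificate signed by the monitoring CA. A manual scrape sketch (host and port are placeholders; the bundle paths match the templates above):

curl --cacert monitoring.ca.pem \
     --cert metrics.bundle.pem --key metrics.bundle.pem \
     https://192.0.2.10:25000/metrics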


@@ -82,4 +82,10 @@ sinks:
prometheus:
type: prometheus_exporter
inputs: ["vector_metrics"]
address: "127.0.0.1:9001"
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
tls:
enabled: true
crt_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
ca_file: /local/monitoring.ca.pem
verify_certificate: true


@@ -51,6 +51,7 @@ limits_config:
ingestion_burst_size_mb: 100
max_entries_limit_per_query: 20000
max_query_parallelism: 128
split_queries_by_interval: 0
ruler:
alertmanager_url: alertmanager-tls[[ .consul.suffix ]]


@@ -179,6 +179,14 @@ scrape_configs:
action: drop
regex: (nomad(\-client)?|consul|vault)
[[- if not (has .namespaces "*") ]]
# Only monitor services from the configured namespaces
- source_labels: [__meta_consul_service_metadata_namespace]
regex: ^[[ $namespaces := coll.Slice ]][[ range $ns := .namespaces ]][[ $ns = $ns | regexp.Replace "^\\*$" ".+" ]][[ $namespaces = append $ns $namespaces ]][[ end ]][[ join $namespaces "|" ]]$
action: keep
[[- end ]]
# Only keep services having a metrics-port set
- source_labels: [__meta_consul_service_metadata_metrics_port]
regex: \d+
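
These relabel rules decide scraping purely from Consul service metadata, so services without a metrics-port meta field are never scraped. Once the full configuration is rendered, it can be verified with promtool (a sketch):

promtool check config prometheus.yml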


@@ -17,13 +17,13 @@ groups:
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
expr: 'consul_raft_leader != 1'
for: 0m
labels:
severity: critical
annotations:
summary: Consul missing master node (node {{ $labels.node }})
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: 'consul_health_node_status{status="critical"} == 1'


@@ -14,6 +14,9 @@ vault:
monitoring:
namespaces:
- '*'
exporters:
count: 1
@@ -166,6 +169,9 @@ monitoring:
upstreams:
- destination_name: 'loki[[ .consul.suffix ]]'
local_bind_port: 3100
vault:
policies:
- metrics[[ .consul.suffix ]]
fluentd:
enabled: false
traefik:
@@ -210,6 +216,10 @@ monitoring:
upstreams:
- destination_name: postgres[[ .consul.suffix ]]
local_bind_port: 5432
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
- destination_name: prometheus[[ .consul.suffix ]]
local_bind_port: 9090
volumes:
data:
type: csi
@@ -229,5 +239,48 @@ monitoring:
prometheus:
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
agent:
consul:
meta:
alloc: '${node.unique.name}'
nomad:
node_pool: all
nomad_vector_logger:
version: 24.3
image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2'
env: {}
resources:
cpu: 20
memory: 24
memory_max: 50
vault:
policies:
- nomad-vector-logger[[ .consul.suffix ]]
vector:
image: '[[ .monitoring.vector.image ]]'
env: {}
resources:
cpu: 100
memory: 192
memory_max: 384
vault:
policies:
- metrics[[ .consul.suffix ]]
consul:
connect:
upstreams:
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
volumes:
nomad:
type: host
source: nomad_alloc
read_only: true
data:
type: host
source: vector_data
prometheus:
metrics_url: http://127.0.0.1:9001/metrics
prometheus:
enabled: true


@@ -0,0 +1,6 @@
path "nomad/creds/nomad-vector-logger[[ .consul.suffix ]]" {
capabilities = ["read"]
}
path "pki/nomad/issue/nomad-vector-logger[[ .consul.suffix ]]" {
capabilities = ["update"]
}