Add node-exporter to the agent job

Daniel Berteaud 2024-03-25 14:54:13 +01:00
parent defebffc50
commit f954afc251
8 changed files with 249 additions and 14 deletions

View File

@@ -4,6 +4,9 @@ job "[[ .instance ]]-agent" {
[[ template "common/job_start" $c ]]
type = "system"
# This group will collect logs from the allocations running on the node
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
# and then vector to read logs from all the discovered allocations. Logs are forwarded to Loki through the service mesh
group "logs-collector" {
[[ $c := merge $c.vector $c ]]
@@ -210,6 +213,61 @@ _EOT
read_only = false
}
[[ template "common/resources" $c ]]
}
}
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
group "node-exporter" {
[[- $c := merge .monitoring.agent.node_exporter .monitoring.agent .monitoring . ]]
network {
mode = "bridge"
port "metrics" {}
}
[[ template "common/volumes" $c ]]
service {
name = "node-exporter[[.consul.suffix ]]"
[[ template "common/service_meta" $c ]]
}
task "node-exporter" {
driver = "[[ $c.nomad.driver ]]"
config {
image = "[[ $c.image ]]"
pid_mode = "host"
#network_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/metrics_cert" $c ]]
template {
data = <<_EOT
[[ template "monitoring/agent/node-exporter.yml" $c ]]
_EOT
destination = "local/tls.yml"
}
volume_mount {
volume = "host"
destination = "/host"
read_only = true
propagation_mode = "host-to-task"
}
[[ template "common/resources" $c ]]
}
}

View File

@@ -6,6 +6,9 @@ job "monitoring-agent" {
type = "system"
# This group will collect logs from the allocations running on the node
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
# and then vector to read logs from all the discovered allocations. Logs are forwarded to Loki through the service mesh
group "logs-collector" {
@@ -199,8 +202,9 @@ transforms:
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
.nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")
# Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
{{- end }}
@@ -400,4 +404,106 @@ _EOT
}
}
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
group "node-exporter" {
network {
mode = "bridge"
port "metrics" {}
}
volume "host" {
source = "host_root"
type = "host"
read_only = true
}
service {
name = "node-exporter"
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${node.unique.name}"
job = "${NOMAD_JOB_NAME}"
namespace = "${NOMAD_NAMESPACE}"
}
}
task "node-exporter" {
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:latest"
pid_mode = "host"
#network_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
]
}
vault {
policies = ["metrics"]
env = false
disable_file = true
change_mode = "noop"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
tls_server_config:
cert_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
client_ca_file: /local/monitoring.ca.pem
client_auth_type: RequireAndVerifyClientCert
_EOT
destination = "local/tls.yml"
}
volume_mount {
volume = "host"
destination = "/host"
read_only = true
propagation_mode = "host-to-task"
}
resources {
cpu = 50
memory = 24
memory_max = 32
}
}
}
}
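The tls_server_config above enforces mutual TLS on the metrics endpoint, so whatever scrapes it must present a client certificate issued by the same monitoring PKI. A minimal Prometheus scrape-job sketch (not part of this commit; the job name and certificate paths are assumptions) could look like:

scrape_configs:
  # Hypothetical job: authenticate with a client cert issued by the
  # monitoring PKI and verify the exporter's cert against the same CA chain
  - job_name: node-exporter
    scheme: https
    tls_config:
      ca_file: /local/monitoring.ca.pem
      cert_file: /secrets/metrics.bundle.pem
      key_file: /secrets/metrics.bundle.pem
    consul_sd_configs:
      - server: 127.0.0.1:8500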

View File

@@ -184,7 +184,7 @@ _EOT
command = "prometheus"
args = [
"--config.file=/local/prometheus.yml",
"--log.level=debug",
"--log.level=info",
"--web.listen-address=127.0.0.1:9090",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.time=30d",
@@ -342,8 +342,15 @@ scrape_configs:
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Keep the alloc meta in a label
# Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
- source_labels: [__meta_consul_service_metadata_alloc]
regex: (.+)
replacement: $${1}
target_label: alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
- source_labels: [__meta_consul_service, alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: $${1}-$${2}
target_label: instance
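As an illustration (hypothetical values), for a node-exporter service registered in Consul with its alloc meta set to the node name, the two rules above produce:

# Discovered from Consul
__meta_consul_service: node-exporter
__meta_consul_service_metadata_alloc: nomad-client-01
# After relabeling
alloc: nomad-client-01
instance: node-exporter-nomad-client-01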

View File

@@ -55,7 +55,7 @@ job "[[ .instance ]]-services" {
command = "prometheus"
args = [
"--config.file=/local/prometheus.yml",
"--log.level=debug",
"--log.level=info",
"--web.listen-address=127.0.0.1:9090",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.time=[[ $c.retention ]]",

View File

@@ -0,0 +1,6 @@
tls_server_config:
cert_file: /secrets/metrics.bundle.pem
key_file: /secrets/metrics.bundle.pem
client_ca_file: /local/monitoring.ca.pem
client_auth_type: RequireAndVerifyClientCert

View File

@@ -30,7 +30,8 @@ transforms:
.nomad.task_name = "{{ $value.Task }}"
.nomad.alloc_id = "{{ $value.ID }}"
.nomad.alloc_name = "{{ $value.Name }}"
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
.nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")
# Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
.nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
{{- end }}
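For illustration (hypothetical allocation), an allocation named "www.nginx[2]" running a task called "nginx" would now yield:

nomad:
  alloc_name: www.nginx[2]
  alloc: "2"           # just the allocation index
  instance: nginx-2    # <TaskName>-<Alloc Index>, matching the prometheus instance label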

View File

@@ -221,8 +221,15 @@ scrape_configs:
replacement: 0
target_label: __meta_consul_service_metadata_alloc
# Keep the alloc meta in a label
# Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
- source_labels: [__meta_consul_service_metadata_alloc]
regex: (.+)
replacement: ${1}
target_label: alloc
# Rewrite the instance label to be service-alloc
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
- source_labels: [__meta_consul_service, alloc]
regex: (.+);([a-zA-Z\d\-\.]+)
replacement: ${1}-${2}
target_label: instance

View File

@@ -1,44 +1,82 @@
---
# The name of this instance
# Note: it's not supported to run several instances in the same namespace, so generally
# you won't need to change this
instance: monitoring
# General vault settings
vault:
pki:
# The path of the PKI used for the monitoring
path: '[[ .prometheus.vault_pki ]]'
ou: Monitoring
# Some random secrets to generate
rand_secrets:
- path: grafana
fields:
- secret_key
- initial_admin_pwd
monitoring:
# List of namespaces in which services will be monitored (use * to monitor everything)
# This might be useful if you run several monitoring instances in different namespaces
namespaces:
- '*'
# Exporters will run in their own job (so you can easily assign them
# to a dedicated node_pool)
exporters:
# Number of exporter instances
count: 1
# Ping exporter can ping external hosts and expose stats to prometheus
ping:
# Version of the exporter to use
version: 1.1.0
# Docker image to use
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
# Custom env vars to set in the container
env: {}
# Resource allocation
resources:
cpu: 10
memory: 25
memory: 24
# List of hosts to ping and for which statistics will be exposed. Eg
# probes:
# - gateway.acme.org
# - 10.99.10.1
probes: []
# The blackbox exporter can be used to probe external http or tcp services and
# expose those metrics to prometheus
blackbox:
# Version of the exporter
version: 0.24.0
# Docker image to use
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 10
memory: 50
memory: 32
# List of tcp probes, eg
# tcp_probes:
# - 10.99.1.1:443
# - 10.118.3.13:587
tcp_probes: []
# List of http probes, eg
# http_probes:
# - https://id.example.org
# - https://portal.acme.com
http_probes: []
# Consul exporter will expose consul metrics
consul:
version: 0.11.0
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
@@ -189,8 +227,6 @@ monitoring:
public_url: https://vector.example.org
traefik:
enabled: false
prometheus:
metrics_url: http://127.0.0.1:9001/metrics
grafana:
version: 10.4.1
@@ -279,8 +315,22 @@ monitoring:
data:
type: host
source: vector_data
prometheus:
metrics_url: http://127.0.0.1:9001/metrics
node_exporter:
image: quay.io/prometheus/node-exporter:latest
env: {}
resources:
cpu: 50
memory: 24
memory_max: 32
vault:
policies:
- metrics[[ .consul.suffix ]]
volumes:
host:
type: host
source: host_root
read_only: true
prometheus:
enabled: true
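Like the rest of this file, the new node_exporter block can be overridden per deployment. A minimal sketch (hypothetical values), assuming the block sits under monitoring.agent as referenced by the agent job template:

monitoring:
  agent:
    node_exporter:
      # Pin the image instead of tracking :latest (version shown is an example)
      image: quay.io/prometheus/node-exporter:v1.7.0
      resources:
        cpu: 100
        memory: 32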