From f954afc251da85b425e58010a8cb977d45220d30 Mon Sep 17 00:00:00 2001 From: Daniel Berteaud Date: Mon, 25 Mar 2024 14:54:13 +0100 Subject: [PATCH] Add node-exporter to the agent job --- agent.nomad.hcl | 58 +++++++++++++++ example/agent.nomad.hcl | 110 +++++++++++++++++++++++++++- example/services.nomad.hcl | 11 ++- services.nomad.hcl | 2 +- templates/agent/node-exporter.yml | 6 ++ templates/agent/vector-template.yml | 5 +- templates/prometheus/prometheus.yml | 9 ++- variables.yml | 62 ++++++++++++++-- 8 files changed, 249 insertions(+), 14 deletions(-) create mode 100644 templates/agent/node-exporter.yml diff --git a/agent.nomad.hcl b/agent.nomad.hcl index 4172778..0bfd901 100644 --- a/agent.nomad.hcl +++ b/agent.nomad.hcl @@ -4,6 +4,9 @@ job "[[ .instance ]]-agent" { [[ template "common/job_start" $c ]] type = "system" + # This group will collect logs from the allocation running on the node + # It uses nomad-vector-logger to query the Nomad API and discover running allocations + # and then vector to read logs from all the discovered allocations. Logs are forwarded to loki through the service mesh group "logs-collector" { [[ $c := merge $c.vector $c ]] @@ -210,6 +213,61 @@ _EOT read_only = false } +[[ template "common/resources" $c ]] + } + } + + # This group runs the prometheus node-exporter to expose prometheus metrics from the node + group "node-exporter" { + +[[- $c := merge .monitoring.agent.node_exporter .monitoring.agent .monitoring . 
]] + + network { + mode = "bridge" + port "metrics" {} + } + +[[ template "common/volumes" $c ]] + + service { + name = "node-exporter[[.consul.suffix ]]" +[[ template "common/service_meta" $c ]] + } + + task "node-exporter" { + driver = "[[ $c.nomad.driver ]]" + + config { + image = "[[ $c.image ]]" + pid_mode = "host" + #network_mode = "host" + userns_mode = "host" + readonly_rootfs = true + pids_limit = 50 + args = [ + "--path.rootfs=/host", + "--web.config.file=/local/tls.yml", + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + ] + } + +[[ template "common/vault.policies" $c ]] +[[ template "common/metrics_cert" $c ]] + + template { + data = <<_EOT +[[ template "monitoring/agent/node-exporter.yml" $c ]] +_EOT + destination = "local/tls.yml" + } + + volume_mount { + volume = "host" + destination = "/host" + read_only = true + propagation_mode = "host-to-task" + } + [[ template "common/resources" $c ]] } } diff --git a/example/agent.nomad.hcl b/example/agent.nomad.hcl index 4700f80..ccfb727 100644 --- a/example/agent.nomad.hcl +++ b/example/agent.nomad.hcl @@ -6,6 +6,9 @@ job "monitoring-agent" { type = "system" + # This group will collect logs from the allocation running on the node + # It uses nomad-vector-logger to query the Nomad API and discover running allocations + # and then vector to read logs from all the discovered allocations. 
Logs are forwarded to loki through the service mesh group "logs-collector" { @@ -199,8 +202,9 @@ transforms: .nomad.task_name = "{{ $value.Task }}" .nomad.alloc_id = "{{ $value.ID }}" .nomad.alloc_name = "{{ $value.Name }}" - # Set alloc = - so it's similar to what prometheus has - .nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-") + .nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index") + # Set instance = - so it's similar to what prometheus has + .nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-") {{- end }} @@ -400,4 +404,106 @@ _EOT } } + + # This group runs the prometheus node-exporter to expose prometheus metrics from the node + group "node-exporter" { + + network { + mode = "bridge" + port "metrics" {} + } + + + volume "host" { + source = "host_root" + type = "host" + read_only = true + } + + + service { + name = "node-exporter" + meta { + metrics-port = "${NOMAD_HOST_PORT_metrics}" + alloc = "${node.unique.name}" + job = "${NOMAD_JOB_NAME}" + namespace = "${NOMAD_NAMESPACE}" + } + + } + + task "node-exporter" { + driver = "docker" + + config { + image = "quay.io/prometheus/node-exporter:latest" + pid_mode = "host" + #network_mode = "host" + userns_mode = "host" + readonly_rootfs = true + pids_limit = 50 + args = [ + "--path.rootfs=/host", + "--web.config.file=/local/tls.yml", + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + ] + } + + + vault { + policies = ["metrics"] + env = false + disable_file = true + change_mode = "noop" + } + + # Get a certificate from vault to protect the metrics endpoint + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + # Get the root CA + template { + data = <<_EOT +{{ with secret 
"pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + + template { + data = <<_EOT +tls_server_config: + cert_file: /secrets/metrics.bundle.pem + key_file: /secrets/metrics.bundle.pem + client_ca_file: /local/monitoring.ca.pem + client_auth_type: RequireAndVerifyClientCert + + +_EOT + destination = "local/tls.yml" + } + + volume_mount { + volume = "host" + destination = "/host" + read_only = true + propagation_mode = "host-to-task" + } + + + resources { + cpu = 50 + memory = 24 + memory_max = 32 + } + + } + } } diff --git a/example/services.nomad.hcl b/example/services.nomad.hcl index cf290b3..fcbd7c0 100644 --- a/example/services.nomad.hcl +++ b/example/services.nomad.hcl @@ -184,7 +184,7 @@ _EOT command = "prometheus" args = [ "--config.file=/local/prometheus.yml", - "--log.level=debug", + "--log.level=info", "--web.listen-address=127.0.0.1:9090", "--storage.tsdb.path=/data", "--storage.tsdb.retention.time=30d", @@ -342,8 +342,15 @@ scrape_configs: replacement: 0 target_label: __meta_consul_service_metadata_alloc + # Keep the alloc meta in a label + # Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs) + - source_labels: [__meta_consul_service_metadata_alloc] + regex: (.+) + replacement: $${1} + target_label: alloc + # Rewerite the instance label to be service-alloc - - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc] + - source_labels: [__meta_consul_service, alloc] regex: (.+);([a-zA-Z\d\-\.]+) replacement: $${1}-$${2} target_label: instance diff --git a/services.nomad.hcl b/services.nomad.hcl index 9c5c9f8..a5b4c0a 100644 --- a/services.nomad.hcl +++ b/services.nomad.hcl @@ -55,7 +55,7 @@ job "[[ .instance ]]-services" { command = "prometheus" args = [ "--config.file=/local/prometheus.yml", - "--log.level=debug", + "--log.level=info", "--web.listen-address=127.0.0.1:9090", 
"--storage.tsdb.path=/data", "--storage.tsdb.retention.time=[[ $c.retention ]]", diff --git a/templates/agent/node-exporter.yml b/templates/agent/node-exporter.yml new file mode 100644 index 0000000..cd5d6c3 --- /dev/null +++ b/templates/agent/node-exporter.yml @@ -0,0 +1,6 @@ +tls_server_config: + cert_file: /secrets/metrics.bundle.pem + key_file: /secrets/metrics.bundle.pem + client_ca_file: /local/monitoring.ca.pem + client_auth_type: RequireAndVerifyClientCert + diff --git a/templates/agent/vector-template.yml b/templates/agent/vector-template.yml index 279ebae..45d225d 100644 --- a/templates/agent/vector-template.yml +++ b/templates/agent/vector-template.yml @@ -30,7 +30,8 @@ transforms: .nomad.task_name = "{{ $value.Task }}" .nomad.alloc_id = "{{ $value.ID }}" .nomad.alloc_name = "{{ $value.Name }}" - # Set alloc = - so it's similar to what prometheus has - .nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-") + .nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index") + # Set instance = - so it's similar to what prometheus has + .nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-") {{- end }} diff --git a/templates/prometheus/prometheus.yml b/templates/prometheus/prometheus.yml index 59da09c..d88f612 100644 --- a/templates/prometheus/prometheus.yml +++ b/templates/prometheus/prometheus.yml @@ -221,8 +221,15 @@ scrape_configs: replacement: 0 target_label: __meta_consul_service_metadata_alloc + # Keep the alloc meta in a label + # Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs) + - source_labels: [__meta_consul_service_metadata_alloc] + regex: (.+) + replacement: ${1} + target_label: alloc + # Rewerite the instance label to be service-alloc - - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc] + - source_labels: 
[__meta_consul_service, alloc] regex: (.+);([a-zA-Z\d\-\.]+) replacement: ${1}-${2} target_label: instance diff --git a/variables.yml b/variables.yml index 6f65b78..46e2d82 100644 --- a/variables.yml +++ b/variables.yml @@ -1,44 +1,82 @@ --- +# The name of this instance +# Note : it's not supported to run several instances in the same namespace, so generally +# you won't need to change this instance: monitoring +# General vault settings vault: pki: + # The path of the PKI used for the monitoring path: '[[ .prometheus.vault_pki ]]' ou: Monitoring + + # Some random secrets to generate rand_secrets: - path: grafana fields: - secret_key - initial_admin_pwd + monitoring: + # List of namespaces in which services will be monitored (use * to monitor everything) + # This might be useful if you run several monitoring instances in different namespaces namespaces: - '*' + # Exporters job will run in its own job (so you can easily assign it + # to a dedicated node_pool) exporters: + + # Number of exporter instances count: 1 + # Ping exporter can ping external hosts and expose stats to prometheus ping: + # Version of the exporter to use version: 1.1.0 + # Docker image to use image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 10 - memory: 25 + memory: 24 + # List of hosts to ping and for which statistics will be exposed. 
Eg + # probes: + # - gateway.acme.org + # - 10.99.10.1 probes: [] + # The blackbox exporter can be used to probe external http or tcp services and + # expose those metrics to prometheus blackbox: + # Version of the exporter version: 0.24.0 + # Docker image to use image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 10 - memory: 50 + memory: 32 + # List of tcp probes, eg + # tcp_probes: + # - 10.99.1.1:443 + # - 10.118.3.13:587 tcp_probes: [] + # List of http probes, eg + # http_probes: + # - https://id.example.org + # - https://portal.acme.com http_probes: [] + # Consul exporter will expose consul metrics consul: version: 0.11.0 image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2' @@ -189,8 +227,6 @@ monitoring: public_url: https://vector.example.org traefik: enabled: false - prometheus: - metrics_url: http://127.0.0.1:9001/metrics grafana: version: 10.4.1 @@ -279,8 +315,22 @@ monitoring: data: type: host source: vector_data - prometheus: - metrics_url: http://127.0.0.1:9001/metrics + + node_exporter: + image: quay.io/prometheus/node-exporter:latest + env: {} + resources: + cpu: 50 + memory: 24 + memory_max: 32 + vault: + policies: + - metrics[[ .consul.suffix ]] + volumes: + host: + type: host + source: host_root + read_only: true prometheus: enabled: true