Add node-exporter to the agent job
This commit is contained in:
parent
defebffc50
commit
f954afc251
|
@ -4,6 +4,9 @@ job "[[ .instance ]]-agent" {
|
|||
[[ template "common/job_start" $c ]]
|
||||
type = "system"
|
||||
|
||||
# This group will collect logs from the allocation running on the node
|
||||
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
|
||||
# and then vector to read logs from all the discovered allocations. Logs are forwarded to loki through the service mesh
|
||||
group "logs-collector" {
|
||||
|
||||
[[ $c := merge $c.vector $c ]]
|
||||
|
@ -210,6 +213,61 @@ _EOT
|
|||
read_only = false
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
|
||||
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
|
||||
group "node-exporter" {
|
||||
|
||||
[[- $c := merge .monitoring.agent.node_exporter .monitoring.agent .monitoring . ]]
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
[[ template "common/volumes" $c ]]
|
||||
|
||||
service {
|
||||
name = "node-exporter[[.consul.suffix ]]"
|
||||
[[ template "common/service_meta" $c ]]
|
||||
}
|
||||
|
||||
task "node-exporter" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
pid_mode = "host"
|
||||
#network_mode = "host"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
args = [
|
||||
"--path.rootfs=/host",
|
||||
"--web.config.file=/local/tls.yml",
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/agent/node-exporter.yml" $c ]]
|
||||
_EOT
|
||||
destination = "local/tls.yml"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "host"
|
||||
destination = "/host"
|
||||
read_only = true
|
||||
propagation_mode = "host-to-task"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,6 +6,9 @@ job "monitoring-agent" {
|
|||
|
||||
type = "system"
|
||||
|
||||
# This group will collect logs from the allocation running on the node
|
||||
# It uses nomad-vector-logger to query the Nomad API and discover running allocations
|
||||
# and then vector to read logs from all the discovered allocations. Logs are forwarded to loki through the service mesh
|
||||
group "logs-collector" {
|
||||
|
||||
|
||||
|
@ -199,8 +202,9 @@ transforms:
|
|||
.nomad.task_name = "{{ $value.Task }}"
|
||||
.nomad.alloc_id = "{{ $value.ID }}"
|
||||
.nomad.alloc_name = "{{ $value.Name }}"
|
||||
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
.nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")
|
||||
# Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
|
||||
{{- end }}
|
||||
|
||||
|
@ -400,4 +404,106 @@ _EOT
|
|||
|
||||
}
|
||||
}
|
||||
|
||||
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
|
||||
group "node-exporter" {
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
|
||||
volume "host" {
|
||||
source = "host_root"
|
||||
type = "host"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
|
||||
service {
|
||||
name = "node-exporter"
|
||||
meta {
|
||||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${node.unique.name}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
task "node-exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:latest"
|
||||
pid_mode = "host"
|
||||
#network_mode = "host"
|
||||
userns_mode = "host"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
args = [
|
||||
"--path.rootfs=/host",
|
||||
"--web.config.file=/local/tls.yml",
|
||||
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
tls_server_config:
|
||||
cert_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
client_ca_file: /local/monitoring.ca.pem
|
||||
client_auth_type: RequireAndVerifyClientCert
|
||||
|
||||
|
||||
_EOT
|
||||
destination = "local/tls.yml"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "host"
|
||||
destination = "/host"
|
||||
read_only = true
|
||||
propagation_mode = "host-to-task"
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 24
|
||||
memory_max = 32
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -184,7 +184,7 @@ _EOT
|
|||
command = "prometheus"
|
||||
args = [
|
||||
"--config.file=/local/prometheus.yml",
|
||||
"--log.level=debug",
|
||||
"--log.level=info",
|
||||
"--web.listen-address=127.0.0.1:9090",
|
||||
"--storage.tsdb.path=/data",
|
||||
"--storage.tsdb.retention.time=30d",
|
||||
|
@ -342,8 +342,15 @@ scrape_configs:
|
|||
replacement: 0
|
||||
target_label: __meta_consul_service_metadata_alloc
|
||||
|
||||
# Keep the alloc meta in a label
|
||||
# Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
|
||||
- source_labels: [__meta_consul_service_metadata_alloc]
|
||||
regex: (.+)
|
||||
replacement: $${1}
|
||||
target_label: alloc
|
||||
|
||||
# Rewrite the instance label to be service-alloc
|
||||
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
|
||||
- source_labels: [__meta_consul_service, alloc]
|
||||
regex: (.+);([a-zA-Z\d\-\.]+)
|
||||
replacement: $${1}-$${2}
|
||||
target_label: instance
|
||||
|
|
|
@ -55,7 +55,7 @@ job "[[ .instance ]]-services" {
|
|||
command = "prometheus"
|
||||
args = [
|
||||
"--config.file=/local/prometheus.yml",
|
||||
"--log.level=debug",
|
||||
"--log.level=info",
|
||||
"--web.listen-address=127.0.0.1:9090",
|
||||
"--storage.tsdb.path=/data",
|
||||
"--storage.tsdb.retention.time=[[ $c.retention ]]",
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
tls_server_config:
|
||||
cert_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
client_ca_file: /local/monitoring.ca.pem
|
||||
client_auth_type: RequireAndVerifyClientCert
|
||||
|
|
@ -30,7 +30,8 @@ transforms:
|
|||
.nomad.task_name = "{{ $value.Task }}"
|
||||
.nomad.alloc_id = "{{ $value.ID }}"
|
||||
.nomad.alloc_name = "{{ $value.Name }}"
|
||||
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
.nomad.alloc = replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")
|
||||
# Set instance = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.instance = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
|
||||
{{- end }}
|
||||
|
|
|
@ -221,8 +221,15 @@ scrape_configs:
|
|||
replacement: 0
|
||||
target_label: __meta_consul_service_metadata_alloc
|
||||
|
||||
# Keep the alloc meta in a label
|
||||
# Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs)
|
||||
- source_labels: [__meta_consul_service_metadata_alloc]
|
||||
regex: (.+)
|
||||
replacement: ${1}
|
||||
target_label: alloc
|
||||
|
||||
# Rewrite the instance label to be service-alloc
|
||||
- source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc]
|
||||
- source_labels: [__meta_consul_service, alloc]
|
||||
regex: (.+);([a-zA-Z\d\-\.]+)
|
||||
replacement: ${1}-${2}
|
||||
target_label: instance
|
||||
|
|
|
@ -1,44 +1,82 @@
|
|||
---
|
||||
|
||||
# The name of this instance
|
||||
# Note : it's not supported to run several instances in the same namespace, so generally
|
||||
# you won't need to change this
|
||||
instance: monitoring
|
||||
|
||||
# General vault settings
|
||||
vault:
|
||||
pki:
|
||||
# The path of the PKI used for the monitoring
|
||||
path: '[[ .prometheus.vault_pki ]]'
|
||||
ou: Monitoring
|
||||
|
||||
# Some random secrets to generate
|
||||
rand_secrets:
|
||||
- path: grafana
|
||||
fields:
|
||||
- secret_key
|
||||
- initial_admin_pwd
|
||||
|
||||
|
||||
monitoring:
|
||||
|
||||
# List of namespaces in which services will be monitored (use * to monitor everything)
|
||||
# This might be useful if you run several monitoring instances in different namespaces
|
||||
namespaces:
|
||||
- '*'
|
||||
|
||||
# Exporters job will run in its own job (so you can easily assign it
|
||||
# to a dedicated node_pool)
|
||||
exporters:
|
||||
|
||||
# Number of exporter instances
|
||||
count: 1
|
||||
|
||||
# Ping exporter can ping external hosts and expose stats to prometheus
|
||||
ping:
|
||||
# Version of the exporter to use
|
||||
version: 1.1.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
|
||||
# Custom env var to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 25
|
||||
memory: 24
|
||||
# List of hosts to ping and for which statistics will be exposed. Eg
|
||||
# probes:
|
||||
# - gateway.acme.org
|
||||
# - 10.99.10.1
|
||||
probes: []
|
||||
|
||||
# The blackbox exporter can be used to probe external http or tcp services and
|
||||
# expose those metrics to prometheus
|
||||
blackbox:
|
||||
# Version of the exporter
|
||||
version: 0.24.0
|
||||
# Docker image to use
|
||||
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
|
||||
# Custom env var to set in the container
|
||||
env: {}
|
||||
# Resource allocation
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 50
|
||||
memory: 32
|
||||
# List of tcp probes, eg
|
||||
# tcp_probes:
|
||||
# - 10.99.1.1:443
|
||||
# - 10.118.3.13:587
|
||||
tcp_probes: []
|
||||
# List of http probes, eg
|
||||
# http_probes:
|
||||
# - https://id.example.org
|
||||
# - https://portal.acme.com
|
||||
http_probes: []
|
||||
|
||||
# Consul exporter will expose consul metrics
|
||||
consul:
|
||||
version: 0.11.0
|
||||
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
|
||||
|
@ -189,8 +227,6 @@ monitoring:
|
|||
public_url: https://vector.example.org
|
||||
traefik:
|
||||
enabled: false
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9001/metrics
|
||||
|
||||
grafana:
|
||||
version: 10.4.1
|
||||
|
@ -279,8 +315,22 @@ monitoring:
|
|||
data:
|
||||
type: host
|
||||
source: vector_data
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9001/metrics
|
||||
|
||||
node_exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 50
|
||||
memory: 24
|
||||
memory_max: 32
|
||||
vault:
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
volumes:
|
||||
host:
|
||||
type: host
|
||||
source: host_root
|
||||
read_only: true
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
|
Loading…
Reference in New Issue