---
# The name of this instance
# Note : it's not supported to run several instances in the same namespace, so generally
# you won't need to change this
instance: monitoring
# General vault settings
vault:
pki:
# The path of the PKI used for the monitoring
path: '[[ .prometheus.vault_pki ]]'
ou: Monitoring
# Some random secrets to generate
rand_secrets:
- path: grafana
fields:
- secret_key
- initial_admin_pwd
monitoring:
# List of namespaces in which services will be monitored (use * to monitor everything)
# This might be useful if you run several monitoring instances in different namespaces
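# Eg, to restrict this instance to a few specific namespaces (the names below are
# just placeholders) :
# namespaces:
#   - default
#   - production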
namespaces:
- '*'
# Exporters run in their own job (so you can easily assign them
# to a dedicated node_pool)
exporters:
# Number of exporter instances
count: 1
# Ping exporter can ping external hosts and expose stats to prometheus
ping:
# Version of the exporter to use
version: 1.1.2
# Docker image to use
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 10
memory: 24
# List of hosts to ping and for which statistics will be exposed. Eg
# probes:
# - gateway.acme.org
# - 10.99.10.1
probes: []
# The blackbox exporter can be used to probe external http or tcp services and
# expose those metrics to prometheus
blackbox:
# Version of the exporter
version: 0.24.0
# Docker image to use
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 10
memory: 32
# List of tcp probes, eg
# tcp_probes:
# - 10.99.1.1:443
# - 10.118.3.13:587
tcp_probes: []
# List of http probes, eg
# http_probes:
# - https://id.example.org
# - https://portal.acme.com
http_probes: []
# Consul exporter will expose consul metrics (mainly registered services status)
consul:
# Version of the exporter
version: 0.11.0
# Docker image to use
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 32
vault:
# Vault policies to attach
policies:
- 'consul-exporter[[ .consul.suffix ]]'
# The cluster exporter is a simple nginx used as a proxy
# which handles TLS for the cluster services (vault, consul and nomad)
cluster:
# Docker image to use
image: nginxinc/nginx-unprivileged:alpine
# Custom env
env: {}
# Resource allocation
resources:
cpu: 10
memory: 20
vault:
# Vault policies to attach to the task
policies:
- 'cluster-exporter[[ .consul.suffix ]]'
- metrics[[ .consul.suffix ]]
# The prometheus server
prometheus:
# Number of instances to run. Note that if you run several instances, they will be independent, and all of
# them will scrape the same data. Queries to the prometheus API will then be load-balanced between all instances.
# This should work most of the time, but can give strange results if, eg, one of the instances was down (queries
# for data covering the downtime can return different results depending on the instance your query is routed to)
count: 1
# Version of prometheus
version: 2.51.1
# Docker image to use
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
# Custom env var to set
env: {}
# Resource allocation
resources:
cpu: 200
memory: 768
memory_max: 1024
# Volumes used for data persistence
# You must create a prometheus-data[0] volume as it's a per_alloc volume
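# As a sketch only (plugin_id and capability are assumptions about your CSI setup), such a
# volume could be created with `nomad volume create` from a spec like :
# id        = "prometheus-data[0]"
# name      = "prometheus-data[0]"
# type      = "csi"
# plugin_id = "your-csi-plugin"
# capability {
#   access_mode     = "single-node-writer"
#   attachment_mode = "file-system"
# }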
volumes:
data:
type: csi
source: 'prometheus-data'
per_alloc: true
vault:
# Vault policies to attach to the task
policies:
- 'prometheus[[ .consul.suffix ]]'
# A dict of custom jobs. Eg
# jobs:
# squid:
# targets:
# - 10.11.2.3:9305
# - 192.168.6.20:782
jobs: {}
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
# alert_rules:
# postgres:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
# patroni:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
# If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
# end up in /local/rules/ inside the container
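# Whichever way they are loaded, rule files use the standard prometheus format. A minimal
# sketch (the alert name and expression below are only illustrative) :
# groups:
#   - name: example
#     rules:
#       - alert: InstanceDown
#         expr: up == 0
#         for: 5m
#         labels:
#           severity: critical
#         annotations:
#           summary: 'Instance {{ $labels.instance }} is down'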
alert_rules: {}
# The public URL where prometheus will be reachable (if exposed with Traefik)
public_url: https://prometheus.example.org
# Traefik settings
traefik:
# Turn this on to expose prometheus with Traefik
# Caution : there's no builtin security, you should configure the appropriate middlewares
enabled: false
router: prometheus
# Metrics retention duration
retention: 30d
# always enable prometheus metrics (of course :-) )
prometheus:
# This is the URL where metrics are exposed, which the metrics proxy will point at (from the container PoV)
metrics_url: http://localhost:9090/metrics
# AlertManager can process and send alerts
alertmanager:
# Number of instances to run. Set > 1 if you want HA
count: 1
# Version of alertmanager
version: 0.27.0
# Docker image to use
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-2'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 64
memory_max: 80
# If a proxy is used
proxy:
address: http://alertmanager[[ .consul.suffix ]]:alertmanager[[ .consul.suffix ]]@127.0.0.1:3128
# URL where the web interface is reachable (if exposed with Traefik)
public_url: https://alert.example.org
# Traefik settings
traefik:
# Turn this on to expose alertmanager with traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
enabled: false
router: alertmanager
# Volumes used for data persistence. Note : it's a per_alloc volume
# so you need to create eg alertmanager-data[0]. This volume should be writable by the user with ID 9093
volumes:
data:
source: 'alertmanager-data'
type: csi
per_alloc: true
vault:
# List of vault policies to attach to the task
policies:
- metrics[[ .consul.suffix ]]
- alertmanager[[ .consul.suffix ]]
# Email settings
email:
from: alertmanager@[[ .consul.domain ]]
# You can merge your own custom config with the default provided one. Eg
# custom_config:
# receivers:
# - name: dani
# email_configs:
# - to: dani@example.org
# route:
# group_by: ['alertname', 'cluster', 'job']
# receiver: dani
custom_config: {}
# Loki is the log server
loki:
# Version of loki
version: 2.9.6
# Docker image to use
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 150
memory: 1024
vault:
# Vault policies to attach to the task
policies:
- 'loki[[ .consul.suffix ]]'
# URL where loki is exposed (if enabled)
public_url: https://loki.example.org
# Traefik settings
traefik:
# Turn it on to expose Loki with Traefik
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
enabled: false
router: loki
# Retention for logs. Older logs will be deleted
retention: 720h # 1 month
# Custom configuration which will be merged on top of the default one
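# Eg, to raise ingestion limits on top of the defaults (values are purely illustrative) :
# custom_config:
#   limits_config:
#     ingestion_rate_mb: 8
#     ingestion_burst_size_mb: 16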
custom_config: {}
prometheus:
# URL where metrics are available for the metrics proxy (from inside the container PoV)
metrics_url: http://localhost:3100/metrics
# Volumes for data persistence. Should be writable by user ID 3100
volumes:
data:
type: csi
source: 'loki-data'
# Common vector settings
vector:
# Version of vector
version: 0.37.0
# Docker image to use
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
# The vector aggregator can be used to ingest logs from external devices (using syslog or fluentd)
# Logs will then be forwarded to loki
aggregator:
# Number of instances
count: 1
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 192
consul:
connect:
upstreams:
# Connect to loki through the service mesh
- destination_name: 'loki[[ .consul.suffix ]]'
local_bind_port: 3100
vault:
# Vault policies to attach to the task.
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
# but we need to grant the metrics policy to the vector task instead
policies:
- metrics[[ .consul.suffix ]]
# Fluentd source settings
fluentd:
enabled: false
traefik:
router: fluentd
entrypoints:
- fluentd
# Syslog source settings
syslog_udp:
enabled: false
traefik:
router: syslog-udp
proto: udp
entrypoints:
- syslog-udp
# Syslog (tcp) source settings
syslog_tcp:
enabled: false
traefik:
router: syslog-tcp
proto: tcp
entrypoints:
- syslog-tcp
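# Note : the fluentd and syslog entrypoints referenced above must exist in your Traefik
# static configuration. A hedged sketch (ports are assumptions, adjust to your setup) :
# entryPoints:
#   fluentd:
#     address: ':24224'
#   syslog-udp:
#     address: ':514/udp'
#   syslog-tcp:
#     address: ':514'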
# Native vector (http) source settings
vector:
enabled: true
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
public_url: https://vector.example.org
traefik:
# Set to true if you want to expose the service with Traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
enabled: false
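# As an illustration of the sending side (sink name, inputs and address are assumptions
# about your setup), an external vector agent could ship logs to this source with :
# sinks:
#   to_monitoring:
#     type: vector
#     inputs: ['my_logs']
#     address: 'vector.example.org:443'
#     tls:
#       enabled: true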
# Grafana settings
grafana:
# Grafana version
version: 10.4.1
# Docker image to use
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 256
# If proxy is used
proxy:
address: http://grafana[[ .consul.suffix ]]:grafana[[ .consul.suffix ]]@127.0.0.1:3128
# URL where Grafana is reachable
public_url: https://grafana.example.org
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
# the image if you want to update them
plugins:
- grafana-clock-panel
- grafana-piechart-panel
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
# Example:
# feature_toggles:
# featureToggleAdminPage: true
# ssoSettingsApi: true
feature_toggles: {}
# Traefik settings
traefik:
enabled: true
router: grafana
csp:
img-src: "'self' data: blob: https://grafana.com"
connect-src: "'self' https://grafana.com"
consul:
connect:
# Connect to postgres, loki and prometheus with the service mesh
upstreams:
- destination_name: postgres[[ .consul.suffix ]]
local_bind_port: 5432
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
- destination_name: prometheus[[ .consul.suffix ]]
local_bind_port: 9090
# Volumes for data persistence
volumes:
data:
type: csi
source: 'grafana-data'
vault:
# Vault policies to attach to the task
policies:
- 'grafana[[ .consul.suffix ]]'
# Postgres DB settings
database:
role: grafana
pgrole: grafana
# Override some default postgres handling
postgres:
database: grafana
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
password: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.password }}{{ end }}'
pooler:
mode: session
prometheus:
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
metrics_url: http://127.0.0.1:3000/metrics
# Agent runs as a system jobs, on all the nodes
agent:
consul:
meta:
# Override the alloc service meta, the hostname will be more useful than the alloc index (0)
alloc: '${node.unique.name}'
# Nomad settings
nomad:
# Run on all node pools
node_pool: all
# Run with an above average priority
priority: 60
# Nomad vector logger is a small container which queries the Nomad API to discover running allocations on the current node,
# then generates a vector configuration scraping the logs of all the discovered allocations.
nomad_vector_logger:
# Docker image to use
image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 24
memory_max: 50
vault:
# Vault policies to attach to the task
policies:
- nomad-vector-logger[[ .consul.suffix ]]
# Vector is the main task. It'll read its config created by nomad-vector-logger and will read log files
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
vector:
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 384
memory_max: 512
vault:
# Vault policies to attach to the container. Vector can expose its metrics with mTLS natively,
# so there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
# a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
consul:
connect:
upstreams:
# Connect to loki with the service mesh
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
# Volumes for data persistence
volumes:
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
# to read the logs. You should create a host volume in the Nomad client config of all your nodes. Eg
# client {
# enabled = true
# host_volume "nomad_alloc" {
# path = "/opt/nomad/data/alloc"
# read_only = true
# }
# }
nomad:
type: host
source: nomad_alloc
read_only: true
# The data volume will be used by vector for buffering (in case loki is unavailable)
# You can create a host volume in Nomad's client config, eg
# client {
# enabled = true
# host_volume "vector_data" {
# path = "/data/vector-agent"
# }
# }
data:
type: host
source: vector_data
# The node exporter can be used to expose the host metrics to prometheus
node_exporter:
# Is the node exporter enabled ? (set to false if you don't want it, or if you
# already manage the node-exporter separately)
enabled: true
# Version of the exporter
version: 1.7.0
# Docker image to use
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 32
memory_max: 56
vault:
# Vault policies to attach to the task
# This exporter can handle mTLS itself, so no need to create a metrics_proxy task; instead, grant the metrics policy
# so it can get a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
# Args to add to the exporter on start
args:
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
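# Additional collector flags can be appended here, eg (a hedged example, the directory
# is an assumption) :
# - '--collector.textfile.directory=/host/var/lib/node_exporter'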
# Volumes
volumes:
# The exporter should access the host root filesystem
# For this, you should create a host volume in Nomad's client config, eg
# client {
# enabled = true
# host_volume "host_root" {
# path = "/"
# read_only = true
# }
# }
host:
type: host
source: host_root
read_only: true
# Globally enable prometheus for this bundle :-)
prometheus:
enabled: true