# Default variables for the monitoring bundle
---

# The name of this instance
# Note : it's not supported to run several instances in the same namespace, so generally
# you won't need to change this
instance: monitoring
# General vault settings
vault:
  pki:
    # The path of the PKI used for the monitoring
    path: '[[ .prometheus.vault_pki ]]'
    ou: Monitoring
# Some random secrets to generate
rand_secrets:
  - path: grafana
    fields:
      - secret_key
      - initial_admin_pwd
monitoring:

  # List of namespaces in which services will be monitored (use * to monitor everything)
  # This might be useful if you run several monitoring instances in different namespaces
  namespaces:
    - '*'
# Exporters job will run in its own job (so you can easily assign it
|
|
# to a dedicated node_pool
|
|
exporters:
|
|
|
|
# Number of exporter instances
|
|
count: 1
|
|
|
|
# Ping exporter can ping external hosts and expose stats to prometheus
|
|
ping:
|
|
# Version of the exporter to use
|
|
version: 1.1.3
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 24
|
|
# List of host to ping and for which statistics will be exposed. Eg
|
|
# probes:
|
|
# - gatway.acme.org
|
|
# - 10.99.10.1
|
|
probes: []
|
|
|
|
# The blackbox exporter can be used to probes external http or tcp services and
|
|
# expose those metrics to prometheus
|
|
blackbox:
|
|
# Version of the exporter
|
|
version: 0.25.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 32
|
|
# List of tcp probes, eg
|
|
# tcp_probes:
|
|
# - 10.99.1.1:443
|
|
# - 10.118.3.13:587
|
|
tcp_probes: []
|
|
# List of http probes, eg
|
|
# http_probes:
|
|
# - https://id.example.org
|
|
# - https://portal.acme.com
|
|
http_probes: []
|
|
|
|
# Consul exporter will expose consul metrics (mainly registered services status)
|
|
consul:
|
|
# Version of the exporter
|
|
version: 0.12.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 48
|
|
memory: 64
|
|
vault:
|
|
# Vault policies to attach
|
|
policies:
|
|
- 'consul-exporter[[ .consul.suffix ]]'
|
|
|
|
# The cluster exporter is a simple nginx used as a proxy
|
|
# which handles TLS for the cluster services (vault, consul and nomad)
|
|
cluster:
|
|
# Docker image to use
|
|
image: nginxinc/nginx-unprivileged:alpine
|
|
# Custom env
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 48
|
|
memory_max: 64
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'cluster-exporter[[ .consul.suffix ]]'
|
|
- metrics[[ .consul.suffix ]]
|
|
|
|
# The prometheus server
|
|
prometheus:
|
|
# Number of instances to run. Note that if you run several instances, they will be independant, and all of
|
|
# them will scrape the same data. Then queries to the prometheus API will be loadbalanced between all instances.
|
|
# This should work most of the time, but can give some strange result if eg, one of the instances was down (queries
|
|
# for data during the downtime can give some random result depending on the instance your query is routed to)
|
|
count: 1
|
|
# Version of prometheus
|
|
version: 2.51.2
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
|
|
# Custom env var to set
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 200
|
|
memory: 768
|
|
memory_max: 1024
|
|
# Volumes used for data persistence
|
|
# You must create a prometheus-data[0] volume as it's a per_alloc volume
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'prometheus-data'
|
|
per_alloc: true
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'prometheus[[ .consul.suffix ]]'
|
|
# A dict of custom jobs. Eg
|
|
# jobs:
|
|
# squid:
|
|
# targets:
|
|
# - 10.11.2.3:9305
|
|
# - 192.168.6.20:782
|
|
jobs: {}
|
|
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts,
|
|
# or you can provide it as raw content. Eg
|
|
# alert_rules:
|
|
# postgres:
|
|
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
|
|
# patroni:
|
|
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
|
|
# custom:
|
|
# content: |
|
|
# groups:
|
|
# - name: EmbeddedExporter
|
|
# rules:
|
|
# - alert: PrometheusJobMissing
|
|
# expr: 'absent(up{job="prometheus"})'
|
|
# for: 0m
|
|
# labels:
|
|
# severity: warning
|
|
# annotations:
|
|
# summary: Prometheus job missing (instance {{ $labels.instance }})
|
|
# description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# If you need something more flexible (like download an archive of rules and uncompress it, you should use artifacts instead. Just ensure your rules
|
|
# are in /local/rules/ inside the container
|
|
alert_rules: {}
|
|
# The public URL where prometheus will be reachable (if exposed with Traefik)
|
|
public_url: https://prometheus.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn this on to expose prometheus with Traefik
|
|
# Caution : there's no builtin security, you should configure the appropriate middlewares
|
|
enabled: false
|
|
router: prometheus
|
|
# Metrics retention duration
|
|
retention: 30d
|
|
# always enable prometheus metrics (of course :-) )
|
|
prometheus:
|
|
# This is the URL where metrics are exposed, where the metrics proxy will point at (from the container PoV)
|
|
metrics_url: http://localhost:9090/metrics
|
|
|
|
# AlertManager can process and send alerts
|
|
alertmanager:
|
|
# Number of instances to run. Set > 1 if you wan HA
|
|
count: 1
|
|
# Version of alertmanager
|
|
version: 0.27.0
|
|
# DOcker image to use
|
|
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-2'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 50
|
|
memory: 64
|
|
memory_max: 80
|
|
# If a proxy is used
|
|
proxy:
|
|
address: http://alertmanager[[ .consul.suffix ]]:alertmanager[[ .consul.suffix ]]@127.0.0.1:3128
|
|
# URL where the web interface is reachable (if exposed with Traefik)
|
|
public_url: https://alert.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn this on to expose alertmanager with traefik
|
|
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
|
|
enabled: false
|
|
router: alertmanager
|
|
# Volumes used for data persistence. Note : it's a per_alloc volume
|
|
# so you need to create eg alertmanager-data[0]. This volume should be writeable by user with ID 9093
|
|
volumes:
|
|
data:
|
|
source: 'alertmanager-data'
|
|
type: csi
|
|
per_alloc: true
|
|
vault:
|
|
# List of vault policies to attach to the task
|
|
policies:
|
|
- metrics[[ .consul.suffix ]]
|
|
- alertmanager[[ .consul.suffix ]]
|
|
# Email settings
|
|
email:
|
|
from: alertmanager@[[ .consul.domain ]]
|
|
# You can merge your own custom config with the default provided one. Eg
|
|
# custom_config:
|
|
# receivers:
|
|
# - name: dani
|
|
# email_configs:
|
|
# - to: dani@example.org
|
|
# route:
|
|
# group_by: ['alertname', 'cluster', 'job']
|
|
# receiver: dani
|
|
custom_config: {}
|
|
|
|
# Loki is the log server
|
|
loki:
|
|
# Version of loki
|
|
version: 3.0.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 150
|
|
memory: 1024
|
|
vault:
|
|
# Vault policies to attach in the container
|
|
policies:
|
|
- 'loki[[ .consul.suffix ]]'
|
|
# URL where loki is exposed (if enabled)
|
|
public_url: https://loki.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn it on to expose Loki with Traefik
|
|
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
|
|
enabled: false
|
|
router: loki
|
|
# Retention for logs. Older will be deleted
|
|
retention: 720h # 1 month
|
|
# Custom configuration which will be merged on top of the default one
|
|
custom_config: {}
|
|
prometheus:
|
|
# URL where metrics are available for the metrics proxy (from inside the container PoV)
|
|
metrics_url: http://localhost:3100/metrics
|
|
# Volumes for data persistence. Should be writable for user id 3100
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'loki-data'
|
|
|
|
# Common vector settings
|
|
vector:
|
|
# Version of vector
|
|
version: 0.38.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
|
|
|
|
# Vector aggregator can be used to ingest logs from external device (using syslog or fluentd)
|
|
# Logs will then be forwarded to loki
|
|
aggregator:
|
|
# Number of instances
|
|
count: 1
|
|
# Docker image to use
|
|
image: '[[ .monitoring.vector.image ]]'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 192
|
|
consul:
|
|
connect:
|
|
upstreams:
|
|
# Connect to loki through the service mesh
|
|
- destination_name: 'loki[[ .consul.suffix ]]'
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
vault:
|
|
# Vault policies to attach to the task.
|
|
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
|
|
# but we need to grant the metrics policy to the vector task instead
|
|
policies:
|
|
- metrics[[ .consul.suffix ]]
|
|
# Fluentd source settings
|
|
fluentd:
|
|
enabled: false
|
|
traefik:
|
|
router: fluentd
|
|
entrypoints:
|
|
- fluentd
|
|
# Syslog source settings
|
|
syslog_udp:
|
|
enabled: false
|
|
traefik:
|
|
router: syslog-udp
|
|
proto: udp
|
|
entrypoints:
|
|
- syslog-udp
|
|
# Syslog (tcp) source settings
|
|
syslog_tcp:
|
|
enabled: false
|
|
traefik:
|
|
router: syslog-tcp
|
|
proto: tcp
|
|
entrypoints:
|
|
- syslog-tcp
|
|
# Native vector (http) source settings
|
|
vector:
|
|
enabled: true
|
|
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
|
|
public_url: https://vector.example.org
|
|
traefik:
|
|
# Set to true if you want to expose the service with Traefik
|
|
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
|
|
enabled: false
|
|
|
|
# Grafana settings
|
|
grafana:
|
|
# Grafana version
|
|
version: 10.4.2
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 256
|
|
# If proxy is used
|
|
proxy:
|
|
address: http://grafana[[ .consul.suffix ]]:grafana[[ .consul.suffix ]]@127.0.0.1:3128
|
|
# URL where Grafana is reachable
|
|
public_url: https://grafana.example.org
|
|
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
|
|
# the image if you want to update it
|
|
plugins:
|
|
- grafana-clock-panel
|
|
- grafana-piechart-panel
|
|
- name: grafana-lokiexplore-app
|
|
options:
|
|
pluginUrl: https://storage.googleapis.com/integration-artifacts/grafana-lokiexplore-app/grafana-lokiexplore-app-latest.zip
|
|
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
|
|
# Example:
|
|
# feature_toggles:
|
|
# featureToggleAdminPage: true
|
|
# ssoSettingsApi: true
|
|
feature_toggles: {}
|
|
# Traefik settings
|
|
traefik:
|
|
enabled: true
|
|
router: grafana
|
|
csp:
|
|
img-src: "'self' data: blob: https://grafana.com"
|
|
connect-src: "'self' https://grafana.com"
|
|
consul:
|
|
connect:
|
|
# Connect to postgres, loki and prometheus with the service mesh
|
|
upstreams:
|
|
- destination_name: postgres[[ .consul.suffix ]]
|
|
local_bind_port: 5432
|
|
- destination_name: loki[[ .consul.suffix ]]
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
- destination_name: prometheus[[ .consul.suffix ]]
|
|
local_bind_port: 9090
|
|
config:
|
|
protocol: http
|
|
# Volumes for data persistence
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'grafana-data'
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'grafana[[ .consul.suffix ]]'
|
|
# Postgres DB settings
|
|
database:
|
|
role: grafana
|
|
pgrole: grafana
|
|
# Override some default postgres handling
|
|
postgres:
|
|
database: grafana
|
|
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
|
|
password: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.password }}{{ end }}'
|
|
pooler:
|
|
mode: session
|
|
prometheus:
|
|
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
|
|
metrics_url: http://127.0.0.1:3000/metrics
|
|
|
|
# Agent runs as a system jobs, on all the nodes
|
|
agent:
|
|
consul:
|
|
meta:
|
|
# Override the alloc service meta, the hostname will be more useful than a 0)
|
|
alloc: '${node.unique.name}'
|
|
# Nomad settings
|
|
nomad:
|
|
# Run on all node pools
|
|
node_pool: all
|
|
# Run with an above average priority
|
|
priority: 60
|
|
|
|
# Nomad vector logger is a small container which will query the Nomad API to discover running allocation on the current node
|
|
# Then generate a vector configuration with scraping for all the discovered allocation.
|
|
nomad_vector_logger:
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]nomad-vector-logger:24.5-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 20
|
|
memory: 24
|
|
memory_max: 50
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- nomad-vector-logger[[ .consul.suffix ]]
|
|
|
|
# Vector is the main task. It'll read it's config created by nomad-vector-logger and will read log files
|
|
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
|
|
vector:
|
|
# Docker image to use
|
|
image: '[[ .monitoring.vector.image ]]'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 384
|
|
memory_max: 512
|
|
vault:
|
|
# Vault policies to attach to the container. Vector being able to use mTLS on the metrics endpoint
|
|
# there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
|
|
# a certificate from vault
|
|
policies:
|
|
- metrics
|
|
consul:
|
|
connect:
|
|
upstreams:
|
|
# Connect to loki with the service mesh
|
|
- destination_name: loki[[ .consul.suffix ]]
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
# Volumes for data persistence
|
|
volumes:
|
|
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
|
|
# to read the logs. You should create a host volume in nomad client config of all your nodes. Eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "nomad_alloc" {
|
|
# path = "/opt/nomad/data/alloc"
|
|
# read_only = "true"
|
|
# }
|
|
# }
|
|
nomad:
|
|
type: host
|
|
source: nomad_alloc
|
|
read_only: true
|
|
# The data volume will be used by vector for buffering (in case loki is unavailable)
|
|
# You can create a host volume in Nomad's client config, eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "nomad_alloc" {
|
|
# path = "/data/vector-agent"
|
|
# }
|
|
# }
|
|
data:
|
|
type: host
|
|
source: vector_data
|
|
|
|
# The node exporter can be used to expose the host metrics to prometheus
|
|
node_exporter:
|
|
# Is the node exporter enabled ? (set to false if you don't want it, or if you
|
|
# already manage the node-exporter separatly)
|
|
enabled: true
|
|
# Version of the exporter
|
|
version: 1.8.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 50
|
|
memory: 32
|
|
memory_max: 56
|
|
vault:
|
|
# Vault policies to atatch to the task
|
|
# This exporter can handle mTLS itself, so no need to create a metrics_proxy task, instead, grant the metrics policy
|
|
# So it can get a certificate from vault
|
|
policies:
|
|
- metrics
|
|
# Args to add to the exporter on start
|
|
args:
|
|
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
|
|
# Volumes
|
|
volumes:
|
|
# The exporter should access the host root filesystem
|
|
# For this, you should create a host volume in Nomad's client config, eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "host_root" {
|
|
# path = "/"
|
|
# read_only = true
|
|
# }
|
|
# }
|
|
host:
|
|
type: host
|
|
source: host_root
|
|
read_only: true
|
|
|
|
# Consul agents are not registered as services in the catalog
|
|
# so cannot be discovered. This adds a small nginx proxy which expose metrics of the local consul agent of
|
|
# every node (runs as a system job)
|
|
consul_agent_exporter:
|
|
enabled: true
|
|
image: nginxinc/nginx-unprivileged:alpine
|
|
env: {}
|
|
resources:
|
|
cpu: 10
|
|
memory: 15
|
|
memory_max: 24
|
|
vault:
|
|
policies:
|
|
- metrics
|
|
- cluster-exporter[[ .consul.suffix ]]
|
|
|
|
# Globally enable prometheus for this bundle :-)
prometheus:
  enabled: true