---

# The name of this instance
# Note : it's not supported to run several instances in the same namespace, so generally
# you won't need to change this
instance: monitoring

# General vault settings
vault:
  pki:
    # The path of the PKI used for monitoring
    path: '[[ .prometheus.vault_pki ]]'
    ou: Monitoring
  # Some random secrets to generate
  rand_secrets:
    - path: grafana
      fields:
        - secret_key
        - initial_admin_pwd

monitoring:

  # List of namespaces in which services will be monitored (use * to monitor everything)
  # This might be useful if you run several monitoring instances in different namespaces
  namespaces:
    - '*'

  # Exporters run in their own job (so you can easily assign them
  # to a dedicated node_pool)
  exporters:

    # Number of exporter instances
    count: 1

    # Ping exporter can ping external hosts and expose stats to prometheus
    ping:
      # Version of the exporter to use
      version: 1.1.3
      # Docker image to use
      image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
      # Custom env vars to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 10
        memory: 24
      # List of hosts to ping and for which statistics will be exposed. Eg
      # probes:
      #   - gateway.acme.org
      #   - 10.99.10.1
      probes: []

    # The blackbox exporter can be used to probe external http or tcp services and
    # expose those metrics to prometheus
    blackbox:
      # Version of the exporter
      version: 0.25.0
      # Docker image to use
      image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
      # Custom env vars to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 10
        memory: 32
      # List of tcp probes, eg
      # tcp_probes:
      #   - 10.99.1.1:443
      #   - 10.118.3.13:587
      tcp_probes: []
      # List of http probes, eg
      # http_probes:
      #   - https://id.example.org
      #   - https://portal.acme.com
      http_probes: []

    # Consul exporter will expose consul metrics (mainly the status of registered services)
    consul:
      # Version of the exporter
      version: 0.12.0
      # Docker image to use
      image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-1'
      # Custom env vars to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 48
        memory: 64
      vault:
        # Vault policies to attach
        policies:
          - 'consul-exporter[[ .consul.suffix ]]'

    # The cluster exporter is a simple nginx used as a proxy
    # which handles TLS for the cluster services (vault, consul and nomad)
    cluster:
      # Docker image to use
      image: nginxinc/nginx-unprivileged:alpine
      # Custom env
      env: {}
      # Resource allocation
      resources:
        cpu: 10
        memory: 48
        memory_max: 64
      vault:
        # Vault policies to attach to the task
        policies:
          - 'cluster-exporter[[ .consul.suffix ]]'
          - metrics[[ .consul.suffix ]]
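
  # As an illustration (not part of the defaults), an override enabling a few probes for the
  # exporters above could look like this ; hosts and URLs are the same example values as above :
  # exporters:
  #   ping:
  #     probes:
  #       - gateway.acme.org
  #   blackbox:
  #     http_probes:
  #       - https://id.example.org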

  # The prometheus server
  prometheus:

    # Number of instances to run. Note that if you run several instances, they will be independent, and all of
    # them will scrape the same data. Queries to the prometheus API will then be load balanced between all instances.
    # This should work most of the time, but can give strange results if, eg, one of the instances was down (queries
    # for data during the downtime can give different results depending on the instance your query is routed to)
    count: 1
    # Version of prometheus
    version: 2.51.2
    # Docker image to use
    image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
    # Custom env vars to set
    env: {}
    # Resource allocation
    resources:
      cpu: 200
      memory: 768
      memory_max: 1024
    # Volumes used for data persistence
    # You must create a prometheus-data[0] volume as it's a per_alloc volume
    volumes:
      data:
        type: csi
        source: 'prometheus-data'
        per_alloc: true
    vault:
      # Vault policies to attach to the task
      policies:
        - 'prometheus[[ .consul.suffix ]]'
    # A dict of custom jobs. Eg
    # jobs:
    #   squid:
    #     targets:
    #       - 10.11.2.3:9305
    #       - 192.168.6.20:782
    jobs: {}
    # A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts,
    # or you can provide them as raw content. Eg
    # alert_rules:
    #   postgres:
    #     url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
    #   patroni:
    #     url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
    #   custom:
    #     content: |
    #       groups:
    #         - name: EmbeddedExporter
    #           rules:
    #             - alert: PrometheusJobMissing
    #               expr: 'absent(up{job="prometheus"})'
    #               for: 0m
    #               labels:
    #                 severity: warning
    #               annotations:
    #                 summary: Prometheus job missing (instance {{ $labels.instance }})
    #                 description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    # If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
    # are in /local/rules/ inside the container
    alert_rules: {}
    # The public URL where prometheus will be reachable (if exposed with Traefik)
    public_url: https://prometheus.example.org
    # Traefik settings
    traefik:
      # Turn this on to expose prometheus with Traefik
      # Caution : there's no builtin security, you should configure the appropriate middlewares
      enabled: false
      router: prometheus
    # Metrics retention duration
    retention: 30d
    # Always enable prometheus metrics (of course :-) )
    prometheus:
      # This is the URL where metrics are exposed, which the metrics proxy will point at (from the container PoV)
      metrics_url: http://localhost:9090/metrics

  # AlertManager can process and send alerts
  alertmanager:

    # Number of instances to run. Set > 1 if you want HA
    count: 1
    # Version of alertmanager
    version: 0.27.0
    # Docker image to use
    image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-2'
    # Custom env vars to set in the container
    env: {}
    # Resource allocation
    resources:
      cpu: 50
      memory: 64
      memory_max: 80
    # If a proxy is used
    proxy:
      address: http://alertmanager[[ .consul.suffix ]]:alertmanager[[ .consul.suffix ]]@127.0.0.1:3128
    # URL where the web interface is reachable (if exposed with Traefik)
    public_url: https://alert.example.org
    # Traefik settings
    traefik:
      # Turn this on to expose alertmanager with Traefik
      # Caution : there's no builtin security, you should configure appropriate middlewares before enabling
      enabled: false
      router: alertmanager
    # Volumes used for data persistence. Note : it's a per_alloc volume
    # so you need to create eg alertmanager-data[0].
    # This volume should be writable by the user with ID 9093
    volumes:
      data:
        source: 'alertmanager-data'
        type: csi
        per_alloc: true
    vault:
      # List of vault policies to attach to the task
      policies:
        - metrics[[ .consul.suffix ]]
        - alertmanager[[ .consul.suffix ]]
    # Email settings
    email:
      from: alertmanager@[[ .consul.domain ]]
    # You can merge your own custom config with the default provided one. Eg
    # custom_config:
    #   receivers:
    #     - name: dani
    #       email_configs:
    #         - to: dani@example.org
    #   route:
    #     group_by: ['alertname', 'cluster', 'job']
    #     receiver: dani
    custom_config: {}

  # Loki is the log server
  loki:

    # Version of loki
    version: 3.0.0
    # Docker image to use
    image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
    # Custom env to set in the container
    env: {}
    # Resource allocation
    resources:
      cpu: 150
      memory: 1024
    vault:
      # Vault policies to attach to the task
      policies:
        - 'loki[[ .consul.suffix ]]'
    # URL where loki is exposed (if enabled)
    public_url: https://loki.example.org
    # Traefik settings
    traefik:
      # Turn it on to expose Loki with Traefik
      # Caution : there's no builtin security, you should add appropriate Traefik middlewares
      enabled: false
      router: loki
    # Retention for logs. Older logs will be deleted
    retention: 720h # 1 month
    # Custom configuration which will be merged on top of the default one
    custom_config: {}
    prometheus:
      # URL where metrics are available for the metrics proxy (from inside the container PoV)
      metrics_url: http://localhost:3100/metrics
    # Volumes for data persistence. Should be writable by user id 3100
    volumes:
      data:
        type: csi
        source: 'loki-data'
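
    # As an illustration of the custom_config key above (not part of the defaults), an override
    # tweaking a couple of standard Loki ingestion limits could look like this ; the exact keys
    # depend on your Loki version, and the values are just placeholders :
    # custom_config:
    #   limits_config:
    #     ingestion_rate_mb: 8
    #     ingestion_burst_size_mb: 16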

  # Common vector settings
  vector:

    # Version of vector
    version: 0.38.0
    # Docker image to use
    image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'

    # The vector aggregator can be used to ingest logs from external devices (using syslog or fluentd)
    # Logs will then be forwarded to loki
    aggregator:

      # Number of instances
      count: 1
      # Docker image to use
      image: '[[ .monitoring.vector.image ]]'
      # Custom env to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 100
        memory: 192
      consul:
        connect:
          upstreams:
            # Connect to loki through the service mesh
            - destination_name: 'loki[[ .consul.suffix ]]'
              local_bind_port: 3100
              config:
                protocol: http
      vault:
        # Vault policies to attach to the task.
        # Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task,
        # but we need to grant the metrics policy to the vector task instead
        policies:
          - metrics[[ .consul.suffix ]]
      # Fluentd source settings
      fluentd:
        enabled: false
        traefik:
          router: fluentd
          entrypoints:
            - fluentd
      # Syslog (udp) source settings
      syslog_udp:
        enabled: false
        traefik:
          router: syslog-udp
          proto: udp
          entrypoints:
            - syslog-udp
      # Syslog (tcp) source settings
      syslog_tcp:
        enabled: false
        traefik:
          router: syslog-tcp
          proto: tcp
          entrypoints:
            - syslog-tcp
      # Native vector (http) source settings
      vector:
        enabled: true
      # URL where the vector endpoint is available from the outside (if exposed with Traefik)
      public_url: https://vector.example.org
      traefik:
        # Set to true if you want to expose the service with Traefik
        # Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
        enabled: false

  # Grafana settings
  grafana:

    # Grafana version
    version: 11.0.0
    # Docker image to use
    image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
    # Custom env vars to set in the container
    env: {}
    # Resource allocation
    resources:
      cpu: 100
      memory: 256
    # If a proxy is used
    proxy:
      address: http://grafana[[ .consul.suffix ]]:grafana[[ .consul.suffix ]]@127.0.0.1:3128
    # URL where Grafana is reachable
    public_url: https://grafana.example.org
    # List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
    # the image if you want to update them
    plugins:
      - grafana-clock-panel
      - grafana-piechart-panel
      - name: grafana-lokiexplore-app
        options:
          pluginUrl: https://storage.googleapis.com/integration-artifacts/grafana-lokiexplore-app/grafana-lokiexplore-app-latest.zip
    # Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
    # Example:
    # feature_toggles:
    #   featureToggleAdminPage: true
    #   ssoSettingsApi: true
    feature_toggles: {}
    # Traefik settings
    traefik:
      enabled: true
      router: grafana
      csp:
        img-src: "'self' data: blob: https://grafana.com"
        connect-src: "'self' https://grafana.com"
    consul:
      connect:
        # Connect to postgres, loki and prometheus with the service mesh
        upstreams:
          - destination_name: postgres[[ .consul.suffix ]]
            local_bind_port: 5432
          - destination_name: loki[[ .consul.suffix ]]
            local_bind_port: 3100
            config:
              protocol: http
          - destination_name: prometheus[[ .consul.suffix ]]
            local_bind_port: 9090
            config:
              protocol: http
    # Volumes for data persistence
    volumes:
      data:
        type: csi
        source: 'grafana-data'
    vault:
      # Vault policies to attach to the task
      policies:
        - 'grafana[[ .consul.suffix ]]'
    # Postgres DB settings
    database:
      role: grafana
      pgrole: grafana
    # Override some default postgres handling
    postgres:
      database: grafana
      user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
      password: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.password }}{{ end }}'
      pooler:
        mode: session
    prometheus:
      # URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
      metrics_url: http://127.0.0.1:3000/metrics
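
    # As an illustration (not part of the defaults), the env map above can carry standard GF_*
    # environment variables to tune Grafana ; the keys and values below are just placeholders :
    # env:
    #   GF_USERS_DEFAULT_THEME: light
    #   GF_SMTP_ENABLED: 'true'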

  # The agent runs as a system job, on all the nodes
  agent:

    consul:
      meta:
        # Override the alloc service meta : the hostname is more useful than the alloc index (always 0 for a system job)
        alloc: '${node.unique.name}'

    # Nomad settings
    nomad:
      # Run on all node pools
      node_pool: all
      # Run with an above average priority
      priority: 60

    # How logs are collected
    log_collection:

      # files mode will read log files written by Nomad, and works with any task driver,
      # but can consume a lot of resources (especially with the Docker driver)
      files:
        enabled: true

      # fluentd mode requires the Nomad task drivers (mainly Docker) to be configured to send logs
      # to localhost:4224. If enabled, the vector agent will listen on this port and ingest logs from the Docker daemon directly.
      # To use it, you can configure Nomad like this (note the env list, which allows vector to get the required metadata)
      # plugin "docker" {
      #   config {
      #     logging {
      #       type = "fluentd"
      #       config {
      #         fluentd-address = "127.0.0.1:4224"
      #         fluentd-async   = true
      #         env             = "NOMAD_JOB_NAME,NOMAD_GROUP_NAME,NOMAD_DC,NOMAD_REGION,NOMAD_TASK_NAME,NOMAD_ALLOC_INDEX,NOMAD_ALLOC_ID,NOMAD_NAMESPACE"
      #       }
      #     }
      #   }
      # }
      fluentd:
        enabled: false
        port: 4224
        # When you use fluentd, you can send alloc logs to files, where Nomad would expect them. This allows disabling Nomad's internal log collection
        # while still being able to use Nomad's API to read logs, which can save a lot of resources. Do not turn this on unless you set disable_log_collection.
        # Also, you should disable log_collection.files to prevent log loops !
        create_nomad_logs:
          enabled: false

    # Nomad vector logger is a small container which queries the Nomad API to discover running allocations on the current node,
    # then generates a vector configuration with scraping for all the discovered allocations.
    nomad_vector_logger:
      # Docker image to use
      image: '[[ .docker.repo ]]nomad-vector-logger:24.5-1'
      # Custom env to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 20
        memory: 24
        memory_max: 50
      vault:
        # Vault policies to attach to the task
        policies:
          - nomad-vector-logger[[ .consul.suffix ]]

    # Vector is the main task. It'll read the config created by nomad-vector-logger, read the log files
    # accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
    vector:
      # Docker image to use
      image: '[[ .monitoring.vector.image ]]'
      # Custom env to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 100
        memory: 384
        memory_max: 512
      vault:
        # Vault policies to attach to the container. Vector being able to use mTLS on the metrics endpoint,
        # there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
        # a certificate from vault
        policies:
          - metrics
      consul:
        connect:
          upstreams:
            # Connect to loki with the service mesh
            - destination_name: loki[[ .consul.suffix ]]
              local_bind_port: 3100
              config:
                protocol: http
      # Volumes for data persistence
      volumes:
        # The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
        # to read the logs. You should create a host volume in the Nomad client config of all your nodes. Eg
        # client {
        #   enabled = true
        #   host_volume "nomad_alloc" {
        #     path      = "/opt/nomad/data/alloc"
        #     read_only = "true"
        #   }
        # }
        nomad:
          type: host
          source: nomad_alloc
        # The data volume will be used by vector for buffering (in case loki is unavailable)
        # You can create a host volume in Nomad's client config, eg
        # client {
        #   enabled = true
        #   host_volume "vector_data" {
        #     path = "/data/vector-agent"
        #   }
        # }
        data:
          type: host
          source: vector_data

    # The node exporter can be used to expose the host metrics to prometheus
    node_exporter:
      # Is the node exporter enabled ? (set to false if you don't want it, or if you
      # already manage the node-exporter separately)
      enabled: true
      # Version of the exporter
      version: 1.8.0
      # Docker image to use
      image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
      # Custom env to set in the container
      env: {}
      # Resource allocation
      resources:
        cpu: 50
        memory: 32
        memory_max: 56
      vault:
        # Vault policies to attach to the task
        # This exporter can handle mTLS itself, so there's no need to create a metrics_proxy task. Instead, grant the metrics policy
        # so it can get a certificate from vault
        policies:
          - metrics
      # Args to add to the exporter on start
      args:
        - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
      # Volumes
      volumes:
        # The exporter should access the host root filesystem
        # For this, you should create a host volume in Nomad's client config, eg
        # client {
        #   enabled = true
        #   host_volume "host_root" {
        #     path      = "/"
        #     read_only = true
        #   }
        # }
        host:
          type: host
          source: host_root
          read_only: true

    # Consul agents are not registered as services in the catalog,
    # so they cannot be discovered. This adds a small nginx proxy which exposes the metrics of the local consul agent on
    # every node (runs as a system job)
    consul_agent_exporter:
      enabled: true
      image: nginxinc/nginx-unprivileged:alpine
      env: {}
      resources:
        cpu: 10
        memory: 15
        memory_max: 24
      vault:
        policies:
          - metrics
          - cluster-exporter[[ .consul.suffix ]]

# Globally enable prometheus for this bundle :-)
prometheus:
  enabled: true
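
# As an illustration (not part of the defaults), a minimal site-specific override of this file
# could look like the following, assuming your deployment tooling merges such a file on top of
# these defaults ; the URLs and address are placeholders :
# monitoring:
#   prometheus:
#     public_url: https://prometheus.acme.com
#     traefik:
#       enabled: true
#   grafana:
#     public_url: https://grafana.acme.com
#   alertmanager:
#     email:
#       from: alertmanager@acme.com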