# Default variables for the monitoring bundle
---

# The name of this instance
# Note : it's not supported to run several instances in the same namespace, so generally
# you won't need to change this
instance: monitoring
# General vault settings
vault:
  pki:
    # The path of the PKI used for the monitoring
    path: '[[ .prometheus.vault_pki ]]'
    ou: Monitoring
# Some random secrets to generate
rand_secrets:
  - path: grafana
    fields:
      - secret_key
      - initial_admin_pwd
monitoring:

  # List of namespaces in which services will be monitored (use * to monitor everything)
  # This might be useful if you run several monitoring instances in different namespaces
  namespaces:
    - '*'
# Exporters job will run in its own job (so you can easily assign it
|
|
# to a dedicated node_pool
|
|
exporters:
|
|
|
|
# Number of exporter instances
|
|
count: 1
|
|
|
|
# Ping exporter can ping external hosts and expose stats to prometheus
|
|
ping:
|
|
# Version of the exporter to use
|
|
version: 1.1.3
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]ping-exporter:[[ .monitoring.exporters.ping.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 24
|
|
# List of host to ping and for which statistics will be exposed. Eg
|
|
# probes:
|
|
# - gatway.acme.org
|
|
# - 10.99.10.1
|
|
probes: []
|
|
|
|
# The blackbox exporter can be used to probes external http or tcp services and
|
|
# expose those metrics to prometheus
|
|
blackbox:
|
|
# Version of the exporter
|
|
version: 0.25.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]blackbox-exporter:[[ .monitoring.exporters.blackbox.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 32
|
|
# List of tcp probes, eg
|
|
# tcp_probes:
|
|
# - 10.99.1.1:443
|
|
# - 10.118.3.13:587
|
|
tcp_probes: []
|
|
# List of http probes, eg
|
|
# http_probes:
|
|
# - https://id.example.org
|
|
# - https://portal.acme.com
|
|
http_probes: []
|
|
|
|
# Consul exporter will expose consul metrics (mainly registered services status)
|
|
consul:
|
|
# Version of the exporter
|
|
version: 0.12.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 48
|
|
memory: 64
|
|
vault:
|
|
# Vault policies to attach
|
|
policies:
|
|
- 'consul-exporter[[ .consul.suffix ]]'
|
|
|
|
# The cluster exporter is a simple nginx used as a proxy
|
|
# which handles TLS for the cluster services (vault, consul and nomad)
|
|
cluster:
|
|
# Docker image to use
|
|
image: nginxinc/nginx-unprivileged:alpine
|
|
# Custom env
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 10
|
|
memory: 48
|
|
memory_max: 64
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'cluster-exporter[[ .consul.suffix ]]'
|
|
- metrics[[ .consul.suffix ]]
|
|
|
|
# The prometheus server
|
|
prometheus:
|
|
# Number of instances to run. Note that if you run several instances, they will be independant, and all of
|
|
# them will scrape the same data. Then queries to the prometheus API will be loadbalanced between all instances.
|
|
# This should work most of the time, but can give some strange result if eg, one of the instances was down (queries
|
|
# for data during the downtime can give some random result depending on the instance your query is routed to)
|
|
count: 1
|
|
# Version of prometheus
|
|
version: 2.51.2
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
|
|
# Custom env var to set
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 200
|
|
memory: 768
|
|
memory_max: 1024
|
|
# Volumes used for data persistence
|
|
# You must create a prometheus-data[0] volume as it's a per_alloc volume
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'prometheus-data'
|
|
per_alloc: true
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'prometheus[[ .consul.suffix ]]'
|
|
# A dict of custom jobs. Eg
|
|
# jobs:
|
|
# squid:
|
|
# targets:
|
|
# - 10.11.2.3:9305
|
|
# - 192.168.6.20:782
|
|
jobs: {}
|
|
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts,
|
|
# or you can provide it as raw content. Eg
|
|
# alert_rules:
|
|
# postgres:
|
|
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
|
|
# patroni:
|
|
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
|
|
# custom:
|
|
# content: |
|
|
# groups:
|
|
# - name: EmbeddedExporter
|
|
# rules:
|
|
# - alert: PrometheusJobMissing
|
|
# expr: 'absent(up{job="prometheus"})'
|
|
# for: 0m
|
|
# labels:
|
|
# severity: warning
|
|
# annotations:
|
|
# summary: Prometheus job missing (instance {{ $labels.instance }})
|
|
# description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# If you need something more flexible (like download an archive of rules and uncompress it, you should use artifacts instead. Just ensure your rules
|
|
# are in /local/rules/ inside the container
|
|
alert_rules: {}
|
|
# The public URL where prometheus will be reachable (if exposed with Traefik)
|
|
public_url: https://prometheus.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn this on to expose prometheus with Traefik
|
|
# Caution : there's no builtin security, you should configure the appropriate middlewares
|
|
enabled: false
|
|
router: prometheus
|
|
# Metrics retention duration
|
|
retention: 30d
|
|
# always enable prometheus metrics (of course :-) )
|
|
prometheus:
|
|
# This is the URL where metrics are exposed, where the metrics proxy will point at (from the container PoV)
|
|
metrics_url: http://localhost:9090/metrics
|
|
|
|
# AlertManager can process and send alerts
|
|
alertmanager:
|
|
# Number of instances to run. Set > 1 if you wan HA
|
|
count: 1
|
|
# Version of alertmanager
|
|
version: 0.27.0
|
|
# DOcker image to use
|
|
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-2'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 50
|
|
memory: 64
|
|
memory_max: 80
|
|
# If a proxy is used
|
|
proxy:
|
|
address: http://alertmanager[[ .consul.suffix ]]:alertmanager[[ .consul.suffix ]]@127.0.0.1:3128
|
|
# URL where the web interface is reachable (if exposed with Traefik)
|
|
public_url: https://alert.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn this on to expose alertmanager with traefik
|
|
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
|
|
enabled: false
|
|
router: alertmanager
|
|
# Volumes used for data persistence. Note : it's a per_alloc volume
|
|
# so you need to create eg alertmanager-data[0]. This volume should be writeable by user with ID 9093
|
|
volumes:
|
|
data:
|
|
source: 'alertmanager-data'
|
|
type: csi
|
|
per_alloc: true
|
|
vault:
|
|
# List of vault policies to attach to the task
|
|
policies:
|
|
- metrics[[ .consul.suffix ]]
|
|
- alertmanager[[ .consul.suffix ]]
|
|
# Email settings
|
|
email:
|
|
from: alertmanager@[[ .consul.domain ]]
|
|
# You can merge your own custom config with the default provided one. Eg
|
|
# custom_config:
|
|
# receivers:
|
|
# - name: dani
|
|
# email_configs:
|
|
# - to: dani@example.org
|
|
# route:
|
|
# group_by: ['alertname', 'cluster', 'job']
|
|
# receiver: dani
|
|
custom_config: {}
|
|
|
|
# Loki is the log server
|
|
loki:
|
|
# Version of loki
|
|
version: 3.0.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 150
|
|
memory: 1024
|
|
vault:
|
|
# Vault policies to attach in the container
|
|
policies:
|
|
- 'loki[[ .consul.suffix ]]'
|
|
# URL where loki is exposed (if enabled)
|
|
public_url: https://loki.example.org
|
|
# Traefik settings
|
|
traefik:
|
|
# Turn it on to expose Loki with Traefik
|
|
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
|
|
enabled: false
|
|
router: loki
|
|
# Retention for logs. Older will be deleted
|
|
retention: 720h # 1 month
|
|
# Custom configuration which will be merged on top of the default one
|
|
custom_config: {}
|
|
prometheus:
|
|
# URL where metrics are available for the metrics proxy (from inside the container PoV)
|
|
metrics_url: http://localhost:3100/metrics
|
|
# Volumes for data persistence. Should be writable for user id 3100
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'loki-data'
|
|
|
|
# Common vector settings
|
|
vector:
|
|
# Version of vector
|
|
version: 0.38.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
|
|
|
|
# Vector aggregator can be used to ingest logs from external device (using syslog or fluentd)
|
|
# Logs will then be forwarded to loki
|
|
aggregator:
|
|
# Number of instances
|
|
count: 1
|
|
# Docker image to use
|
|
image: '[[ .monitoring.vector.image ]]'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 192
|
|
consul:
|
|
connect:
|
|
upstreams:
|
|
# Connect to loki through the service mesh
|
|
- destination_name: 'loki[[ .consul.suffix ]]'
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
vault:
|
|
# Vault policies to attach to the task.
|
|
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
|
|
# but we need to grant the metrics policy to the vector task instead
|
|
policies:
|
|
- metrics[[ .consul.suffix ]]
|
|
# Fluentd source settings
|
|
fluentd:
|
|
enabled: false
|
|
traefik:
|
|
router: fluentd
|
|
entrypoints:
|
|
- fluentd
|
|
# Syslog source settings
|
|
syslog_udp:
|
|
enabled: false
|
|
traefik:
|
|
router: syslog-udp
|
|
proto: udp
|
|
entrypoints:
|
|
- syslog-udp
|
|
# Syslog (tcp) source settings
|
|
syslog_tcp:
|
|
enabled: false
|
|
traefik:
|
|
router: syslog-tcp
|
|
proto: tcp
|
|
entrypoints:
|
|
- syslog-tcp
|
|
# Native vector (http) source settings
|
|
vector:
|
|
enabled: true
|
|
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
|
|
public_url: https://vector.example.org
|
|
traefik:
|
|
# Set to true if you want to expose the service with Traefik
|
|
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
|
|
enabled: false
|
|
|
|
# Grafana settings
|
|
grafana:
|
|
# Grafana version
|
|
version: 10.4.2
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
|
|
# Custom env var to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 256
|
|
# If proxy is used
|
|
proxy:
|
|
address: http://grafana[[ .consul.suffix ]]:grafana[[ .consul.suffix ]]@127.0.0.1:3128
|
|
# URL where Grafana is reachable
|
|
public_url: https://grafana.example.org
|
|
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
|
|
# the image if you want to update it
|
|
plugins:
|
|
- grafana-clock-panel
|
|
- grafana-piechart-panel
|
|
- name: grafana-lokiexplore-app
|
|
options:
|
|
pluginUrl: https://storage.googleapis.com/integration-artifacts/grafana-lokiexplore-app/grafana-lokiexplore-app-latest.zip
|
|
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
|
|
# Example:
|
|
# feature_toggles:
|
|
# featureToggleAdminPage: true
|
|
# ssoSettingsApi: true
|
|
feature_toggles: {}
|
|
# Traefik settings
|
|
traefik:
|
|
enabled: true
|
|
router: grafana
|
|
csp:
|
|
img-src: "'self' data: blob: https://grafana.com"
|
|
connect-src: "'self' https://grafana.com"
|
|
consul:
|
|
connect:
|
|
# Connect to postgres, loki and prometheus with the service mesh
|
|
upstreams:
|
|
- destination_name: postgres[[ .consul.suffix ]]
|
|
local_bind_port: 5432
|
|
- destination_name: loki[[ .consul.suffix ]]
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
- destination_name: prometheus[[ .consul.suffix ]]
|
|
local_bind_port: 9090
|
|
config:
|
|
protocol: http
|
|
# Volumes for data persistence
|
|
volumes:
|
|
data:
|
|
type: csi
|
|
source: 'grafana-data'
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- 'grafana[[ .consul.suffix ]]'
|
|
# Postgres DB settings
|
|
database:
|
|
role: grafana
|
|
pgrole: grafana
|
|
# Override some default postgres handling
|
|
postgres:
|
|
database: grafana
|
|
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
|
|
password: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.password }}{{ end }}'
|
|
pooler:
|
|
mode: session
|
|
prometheus:
|
|
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
|
|
metrics_url: http://127.0.0.1:3000/metrics
|
|
|
|
# Agent runs as a system jobs, on all the nodes
|
|
agent:
|
|
consul:
|
|
meta:
|
|
# Override the alloc service meta, the hostname will be more useful than a 0)
|
|
alloc: '${node.unique.name}'
|
|
# Nomad settings
|
|
nomad:
|
|
# Run on all node pools
|
|
node_pool: all
|
|
# Run with an above average priority
|
|
priority: 60
|
|
|
|
# Nomad vector logger is a small container which will query the Nomad API to discover running allocation on the current node
|
|
# Then generate a vector configuration with scraping for all the discovered allocation.
|
|
nomad_vector_logger:
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]nomad-vector-logger:24.5-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 20
|
|
memory: 24
|
|
memory_max: 50
|
|
vault:
|
|
# Vault policies to attach to the task
|
|
policies:
|
|
- nomad-vector-logger[[ .consul.suffix ]]
|
|
|
|
# Vector is the main task. It'll read it's config created by nomad-vector-logger and will read log files
|
|
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
|
|
vector:
|
|
# Docker image to use
|
|
image: '[[ .monitoring.vector.image ]]'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 100
|
|
memory: 384
|
|
memory_max: 512
|
|
vault:
|
|
# Vault policies to attach to the container. Vector being able to use mTLS on the metrics endpoint
|
|
# there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
|
|
# a certificate from vault
|
|
policies:
|
|
- metrics
|
|
consul:
|
|
connect:
|
|
upstreams:
|
|
# Connect to loki with the service mesh
|
|
- destination_name: loki[[ .consul.suffix ]]
|
|
local_bind_port: 3100
|
|
config:
|
|
protocol: http
|
|
# Volumes for data persistence
|
|
volumes:
|
|
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
|
|
# to read the logs. You should create a host volume in nomad client config of all your nodes. Eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "nomad_alloc" {
|
|
# path = "/opt/nomad/data/alloc"
|
|
# read_only = "true"
|
|
# }
|
|
# }
|
|
nomad:
|
|
type: host
|
|
source: nomad_alloc
|
|
read_only: true
|
|
# The data volume will be used by vector for buffering (in case loki is unavailable)
|
|
# You can create a host volume in Nomad's client config, eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "nomad_alloc" {
|
|
# path = "/data/vector-agent"
|
|
# }
|
|
# }
|
|
data:
|
|
type: host
|
|
source: vector_data
|
|
|
|
# The node exporter can be used to expose the host metrics to prometheus
|
|
node_exporter:
|
|
# Is the node exporter enabled ? (set to false if you don't want it, or if you
|
|
# already manage the node-exporter separatly)
|
|
enabled: true
|
|
# Version of the exporter
|
|
version: 1.8.0
|
|
# Docker image to use
|
|
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
|
|
# Custom env to set in the container
|
|
env: {}
|
|
# Resource allocation
|
|
resources:
|
|
cpu: 50
|
|
memory: 32
|
|
memory_max: 56
|
|
vault:
|
|
# Vault policies to atatch to the task
|
|
# This exporter can handle mTLS itself, so no need to create a metrics_proxy task, instead, grant the metrics policy
|
|
# So it can get a certificate from vault
|
|
policies:
|
|
- metrics
|
|
# Args to add to the exporter on start
|
|
args:
|
|
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
|
|
# Volumes
|
|
volumes:
|
|
# The exporter should access the host root filesystem
|
|
# For this, you should create a host volume in Nomad's client config, eg
|
|
# client {
|
|
# enabled = true
|
|
# host_volume "host_root" {
|
|
# path = "/"
|
|
# read_only = true
|
|
# }
|
|
# }
|
|
host:
|
|
type: host
|
|
source: host_root
|
|
read_only: true
|
|
|
|
# Consul agents are not registered as services in the catalog
|
|
# so cannot be discovered. This adds a small nginx proxy which expose metrics of the local consul agent of
|
|
# every node (runs as a system job)
|
|
consul_agent_exporter:
|
|
enabled: true
|
|
image: nginxinc/nginx-unprivileged:alpine
|
|
env: {}
|
|
resources:
|
|
cpu: 10
|
|
memory: 15
|
|
memory_max: 24
|
|
vault:
|
|
policies:
|
|
- metrics
|
|
- cluster-exporter[[ .consul.suffix ]]
|
|
|
|
# Globally enable prometheus for this bundle :-)
prometheus:
  enabled: true