Various cleanup

Daniel Berteaud 2024-03-25 22:23:31 +01:00
parent f954afc251
commit 2ae2a91002
18 changed files with 1281 additions and 196 deletions

View File

@ -1,7 +1,9 @@
job "[[ .instance ]]-agent" {
[[- $c := merge .monitoring.agent .monitoring . ]]
[[ template "common/job_start" $c ]]
type = "system"
# This group will collect logs from the allocations running on the node
@ -39,16 +41,16 @@ job "[[ .instance ]]-agent" {
user = 3987
config {
image = "[[ $n.image ]]"
image = "[[ $n.image ]]"
readonly_rootfs = true
pids_limit = 50
pids_limit = 50
# Nomad Vector Logger needs to run on the host's network namespace
# so it can reach the Nomad Agent API on localhost:4646
network_mode = "host"
# Host network namespace requires disabling user namespace
userns_mode = "host"
command = "nomad-vector-logger"
args = [
command = "nomad-vector-logger"
args = [
"--config",
"/local/nomad-vector-logger.toml"
]
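Note: as a quick sanity check for the host-network requirement above, a minimal sketch (port and endpoint are the Nomad defaults, not taken from this repo) to confirm the agent API answers on localhost:4646 from the host network namespace:

  # /v1/agent/self is part of the standard Nomad HTTP API; this only works from
  # the host network namespace (or wherever the agent actually listens).
  curl -s http://localhost:4646/v1/agent/self | head -c 300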
@ -85,9 +87,9 @@ _EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
# Disable the default nomad.toml template, as we provide our own nomad.yml template
template {
data = "# Disable the default toml template"
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
}
@ -143,11 +145,13 @@ _EOT
driver = "[[ $c.nomad.driver ]]"
config {
image = "busybox:latest"
command = "sh"
args = [
image = "busybox:latest"
readonly_rootfs = true
pids_limit = 20
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
]
}
@ -170,9 +174,11 @@ _EOT
leader = true
config {
image = "[[ $c.image ]]"
userns_mode = "host"
args = [
image = "[[ $c.image ]]"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 200
args = [
"--watch-config",
"--config", "/local/vector.yml",
"--config-dir", "/alloc/data/vector_conf"
@ -186,7 +192,9 @@ _EOT
}
[[ template "common/metrics_cert" $c ]]
[[ template "common/artifacts" $c ]]
# Main vector configuration
template {
data =<<_EOT
[[ template "monitoring/agent/vector.yml" $c ]]
@ -217,6 +225,8 @@ _EOT
}
}
[[- if .monitoring.agent.node_exporter.enabled ]]
# This group runs the prometheus node-exporter to expose prometheus metrics from the node
group "node-exporter" {
@ -238,21 +248,25 @@ _EOT
driver = "[[ $c.nomad.driver ]]"
config {
image = "[[ $c.image ]]"
pid_mode = "host"
#network_mode = "host"
userns_mode = "host"
image = "[[ $c.image ]]"
pid_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
args = [
pids_limit = 50
command = "/usr/local/bin/node_exporter"
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
[[- range $arg := $c.args ]]
"[[ $arg ]]",
[[- end ]]
]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/metrics_cert" $c ]]
[[ template "common/artifacts" $c ]]
template {
data = <<_EOT
@ -271,4 +285,5 @@ _EOT
[[ template "common/resources" $c ]]
}
}
[[- end ]]
}

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "vector-aggregator[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,16 @@
[[- $c := merge .monitoring.aggregator .monitoring . -]]
Kind = "service-intentions"
Name = "vector-aggregator[[ .consul.suffix ]]"
Sources = [
{
Name = "[[ $c.traefik.instance ]]"
Permissions = [
{
Action = "[[ $c.traefik.enabled | ternary "allow" "deny" ]]"
HTTP {
Methods = ["POST"]
}
}
]
}
]
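Note: a hedged sketch of how these config entries could be applied and verified with the Consul CLI (file names are hypothetical, and "[[ .consul.suffix ]]" is assumed to be rendered away beforehand):

  # Apply the service-defaults and service-intentions entries
  consul config write service-defaults.hcl
  consul config write service-intentions.hcl
  # Read back the intentions to confirm the allow/deny action
  consul config read -kind service-intentions -name vector-aggregator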

View File

Binary file not shown.

View File

@ -1,8 +1,11 @@
job "monitoring-agent" {
datacenters = ["dc1"]
region = "global"
node_pool = "all"
priority = 60
type = "system"
@ -161,7 +164,7 @@ _EOT
destination = "local/nomad-vector-logger.toml"
}
# Disable the default nomad.toml template
# Disable the default nomad.toml template, as we provide our own nomad.yml template
template {
data = "# Disable the default toml template"
destination = "local/template/nomad.toml"
@ -184,6 +187,8 @@ sources:
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
ignore_older_secs: 1800
oldest_first: true
{{- end }}
@ -262,11 +267,13 @@ _EOT
driver = "docker"
config {
image = "busybox:latest"
command = "sh"
image = "busybox:latest"
readonly_rootfs = true
pids_limit = 20
command = "sh"
args = [
"-c",
"echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
"echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done"
]
}
@ -289,8 +296,10 @@ _EOT
leader = true
config {
image = "danielberteaud/vector:0.36.1-1"
userns_mode = "host"
image = "danielberteaud/vector:0.36.1-1"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 200
args = [
"--watch-config",
"--config", "/local/vector.yml",
@ -331,6 +340,9 @@ _EOT
}
# Main vector configuration
template {
data = <<_EOT
data_dir: /data
@ -398,8 +410,8 @@ _EOT
resources {
cpu = 100
memory = 192
memory_max = 384
memory = 384
memory_max = 512
}
}
@ -436,16 +448,17 @@ _EOT
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:latest"
pid_mode = "host"
#network_mode = "host"
image = "danielberteaud/node-exporter:1.7.0-1"
pid_mode = "host"
userns_mode = "host"
readonly_rootfs = true
pids_limit = 50
command = "/usr/local/bin/node_exporter"
args = [
"--path.rootfs=/host",
"--web.config.file=/local/tls.yml",
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}"
"--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}",
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)",
]
}
@ -477,6 +490,8 @@ _EOT
}
template {
data = <<_EOT
tls_server_config:

View File

@ -0,0 +1,3 @@
Kind = "service-defaults"
Name = "vector-aggregator"
Protocol = "http"

View File

@ -0,0 +1,15 @@
Kind = "service-intentions"
Name = "vector-aggregator"
Sources = [
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
Methods = ["POST"]
}
}
]
}
]

View File

@ -411,7 +411,7 @@ _EOT
resources {
cpu = 10
memory = 15
memory = 20
}
}

View File

@ -0,0 +1,24 @@
FROM danielberteaud/alpine:24.3-1 AS builder
ARG EXPORTER_VERSION=1.7.0
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -euxo pipefail &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
FROM danielberteaud/alpine:24.3-1
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
CMD ["/usr/local/bin/node_exporter"]
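Note: a minimal smoke test of the resulting image, assuming a local build tag of node-exporter:1.7.0-1 (node_exporter listens on :9100 by default):

  docker build -t node-exporter:1.7.0-1 .
  docker run -d --rm --name ne-test -p 9100:9100 node-exporter:1.7.0-1
  sleep 2
  # Without --path.rootfs the exporter reports the container's own view, which is
  # enough to confirm the binary runs and serves metrics.
  curl -s http://localhost:9100/metrics | head
  docker stop ne-test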

View File

@ -5,7 +5,7 @@ job "monitoring-services" {
region = "global"
# Metrics is running prometheus and various exporters
# Metrics is running prometheus
group "metrics-server" {
shutdown_delay = "6s"
@ -67,7 +67,7 @@ job "monitoring-services" {
type = "http"
expose = true
path = "/-/healthy"
interval = "15s"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
@ -77,11 +77,6 @@ job "monitoring-services" {
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-prometheus.entrypoints=https",
"traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)",
"traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus",
]
}
@ -892,6 +887,410 @@ _EOT
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/loki.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
template {
data = <<_EOT
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostSwapIsFillingUp
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host swap is filling up (instance {{ $labels.instance }})
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
_EOT
destination = "local/rules/node.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
}
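Note: once these templates are rendered (the {{{ / }}} delimiters keep the {{ $labels }} / {{ $value }} placeholders intact for Prometheus), the rule files can be validated with promtool, assuming it is available in the task or on a workstation:

  promtool check rules local/rules/loki.yml local/rules/node.yml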
# A client cert, to connect to the AlertManager API
template {
@ -945,8 +1344,11 @@ _EOT
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
@ -1031,101 +1433,10 @@ _EOT
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-alertmanager.entrypoints=https",
"traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)",
"traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9093/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
@ -1166,10 +1477,11 @@ _EOT
template {
data = <<_EOT
# UnTLS for the web API
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
@ -1180,10 +1492,66 @@ server {
}
}
# Metrics proxy
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul;
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
}
}
_EOT
destination = "local/alertmanager.conf"
}
# Get a certificate from vault to protect the metrics endpoint
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
destination = "secrets/metrics.bundle.pem"
}
# Get the root CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
# Certificate used by AlertManager
template {
data = <<_EOT
@ -1203,14 +1571,6 @@ _EOT
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
resources {
cpu = 10
memory = 18
@ -1300,7 +1660,7 @@ set -euo pipefail
exec alertmanager \
--config.file=/secrets/alertmanager.yml \
--storage.path=/data \
--web.external-url=https://alerte.example.org \
--web.external-url=https://alert.example.org \
--web.route-prefix=/ \
--web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \
--cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \
@ -1430,11 +1790,6 @@ _EOT
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-loki.entrypoints=https",
"traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)",
"traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki",
]
}
@ -2048,7 +2403,7 @@ server {
return 405;
}
location /metrics {
proxy_pass http://localhost:3000/metrics;
proxy_pass http://127.0.0.1:3000/metrics;
}
}
_EOT
@ -2132,7 +2487,6 @@ _EOT
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
@ -2142,6 +2496,15 @@ _EOT
}
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
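Note: to confirm the secret this template reads actually exists, a hedged one-liner with the Vault CLI (assuming a KV v2 engine mounted at kv/ and a token with read access):

  vault kv get -field=initial_admin_pwd kv/service/monitoring/grafana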
# Basic grafana configuration file
template {
data = <<_EOT

View File

@ -0,0 +1,24 @@
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG EXPORTER_VERSION=[[ .monitoring.agent.node_exporter.version ]]
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp
ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp
RUN set -euxo pipefail &&\
apk --no-cache add \
curl \
tar \
ca-certificates \
&&\
cd /tmp &&\
grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\
tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\
mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
MAINTAINER [[ .docker.maintainer ]]
COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter
CMD ["/usr/local/bin/node_exporter"]

View File

@ -2,7 +2,7 @@ job "[[ .instance ]]-services" {
[[ template "common/job_start" . ]]
# Metrics is running prometheus and various exporters
# Metrics is running prometheus
group "metrics-server" {
[[- $c := merge .monitoring.prometheus .monitoring . ]]
@ -28,7 +28,7 @@ job "[[ .instance ]]-services" {
type = "http"
expose = true
path = "/-/healthy"
interval = "15s"
interval = "20s"
timeout = "8s"
check_restart {
limit = 10
@ -168,8 +168,11 @@ _EOT
network {
mode = "bridge"
# Port exposing the web API, with mTLS
port "web-tls" {}
# Port used for gossip between the different alertmanager instances
port "cluster" {}
# Port to expose metrics to prometheus
port "metrics" {}
}
@ -220,8 +223,6 @@ _EOT
]
}
[[ template "common/task.metrics_proxy" $c ]]
# This task will handle mTLS to the AlertManager API
# And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy
task "untls-proxy" {
@ -253,6 +254,8 @@ _EOT
destination = "local/alertmanager.conf"
}
[[ template "common/metrics_cert" $c ]]
# Certificate used by AlertManager
template {
data = <<_EOT
@ -272,14 +275,6 @@ _EOT
change_signal = "SIGHUP"
}
# The trusted CA
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
resources {
cpu = 10
memory = 18
@ -617,6 +612,15 @@ _EOT
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
template {
data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
_EOT
destination = "secrets/.grafana.env"
perms = 400
env = true
}
# Basic grafana configuration file
template {
data = <<_EOT

View File

@ -12,6 +12,8 @@ sources:
mode: continue_through
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
timeout_ms: 1000
ignore_older_secs: 1800
oldest_first: true
{{- end }}
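Note: since this snippet is merged into the generated vector configuration, a sketch of how the final config could be checked before deployment (the paths are the ones used in the job above and may differ locally):

  vector validate /local/vector.yml /alloc/data/vector_conf/nomad.yml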

View File

@ -1,7 +1,8 @@
# UnTLS for the web API
server {
listen 127.0.0.1:9093;
location / {
proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
@ -11,3 +12,39 @@ server {
deny all;
}
}
# Metrics proxy
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_ssl_certificate /secrets/alertmanager.bundle.pem;
proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem;
proxy_ssl_verify on;
proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.[[ .instance ]].[[ .consul.domain ]];
proxy_ssl_trusted_certificate /local/monitoring.ca.pem;
proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }};
}
}
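Note: because ssl_verify_client is on, the metrics endpoint only answers to clients presenting a certificate signed by the monitoring CA. A hedged curl sketch (certificate paths, host and port are placeholders):

  curl --cacert monitoring.ca.pem \
       --cert metrics-client.pem --key metrics-client.key \
       https://HOST:METRICS_PORT/metrics | head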

View File

@ -0,0 +1,41 @@
groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -0,0 +1,347 @@
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostSwapIsFillingUp
# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host swap is filling up (instance {{ $labels.instance }})
# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -76,179 +76,284 @@ monitoring:
# - https://portal.acme.com
http_probes: []
# Consul exporter will expose consul metrics
    # Consul exporter will expose consul metrics (mainly the status of registered services)
consul:
# Version of the exporter
version: 0.11.0
# Docker image to use
image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 32
vault:
# Vault policies to attach
policies:
- 'consul-exporter[[ .consul.suffix ]]'
# The cluster exporter is a simple nginx used as a proxy
# which handles TLS for the cluster services (vault, consul and nomad)
cluster:
# Docker image to use
image: nginxinc/nginx-unprivileged:alpine
# Custom env
env: {}
# Resource allocation
resources:
cpu: 10
memory: 15
memory: 20
vault:
# Vault policies to attach to the task
policies:
- 'cluster-exporter[[ .consul.suffix ]]'
- metrics
- metrics[[ .consul.suffix ]]
# The prometheus server
prometheus:
version: 2.51.0
    # Number of instances to run. Note that if you run several instances, they will be independent, and all of
    # them will scrape the same data. Queries to the prometheus API will then be load balanced between all instances.
    # This should work most of the time, but can give some odd results if, eg, one of the instances was down (queries
    # for data covering the downtime can return different results depending on which instance your query is routed to)
count: 1
# Version of prometheus
version: 2.51.0
# Docker image to use
image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1'
# Custom env var to set
env: {}
# Resource allocation
resources:
cpu: 200
memory: 512
# Volumes used for data persistence
# You must create a prometheus-data[0] volume as it's a per_alloc volume
volumes:
data:
type: csi
source: 'prometheus-data'
per_alloc: true
vault:
# Vault policies to attach to the task
policies:
- 'prometheus[[ .consul.suffix ]]'
# A dict of custom jobs. Eg
# jobs:
# squid:
# targets:
# - 10.11.2.3:9305
# - 192.168.6.20:782
jobs: {}
alert_rules: {}
# A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg
# alert_rules:
# postgres:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
# patroni:
# url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
    # If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead. Just ensure your rules
# are in /local/rules/ inside the container
alert_rules: {}
# The public URL where prometheus will be reachable (if exposed with Traefik)
public_url: https://prometheus.example.org
# Traefik settings
traefik:
enabled: true
# Turn this on to expose prometheus with Traefik
# Caution : there's no builtin security, you should configure the appropriate middlewares
enabled: false
router: prometheus
# Metrics retention duration
retention: 30d
# always enable prometheus metrics (of course :-) )
prometheus:
enabled: true
      # This is the URL where metrics are exposed, and which the metrics proxy will point at (from the container PoV)
metrics_url: http://localhost:9090/metrics
# AlertManager can process and send alerts
alertmanager:
    # Number of instances to run. Set > 1 if you want HA
count: 1
# Version of alertmanager
version: 0.27.0
    # Docker image to use
image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 64
memory_max: 80
public_url: https://alerte.example.org
# URL where the web interface is reachable (if exposed with Traefik)
public_url: https://alert.example.org
# Traefik settings
traefik:
enabled: true
# Turn this on to expose alertmanager with traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling
enabled: false
router: alertmanager
# No need to strip prefix as alertmanager will be configured to handle it
strip_prefix: false
# Volumes used for data persistence. Note : it's a per_alloc volume
    # so you need to create eg alertmanager-data[0]. This volume should be writable by the user with ID 9093
volumes:
data:
source: 'alertmanager-data'
type: csi
per_alloc: true
prometheus:
metrics_url: http://127.0.0.1:9093/metrics
vault:
# List of vault policies to attach to the task
policies:
- metrics
- 'alertmanager[[ .consul.suffix ]]'
- metrics[[ .consul.suffix ]]
- alertmanager[[ .consul.suffix ]]
# Email settings
email:
from: alertmanager@[[ .consul.domain ]]
# You can merge your own custom config with the default provided one. Eg
# custom_config:
# receivers:
# - name: dani
# email_configs:
# - to: dani@example.org
# route:
# group_by: ['alertname', 'cluster', 'job']
# receiver: dani
custom_config: {}
# Loki is the log server
loki:
# Version of loki
version: 2.9.6
# Docker image to use
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 150
memory: 512
vault:
      # Vault policies to attach to the task
policies:
- 'loki[[ .consul.suffix ]]'
# URL where loki is exposed (if enabled)
public_url: https://loki.example.org
# Traefik settings
traefik:
# Turn it on to expose Loki with Traefik
# Caution : there's no builtin security, you should add appropriate Traefik middlewares
enabled: false
router: loki
    # Retention for logs. Older logs will be deleted
retention: 720h # 1 month
# Custom configuration which will be merged on top of the default one
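    # Eg, a minimal sketch (assuming you want to raise the per-tenant ingestion limits; the keys
    # below follow Loki's own configuration format and are only an illustration):
    # custom_config:
    #   limits_config:
    #     ingestion_rate_mb: 8
    #     ingestion_burst_size_mb: 16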
custom_config: {}
prometheus:
# URL where metrics are available for the metrics proxy (from inside the container PoV)
metrics_url: http://localhost:3100/metrics
    # Volumes for data persistence. Should be writable by user ID 3100
volumes:
data:
type: csi
source: 'loki-data'
# Common vector settings
vector:
# Version of vector
version: 0.36.1
# Docker image to use
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
    # Vector aggregator can be used to ingest logs from external devices (using syslog or fluentd)
# Logs will then be forwarded to loki
aggregator:
# Number of instances
count: 1
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 192
consul:
connect:
upstreams:
# Connect to loki through the service mesh
- destination_name: 'loki[[ .consul.suffix ]]'
local_bind_port: 3100
vault:
# Vault policies to attach to the task.
# Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task
# but we need to grant the metrics policy to the vector task instead
policies:
- metrics[[ .consul.suffix ]]
# Fluentd source settings
fluentd:
enabled: false
traefik:
router: fluentd
entrypoints:
- fluentd
# Syslog source settings
syslog_udp:
enabled: false
traefik:
router: syslog-udp
entrypoints:
- syslog
- syslog-udp
# Syslog (tcp) source settings
syslog_tcp:
enabled: false
traefik:
router: syslog-tcp
entrypoints:
- syslog-tcp
# Native vector (http) source settings
vector:
enabled: true
# URL where the vector endpoint is available from the outside (if exposed with Traefik)
public_url: https://vector.example.org
traefik:
# Set to true if you want to expose the service with Traefik
# Caution : there's no builtin security, you should configure appropriate middlewares before enabling it
enabled: false
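      # As an illustration, an external Vector agent could then ship its logs to this aggregator
      # with a sink like the following in its own vector.yml (a sketch only: the sink name, inputs
      # and TLS settings are assumptions and depend on how the endpoint is actually exposed):
      # sinks:
      #   to_aggregator:
      #     type: vector
      #     inputs: ['my_logs']
      #     address: 'vector.example.org:443'
      #     tls:
      #       enabled: true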
# Grafana settings
grafana:
# Grafana version
version: 10.4.1
# Docker image to use
image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1'
env:
GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}'
# Custom env var to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 256
# URL where Grafana is reachable
public_url: https://grafana.example.org
# List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild
    # the image if you want to update them
plugins:
#- alexanderzobnin-zabbix-app
#- ddurieux-glpi-app
- grafana-clock-panel
- grafana-piechart-panel
# Dict of feature toggles. See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/
# Example:
# feature_toggles:
# featureToggleAdminPage: true
# ssoSettingsApi: true
feature_toggles: {}
# Traefik settings
traefik:
enabled: true
router: grafana
# No need to strip prefix as Grafana will be configured to handle it correctly
strip_prefix: false
consul:
connect:
# Connect to postgres, loki and prometheus with the service mesh
upstreams:
- destination_name: postgres[[ .consul.suffix ]]
local_bind_port: 5432
@ -256,16 +361,20 @@ monitoring:
local_bind_port: 3100
- destination_name: prometheus[[ .consul.suffix ]]
local_bind_port: 9090
# Volumes for data persistence
volumes:
data:
type: csi
source: 'grafana-data'
vault:
# Vault policies to attach to the task
policies:
- 'grafana[[ .consul.suffix ]]'
# Postgres DB settings
database:
role: grafana
pgrole: grafana
# Override some default postgres handling
postgres:
database: grafana
user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}'
@ -273,64 +382,131 @@ monitoring:
pooler:
mode: session
prometheus:
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
# URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV)
metrics_url: http://127.0.0.1:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
  # Agent runs as a system job, on all the nodes
agent:
consul:
meta:
        # Override the alloc service meta: the node hostname is more useful than the alloc index (always 0 for a system job)
alloc: '${node.unique.name}'
# Nomad settings
nomad:
# Run on all node pools
node_pool: all
# Run with an above average priority
priority: 60
    # Nomad vector logger is a small container which will query the Nomad API to discover running allocations on the current node,
    # then generate a vector configuration with log collection for all the discovered allocations.
nomad_vector_logger:
version: 24.3
image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2'
# Docker image to use
image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 20
memory: 24
memory_max: 50
vault:
# Vault policies to attach to the task
policies:
- nomad-vector-logger[[ .consul.suffix ]]
    # Vector is the main task. It'll read its config created by nomad-vector-logger and will read log files
# accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki
vector:
# Docker image to use
image: '[[ .monitoring.vector.image ]]'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 100
memory: 192
memory_max: 384
memory: 384
memory_max: 512
vault:
        # Vault policies to attach to the container. As Vector can natively use mTLS on its metrics endpoint,
# there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get
# a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
consul:
connect:
upstreams:
# Connect to loki with the service mesh
- destination_name: loki[[ .consul.suffix ]]
local_bind_port: 3100
# Volumes for data persistence
volumes:
# The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able
        # to read the logs. You should create a host volume in the Nomad client config of all your nodes. Eg
# client {
# enabled = true
# host_volume "nomad_alloc" {
# path = "/opt/nomad/data/alloc"
# read_only = "true"
# }
# }
nomad:
type: host
source: nomad_alloc
read_only: true
# The data volume will be used by vector for buffering (in case loki is unavailable)
# You can create a host volume in Nomad's client config, eg
# client {
# enabled = true
        #   host_volume "vector_data" {
# path = "/data/vector-agent"
# }
# }
data:
type: host
source: vector_data
# The node exporter can be used to expose the host metrics to prometheus
node_exporter:
image: quay.io/prometheus/node-exporter:latest
      # Is the node exporter enabled? (set to false if you don't want it, or if you
      # already manage the node-exporter separately)
enabled: true
# Version of the exporter
version: 1.7.0
# Docker image to use
image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1'
# Custom env to set in the container
env: {}
# Resource allocation
resources:
cpu: 50
memory: 24
memory_max: 32
vault:
        # Vault policies to attach to the task
        # This exporter can handle mTLS itself, so there's no need to create a metrics_proxy task. Instead, grant the metrics policy
        # so it can get a certificate from vault
policies:
- metrics[[ .consul.suffix ]]
# Args to add to the exporter on start
args:
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)'
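      # More args can be appended here if needed. For example (hypothetical, assuming textfile
      # metrics are dropped on the host under /var/lib/node_exporter, visible as /host/... through
      # the host_root volume below), the textfile collector directory could be set with:
      # - '--collector.textfile.directory=/host/var/lib/node_exporter'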
# Volumes
volumes:
# The exporter should access the host root filesystem
# For this, you should create a host volume in Nomad's client config, eg
# client {
# enabled = true
# host_volume "host_root" {
# path = "/"
# read_only = true
# }
# }
host:
type: host
source: host_root
read_only: true
# Globally enable prometheus for this bundle :-)
prometheus:
enabled: true