job "monitoring-services" { datacenters = ["dc1"] region = "global" # Metrics is running prometheus group "metrics-server" { shutdown_delay = "6s" count = 1 network { mode = "bridge" port "metrics" {} } volume "data" { source = "prometheus-data" type = "csi" access_mode = "single-node-writer" attachment_mode = "file-system" per_alloc = true } service { name = "prometheus" port = 9090 meta { metrics-port = "${NOMAD_HOST_PORT_metrics}" alloc = "${NOMAD_ALLOC_INDEX}" datacenter = "${NOMAD_DC}" group = "${NOMAD_GROUP_NAME}" job = "${NOMAD_JOB_NAME}" namespace = "${NOMAD_NAMESPACE}" node = "${node.unique.name}" region = "${NOMAD_REGION}" } connect { sidecar_service { } sidecar_task { config { args = [ "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", "-l", "${meta.connect.log_level}", "--concurrency", "${meta.connect.proxy_concurrency}", "--disable-hot-restart" ] } resources { cpu = 50 memory = 64 } } } check { name = "health" type = "http" expose = true path = "/-/healthy" interval = "20s" timeout = "8s" check_restart { limit = 10 grace = "5m" } } tags = [ ] } # The prometheus metrics proxy, adding mTLS to the metrics endpoint task "metrics-proxy" { driver = "docker" user = 8995 config { image = "nginxinc/nginx-unprivileged:alpine" force_pull = true volumes = [ "local/default.conf:/etc/nginx/conf.d/default.conf:ro" ] pids_limit = 100 } lifecycle { hook = "poststart" sidecar = true } vault { policies = ["metrics"] } # Get a certificate from vault to protect the metrics endpoint template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} {{ .Cert }} {{ .Key }} {{- end }} _EOT destination = "secrets/metrics.bundle.pem" } # Get the root CA template { data = <<_EOT {{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" } template { data = <<_EOT server { listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; http2 on; ssl_certificate /secrets/metrics.bundle.pem; ssl_certificate_key /secrets/metrics.bundle.pem; ssl_client_certificate /local/monitoring.ca.pem; ssl_verify_client on; ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; ssl_session_cache shared:SSL:10m; ssl_session_timeout 1h; ssl_session_tickets off; gzip on; gzip_types text/plain; gzip_vary on; server_tokens off; if ($request_method !~ ^(GET|HEAD)$ ) { return 405; } location /metrics { proxy_pass http://localhost:9090/metrics; } } _EOT destination = "local/default.conf" } resources { cpu = 10 memory = 10 memory_max = 20 } } # The main prometheus task task "prometheus" { driver = "docker" leader = true config { image = "danielberteaud/prometheus:2.51.1-1" readonly_rootfs = true pids_limit = 200 command = "prometheus" args = [ "--config.file=/local/prometheus.yml", "--log.level=info", "--web.listen-address=127.0.0.1:9090", "--storage.tsdb.path=/data", "--storage.tsdb.retention.time=30d", "--web.console.libraries=/opt/prometheus/console_libraries", "--web.console.templates=/opt/prometheus/consoles", "--web.external-url=https://prometheus.example.org", "--web.route-prefix=/" ] } vault { policies = ["prometheus"] env = false disable_file = true change_mode = "noop" } # Main configuration for prometheus template { data = <<_EOT global: scrape_interval: 15s evaluation_interval: 15s #query_log_file: 
/dev/stdout external_labels: cluster: consul env: default rule_files: - /local/rules/*.yml alerting: alertmanagers: - scheme: https tls_config: ca_file: /local/monitoring.ca.pem cert_file: /secrets/prometheus.bundle.pem key_file: /secrets/prometheus.bundle.pem consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }} datacenter: dc1 relabel_configs: # Only keep alertmanagers - source_labels: [__meta_consul_service] action: keep regex: alertmanager-tls scrape_configs: # Cluster services - job_name: cluster-services scheme: https tls_config: ca_file: /local/monitoring.ca.pem cert_file: /secrets/prometheus.bundle.pem key_file: /secrets/prometheus.bundle.pem consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }} datacenter: dc1 relabel_configs: # Drop anything that is not Nomad, Consul or Vault # Other services will be monitored by another job - source_labels: [__meta_consul_service] action: keep regex: (nomad(\-client)?|consul|vault) - source_labels: [__meta_consul_service,__meta_consul_node] regex: (.+);(.+) replacement: $${1}/$${2} target_label: __metrics_path__ - source_labels: [__meta_consul_service] regex: (.+) replacement: {{ range $idx, $instance := service "cluster-exporter" }}{{ if eq $idx 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }} target_label: __address__ # Rewrite the job labels to the name of the service - source_labels: [__meta_consul_service] regex: (.+) replacement: $${1} target_label: job # Rewrite the instance labels - source_labels: [__meta_consul_node] regex: (.+) replacement: $${1} target_label: instance # Regular services discovered from the Consul Catalog - job_name: consul-services scheme: https tls_config: ca_file: /local/monitoring.ca.pem cert_file: /secrets/prometheus.bundle.pem key_file: /secrets/prometheus.bundle.pem consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }} datacenter: dc1 relabel_configs: # Drop sidecar services to prevent duplicates. Sidecars themselves are handled in another job - source_labels: [__meta_consul_service] action: drop regex: (.+)-sidecar-proxy # Drop Nomad, Consul and Vault, already handled - source_labels: [__meta_consul_service] action: drop regex: (nomad(\-client)?|consul|vault) # Only keep services with a metrics-port set - source_labels: [__meta_consul_service_metadata_metrics_port] regex: \d+ action: keep # Get the metrics path from metadata - source_labels: [__meta_consul_service_metadata_metrics_path] target_label: __metrics_path__ regex: (.+) # Rewrite the scheme if needed - source_labels: [__meta_consul_service_metadata_metrics_scheme] regex: (https?) 
replacement: $${1} target_label: __scheme__ # Rewrite the address to use the metrics port - source_labels: [__address__, __meta_consul_service_metadata_metrics_port] regex: ([^:]+)(?::\d+)?;(\d+) replacement: $${1}:$${2} target_label: __address__ # Rewrite the job labels to the name of the service - source_labels: [__meta_consul_service] regex: (.+) replacement: $${1} target_label: job # Set the default alloc to 0 if not set - source_labels: [__meta_consul_service_metadata_alloc] regex: ^$ replacement: 0 target_label: __meta_consul_service_metadata_alloc # Keep the alloc meta in a label # Note that most of the time, alloc is just the allocation index, but in some cases, it can be the host name (for system jobs) - source_labels: [__meta_consul_service_metadata_alloc] regex: (.+) replacement: $${1} target_label: alloc # Rewrite the instance label to be service-alloc - source_labels: [__meta_consul_service, alloc] regex: (.+);([a-zA-Z\d\-\.]+) replacement: $${1}-$${2} target_label: instance # Envoy sidecars from Consul - job_name: consul-envoy-services consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http token: {{ with secret "consul/creds/prometheus" }}{{ .Data.token }}{{ end }} datacenter: dc1 relabel_configs: # Only keep sidecar services with an envoy-metrics-port defined - source_labels: [__meta_consul_service, __meta_consul_service_metadata_envoy_metrics_port] action: keep regex: (.+)-sidecar-proxy;\d+ # Rewrite the address to use the envoy-metrics-port - source_labels: [__address__, __meta_consul_service_metadata_envoy_metrics_port] regex: ([^:]+)(?::\d+)?;(\d+) replacement: $${1}:$${2} target_label: __address__ # Rewrite the job label - source_labels: [__meta_consul_service] regex: (.+) replacement: $${1} target_label: job # Set the default alloc to 0 if not set - source_labels: [__meta_consul_service_metadata_alloc] regex: ^$ replacement: 0 target_label: __meta_consul_service_metadata_alloc # Rewrite the instance label to be service-alloc - source_labels: [__meta_consul_service, __meta_consul_service_metadata_alloc] regex: (.+);([a-zA-Z\d\-\.]+) replacement: $${1}-$${2} target_label: instance _EOT destination = "local/prometheus.yml" uid = 100000 gid = 109090 perms = 640 change_mode = "signal" change_signal = "SIGHUP" } # Alert rules template { data = <<_EOT # vi: syntax=yaml groups: - name: Blackbox rules: - alert: BlackboxProbeFailed expr: probe_success == 0 for: 0m labels: severity: critical annotations: summary: Blackbox probe failed (instance {{ $labels.instance }}) description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSlowProbe expr: avg_over_time(probe_duration_seconds[1m]) > 1 for: 1m labels: severity: warning annotations: summary: Blackbox slow probe (instance {{ $labels.instance }}) description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeHttpFailure expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 for: 0m labels: severity: critical annotations: summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSslCertificateWillExpireSoon expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' for: 0m labels: severity: warning annotations: summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) description: "SSL 
certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSslCertificateWillExpireSoon expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' for: 0m labels: severity: critical annotations: summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSslCertificateExpired expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' for: 0m labels: severity: critical annotations: summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeSlowHttp expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' for: 1m labels: severity: warning annotations: summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/blackbox.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: JVM rules: - alert: JvmMemoryFillingUp expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 90' for: 2m labels: severity: warning annotations: summary: JVM memory filling up (instance {{ $labels.instance }}) description: "JVM memory is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/jvm.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: Nomad rules: - alert: NomadJobFailed expr: 'delta(nomad_nomad_job_summary_failed[30m]) > 0' for: 0m labels: severity: warning annotations: summary: Nomad job failed (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadJobLost expr: 'nomad_nomad_job_summary_lost > 0' for: 0m labels: severity: warning annotations: summary: Nomad job lost (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadJobQueued expr: 'nomad_nomad_job_summary_queued > 0' for: 3m labels: severity: warning annotations: summary: Nomad job queued (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadBlockedEvaluation expr: 'nomad_nomad_blocked_evals_total_blocked > 0' for: 2m labels: severity: warning annotations: summary: Nomad blocked evaluation (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadTaskOOM expr: 'count_over_time(nomad_client_allocs_oom_killed[1h]) > 1' for: 0m labels: severity: warning annotations: summary: Nomad task killed by OOM (job {{ $labels.exported_job }}, group {{ $labels.task_group }}, instance {{ $labels.instance }}, task {{ $labels.task }}) description: "Nomad task 
killed by OOM \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/nomad.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: Ping rules: - alert: HostDown expr: ping_loss_ratio == 1 for: 3m labels: severity: critical annotations: summary: Host down (host {{ $labels.target }}) description: "Host {{ $labels.target }} doesn't respond to ICMP pings, VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PingLoss expr: | avg_over_time(ping_loss_ratio[10m]) > 0.1 and min_over_time(ping_loss_ratio[10m]) < 1 for: 0m labels: severity: warning annotations: summary: High packet loss (host {{ $labels.target }}) description: "ICMP pings have a loss ratio > 10%, VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/ping.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: Postgres rules: - alert: PostgresqlDown expr: 'pg_up == 0' for: 0m labels: severity: critical annotations: summary: Postgresql down (instance {{ $labels.instance }}) description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresTooManyRestarts expr: changes(process_start_time_seconds{job="pg"}[15m]) > 3 for: 1m labels: severity: warning annotations: summary: Postgres too many restarts (instance {{ $labels.instance }}) description: "Postgres server has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyConnections expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8' for: 2m labels: severity: warning annotations: summary: Postgresql too many connections (instance {{ $labels.instance }}) description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlDeadLocks expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' for: 0m labels: severity: warning annotations: summary: Postgresql dead locks (instance {{ $labels.instance }}) description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # - alert: PostgresqlHighRollbackRate # expr: 'rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.05' # for: 0m # labels: # severity: warning # annotations: # summary: Postgresql high rollback rate (instance {{ $labels.instance }}) # description: "Ratio of transactions being aborted compared to committed is > 5 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlHighRateStatementTimeout expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' for: 0m labels: severity: critical annotations: summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlHighRateDeadlock expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' for: 0m labels: severity: critical annotations: summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyLocksAcquired expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * 
pg_settings_max_connections)) > 0.20' for: 2m labels: severity: critical annotations: summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/postgres.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: # Prometheus - name: Prometheus rules: - alert: PrometheusTargetMissing expr: up{job!~"sftp-PR\\d+"} == 0 for: 5m labels: severity: critical annotations: summary: Prometheus target missing (job {{ $labels.job }}, instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTooManyRestarts expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 3 for: 1m labels: severity: warning annotations: summary: Prometheus too many restarts (job {{ $labels.job }}, instance {{ $labels.instance }}) description: "Prometheus has restarted more than 3 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusNotConnectedToAlertmanager expr: prometheus_notifications_alertmanagers_discovered < 1 for: 2m labels: severity: critical annotations: summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusRuleEvaluationFailures expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 for: 0m labels: severity: critical annotations: summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusRuleEvaluationSlow expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds for: 5m labels: severity: warning annotations: summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusNotificationsBacklog expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 for: 0m labels: severity: warning annotations: summary: Prometheus notifications backlog (instance {{ $labels.instance }}) description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerNotificationFailing expr: rate(alertmanager_notifications_failed_total[1m]) > 0 for: 0m labels: severity: critical annotations: summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetScrapingSlow expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 for: 5m labels: severity: warning annotations: summary: Prometheus target scraping slow (instance {{ $labels.instance }}) description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbWalCorruptions expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 for: 0m labels: severity: critical annotations: summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/prometheus.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: Traefik rules: - alert: TraefikHighHttp5xxErrorRateService expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' for: 1m labels: severity: critical annotations: summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }}) description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/traefik.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: HashicorpVault rules: - alert: VaultSealed expr: 'vault_core_unsealed == 0' for: 0m labels: severity: critical annotations: summary: Vault sealed (instance {{ $labels.instance }}) description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/vault.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT # vi: syntax=yaml groups: - name: ConsulExporter rules: - alert: ConsulServiceHealthcheckFailed # Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available) expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0' for: 2m labels: severity: critical annotations: summary: Consul service healthcheck failed (service {{ $labels.service_name }}) description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulMissingMasterNode expr: 'consul_raft_leader != 1' for: 0m labels: severity: 
critical annotations: summary: Consul missing master node (node {{ $labels.node }}) description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulAgentUnhealthy expr: 'consul_health_node_status{status="critical"} == 1' for: 0m labels: severity: critical annotations: summary: Consul agent unhealthy (node {{ $labels.node }}) description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulServiceWarning expr: 'consul_health_service_status{status="warning"} == 1' for: 2m labels: severity: warning annotations: summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulServiceCritical expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1' for: 2m labels: severity: critical annotations: summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/consul.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT groups: - name: EmbeddedExporter rules: - alert: LokiProcessTooManyRestarts expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' for: 0m labels: severity: warning annotations: summary: Loki process too many restarts (instance {{ $labels.instance }}) description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestErrors expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' for: 15m labels: severity: critical annotations: summary: Loki request errors (instance {{ $labels.instance }}) description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestPanic expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' for: 5m labels: severity: critical annotations: summary: Loki request panic (instance {{ $labels.instance }}) description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestLatency expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' for: 5m labels: severity: critical annotations: summary: Loki request latency (instance {{ $labels.instance }}) description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/loki.yml" left_delimiter = "{{{" right_delimiter = "}}}" } template { data = <<_EOT groups: - name: NodeExporter rules: - alert: HostOutOfMemory expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host out of memory (instance {{ $labels.instance }}) description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
HostMemoryUnderMemoryPressure expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host memory under memory pressure (instance {{ $labels.instance }}) description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostMemoryIsUnderutilized expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 1w labels: severity: info annotations: summary: Host Memory is underutilized (instance {{ $labels.instance }}) description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputIn expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host unusual network throughput in (instance {{ $labels.instance }}) description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host unusual network throughput out (instance {{ $labels.instance }}) description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadRate expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskWriteRate expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host unusual disk write rate (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfDiskSpace expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host out of disk space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostDiskWillFillIn24Hours expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: 
Host disk will fill in 24 hours (instance {{ $labels.instance }}) description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host out of inodes (instance {{ $labels.instance }}) description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostFilesystemDeviceError expr: 'node_filesystem_device_error == 1' for: 0m labels: severity: critical annotations: summary: Host filesystem device error (instance {{ $labels.instance }}) description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostInodesWillFillIn24Hours expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadLatency expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host unusual disk read latency (instance {{ $labels.instance }}) description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskWriteLatency expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host unusual disk write latency (instance {{ $labels.instance }}) description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 10m labels: severity: warning annotations: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # - alert: HostCpuIsUnderutilized # expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' # for: 1w # labels: # severity: info # annotations: # summary: Host CPU is underutilized (instance {{ $labels.instance }}) # description: "CPU load is < 20% for 1 week. 
Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuHighIowait expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host CPU high iowait (instance {{ $labels.instance }}) description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskIo expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostContextSwitching expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host context switching (instance {{ $labels.instance }}) description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # - alert: HostSwapIsFillingUp # expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' # for: 2m # labels: # severity: warning # annotations: # summary: Host swap is filling up (instance {{ $labels.instance }}) # description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSystemdServiceCrashed expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host systemd service crashed (instance {{ $labels.instance }}) description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostPhysicalComponentTooHot expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host physical component too hot (instance {{ $labels.instance }}) description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical annotations: summary: Host node overtemperature alarm (instance {{ $labels.instance }}) description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRaidArrayGotInactive expr: 
'(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical annotations: summary: Host RAID array got inactive (instance {{ $labels.instance }}) description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRaidDiskFailure expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host RAID disk failure (instance {{ $labels.instance }}) description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostKernelVersionDeviations expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 6h labels: severity: warning annotations: summary: Host kernel version deviations (instance {{ $labels.instance }}) description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOomKillDetected expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host OOM kill detected (instance {{ $labels.instance }}) description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacCorrectableErrorsDetected expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: info annotations: summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacUncorrectableErrorsDetected expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkReceiveErrors expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host Network Receive Errors (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkTransmitErrors expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host Network Transmit Errors (instance {{ $labels.instance 
}}) description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkInterfaceSaturated expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 1m labels: severity: warning annotations: summary: Host Network Interface Saturated (instance {{ $labels.instance }}) description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkBondDegraded expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host Network Bond Degraded (instance {{ $labels.instance }}) description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostConntrackLimit expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning annotations: summary: Host conntrack limit (instance {{ $labels.instance }}) description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockSkew expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 10m labels: severity: warning annotations: summary: Host clock skew (instance {{ $labels.instance }}) description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockNotSynchronising expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host clock not synchronising (instance {{ $labels.instance }}) description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRequiresReboot expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 4h labels: severity: info annotations: summary: Host requires reboot (instance {{ $labels.instance }}) description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" _EOT destination = "local/rules/node.yml" left_delimiter = "{{{" right_delimiter = "}}}" } # A client cert, to connect to the AlertManager API template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/prometheus" (printf "common_name=prometheus-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX")) (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} {{ .Cert }} {{ .Key }} {{- end -}} _EOT destination = "secrets/prometheus.bundle.pem" uid = 100000 gid = 109090 perms = "0440" change_mode = "signal" change_signal = "SIGHUP" } # The monitoring CA chain, to validate the AlertManager cert template { data = <<_EOT {{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" uid = 100000 gid = 100000 change_mode = "signal" change_signal = "SIGHUP" } # Persistent data volume_mount { volume = "data" destination = "/data" } resources { cpu = 200 memory = 768 memory_max = 1024 } } } group "alerts" { shutdown_delay = "6s" count = 1 network { mode = "bridge" # Port exposing the web API, with mTLS port "web-tls" {} # Port used for gossip between the different alertmanager instances port "cluster" {} # Port to expose metrics to prometheus port "metrics" {} } volume "data" { source = "alertmanager-data" type = "csi" access_mode = "single-node-writer" attachment_mode = "file-system" per_alloc = true } # This service is used by the different alertmanager instances to communicate with each other service { name = "alertmanager-gossip" port = "cluster" meta { alloc = "${NOMAD_ALLOC_INDEX}" } } # This service is used by prometheus. As it needs to be able to reach every instance, it cannot use # the service mesh. 
The exposed port uses mTLS, so it's safe to expose it outside of the mesh service { name = "alertmanager-tls" port = "web-tls" meta { alloc = "${NOMAD_ALLOC_INDEX}" } } # This service is exposed through the service mesh # and can be used to reach the web interface through Traefik service { name = "alertmanager" port = 9093 meta { metrics-port = "${NOMAD_HOST_PORT_metrics}" alloc = "${NOMAD_ALLOC_INDEX}" datacenter = "${NOMAD_DC}" group = "${NOMAD_GROUP_NAME}" job = "${NOMAD_JOB_NAME}" namespace = "${NOMAD_NAMESPACE}" node = "${node.unique.name}" region = "${NOMAD_REGION}" } connect { sidecar_service { } sidecar_task { config { args = [ "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", "-l", "${meta.connect.log_level}", "--concurrency", "${meta.connect.proxy_concurrency}", "--disable-hot-restart" ] } resources { cpu = 50 memory = 64 } } } check { name = "health" type = "http" expose = true path = "/-/healthy" interval = "20s" timeout = "8s" check_restart { limit = 12 grace = "30s" } } tags = [ ] } # This task will handle mTLS to the AlertManager API # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy task "untls-proxy" { driver = "docker" user = 9093 config { image = "nginxinc/nginx-unprivileged:alpine" force_pull = true readonly_rootfs = true pids_limit = 30 volumes = [ "local/alertmanager.conf:/etc/nginx/conf.d/default.conf:ro", ] mount { type = "tmpfs" target = "/tmp" tmpfs_options { size = 3000000 } } } vault { policies = ["metrics", "alertmanager"] env = false disable_file = true change_mode = "noop" } lifecycle { hook = "poststart" sidecar = true } template { data = <<_EOT # UnTLS for the web API server { listen 127.0.0.1:9093; location / { proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul; proxy_ssl_trusted_certificate /local/monitoring.ca.pem; allow 127.0.0.1; deny all; } } # Metrics proxy server { listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; http2 on; ssl_certificate /secrets/metrics.bundle.pem; ssl_certificate_key /secrets/metrics.bundle.pem; ssl_client_certificate /local/monitoring.ca.pem; ssl_verify_client on; ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; ssl_session_cache shared:SSL:10m; ssl_session_timeout 1h; ssl_session_tickets off; gzip on; gzip_types text/plain; gzip_vary on; server_tokens off; if ($request_method !~ ^(GET|HEAD)$ ) { return 405; } location /metrics { proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul; proxy_ssl_trusted_certificate /local/monitoring.ca.pem; proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; } } _EOT destination = "local/alertmanager.conf" } # Get a certificate from vault to protect the metrics endpoint template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} {{ .Cert }} {{ .Key }} {{- end }} _EOT destination = "secrets/metrics.bundle.pem" } # Get the root CA template { data = <<_EOT {{ with 
secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" } # Certifiate used by AlertManager template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/alertmanager" (printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX")) (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} {{ .Cert }} {{ .Key }} {{- end }} _EOT destination = "secrets/alertmanager.bundle.pem" uid = 109093 gid = 100000 perms = "0440" change_mode = "signal" change_signal = "SIGHUP" } resources { cpu = 10 memory = 18 } } # The main alertmanager task task "alertmanager" { driver = "docker" leader = true config { image = "danielberteaud/alertmanager:0.27.0-2" readonly_rootfs = true pids_limit = 200 command = "/local/alertmanager" } vault { policies = ["metrics", "alertmanager"] env = false disable_file = true change_mode = "noop" } # Use a template block instead of env {} so we can fetch values from vault template { data = <<_EOT LANG=fr_FR.utf8 TZ=Europe/Paris _EOT destination = "secrets/.env" perms = 400 env = true } template { data = <<_EOT global: smtp_from: alertmanager@consul smtp_require_tls: false smtp_smarthost: localhost:25 _EOT destination = "secrets/alertmanager.yml" } template { data = <<_EOT tls_server_config: cert_file: /secrets/alertmanager.bundle.pem key_file: /secrets/alertmanager.bundle.pem client_auth_type: RequireAndVerifyClientCert client_ca_file: /local/monitoring.ca.pem tls_client_config: cert_file: /secrets/alertmanager.bundle.pem key_file: /secrets/alertmanager.bundle.pem ca_file: /local/monitoring.ca.pem _EOT destination = "local/cluster_tls.yml" } template { data = <<_EOT tls_server_config: cert_file: /secrets/alertmanager.bundle.pem key_file: /secrets/alertmanager.bundle.pem client_auth_type: RequireAndVerifyClientCert client_ca_file: /local/monitoring.ca.pem _EOT destination = "local/web_tls.yml" } template { data = <<_EOT #!/bin/sh set -euo pipefail exec alertmanager \ --config.file=/secrets/alertmanager.yml \ --storage.path=/data \ --web.external-url=https://alert.example.org \ --web.route-prefix=/ \ --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \ --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \ --cluster.advertise-address={{ env "NOMAD_HOST_ADDR_cluster" }} \ {{- range service "alertmanager-gossip" -}} {{- if not (eq (env "NOMAD_ALLOC_INDEX") (index .ServiceMeta "alloc")) }} --cluster.peer={{ .Address }}:{{ .Port }} \ {{ end -}} {{- end -}} --cluster.tls-config=/local/cluster_tls.yml \ --web.config.file=/local/web_tls.yml _EOT destination = "local/alertmanager" uid = 100000 gid = 100000 perms = "0755" } # Certifiate used by AlertManager template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/alertmanager" (printf "common_name=alertmanager-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX")) (printf "ip_sans=%s" (env "NOMAD_HOST_IP_cluster")) (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) }} {{ .Cert }} {{ .Key }} {{- end }} _EOT destination = "secrets/alertmanager.bundle.pem" uid = 109093 gid = 109090 perms = "0440" change_mode = "signal" change_signal = "SIGHUP" } # The trusted CA template { data = <<_EOT {{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" } volume_mount { volume = "data" destination = "/data" } resources { cpu = 50 memory = 64 memory_max = 80 } } } group 
"logs-server" { shutdown_delay = "6s" network { mode = "bridge" port "metrics" {} } volume "data" { source = "loki-data" type = "csi" access_mode = "single-node-writer" attachment_mode = "file-system" } service { name = "loki" port = 3100 meta { metrics-port = "${NOMAD_HOST_PORT_metrics}" alloc = "${NOMAD_ALLOC_INDEX}" datacenter = "${NOMAD_DC}" group = "${NOMAD_GROUP_NAME}" job = "${NOMAD_JOB_NAME}" namespace = "${NOMAD_NAMESPACE}" node = "${node.unique.name}" region = "${NOMAD_REGION}" } connect { sidecar_service { } sidecar_task { config { args = [ "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", "-l", "${meta.connect.log_level}", "--concurrency", "${meta.connect.proxy_concurrency}", "--disable-hot-restart" ] } resources { cpu = 50 memory = 64 } } } check { name = "ready" type = "http" path = "/ready" expose = true interval = "20s" timeout = "8s" check_restart { limit = 6 grace = "5m" } } tags = [ ] } # The prometheus metrics proxy, adding mTLS to the metrics endpoint task "metrics-proxy" { driver = "docker" user = 8995 config { image = "nginxinc/nginx-unprivileged:alpine" force_pull = true volumes = [ "local/default.conf:/etc/nginx/conf.d/default.conf:ro" ] pids_limit = 100 } lifecycle { hook = "poststart" sidecar = true } vault { policies = ["metrics"] } # Get a certificate from vault to protect the metrics endpoint template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} {{ .Cert }} {{ .Key }} {{- end }} _EOT destination = "secrets/metrics.bundle.pem" } # Get the root CA template { data = <<_EOT {{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" } template { data = <<_EOT server { listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; http2 on; ssl_certificate /secrets/metrics.bundle.pem; ssl_certificate_key /secrets/metrics.bundle.pem; ssl_client_certificate /local/monitoring.ca.pem; ssl_verify_client on; ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; ssl_session_cache shared:SSL:10m; ssl_session_timeout 1h; ssl_session_tickets off; gzip on; gzip_types text/plain; gzip_vary on; server_tokens off; if ($request_method !~ ^(GET|HEAD)$ ) { return 405; } location /metrics { proxy_pass http://localhost:3100/metrics; } } _EOT destination = "local/default.conf" } resources { cpu = 10 memory = 10 memory_max = 20 } } task "loki" { driver = "docker" config { image = "danielberteaud/loki:2.9.6-1" command = "loki" args = ["--config.file=/local/loki.yml"] } vault { policies = ["loki"] env = false disable_file = true change_mode = "noop" } # Use a template block instead of env {} so we can fetch values from vault template { data = <<_EOT LANG=fr_FR.utf8 TZ=Europe/Paris _EOT destination = "secrets/.env" perms = 400 env = true } template { data = <<_EOT analytics: reporting_enabled: false auth_enabled: false common: instance_addr: 127.0.0.1 path_prefix: /data replication_factor: 1 ring: kvstore: store: inmemory storage: filesystem: chunks_directory: /data/chunks rules_directory: /data/rules compactor: compaction_interval: 1h deletion_mode: filter-and-delete retention_enabled: true shared_store: filesystem working_directory: /data/compactor ingester: chunk_idle_period: 1h limits_config: ingestion_burst_size_mb: 100 ingestion_rate_mb: 20 
max_entries_limit_per_query: 20000 max_query_parallelism: 128 retention_period: 720h split_queries_by_interval: 0 ruler: alertmanager_client: tls_ca_path: /secrets/monitoring.ca.pem tls_cert_path: /secrets/loki.bundle.pem tls_key_path: /secrets/loki.bundle.pem tls_server_name: alertmanager.monitoring alertmanager_url: alertmanager-tls enable_alertmanager_discovery: true enable_alertmanager_v2: true enable_api: true ring: kvstore: store: inmemory rule_path: /tmp/loki-rules storage: local: directory: /local/rules type: local schema_config: configs: - from: "2020-10-24" index: period: 24h prefix: index_ object_store: filesystem schema: v11 store: boltdb-shipper server: grpc_listen_address: 127.0.0.1 grpc_listen_port: 9095 http_listen_address: 127.0.0.1 http_listen_port: 3100 storage_config: boltdb_shipper: active_index_directory: /data/index cache_location: /data/boltdb-cache shared_store: filesystem _EOT destination = "local/loki.yml" } # A client cert, to connect to the AlertManager API template { data = <<_EOT {{- with pkiCert "pki/monitoring/issue/loki" (printf "common_name=loki-%s.monitoring.consul" (env "NOMAD_ALLOC_INDEX")) (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} {{ .Cert }} {{ .Key }} {{- end -}} _EOT destination = "secrets/loki.bundle.pem" uid = 100000 gid = 103100 perms = "0440" change_mode = "signal" change_signal = "SIGHUP" } # The monitoring CA chain, to validate AlertManager cert template { data = <<_EOT {{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/monitoring.ca.pem" uid = 100000 gid = 100000 change_mode = "signal" change_signal = "SIGHUP" } volume_mount { volume = "data" destination = "/data" } resources { cpu = 150 memory = 1024 } } } # The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.) # And with a loki sink. 
  # The goal is to be able to collect logs from various sources.
  group "logs-aggregator" {
    count = 1
    shutdown_delay = "6s"

    network {
      mode = "bridge"
      port "metrics" {}
    }

    # The main service is the vector source
    # It also provides access to other services (like loki) through the mesh
    service {
      name = "vector-aggregator"
      port = 9000

      meta {
        metrics-port = "${NOMAD_HOST_PORT_metrics}"
        alloc = "${NOMAD_ALLOC_INDEX}"
        datacenter = "${NOMAD_DC}"
        group = "${NOMAD_GROUP_NAME}"
        job = "${NOMAD_JOB_NAME}"
        namespace = "${NOMAD_NAMESPACE}"
        node = "${node.unique.name}"
        region = "${NOMAD_REGION}"
      }

      connect {
        sidecar_service {
          proxy {
            upstreams {
              destination_name = "loki"
              local_bind_port = 3100
              # Work around, see https://github.com/hashicorp/nomad/issues/18538
              destination_type = "service"
            }
          }
        }
        sidecar_task {
          config {
            args = [
              "-c",
              "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
              "-l",
              "${meta.connect.log_level}",
              "--concurrency",
              "${meta.connect.proxy_concurrency}",
              "--disable-hot-restart"
            ]
          }
          resources {
            cpu = 50
            memory = 64
          }
        }
      }

      tags = [
      ]
    }

    task "vector" {
      driver = "docker"
      leader = true

      config {
        image = "danielberteaud/vector:0.37.0-1"
        readonly_rootfs = true
        pids_limit = 200
        args = ["--config=/local/vector.yml"]
      }

      vault {
        policies = ["metrics"]
        env = false
        disable_file = true
        change_mode = "noop"
      }

      # Use a template block instead of env {} so we can fetch values from vault
      template {
        data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
        destination = "secrets/.env"
        perms = 400
        env = true
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      template {
        data = <<_EOT
data_dir: /local
expire_metrics_secs: 600
sources:
  logs_vector:
    type: vector
    address: 127.0.0.1:9000
  vector_metrics:
    type: internal_metrics
transforms:
  split-by-app:
    type: route
    inputs: [ "logs_*" ]
    route:
      traefik: '.service == "traefik"'
      postgres: '.service == "postgres"'
      syslog: '.source_type == "syslog"'
  parse-traefik:
    type: remap
    inputs: ["split-by-app.traefik"]
    source: |
      .http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
      .loki_labels.http_method = .http.verb
      .loki_labels.http_status = .http.response
      .loki_labels.user = .http.auth
  parse-postgres:
    type: remap
    inputs: ["split-by-app.postgres"]
    source: |
      if includes(array!(.nomad.tags), "master"){
        .loki_labels.pg_role = "master"
      } else if includes(array!(.nomad.tags), "replica"){
        .loki_labels.pg_role = "replica"
      }
  parse-syslog:
    type: remap
    inputs: ["split-by-app.syslog"]
    source: |
      # PfSense sends /usr/sbin/cron as the appname, instead of cron
      if string!(.appname) == "/usr/sbin/cron" {
        .appname = "cron"
      }
      .service = .appname
sinks:
  loki:
    type: loki
    inputs: [ "split-by-app._unmatched", "parse-*" ]
    endpoint: http://127.0.0.1:3100
    encoding:
      codec: text
    labels:
      job: "{{ .service }}"
      host: "{{ .host }}"
      _*: "{{ .loki_labels }}"
    buffer:
      type: disk
      max_size: 268435488
    remove_label_fields: true
  # Expose vector internal metrics
  prometheus:
    type: prometheus_exporter
    inputs: ["vector_metrics"]
    address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
    tls:
      enabled: true
      crt_file: /secrets/metrics.bundle.pem
      key_file: /secrets/metrics.bundle.pem
      ca_file: /local/monitoring.ca.pem
      verify_certificate: true
_EOT
        destination = "local/vector.yml"
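        # Custom delimiters below: the loki sink labels use Vector's own
        # "{{ .field }}" templating, so consul-template is switched to "{{{ ... }}}"
        # delimiters to leave those expressions untouched (this template uses no
        # consul-template functions). The $${NOMAD_ALLOC_PORT_metrics} reference is
        # expected to be resolved by Vector's env var interpolation at startup.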
        left_delimiter = "{{{"
        right_delimiter = "}}}"
        change_mode = "signal"
        change_signal = "SIGHUP"
      }

      resources {
        cpu = 100
        memory = 192
      }
    }
  }

  group "grafana" {
    shutdown_delay = "6s"

    network {
      mode = "bridge"
      port "metrics" {}
    }

    volume "data" {
      source = "grafana-data"
      type = "csi"
      access_mode = "single-node-writer"
      attachment_mode = "file-system"
    }

    service {
      name = "grafana"
      port = 3000

      meta {
        metrics-port = "${NOMAD_HOST_PORT_metrics}"
        alloc = "${NOMAD_ALLOC_INDEX}"
      }

      connect {
        sidecar_service {
          proxy {
            upstreams {
              destination_name = "postgres"
              local_bind_port = 5432
              # Work around, see https://github.com/hashicorp/nomad/issues/18538
              destination_type = "service"
            }
            upstreams {
              destination_name = "loki"
              local_bind_port = 3100
              # Work around, see https://github.com/hashicorp/nomad/issues/18538
              destination_type = "service"
            }
            upstreams {
              destination_name = "prometheus"
              local_bind_port = 9090
              # Work around, see https://github.com/hashicorp/nomad/issues/18538
              destination_type = "service"
            }
          }
        }
        sidecar_task {
          config {
            args = [
              "-c",
              "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
              "-l",
              "${meta.connect.log_level}",
              "--concurrency",
              "${meta.connect.proxy_concurrency}",
              "--disable-hot-restart"
            ]
          }
          resources {
            cpu = 50
            memory = 64
          }
        }
      }

      check {
        name = "health"
        type = "http"
        path = "/api/health"
        expose = true
        interval = "30s"
        timeout = "8s"
      }

      tags = [
        "traefik.enable=true",
        "traefik.http.routers.monitoring-grafana.entrypoints=https",
        "traefik.http.routers.monitoring-grafana.rule=Host(`grafana.example.org`)",
        "traefik.http.middlewares.csp-monitoring-grafana.headers.contentsecuritypolicy=connect-src 'self' https://grafana.com;default-src 'self';font-src 'self' data:;img-src 'self' data: blob: https://grafana.com;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
        "traefik.http.routers.monitoring-grafana.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-grafana",
      ]
    }

    # The prometheus metrics proxy, adding mTLS to the metrics endpoint
    task "metrics-proxy" {
      driver = "docker"
      user = 8995

      config {
        image = "nginxinc/nginx-unprivileged:alpine"
        force_pull = true
        volumes = [
          "local/default.conf:/etc/nginx/conf.d/default.conf:ro"
        ]
        pids_limit = 100
      }

      lifecycle {
        hook = "poststart"
        sidecar = true
      }

      vault {
        policies = ["metrics"]
      }

      # Get a certificate from vault to protect the metrics endpoint
      template {
        data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}
{{- end }}
_EOT
        destination = "secrets/metrics.bundle.pem"
      }

      # Get the root CA
      template {
        data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
        destination = "local/monitoring.ca.pem"
      }

      template {
        data = <<_EOT
server {
  listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
  http2 on;

  ssl_certificate /secrets/metrics.bundle.pem;
  ssl_certificate_key /secrets/metrics.bundle.pem;
  ssl_client_certificate /local/monitoring.ca.pem;
  ssl_verify_client on;
  ssl_protocols TLSv1.2 TLSv1.3;
  ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
  ssl_session_cache shared:SSL:10m;
  ssl_session_timeout 1h;
  ssl_session_tickets off;
  gzip on;
  gzip_types text/plain;
  gzip_vary on;
  server_tokens off;

  if ($request_method !~ ^(GET|HEAD)$ ) {
    return 405;
  }

  location /metrics {
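    # Proxy scrapes to the local Grafana listener; TLS and client-cert checks are
    # handled above, so only mTLS-authenticated GET/HEAD requests reach this block.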
    proxy_pass http://127.0.0.1:3000/metrics;
  }
}
_EOT
        destination = "local/default.conf"
      }

      resources {
        cpu = 10
        memory = 10
        memory_max = 20
      }
    }

    # Local memcached instance
    task "memcached" {
      driver = "docker"
      user = 11211

      lifecycle {
        hook = "prestart"
        sidecar = true
      }

      config {
        image = "memcached:alpine"
        readonly_rootfs = true
        force_pull = true
        entrypoint = ["/local/memcached"]
      }

      template {
        data = <<_EOT
#!/bin/sh
set -eu
exec memcached -l 127.0.0.1 -p 11211 -m {{ env "NOMAD_MEMORY_LIMIT" | parseInt | subtract 5 }}
_EOT
        destination = "local/memcached"
        perms = 755
      }

      resources {
        cpu = 10
        memory = 20
      }
    }

    task "grafana" {
      driver = "docker"
      leader = true

      config {
        image = "danielberteaud/grafana:10.4.1-1"
        readonly_rootfs = true
        pids_limit = 100
        command = "grafana"
        args = [
          "server",
          "--homepath=/opt/grafana",
          "--config=/secrets/grafana.ini",
          "--packaging=docker"
        ]
      }

      vault {
        policies = ["grafana"]
        env = false
        disable_file = true
        change_mode = "noop"
      }

      # Use a template block instead of env {} so we can fetch values from vault
      template {
        data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
        destination = "secrets/.env"
        perms = 400
        env = true
      }

      template {
        data = <<_EOT
GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd | sprig_squote }}{{ end }}
_EOT
        destination = "secrets/.grafana.env"
        perms = 400
        env = true
      }

      # Basic grafana configuration file
      template {
        data = <<_EOT
[server]
http_addr = 127.0.0.1
http_port = 3000
root_url = https://grafana.example.org

[database]
type = postgres
name = grafana
host = 127.0.0.1:5432
user = {{ with secret "database/creds/grafana" }}{{ .Data.username }}{{ end }}
password = {{ with secret "database/creds/grafana" }}{{ .Data.password }}{{ end }}

[remote_cache]
type = memcached
connstr = 127.0.0.1:11211

[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false

[security]
cookie_secure = true
cookie_samesite = strict
x_xss_protection = true
secret_key = {{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.secret_key }}{{ end }}

[dataproxy]
timeout = 120

[feature_toggles]
_EOT
        destination = "secrets/grafana.ini"
        uid = 103000
        perms = 400
      }

      # Mount volume in /data for persistence
      volume_mount {
        volume = "data"
        destination = "/data"
      }

      resources {
        cpu = 100
        memory = 256
      }
    }
  }
}
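
# Deployment notes (assumptions, not part of the rendered job): the CSI volumes,
# Vault policies/PKI mounts and Consul service mesh referenced above must exist
# before this job is submitted. A minimal sketch of a dry-run and deployment,
# assuming the file is saved as monitoring.nomad.hcl:
#
#   nomad job validate monitoring.nomad.hcl
#   nomad job plan monitoring.nomad.hcl
#   nomad job run monitoring.nomad.hcl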