diff --git a/TODO.md b/TODO.md index f2e1157..d05f589 100644 --- a/TODO.md +++ b/TODO.md @@ -6,7 +6,7 @@ - ~~blackbox-exporter~~ - ~~consul-exporter~~ - vector - - loki + - ~~loki~~ - grafana - nomad-vector-logger @@ -27,17 +27,18 @@ - consul defaults & intentions - ~~prometheus~~ - ~~alertmanager~~ - - loki + - ~~loki~~ - tasks - ~~alertmanager~~ - vector-aggregator - vector-agent (dans job agent) - - loki (modulariser ou laisser en monolithique ?) + - ~~loki (modulariser ou laisser en monolithique ?)~~ - grafana - ~~cluster-metrics (job exporters)~~ -- questions +- questions / various - prom rules: keep or move to a -conf bundle ? - ~~config alertes am (recipient + routing)~~ - ~~http and tcp probes, as exporters are now in a dedicated job~~ + - alertmanager & rules for loki diff --git a/consul/config/service-defaults/monitoring-loki.hcl b/consul/config/service-defaults/monitoring-loki.hcl new file mode 100644 index 0000000..b63d91f --- /dev/null +++ b/consul/config/service-defaults/monitoring-loki.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "[[ .instance ]]-loki[[ .consul.suffix ]]" +Protocol = "http" diff --git a/consul/config/service-intentions/monitoring-loki.hcl b/consul/config/service-intentions/monitoring-loki.hcl new file mode 100644 index 0000000..d7d533e --- /dev/null +++ b/consul/config/service-intentions/monitoring-loki.hcl @@ -0,0 +1,55 @@ +Kind = "service-intentions" +Name = "[[ .instance ]]-loki[[ .consul.suffix ]]" +Sources = [ + { + Name = "[[ (merge .monitoring.loki .monitoring .).traefik.instance ]]" + Permissions = [ + { + Action = "allow" + HTTP { + PathPrefix = "/" + } + } + ] + }, + { + Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]" + Permissions = [ + { + Action = "allow" + HTTP { + PathPrefix = "/loki/api/v1" + Methods = ["GET", "HEAD", "POST"] + } + }, + { + Action = "allow" + HTTP { + PathPrefix = "/ready" + Methods = ["GET"] + } + } + ] + }, +[[- range $idx, $service := coll.Slice "vector-aggregator" "vector-agent" ]] 
+ { + Name = "[[ $.instance ]]-[[ $service ]][[ $.consul.suffix ]]" + Permissions = [ + { + Action = "allow" + HTTP { + PathExact = "/loki/api/v1/push" + Methods = ["POST"] + } + }, + { + Action = "allow" + HTTP { + PathExact = "/ready" + Methods = ["GET"] + } + } + ] + }, +[[- end ]] +] diff --git a/example/TODO.md b/example/TODO.md index e4cee54..d05f589 100644 --- a/example/TODO.md +++ b/example/TODO.md @@ -1,12 +1,12 @@ - ~~Split exporters dans un job dédié (pour pouvoir tourner sur node_pool spécifique)~~ - Créer monitoring-agent type system avec vector + node-exporter - images - - prometheus - - ping-exporter - - blackbox-exporter - - consul-exporter + - ~~prometheus~~ + - ~~ping-exporter~~ + - ~~blackbox-exporter~~ + - ~~consul-exporter~~ - vector - - loki + - ~~loki~~ - grafana - nomad-vector-logger @@ -16,28 +16,29 @@ - ~~monitoring -> am~~ - vault pol - - prometheus + - ~~prometheus~~ - ~~issue prom on monitoring~~ - ~~issue prom on consul~~ - - consul-exporter - - issue consul-exporter on consul - - alertmanager + - ~~consul-exporter~~ + - ~~issue consul-exporter on consul~~ + - ~~alertmanager~~ - ~~issue alertmanager on monitoring~~ - consul defaults & intentions - ~~prometheus~~ - ~~alertmanager~~ - - loki + - ~~loki~~ - tasks - ~~alertmanager~~ - vector-aggregator - vector-agent (dans job agent) - - loki (modulariser ou laisser en monolithique ?) + - ~~loki (modulariser ou laisser en monolithique ?)~~ - grafana - - cluster-metrics (job exporters) + - ~~cluster-metrics (job exporters)~~ -- questions +- questions / various - prom rules: keep or move to a -conf bundle ? 
- ~~config alertes am (recipient + routing)~~ - - http and tcp probes, as exporters are now in a dedicated job + - ~~http and tcp probes, as exporters are now in a dedicated job~~ + - alertmanager & rules for loki diff --git a/example/consul/config/service-defaults/monitoring-loki.hcl b/example/consul/config/service-defaults/monitoring-loki.hcl new file mode 100644 index 0000000..69e7b61 --- /dev/null +++ b/example/consul/config/service-defaults/monitoring-loki.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "monitoring-loki" +Protocol = "http" diff --git a/example/consul/config/service-intentions/monitoring-loki.hcl b/example/consul/config/service-intentions/monitoring-loki.hcl new file mode 100644 index 0000000..02d055d --- /dev/null +++ b/example/consul/config/service-intentions/monitoring-loki.hcl @@ -0,0 +1,72 @@ +Kind = "service-intentions" +Name = "monitoring-loki" +Sources = [ + { + Name = "traefik" + Permissions = [ + { + Action = "allow" + HTTP { + PathPrefix = "/" + } + } + ] + }, + { + Name = "monitoring-grafana" + Permissions = [ + { + Action = "allow" + HTTP { + PathPrefix = "/loki/api/v1" + Methods = ["GET", "HEAD", "POST"] + } + }, + { + Action = "allow" + HTTP { + PathPrefix = "/ready" + Methods = ["GET"] + } + } + ] + }, + { + Name = "monitoring-vector-aggregator" + Permissions = [ + { + Action = "allow" + HTTP { + PathExact = "/loki/api/v1/push" + Methods = ["POST"] + } + }, + { + Action = "allow" + HTTP { + PathExact = "/ready" + Methods = ["GET"] + } + } + ] + }, + { + Name = "monitoring-vector-agent" + Permissions = [ + { + Action = "allow" + HTTP { + PathExact = "/loki/api/v1/push" + Methods = ["POST"] + } + }, + { + Action = "allow" + HTTP { + PathExact = "/ready" + Methods = ["GET"] + } + } + ] + }, +] diff --git a/example/images/loki/Dockerfile b/example/images/loki/Dockerfile new file mode 100644 index 0000000..9f25ff0 --- /dev/null +++ b/example/images/loki/Dockerfile @@ -0,0 +1,38 @@ +FROM danielberteaud/alpine:24.3-1 AS builder 
+ +ARG LOKI_VERSION=2.9.5 + +ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp +ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp +RUN set -eux &&\ + apk --no-cache add unzip &&\ + cd /tmp &&\ + grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\ + unzip loki-linux-amd64.zip &&\ + mkdir /opt/loki &&\ + mv loki-linux-amd64 /opt/loki/loki + +FROM danielberteaud/alpine:24.3-1 +MAINTAINER Daniel Berteaud + +ENV PATH=/opt/loki:$PATH +COPY --from=builder /opt/loki /opt/loki +RUN set -eux &&\ + addgroup -g 3100 loki &&\ + adduser \ + --system \ + --disabled-password \ + --uid 3100 \ + --ingroup loki \ + --home /opt/loki \ + --no-create-home \ + --shell /sbin/nologin \ + loki &&\ + mkdir /data &&\ + chown loki:loki /data &&\ + chmod 700 /data + +WORKDIR /opt/loki +USER loki +EXPOSE 3100 +CMD ["loki"] diff --git a/example/images/vector/Dockerfile b/example/images/vector/Dockerfile new file mode 100644 index 0000000..4b231a4 --- /dev/null +++ b/example/images/vector/Dockerfile @@ -0,0 +1 @@ +FROM timberio/vector:0.36.1-alpine diff --git a/example/init/pki b/example/init/pki index 9f25071..30555d2 100755 --- a/example/init/pki +++ b/example/init/pki @@ -118,6 +118,19 @@ vault write pki/monitoring/roles/monitoring-prometheus \ max_ttl=100h \ ou="Monitoring" +# Create a role for loki (which will only be a client, for AlertManager) +vault write pki/monitoring/roles/monitoring-loki \ + allowed_domains="monitoring" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_localhost=false \ + allow_ip_sans=false \ + server_flag=false \ + client_flag=true \ + allow_wildcard_certificates=false \ + max_ttl=100h \ + ou="Monitoring" + # Create a role for metrics exporters (server only) vault write pki/monitoring/roles/metrics \ allowed_domains="monitoring" \ @@ -151,6 +164,6 @@ vault write pki/consul/roles/monitoring-cluster-exporter \ allow_subdomains=true \ allow_wildcard_certificates=false 
\ max_ttl=168h \ - server_flags=false \ - client_flags=true \ + server_flag=false \ + client_flag=true \ ou="Cluster metrics exporter" diff --git a/example/monitoring-exporters.nomad.hcl b/example/monitoring-exporters.nomad.hcl index 9fa0cdd..d7752f6 100644 --- a/example/monitoring-exporters.nomad.hcl +++ b/example/monitoring-exporters.nomad.hcl @@ -114,7 +114,7 @@ _EOT resources { cpu = 20 - memory = 64 + memory = 32 } } @@ -411,7 +411,7 @@ _EOT resources { cpu = 10 - memory = 18 + memory = 15 } } diff --git a/example/monitoring.nomad.hcl b/example/monitoring-services.nomad.hcl similarity index 75% rename from example/monitoring.nomad.hcl rename to example/monitoring-services.nomad.hcl index 742e225..cfadd6b 100644 --- a/example/monitoring.nomad.hcl +++ b/example/monitoring-services.nomad.hcl @@ -1,4 +1,4 @@ -job "monitoring" { +job "monitoring-services" { datacenters = ["dc1"] @@ -225,6 +225,7 @@ alerting: consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http + token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }} datacenter: dc1 relabel_configs: # Only keep alertmanagers @@ -919,7 +920,7 @@ _EOT resources { cpu = 200 - memory = 768 + memory = 512 } } @@ -927,7 +928,8 @@ _EOT group "alerts" { - count = 1 + shutdown_delay = "6s" + count = 1 network { mode = "bridge" @@ -1335,8 +1337,574 @@ _EOT resources { - cpu = 50 - memory = 80 + cpu = 50 + memory = 64 + memory_max = 80 + } + + } + } + + group "logs" { + + shutdown_delay = "6s" + + network { + mode = "bridge" + port "metrics" {} + } + + + volume "data" { + source = "monitoring-loki-data" + type = "csi" + access_mode = "single-node-writer" + attachment_mode = "file-system" + } + + + service { + name = "monitoring-loki" + port = 3100 + meta { + metrics-port = "${NOMAD_HOST_PORT_metrics}" + alloc = "${NOMAD_ALLOC_INDEX}" + job = "${NOMAD_JOB_NAME}" + } + + connect { + sidecar_service { + } + sidecar_task { + config { + args = [ + "-c", + 
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", + "-l", + "${meta.connect.log_level}", + "--concurrency", + "${meta.connect.proxy_concurrency}", + "--disable-hot-restart" + ] + } + + resources { + cpu = 50 + memory = 64 + } + + } + } + + + check { + name = "ready" + type = "http" + path = "/ready" + expose = true + interval = "20s" + timeout = "8s" + check_restart { + limit = 6 + grace = "5m" + } + } + + tags = [ + + "traefik.enable=true", + "traefik.http.routers.monitoring-loki.entrypoints=https", + "traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)", + "traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", + "traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki", + + ] + } + + + # The prometheus metrics proxy, adding mTLS to the metrics endpoint + task "metrics-proxy" { + driver = "docker" + user = 8995 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + volumes = [ + "local/default.conf:/etc/nginx/conf.d/default.conf:ro" + ] + pids_limit = 100 + } + + lifecycle { + hook = "poststart" + sidecar = true + } + + vault { + policies = ["metrics"] + } + + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }}{{ end -}} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + template { + data = <<_EOT +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + 
ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://localhost:3100/metrics; + } +} +_EOT + destination = "local/default.conf" + } + + resources { + cpu = 10 + memory = 10 + memory_max = 20 + } + } + + + + task "loki" { + driver = "docker" + + config { + image = "danielberteaud/loki:2.9.5-1" + command = "loki" + args = ["--config.file=/local/loki.yml"] + } + + + vault { + policies = ["monitoring-loki"] + env = false + disable_file = true + change_mode = "noop" + } + + + + # Use a template block instead of env {} so we can fetch values from vault + template { + data = <<_EOT +LANG=fr_FR.utf8 +TZ=Europe/Paris +_EOT + destination = "secrets/.env" + perms = 400 + env = true + } + + + template { + data = <<_EOT +analytics: + reporting_enabled: false +auth_enabled: false +common: + instance_addr: 127.0.0.1 + path_prefix: /data + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /data/chunks + rules_directory: /data/rules +compactor: + compaction_interval: 1h + deletion_mode: filter-and-delete + retention_enabled: true + shared_store: filesystem + working_directory: /data/compactor +ingester: + chunk_idle_period: 1h +limits_config: + ingestion_burst_size_mb: 100 + ingestion_rate_mb: 20 + max_entries_limit_per_query: 20000 + max_query_parallelism: 128 + retention_period: 720h +ruler: + alertmanager_client: + tls_ca_path: /local/monitoring.ca.pem + tls_cert_path: 
/secrets/loki.bundle.pem + tls_key_path: /secrets/loki.bundle.pem + tls_server_name: alertmanager.monitoring + alertmanager_url: monitoring-alertmanager-tls + enable_alertmanager_discovery: true + enable_alertmanager_v2: true + enable_api: true + ring: + kvstore: + store: inmemory + rule_path: /tmp/loki-rules + storage: + local: + directory: /local/rules + type: local +schema_config: + configs: + - from: "2020-10-24" + index: + period: 24h + prefix: index_ + object_store: filesystem + schema: v11 + store: boltdb-shipper +server: + grpc_listen_address: 127.0.0.1 + grpc_listen_port: 9095 + http_listen_address: 127.0.0.1 + http_listen_port: 3100 +storage_config: + boltdb_shipper: + active_index_directory: /data/index + cache_location: /data/boltdb-cache + shared_store: filesystem + +_EOT + destination = "local/loki.yml" + } + + # A client cert, to connect to the AlertManager API + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/monitoring-loki" + (printf "common_name=loki-%s.monitoring" (env "NOMAD_ALLOC_INDEX")) + (printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} +{{ .Cert }} +{{ .Key }} +{{- end -}} +_EOT + destination = "secrets/loki.bundle.pem" + uid = 100000 + gid = 103100 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The monitoring CA chain, to validate AlertManager cert + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + uid = 100000 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + volume_mount { + volume = "data" + destination = "/data" + } + + + resources { + cpu = 150 + memory = 512 + } + + } + } + + # The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.) + # And with a loki sink. 
The goal is to be able to collect logs from various sources + group "aggregator" { + + count = 1 + shutdown_delay = "6s" + + network { + mode = "bridge" + port "syslog-udp" {} + port "metrics" {} + } + + # The main service is the vector source + # It will provide access to other services through the mesh (like loki) + service { + name = "monitoring-vector-aggregator" + port = 9000 + meta { + metrics-port = "${NOMAD_HOST_PORT_metrics}" + alloc = "${NOMAD_ALLOC_INDEX}" + job = "${NOMAD_JOB_NAME}" + } + + connect { + sidecar_service { + proxy { + upstreams { + destination_name = "monitoring-loki" + local_bind_port = 3100 + # Work arround, see https://github.com/hashicorp/nomad/issues/18538 + destination_type = "service" + } + } + } + sidecar_task { + config { + args = [ + "-c", + "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", + "-l", + "${meta.connect.log_level}", + "--concurrency", + "${meta.connect.proxy_concurrency}", + "--disable-hot-restart" + ] + } + + resources { + cpu = 50 + memory = 64 + } + + } + } + + tags = [ + + + ] + } + + + # The prometheus metrics proxy, adding mTLS to the metrics endpoint + task "metrics-proxy" { + driver = "docker" + user = 8995 + + config { + image = "nginxinc/nginx-unprivileged:alpine" + force_pull = true + volumes = [ + "local/default.conf:/etc/nginx/conf.d/default.conf:ro" + ] + pids_limit = 100 + } + + lifecycle { + hook = "poststart" + sidecar = true + } + + vault { + policies = ["metrics"] + } + + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }}{{ end -}} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + template { + data = <<_EOT +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + 
ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + location /metrics { + proxy_pass http://127.0.0.1:9001/metrics; + } +} +_EOT + destination = "local/default.conf" + } + + resources { + cpu = 10 + memory = 10 + memory_max = 20 + } + } + + + + task "vector" { + driver = "docker" + + config { + image = "danielberteaud/vector:0.36.1-1" + readonly_rootfs = true + pids_limit = 200 + args = ["--config=/local/vector.yml"] + } + + + + # Use a template block instead of env {} so we can fetch values from vault + template { + data = <<_EOT +LANG=fr_FR.utf8 +TZ=Europe/Paris +_EOT + destination = "secrets/.env" + perms = 400 + env = true + } + + + template { + data = <<_EOT +data_dir: /local +expire_metrics_secs: 600 + +sources: + + logs_vector: + type: vector + address: 127.0.0.1:9000 + + vector_metrics: + type: internal_metrics + +transforms: + split-by-app: + type: route + inputs: [ "logs_*" ] + route: + traefik: '.service == "traefik"' + postgres: '.service == "postgres"' + syslog: '.source_type == "syslog"' + + parse-traefik: + type: remap + inputs: ["split-by-app.traefik"] + source: | + .http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}") + .loki_labels.http_method = .http.verb + .loki_labels.http_status = .http.response + .loki_labels.user = .http.auth + + parse-postgres: + type: remap + inputs: ["split-by-app.postgres"] + source: | + if includes(array!(.nomad.tags), "master"){ + .loki_labels.pg_role = 
"master" + } else if includes(array!(.nomad.tags), "replica"){ + .loki_labels.pg_role = "replica" + } + + parse-syslog: + type: remap + inputs: ["split-by-app.syslog"] + source: | + # PfSense sends /usr/sbin/cron as the appname, instead of cron + if string!(.appname) == "/usr/sbin/cron" { + .appname = "cron" + } + .service = .appname + +sinks: + + loki: + type: loki + inputs: [ "split-by-app._unmatched", "parse-*" ] + endpoint: http://127.0.0.1:3100 + encoding: + codec: text + labels: + job: "{{ .service }}" + host: "{{ .host }}" + _*: "{{ .loki_labels }}" + buffer: + type: disk + max_size: 268435488 + remove_label_fields: true + + # Expose vector internal metrics + prometheus: + type: prometheus_exporter + inputs: ["vector_metrics"] + address: "127.0.0.1:9001" + +_EOT + destination = "local/vector.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + change_mode = "signal" + change_signal = "SIGHUP" + } + + + resources { + cpu = 100 + memory = 192 } } diff --git a/example/vault/policies/monitoring-loki.hcl b/example/vault/policies/monitoring-loki.hcl new file mode 100644 index 0000000..9c42dca --- /dev/null +++ b/example/vault/policies/monitoring-loki.hcl @@ -0,0 +1,8 @@ + +path "pki/monitoring/issue/monitoring-loki" { + capabilities = ["update"] +} + +path "kv/service/monitoring/loki" { + capabilities = ["read"] +} diff --git a/images/loki/Dockerfile b/images/loki/Dockerfile new file mode 100644 index 0000000..ff264e3 --- /dev/null +++ b/images/loki/Dockerfile @@ -0,0 +1,38 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG LOKI_VERSION=[[ .monitoring.loki.version ]] + +ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp +ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp +RUN set -eux &&\ + apk --no-cache add unzip &&\ + cd /tmp &&\ + grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\ + unzip loki-linux-amd64.zip &&\ + mkdir /opt/loki &&\ 
+ mv loki-linux-amd64 /opt/loki/loki + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +ENV PATH=/opt/loki:$PATH +COPY --from=builder /opt/loki /opt/loki +RUN set -eux &&\ + addgroup -g 3100 loki &&\ + adduser \ + --system \ + --disabled-password \ + --uid 3100 \ + --ingroup loki \ + --home /opt/loki \ + --no-create-home \ + --shell /sbin/nologin \ + loki &&\ + mkdir /data &&\ + chown loki:loki /data &&\ + chmod 700 /data + +WORKDIR /opt/loki +USER loki +EXPOSE 3100 +CMD ["loki"] diff --git a/images/vector/Dockerfile b/images/vector/Dockerfile new file mode 100644 index 0000000..208e1ed --- /dev/null +++ b/images/vector/Dockerfile @@ -0,0 +1 @@ +FROM timberio/vector:[[ .monitoring.vector.version ]]-alpine diff --git a/init/pki b/init/pki index d8d69c2..6b8ac44 100755 --- a/init/pki +++ b/init/pki @@ -31,6 +31,19 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \ max_ttl=100h \ ou="[[ $c.vault.pki.ou ]]" +# Create a role for loki (which will only be a client, for AlertManager) +vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-loki \ + allowed_domains="[[ .instance ]]" \ + allow_bare_domains=false \ + allow_subdomains=true \ + allow_localhost=false \ + allow_ip_sans=false \ + server_flag=false \ + client_flag=true \ + allow_wildcard_certificates=false \ + max_ttl=100h \ + ou="[[ $c.vault.pki.ou ]]" + # Create a role for metrics exporters (server only) vault write [[ $c.vault.pki.path ]]/roles/metrics \ allowed_domains="[[ .instance ]]" \ @@ -64,6 +77,6 @@ vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \ allow_subdomains=true \ allow_wildcard_certificates=false \ max_ttl=168h \ - server_flags=false \ - client_flags=true \ + server_flag=false \ + client_flag=true \ ou="Cluster metrics exporter" diff --git a/monitoring.nomad.hcl b/monitoring-services.nomad.hcl similarity index 67% rename from monitoring.nomad.hcl rename to monitoring-services.nomad.hcl index 
8ffb256..cfd0100 100644 --- a/monitoring.nomad.hcl +++ b/monitoring-services.nomad.hcl @@ -1,4 +1,4 @@ -job "[[ .instance ]]" { +job "[[ .instance ]]-services" { [[ template "common/job_start" . ]] @@ -163,6 +163,7 @@ _EOT [[- $c := merge .monitoring.alertmanager .monitoring . ]] + shutdown_delay = "6s" count = [[ $c.count ]] network { @@ -370,6 +371,190 @@ _EOT destination = "/data" } +[[ template "common/resources" $c ]] + } + } + + group "logs" { + +[[- $c := merge .monitoring.loki .monitoring . ]] + + shutdown_delay = "6s" + + network { + mode = "bridge" + port "metrics" {} + } + +[[ template "common/volumes" $c ]] + + service { + name = "[[ .instance ]]-loki[[ .consul.suffix ]]" + port = 3100 +[[ template "common/service_meta" $c ]] +[[ template "common/connect" $c ]] + + check { + name = "ready" + type = "http" + path = "/ready" + expose = true + interval = "20s" + timeout = "8s" + check_restart { + limit = 6 + grace = "5m" + } + } + + tags = [ +[[ template "common/traefik_tags" $c ]] + ] + } + +[[ template "common/task.metrics_proxy" $c ]] + + task "loki" { + driver = "[[ $c.nomad.driver ]]" + + config { + image = "[[ $c.image ]]" + command = "loki" + args = ["--config.file=/local/loki.yml"] + } + +[[ template "common/vault.policies" $c ]] +[[ template "common/file_env" $c ]] + + template { + data =<<_EOT +[[- if isKind "map" $c.custom_config ]] +[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]] +[[- else if isKind "string" $c.custom_config ]] +[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]] +[[- else ]] +# Not using custom_config as it's invalid +[[ template "monitoring/loki/loki.yml" $c ]] +[[- end ]] +_EOT + destination = "local/loki.yml" + } + + # A client cert, to connect to the AlertManager API + template { + data = <<_EOT +{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki" + (printf "common_name=loki-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX")) + 
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}} +{{ .Cert }} +{{ .Key }} +{{- end -}} +_EOT + destination = "secrets/loki.bundle.pem" + uid = 100000 + gid = 103100 + perms = "0440" + change_mode = "signal" + change_signal = "SIGHUP" + } + + # The monitoring CA chain, to validate AlertManager cert + template { + data = <<_EOT +{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + uid = 100000 + gid = 100000 + change_mode = "signal" + change_signal = "SIGHUP" + } + + volume_mount { + volume = "data" + destination = "/data" + } + +[[ template "common/resources" $c ]] + } + } + + # The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.) + # And with a loki sink. The goal is to be able to collect logs from various sources + group "aggregator" { +[[- $c := merge .monitoring.aggregator .monitoring . ]] + + count = [[ $c.count ]] + shutdown_delay = "6s" + + network { + mode = "bridge" + port "syslog-udp" {} + port "metrics" {} + } + + # The main service is the vector source + # It will provide access to other services through the mesh (like loki) + service { + name = "[[ .instance ]]-vector-aggregator[[ .consul.suffix ]]" + port = 9000 +[[ template "common/service_meta" $c ]] +[[ template "common/connect" $c ]] + tags = [ +[[ template "common/traefik_tags" merge $c.vector $c ]] + ] + } + +[[- if $c.syslog_udp.enabled ]] + # The syslog UDP service can be used to ingest standard syslog logs from other + # devices, and can be exposed by Traefik for this + service { + name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]" + port = "syslog-udp" + tags = [ +[[ template "common/traefik_tags" merge $c.syslog_udp $c ]] + # UDP services can't be used through the mesh + "[[ $c.traefik.instance ]].consulcatalog.connect=false" + ] + } +[[- end ]] + +[[- if $c.fluentd.enabled ]] + # The fluentd service can be used to ingest 
fluentd logs + service { + name = "[[ .instance ]]-fluentd[[ .consul.suffix ]]" + port = 24224 + tags = [ +[[ template "common/traefik_tags" merge $c.fluentd $c ]] + ] + } +[[- end ]] + +[[ template "common/task.metrics_proxy" $c ]] + + task "vector" { + driver = "[[ $c.nomad.driver ]]" + + config { + image = "[[ $c.image ]]" + readonly_rootfs = true + pids_limit = 200 + args = [ "--config=/local/vector.yml" ] + } + +[[ template "common/file_env" $c ]] + + template { + data = <<_EOT +[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]] +_EOT + destination = "local/vector.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + change_mode = "signal" + change_signal = "SIGHUP" + } + [[ template "common/resources" $c ]] } } diff --git a/templates/aggregator/vector.yml b/templates/aggregator/vector.yml new file mode 100644 index 0000000..1185d2f --- /dev/null +++ b/templates/aggregator/vector.yml @@ -0,0 +1,85 @@ +data_dir: /local +expire_metrics_secs: 600 + +sources: + + logs_vector: + type: vector + address: 127.0.0.1:9000 + + vector_metrics: + type: internal_metrics + +[[- if .syslog_udp.enabled ]] + logs_syslog_udp: + type: syslog + mode: udp + address: 0.0.0.0:{{{ env "NOMAD_ALLOC_PORT_syslog_udp" }}} +[[- end ]] + +[[- if .fluentd.enabled ]] + logs_fluentd: + type: fluent + address: 127.0.0.1:24224 +[[- end ]] + +transforms: + split-by-app: + type: route + inputs: [ "logs_*" ] + route: + traefik: '.service == "traefik"' + postgres: '.service == "postgres"' + syslog: '.source_type == "syslog"' + + parse-traefik: + type: remap + inputs: ["split-by-app.traefik"] + source: | + .http = parse_grok!(.message, "%{HTTPD_COMMONLOG}") + .loki_labels.http_method = .http.verb + .loki_labels.http_status = .http.response + .loki_labels.user = .http.auth + + parse-postgres: + type: remap + inputs: ["split-by-app.postgres"] + source: | + if includes(array!(.nomad.tags), "master"){ + .loki_labels.pg_role = "master" + } else if 
includes(array!(.nomad.tags), "replica"){ + .loki_labels.pg_role = "replica" + } + + parse-syslog: + type: remap + inputs: ["split-by-app.syslog"] + source: | + # PfSense sends /usr/sbin/cron as the appname, instead of cron + if string!(.appname) == "/usr/sbin/cron" { + .appname = "cron" + } + .service = .appname + +sinks: + + loki: + type: loki + inputs: [ "split-by-app._unmatched", "parse-*" ] + endpoint: http://127.0.0.1:3100 + encoding: + codec: text + labels: + job: "{{ .service }}" + host: "{{ .host }}" + _*: "{{ .loki_labels }}" + buffer: + type: disk + max_size: 268435488 + remove_label_fields: true + + # Expose vector internal metrics + prometheus: + type: prometheus_exporter + inputs: ["vector_metrics"] + address: "127.0.0.1:9001" diff --git a/templates/loki/loki.yml b/templates/loki/loki.yml new file mode 100644 index 0000000..e554349 --- /dev/null +++ b/templates/loki/loki.yml @@ -0,0 +1,75 @@ +auth_enabled: false + +server: + http_listen_address: 127.0.0.1 + http_listen_port: 3100 + grpc_listen_address: 127.0.0.1 + grpc_listen_port: 9095 + #log_level: debug + +common: + path_prefix: /data + storage: + filesystem: + chunks_directory: /data/chunks + rules_directory: /data/rules + replication_factor: 1 + instance_addr: 127.0.0.1 + ring: + kvstore: + store: inmemory + +storage_config: + boltdb_shipper: + active_index_directory: /data/index + cache_location: /data/boltdb-cache + shared_store: filesystem + +schema_config: + configs: + - from: '2020-10-24' + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + +compactor: + working_directory: /data/compactor + shared_store: filesystem + compaction_interval: 1h + retention_enabled: true + deletion_mode: filter-and-delete + +ingester: + chunk_idle_period: 1h + +limits_config: + retention_period: '[[ .retention ]]' + ingestion_rate_mb: 20 + ingestion_burst_size_mb: 100 + max_entries_limit_per_query: 20000 + max_query_parallelism: 128 + +ruler: + 
alertmanager_url: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]] + enable_alertmanager_discovery: true + alertmanager_client: + tls_cert_path: /secrets/loki.bundle.pem + tls_key_path: /secrets/loki.bundle.pem + tls_ca_path: /local/monitoring.ca.pem + tls_server_name: alertmanager.[[ .instance ]] + enable_alertmanager_v2: true + enable_api: true + rule_path: /tmp/loki-rules + storage: + type: local + local: + directory: /local/rules + ring: + kvstore: + store: inmemory + +analytics: + reporting_enabled: false diff --git a/templates/prometheus/prometheus.yml b/templates/prometheus/prometheus.yml index 95502e9..974858b 100644 --- a/templates/prometheus/prometheus.yml +++ b/templates/prometheus/prometheus.yml @@ -19,6 +19,7 @@ alerting: consul_sd_configs: - server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 scheme: http + token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }} datacenter: [[ .consul.datacenter ]] relabel_configs: # Only keep alertmanagers diff --git a/variables.yml b/variables.yml index 599d5ca..816bbe7 100644 --- a/variables.yml +++ b/variables.yml @@ -18,7 +18,7 @@ monitoring: env: {} resources: cpu: 10 - memory: 30 + memory: 25 probes: [] blackbox: @@ -37,20 +37,20 @@ monitoring: env: {} resources: cpu: 20 - memory: 64 + memory: 32 vault: policies: - - '[[ .instance ]]-consul-exporter' + - '[[ .instance ]]-consul-exporter[[ .consul.suffix ]]' cluster: image: nginxinc/nginx-unprivileged:alpine env: {} resources: cpu: 10 - memory: 18 + memory: 15 vault: policies: - - '[[ .instance ]]-cluster-exporter' + - '[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]' - metrics prometheus: @@ -65,17 +65,17 @@ monitoring: resources: cpu: 200 - memory: 768 + memory: 512 volumes: data: type: csi - source: '[[ .instance ]]-prometheus-data' + source: '[[ .instance ]]-prometheus-data[[ .consul.suffix ]]' per_alloc: true vault: policies: - - '[[ .instance ]]-prometheus' + - '[[ .instance ]]-prometheus[[ 
.consul.suffix ]]' jobs: {} alert_rules: {} @@ -101,7 +101,8 @@ monitoring: env: {} resources: cpu: 50 - memory: 80 + memory: 64 + memory_max: 80 public_url: https://alerte.example.org traefik: enabled: true @@ -109,7 +110,7 @@ monitoring: strip_prefix: false volumes: data: - source: '[[ .instance ]]-alertmanager-data' + source: '[[ .instance ]]-alertmanager-data[[ .consul.suffix ]]' type: csi per_alloc: true prometheus: @@ -117,11 +118,69 @@ monitoring: vault: policies: - metrics - - '[[ .instance ]]-alertmanager' + - '[[ .instance ]]-alertmanager[[ .consul.suffix ]]' email: from: alertmanager@[[ .consul.domain ]] - custom_config: "" - + custom_config: {} + + loki: + version: 2.9.5 + image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1' + env: {} + resources: + cpu: 150 + memory: 512 + vault: + policies: + - '[[ .instance ]]-loki[[ .consul.suffix ]]' + public_url: https://loki.example.org + traefik: + router: loki + retention: 720h # 1 month + custom_config: {} + prometheus: + metrics_url: http://localhost:3100/metrics + volumes: + data: + type: csi + source: '[[ .instance ]]-loki-data[[ .consul.suffix ]]' + + vector: + version: 0.36.1 + image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1' + + aggregator: + count: 1 + image: '[[ .monitoring.vector.image ]]' + env: {} + resources: + cpu: 100 + memory: 192 + consul: + connect: + upstreams: + - destination_name: '[[ .instance ]]-loki[[ .consul.suffix ]]' + local_bind_port: 3100 + fluentd: + enabled: false + traefik: + router: fluentd + entrypoints: + - fluentd + syslog_udp: + enabled: false + traefik: + router: syslog-udp + entrypoints: + - syslog + vector: + enabled: true + public_url: https://vector.example.org + traefik: + enabled: false + prometheus: + metrics_url: http://127.0.0.1:9001/metrics + prometheus: enabled: true diff --git a/vault/policies/monitoring-loki.hcl b/vault/policies/monitoring-loki.hcl new file mode 100644 index 0000000..bee4e15 --- /dev/null +++ 
b/vault/policies/monitoring-loki.hcl @@ -0,0 +1,8 @@ +[[- $c := merge .monitoring.loki .monitoring . ]] +path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki" { + capabilities = ["update"] +} + +path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/loki" { + capabilities = ["read"] +}