More work, including loki and vector aggregator

This commit is contained in:
Daniel Berteaud 2024-03-20 11:10:07 +01:00
parent 65441a4a9e
commit 7ed40afe9c
21 changed files with 1271 additions and 43 deletions

View File

@ -6,7 +6,7 @@
- ~~blackbox-exporter~~
- ~~consul-exporter~~
- vector
- loki
- ~~loki~~
- grafana
- nomad-vector-logger
@ -27,17 +27,18 @@
- consul defaults & intentions
- ~~prometheus~~
- ~~alertmanager~~
- loki
- ~~loki~~
- tasks
- ~~alertmanager~~
- vector-aggregator
- vector-agent (dans job agent)
- loki (modulariser ou laisser en monolithique ?)
- ~~loki (modulariser ou laisser en monolithique ?)~~
- grafana
- ~~cluster-metrics (job exporters)~~
- questions
- questions / various
- prom rules: keep or move to a -conf bundle ?
- ~~config alertes am (recipient + routing)~~
- ~~http and tcp probes, as exporters are now in a dedicated job~~
- alertmanager & rules for loki

View File

@ -0,0 +1,3 @@
# Declare the loki mesh service as HTTP so L7 (path/method) intentions can apply
Kind = "service-defaults"
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
Protocol = "http"

View File

@ -0,0 +1,55 @@
# L7 intentions restricting which mesh services may reach loki, and on which paths
Kind = "service-intentions"
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
Sources = [
# The traefik reverse proxy may reach every path (it handles the public exposure)
{
Name = "[[ (merge .monitoring.loki .monitoring .).traefik.instance ]]"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/"
}
}
]
},
# Grafana may query the loki API and probe the readiness endpoint
{
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/loki/api/v1"
Methods = ["GET", "HEAD", "POST"]
}
},
{
Action = "allow"
HTTP {
PathPrefix = "/ready"
Methods = ["GET"]
}
}
]
},
# Vector (aggregator and agent) may only push logs and probe readiness
[[- range $idx, $service := coll.Slice "vector-aggregator" "vector-agent" ]]
{
Name = "[[ $.instance ]]-[[ $service ]][[ $.consul.suffix ]]"
Permissions = [
{
Action = "allow"
HTTP {
PathExact = "/loki/api/v1/push"
Methods = ["POST"]
}
},
{
Action = "allow"
HTTP {
PathExact = "/ready"
Methods = ["GET"]
}
}
]
},
[[- end ]]
]

View File

@ -1,12 +1,12 @@
- ~~Split exporters dans un job dédié (pour pouvoir tourner sur node_pool spécifique)~~
- Créer monitoring-agent type system avec vector + node-exporter
- images
- prometheus
- ping-exporter
- blackbox-exporter
- consul-exporter
- ~~prometheus~~
- ~~ping-exporter~~
- ~~blackbox-exporter~~
- ~~consul-exporter~~
- vector
- loki
- ~~loki~~
- grafana
- nomad-vector-logger
@ -16,28 +16,29 @@
- ~~monitoring -> am~~
- vault pol
- prometheus
- ~~prometheus~~
- ~~issue prom on monitoring~~
- ~~issue prom on consul~~
- consul-exporter
- issue consul-exporter on consul
- alertmanager
- ~~consul-exporter~~
- ~~issue consul-exporter on consul~~
- ~~alertmanager~~
- ~~issue alertmanager on monitoring~~
- consul defaults & intentions
- ~~prometheus~~
- ~~alertmanager~~
- loki
- ~~loki~~
- tasks
- ~~alertmanager~~
- vector-aggregator
- vector-agent (dans job agent)
- loki (modulariser ou laisser en monolithique ?)
- ~~loki (modulariser ou laisser en monolithique ?)~~
- grafana
- cluster-metrics (job exporters)
- ~~cluster-metrics (job exporters)~~
- questions
- questions / various
- prom rules: keep or move to a -conf bundle ?
- ~~config alertes am (recipient + routing)~~
- http and tcp probes, as exporters are now in a dedicated job
- ~~http and tcp probes, as exporters are now in a dedicated job~~
- alertmanager & rules for loki

View File

@ -0,0 +1,3 @@
# Declare the loki mesh service as HTTP so L7 (path/method) intentions can apply
Kind = "service-defaults"
Name = "monitoring-loki"
Protocol = "http"

View File

@ -0,0 +1,72 @@
# L7 intentions restricting which mesh services may reach loki, and on which paths
Kind = "service-intentions"
Name = "monitoring-loki"
Sources = [
# The traefik reverse proxy may reach every path (it handles the public exposure)
{
Name = "traefik"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/"
}
}
]
},
# Grafana may query the loki API and probe the readiness endpoint
{
Name = "monitoring-grafana"
Permissions = [
{
Action = "allow"
HTTP {
PathPrefix = "/loki/api/v1"
Methods = ["GET", "HEAD", "POST"]
}
},
{
Action = "allow"
HTTP {
PathPrefix = "/ready"
Methods = ["GET"]
}
}
]
},
# The vector aggregator may only push logs and probe readiness
{
Name = "monitoring-vector-aggregator"
Permissions = [
{
Action = "allow"
HTTP {
PathExact = "/loki/api/v1/push"
Methods = ["POST"]
}
},
{
Action = "allow"
HTTP {
PathExact = "/ready"
Methods = ["GET"]
}
}
]
},
# The vector agents may only push logs and probe readiness
{
Name = "monitoring-vector-agent"
Permissions = [
{
Action = "allow"
HTTP {
PathExact = "/loki/api/v1/push"
Methods = ["POST"]
}
},
{
Action = "allow"
HTTP {
PathExact = "/ready"
Methods = ["GET"]
}
}
]
},
]

View File

@ -0,0 +1,38 @@
# Build stage: download the loki release and verify its checksum before extracting
FROM danielberteaud/alpine:24.3-1 AS builder
ARG LOKI_VERSION=2.9.5
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp
RUN set -eux &&\
apk --no-cache add unzip &&\
cd /tmp &&\
grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\
unzip loki-linux-amd64.zip &&\
mkdir /opt/loki &&\
mv loki-linux-amd64 /opt/loki/loki

# Runtime stage: minimal image running loki as a dedicated unprivileged user
FROM danielberteaud/alpine:24.3-1
# LABEL is the supported replacement for the deprecated MAINTAINER instruction
LABEL maintainer="Daniel Berteaud <dbd@ehtrace.com>"
ENV PATH=/opt/loki:$PATH
COPY --from=builder /opt/loki /opt/loki
RUN set -eux &&\
addgroup -g 3100 loki &&\
adduser \
--system \
--disabled-password \
--uid 3100 \
--ingroup loki \
--home /opt/loki \
--no-create-home \
--shell /sbin/nologin \
loki &&\
mkdir /data &&\
chown loki:loki /data &&\
# Fix: was "chmod 700 data" (relative path), which did not match the /data dir created above
chmod 700 /data
WORKDIR /opt/loki
USER loki
EXPOSE 3100
CMD ["loki"]

View File

@ -0,0 +1 @@
FROM timberio/vector:0.36.1-alpine

View File

@ -118,6 +118,19 @@ vault write pki/monitoring/roles/monitoring-prometheus \
max_ttl=100h \
ou="Monitoring"
# Create a role for loki (which will only be a client, for AlertManager)
vault write pki/monitoring/roles/monitoring-loki \
allowed_domains="monitoring" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="Monitoring"
# Create a role for metrics exporters (server only)
vault write pki/monitoring/roles/metrics \
allowed_domains="monitoring" \
@ -151,6 +164,6 @@ vault write pki/consul/roles/monitoring-cluster-exporter \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
server_flags=false \
client_flags=true \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"

View File

@ -114,7 +114,7 @@ _EOT
resources {
cpu = 20
memory = 64
memory = 32
}
}
@ -411,7 +411,7 @@ _EOT
resources {
cpu = 10
memory = 18
memory = 15
}
}

View File

@ -1,4 +1,4 @@
job "monitoring" {
job "monitoring-services" {
datacenters = ["dc1"]
@ -225,6 +225,7 @@ alerting:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
datacenter: dc1
relabel_configs:
# Only keep alertmanagers
@ -919,7 +920,7 @@ _EOT
resources {
cpu = 200
memory = 768
memory = 512
}
}
@ -927,7 +928,8 @@ _EOT
group "alerts" {
count = 1
shutdown_delay = "6s"
count = 1
network {
mode = "bridge"
@ -1335,8 +1337,574 @@ _EOT
resources {
cpu = 50
memory = 80
cpu = 50
memory = 64
memory_max = 80
}
}
}
group "logs" {
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
volume "data" {
source = "monitoring-loki-data"
type = "csi"
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
service {
name = "monitoring-loki"
port = 3100
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
}
connect {
sidecar_service {
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
check {
name = "ready"
type = "http"
path = "/ready"
expose = true
interval = "20s"
timeout = "8s"
check_restart {
limit = 6
grace = "5m"
}
}
tags = [
"traefik.enable=true",
"traefik.http.routers.monitoring-loki.entrypoints=https",
"traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)",
"traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
"traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki",
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
_EOT
destination = "secrets/metrics.bundle.pem"
}
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://localhost:3100/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
task "loki" {
driver = "docker"
config {
image = "danielberteaud/loki:2.9.5-1"
command = "loki"
args = ["--config.file=/local/loki.yml"]
}
vault {
policies = ["monitoring-loki"]
env = false
disable_file = true
change_mode = "noop"
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
template {
data = <<_EOT
analytics:
reporting_enabled: false
auth_enabled: false
common:
instance_addr: 127.0.0.1
path_prefix: /data
replication_factor: 1
ring:
kvstore:
store: inmemory
storage:
filesystem:
chunks_directory: /data/chunks
rules_directory: /data/rules
compactor:
compaction_interval: 1h
deletion_mode: filter-and-delete
retention_enabled: true
shared_store: filesystem
working_directory: /data/compactor
ingester:
chunk_idle_period: 1h
limits_config:
ingestion_burst_size_mb: 100
ingestion_rate_mb: 20
max_entries_limit_per_query: 20000
max_query_parallelism: 128
retention_period: 720h
ruler:
alertmanager_client:
tls_ca_path: /secrets/monitoring.ca.pem
tls_cert_path: /secrets/loki.bundle.pem
tls_key_path: /secrets/loki.bundle.pem
tls_server_name: alertmanager.monitoring
alertmanager_url: monitoring-alertmanager-tls
enable_alertmanager_discovery: true
enable_alertmanager_v2: true
enable_api: true
ring:
kvstore:
store: inmemory
rule_path: /tmp/loki-rules
storage:
local:
directory: /local/rules
type: local
schema_config:
configs:
- from: "2020-10-24"
index:
period: 24h
prefix: index_
object_store: filesystem
schema: v11
store: boltdb-shipper
server:
grpc_listen_address: 127.0.0.1
grpc_listen_port: 9095
http_listen_address: 127.0.0.1
http_listen_port: 3100
storage_config:
boltdb_shipper:
active_index_directory: /data/index
cache_location: /data/boltdb-cache
shared_store: filesystem
_EOT
destination = "local/loki.yml"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/monitoring-loki"
(printf "common_name=loki-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/loki.bundle.pem"
uid = 100000
gid = 103100
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
volume_mount {
volume = "data"
destination = "/data"
}
resources {
cpu = 150
memory = 512
}
}
}
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector etc.)
# And with a loki sink. The goal is to be able to collect logs from various sources
group "aggregator" {
count = 1
shutdown_delay = "6s"
network {
mode = "bridge"
port "syslog-udp" {}
port "metrics" {}
}
# The main service is the vector source
# It will provide access to other services through the mesh (like loki)
service {
name = "monitoring-vector-aggregator"
port = 9000
meta {
metrics-port = "${NOMAD_HOST_PORT_metrics}"
alloc = "${NOMAD_ALLOC_INDEX}"
job = "${NOMAD_JOB_NAME}"
}
connect {
sidecar_service {
proxy {
upstreams {
destination_name = "monitoring-loki"
local_bind_port = 3100
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
destination_type = "service"
}
}
}
sidecar_task {
config {
args = [
"-c",
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
"-l",
"${meta.connect.log_level}",
"--concurrency",
"${meta.connect.proxy_concurrency}",
"--disable-hot-restart"
]
}
resources {
cpu = 50
memory = 64
}
}
}
tags = [
]
}
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
task "metrics-proxy" {
driver = "docker"
user = 8995
config {
image = "nginxinc/nginx-unprivileged:alpine"
force_pull = true
volumes = [
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
]
pids_limit = 100
}
lifecycle {
hook = "poststart"
sidecar = true
}
vault {
policies = ["metrics"]
}
template {
data = <<_EOT
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
{{ .Cert }}
{{ .Key }}{{ end -}}
_EOT
destination = "secrets/metrics.bundle.pem"
}
template {
data = <<_EOT
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
}
template {
data = <<_EOT
server {
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
http2 on;
ssl_certificate /secrets/metrics.bundle.pem;
ssl_certificate_key /secrets/metrics.bundle.pem;
ssl_client_certificate /local/monitoring.ca.pem;
ssl_verify_client on;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1h;
ssl_session_tickets off;
gzip on;
gzip_types
text/plain;
gzip_vary on;
server_tokens off;
if ($request_method !~ ^(GET|HEAD)$ ) {
return 405;
}
location /metrics {
proxy_pass http://127.0.0.1:9001/metrics;
}
}
_EOT
destination = "local/default.conf"
}
resources {
cpu = 10
memory = 10
memory_max = 20
}
}
task "vector" {
driver = "docker"
config {
image = "danielberteaud/vector:0.36.1-1"
readonly_rootfs = true
pids_limit = 200
args = ["--config=/local/vector.yml"]
}
# Use a template block instead of env {} so we can fetch values from vault
template {
data = <<_EOT
LANG=fr_FR.utf8
TZ=Europe/Paris
_EOT
destination = "secrets/.env"
perms = 400
env = true
}
template {
data = <<_EOT
data_dir: /local
expire_metrics_secs: 600
sources:
logs_vector:
type: vector
address: 127.0.0.1:9000
vector_metrics:
type: internal_metrics
transforms:
split-by-app:
type: route
inputs: [ "logs_*" ]
route:
traefik: '.service == "traefik"'
postgres: '.service == "postgres"'
syslog: '.source_type == "syslog"'
parse-traefik:
type: remap
inputs: ["split-by-app.traefik"]
source: |
.http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
.loki_labels.http_method = .http.verb
.loki_labels.http_status = .http.response
.loki_labels.user = .http.auth
parse-postgres:
type: remap
inputs: ["split-by-app.postgres"]
source: |
if includes(array!(.nomad.tags), "master"){
.loki_labels.pg_role = "master"
} else if includes(array!(.nomad.tags), "replica"){
.loki_labels.pg_role = "replica"
}
parse-syslog:
type: remap
inputs: ["split-by-app.syslog"]
source: |
# PfSense sends /usr/sbin/cron as the appname, instead of cron
if string!(.appname) == "/usr/sbin/cron" {
.appname = "cron"
}
.service = .appname
sinks:
loki:
type: loki
inputs: [ "split-by-app._unmatched", "parse-*" ]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
job: "{{ .service }}"
host: "{{ .host }}"
_*: "{{ .loki_labels }}"
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
# Expose vector internal metrics
prometheus:
type: prometheus_exporter
inputs: ["vector_metrics"]
address: "127.0.0.1:9001"
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
change_mode = "signal"
change_signal = "SIGHUP"
}
resources {
cpu = 100
memory = 192
}
}

View File

@ -0,0 +1,8 @@
# Allow issuing client certificates from the monitoring-loki PKI role
# (used by loki's ruler to authenticate to AlertManager over mTLS)
path "pki/monitoring/issue/monitoring-loki" {
capabilities = ["update"]
}
# Read-only access to loki's secrets in the KV store
path "kv/service/monitoring/loki" {
capabilities = ["read"]
}

38
images/loki/Dockerfile Normal file
View File

@ -0,0 +1,38 @@
# Build stage: download the loki release and verify its checksum before extracting
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
ARG LOKI_VERSION=[[ .monitoring.loki.version ]]
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp
RUN set -eux &&\
apk --no-cache add unzip &&\
cd /tmp &&\
grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\
unzip loki-linux-amd64.zip &&\
mkdir /opt/loki &&\
mv loki-linux-amd64 /opt/loki/loki

# Runtime stage: minimal image running loki as a dedicated unprivileged user
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
# LABEL is the supported replacement for the deprecated MAINTAINER instruction
LABEL maintainer="[[ .docker.maintainer ]]"
ENV PATH=/opt/loki:$PATH
COPY --from=builder /opt/loki /opt/loki
RUN set -eux &&\
addgroup -g 3100 loki &&\
adduser \
--system \
--disabled-password \
--uid 3100 \
--ingroup loki \
--home /opt/loki \
--no-create-home \
--shell /sbin/nologin \
loki &&\
mkdir /data &&\
chown loki:loki /data &&\
# Fix: was "chmod 700 data" (relative path), which did not match the /data dir created above
chmod 700 /data
WORKDIR /opt/loki
USER loki
EXPOSE 3100
CMD ["loki"]

1
images/vector/Dockerfile Normal file
View File

@ -0,0 +1 @@
FROM timberio/vector:[[ .monitoring.vector.version ]]-alpine

View File

@ -31,6 +31,19 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
max_ttl=100h \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for loki (which will only be a client, for AlertManager)
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-loki \
allowed_domains="[[ .instance ]]" \
allow_bare_domains=false \
allow_subdomains=true \
allow_localhost=false \
allow_ip_sans=false \
server_flag=false \
client_flag=true \
allow_wildcard_certificates=false \
max_ttl=100h \
ou="[[ $c.vault.pki.ou ]]"
# Create a role for metrics exporters (server only)
vault write [[ $c.vault.pki.path ]]/roles/metrics \
allowed_domains="[[ .instance ]]" \
@ -64,6 +77,6 @@ vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \
allow_subdomains=true \
allow_wildcard_certificates=false \
max_ttl=168h \
server_flags=false \
client_flags=true \
server_flag=false \
client_flag=true \
ou="Cluster metrics exporter"

View File

@ -1,4 +1,4 @@
job "[[ .instance ]]" {
job "[[ .instance ]]-services" {
[[ template "common/job_start" . ]]
@ -163,6 +163,7 @@ _EOT
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
shutdown_delay = "6s"
count = [[ $c.count ]]
network {
@ -370,6 +371,190 @@ _EOT
destination = "/data"
}
[[ template "common/resources" $c ]]
}
}
group "logs" {
[[- $c := merge .monitoring.loki .monitoring . ]]
shutdown_delay = "6s"
network {
mode = "bridge"
port "metrics" {}
}
[[ template "common/volumes" $c ]]
service {
name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
port = 3100
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
check {
name = "ready"
type = "http"
path = "/ready"
expose = true
interval = "20s"
timeout = "8s"
check_restart {
limit = 6
grace = "5m"
}
}
tags = [
[[ template "common/traefik_tags" $c ]]
]
}
[[ template "common/task.metrics_proxy" $c ]]
task "loki" {
driver = "[[ $c.nomad.driver ]]"
config {
image = "[[ $c.image ]]"
command = "loki"
args = ["--config.file=/local/loki.yml"]
}
[[ template "common/vault.policies" $c ]]
[[ template "common/file_env" $c ]]
template {
data =<<_EOT
[[- if isKind "map" $c.custom_config ]]
[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else if isKind "string" $c.custom_config ]]
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
[[- else ]]
# Not using custom_config as it's invalid
[[ template "monitoring/loki/loki.yml" $c ]]
[[- end ]]
_EOT
destination = "local/loki.yml"
}
# A client cert, to connect to the AlertManager API
template {
data = <<_EOT
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki"
(printf "common_name=loki-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
{{ .Cert }}
{{ .Key }}
{{- end -}}
_EOT
destination = "secrets/loki.bundle.pem"
uid = 100000
gid = 103100
perms = "0440"
change_mode = "signal"
change_signal = "SIGHUP"
}
# The monitoring CA chain, to validate AlertManager cert
template {
data = <<_EOT
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
_EOT
destination = "local/monitoring.ca.pem"
uid = 100000
gid = 100000
change_mode = "signal"
change_signal = "SIGHUP"
}
volume_mount {
volume = "data"
destination = "/data"
}
[[ template "common/resources" $c ]]
}
}
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector etc.)
# And with a loki sink. The goal is to be able to collect logs from various sources
group "aggregator" {
[[- $c := merge .monitoring.aggregator .monitoring . ]]
count = [[ $c.count ]]
shutdown_delay = "6s"
network {
mode = "bridge"
port "syslog-udp" {}
port "metrics" {}
}
# The main service is the vector source
# It will provide access to other services through the mesh (like loki)
service {
name = "[[ .instance ]]-vector-aggregator[[ .consul.suffix ]]"
port = 9000
[[ template "common/service_meta" $c ]]
[[ template "common/connect" $c ]]
tags = [
[[ template "common/traefik_tags" merge $c.vector $c ]]
]
}
[[- if $c.syslog_udp.enabled ]]
# The syslog UDP service can be used to ingest standard syslog logs from other
# devices, and can be exposed by Traefik for this
service {
name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]"
port = "syslog-udp"
tags = [
[[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
# UDP services can't be used through the mesh
"[[ $c.traefik.instance ]].consulcatalog.connect=false"
]
}
[[- end ]]
[[- if $c.fluentd.enabled ]]
# The fluentd service can be used to ingest fluentd logs
service {
# Fix: this service was mistakenly named "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]",
# colliding with the syslog UDP service registered just above
name = "[[ .instance ]]-fluentd[[ .consul.suffix ]]"
port = 24224
tags = [
[[ template "common/traefik_tags" merge $c.fluentd $c ]]
]
}
[[- end ]]
[[ template "common/task.metrics_proxy" $c ]]
task "vector" {
driver = "[[ $c.nomad.driver ]]"
config {
image = "[[ $c.image ]]"
readonly_rootfs = true
pids_limit = 200
args = [ "--config=/local/vector.yml" ]
}
[[ template "common/file_env" $c ]]
template {
data = <<_EOT
[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]]
_EOT
destination = "local/vector.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
change_mode = "signal"
change_signal = "SIGHUP"
}
[[ template "common/resources" $c ]]
}
}

View File

@ -0,0 +1,85 @@
# Vector aggregator configuration (gomplate template; {{{ }}} / {{ }} parts are
# rendered later by consul-template / loki, [[ ]] parts by gomplate)
data_dir: /local
expire_metrics_secs: 600
sources:
# Logs shipped from other vector instances over the native vector protocol
logs_vector:
type: vector
address: 127.0.0.1:9000
# Vector's own metrics, exposed by the prometheus_exporter sink below
vector_metrics:
type: internal_metrics
[[- if .syslog_udp.enabled ]]
# Optional syslog over UDP ingestion (port allocated by Nomad)
logs_syslog_udp:
type: syslog
mode: udp
address: 0.0.0.0:{{{ env "NOMAD_ALLOC_PORT_syslog_udp" }}}
[[- end ]]
[[- if .fluentd.enabled ]]
# Optional fluentd ingestion
logs_fluentd:
type: fluent
address: 127.0.0.1:24224
[[- end ]]
transforms:
# Route each log event to a per-application parser based on its service / source_type
split-by-app:
type: route
inputs: [ "logs_*" ]
route:
traefik: '.service == "traefik"'
postgres: '.service == "postgres"'
syslog: '.source_type == "syslog"'
# Extract HTTP fields from traefik access logs and expose some as loki labels
parse-traefik:
type: remap
inputs: ["split-by-app.traefik"]
source: |
.http = parse_grok!(.message, "%{HTTPD_COMMONLOG}")
.loki_labels.http_method = .http.verb
.loki_labels.http_status = .http.response
.loki_labels.user = .http.auth
# Label postgres events with their patroni role, derived from Nomad service tags
parse-postgres:
type: remap
inputs: ["split-by-app.postgres"]
source: |
if includes(array!(.nomad.tags), "master"){
.loki_labels.pg_role = "master"
} else if includes(array!(.nomad.tags), "replica"){
.loki_labels.pg_role = "replica"
}
parse-syslog:
type: remap
inputs: ["split-by-app.syslog"]
source: |
# PfSense sends /usr/sbin/cron as the appname, instead of cron
if string!(.appname) == "/usr/sbin/cron" {
.appname = "cron"
}
.service = .appname
sinks:
# Ship everything (parsed or unmatched) to loki through the mesh upstream on 3100
loki:
type: loki
inputs: [ "split-by-app._unmatched", "parse-*" ]
endpoint: http://127.0.0.1:3100
encoding:
codec: text
labels:
job: "{{ .service }}"
host: "{{ .host }}"
_*: "{{ .loki_labels }}"
# Disk buffer so logs survive a restart or a temporarily unreachable loki
buffer:
type: disk
max_size: 268435488
remove_label_fields: true
# Expose vector internal metrics
prometheus:
type: prometheus_exporter
inputs: ["vector_metrics"]
address: "127.0.0.1:9001"

75
templates/loki/loki.yml Normal file
View File

@ -0,0 +1,75 @@
# Loki configuration (gomplate template) for a single-node, filesystem-backed deployment
auth_enabled: false
server:
# Listen on localhost only: external access goes through the Consul mesh sidecar
http_listen_address: 127.0.0.1
http_listen_port: 3100
grpc_listen_address: 127.0.0.1
grpc_listen_port: 9095
#log_level: debug
common:
path_prefix: /data
storage:
filesystem:
chunks_directory: /data/chunks
rules_directory: /data/rules
# Single instance, in-memory ring (no clustering)
replication_factor: 1
instance_addr: 127.0.0.1
ring:
kvstore:
store: inmemory
storage_config:
boltdb_shipper:
active_index_directory: /data/index
cache_location: /data/boltdb-cache
shared_store: filesystem
schema_config:
configs:
- from: '2020-10-24'
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
compactor:
working_directory: /data/compactor
shared_store: filesystem
compaction_interval: 1h
retention_enabled: true
deletion_mode: filter-and-delete
ingester:
chunk_idle_period: 1h
limits_config:
# Retention comes from the bundle variables (e.g. 720h)
retention_period: '[[ .retention ]]'
ingestion_rate_mb: 20
ingestion_burst_size_mb: 100
max_entries_limit_per_query: 20000
max_query_parallelism: 128
ruler:
# AlertManager is reached over mTLS, discovered through Consul
alertmanager_url: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
enable_alertmanager_discovery: true
alertmanager_client:
# Client cert/key bundle and CA are rendered by the Nomad job's template blocks
tls_cert_path: /secrets/loki.bundle.pem
tls_key_path: /secrets/loki.bundle.pem
tls_ca_path: /secrets/monitoring.ca.pem
tls_server_name: alertmanager.monitoring
enable_alertmanager_v2: true
enable_api: true
rule_path: /tmp/loki-rules
storage:
type: local
local:
directory: /local/rules
ring:
kvstore:
store: inmemory
analytics:
reporting_enabled: false

View File

@ -19,6 +19,7 @@ alerting:
consul_sd_configs:
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
scheme: http
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
datacenter: [[ .consul.datacenter ]]
relabel_configs:
# Only keep alertmanagers

View File

@ -18,7 +18,7 @@ monitoring:
env: {}
resources:
cpu: 10
memory: 30
memory: 25
probes: []
blackbox:
@ -37,20 +37,20 @@ monitoring:
env: {}
resources:
cpu: 20
memory: 64
memory: 32
vault:
policies:
- '[[ .instance ]]-consul-exporter'
- '[[ .instance ]]-consul-exporter[[ .consul.suffix ]]'
cluster:
image: nginxinc/nginx-unprivileged:alpine
env: {}
resources:
cpu: 10
memory: 18
memory: 15
vault:
policies:
- '[[ .instance ]]-cluster-exporter'
- '[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]'
- metrics
prometheus:
@ -65,17 +65,17 @@ monitoring:
resources:
cpu: 200
memory: 768
memory: 512
volumes:
data:
type: csi
source: '[[ .instance ]]-prometheus-data'
source: '[[ .instance ]]-prometheus-data[[ .consul.suffix ]]'
per_alloc: true
vault:
policies:
- '[[ .instance ]]-prometheus'
- '[[ .instance ]]-prometheus[[ .consul.suffix ]]'
jobs: {}
alert_rules: {}
@ -101,7 +101,8 @@ monitoring:
env: {}
resources:
cpu: 50
memory: 80
memory: 64
memory_max: 80
public_url: https://alerte.example.org
traefik:
enabled: true
@ -109,7 +110,7 @@ monitoring:
strip_prefix: false
volumes:
data:
source: '[[ .instance ]]-alertmanager-data'
source: '[[ .instance ]]-alertmanager-data[[ .consul.suffix ]]'
type: csi
per_alloc: true
prometheus:
@ -117,11 +118,69 @@ monitoring:
vault:
policies:
- metrics
- '[[ .instance ]]-alertmanager'
- '[[ .instance ]]-alertmanager[[ .consul.suffix ]]'
email:
from: alertmanager@[[ .consul.domain ]]
custom_config: ""
custom_config: {}
loki:
version: 2.9.5
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
env: {}
resources:
cpu: 150
memory: 512
vault:
policies:
- '[[ .instance ]]-loki[[ .consul.suffix ]]'
public_url: https://loki.example.org
traefik:
router: loki
retention: 720h # 1 month
custom_config: {}
prometheus:
metrics_url: http://localhost:3100/metrics
volumes:
data:
type: csi
source: '[[ .instance ]]-loki-data[[ .consul.suffix ]]'
vector:
version: 0.36.1
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
aggregator:
count: 1
image: '[[ .monitoring.vector.image ]]'
env: {}
resources:
cpu: 100
memory: 192
consul:
connect:
upstreams:
- destination_name: '[[ .instance ]]-loki[[ .consul.suffix ]]'
local_bind_port: 3100
fluentd:
enabled: false
traefik:
router: fluentd
entrypoints:
- fluentd
syslog_udp:
enabled: false
traefik:
router: syslog-udp
entrypoints:
- syslog
vector:
enabled: true
public_url: https://vector.example.org
traefik:
enabled: false
prometheus:
metrics_url: http://127.0.0.1:9001/metrics
prometheus:
enabled: true

View File

@ -0,0 +1,8 @@
[[- $c := merge .monitoring.loki .monitoring . ]]
# Allow issuing client certificates from the loki PKI role
# (used by loki's ruler to authenticate to AlertManager over mTLS)
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki" {
capabilities = ["update"]
}
# Read-only access to loki's secrets in the KV store
path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/loki" {
capabilities = ["read"]
}