More work, including loki and vector aggregator
This commit is contained in:
parent
65441a4a9e
commit
7ed40afe9c
9
TODO.md
9
TODO.md
|
@ -6,7 +6,7 @@
|
|||
- ~~blackbox-exporter~~
|
||||
- ~~consul-exporter~~
|
||||
- vector
|
||||
- loki
|
||||
- ~~loki~~
|
||||
- grafana
|
||||
- nomad-vector-logger
|
||||
|
||||
|
@ -27,17 +27,18 @@
|
|||
- consul defaults & intentions
|
||||
- ~~prometheus~~
|
||||
- ~~alertmanager~~
|
||||
- loki
|
||||
- ~~loki~~
|
||||
|
||||
- tasks
|
||||
- ~~alertmanager~~
|
||||
- vector-aggregator
|
||||
- vector-agent (dans job agent)
|
||||
- loki (modulariser ou laisser en monolithique ?)
|
||||
- ~~loki (modulariser ou laisser en monolithique ?)~~
|
||||
- grafana
|
||||
- ~~cluster-metrics (job exporters)~~
|
||||
|
||||
- questions
|
||||
- questions / various
|
||||
- prom rules: keep or move to a -conf bundle ?
|
||||
- ~~config alertes am (recipient + routing)~~
|
||||
- ~~http and tcp probes, as exporters are now in a dedicated job~~
|
||||
- alertmanager & rules for loki
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,55 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
|
||||
Sources = [
|
||||
{
|
||||
Name = "[[ (merge .monitoring.loki .monitoring .).traefik.instance ]]"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "[[ .instance ]]-grafana[[ .consul.suffix ]]"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/loki/api/v1"
|
||||
Methods = ["GET", "HEAD", "POST"]
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/ready"
|
||||
Methods = ["GET"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
[[- range $idx, $service := coll.Slice "vector-aggregator" "vector-agent" ]]
|
||||
{
|
||||
Name = "[[ $.instance ]]-[[ $service ]][[ $.consul.suffix ]]"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/loki/api/v1/push"
|
||||
Methods = ["POST"]
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/ready"
|
||||
Methods = ["GET"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
[[- end ]]
|
||||
]
|
|
@ -1,12 +1,12 @@
|
|||
- ~~Split exporters dans un job dédié (pour pouvoir tourner sur node_pool spécifique)~~
|
||||
- Créer monitoring-agent type system avec vector + node-exporter
|
||||
- images
|
||||
- prometheus
|
||||
- ping-exporter
|
||||
- blackbox-exporter
|
||||
- consul-exporter
|
||||
- ~~prometheus~~
|
||||
- ~~ping-exporter~~
|
||||
- ~~blackbox-exporter~~
|
||||
- ~~consul-exporter~~
|
||||
- vector
|
||||
- loki
|
||||
- ~~loki~~
|
||||
- grafana
|
||||
- nomad-vector-logger
|
||||
|
||||
|
@ -16,28 +16,29 @@
|
|||
- ~~monitoring -> am~~
|
||||
|
||||
- vault pol
|
||||
- prometheus
|
||||
- ~~prometheus~~
|
||||
- ~~issue prom on monitoring~~
|
||||
- ~~issue prom on consul~~
|
||||
- consul-exporter
|
||||
- issue consul-exporter on consul
|
||||
- alertmanager
|
||||
- ~~consul-exporter~~
|
||||
- ~~issue consul-exporter on consul~~
|
||||
- ~~alertmanager~~
|
||||
- ~~issue alertmanager on monitoring~~
|
||||
|
||||
- consul defaults & intentions
|
||||
- ~~prometheus~~
|
||||
- ~~alertmanager~~
|
||||
- loki
|
||||
- ~~loki~~
|
||||
|
||||
- tasks
|
||||
- ~~alertmanager~~
|
||||
- vector-aggregator
|
||||
- vector-agent (dans job agent)
|
||||
- loki (modulariser ou laisser en monolithique ?)
|
||||
- ~~loki (modulariser ou laisser en monolithique ?)~~
|
||||
- grafana
|
||||
- cluster-metrics (job exporters)
|
||||
- ~~cluster-metrics (job exporters)~~
|
||||
|
||||
- questions
|
||||
- questions / various
|
||||
- prom rules: keep or move to a -conf bundle ?
|
||||
- ~~config alertes am (recipient + routing)~~
|
||||
- http and tcp probes, as exporters are now in a dedicated job
|
||||
- ~~http and tcp probes, as exporters are now in a dedicated job~~
|
||||
- alertmanager & rules for loki
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Kind = "service-defaults"
|
||||
Name = "monitoring-loki"
|
||||
Protocol = "http"
|
|
@ -0,0 +1,72 @@
|
|||
Kind = "service-intentions"
|
||||
Name = "monitoring-loki"
|
||||
Sources = [
|
||||
{
|
||||
Name = "traefik"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "monitoring-grafana"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/loki/api/v1"
|
||||
Methods = ["GET", "HEAD", "POST"]
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathPrefix = "/ready"
|
||||
Methods = ["GET"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "monitoring-vector-aggregator"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/loki/api/v1/push"
|
||||
Methods = ["POST"]
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/ready"
|
||||
Methods = ["GET"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
Name = "monitoring-vector-agent"
|
||||
Permissions = [
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/loki/api/v1/push"
|
||||
Methods = ["POST"]
|
||||
}
|
||||
},
|
||||
{
|
||||
Action = "allow"
|
||||
HTTP {
|
||||
PathExact = "/ready"
|
||||
Methods = ["GET"]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
]
|
|
@ -0,0 +1,38 @@
|
|||
FROM danielberteaud/alpine:24.3-1 AS builder
|
||||
|
||||
ARG LOKI_VERSION=2.9.5
|
||||
|
||||
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp
|
||||
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add unzip &&\
|
||||
cd /tmp &&\
|
||||
grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\
|
||||
unzip loki-linux-amd64.zip &&\
|
||||
mkdir /opt/loki &&\
|
||||
mv loki-linux-amd64 /opt/loki/loki
|
||||
|
||||
FROM danielberteaud/alpine:24.3-1
|
||||
MAINTAINER Daniel Berteaud <dbd@ehtrace.com>
|
||||
|
||||
ENV PATH=/opt/loki:$PATH
|
||||
COPY --from=builder /opt/loki /opt/loki
|
||||
RUN set -eux &&\
|
||||
addgroup -g 3100 loki &&\
|
||||
adduser \
|
||||
--system \
|
||||
--disabled-password \
|
||||
--uid 3100 \
|
||||
--ingroup loki \
|
||||
--home /opt/loki \
|
||||
--no-create-home \
|
||||
--shell /sbin/nologin \
|
||||
loki &&\
|
||||
mkdir /data &&\
|
||||
chown loki:loki /data &&\
|
||||
chmod 700 data
|
||||
|
||||
WORKDIR /opt/loki
|
||||
USER loki
|
||||
EXPOSE 3100
|
||||
CMD ["loki"]
|
|
@ -0,0 +1 @@
|
|||
FROM timberio/vector:0.36.1-alpine
|
|
@ -118,6 +118,19 @@ vault write pki/monitoring/roles/monitoring-prometheus \
|
|||
max_ttl=100h \
|
||||
ou="Monitoring"
|
||||
|
||||
# Create a role for loki (which will only be a client, for AlertManager)
|
||||
vault write pki/monitoring/roles/monitoring-loki \
|
||||
allowed_domains="monitoring" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="Monitoring"
|
||||
|
||||
# Create a role for metrics exporters (server only)
|
||||
vault write pki/monitoring/roles/metrics \
|
||||
allowed_domains="monitoring" \
|
||||
|
@ -151,6 +164,6 @@ vault write pki/consul/roles/monitoring-cluster-exporter \
|
|||
allow_subdomains=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
server_flags=false \
|
||||
client_flags=true \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
|
|
@ -114,7 +114,7 @@ _EOT
|
|||
|
||||
resources {
|
||||
cpu = 20
|
||||
memory = 64
|
||||
memory = 32
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -411,7 +411,7 @@ _EOT
|
|||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 18
|
||||
memory = 15
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
job "monitoring" {
|
||||
job "monitoring-services" {
|
||||
|
||||
|
||||
datacenters = ["dc1"]
|
||||
|
@ -225,6 +225,7 @@ alerting:
|
|||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
token: {{ with secret "consul/creds/monitoring-prometheus" }}{{ .Data.token }}{{ end }}
|
||||
datacenter: dc1
|
||||
relabel_configs:
|
||||
# Only keep alertmanagers
|
||||
|
@ -919,7 +920,7 @@ _EOT
|
|||
|
||||
resources {
|
||||
cpu = 200
|
||||
memory = 768
|
||||
memory = 512
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -927,7 +928,8 @@ _EOT
|
|||
|
||||
group "alerts" {
|
||||
|
||||
count = 1
|
||||
shutdown_delay = "6s"
|
||||
count = 1
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
|
@ -1335,8 +1337,574 @@ _EOT
|
|||
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 80
|
||||
cpu = 50
|
||||
memory = 64
|
||||
memory_max = 80
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
group "logs" {
|
||||
|
||||
shutdown_delay = "6s"
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
|
||||
volume "data" {
|
||||
source = "monitoring-loki-data"
|
||||
type = "csi"
|
||||
access_mode = "single-node-writer"
|
||||
attachment_mode = "file-system"
|
||||
}
|
||||
|
||||
|
||||
service {
|
||||
name = "monitoring-loki"
|
||||
port = 3100
|
||||
meta {
|
||||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
}
|
||||
|
||||
connect {
|
||||
sidecar_service {
|
||||
}
|
||||
sidecar_task {
|
||||
config {
|
||||
args = [
|
||||
"-c",
|
||||
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
||||
"-l",
|
||||
"${meta.connect.log_level}",
|
||||
"--concurrency",
|
||||
"${meta.connect.proxy_concurrency}",
|
||||
"--disable-hot-restart"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 64
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
check {
|
||||
name = "ready"
|
||||
type = "http"
|
||||
path = "/ready"
|
||||
expose = true
|
||||
interval = "20s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 6
|
||||
grace = "5m"
|
||||
}
|
||||
}
|
||||
|
||||
tags = [
|
||||
|
||||
"traefik.enable=true",
|
||||
"traefik.http.routers.monitoring-loki.entrypoints=https",
|
||||
"traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)",
|
||||
"traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';",
|
||||
"traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki",
|
||||
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
|
||||
task "metrics-proxy" {
|
||||
driver = "docker"
|
||||
user = 8995
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
force_pull = true
|
||||
volumes = [
|
||||
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
|
||||
]
|
||||
pids_limit = 100
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://localhost:3100/metrics;
|
||||
}
|
||||
}
|
||||
_EOT
|
||||
destination = "local/default.conf"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
memory_max = 20
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
task "loki" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "danielberteaud/loki:2.9.5-1"
|
||||
command = "loki"
|
||||
args = ["--config.file=/local/loki.yml"]
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["monitoring-loki"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
data = <<_EOT
|
||||
LANG=fr_FR.utf8
|
||||
TZ=Europe/Paris
|
||||
_EOT
|
||||
destination = "secrets/.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
auth_enabled: false
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /data
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /data/chunks
|
||||
rules_directory: /data/rules
|
||||
compactor:
|
||||
compaction_interval: 1h
|
||||
deletion_mode: filter-and-delete
|
||||
retention_enabled: true
|
||||
shared_store: filesystem
|
||||
working_directory: /data/compactor
|
||||
ingester:
|
||||
chunk_idle_period: 1h
|
||||
limits_config:
|
||||
ingestion_burst_size_mb: 100
|
||||
ingestion_rate_mb: 20
|
||||
max_entries_limit_per_query: 20000
|
||||
max_query_parallelism: 128
|
||||
retention_period: 720h
|
||||
ruler:
|
||||
alertmanager_client:
|
||||
tls_ca_path: /secrets/monitoring.ca.pem
|
||||
tls_cert_path: /secrets/loki.bundle.pem
|
||||
tls_key_path: /secrets/loki.bundle.pem
|
||||
tls_server_name: alertmanager.monitoring
|
||||
alertmanager_url: monitoring-alertmanager-tls
|
||||
enable_alertmanager_discovery: true
|
||||
enable_alertmanager_v2: true
|
||||
enable_api: true
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
rule_path: /tmp/loki-rules
|
||||
storage:
|
||||
local:
|
||||
directory: /local/rules
|
||||
type: local
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2020-10-24"
|
||||
index:
|
||||
period: 24h
|
||||
prefix: index_
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
store: boltdb-shipper
|
||||
server:
|
||||
grpc_listen_address: 127.0.0.1
|
||||
grpc_listen_port: 9095
|
||||
http_listen_address: 127.0.0.1
|
||||
http_listen_port: 3100
|
||||
storage_config:
|
||||
boltdb_shipper:
|
||||
active_index_directory: /data/index
|
||||
cache_location: /data/boltdb-cache
|
||||
shared_store: filesystem
|
||||
|
||||
_EOT
|
||||
destination = "local/loki.yml"
|
||||
}
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/monitoring-loki"
|
||||
(printf "common_name=loki-%s.monitoring" (env "NOMAD_ALLOC_INDEX"))
|
||||
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end -}}
|
||||
_EOT
|
||||
destination = "secrets/loki.bundle.pem"
|
||||
uid = 100000
|
||||
gid = 103100
|
||||
perms = "0440"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The monitoring CA chain, to validate AlertManager cert
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 150
|
||||
memory = 512
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
# The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.)
|
||||
# And with a loki sink. The goal is to be able to collect logs from various sources
|
||||
group "aggregator" {
|
||||
|
||||
count = 1
|
||||
shutdown_delay = "6s"
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "syslog-udp" {}
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
# The main service is the vector source
|
||||
# It will provide access to other services through the mesh (like loki)
|
||||
service {
|
||||
name = "monitoring-vector-aggregator"
|
||||
port = 9000
|
||||
meta {
|
||||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
}
|
||||
|
||||
connect {
|
||||
sidecar_service {
|
||||
proxy {
|
||||
upstreams {
|
||||
destination_name = "monitoring-loki"
|
||||
local_bind_port = 3100
|
||||
# Work arround, see https://github.com/hashicorp/nomad/issues/18538
|
||||
destination_type = "service"
|
||||
}
|
||||
}
|
||||
}
|
||||
sidecar_task {
|
||||
config {
|
||||
args = [
|
||||
"-c",
|
||||
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
||||
"-l",
|
||||
"${meta.connect.log_level}",
|
||||
"--concurrency",
|
||||
"${meta.connect.proxy_concurrency}",
|
||||
"--disable-hot-restart"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 64
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
tags = [
|
||||
|
||||
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
|
||||
task "metrics-proxy" {
|
||||
driver = "docker"
|
||||
user = 8995
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
force_pull = true
|
||||
volumes = [
|
||||
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
|
||||
]
|
||||
pids_limit = 100
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9001/metrics;
|
||||
}
|
||||
}
|
||||
_EOT
|
||||
destination = "local/default.conf"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
memory_max = 20
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
task "vector" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "danielberteaud/vector:0.36.1-1"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
args = ["--config=/local/vector.yml"]
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
data = <<_EOT
|
||||
LANG=fr_FR.utf8
|
||||
TZ=Europe/Paris
|
||||
_EOT
|
||||
destination = "secrets/.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
data_dir: /local
|
||||
expire_metrics_secs: 600
|
||||
|
||||
sources:
|
||||
|
||||
logs_vector:
|
||||
type: vector
|
||||
address: 127.0.0.1:9000
|
||||
|
||||
vector_metrics:
|
||||
type: internal_metrics
|
||||
|
||||
transforms:
|
||||
split-by-app:
|
||||
type: route
|
||||
inputs: [ "logs_*" ]
|
||||
route:
|
||||
traefik: '.service == "traefik"'
|
||||
postgres: '.service == "postgres"'
|
||||
syslog: '.source_type == "syslog"'
|
||||
|
||||
parse-traefik:
|
||||
type: remap
|
||||
inputs: ["split-by-app.traefik"]
|
||||
source: |
|
||||
.http = parse_grok!(.message, "%%{HTTPD_COMMONLOG}")
|
||||
.loki_labels.http_method = .http.verb
|
||||
.loki_labels.http_status = .http.response
|
||||
.loki_labels.user = .http.auth
|
||||
|
||||
parse-postgres:
|
||||
type: remap
|
||||
inputs: ["split-by-app.postgres"]
|
||||
source: |
|
||||
if includes(array!(.nomad.tags), "master"){
|
||||
.loki_labels.pg_role = "master"
|
||||
} else if includes(array!(.nomad.tags), "replica"){
|
||||
.loki_labels.pg_role = "replica"
|
||||
}
|
||||
|
||||
parse-syslog:
|
||||
type: remap
|
||||
inputs: ["split-by-app.syslog"]
|
||||
source: |
|
||||
# PfSense sends /usr/sbin/cron as the appname, instead of cron
|
||||
if string!(.appname) == "/usr/sbin/cron" {
|
||||
.appname = "cron"
|
||||
}
|
||||
.service = .appname
|
||||
|
||||
sinks:
|
||||
|
||||
loki:
|
||||
type: loki
|
||||
inputs: [ "split-by-app._unmatched", "parse-*" ]
|
||||
endpoint: http://127.0.0.1:3100
|
||||
encoding:
|
||||
codec: text
|
||||
labels:
|
||||
job: "{{ .service }}"
|
||||
host: "{{ .host }}"
|
||||
_*: "{{ .loki_labels }}"
|
||||
buffer:
|
||||
type: disk
|
||||
max_size: 268435488
|
||||
remove_label_fields: true
|
||||
|
||||
# Expose vector internal metrics
|
||||
prometheus:
|
||||
type: prometheus_exporter
|
||||
inputs: ["vector_metrics"]
|
||||
address: "127.0.0.1:9001"
|
||||
|
||||
_EOT
|
||||
destination = "local/vector.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 100
|
||||
memory = 192
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
|
||||
path "pki/monitoring/issue/monitoring-loki" {
|
||||
capabilities = ["update"]
|
||||
}
|
||||
|
||||
path "kv/service/monitoring/loki" {
|
||||
capabilities = ["read"]
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder
|
||||
|
||||
ARG LOKI_VERSION=[[ .monitoring.loki.version ]]
|
||||
|
||||
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip /tmp
|
||||
ADD https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/SHA256SUMS /tmp
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add unzip &&\
|
||||
cd /tmp &&\
|
||||
grep "loki-linux-amd64.zip" SHA256SUMS | sha256sum -c &&\
|
||||
unzip loki-linux-amd64.zip &&\
|
||||
mkdir /opt/loki &&\
|
||||
mv loki-linux-amd64 /opt/loki/loki
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
ENV PATH=/opt/loki:$PATH
|
||||
COPY --from=builder /opt/loki /opt/loki
|
||||
RUN set -eux &&\
|
||||
addgroup -g 3100 loki &&\
|
||||
adduser \
|
||||
--system \
|
||||
--disabled-password \
|
||||
--uid 3100 \
|
||||
--ingroup loki \
|
||||
--home /opt/loki \
|
||||
--no-create-home \
|
||||
--shell /sbin/nologin \
|
||||
loki &&\
|
||||
mkdir /data &&\
|
||||
chown loki:loki /data &&\
|
||||
chmod 700 data
|
||||
|
||||
WORKDIR /opt/loki
|
||||
USER loki
|
||||
EXPOSE 3100
|
||||
CMD ["loki"]
|
|
@ -0,0 +1 @@
|
|||
FROM timberio/vector:[[ .monitoring.vector.version ]]-alpine
|
17
init/pki
17
init/pki
|
@ -31,6 +31,19 @@ vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-prometheus \
|
|||
max_ttl=100h \
|
||||
ou="[[ $c.vault.pki.ou ]]"
|
||||
|
||||
# Create a role for loki (which will only be a client, for AlertManager)
|
||||
vault write [[ $c.vault.pki.path ]]/roles/[[ .instance ]]-loki \
|
||||
allowed_domains="[[ .instance ]]" \
|
||||
allow_bare_domains=false \
|
||||
allow_subdomains=true \
|
||||
allow_localhost=false \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=100h \
|
||||
ou="[[ $c.vault.pki.ou ]]"
|
||||
|
||||
# Create a role for metrics exporters (server only)
|
||||
vault write [[ $c.vault.pki.path ]]/roles/metrics \
|
||||
allowed_domains="[[ .instance ]]" \
|
||||
|
@ -64,6 +77,6 @@ vault write pki/consul/roles/[[ .instance ]]-cluster-exporter \
|
|||
allow_subdomains=true \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
server_flags=false \
|
||||
client_flags=true \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
job "[[ .instance ]]" {
|
||||
job "[[ .instance ]]-services" {
|
||||
|
||||
[[ template "common/job_start" . ]]
|
||||
|
||||
|
@ -163,6 +163,7 @@ _EOT
|
|||
|
||||
[[- $c := merge .monitoring.alertmanager .monitoring . ]]
|
||||
|
||||
shutdown_delay = "6s"
|
||||
count = [[ $c.count ]]
|
||||
|
||||
network {
|
||||
|
@ -370,6 +371,190 @@ _EOT
|
|||
destination = "/data"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
|
||||
group "logs" {
|
||||
|
||||
[[- $c := merge .monitoring.loki .monitoring . ]]
|
||||
|
||||
shutdown_delay = "6s"
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
[[ template "common/volumes" $c ]]
|
||||
|
||||
service {
|
||||
name = "[[ .instance ]]-loki[[ .consul.suffix ]]"
|
||||
port = 3100
|
||||
[[ template "common/service_meta" $c ]]
|
||||
[[ template "common/connect" $c ]]
|
||||
|
||||
check {
|
||||
name = "ready"
|
||||
type = "http"
|
||||
path = "/ready"
|
||||
expose = true
|
||||
interval = "20s"
|
||||
timeout = "8s"
|
||||
check_restart {
|
||||
limit = 6
|
||||
grace = "5m"
|
||||
}
|
||||
}
|
||||
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" $c ]]
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
task "loki" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
command = "loki"
|
||||
args = ["--config.file=/local/loki.yml"]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/file_env" $c ]]
|
||||
|
||||
template {
|
||||
data =<<_EOT
|
||||
[[- if isKind "map" $c.custom_config ]]
|
||||
[[ merge $c.custom_config (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
|
||||
[[- else if isKind "string" $c.custom_config ]]
|
||||
[[ merge ($c.custom_config | yaml) (tmpl.Exec "monitoring/loki/loki.yml" $c | yaml) | toYAML ]]
|
||||
[[- else ]]
|
||||
# Not using custom_config as it's invalid
|
||||
[[ template "monitoring/loki/loki.yml" $c ]]
|
||||
[[- end ]]
|
||||
_EOT
|
||||
destination = "local/loki.yml"
|
||||
}
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki"
|
||||
(printf "common_name=loki-%s.[[ .instance ]]" (env "NOMAD_ALLOC_INDEX"))
|
||||
(printf "ttl=%dh" (env "NOMAD_ALLOC_INDEX" | parseInt | multiply 24 | add 72)) -}}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end -}}
|
||||
_EOT
|
||||
destination = "secrets/loki.bundle.pem"
|
||||
uid = 100000
|
||||
gid = 103100
|
||||
perms = "0440"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
# The monitoring CA chain, to validate AlertManager cert
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
uid = 100000
|
||||
gid = 100000
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
|
||||
# The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.)
|
||||
# And with a loki sink. The goal is to be able to collect logs from various sources
|
||||
group "aggregator" {
|
||||
[[- $c := merge .monitoring.aggregator .monitoring . ]]
|
||||
|
||||
count = [[ $c.count ]]
|
||||
shutdown_delay = "6s"
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "syslog-udp" {}
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
# The main service is the vector source
|
||||
# It will provide access to other services through the mesh (like loki)
|
||||
service {
|
||||
name = "[[ .instance ]]-vector-aggregator[[ .consul.suffix ]]"
|
||||
port = 9000
|
||||
[[ template "common/service_meta" $c ]]
|
||||
[[ template "common/connect" $c ]]
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" merge $c.vector $c ]]
|
||||
]
|
||||
}
|
||||
|
||||
[[- if $c.syslog_udp.enabled ]]
|
||||
# The syslog UDP service can be used to ingest standard syslog logs from other
|
||||
# devices, and can be exposed by Traefik for this
|
||||
service {
|
||||
name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]"
|
||||
port = "syslog-udp"
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" merge $c.syslog_udp $c ]]
|
||||
# UDP services can't be used through the mesh
|
||||
"[[ $c.traefik.instance ]].consulcatalog.connect=false"
|
||||
]
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
[[- if $c.fluentd.enabled ]]
|
||||
# The fluentd service can be used to ingest fluentd logs
|
||||
service {
|
||||
name = "[[ .instance ]]-syslog-udp[[ .consul.suffix ]]"
|
||||
port = 24224
|
||||
tags = [
|
||||
[[ template "common/traefik_tags" merge $c.fluentd $c ]]
|
||||
]
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
task "vector" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 200
|
||||
args = [ "--config=/local/vector.yml" ]
|
||||
}
|
||||
|
||||
[[ template "common/file_env" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ tmpl.Exec "monitoring/aggregator/vector.yml" $c | replaceAll "%{" "%%{" | replaceAll "${" "$${" ]]
|
||||
_EOT
|
||||
destination = "local/vector.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
change_mode = "signal"
|
||||
change_signal = "SIGHUP"
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
data_dir: /local
# Expire internal metric series after 600s without updates
expire_metrics_secs: 600

sources:

  # Logs shipped by the vector agents (local listener, reached over the mesh)
  logs_vector:
    type: vector
    address: 127.0.0.1:9000

  # Vector's own telemetry, exposed by the prometheus_exporter sink
  vector_metrics:
    type: internal_metrics

[[- if .syslog_udp.enabled ]]
  # Plain syslog over UDP, on the dynamic port allocated by nomad
  logs_syslog_udp:
    type: syslog
    mode: udp
    address: 0.0.0.0:{{{ env "NOMAD_ALLOC_PORT_syslog_udp" }}}
[[- end ]]

[[- if .fluentd.enabled ]]
  # Fluentd forward-protocol ingestion (local listener)
  logs_fluentd:
    type: fluent
    address: 127.0.0.1:24224
[[- end ]]
|
||||
|
||||
transforms:

  # Fan events out per application so app-specific parsing can be applied.
  # Events matching no route are emitted on split-by-app._unmatched
  split-by-app:
    type: route
    inputs: [ "logs_*" ]
    route:
      traefik: '.service == "traefik"'
      postgres: '.service == "postgres"'
      syslog: '.source_type == "syslog"'

  # Extract HTTP method / status / user from traefik access logs and stash
  # them under .loki_labels for the loki sink
  parse-traefik:
    type: remap
    inputs: ["split-by-app.traefik"]
    source: |
      .http = parse_grok!(.message, "%{HTTPD_COMMONLOG}")
      .loki_labels.http_method = .http.verb
      .loki_labels.http_status = .http.response
      .loki_labels.user = .http.auth

  # Derive a pg_role label (master / replica) from the nomad service tags
  parse-postgres:
    type: remap
    inputs: ["split-by-app.postgres"]
    source: |
      if includes(array!(.nomad.tags), "master"){
        .loki_labels.pg_role = "master"
      } else if includes(array!(.nomad.tags), "replica"){
        .loki_labels.pg_role = "replica"
      }

  # Normalize raw syslog events: use the (cleaned up) appname as the service
  parse-syslog:
    type: remap
    inputs: ["split-by-app.syslog"]
    source: |
      # PfSense sends /usr/sbin/cron as the appname, instead of cron
      if string!(.appname) == "/usr/sbin/cron" {
        .appname = "cron"
      }
      .service = .appname
|
||||
sinks:

  # Ship everything (parsed routes + unmatched events) to loki, reached on the
  # local connect upstream port
  loki:
    type: loki
    inputs: [ "split-by-app._unmatched", "parse-*" ]
    endpoint: http://127.0.0.1:3100
    encoding:
      codec: text
    labels:
      job: "{{ .service }}"
      host: "{{ .host }}"
      # Wildcard label: expands each key of the .loki_labels map into its own
      # label (prefixed with _)
      _*: "{{ .loki_labels }}"
    # Buffer on disk (~256MiB) so pending logs survive restarts/outages
    buffer:
      type: disk
      max_size: 268435488
    remove_label_fields: true

  # Expose vector internal metrics
  prometheus:
    type: prometheus_exporter
    inputs: ["vector_metrics"]
    address: "127.0.0.1:9001"
|
|
@ -0,0 +1,75 @@
|
|||
# Monolithic single-binary loki configuration
auth_enabled: false

server:
  # Bind on loopback only
  http_listen_address: 127.0.0.1
  http_listen_port: 3100
  grpc_listen_address: 127.0.0.1
  grpc_listen_port: 9095
  #log_level: debug

common:
  path_prefix: /data
  storage:
    filesystem:
      chunks_directory: /data/chunks
      rules_directory: /data/rules
  # Single instance: no replication, in-memory ring
  replication_factor: 1
  instance_addr: 127.0.0.1
  ring:
    kvstore:
      store: inmemory

storage_config:
  boltdb_shipper:
    active_index_directory: /data/index
    cache_location: /data/boltdb-cache
    shared_store: filesystem

schema_config:
  configs:
    - from: '2020-10-24'
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

compactor:
  working_directory: /data/compactor
  shared_store: filesystem
  compaction_interval: 1h
  # Enforce the retention_period set in limits_config below
  retention_enabled: true
  # NOTE(review): recent Loki releases moved deletion_mode from the compactor
  # block to limits_config — confirm this key is still accepted by the pinned
  # loki version
  deletion_mode: filter-and-delete

ingester:
  chunk_idle_period: 1h

limits_config:
  # Rendered from the bundle's retention setting (e.g. 720h)
  retention_period: '[[ .retention ]]'
  ingestion_rate_mb: 20
  ingestion_burst_size_mb: 100
  max_entries_limit_per_query: 20000
  max_query_parallelism: 128

ruler:
  # Alertmanager is reached by service name, with discovery enabled and mTLS
  # client certs from /secrets
  alertmanager_url: [[ .instance ]]-alertmanager-tls[[ .consul.suffix ]]
  enable_alertmanager_discovery: true
  alertmanager_client:
    # Cert and key are shipped in a single bundle file
    tls_cert_path: /secrets/loki.bundle.pem
    tls_key_path: /secrets/loki.bundle.pem
    tls_ca_path: /secrets/monitoring.ca.pem
    tls_server_name: alertmanager.monitoring
  enable_alertmanager_v2: true
  enable_api: true
  # rule_path must be writable: loki copies rules there for evaluation
  rule_path: /tmp/loki-rules
  storage:
    type: local
    local:
      directory: /local/rules
  ring:
    kvstore:
      store: inmemory

analytics:
  # Do not phone home
  reporting_enabled: false
|
|
@ -19,6 +19,7 @@ alerting:
|
|||
consul_sd_configs:
|
||||
- server: {{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500
|
||||
scheme: http
|
||||
token: {{ with secret "[[ .vault.root ]]consul/creds/[[ .instance ]]-prometheus" }}{{ .Data.token }}{{ end }}
|
||||
datacenter: [[ .consul.datacenter ]]
|
||||
relabel_configs:
|
||||
# Only keep alertmanagers
|
||||
|
|
|
@ -18,7 +18,7 @@ monitoring:
|
|||
env: {}
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 30
|
||||
memory: 25
|
||||
probes: []
|
||||
|
||||
blackbox:
|
||||
|
@ -37,20 +37,20 @@ monitoring:
|
|||
env: {}
|
||||
resources:
|
||||
cpu: 20
|
||||
memory: 64
|
||||
memory: 32
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-consul-exporter'
|
||||
- '[[ .instance ]]-consul-exporter[[ .consul.suffix ]]'
|
||||
|
||||
cluster:
|
||||
image: nginxinc/nginx-unprivileged:alpine
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 10
|
||||
memory: 18
|
||||
memory: 15
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-cluster-exporter'
|
||||
- '[[ .instance ]]-cluster-exporter[[ .consul.suffix ]]'
|
||||
- metrics
|
||||
|
||||
prometheus:
|
||||
|
@ -65,17 +65,17 @@ monitoring:
|
|||
|
||||
resources:
|
||||
cpu: 200
|
||||
memory: 768
|
||||
memory: 512
|
||||
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: '[[ .instance ]]-prometheus-data'
|
||||
source: '[[ .instance ]]-prometheus-data[[ .consul.suffix ]]'
|
||||
per_alloc: true
|
||||
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-prometheus'
|
||||
- '[[ .instance ]]-prometheus[[ .consul.suffix ]]'
|
||||
|
||||
jobs: {}
|
||||
alert_rules: {}
|
||||
|
@ -101,7 +101,8 @@ monitoring:
|
|||
env: {}
|
||||
resources:
|
||||
cpu: 50
|
||||
memory: 80
|
||||
memory: 64
|
||||
memory_max: 80
|
||||
public_url: https://alerte.example.org
|
||||
traefik:
|
||||
enabled: true
|
||||
|
@ -109,7 +110,7 @@ monitoring:
|
|||
strip_prefix: false
|
||||
volumes:
|
||||
data:
|
||||
source: '[[ .instance ]]-alertmanager-data'
|
||||
source: '[[ .instance ]]-alertmanager-data[[ .consul.suffix ]]'
|
||||
type: csi
|
||||
per_alloc: true
|
||||
prometheus:
|
||||
|
@ -117,11 +118,69 @@ monitoring:
|
|||
vault:
|
||||
policies:
|
||||
- metrics
|
||||
- '[[ .instance ]]-alertmanager'
|
||||
- '[[ .instance ]]-alertmanager[[ .consul.suffix ]]'
|
||||
email:
|
||||
from: alertmanager@[[ .consul.domain ]]
|
||||
custom_config: ""
|
||||
|
||||
custom_config: {}
|
||||
|
||||
loki:
|
||||
version: 2.9.5
|
||||
image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 150
|
||||
memory: 512
|
||||
vault:
|
||||
policies:
|
||||
- '[[ .instance ]]-loki[[ .consul.suffix ]]'
|
||||
public_url: https://loki.example.org
|
||||
traefik:
|
||||
router: loki
|
||||
retention: 720h # 1 month
|
||||
custom_config: {}
|
||||
prometheus:
|
||||
metrics_url: http://localhost:3100/metrics
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
source: '[[ .instance ]]-loki-data[[ .consul.suffix ]]'
|
||||
|
||||
vector:
|
||||
version: 0.36.1
|
||||
image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'
|
||||
|
||||
aggregator:
|
||||
count: 1
|
||||
image: '[[ .monitoring.vector.image ]]'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 100
|
||||
memory: 192
|
||||
consul:
|
||||
connect:
|
||||
upstreams:
|
||||
- destination_name: '[[ .instance ]]-loki[[ .consul.suffix ]]'
|
||||
local_bind_port: 3100
|
||||
fluentd:
|
||||
enabled: false
|
||||
traefik:
|
||||
router: fluentd
|
||||
entrypoints:
|
||||
- fluentd
|
||||
syslog_udp:
|
||||
enabled: false
|
||||
traefik:
|
||||
router: syslog-udp
|
||||
entrypoints:
|
||||
- syslog
|
||||
vector:
|
||||
enabled: true
|
||||
public_url: https://vector.example.org
|
||||
traefik:
|
||||
enabled: false
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9001/metrics
|
||||
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
[[- $c := merge .monitoring.loki .monitoring . ]]
# Allow loki to issue its own certificate from the monitoring PKI
# NOTE(review): the role name here has no [[ .consul.suffix ]] while the
# service and policy names elsewhere include it — confirm it matches the
# actual pki role name
path "[[ $c.vault.pki.path ]]/issue/[[ .instance ]]-loki" {
  capabilities = ["update"]
}

# Read-only access to loki's own KV secrets
path "[[ $c.vault.root ]]kv/service/[[ .instance ]]/loki" {
  capabilities = ["read"]
}
|
Loading…
Reference in New Issue