Add vector-agent and nomad-vector-logger
This commit is contained in:
parent
210264b4aa
commit
defebffc50
|
@ -0,0 +1,216 @@
|
|||
job "[[ .instance ]]-agent" {
|
||||
|
||||
[[- $c := merge .monitoring.agent .monitoring . ]]
|
||||
[[ template "common/job_start" $c ]]
|
||||
type = "system"
|
||||
|
||||
group "logs-collector" {
|
||||
|
||||
[[ $c := merge $c.vector $c ]]
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
# Try harder to restart tasks if they fail
|
||||
restart {
|
||||
attempts = 20
|
||||
interval = "5m"
|
||||
mode = "delay"
|
||||
}
|
||||
|
||||
[[ template "common/volumes" $c ]]
|
||||
|
||||
service {
|
||||
name = "vector-agent[[ .consul.suffix ]]"
|
||||
[[ template "common/service_meta" $c ]]
|
||||
[[ template "common/connect" $c ]]
|
||||
}
|
||||
|
||||
task "nomad-vector-logger" {
|
||||
[[- $n := merge $c.nomad_vector_logger $c ]]
|
||||
|
||||
driver = "[[ $n.nomad.driver ]]"
|
||||
# Run as an arbitrary, fixed unprivileged UID instead of root
|
||||
user = 3987
|
||||
|
||||
config {
|
||||
image = "[[ $n.image ]]"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
# Nomad Vector Logger needs to run on the host's network namespace
|
||||
# so it can reach the Nomad Agent API on localhost:4646
|
||||
network_mode = "host"
|
||||
# Host network namespace requires disabling user namespace
|
||||
userns_mode = "host"
|
||||
command = "nomad-vector-logger"
|
||||
args = [
|
||||
"--config",
|
||||
"/local/nomad-vector-logger.toml"
|
||||
]
|
||||
}
|
||||
|
||||
# We want to run Nomad Vector Logger before vector agent
|
||||
lifecycle {
|
||||
hook = "prestart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $n ]]
|
||||
[[ template "common/file_env" $n ]]
|
||||
|
||||
# Env to access Nomad API
|
||||
template {
|
||||
data = <<_EOT
|
||||
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger[[ .consul.suffix ]]" }}{{ .Data.secret_id }}{{ end }}
|
||||
NOMAD_ADDR=https://localhost:4646
|
||||
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
|
||||
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
|
||||
NOMAD_CACERT=/local/nomad.ca.pem
|
||||
_EOT
|
||||
destination = "secrets/.nomad-vector-logger.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
# The main configuration file for nomad-vector-logger
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/agent/nomad-vector-logger.toml" $n ]]
|
||||
_EOT
|
||||
destination = "local/nomad-vector-logger.toml"
|
||||
}
|
||||
|
||||
# Disable the default nomad.toml template
|
||||
template {
|
||||
data = "# Disable the default toml template"
|
||||
destination = "local/template/nomad.toml"
|
||||
}
|
||||
|
||||
# The vector configuration template used to generate the vector conf
|
||||
template {
|
||||
data = <<_EOT
|
||||
[[ template "monitoring/agent/vector-template.yml" $n ]]
|
||||
_EOT
|
||||
destination = "local/template/nomad.yml"
|
||||
# {{ }} is used by the template itself, so use other delimiters to prevent consul-template from interpreting it
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
|
||||
# Get a client cert for the Nomad API
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger[[ .consul.suffix ]]"
|
||||
"common_name=nomad-vector-logger[[ .consul.suffix ]].nomad.[[ .consul.domain ]]"
|
||||
"ttl=72h" }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/nomad.bundle.pem"
|
||||
uid = 3987
|
||||
perms = "0400"
|
||||
}
|
||||
|
||||
# The CA chain to validate Nomad certificates
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/nomad.ca.pem"
|
||||
}
|
||||
|
||||
# Mount the "nomad" volume read-only on /nomad
|
||||
volume_mount {
|
||||
volume = "nomad"
|
||||
destination = "/nomad"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
[[ template "common/resources" $n ]]
|
||||
}
|
||||
|
||||
# Nomad Vector Logger can take a few seconds to generate the initial configuration file.
# This task ensures the file exists before vector is started (to prevent an error, as the
# transform_nomad_alloc_* sources won't have anything before the file exists)
task "wait-for-vector-conf" {

  driver = "[[ $c.nomad.driver ]]"

  config {
    image   = "busybox:latest"
    command = "sh"
    # Poll every 2 seconds until the generated file shows up, then exit
    args = [
      "-c",
      "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
    ]
  }

  # Plain prestart hook (not a sidecar): the task is expected to terminate
  lifecycle {
    hook = "prestart"
  }

  # The task will shut down once the config is available, so just
  # allocate very few resources
  resources {
    cpu    = 10
    memory = 10
  }
}
|
||||
|
||||
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
|
||||
task "vector" {
|
||||
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
leader = true
|
||||
|
||||
config {
|
||||
image = "[[ $c.image ]]"
|
||||
userns_mode = "host"
|
||||
args = [
|
||||
"--watch-config",
|
||||
"--config", "/local/vector.yml",
|
||||
"--config-dir", "/alloc/data/vector_conf"
|
||||
]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
|
||||
env {
|
||||
NODE_UNIQUE_NAME = "${node.unique.name}"
|
||||
}
|
||||
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
|
||||
template {
|
||||
data =<<_EOT
|
||||
[[ template "monitoring/agent/vector.yml" $c ]]
|
||||
_EOT
|
||||
destination = "local/vector.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
|
||||
wait {
|
||||
min = "5s"
|
||||
max = "30s"
|
||||
}
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "nomad"
|
||||
destination = "/nomad"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
[[ template "common/resources" $c ]]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,403 @@
|
|||
job "monitoring-agent" {
|
||||
|
||||
datacenters = ["dc1"]
|
||||
region = "global"
|
||||
node_pool = "all"
|
||||
|
||||
type = "system"
|
||||
|
||||
group "logs-collector" {
|
||||
|
||||
|
||||
|
||||
network {
|
||||
mode = "bridge"
|
||||
port "metrics" {}
|
||||
}
|
||||
|
||||
# Try harder to restart tasks if they fail
|
||||
restart {
|
||||
attempts = 20
|
||||
interval = "5m"
|
||||
mode = "delay"
|
||||
}
|
||||
|
||||
|
||||
volume "data" {
|
||||
source = "vector_data"
|
||||
type = "host"
|
||||
}
|
||||
volume "nomad" {
|
||||
source = "nomad_alloc"
|
||||
type = "host"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
|
||||
service {
|
||||
name = "vector-agent"
|
||||
meta {
|
||||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${node.unique.name}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
connect {
|
||||
sidecar_service {
|
||||
proxy {
|
||||
upstreams {
|
||||
destination_name = "loki"
|
||||
local_bind_port = 3100
|
||||
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
|
||||
destination_type = "service"
|
||||
}
|
||||
}
|
||||
}
|
||||
sidecar_task {
|
||||
config {
|
||||
args = [
|
||||
"-c",
|
||||
"${NOMAD_SECRETS_DIR}/envoy_bootstrap.json",
|
||||
"-l",
|
||||
"${meta.connect.log_level}",
|
||||
"--concurrency",
|
||||
"${meta.connect.proxy_concurrency}",
|
||||
"--disable-hot-restart"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 64
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
task "nomad-vector-logger" {
|
||||
|
||||
driver = "docker"
|
||||
# Run as an arbitrary, fixed unprivileged UID instead of root
|
||||
user = 3987
|
||||
|
||||
config {
|
||||
image = "danielberteaud/nomad-vector-logger:24.3-2"
|
||||
readonly_rootfs = true
|
||||
pids_limit = 50
|
||||
# Nomad Vector Logger needs to run on the host's network namespace
|
||||
# so it can reach the Nomad Agent API on localhost:4646
|
||||
network_mode = "host"
|
||||
# Host network namespace requires disabling user namespace
|
||||
userns_mode = "host"
|
||||
command = "nomad-vector-logger"
|
||||
args = [
|
||||
"--config",
|
||||
"/local/nomad-vector-logger.toml"
|
||||
]
|
||||
}
|
||||
|
||||
# We want to run Nomad Vector Logger before vector agent
|
||||
lifecycle {
|
||||
hook = "prestart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["nomad-vector-logger"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
data = <<_EOT
|
||||
LANG=fr_FR.utf8
|
||||
TZ=Europe/Paris
|
||||
_EOT
|
||||
destination = "secrets/.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
|
||||
# Env to access Nomad API
|
||||
template {
|
||||
data = <<_EOT
|
||||
NOMAD_TOKEN={{ with secret "nomad/creds/nomad-vector-logger" }}{{ .Data.secret_id }}{{ end }}
|
||||
NOMAD_ADDR=https://localhost:4646
|
||||
NOMAD_CLIENT_CERT=/secrets/nomad.bundle.pem
|
||||
NOMAD_CLIENT_KEY=/secrets/nomad.bundle.pem
|
||||
NOMAD_CACERT=/local/nomad.ca.pem
|
||||
_EOT
|
||||
destination = "secrets/.nomad-vector-logger.env"
|
||||
perms = 400
|
||||
env = true
|
||||
}
|
||||
|
||||
# The main configuration file for nomad-vector-logger
|
||||
template {
|
||||
data = <<_EOT
|
||||
[app]
|
||||
log_level = "info"
|
||||
env = "prod"
|
||||
refresh_interval = "10s"
|
||||
remove_alloc_interval = "30s"
|
||||
nomad_data_dir = "/nomad"
|
||||
vector_config_dir = "/alloc/data/vector_conf"
|
||||
extra_templates_dir = "/local/template/"
|
||||
custom_logs_dir = "alloc/custom"
|
||||
|
||||
_EOT
|
||||
destination = "local/nomad-vector-logger.toml"
|
||||
}
|
||||
|
||||
# Disable the default nomad.toml template
|
||||
template {
|
||||
data = "# Disable the default toml template"
|
||||
destination = "local/template/nomad.toml"
|
||||
}
|
||||
|
||||
# The vector configuration template used to generate the vector conf
|
||||
template {
|
||||
data = <<_EOT
|
||||
sources:
|
||||
{{- range $value := . }}
|
||||
|
||||
source_{{ $value.Key }}:
|
||||
type: file
|
||||
include: ["{{ $value.LogDir }}"]
|
||||
line_delimiter: "\n"
|
||||
read_from: beginning
|
||||
# Handle multi-line Java stacktraces
|
||||
multiline:
|
||||
start_pattern: "^[^\\s]"
|
||||
mode: continue_through
|
||||
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
|
||||
timeout_ms: 1000
|
||||
|
||||
{{- end }}
|
||||
|
||||
transforms:
|
||||
{{- range $value := . }}
|
||||
|
||||
transform_{{ $value.Key }}:
|
||||
type: remap
|
||||
inputs: ["source_{{ $value.Key }}"]
|
||||
source: |
|
||||
# Store Nomad metadata.
|
||||
.nomad.namespace = "{{ $value.Namespace }}"
|
||||
.nomad.node_name = "{{ $value.Node }}"
|
||||
.nomad.job_name = "{{ $value.Job }}"
|
||||
.nomad.group_name = "{{ $value.Group }}"
|
||||
.nomad.task_name = "{{ $value.Task }}"
|
||||
.nomad.alloc_id = "{{ $value.ID }}"
|
||||
.nomad.alloc_name = "{{ $value.Name }}"
|
||||
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
|
||||
{{- end }}
|
||||
|
||||
_EOT
|
||||
destination = "local/template/nomad.yml"
|
||||
# {{ }} is used by the template itself, so use other delimiters to prevent consul-template from interpreting it
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
|
||||
# Get a client cert for the Nomad API
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/nomad/issue/nomad-vector-logger"
|
||||
"common_name=nomad-vector-logger.nomad.consul"
|
||||
"ttl=72h" }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/nomad.bundle.pem"
|
||||
uid = 3987
|
||||
perms = "0400"
|
||||
}
|
||||
|
||||
# The CA chain to validate Nomad certificates
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/nomad/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/nomad.ca.pem"
|
||||
}
|
||||
|
||||
# Mount the "nomad" volume (host volume nomad_alloc) read-only on /nomad, which nomad_data_dir points to
|
||||
volume_mount {
|
||||
volume = "nomad"
|
||||
destination = "/nomad"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 20
|
||||
memory = 24
|
||||
memory_max = 50
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
# Nomad Vector Logger can take a few seconds to generate the initial configuration file.
# This task ensures the file exists before vector is started (to prevent an error, as the
# transform_nomad_alloc_* sources won't have anything before the file exists)
task "wait-for-vector-conf" {

  driver = "docker"

  config {
    image   = "busybox:latest"
    command = "sh"
    # Poll every 2 seconds until the generated file shows up, then exit
    args = [
      "-c",
      "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done"
    ]
  }

  # Plain prestart hook (not a sidecar): the task is expected to terminate
  lifecycle {
    hook = "prestart"
  }

  # The task will shut down once the config is available, so just
  # allocate very few resources
  resources {
    cpu    = 10
    memory = 10
  }
}
|
||||
|
||||
# The main vector task, which will read logs using the config file generated by Nomad Vector Logger
|
||||
task "vector" {
|
||||
|
||||
driver = "docker"
|
||||
leader = true
|
||||
|
||||
config {
|
||||
image = "danielberteaud/vector:0.36.1-1"
|
||||
userns_mode = "host"
|
||||
args = [
|
||||
"--watch-config",
|
||||
"--config", "/local/vector.yml",
|
||||
"--config-dir", "/alloc/data/vector_conf"
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
env {
|
||||
NODE_UNIQUE_NAME = "${node.unique.name}"
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
data_dir: /data
|
||||
# Don't keep metrics indefinitly if they are not updated anymore
|
||||
expire_metrics_secs: 60
|
||||
|
||||
sources:
|
||||
metrics-vector:
|
||||
type: internal_metrics
|
||||
|
||||
sinks:
|
||||
loki:
|
||||
type: loki
|
||||
inputs: ["transform_nomad_alloc_*"]
|
||||
endpoint: http://127.0.0.1:3100
|
||||
encoding:
|
||||
codec: text
|
||||
labels:
|
||||
namespace: "{{ .nomad.namespace }}"
|
||||
job: "{{ .nomad.job_name }}"
|
||||
group: "{{ .nomad.group_name }}"
|
||||
task: "{{ .nomad.task_name }}"
|
||||
host: "{{ .nomad.node_name }}"
|
||||
alloc: "{{ .nomad.alloc }}"
|
||||
buffer:
|
||||
type: disk
|
||||
max_size: 268435488
|
||||
remove_label_fields: true
|
||||
|
||||
prometheus:
|
||||
type: prometheus_exporter
|
||||
inputs: ["metrics-vector"]
|
||||
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
|
||||
tls:
|
||||
enabled: true
|
||||
crt_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
verify_certificate: true
|
||||
|
||||
|
||||
_EOT
|
||||
destination = "local/vector.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
|
||||
wait {
|
||||
min = "5s"
|
||||
max = "30s"
|
||||
}
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "nomad"
|
||||
destination = "/nomad"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "data"
|
||||
destination = "/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
|
||||
resources {
|
||||
cpu = 100
|
||||
memory = 192
|
||||
memory_max = 384
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
# Build stage: clone the feat/name branch of nomad-vector-logger and compile it
# with CGO disabled and stripped symbols (-s -w)
FROM golang:alpine AS builder

RUN set -eux &&\
    apk --no-cache add tar git ca-certificates &&\
    cd /tmp &&\
    git clone --depth=1 --branch=feat/name https://github.com/mr-karan/nomad-vector-logger.git &&\
    cd nomad-vector-logger &&\
    CGO_ENABLED=0 go build -ldflags="-s -w" -o /nomad-vector-logger

# Runtime stage: only the binary and the sample configuration are copied over
FROM danielberteaud/alpine:24.3-1
# MAINTAINER is deprecated, the maintainer is recorded as a label instead
LABEL maintainer="Daniel Berteaud <dbd@ehtrace.com>"

RUN set -euxo pipefail &&\
    mkdir -p /etc/nomad-vector-logger
COPY --from=builder --chown=root:root --chmod=755 /nomad-vector-logger /usr/local/bin/nomad-vector-logger
COPY --from=builder /tmp/nomad-vector-logger/config.sample.toml /etc/nomad-vector-logger/

WORKDIR /etc/nomad-vector-logger/
CMD ["nomad-vector-logger"]
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/sh

# Create the nomad-vector-logger role on the Vault "nomad" mount; the role is
# bound to the "nomad-vector-logger" policy, with a 720h TTL.
#
# Only -e and -u are enabled: "pipefail" is not supported by every /bin/sh
# (e.g. older dash releases abort on it) and this script runs no pipeline.
set -eu

vault write nomad/role/nomad-vector-logger \
  ttl=720h \
  max_ttl=720h \
  policies="nomad-vector-logger"
|
|
@ -167,3 +167,15 @@ vault write pki/consul/roles/cluster-exporter \
|
|||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
||||
# Nomad PKI role for nomad-vector-logger: client certificates only, limited to
# the single bare name nomad-vector-logger.nomad.consul (no subdomains,
# wildcards or IP SANs), valid for at most 168h
vault write pki/nomad/roles/nomad-vector-logger \
  ou="Nomad Vector Logger" \
  max_ttl=168h \
  allowed_domains='nomad-vector-logger.nomad.consul' \
  allow_bare_domains=true \
  allow_subdomains=false \
  allow_wildcard_certificates=false \
  allow_ip_sans=false \
  client_flag=true \
  server_flag=false
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
|
||||
|
||||
# Nomad ACL policy (presumably the "nomad-vector-logger" policy referenced by
# the Vault role; confirm against the file name). Read-only everywhere.

# Read-only access to the agent API
agent {
  policy = "read"
}

# Read-only access to node information
node {
  policy = "read"
}

# Jobs can be listed and read in every namespace
namespace "*" {
  capabilities = ["list-jobs", "read-job"]
}
|
|
@ -6,7 +6,7 @@ job "monitoring-services" {
|
|||
|
||||
|
||||
# Metrics is running prometheus and various exporters
|
||||
group "metrics" {
|
||||
group "metrics-server" {
|
||||
|
||||
shutdown_delay = "6s"
|
||||
count = 1
|
||||
|
@ -34,6 +34,7 @@ job "monitoring-services" {
|
|||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
connect {
|
||||
|
@ -109,15 +110,18 @@ job "monitoring-services" {
|
|||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
|
@ -125,6 +129,7 @@ _EOT
|
|||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
|
@ -475,68 +480,6 @@ _EOT
|
|||
|
||||
groups:
|
||||
|
||||
- name: ConsulExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ConsulServiceHealthcheckFailed
|
||||
# Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
|
||||
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
|
||||
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulMissingMasterNode
|
||||
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul missing master node (node {{ $labels.node }})
|
||||
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulAgentUnhealthy
|
||||
expr: 'consul_health_node_status{status="critical"} == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul agent unhealthy (node {{ $labels.node }})
|
||||
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceWarning
|
||||
expr: 'consul_health_service_status{status="warning"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceCritical
|
||||
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
_EOT
|
||||
destination = "local/rules/consul.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: JVM
|
||||
|
||||
rules:
|
||||
|
@ -880,6 +823,68 @@ _EOT
|
|||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
template {
|
||||
data = <<_EOT
|
||||
# vi: syntax=yaml
|
||||
|
||||
groups:
|
||||
|
||||
- name: ConsulExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ConsulServiceHealthcheckFailed
|
||||
# Note : don't check sidecar service health, as they can report a critical state when the main task is pending (eg, waiting for a volume to be available)
|
||||
expr: 'consul_catalog_service_node_healthy{service_name!~".*-sidecar-proxy"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul service healthcheck failed (service {{ $labels.service_name }})
|
||||
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulMissingMasterNode
|
||||
expr: 'consul_raft_leader != 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul missing master node (node {{ $labels.node }})
|
||||
description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulAgentUnhealthy
|
||||
expr: 'consul_health_node_status{status="critical"} == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul agent unhealthy (node {{ $labels.node }})
|
||||
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceWarning
|
||||
expr: 'consul_health_service_status{status="warning"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in warning state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulServiceCritical
|
||||
expr: 'consul_health_service_status{status="critical",service_name!~".*-sidecar-proxy"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state
|
||||
description: "Service {{ $labels.service_name }} on node {{ $labels.node }} is in critical state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
_EOT
|
||||
destination = "local/rules/consul.yml"
|
||||
left_delimiter = "{{{"
|
||||
right_delimiter = "}}}"
|
||||
}
|
||||
|
||||
# A client cert, to connect to the AlertManager API
|
||||
template {
|
||||
|
@ -976,6 +981,7 @@ _EOT
|
|||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
connect {
|
||||
|
@ -1051,15 +1057,18 @@ _EOT
|
|||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
|
@ -1067,6 +1076,7 @@ _EOT
|
|||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
|
@ -1345,7 +1355,7 @@ _EOT
|
|||
}
|
||||
}
|
||||
|
||||
group "logs" {
|
||||
group "logs-server" {
|
||||
|
||||
shutdown_delay = "6s"
|
||||
|
||||
|
@ -1370,6 +1380,7 @@ _EOT
|
|||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
connect {
|
||||
|
@ -1445,15 +1456,18 @@ _EOT
|
|||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
|
@ -1461,6 +1475,7 @@ _EOT
|
|||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
|
@ -1564,6 +1579,7 @@ limits_config:
|
|||
max_entries_limit_per_query: 20000
|
||||
max_query_parallelism: 128
|
||||
retention_period: 720h
|
||||
split_queries_by_interval: 0
|
||||
ruler:
|
||||
alertmanager_client:
|
||||
tls_ca_path: /secrets/monitoring.ca.pem
|
||||
|
@ -1652,7 +1668,7 @@ _EOT
|
|||
|
||||
# The aggregator group runs vector with different sources connectors (syslog, fluentd, vector etc.)
|
||||
# And with a loki sink. The goal is to be able to collect logs from various sources
|
||||
group "aggregator" {
|
||||
group "logs-aggregator" {
|
||||
|
||||
count = 1
|
||||
shutdown_delay = "6s"
|
||||
|
@ -1672,6 +1688,7 @@ _EOT
|
|||
metrics-port = "${NOMAD_HOST_PORT_metrics}"
|
||||
alloc = "${NOMAD_ALLOC_INDEX}"
|
||||
job = "${NOMAD_JOB_NAME}"
|
||||
namespace = "${NOMAD_NAMESPACE}"
|
||||
}
|
||||
|
||||
connect {
|
||||
|
@ -1712,88 +1729,6 @@ _EOT
|
|||
]
|
||||
}
|
||||
|
||||
|
||||
# The prometheus metrics proxy, adding mTLS to the metrics endpoint
|
||||
task "metrics-proxy" {
|
||||
driver = "docker"
|
||||
user = 8995
|
||||
|
||||
config {
|
||||
image = "nginxinc/nginx-unprivileged:alpine"
|
||||
force_pull = true
|
||||
volumes = [
|
||||
"local/default.conf:/etc/nginx/conf.d/default.conf:ro"
|
||||
]
|
||||
pids_limit = 100
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "poststart"
|
||||
sidecar = true
|
||||
}
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl;
|
||||
http2 on;
|
||||
|
||||
ssl_certificate /secrets/metrics.bundle.pem;
|
||||
ssl_certificate_key /secrets/metrics.bundle.pem;
|
||||
ssl_client_certificate /local/monitoring.ca.pem;
|
||||
ssl_verify_client on;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 1h;
|
||||
ssl_session_tickets off;
|
||||
gzip on;
|
||||
gzip_types
|
||||
text/plain;
|
||||
gzip_vary on;
|
||||
|
||||
server_tokens off;
|
||||
|
||||
if ($request_method !~ ^(GET|HEAD)$ ) {
|
||||
return 405;
|
||||
}
|
||||
location /metrics {
|
||||
proxy_pass http://127.0.0.1:9001/metrics;
|
||||
}
|
||||
}
|
||||
_EOT
|
||||
destination = "local/default.conf"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
memory_max = 20
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
task "vector" {
|
||||
driver = "docker"
|
||||
|
||||
|
@ -1805,6 +1740,14 @@ _EOT
|
|||
}
|
||||
|
||||
|
||||
vault {
|
||||
policies = ["metrics"]
|
||||
env = false
|
||||
disable_file = true
|
||||
change_mode = "noop"
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Use a template block instead of env {} so we can fetch values from vault
|
||||
template {
|
||||
|
@ -1817,6 +1760,25 @@ _EOT
|
|||
env = true
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
_EOT
|
||||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
@ -1891,7 +1853,13 @@ sinks:
|
|||
prometheus:
|
||||
type: prometheus_exporter
|
||||
inputs: ["vector_metrics"]
|
||||
address: "127.0.0.1:9001"
|
||||
address: 0.0.0.0:$${NOMAD_ALLOC_PORT_metrics}
|
||||
tls:
|
||||
enabled: true
|
||||
crt_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
verify_certificate: true
|
||||
|
||||
_EOT
|
||||
destination = "local/vector.yml"
|
||||
|
@ -1947,6 +1915,18 @@ _EOT
|
|||
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
|
||||
destination_type = "service"
|
||||
}
|
||||
upstreams {
|
||||
destination_name = "loki"
|
||||
local_bind_port = 3100
|
||||
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
|
||||
destination_type = "service"
|
||||
}
|
||||
upstreams {
|
||||
destination_name = "prometheus"
|
||||
local_bind_port = 9090
|
||||
# Workaround, see https://github.com/hashicorp/nomad/issues/18538
|
||||
destination_type = "service"
|
||||
}
|
||||
}
|
||||
}
|
||||
sidecar_task {
|
||||
|
@ -2015,15 +1995,18 @@ _EOT
|
|||
policies = ["metrics"]
|
||||
}
|
||||
|
||||
# Get a certificate from vault to protect the metrics endpoint
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }}
|
||||
{{ .Cert }}
|
||||
{{ .Key }}{{ end -}}
|
||||
{{ .Key }}
|
||||
{{- end }}
|
||||
_EOT
|
||||
destination = "secrets/metrics.bundle.pem"
|
||||
}
|
||||
|
||||
# Get the root CA
|
||||
template {
|
||||
data = <<_EOT
|
||||
{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }}
|
||||
|
@ -2031,6 +2014,7 @@ _EOT
|
|||
destination = "local/monitoring.ca.pem"
|
||||
}
|
||||
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
server {
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
path "nomad/creds/nomad-vector-logger" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
path "pki/nomad/issue/nomad-vector-logger" {
|
||||
capabilities = ["update"]
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
FROM golang:alpine AS builder
|
||||
|
||||
RUN set -eux &&\
|
||||
apk --no-cache add tar git ca-certificates &&\
|
||||
cd /tmp &&\
|
||||
git clone --depth=1 --branch=feat/name https://github.com/mr-karan/nomad-vector-logger.git &&\
|
||||
cd nomad-vector-logger &&\
|
||||
CGO_ENABLED=0 go build -ldflags="-s -w" -o /nomad-vector-logger
|
||||
|
||||
FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]]
|
||||
MAINTAINER [[ .docker.maintainer ]]
|
||||
|
||||
RUN set -euxo pipefail &&\
|
||||
mkdir -p /etc/nomad-vector-logger
|
||||
COPY --from=builder --chown=root:root --chmod=755 /nomad-vector-logger /usr/local/bin/nomad-vector-logger
|
||||
COPY --from=builder /tmp/nomad-vector-logger/config.sample.toml /etc/nomad-vector-logger/
|
||||
|
||||
WORKDIR /etc/nomad-vector-logger/
|
||||
CMD ["nomad-vector-logger"]
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
vault write nomad/role/nomad-vector-logger[[ .consul.suffix ]] \
|
||||
ttl=720h \
|
||||
max_ttl=720h \
|
||||
policies="nomad-vector-logger[[ .consul.suffix ]]"
|
12
init/pki
12
init/pki
|
@ -80,3 +80,15 @@ vault write pki/consul/roles/cluster-exporter[[ .consul.suffix ]] \
|
|||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Cluster metrics exporter"
|
||||
|
||||
# Create a role on the Nomad PKI for nomad-vector-logger
|
||||
vault write pki/nomad/roles/nomad-vector-logger[[ .consul.suffix ]] \
|
||||
allowed_domains='nomad-vector-logger[[ .consul.suffix ]].nomad.[[ .consul.domain ]]' \
|
||||
allow_bare_domains=true \
|
||||
allow_subdomains=false \
|
||||
allow_wildcard_certificates=false \
|
||||
max_ttl=168h \
|
||||
allow_ip_sans=false \
|
||||
server_flag=false \
|
||||
client_flag=true \
|
||||
ou="Nomad Vector Logger"
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[[- range $ns := .monitoring.namespaces ]]
|
||||
|
||||
namespace "[[ $ns ]]" {
|
||||
capabilities = ["list-jobs", "read-job"]
|
||||
}
|
||||
[[- end ]]
|
||||
|
||||
node {
|
||||
policy = "read"
|
||||
}
|
||||
|
||||
agent {
|
||||
policy = "read"
|
||||
}
|
|
@ -3,7 +3,7 @@ job "[[ .instance ]]-services" {
|
|||
[[ template "common/job_start" . ]]
|
||||
|
||||
# Metrics is running prometheus and various exporters
|
||||
group "metrics" {
|
||||
group "metrics-server" {
|
||||
[[- $c := merge .monitoring.prometheus .monitoring . ]]
|
||||
|
||||
shutdown_delay = "6s"
|
||||
|
@ -375,7 +375,7 @@ _EOT
|
|||
}
|
||||
}
|
||||
|
||||
group "logs" {
|
||||
group "logs-server" {
|
||||
|
||||
[[- $c := merge .monitoring.loki .monitoring . ]]
|
||||
|
||||
|
@ -481,7 +481,7 @@ _EOT
|
|||
|
||||
# The aggregator group runs vector with different source connectors (syslog, fluentd, vector, etc.)
|
||||
# And with a loki sink. The goal is to be able to collect logs from various sources
|
||||
group "aggregator" {
|
||||
group "logs-aggregator" {
|
||||
[[- $c := merge .monitoring.aggregator .monitoring . ]]
|
||||
|
||||
count = [[ $c.count ]]
|
||||
|
@ -530,8 +530,6 @@ _EOT
|
|||
}
|
||||
[[- end ]]
|
||||
|
||||
[[ template "common/task.metrics_proxy" $c ]]
|
||||
|
||||
task "vector" {
|
||||
driver = "[[ $c.nomad.driver ]]"
|
||||
|
||||
|
@ -542,7 +540,9 @@ _EOT
|
|||
args = [ "--config=/local/vector.yml" ]
|
||||
}
|
||||
|
||||
[[ template "common/vault.policies" $c ]]
|
||||
[[ template "common/file_env" $c ]]
|
||||
[[ template "common/metrics_cert" $c ]]
|
||||
|
||||
template {
|
||||
data = <<_EOT
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[app]
|
||||
log_level = "info"
|
||||
env = "prod"
|
||||
refresh_interval = "10s"
|
||||
remove_alloc_interval = "30s"
|
||||
nomad_data_dir = "/nomad"
|
||||
vector_config_dir = "/alloc/data/vector_conf"
|
||||
extra_templates_dir = "/local/template/"
|
||||
custom_logs_dir = "alloc/custom"
|
|
@ -0,0 +1,36 @@
|
|||
sources:
|
||||
{{- range $value := . }}
|
||||
|
||||
source_{{ $value.Key }}:
|
||||
type: file
|
||||
include: ["{{ $value.LogDir }}"]
|
||||
line_delimiter: "\n"
|
||||
read_from: beginning
|
||||
# Handle multi-line Java stacktraces
|
||||
multiline:
|
||||
start_pattern: "^[^\\s]"
|
||||
mode: continue_through
|
||||
condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
|
||||
timeout_ms: 1000
|
||||
|
||||
{{- end }}
|
||||
|
||||
transforms:
|
||||
{{- range $value := . }}
|
||||
|
||||
transform_{{ $value.Key }}:
|
||||
type: remap
|
||||
inputs: ["source_{{ $value.Key }}"]
|
||||
source: |
|
||||
# Store Nomad metadata.
|
||||
.nomad.namespace = "{{ $value.Namespace }}"
|
||||
.nomad.node_name = "{{ $value.Node }}"
|
||||
.nomad.job_name = "{{ $value.Job }}"
|
||||
.nomad.group_name = "{{ $value.Group }}"
|
||||
.nomad.task_name = "{{ $value.Task }}"
|
||||
.nomad.alloc_id = "{{ $value.ID }}"
|
||||
.nomad.alloc_name = "{{ $value.Name }}"
|
||||
# Set alloc = <TaskName>-<Alloc Index> so it's similar to what prometheus has
|
||||
.nomad.alloc = join!(["{{ $value.Task }}", replace("{{ $value.Name }}", r'.+\[(?P<index>\d+)\]', "$$index")], separator: "-")
|
||||
|
||||
{{- end }}
|
|
@ -0,0 +1,38 @@
|
|||
data_dir: /data
|
||||
# Don't keep metrics indefinitely if they are no longer updated
|
||||
expire_metrics_secs: 60
|
||||
|
||||
sources:
|
||||
metrics-vector:
|
||||
type: internal_metrics
|
||||
|
||||
sinks:
|
||||
loki:
|
||||
type: loki
|
||||
inputs: ["transform_nomad_alloc_*"]
|
||||
endpoint: http://127.0.0.1:3100
|
||||
encoding:
|
||||
codec: text
|
||||
labels:
|
||||
namespace: "{{ .nomad.namespace }}"
|
||||
job: "{{ .nomad.job_name }}"
|
||||
group: "{{ .nomad.group_name }}"
|
||||
task: "{{ .nomad.task_name }}"
|
||||
host: "{{ .nomad.node_name }}"
|
||||
alloc: "{{ .nomad.alloc }}"
|
||||
buffer:
|
||||
type: disk
|
||||
max_size: 268435488
|
||||
remove_label_fields: true
|
||||
|
||||
prometheus:
|
||||
type: prometheus_exporter
|
||||
inputs: ["metrics-vector"]
|
||||
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
|
||||
tls:
|
||||
enabled: true
|
||||
crt_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
verify_certificate: true
|
||||
|
|
@ -82,4 +82,10 @@ sinks:
|
|||
prometheus:
|
||||
type: prometheus_exporter
|
||||
inputs: ["vector_metrics"]
|
||||
address: "127.0.0.1:9001"
|
||||
address: 0.0.0.0:${NOMAD_ALLOC_PORT_metrics}
|
||||
tls:
|
||||
enabled: true
|
||||
crt_file: /secrets/metrics.bundle.pem
|
||||
key_file: /secrets/metrics.bundle.pem
|
||||
ca_file: /local/monitoring.ca.pem
|
||||
verify_certificate: true
|
||||
|
|
|
@ -51,6 +51,7 @@ limits_config:
|
|||
ingestion_burst_size_mb: 100
|
||||
max_entries_limit_per_query: 20000
|
||||
max_query_parallelism: 128
|
||||
split_queries_by_interval: 0
|
||||
|
||||
ruler:
|
||||
alertmanager_url: alertmanager-tls[[ .consul.suffix ]]
|
||||
|
|
|
@ -179,6 +179,14 @@ scrape_configs:
|
|||
action: drop
|
||||
regex: (nomad(\-client)?|consul|vault)
|
||||
|
||||
[[- if not (has .namespaces "*") ]]
|
||||
|
||||
# Only monitor services from the configured namespaces
|
||||
- source_labels: [__meta_consul_service_metadata_namespace]
|
||||
regex: ^[[ $namespaces := coll.Slice ]][[ range $ns := .namespaces ]][[ $ns = $ns | regexp.Replace "^\\*$" ".+" ]][[ $namespaces = append $ns $namespaces ]][[ end ]][[ join $namespaces "|" ]]$
|
||||
action: keep
|
||||
[[- end ]]
|
||||
|
||||
# Only keep services having a metrics-port set
|
||||
- source_labels: [__meta_consul_service_metadata_metrics_port]
|
||||
regex: \d+
|
||||
|
|
|
@ -17,13 +17,13 @@ groups:
|
|||
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulMissingMasterNode
|
||||
expr: 'consul_raft_peers < (max_over_time(consul_raft_peers{}[6h]) / 2) + 1'
|
||||
expr: 'consul_raft_leader != 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Consul missing master node (node {{ $labels.node }})
|
||||
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "No consul leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ConsulAgentUnhealthy
|
||||
expr: 'consul_health_node_status{status="critical"} == 1'
|
||||
|
|
|
@ -14,6 +14,9 @@ vault:
|
|||
|
||||
monitoring:
|
||||
|
||||
namespaces:
|
||||
- '*'
|
||||
|
||||
exporters:
|
||||
count: 1
|
||||
|
||||
|
@ -166,6 +169,9 @@ monitoring:
|
|||
upstreams:
|
||||
- destination_name: 'loki[[ .consul.suffix ]]'
|
||||
local_bind_port: 3100
|
||||
vault:
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
fluentd:
|
||||
enabled: false
|
||||
traefik:
|
||||
|
@ -210,6 +216,10 @@ monitoring:
|
|||
upstreams:
|
||||
- destination_name: postgres[[ .consul.suffix ]]
|
||||
local_bind_port: 5432
|
||||
- destination_name: loki[[ .consul.suffix ]]
|
||||
local_bind_port: 3100
|
||||
- destination_name: prometheus[[ .consul.suffix ]]
|
||||
local_bind_port: 9090
|
||||
volumes:
|
||||
data:
|
||||
type: csi
|
||||
|
@ -229,5 +239,48 @@ monitoring:
|
|||
prometheus:
|
||||
metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics
|
||||
|
||||
agent:
|
||||
consul:
|
||||
meta:
|
||||
alloc: '${node.unique.name}'
|
||||
nomad:
|
||||
node_pool: all
|
||||
nomad_vector_logger:
|
||||
version: 24.3
|
||||
image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 20
|
||||
memory: 24
|
||||
memory_max: 50
|
||||
vault:
|
||||
policies:
|
||||
- nomad-vector-logger[[ .consul.suffix ]]
|
||||
vector:
|
||||
image: '[[ .monitoring.vector.image ]]'
|
||||
env: {}
|
||||
resources:
|
||||
cpu: 100
|
||||
memory: 192
|
||||
memory_max: 384
|
||||
vault:
|
||||
policies:
|
||||
- metrics[[ .consul.suffix ]]
|
||||
consul:
|
||||
connect:
|
||||
upstreams:
|
||||
- destination_name: loki[[ .consul.suffix ]]
|
||||
local_bind_port: 3100
|
||||
volumes:
|
||||
nomad:
|
||||
type: host
|
||||
source: nomad_alloc
|
||||
read_only: true
|
||||
data:
|
||||
type: host
|
||||
source: vector_data
|
||||
prometheus:
|
||||
metrics_url: http://127.0.0.1:9001/metrics
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
path "nomad/creds/nomad-vector-logger[[ .consul.suffix ]]" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
path "pki/nomad/issue/nomad-vector-logger[[ .consul.suffix ]]" {
|
||||
capabilities = ["update"]
|
||||
}
|
Loading…
Reference in New Issue