From 094aa1eb7e1e082bdf8a2a23fe4d583d52691e4d Mon Sep 17 00:00:00 2001 From: Daniel Berteaud Date: Tue, 26 Mar 2024 17:05:51 +0100 Subject: [PATCH] Update vector and double thres for Context Switch alert --- example/agent.nomad.hcl | 2 +- example/images/vector/Dockerfile | 2 +- example/services.nomad.hcl | 6 +++--- templates/prometheus/rules/node.yml | 4 ++-- variables.yml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/example/agent.nomad.hcl b/example/agent.nomad.hcl index 0406a0e..b875423 100644 --- a/example/agent.nomad.hcl +++ b/example/agent.nomad.hcl @@ -296,7 +296,7 @@ _EOT leader = true config { - image = "danielberteaud/vector:0.36.1-1" + image = "danielberteaud/vector:0.37.0-1" userns_mode = "host" readonly_rootfs = true pids_limit = 200 diff --git a/example/images/vector/Dockerfile b/example/images/vector/Dockerfile index 4b231a4..338ad0e 100644 --- a/example/images/vector/Dockerfile +++ b/example/images/vector/Dockerfile @@ -1 +1 @@ -FROM timberio/vector:0.36.1-alpine +FROM timberio/vector:0.37.0-alpine diff --git a/example/services.nomad.hcl b/example/services.nomad.hcl index cf8c94e..7af9208 100644 --- a/example/services.nomad.hcl +++ b/example/services.nomad.hcl @@ -1116,13 +1116,13 @@ groups: description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostContextSwitching - expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host context switching (instance {{ $labels.instance }}) - description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # - alert: HostSwapIsFillingUp # expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' @@ -2096,7 +2096,7 @@ _EOT leader = true config { - image = "danielberteaud/vector:0.36.1-1" + image = "danielberteaud/vector:0.37.0-1" readonly_rootfs = true pids_limit = 200 args = ["--config=/local/vector.yml"] diff --git a/templates/prometheus/rules/node.yml b/templates/prometheus/rules/node.yml index df796c9..c701caa 100644 --- a/templates/prometheus/rules/node.yml +++ b/templates/prometheus/rules/node.yml @@ -176,13 +176,13 @@ groups: description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostContextSwitching - expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 20000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: summary: Host context switching (instance {{ $labels.instance }}) - description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Context switching is growing on the node (> 20000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # - alert: HostSwapIsFillingUp # expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' diff --git a/variables.yml b/variables.yml index 0a817c0..da1c39e 100644 --- a/variables.yml +++ b/variables.yml @@ -262,7 +262,7 @@ monitoring: # Common vector settings vector: # Version of vector - version: 0.36.1 + version: 0.37.0 # Docker image to use image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1'