From 2ae2a910025ff45db11104b22fc2d0d563f22270 Mon Sep 17 00:00:00 2001 From: Daniel Berteaud Date: Mon, 25 Mar 2024 22:23:31 +0100 Subject: [PATCH] Various cleanup --- agent.nomad.hcl | 55 +- .../service-defaults/vector-aggregator.hcl | 3 + .../service-intentions/vector-aggregator.hcl | 16 + example/.services.nomad.hcl.swp | 0 example/.variables.yml.swp | Bin 12022 -> 35547 bytes example/agent.nomad.hcl | 39 +- .../service-defaults/vector-aggregator.hcl | 3 + .../service-intentions/vector-aggregator.hcl | 15 + example/exporters.nomad.hcl | 2 +- example/images/node-exporter/Dockerfile | 24 + example/services.nomad.hcl | 593 ++++++++++++++---- images/node-exporter/Dockerfile | 24 + services.nomad.hcl | 28 +- templates/agent/vector-template.yml | 2 + templates/alertmanager/nginx.conf | 39 +- templates/prometheus/rules/loki.yml | 41 ++ templates/prometheus/rules/node.yml | 347 ++++++++++ variables.yml | 246 ++++++-- 18 files changed, 1281 insertions(+), 196 deletions(-) create mode 100644 consul/config/service-defaults/vector-aggregator.hcl create mode 100644 consul/config/service-intentions/vector-aggregator.hcl create mode 100644 example/.services.nomad.hcl.swp create mode 100644 example/consul/config/service-defaults/vector-aggregator.hcl create mode 100644 example/consul/config/service-intentions/vector-aggregator.hcl create mode 100644 example/images/node-exporter/Dockerfile create mode 100644 images/node-exporter/Dockerfile create mode 100644 templates/prometheus/rules/loki.yml create mode 100644 templates/prometheus/rules/node.yml diff --git a/agent.nomad.hcl b/agent.nomad.hcl index 0bfd901..20c371f 100644 --- a/agent.nomad.hcl +++ b/agent.nomad.hcl @@ -1,7 +1,9 @@ job "[[ .instance ]]-agent" { [[- $c := merge .monitoring.agent .monitoring . ]] + [[ template "common/job_start" $c ]] + type = "system" # This group will collect logs from the allocation running on the node @@ -39,16 +41,16 @@ job "[[ .instance ]]-agent" { user = 3987 config { - image = "[[ $n.image ]]" + image = "[[ $n.image ]]" readonly_rootfs = true - pids_limit = 50 + pids_limit = 50 # Nomad Vector Logger needs to run on the host's network namespace # so it can reach the Nomad Agent API on localhost:4646 network_mode = "host" # Host network namespace requires disabling user namespace userns_mode = "host" - command = "nomad-vector-logger" - args = [ + command = "nomad-vector-logger" + args = [ "--config", "/local/nomad-vector-logger.toml" ] @@ -85,9 +87,9 @@ _EOT destination = "local/nomad-vector-logger.toml" } - # Disable the default nomad.toml template + # Disable the default nomad.toml template, as we provide our own nomad.yml template template { - data = "# Disable the default toml template" + data = "# Disable the default toml template" destination = "local/template/nomad.toml" } @@ -143,11 +145,13 @@ _EOT driver = "[[ $c.nomad.driver ]]" config { - image = "busybox:latest" - command = "sh" - args = [ + image = "busybox:latest" + readonly_rootfs = true + pids_limit = 20 + command = "sh" + args = [ "-c", - "echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done" + "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done" ] } @@ -170,9 +174,11 @@ _EOT leader = true config { - image = "[[ $c.image ]]" - userns_mode = "host" - args = [ + image = "[[ $c.image ]]" + userns_mode = "host" + 
readonly_rootfs = true + pids_limit = 200 + args = [ "--watch-config", "--config", "/local/vector.yml", "--config-dir", "/alloc/data/vector_conf" @@ -186,7 +192,9 @@ _EOT } [[ template "common/metrics_cert" $c ]] +[[ template "common/artifacts" $c ]] + # Main vector configuration template { data =<<_EOT [[ template "monitoring/agent/vector.yml" $c ]] @@ -217,6 +225,8 @@ _EOT } } +[[- if .monitoring.agent.node_exporter.enabled ]] + # This group runs the prometheus node-exporter to expose prometheus metrics from the node group "node-exporter" { @@ -238,21 +248,25 @@ _EOT driver = "[[ $c.nomad.driver ]]" config { - image = "[[ $c.image ]]" - pid_mode = "host" - #network_mode = "host" - userns_mode = "host" + image = "[[ $c.image ]]" + pid_mode = "host" + userns_mode = "host" readonly_rootfs = true - pids_limit = 50 - args = [ + pids_limit = 50 + command = "/usr/local/bin/node_exporter" + args = [ "--path.rootfs=/host", "--web.config.file=/local/tls.yml", - "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}", +[[- range $arg := $c.args ]] + "[[ $arg ]]", +[[- end ]] ] } [[ template "common/vault.policies" $c ]] [[ template "common/metrics_cert" $c ]] +[[ template "common/artifacts" $c ]] template { data = <<_EOT @@ -271,4 +285,5 @@ _EOT [[ template "common/resources" $c ]] } } +[[- end ]] } diff --git a/consul/config/service-defaults/vector-aggregator.hcl b/consul/config/service-defaults/vector-aggregator.hcl new file mode 100644 index 0000000..ec88768 --- /dev/null +++ b/consul/config/service-defaults/vector-aggregator.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "vector-aggregator[[ .consul.suffix ]]" +Protocol = "http" diff --git a/consul/config/service-intentions/vector-aggregator.hcl b/consul/config/service-intentions/vector-aggregator.hcl new file mode 100644 index 0000000..2216a22 --- /dev/null +++ b/consul/config/service-intentions/vector-aggregator.hcl @@ -0,0 +1,16 @@ +[[- $c := merge .monitoring.aggregator .monitoring . 
-]]
+Kind = "service-intentions"
+Name = "vector-aggregator[[ .consul.suffix ]]"
+Sources = [
+  {
+    Name = "[[ $c.traefik.instance ]]"
+    Permissions = [
+      {
+        Action = "[[ $c.traefik.enabled | ternary "allow" "deny" ]]"
+        HTTP {
+          Methods = ["POST"]
+        }
+      }
+    ]
+  }
+]
diff --git a/example/.services.nomad.hcl.swp b/example/.services.nomad.hcl.swp
new file mode 100644
index 0000000..e69de29
diff --git a/example/.variables.yml.swp b/example/.variables.yml.swp
index 1e6e138b0e4049b0dea0e9e2eae1496020e9a742..b0d77bb59a65a1cf53bed80088cb84998cc61328 100644
GIT binary patch
diff --git a/example/agent.nomad.hcl b/example/agent.nomad.hcl
index ccfb727..c233b43 100644
--- a/example/agent.nomad.hcl
+++ b/example/agent.nomad.hcl
@@ -1,8 +1,11 @@
 job "monitoring-agent" {
 
+  datacenters = ["dc1"]
   region    = "global"
   node_pool = "all"
+  priority  = 60
+
   type = "system"
 
 
@@ -161,7 +164,7 @@ _EOT
       destination = "local/nomad-vector-logger.toml"
     }
 
-    # Disable the default nomad.toml template
+    # Disable the default nomad.toml template, as we provide our own nomad.yml template
     template {
       data        = "# Disable the default toml template"
       destination = "local/template/nomad.toml"
@@ -184,6 +187,8 @@ sources:
     mode: continue_through
     condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
     timeout_ms: 1000
+  ignore_older_secs: 1800
+  oldest_first: true
 {{- end }}
 
 
@@ -262,11 +267,13 @@ _EOT
       driver = "docker"
 
       config {
- image = "busybox:latest" - command = "sh" + image = "busybox:latest" + readonly_rootfs = true + pids_limit = 20 + command = "sh" args = [ "-c", - "echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done" + "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done" ] } @@ -289,8 +296,10 @@ _EOT leader = true config { - image = "danielberteaud/vector:0.36.1-1" - userns_mode = "host" + image = "danielberteaud/vector:0.36.1-1" + userns_mode = "host" + readonly_rootfs = true + pids_limit = 200 args = [ "--watch-config", "--config", "/local/vector.yml", @@ -331,6 +340,9 @@ _EOT } + + + # Main vector configuration template { data = <<_EOT data_dir: /data @@ -398,8 +410,8 @@ _EOT resources { cpu = 100 - memory = 192 - memory_max = 384 + memory = 384 + memory_max = 512 } } @@ -436,16 +448,17 @@ _EOT driver = "docker" config { - image = "quay.io/prometheus/node-exporter:latest" - pid_mode = "host" - #network_mode = "host" + image = "danielberteaud/node-exporter:1.7.0-1" + pid_mode = "host" userns_mode = "host" readonly_rootfs = true pids_limit = 50 + command = "/usr/local/bin/node_exporter" args = [ "--path.rootfs=/host", "--web.config.file=/local/tls.yml", - "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}", + "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)", ] } @@ -477,6 +490,8 @@ _EOT } + + template { data = <<_EOT tls_server_config: diff --git a/example/consul/config/service-defaults/vector-aggregator.hcl b/example/consul/config/service-defaults/vector-aggregator.hcl new file mode 100644 index 0000000..9af7463 --- /dev/null +++ b/example/consul/config/service-defaults/vector-aggregator.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "vector-aggregator" +Protocol = "http" diff --git a/example/consul/config/service-intentions/vector-aggregator.hcl b/example/consul/config/service-intentions/vector-aggregator.hcl new file mode 100644 index 0000000..5e76613 --- /dev/null +++ b/example/consul/config/service-intentions/vector-aggregator.hcl @@ -0,0 +1,15 @@ +Kind = "service-intentions" +Name = "vector-aggregator" +Sources = [ + { + Name = "traefik" + Permissions = [ + { + Action = "allow" + HTTP { + Methods = ["POST"] + } + } + ] + } +] diff --git a/example/exporters.nomad.hcl b/example/exporters.nomad.hcl index 79fe485..035f046 100644 --- a/example/exporters.nomad.hcl +++ b/example/exporters.nomad.hcl @@ -411,7 +411,7 @@ _EOT resources { cpu = 10 - memory = 15 + memory = 20 } } diff --git a/example/images/node-exporter/Dockerfile b/example/images/node-exporter/Dockerfile new file mode 100644 index 0000000..4a7eb63 --- /dev/null +++ b/example/images/node-exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM danielberteaud/alpine:24.3-1 AS builder + +ARG EXPORTER_VERSION=1.7.0 + +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp + +RUN set -euxo pipefail &&\ + apk --no-cache add \ + curl \ + tar \ + ca-certificates \ + &&\ + cd /tmp &&\ + grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\ + tar xvzf 
node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter + +FROM danielberteaud/alpine:24.3-1 +MAINTAINER Daniel Berteaud + +COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter + +CMD ["/usr/local/bin/node_exporter"] diff --git a/example/services.nomad.hcl b/example/services.nomad.hcl index fcbd7c0..032dcdf 100644 --- a/example/services.nomad.hcl +++ b/example/services.nomad.hcl @@ -5,7 +5,7 @@ job "monitoring-services" { region = "global" - # Metrics is running prometheus and various exporters + # Metrics is running prometheus group "metrics-server" { shutdown_delay = "6s" @@ -67,7 +67,7 @@ job "monitoring-services" { type = "http" expose = true path = "/-/healthy" - interval = "15s" + interval = "20s" timeout = "8s" check_restart { limit = 10 @@ -77,11 +77,6 @@ job "monitoring-services" { tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-prometheus.entrypoints=https", - "traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)", - "traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus", ] } @@ -892,6 +887,410 @@ _EOT left_delimiter = "{{{" right_delimiter = "}}}" } + template { + data = <<_EOT +groups: + +- name: EmbeddedExporter + + rules: + + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/loki.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + template { + data = <<_EOT +groups: + +- 
name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of 
disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 0m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * 
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostCpuIsUnderutilized +# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 1w +# labels: +# severity: info +# annotations: +# summary: Host CPU is underutilized (instance {{ $labels.instance }}) +# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. 
Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitching + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostSwapIsFillingUp +# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 2m +# labels: +# severity: warning +# annotations: +# summary: Host swap is filling up (instance {{ $labels.instance }}) +# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface 
Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/node.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } # A client cert, to connect to the AlertManager API template { @@ -945,8 +1344,11 @@ _EOT network { mode = "bridge" + # Port exposing the web API, with mTLS port "web-tls" {} + # Port used for gossip between the different alertmanager instance port "cluster" {} + # Port to expose metrics to prometheus port "metrics" {} } @@ -1031,101 +1433,10 @@ _EOT tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-alertmanager.entrypoints=https", - "traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)", - "traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager", ] } - - # The prometheus metrics proxy, adding mTLS to the metrics endpoint - task "metrics-proxy" { - driver = "docker" - user = 8995 - - config { - image = "nginxinc/nginx-unprivileged:alpine" - force_pull = true - volumes = [ - "local/default.conf:/etc/nginx/conf.d/default.conf:ro" - ] - pids_limit = 100 - } - - lifecycle { - hook = "poststart" - sidecar = true - } - - vault { - policies = ["metrics"] - } - - # Get a certificate from vault to protect the metrics endpoint - template { - data = <<_EOT -{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} -{{ .Cert }} -{{ .Key }} -{{- end }} -_EOT - destination = "secrets/metrics.bundle.pem" - } - - # Get the root CA - template { - data = <<_EOT -{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - - - template { - data = <<_EOT -server { - listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; - http2 on; - - ssl_certificate /secrets/metrics.bundle.pem; - ssl_certificate_key /secrets/metrics.bundle.pem; - ssl_client_certificate /local/monitoring.ca.pem; - ssl_verify_client on; - ssl_protocols TLSv1.2 TLSv1.3; - ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; - ssl_session_cache shared:SSL:10m; - ssl_session_timeout 1h; - ssl_session_tickets off; - gzip on; - gzip_types - text/plain; - gzip_vary on; - - server_tokens off; - - if ($request_method !~ ^(GET|HEAD)$ ) { - return 405; - } - location /metrics { - proxy_pass http://127.0.0.1:9093/metrics; - } -} -_EOT - destination = "local/default.conf" - } - - resources { - cpu = 10 - memory = 10 - memory_max = 20 - } - } - - - # This task will handle mTLS to the AlertManager API # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy task 
"untls-proxy" { @@ -1166,10 +1477,11 @@ _EOT template { data = <<_EOT +# UnTLS for the web API server { listen 127.0.0.1:9093; location / { - proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; @@ -1180,10 +1492,66 @@ server { } } +# Metrics proxy +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + location /metrics { + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + } +} + + _EOT destination = "local/alertmanager.conf" } + # Get a certificate from vault to protect the metrics endpoint + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + # Get the root CA + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + # Certifiate used by AlertManager template { data = <<_EOT @@ -1203,14 +1571,6 @@ _EOT change_signal = "SIGHUP" } - # The trusted CA - template { - data = <<_EOT -{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - resources { cpu = 10 memory = 18 @@ -1300,7 +1660,7 @@ set -euo pipefail exec alertmanager \ --config.file=/secrets/alertmanager.yml \ --storage.path=/data \ - --web.external-url=https://alerte.example.org \ + --web.external-url=https://alert.example.org \ --web.route-prefix=/ \ --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \ --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \ @@ -1430,11 +1790,6 @@ _EOT tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-loki.entrypoints=https", - "traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)", - "traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki", ] } @@ -2048,7 +2403,7 @@ server { return 405; } location /metrics { - proxy_pass http://localhost:3000/metrics; + 
proxy_pass http://127.0.0.1:3000/metrics; } } _EOT @@ -2132,7 +2487,6 @@ _EOT # Use a template block instead of env {} so we can fetch values from vault template { data = <<_EOT -GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }} LANG=fr_FR.utf8 TZ=Europe/Paris _EOT @@ -2142,6 +2496,15 @@ _EOT } + template { + data = <<_EOT +GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' +_EOT + destination = "secrets/.grafana.env" + perms = 400 + env = true + } + # Basic grafana configuration file template { data = <<_EOT diff --git a/images/node-exporter/Dockerfile b/images/node-exporter/Dockerfile new file mode 100644 index 0000000..05f2cb4 --- /dev/null +++ b/images/node-exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG EXPORTER_VERSION=[[ .monitoring.agent.node_exporter.version ]] + +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp + +RUN set -euxo pipefail &&\ + apk --no-cache add \ + curl \ + tar \ + ca-certificates \ + &&\ + cd /tmp &&\ + grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\ + tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter + +CMD ["/usr/local/bin/node_exporter"] diff --git a/services.nomad.hcl b/services.nomad.hcl index a5b4c0a..cdb2cf1 100644 --- a/services.nomad.hcl +++ b/services.nomad.hcl @@ -2,7 +2,7 @@ job "[[ .instance ]]-services" { [[ template "common/job_start" . ]] - # Metrics is running prometheus and various exporters + # Metrics is running prometheus group "metrics-server" { [[- $c := merge .monitoring.prometheus .monitoring . 
]] @@ -28,7 +28,7 @@ job "[[ .instance ]]-services" { type = "http" expose = true path = "/-/healthy" - interval = "15s" + interval = "20s" timeout = "8s" check_restart { limit = 10 @@ -168,8 +168,11 @@ _EOT network { mode = "bridge" + # Port exposing the web API, with mTLS port "web-tls" {} + # Port used for gossip between the different alertmanager instance port "cluster" {} + # Port to expose metrics to prometheus port "metrics" {} } @@ -220,8 +223,6 @@ _EOT ] } -[[ template "common/task.metrics_proxy" $c ]] - # This task will handle mTLS to the AlertManager API # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy task "untls-proxy" { @@ -253,6 +254,8 @@ _EOT destination = "local/alertmanager.conf" } +[[ template "common/metrics_cert" $c ]] + # Certifiate used by AlertManager template { data = <<_EOT @@ -272,14 +275,6 @@ _EOT change_signal = "SIGHUP" } - # The trusted CA - template { - data = <<_EOT -{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - resources { cpu = 10 memory = 18 @@ -617,6 +612,15 @@ _EOT [[ template "common/vault.policies" $c ]] [[ template "common/file_env" $c ]] + template { + data = <<_EOT +GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' +_EOT + destination = "secrets/.grafana.env" + perms = 400 + env = true + } + # Basic grafana configuration file template { data = <<_EOT diff --git a/templates/agent/vector-template.yml b/templates/agent/vector-template.yml index 45d225d..37f8e79 100644 --- a/templates/agent/vector-template.yml +++ b/templates/agent/vector-template.yml @@ -12,6 +12,8 @@ sources: mode: continue_through condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)" timeout_ms: 1000 + ignore_older_secs: 1800 + oldest_first: true {{- end }} diff --git a/templates/alertmanager/nginx.conf b/templates/alertmanager/nginx.conf index a27d027..1b35167 100644 --- a/templates/alertmanager/nginx.conf +++ b/templates/alertmanager/nginx.conf @@ -1,7 +1,8 @@ +# UnTLS for the web API server { listen 127.0.0.1:9093; location / { - proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; @@ -11,3 +12,39 @@ server { deny all; } } + +# Metrics proxy +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + location /metrics { + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" 
}}.[[ .instance ]].[[ .consul.domain ]]; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + } +} + diff --git a/templates/prometheus/rules/loki.yml b/templates/prometheus/rules/loki.yml new file mode 100644 index 0000000..077036a --- /dev/null +++ b/templates/prometheus/rules/loki.yml @@ -0,0 +1,41 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/node.yml b/templates/prometheus/rules/node.yml new file mode 100644 index 0000000..df796c9 --- /dev/null +++ b/templates/prometheus/rules/node.yml @@ -0,0 +1,347 @@ +groups: + +- name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 0m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostCpuIsUnderutilized +# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 1w +# labels: +# severity: info +# annotations: +# summary: Host CPU is underutilized (instance {{ $labels.instance }}) +# description: "CPU load is < 20% for 1 week. 
Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitching + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostSwapIsFillingUp +# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 2m +# labels: +# severity: warning +# annotations: +# summary: Host swap is filling up (instance {{ $labels.instance }}) +# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node 
temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: 
'(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/variables.yml b/variables.yml index 46e2d82..77ddff7 100644 --- a/variables.yml +++ b/variables.yml @@ -76,179 +76,284 @@ monitoring: # - https://portal.acme.com http_probes: [] - # Consul exporter will expose consul metrics + # Consul exporter will expose consul metrics (mainly registered services status) consul: + # Version of the exporter version: 0.11.0 + # Docker image to use image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 20 memory: 32 vault: + # Vault policies to attach policies: - 'consul-exporter[[ .consul.suffix ]]' + # The cluster exporter is a simple nginx used as a proxy + # which handles TLS for the cluster services (vault, consul and nomad) cluster: + # Docker image to use image: nginxinc/nginx-unprivileged:alpine + # Custom env env: {} + # Resource allocation resources: cpu: 10 - memory: 15 + memory: 20 vault: + # Vault policies to attach to the task policies: - 'cluster-exporter[[ .consul.suffix ]]' - - metrics + - metrics[[ .consul.suffix ]] + # The prometheus server prometheus: - - version: 2.51.0 - + # Number of instances to run. Note that if you run several instances, they will be independent, and all of + # them will scrape the same data. Then queries to the prometheus API will be load-balanced between all instances. + # This should work most of the time, but can give some strange results if, eg, one of the instances was down (queries + # for data during the downtime can give some random results depending on the instance your query is routed to) count: 1 - + # Version of prometheus + version: 2.51.0 + # Docker image to use image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1' - + # Custom env var to set env: {} - + # Resource allocation resources: cpu: 200 memory: 512 - + # Volumes used for data persistence + # You must create a prometheus-data[0] volume as it's a per_alloc volume volumes: data: type: csi source: 'prometheus-data' per_alloc: true - vault: + # Vault policies to attach to the task policies: - 'prometheus[[ .consul.suffix ]]' - + # A dict of custom jobs. Eg + # jobs: + # squid: + # targets: + # - 10.11.2.3:9305 + # - 192.168.6.20:782 jobs: {} - alert_rules: {} + # A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg # alert_rules: # postgres: # url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml - + # patroni: + # url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml + # If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead.
Just ensure your rules + # are in /local/rules/ inside the container + alert_rules: {} + # The public URL where prometheus will be reachable (if exposed with Traefik) public_url: https://prometheus.example.org + # Traefik settings traefik: - enabled: true + # Turn this on to expose prometheus with Traefik + # Caution : there's no builtin security, you should configure the appropriate middlewares + enabled: false router: prometheus - + # Metrics retention duration retention: 30d - + # always enable prometheus metrics (of course :-) ) prometheus: - enabled: true + # This is the URL where metrics are exposed, which the metrics proxy will point at (from the container PoV) metrics_url: http://localhost:9090/metrics + # AlertManager can process and send alerts alertmanager: + # Number of instances to run. Set > 1 if you want HA count: 1 + # Version of alertmanager version: 0.27.0 + # Docker image to use image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 50 memory: 64 memory_max: 80 - public_url: https://alerte.example.org + # URL where the web interface is reachable (if exposed with Traefik) + public_url: https://alert.example.org + # Traefik settings traefik: - enabled: true + # Turn this on to expose alertmanager with Traefik + # Caution : there's no builtin security, you should configure appropriate middlewares before enabling + enabled: false router: alertmanager + # No need to strip prefix as alertmanager will be configured to handle it strip_prefix: false + # Volumes used for data persistence. Note : it's a per_alloc volume + # so you need to create eg alertmanager-data[0]. This volume should be writeable by the user with ID 9093 volumes: data: source: 'alertmanager-data' type: csi per_alloc: true - prometheus: - metrics_url: http://127.0.0.1:9093/metrics vault: + # List of vault policies to attach to the task policies: - - metrics - - 'alertmanager[[ .consul.suffix ]]' + - metrics[[ .consul.suffix ]] + - alertmanager[[ .consul.suffix ]] + # Email settings email: from: alertmanager@[[ .consul.domain ]] + # You can merge your own custom config with the default provided one. Eg + # custom_config: + # receivers: + # - name: dani + # email_configs: + # - to: dani@example.org + # route: + # group_by: ['alertname', 'cluster', 'job'] + # receiver: dani custom_config: {} + # Loki is the log server loki: + # Version of loki version: 2.9.6 + # Docker image to use image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 150 memory: 512 vault: + # Vault policies to attach to the task policies: - 'loki[[ .consul.suffix ]]' + # URL where loki is exposed (if enabled) public_url: https://loki.example.org + # Traefik settings traefik: + # Turn it on to expose Loki with Traefik + # Caution : there's no builtin security, you should add appropriate Traefik middlewares enabled: false router: loki + # Retention for logs. Older logs will be deleted retention: 720h # 1 month + # Custom configuration which will be merged on top of the default one custom_config: {} prometheus: + # URL where metrics are available for the metrics proxy (from inside the container PoV) metrics_url: http://localhost:3100/metrics + # Volumes for data persistence.
Should be writable for user id 3100 + volumes: data: type: csi source: 'loki-data' + # Common vector settings vector: + # Version of vector version: 0.36.1 + # Docker image to use image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1' + # Vector aggregator can be used to ingest logs from external devices (using syslog or fluentd) + # Logs will then be forwarded to loki aggregator: + # Number of instances count: 1 + # Docker image to use image: '[[ .monitoring.vector.image ]]' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 100 memory: 192 consul: connect: upstreams: + # Connect to loki through the service mesh - destination_name: 'loki[[ .consul.suffix ]]' local_bind_port: 3100 vault: + # Vault policies to attach to the task. + # Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task + # but we need to grant the metrics policy to the vector task instead policies: - metrics[[ .consul.suffix ]] + # Fluentd source settings fluentd: enabled: false traefik: router: fluentd entrypoints: - fluentd + # Syslog (udp) source settings syslog_udp: enabled: false traefik: router: syslog-udp entrypoints: - - syslog + - syslog-udp + # Syslog (tcp) source settings + syslog_tcp: + enabled: false + traefik: + router: syslog-tcp + entrypoints: + - syslog-tcp + # Native vector (http) source settings vector: enabled: true + # URL where the vector endpoint is available from the outside (if exposed with Traefik) public_url: https://vector.example.org traefik: + # Set to true if you want to expose the service with Traefik + # Caution : there's no builtin security, you should configure appropriate middlewares before enabling it enabled: false + # Grafana settings grafana: + # Grafana version version: 10.4.1 + # Docker image to use image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1' - env: - GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' + # Custom env var to set in the container + env: {} + # Resource allocation resources: cpu: 100 memory: 256 + # URL where Grafana is reachable public_url: https://grafana.example.org + # List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild + # the image if you want to update them plugins: - #- alexanderzobnin-zabbix-app - #- ddurieux-glpi-app - grafana-clock-panel - grafana-piechart-panel + # Dict of feature toggles.
# See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/ + # Example: + # feature_toggles: + # featureToggleAdminPage: true + # ssoSettingsApi: true feature_toggles: {} + # Traefik settings traefik: enabled: true router: grafana + # No need to strip prefix as Grafana will be configured to handle it correctly strip_prefix: false consul: connect: + # Connect to postgres, loki and prometheus with the service mesh upstreams: - destination_name: postgres[[ .consul.suffix ]] local_bind_port: 5432 @@ -256,16 +361,20 @@ monitoring: local_bind_port: 3100 - destination_name: prometheus[[ .consul.suffix ]] local_bind_port: 9090 + # Volumes for data persistence volumes: data: type: csi source: 'grafana-data' vault: + # Vault policies to attach to the task policies: - 'grafana[[ .consul.suffix ]]' + # Postgres DB settings database: role: grafana pgrole: grafana + # Override some default postgres handling postgres: database: grafana user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}' @@ -273,64 +382,131 @@ monitoring: pooler: mode: session prometheus: - metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics + # URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV) + metrics_url: http://127.0.0.1:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics + # Agent runs as a system job, on all the nodes agent: consul: meta: + # Override the alloc service meta, the hostname will be more useful than the alloc index (0) alloc: '${node.unique.name}' + # Nomad settings nomad: + # Run on all node pools node_pool: all + # Run with an above average priority + priority: 60 + + # Nomad vector logger is a small container which will query the Nomad API to discover running allocations on the current node, + # then generate a vector configuration with scraping for all the discovered allocations. nomad_vector_logger: - version: 24.3 - image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2' + # Docker image to use + image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 20 memory: 24 memory_max: 50 vault: + # Vault policies to attach to the task policies: - nomad-vector-logger[[ .consul.suffix ]] + + # Vector is the main task. It'll read its config created by nomad-vector-logger and will read log files + # accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki vector: + # Docker image to use image: '[[ .monitoring.vector.image ]]' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 100 - memory: 192 - memory_max: 384 + memory: 384 + memory_max: 512 vault: + # Vault policies to attach to the container. Vector can use mTLS on the metrics endpoint natively, so + # there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get + # a certificate from vault policies: - metrics[[ .consul.suffix ]] consul: connect: upstreams: + # Connect to loki with the service mesh - destination_name: loki[[ .consul.suffix ]] local_bind_port: 3100 + # Volumes for data persistence volumes: + # The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able + # to read the logs. You should create a host volume in the Nomad client config of all your nodes.
Eg + # client { + # enabled = true + # host_volume "nomad_alloc" { + # path = "/opt/nomad/data/alloc" + # read_only = "true" + # } + # } nomad: type: host source: nomad_alloc read_only: true + # The data volume will be used by vector for buffering (in case loki is unavailable) + # You can create a host volume in Nomad's client config, eg + # client { + # enabled = true + # host_volume "vector_data" { + # path = "/data/vector-agent" + # } + # } data: type: host source: vector_data + # The node exporter can be used to expose the host metrics to prometheus node_exporter: - image: quay.io/prometheus/node-exporter:latest + # Is the node exporter enabled ? (set to false if you don't want it, or if you + # already manage the node-exporter separately) + enabled: true + # Version of the exporter + version: 1.7.0 + # Docker image to use + image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 50 memory: 24 memory_max: 32 vault: + # Vault policies to attach to the task + # This exporter can handle mTLS itself, so no need to create a metrics_proxy task. Instead, grant the metrics policy + # so it can get a certificate from vault policies: - metrics[[ .consul.suffix ]] + # Args to add to the exporter on start + args: + - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)' + # Volumes volumes: + # The exporter should access the host root filesystem + # For this, you should create a host volume in Nomad's client config, eg + # client { + # enabled = true + # host_volume "host_root" { + # path = "/" + # read_only = true + # } + # } host: type: host source: host_root read_only: true +# Globally enable prometheus for this bundle :-) prometheus: enabled: true
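
The variables above are defaults meant to be overridden per deployment. Below is a minimal, illustrative sketch of a deployment-specific variables file layered over variables.yml; the public URL and scrape target are placeholders, and it assumes the bundle deep-merges such a file over these defaults, as the example/ directory suggests.

monitoring:
  prometheus:
    # Placeholder URL for this deployment
    public_url: https://prometheus.acme.com
    traefik:
      # Expose prometheus with Traefik (remember to add your own auth middlewares first)
      enabled: true
    # Extra scrape jobs, merged with the ones generated by the bundle (placeholder target)
    jobs:
      squid:
        targets:
          - 10.11.2.3:9305
    # Additional alert rules downloaded when prometheus starts
    alert_rules:
      postgres:
        url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
  agent:
    node_exporter:
      # Disable the bundled node-exporter if the hosts already run one
      enabled: false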