From 2ae2a910025ff45db11104b22fc2d0d563f22270 Mon Sep 17 00:00:00 2001 From: Daniel Berteaud Date: Mon, 25 Mar 2024 22:23:31 +0100 Subject: [PATCH] Various cleanup --- agent.nomad.hcl | 55 +- .../service-defaults/vector-aggregator.hcl | 3 + .../service-intentions/vector-aggregator.hcl | 16 + example/.services.nomad.hcl.swp | 0 example/.variables.yml.swp | Bin 12022 -> 35547 bytes example/agent.nomad.hcl | 39 +- .../service-defaults/vector-aggregator.hcl | 3 + .../service-intentions/vector-aggregator.hcl | 15 + example/exporters.nomad.hcl | 2 +- example/images/node-exporter/Dockerfile | 24 + example/services.nomad.hcl | 593 ++++++++++++++---- images/node-exporter/Dockerfile | 24 + services.nomad.hcl | 28 +- templates/agent/vector-template.yml | 2 + templates/alertmanager/nginx.conf | 39 +- templates/prometheus/rules/loki.yml | 41 ++ templates/prometheus/rules/node.yml | 347 ++++++++++ variables.yml | 246 ++++++-- 18 files changed, 1281 insertions(+), 196 deletions(-) create mode 100644 consul/config/service-defaults/vector-aggregator.hcl create mode 100644 consul/config/service-intentions/vector-aggregator.hcl create mode 100644 example/.services.nomad.hcl.swp create mode 100644 example/consul/config/service-defaults/vector-aggregator.hcl create mode 100644 example/consul/config/service-intentions/vector-aggregator.hcl create mode 100644 example/images/node-exporter/Dockerfile create mode 100644 images/node-exporter/Dockerfile create mode 100644 templates/prometheus/rules/loki.yml create mode 100644 templates/prometheus/rules/node.yml diff --git a/agent.nomad.hcl b/agent.nomad.hcl index 0bfd901..20c371f 100644 --- a/agent.nomad.hcl +++ b/agent.nomad.hcl @@ -1,7 +1,9 @@ job "[[ .instance ]]-agent" { [[- $c := merge .monitoring.agent .monitoring . ]] + [[ template "common/job_start" $c ]] + type = "system" # This group will collect logs from the allocation running on the node @@ -39,16 +41,16 @@ job "[[ .instance ]]-agent" { user = 3987 config { - image = "[[ $n.image ]]" + image = "[[ $n.image ]]" readonly_rootfs = true - pids_limit = 50 + pids_limit = 50 # Nomad Vector Logger needs to run on the host's network namespace # so it can reach the Nomad Agent API on localhost:4646 network_mode = "host" # Host network namespace requires disabling user namespace userns_mode = "host" - command = "nomad-vector-logger" - args = [ + command = "nomad-vector-logger" + args = [ "--config", "/local/nomad-vector-logger.toml" ] @@ -85,9 +87,9 @@ _EOT destination = "local/nomad-vector-logger.toml" } - # Disable the default nomad.toml template + # Disable the default nomad.toml template, as we provide our own nomad.yml template template { - data = "# Disable the default toml template" + data = "# Disable the default toml template" destination = "local/template/nomad.toml" } @@ -143,11 +145,13 @@ _EOT driver = "[[ $c.nomad.driver ]]" config { - image = "busybox:latest" - command = "sh" - args = [ + image = "busybox:latest" + readonly_rootfs = true + pids_limit = 20 + command = "sh" + args = [ "-c", - "echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done" + "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done" ] } @@ -170,9 +174,11 @@ _EOT leader = true config { - image = "[[ $c.image ]]" - userns_mode = "host" - args = [ + image = "[[ $c.image ]]" + userns_mode = "host" + 
readonly_rootfs = true + pids_limit = 200 + args = [ "--watch-config", "--config", "/local/vector.yml", "--config-dir", "/alloc/data/vector_conf" @@ -186,7 +192,9 @@ _EOT } [[ template "common/metrics_cert" $c ]] +[[ template "common/artifacts" $c ]] + # Main vector configuration template { data =<<_EOT [[ template "monitoring/agent/vector.yml" $c ]] @@ -217,6 +225,8 @@ _EOT } } +[[- if .monitoring.agent.node_exporter.enabled ]] + # This group runs the prometheus node-exporter to expose prometheus metrics from the node group "node-exporter" { @@ -238,21 +248,25 @@ _EOT driver = "[[ $c.nomad.driver ]]" config { - image = "[[ $c.image ]]" - pid_mode = "host" - #network_mode = "host" - userns_mode = "host" + image = "[[ $c.image ]]" + pid_mode = "host" + userns_mode = "host" readonly_rootfs = true - pids_limit = 50 - args = [ + pids_limit = 50 + command = "/usr/local/bin/node_exporter" + args = [ "--path.rootfs=/host", "--web.config.file=/local/tls.yml", - "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}", +[[- range $arg := $c.args ]] + "[[ $arg ]]", +[[- end ]] ] } [[ template "common/vault.policies" $c ]] [[ template "common/metrics_cert" $c ]] +[[ template "common/artifacts" $c ]] template { data = <<_EOT @@ -271,4 +285,5 @@ _EOT [[ template "common/resources" $c ]] } } +[[- end ]] } diff --git a/consul/config/service-defaults/vector-aggregator.hcl b/consul/config/service-defaults/vector-aggregator.hcl new file mode 100644 index 0000000..ec88768 --- /dev/null +++ b/consul/config/service-defaults/vector-aggregator.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "vector-aggregator[[ .consul.suffix ]]" +Protocol = "http" diff --git a/consul/config/service-intentions/vector-aggregator.hcl b/consul/config/service-intentions/vector-aggregator.hcl new file mode 100644 index 0000000..2216a22 --- /dev/null +++ b/consul/config/service-intentions/vector-aggregator.hcl @@ -0,0 +1,16 @@ +[[- $c := merge .monitoring.aggregator .monitoring . 
-]]
+Kind = "service-intentions"
+Name = "vector-aggregator[[ .consul.suffix ]]"
+Sources = [
+  {
+    Name = "[[ $c.traefik.instance ]]"
+    Permissions = [
+      {
+        Action = "[[ $c.traefik.enabled | ternary "allow" "deny" ]]"
+        HTTP {
+          Methods = ["POST"]
+        }
+      }
+    ]
+  }
+]
diff --git a/example/.services.nomad.hcl.swp b/example/.services.nomad.hcl.swp
new file mode 100644
index 0000000..e69de29
diff --git a/example/.variables.yml.swp b/example/.variables.yml.swp
index 1e6e138b0e4049b0dea0e9e2eae1496020e9a742..b0d77bb59a65a1cf53bed80088cb84998cc61328 100644
GIT binary patch
diff --git a/example/agent.nomad.hcl b/example/agent.nomad.hcl
index ccfb727..c233b43 100644
--- a/example/agent.nomad.hcl
+++ b/example/agent.nomad.hcl
@@ -1,8 +1,11 @@
 job "monitoring-agent" {
 
+  datacenters = ["dc1"]
   region    = "global"
   node_pool = "all"
+  priority  = 60
+
   type = "system"
 
 
@@ -161,7 +164,7 @@ _EOT
       destination = "local/nomad-vector-logger.toml"
     }
 
-    # Disable the default nomad.toml template
+    # Disable the default nomad.toml template, as we provide our own nomad.yml template
     template {
       data        = "# Disable the default toml template"
       destination = "local/template/nomad.toml"
@@ -184,6 +187,8 @@ sources:
     mode: continue_through
     condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)"
     timeout_ms: 1000
+  ignore_older_secs: 1800
+  oldest_first: true
 {{- end }}
 
 
@@ -262,11 +267,13 @@ _EOT
       driver = "docker"
 
       config {
- image = "busybox:latest" - command = "sh" + image = "busybox:latest" + readonly_rootfs = true + pids_limit = 20 + command = "sh" args = [ "-c", - "echo 'Waiting for config ffile /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 2; done" + "echo 'Waiting for config file /alloc/data/vector_conf/nomad.yml to be generated'; until ls /alloc/data/vector_conf/nomad.yml >/dev/null 2>&1; do echo '.'; sleep 1; done" ] } @@ -289,8 +296,10 @@ _EOT leader = true config { - image = "danielberteaud/vector:0.36.1-1" - userns_mode = "host" + image = "danielberteaud/vector:0.36.1-1" + userns_mode = "host" + readonly_rootfs = true + pids_limit = 200 args = [ "--watch-config", "--config", "/local/vector.yml", @@ -331,6 +340,9 @@ _EOT } + + + # Main vector configuration template { data = <<_EOT data_dir: /data @@ -398,8 +410,8 @@ _EOT resources { cpu = 100 - memory = 192 - memory_max = 384 + memory = 384 + memory_max = 512 } } @@ -436,16 +448,17 @@ _EOT driver = "docker" config { - image = "quay.io/prometheus/node-exporter:latest" - pid_mode = "host" - #network_mode = "host" + image = "danielberteaud/node-exporter:1.7.0-1" + pid_mode = "host" userns_mode = "host" readonly_rootfs = true pids_limit = 50 + command = "/usr/local/bin/node_exporter" args = [ "--path.rootfs=/host", "--web.config.file=/local/tls.yml", - "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}" + "--web.listen-address=:${NOMAD_ALLOC_PORT_metrics}", + "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)", ] } @@ -477,6 +490,8 @@ _EOT } + + template { data = <<_EOT tls_server_config: diff --git a/example/consul/config/service-defaults/vector-aggregator.hcl b/example/consul/config/service-defaults/vector-aggregator.hcl new file mode 100644 index 0000000..9af7463 --- /dev/null +++ b/example/consul/config/service-defaults/vector-aggregator.hcl @@ -0,0 +1,3 @@ +Kind = "service-defaults" +Name = "vector-aggregator" +Protocol = "http" diff --git a/example/consul/config/service-intentions/vector-aggregator.hcl b/example/consul/config/service-intentions/vector-aggregator.hcl new file mode 100644 index 0000000..5e76613 --- /dev/null +++ b/example/consul/config/service-intentions/vector-aggregator.hcl @@ -0,0 +1,15 @@ +Kind = "service-intentions" +Name = "vector-aggregator" +Sources = [ + { + Name = "traefik" + Permissions = [ + { + Action = "allow" + HTTP { + Methods = ["POST"] + } + } + ] + } +] diff --git a/example/exporters.nomad.hcl b/example/exporters.nomad.hcl index 79fe485..035f046 100644 --- a/example/exporters.nomad.hcl +++ b/example/exporters.nomad.hcl @@ -411,7 +411,7 @@ _EOT resources { cpu = 10 - memory = 15 + memory = 20 } } diff --git a/example/images/node-exporter/Dockerfile b/example/images/node-exporter/Dockerfile new file mode 100644 index 0000000..4a7eb63 --- /dev/null +++ b/example/images/node-exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM danielberteaud/alpine:24.3-1 AS builder + +ARG EXPORTER_VERSION=1.7.0 + +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp + +RUN set -euxo pipefail &&\ + apk --no-cache add \ + curl \ + tar \ + ca-certificates \ + &&\ + cd /tmp &&\ + grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\ + tar xvzf 
node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter + +FROM danielberteaud/alpine:24.3-1 +MAINTAINER Daniel Berteaud + +COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter + +CMD ["/usr/local/bin/node_exporter"] diff --git a/example/services.nomad.hcl b/example/services.nomad.hcl index fcbd7c0..032dcdf 100644 --- a/example/services.nomad.hcl +++ b/example/services.nomad.hcl @@ -5,7 +5,7 @@ job "monitoring-services" { region = "global" - # Metrics is running prometheus and various exporters + # Metrics is running prometheus group "metrics-server" { shutdown_delay = "6s" @@ -67,7 +67,7 @@ job "monitoring-services" { type = "http" expose = true path = "/-/healthy" - interval = "15s" + interval = "20s" timeout = "8s" check_restart { limit = 10 @@ -77,11 +77,6 @@ job "monitoring-services" { tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-prometheus.entrypoints=https", - "traefik.http.routers.monitoring-prometheus.rule=Host(`prometheus.example.org`)", - "traefik.http.middlewares.csp-monitoring-prometheus.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-prometheus.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-prometheus", ] } @@ -892,6 +887,410 @@ _EOT left_delimiter = "{{{" right_delimiter = "}}}" } + template { + data = <<_EOT +groups: + +- name: EmbeddedExporter + + rules: + + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/loki.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } + template { + data = <<_EOT +groups: + +- 
name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of 
disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 0m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * 
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostCpuIsUnderutilized +# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 1w +# labels: +# severity: info +# annotations: +# summary: Host CPU is underutilized (instance {{ $labels.instance }}) +# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. 
Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitching + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostSwapIsFillingUp +# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 2m +# labels: +# severity: warning +# annotations: +# summary: Host swap is filling up (instance {{ $labels.instance }}) +# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface 
Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +_EOT + destination = "local/rules/node.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + } # A client cert, to connect to the AlertManager API template { @@ -945,8 +1344,11 @@ _EOT network { mode = "bridge" + # Port exposing the web API, with mTLS port "web-tls" {} + # Port used for gossip between the different alertmanager instance port "cluster" {} + # Port to expose metrics to prometheus port "metrics" {} } @@ -1031,101 +1433,10 @@ _EOT tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-alertmanager.entrypoints=https", - "traefik.http.routers.monitoring-alertmanager.rule=Host(`alerte.example.org`)", - "traefik.http.middlewares.csp-monitoring-alertmanager.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-alertmanager.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-alertmanager", ] } - - # The prometheus metrics proxy, adding mTLS to the metrics endpoint - task "metrics-proxy" { - driver = "docker" - user = 8995 - - config { - image = "nginxinc/nginx-unprivileged:alpine" - force_pull = true - volumes = [ - "local/default.conf:/etc/nginx/conf.d/default.conf:ro" - ] - pids_limit = 100 - } - - lifecycle { - hook = "poststart" - sidecar = true - } - - vault { - policies = ["metrics"] - } - - # Get a certificate from vault to protect the metrics endpoint - template { - data = <<_EOT -{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} -{{ .Cert }} -{{ .Key }} -{{- end }} -_EOT - destination = "secrets/metrics.bundle.pem" - } - - # Get the root CA - template { - data = <<_EOT -{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - - - template { - data = <<_EOT -server { - listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; - http2 on; - - ssl_certificate /secrets/metrics.bundle.pem; - ssl_certificate_key /secrets/metrics.bundle.pem; - ssl_client_certificate /local/monitoring.ca.pem; - ssl_verify_client on; - ssl_protocols TLSv1.2 TLSv1.3; - ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; - ssl_session_cache shared:SSL:10m; - ssl_session_timeout 1h; - ssl_session_tickets off; - gzip on; - gzip_types - text/plain; - gzip_vary on; - - server_tokens off; - - if ($request_method !~ ^(GET|HEAD)$ ) { - return 405; - } - location /metrics { - proxy_pass http://127.0.0.1:9093/metrics; - } -} -_EOT - destination = "local/default.conf" - } - - resources { - cpu = 10 - memory = 10 - memory_max = 20 - } - } - - - # This task will handle mTLS to the AlertManager API # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy task 
"untls-proxy" { @@ -1166,10 +1477,11 @@ _EOT template { data = <<_EOT +# UnTLS for the web API server { listen 127.0.0.1:9093; location / { - proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; @@ -1180,10 +1492,66 @@ server { } } +# Metrics proxy +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + location /metrics { + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" }}.monitoring.consul; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + } +} + + _EOT destination = "local/alertmanager.conf" } + # Get a certificate from vault to protect the metrics endpoint + template { + data = <<_EOT +{{- with pkiCert "pki/monitoring/issue/metrics" (printf "ip_sans=%s" (env "NOMAD_HOST_IP_metrics")) }} +{{ .Cert }} +{{ .Key }} +{{- end }} +_EOT + destination = "secrets/metrics.bundle.pem" + } + + # Get the root CA + template { + data = <<_EOT +{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} +_EOT + destination = "local/monitoring.ca.pem" + } + + # Certifiate used by AlertManager template { data = <<_EOT @@ -1203,14 +1571,6 @@ _EOT change_signal = "SIGHUP" } - # The trusted CA - template { - data = <<_EOT -{{ with secret "pki/monitoring/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - resources { cpu = 10 memory = 18 @@ -1300,7 +1660,7 @@ set -euo pipefail exec alertmanager \ --config.file=/secrets/alertmanager.yml \ --storage.path=/data \ - --web.external-url=https://alerte.example.org \ + --web.external-url=https://alert.example.org \ --web.route-prefix=/ \ --web.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_web-tls" }} \ --cluster.listen-address=0.0.0.0:{{ env "NOMAD_ALLOC_PORT_cluster" }} \ @@ -1430,11 +1790,6 @@ _EOT tags = [ - "traefik.enable=true", - "traefik.http.routers.monitoring-loki.entrypoints=https", - "traefik.http.routers.monitoring-loki.rule=Host(`loki.example.org`)", - "traefik.http.middlewares.csp-monitoring-loki.headers.contentsecuritypolicy=default-src 'self';font-src 'self' data:;img-src 'self' data:;script-src 'self' 'unsafe-inline' 'unsafe-eval';style-src 'self' 'unsafe-inline';", - "traefik.http.routers.monitoring-loki.middlewares=security-headers@file,rate-limit-std@file,forward-proto@file,inflight-std@file,hsts@file,compression@file,csp-monitoring-loki", ] } @@ -2048,7 +2403,7 @@ server { return 405; } location /metrics { - proxy_pass http://localhost:3000/metrics; + 
proxy_pass http://127.0.0.1:3000/metrics; } } _EOT @@ -2132,7 +2487,6 @@ _EOT # Use a template block instead of env {} so we can fetch values from vault template { data = <<_EOT -GF_SECURITY_ADMIN_PASSWORD={{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }} LANG=fr_FR.utf8 TZ=Europe/Paris _EOT @@ -2142,6 +2496,15 @@ _EOT } + template { + data = <<_EOT +GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "kv/service/monitoring/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' +_EOT + destination = "secrets/.grafana.env" + perms = 400 + env = true + } + # Basic grafana configuration file template { data = <<_EOT diff --git a/images/node-exporter/Dockerfile b/images/node-exporter/Dockerfile new file mode 100644 index 0000000..05f2cb4 --- /dev/null +++ b/images/node-exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] AS builder + +ARG EXPORTER_VERSION=[[ .monitoring.agent.node_exporter.version ]] + +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz /tmp +ADD https://github.com/prometheus/node_exporter/releases/download/v${EXPORTER_VERSION}/sha256sums.txt /tmp + +RUN set -euxo pipefail &&\ + apk --no-cache add \ + curl \ + tar \ + ca-certificates \ + &&\ + cd /tmp &&\ + grep node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz sha256sums.txt | sha256sum -c &&\ + tar xvzf node_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz &&\ + mv node_exporter-${EXPORTER_VERSION}.linux-amd64/node_exporter /usr/local/bin/node_exporter + +FROM [[ .docker.repo ]][[ .docker.base_images.alpine.image ]] +MAINTAINER [[ .docker.maintainer ]] + +COPY --from=builder --chown=root:root --chmod=755 /usr/local/bin/node_exporter /usr/local/bin/node_exporter + +CMD ["/usr/local/bin/node_exporter"] diff --git a/services.nomad.hcl b/services.nomad.hcl index a5b4c0a..cdb2cf1 100644 --- a/services.nomad.hcl +++ b/services.nomad.hcl @@ -2,7 +2,7 @@ job "[[ .instance ]]-services" { [[ template "common/job_start" . ]] - # Metrics is running prometheus and various exporters + # Metrics is running prometheus group "metrics-server" { [[- $c := merge .monitoring.prometheus .monitoring . 
]] @@ -28,7 +28,7 @@ job "[[ .instance ]]-services" { type = "http" expose = true path = "/-/healthy" - interval = "15s" + interval = "20s" timeout = "8s" check_restart { limit = 10 @@ -168,8 +168,11 @@ _EOT network { mode = "bridge" + # Port exposing the web API, with mTLS port "web-tls" {} + # Port used for gossip between the different alertmanager instance port "cluster" {} + # Port to expose metrics to prometheus port "metrics" {} } @@ -220,8 +223,6 @@ _EOT ] } -[[ template "common/task.metrics_proxy" $c ]] - # This task will handle mTLS to the AlertManager API # And expose it as plain http on 127.0.0.1 for Traefik (through the service mesh) and for the metrics proxy task "untls-proxy" { @@ -253,6 +254,8 @@ _EOT destination = "local/alertmanager.conf" } +[[ template "common/metrics_cert" $c ]] + # Certifiate used by AlertManager template { data = <<_EOT @@ -272,14 +275,6 @@ _EOT change_signal = "SIGHUP" } - # The trusted CA - template { - data = <<_EOT -{{ with secret "[[ $c.vault.pki.path ]]/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} -_EOT - destination = "local/monitoring.ca.pem" - } - resources { cpu = 10 memory = 18 @@ -617,6 +612,15 @@ _EOT [[ template "common/vault.policies" $c ]] [[ template "common/file_env" $c ]] + template { + data = <<_EOT +GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' +_EOT + destination = "secrets/.grafana.env" + perms = 400 + env = true + } + # Basic grafana configuration file template { data = <<_EOT diff --git a/templates/agent/vector-template.yml b/templates/agent/vector-template.yml index 45d225d..37f8e79 100644 --- a/templates/agent/vector-template.yml +++ b/templates/agent/vector-template.yml @@ -12,6 +12,8 @@ sources: mode: continue_through condition_pattern: "(^([\\s]+at\\s|Caused by:\\s)|common frames omitted$)" timeout_ms: 1000 + ignore_older_secs: 1800 + oldest_first: true {{- end }} diff --git a/templates/alertmanager/nginx.conf b/templates/alertmanager/nginx.conf index a27d027..1b35167 100644 --- a/templates/alertmanager/nginx.conf +++ b/templates/alertmanager/nginx.conf @@ -1,7 +1,8 @@ +# UnTLS for the web API server { listen 127.0.0.1:9093; location / { - proxy_pass https://localhost:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; proxy_ssl_certificate /secrets/alertmanager.bundle.pem; proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; proxy_ssl_verify on; @@ -11,3 +12,39 @@ server { deny all; } } + +# Metrics proxy +server { + listen {{ env "NOMAD_ALLOC_PORT_metrics" }} ssl; + http2 on; + + ssl_certificate /secrets/metrics.bundle.pem; + ssl_certificate_key /secrets/metrics.bundle.pem; + ssl_client_certificate /local/monitoring.ca.pem; + ssl_verify_client on; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1h; + ssl_session_tickets off; + gzip on; + gzip_types + text/plain; + gzip_vary on; + + server_tokens off; + + if ($request_method !~ ^(GET|HEAD)$ ) { + return 405; + } + + location /metrics { + proxy_ssl_certificate /secrets/alertmanager.bundle.pem; + proxy_ssl_certificate_key /secrets/alertmanager.bundle.pem; + proxy_ssl_verify on; + proxy_ssl_name alertmanager-{{ env "NOMAD_ALLOC_INDEX" 
}}.[[ .instance ]].[[ .consul.domain ]]; + proxy_ssl_trusted_certificate /local/monitoring.ca.pem; + proxy_pass https://127.0.0.1:{{ env "NOMAD_ALLOC_PORT_web-tls" }}; + } +} + diff --git a/templates/prometheus/rules/loki.yml b/templates/prometheus/rules/loki.yml new file mode 100644 index 0000000..077036a --- /dev/null +++ b/templates/prometheus/rules/loki.yml @@ -0,0 +1,41 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/templates/prometheus/rules/node.yml b/templates/prometheus/rules/node.yml new file mode 100644 index 0000000..df796c9 --- /dev/null +++ b/templates/prometheus/rules/node.yml @@ -0,0 +1,347 @@ +groups: + +- name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 0m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostCpuIsUnderutilized +# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 1w +# labels: +# severity: info +# annotations: +# summary: Host CPU is underutilized (instance {{ $labels.instance }}) +# description: "CPU load is < 20% for 1 week. 
Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitching + expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# - alert: HostSwapIsFillingUp +# expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 2m +# labels: +# severity: warning +# annotations: +# summary: Host swap is filling up (instance {{ $labels.instance }}) +# description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node 
temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: 
'(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/variables.yml b/variables.yml index 46e2d82..77ddff7 100644 --- a/variables.yml +++ b/variables.yml @@ -76,179 +76,284 @@ monitoring: # - https://portal.acme.com http_probes: [] - # Consul exporter will expose consul metrics + # Consul exporter will expose consul metrics (mainly registered services status) consul: + # Version of the exporter version: 0.11.0 + # Docker image to use image: '[[ .docker.repo ]]consul-exporter:[[ .monitoring.exporters.consul.version ]]-2' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 20 memory: 32 vault: + # Vault policies to attach policies: - 'consul-exporter[[ .consul.suffix ]]' + # The cluster exporter is a simple nginx used as a proxy + # which handles TLS for the cluster services (vault, consul and nomad) cluster: + # Docker image to use image: nginxinc/nginx-unprivileged:alpine + # Custom env env: {} + # Resource allocation resources: cpu: 10 - memory: 15 + memory: 20 vault: + # Vault policies to attach to the task policies: - 'cluster-exporter[[ .consul.suffix ]]' - - metrics + - metrics[[ .consul.suffix ]] + # The prometheus server prometheus: - - version: 2.51.0 - + # Number of instances to run. Note that if you run several instances, they will be independent, and all of + # them will scrape the same data. Then queries to the prometheus API will be load-balanced between all instances. + # This should work most of the time, but can give some strange results if, eg, one of the instances was down (queries + # for data during the downtime can give some random results depending on the instance your query is routed to) count: 1 - + # Version of prometheus + version: 2.51.0 + # Docker image to use image: '[[ .docker.repo ]]prometheus:[[ .monitoring.prometheus.version ]]-1' - + # Custom env var to set env: {} - + # Resource allocation resources: cpu: 200 memory: 512 - + # Volumes used for data persistence + # You must create a prometheus-data[0] volume as it's a per_alloc volume volumes: data: type: csi source: 'prometheus-data' per_alloc: true - vault: + # Vault policies to attach to the task policies: - 'prometheus[[ .consul.suffix ]]' - + # A dict of custom jobs. Eg + # jobs: + # squid: + # targets: + # - 10.11.2.3:9305 + # - 192.168.6.20:782 jobs: {} - alert_rules: {} + # A dict of alert rules. Some alert rules are provided with this bundle, but you can load yours by downloading them when prometheus starts. Eg # alert_rules: # postgres: # url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml - + # patroni: + # url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml + # If you need something more flexible (like downloading an archive of rules and uncompressing it), you should use artifacts instead.
Just ensure your rules + # are in /local/rules/ inside the container + alert_rules: {} + # The public URL where prometheus will be reachable (if exposed with Traefik) public_url: https://prometheus.example.org + # Traefik settings traefik: - enabled: true + # Turn this on to expose prometheus with Traefik + # Caution : there's no builtin security, you should configure the appropriate middlewares + enabled: false router: prometheus - + # Metrics retention duration retention: 30d - + # always enable prometheus metrics (of course :-) ) prometheus: - enabled: true + # This is the URL where metrics are exposed, which the metrics proxy will point at (from the container PoV) metrics_url: http://localhost:9090/metrics + # AlertManager can process and send alerts alertmanager: + # Number of instances to run. Set > 1 if you want HA count: 1 + # Version of alertmanager version: 0.27.0 + # Docker image to use image: '[[ .docker.repo ]]alertmanager:[[ .monitoring.alertmanager.version ]]-1' + # Custom env var to set in the container env: {} + # Resource allocation resources: cpu: 50 memory: 64 memory_max: 80 - public_url: https://alerte.example.org + # URL where the web interface is reachable (if exposed with Traefik) + public_url: https://alert.example.org + # Traefik settings traefik: - enabled: true + # Turn this on to expose alertmanager with Traefik + # Caution : there's no builtin security, you should configure appropriate middlewares before enabling + enabled: false router: alertmanager + # No need to strip prefix as alertmanager will be configured to handle it strip_prefix: false + # Volumes used for data persistence. Note : it's a per_alloc volume + # so you need to create eg alertmanager-data[0]. This volume should be writeable by the user with ID 9093 volumes: data: source: 'alertmanager-data' type: csi per_alloc: true - prometheus: - metrics_url: http://127.0.0.1:9093/metrics vault: + # List of vault policies to attach to the task policies: - - metrics - - 'alertmanager[[ .consul.suffix ]]' + - metrics[[ .consul.suffix ]] + - alertmanager[[ .consul.suffix ]] + # Email settings email: from: alertmanager@[[ .consul.domain ]] + # You can merge your own custom config with the default provided one. Eg + # custom_config: + # receivers: + # - name: dani + # email_configs: + # - to: dani@example.org + # route: + # group_by: ['alertname', 'cluster', 'job'] + # receiver: dani custom_config: {} + # Loki is the log server loki: + # Version of loki version: 2.9.6 + # Docker image to use image: '[[ .docker.repo ]]loki:[[ .monitoring.loki.version ]]-1' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 150 memory: 512 vault: + # Vault policies to attach to the task policies: - 'loki[[ .consul.suffix ]]' + # URL where loki is exposed (if enabled) public_url: https://loki.example.org + # Traefik settings traefik: + # Turn it on to expose Loki with Traefik + # Caution : there's no builtin security, you should add appropriate Traefik middlewares enabled: false router: loki + # Retention for logs. Older logs will be deleted retention: 720h # 1 month + # Custom configuration which will be merged on top of the default one custom_config: {} prometheus: + # URL where metrics are available for the metrics proxy (from inside the container PoV) metrics_url: http://localhost:3100/metrics + # Volumes for data persistence.
Should be writable for user id 3100 + volumes: data: type: csi source: 'loki-data' + # Common vector settings vector: + # Version of vector version: 0.36.1 + # Docker image to use image: '[[ .docker.repo ]]vector:[[ .monitoring.vector.version ]]-1' + # Vector aggregator can be used to ingest logs from external devices (using syslog or fluentd) + # Logs will then be forwarded to loki aggregator: + # Number of instances count: 1 + # Docker image to use image: '[[ .monitoring.vector.image ]]' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 100 memory: 192 consul: connect: upstreams: + # Connect to loki through the service mesh - destination_name: 'loki[[ .consul.suffix ]]' local_bind_port: 3100 vault: + # Vault policies to attach to the task. + # Note : vector can expose its metrics with mTLS natively, so we do not add a metrics_proxy task + # but we need to grant the metrics policy to the vector task instead policies: - metrics[[ .consul.suffix ]] + # Fluentd source settings fluentd: enabled: false traefik: router: fluentd entrypoints: - fluentd + # Syslog (udp) source settings syslog_udp: enabled: false traefik: router: syslog-udp entrypoints: - - syslog + - syslog-udp + # Syslog (tcp) source settings + syslog_tcp: + enabled: false + traefik: + router: syslog-tcp + entrypoints: + - syslog-tcp + # Native vector (http) source settings vector: enabled: true + # URL where the vector endpoint is available from the outside (if exposed with Traefik) public_url: https://vector.example.org traefik: + # Set to true if you want to expose the service with Traefik + # Caution : there's no builtin security, you should configure appropriate middlewares before enabling it enabled: false + # Grafana settings grafana: + # Grafana version version: 10.4.1 + # Docker image to use image: '[[ .docker.repo ]]grafana:[[ .monitoring.grafana.version ]]-1' - env: - GF_SECURITY_ADMIN_PASSWORD: '{{ with secret "[[ .vault.root ]]kv/service/[[ .instance ]]/grafana" }}{{ .Data.data.initial_admin_pwd }}{{ end }}' + # Custom env var to set in the container + env: {} + # Resource allocation resources: cpu: 100 memory: 256 + # URL where Grafana is reachable public_url: https://grafana.example.org + # List of plugins to install. Note : plugins are installed at image build time, so you need to rebuild + # the image if you want to update them plugins: - #- alexanderzobnin-zabbix-app - #- ddurieux-glpi-app - grafana-clock-panel - grafana-piechart-panel + # Dict of feature toggles.
# See https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/feature-toggles/ + # Example: + # feature_toggles: + # featureToggleAdminPage: true + # ssoSettingsApi: true feature_toggles: {} + # Traefik settings traefik: enabled: true router: grafana + # No need to strip prefix as Grafana will be configured to handle it correctly strip_prefix: false consul: connect: + # Connect to postgres, loki and prometheus with the service mesh upstreams: - destination_name: postgres[[ .consul.suffix ]] local_bind_port: 5432 @@ -256,16 +361,20 @@ monitoring: local_bind_port: 3100 - destination_name: prometheus[[ .consul.suffix ]] local_bind_port: 9090 + # Volumes for data persistence volumes: data: type: csi source: 'grafana-data' vault: + # Vault policies to attach to the task policies: - 'grafana[[ .consul.suffix ]]' + # Postgres DB settings database: role: grafana pgrole: grafana + # Override some default postgres handling postgres: database: grafana user: '{{ with secret "[[ .vault.root ]]database/creds/grafana" }}{{ .Data.username }}{{ end }}' @@ -273,64 +382,131 @@ monitoring: pooler: mode: session prometheus: - metrics_url: http://localhost:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics + # URL where Grafana metrics are reachable for the metrics proxy (from inside the container PoV) + metrics_url: http://127.0.0.1:3000[[ (urlParse .monitoring.grafana.public_url).Path ]]/metrics + # Agent runs as a system job, on all the nodes agent: consul: meta: + # Override the alloc service meta, the hostname will be more useful than the alloc index (0) alloc: '${node.unique.name}' + # Nomad settings nomad: + # Run on all node pools node_pool: all + # Run with an above average priority + priority: 60 + + # Nomad vector logger is a small container which will query the Nomad API to discover running allocations on the current node, + # then generate a vector configuration with scraping for all the discovered allocations. nomad_vector_logger: - version: 24.3 - image: '[[ .docker.repo ]]nomad-vector-logger:[[ .monitoring.agent.nomad_vector_logger.version ]]-2' + # Docker image to use + image: '[[ .docker.repo ]]nomad-vector-logger:24.3-2' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 20 memory: 24 memory_max: 50 vault: + # Vault policies to attach to the task policies: - nomad-vector-logger[[ .consul.suffix ]] + + # Vector is the main task. It'll read its config created by nomad-vector-logger and will read log files + # accordingly, add useful metadata (like node, job, group, task, alloc etc.) and push logs to loki vector: + # Docker image to use image: '[[ .monitoring.vector.image ]]' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 100 - memory: 192 - memory_max: 384 + memory: 384 + memory_max: 512 vault: + # Vault policies to attach to the container. Vector can use mTLS on the metrics endpoint natively, so + # there's no need to add a metrics_proxy task. Instead, we grant the metrics policy to vector so it can get + # a certificate from vault policies: - metrics[[ .consul.suffix ]] consul: connect: upstreams: + # Connect to loki with the service mesh - destination_name: loki[[ .consul.suffix ]] local_bind_port: 3100 + # Volumes for data persistence volumes: + # The nomad volume should expose the Nomad alloc dir (eg /opt/nomad/data/alloc) where vector will be able + # to read the logs. You should create a host volume in the Nomad client config of all your nodes.
Eg + # client { + # enabled = true + # host_volume "nomad_alloc" { + # path = "/opt/nomad/data/alloc" + # read_only = "true" + # } + # } nomad: type: host source: nomad_alloc read_only: true + # The data volume will be used by vector for buffering (in case loki is unavailable) + # You can create a host volume in Nomad's client config, eg + # client { + # enabled = true + # host_volume "vector_data" { + # path = "/data/vector-agent" + # } + # } data: type: host source: vector_data + # The node exporter can be used to expose the host metrics to prometheus node_exporter: - image: quay.io/prometheus/node-exporter:latest + # Is the node exporter enabled ? (set to false if you don't want it, or if you + # already manage the node-exporter separately) + enabled: true + # Version of the exporter + version: 1.7.0 + # Docker image to use + image: '[[ .docker.repo ]]node-exporter:[[ .monitoring.agent.node_exporter.version ]]-1' + # Custom env to set in the container env: {} + # Resource allocation resources: cpu: 50 memory: 24 memory_max: 32 vault: + # Vault policies to attach to the task + # This exporter can handle mTLS itself, so no need to create a metrics_proxy task. Instead, grant the metrics policy + # so it can get a certificate from vault policies: - metrics[[ .consul.suffix ]] + # Args to add to the exporter on start + args: + - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/(docker|containers)/.+|opt/nomad/data/(alloc|client))($|/)' + # Volumes volumes: + # The exporter should access the host root filesystem + # For this, you should create a host volume in Nomad's client config, eg + # client { + # enabled = true + # host_volume "host_root" { + # path = "/" + # read_only = true + # } + # } host: type: host source: host_root read_only: true +# Globally enable prometheus for this bundle :-) prometheus: enabled: true
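
The variables above are defaults meant to be overridden per deployment. Below is a minimal, illustrative sketch of a deployment-specific variables file layered over variables.yml; the public URL and scrape target are placeholders, and it assumes the bundle deep-merges such a file over these defaults, as the example/ directory suggests.

monitoring:
  prometheus:
    # Placeholder URL for this deployment
    public_url: https://prometheus.acme.com
    traefik:
      # Expose prometheus with Traefik (remember to add your own auth middlewares first)
      enabled: true
    # Extra scrape jobs, merged with the ones generated by the bundle (placeholder target)
    jobs:
      squid:
        targets:
          - 10.11.2.3:9305
    # Additional alert rules downloaded when prometheus starts
    alert_rules:
      postgres:
        url: https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
  agent:
    node_exporter:
      # Disable the bundled node-exporter if the hosts already run one
      enabled: false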