job "postgres-server" { datacenters = ["dc1"] region = "global" priority = 80 group "server" { count = 1 shutdown_delay = "6s" constraint { operator = "distinct_hosts" value = "true" } update { healthy_deadline = "2h" progress_deadline = "3h" } ephemeral_disk { # Use minimal ephemeral disk size = 101 } network { mode = "bridge" # Patroni API for nodes to check each others port "patroni" { to = 8080 } # When running with patroni, nodes must reach each others postgres service, so we expose a port port "postgres" { to = 5432 } } service { name = "postgres" port = 5432 meta { alloc = "${NOMAD_ALLOC_INDEX}" datacenter = "${NOMAD_DC}" group = "${NOMAD_GROUP_NAME}" job = "${NOMAD_JOB_NAME}" namespace = "${NOMAD_NAMESPACE}" node = "${node.unique.name}" region = "${NOMAD_REGION}" } connect { sidecar_service { } sidecar_task { logs { disabled = false } config { args = [ "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", "-l", "${meta.connect.log_level}", "--concurrency", "${meta.connect.proxy_concurrency}", "--disable-hot-restart" ] } resources { cpu = 50 memory = 64 } } } tags = [ "postgres-${NOMAD_ALLOC_INDEX}", # Note : we don't add traefik.enable=true # This will be done dynamically only on the current master node using the update_tags.sh script "traefik.tcp.routers.postgres.rule=HostSNI(`*`)", "traefik.tcp.routers.postgres.tls=true", "traefik.tcp.routers.postgres.entrypoints=postgres", ] # Use patroni health endpoint to verify postgres status check { name = "healthy" type = "http" port = "patroni" path = "/health" protocol = "https" # Patroni REST API is using a cert from a private CA tls_skip_verify = true interval = "30s" timeout = "5s" } # This check will ensure the current role is published in Consul tags (if the callback during a role change failed for example) check { name = "tags" type = "script" command = "/local/update_tags.sh" task = "postgres" interval = "30s" timeout = "5s" } check { name = "ready" type = "script" interval = "30s" timeout = "5s" task = "postgres" command = "pg_isready" } # Patroni will run a script to update the tags (master / replica) enable_tag_override = true } volume "backup" { source = "postgres-backup" type = "csi" access_mode = "multi-node-multi-writer" attachment_mode = "file-system" } volume "data" { source = "postgres-data" type = "csi" access_mode = "single-node-writer" attachment_mode = "file-system" per_alloc = true } task "postgres" { driver = "docker" leader = true kill_timeout = "10m" config { image = "danielberteaud/patroni:15-3.3.0-2" # Set shm_size to half of the total size shm_size = 536870912 volumes = ["local/mkdir-socket.sh:/entrypoint.d/70-mkdir-socket.sh"] command = "patroni" args = ["/secrets/patroni.yml"] pids_limit = 700 } vault { policies = ["postgres"] env = false disable_file = true change_mode = "noop" } # Use a template block instead of env {} so we can fetch values from vault template { data = <<_EOT LANG=fr_FR.utf8 PGBACKREST_STANZA=postgres TZ=Europe/Paris _EOT destination = "secrets/.env" perms = 400 env = true } template { data = <<_EOT # Get a Consul token from vault, so we're able to update the tags in Consul from the containers CONSUL_HTTP_TOKEN={{ with secret "consul/creds/postgres" }}{{ .Data.token }}{{ end }} PATRONICTL_CONFIG_FILE=/secrets/patroni.yml _EOT destination = "secrets/pg.env" uid = 100000 gid = 100026 perms = 440 change_mode = "noop" env = true } # Scripts to update tags attached to the service in consul catalog # with either master or replica template { data = <<_EOT def nest_service_params: # Select all objects with keys that start with the the name 'Service' # Remove 'Service' prefix from key names with_entries(select(.key | startswith("Service")) | .key = (.key | sub("^Service"; ""))) ; def create_svc_txn: # Select our own instance select(.ServiceTags[] | contains($mytag)) # Add needed tags, remove unwanted ones | .ServiceTags -= ["master"] | .ServiceTags -= ["replica"] | .ServiceTags -= ["uninitialized"] | .ServiceTags -= ["traefik.enable=true"] | .ServiceTags += [$role] | if $role == "master" then .ServiceTags += ["traefik.enable=true"] else .ServiceTags -= ["traefik.enable=true"] end # Rename 'ServiceName' to 'ServiceService' # Will be transformed to 'Service.Service later' | . + {ServiceService: .ServiceName} | del(.ServiceName) | nest_service_params as $nested_params # Create transaction to update services using check-and-set option. # If service has been modified since our last read, or no longer exists, the # transaction will fail. | { Service: { Verb: "cas", Node: .Node, # Nest service-related parameters under 'Service' key # Add ModfyIndex key Service: ($nested_params + {ModifyIndex: .ModifyIndex}) } } ; . | map(create_svc_txn) _EOT destination = "local/serviceformat.jq" change_mode = "noop" } template { data = <<_EOT #!/bin/sh # vim: syntax=sh set -eo pipefail EVENT=$1 NEW_ROLE=$2 if [ -z "${NEW_ROLE}" ]; then # If new role is not given as argument, fetch it from patroni NEW_ROLE=$(curl --silent --insecure https://localhost:${NOMAD_PORT_patroni}/health | jq -r .role) fi # Source env file to get our consul token source /secrets/pg.env # translate promoted = master and demoted = recplica if [ "${NEW_ROLE}" = "promoted" ]; then NEW_ROLE="master" elif [ "${NEW_ROLE}" = "demoted" ]; then NEW_ROLE="replica" fi CURL_OPTS="--connect-timeout 5 --max-time 10 --retry 5 --retry-delay 1 --retry-max-time 40 --retry-connrefused --silent" # Update tags on the main service SERVICE_HAS_TAG=$(curl ${CURL_OPTS} \ -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" \ http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/catalog/service/postgres |\ jq ".[] | select( .ServiceTags as \$tags | \"postgres-{{ env "NOMAD_ALLOC_INDEX" }}\" | IN(\$tags[]) ) | .ServiceTags | any(.==\"${NEW_ROLE}\")") if [ "${SERVICE_HAS_TAG}" = "false" ]; then echo "Updating tags for the main service" curl ${CURL_OPTS} -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/catalog/service/postgres |\ jq --from-file /local/serviceformat.jq --arg role "${NEW_ROLE}" --arg mytag postgres-{{ env "NOMAD_ALLOC_INDEX" }} |\ curl ${CORL_OPTS} -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" -X PUT -d @- http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/txn > /dev/null else echo "Main service already has the expected ${NEW_ROLE} tag" fi # Update tags on the sidecar service (connect-proxy) SIDECAR_HAS_TAG=$(curl ${CURL_OPTS} \ -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" \ http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/catalog/service/postgres-sidecar-proxy |\ jq ".[] | select( .ServiceTags as \$tags | \"postgres-{{ env "NOMAD_ALLOC_INDEX" }}\" | IN(\$tags[]) ) | .ServiceTags | any(.==\"${NEW_ROLE}\")") if [ "${SIDECAR_HAS_TAG}" = "false" ]; then echo "Updating tags for the sidecar" curl ${CURL_OPTS} -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/catalog/service/postgres-sidecar-proxy |\ jq --from-file /local/serviceformat.jq --arg role "${NEW_ROLE}" --arg mytag postgres-{{ env "NOMAD_ALLOC_INDEX" }} |\ curl ${CURL_OPTS} -H "X-Consul-Token: ${CONSUL_HTTP_TOKEN}" -X PUT -d @- http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500/v1/txn > /dev/null else echo "Sidecar service already has the expected ${NEW_ROLE} tag" fi _EOT destination = "local/update_tags.sh" perms = 755 change_mode = "noop" } # A small entrypoint scriptlet to ensure /alloc/data/postgres dir exists template { data = <<_EOT #!/bin/sh set -eu mkdir -p /alloc/data/postgres _EOT destination = "local/mkdir-socket.sh" perms = 755 } # Patroni main configuration file template { data = <<_EOT name: postgres-{{ env "NOMAD_ALLOC_INDEX" }} scope: postgres consul: url: http://{{ sockaddr "GetInterfaceIP \"nomad\"" }}:8500 token: {{ with secret "consul/creds/postgres" }}{{ .Data.token }}{{ end }} bootstrap: dcs: synchronous_mode: False initdb: - data-checksum - encoding: UTF-8 post_bootstrap: /local/create_users.sh postgresql: create_replica_methods: - basebackup callbacks: on_role_change: /local/update_tags.sh on_start: /local/update_tags.sh connect_address: {{ env "NOMAD_HOST_ADDR_postgres" }} bin_dir: /usr/pgsql-15/bin data_dir: /data/db/15 listen: 0.0.0.0:{{ env "NOMAD_ALLOC_PORT_postgres" }} use_pg_rewind: True #remove_data_directory_on_rewind_failure: True pg_hba: - local all postgres peer - local replication postgres peer - local all postgres scram-sha-256 - host all all 127.0.0.0/8 scram-sha-256 - host replication backup 127.0.0.0/8 scram-sha-256 - hostssl replication replicator 0.0.0.0/0 cert clientcert=verify-full map=patroni-map - hostssl postgres rewind 0.0.0.0/0 cert clientcert=verify-full map=patroni-map - hostssl all all 0.0.0.0/0 cert clientcert=verify-full pg_ident: - patroni-map postgres.service.consul postgres - patroni-map postgres.service.consul replicator - patroni-map postgres.service.consul rewind parameters: ssl: on ssl_cert_file: /secrets/postgres.bundle.pem ssl_key_file: /secrets/postgres.bundle.pem ssl_ca_file: /local/postgres.ca.pem #ssl_crl_file: /local/postgres.crl.pem # Add a socket in /alloc/data/postgres # so other tasks in the same group can reach it unix_socket_directories: /run/postgresql, /alloc/data/postgres autovacuum_analyze_scale_factor: 0.05 autovacuum_analyze_threshold: 500 autovacuum_vacuum_scale_factor: 0.1 autovacuum_vacuum_threshold: 500 datestyle: ISO, DMY log_connections: on log_destination: stderr log_directory: /proc/1/fd log_disconnections: on log_filename: 1 log_line_prefix: '[%m] u=%u,d=%d,a=%a,c=%h,xid=%x ' log_min_duration_statement: 2000 log_statement: ddl log_timezone: {{ env "TZ" }} maintenance_work_mem: 51MB shared_buffers: 512MB timezone: {{ env "TZ" }} wal_compression: zstd wal_keep_size: 512 work_mem: 10MB recovery_conf: authentication: superuser: username: postgres password: '{{ with secret "kv/service/postgres" }}{{ .Data.data.pg_pwd }}{{ end }}' sslmode: verify-ca sslrootcert: /local/postgres.ca.pem replication: username: replicator sslmode: verify-ca sslrootcert: /local/postgres.ca.pem sslcert: /secrets/postgres.bundle.pem sslkey: /secrets/postgres.bundle.pem rewind: username: rewind sslmode: verify-ca sslrootcert: /local/postgres.ca.pem sslcert: /secrets/postgres.bundle.pem sslkey: /secrets/postgres.bundle.pem restapi: connect_address: {{ env "NOMAD_HOST_ADDR_patroni" }} listen: 0.0.0.0:{{ env "NOMAD_ALLOC_PORT_patroni" }} keyfile: /secrets/postgres.bundle.pem certfile: /secrets/postgres.bundle.pem cafile: /local/postgres.ca.pem verify_client: optional authentication: username: patroni password: '{{ with secret "kv/service/postgres" }}{{ .Data.data.api_pwd }}{{ end }}' ctl: insecure: False keyfile: /secrets/postgres.bundle.pem certfile: /secrets/postgres.bundle.pem cafile: /local/postgres.ca.pem watchdog: mode: off _EOT destination = "secrets/patroni.yml" perms = "0400" uid = 100026 gid = 100026 change_mode = "signal" change_signal = "SIGHUP" } # Post bootstrap script, to create users template { data = <<_EOT #!/bin/sh set -euo pipefail # Create roles needed for patroni {{ with secret "kv/service/postgres" }} psql <<'_EOSQL' ALTER ROLE postgres WITH SUPERUSER LOGIN PASSWORD '{{ .Data.data.pg_pwd }}'; CREATE ROLE replicator WITH LOGIN REPLICATION PASSWORD '{{ .Data.data.replicator_pwd }}'; CREATE ROLE rewind WITH LOGIN PASSWORD '{{ .Data.data.rewind_pwd }}'; CREATE ROLE vault WITH LOGIN SUPERUSER PASSWORD '{{ .Data.data.vault_initial_pwd }}'; CREATE ROLE monitor WITH LOGIN PASSWORD '{{ .Data.data.monitor_pwd }}'; GRANT "pg_monitor" TO "monitor"; _EOSQL {{ end }} _EOT destination = "secrets/create_users.sh" perms = "0750" uid = 100026 gid = 100026 change_mode = "noop" } # Post bootstrap wrapper, as /secrets is mounted with noexec template { data = <<_EOT #!/bin/sh set -euo pipefail sh /secrets/create_users.sh _EOT destination = "local/create_users.sh" perms = "0750" uid = 100026 gid = 100026 change_mode = "noop" } # Obtain a certificate from Vault template { data = <<_EOT {{ with pkiCert "pki/postgres/issue/postgres-server" "common_name=postgres.service.consul" (printf "alt_name=%s.postgres.service.consul" (env "NOMAD_ALLOC_INDEX")) (printf "ip_sans=%s" (env "NOMAD_IP_patroni")) "ttl=72h" }} {{ .Cert }} {{ .Key }} {{ end }} _EOT destination = "secrets/postgres.bundle.pem" perms = "0400" uid = 100026 gid = 100026 change_mode = "signal" change_signal = "SIGHUP" } # CA certificate chains template { data = <<_EOT {{ with secret "pki/postgres/cert/ca_chain" }}{{ .Data.ca_chain }}{{ end }} _EOT destination = "local/postgres.ca.pem" change_mode = "signal" change_signal = "SIGHUP" } # Mount the persistent volume in /data volume_mount { volume = "data" destination = "/data" } # Mount the backup volume (which can be used for PITR with pgbackrest) volume_mount { volume = "backup" destination = "/backup" } resources { cpu = 1000 memory = 1024 } } } }