Cadence job file for the HashiStack (Consul and Nomad)
# see files 3 to 6 to run each server service component individually.
# this comes in handy when you must allocate resources and/or measure
# metrics more precisely.
#
# individual service jobs can be mixed with cadence-server.nomad.hcl
# on the same cluster without conflict.
#
# currently requires you to run the cassandra schema migrations manually
# (run 1 job with the 0.11.0-auto-setup image; see the sketch after the
# jobs below).
job "cadence-server" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 60

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator  = "set_contains"
  #   value     = "cadence-services"
  # }

  group "cadence-server" {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task "cadence-server" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image        = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes      = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-frontend"
        port = "frontend"
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-history"
        port = "history"
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-matching"
        port = "matching"
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-worker"
        port = "worker"
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-server"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS       = 4
        LOG_LEVEL                = "info"
        SERVICES                 = "frontend,history,matching,worker"
        BIND_ON_IP               = "${NOMAD_IP_frontend}"
        CASSANDRA_SEEDS          = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB                       = "cassandra"
        RF                       = 3
        KEYSPACE                 = "cadence"
        VISIBILITY_KEYSPACE      = "cadence_visibility"
        SKIP_SCHEMA_SETUP        = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE   = "dns"
        RINGPOP_SEEDS            = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT      = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data        = <<EOH
---
system.minRetentionDays:
- value: 0
  constraints: {}
system.historyArchivalStatus:
- value: "disabled"
  constraints: {}
system.visibilityArchivalStatus:
- value: "disabled"
  constraints: {}
frontend.enableClientVersionCheck:
- value: true
  constraints: {}
frontend.visibilityListMaxQPS:
- value: 100
  constraints: {}
history.EnableConsistentQueryByDomain:
- value: true
  constraints: {}
EOH
      }

      resources {
        cpu    = 2999
        memory = 2048

        network {
          mbits = 100

          port "frontend" {
            static = 7933
          }

          port "history" {
            static = 7934
          }

          port "matching" {
            static = 7935
          }

          port "worker" {
            static = 7939
          }

          port "prometheus" {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay    = "5s"
      mode     = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "1m"
    progress_deadline = "2m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
job "cadence-web" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 20

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator  = "set_contains"
  #   value     = "cadence-services"
  # }

  group "cadence-web" {
    count = 1

    task "cadence-web" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image    = "ubercadence/web:latest"
        port_map = {
          http = 8088
        }
      }

      # restarts the task whenever the set of cadence-frontend instances changes
      template {
        env         = true
        destination = "${NOMAD_SECRETS_DIR}/env"
        data        = <<EOF
CADENCE_TCHANNEL_PEERS={{range $index, $service := service "cadence-frontend" }}{{if ne $index 0}},{{end}}{{$service.Address}}:{{$service.Port}}{{end}}
EOF
      }

      service {
        name = "cadence-web"
        port = "http"
        check {
          type     = "http"
          path     = "/"
          interval = "5s"
          timeout  = "3s"
        }
      }

      resources {
        cpu    = 1000
        memory = 768

        network {
          mbits = 100

          port "http" {
            static = 8088
          }
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 3
      delay    = "10s"
      interval = "1m"
      mode     = "delay"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "10s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "2m"
    progress_deadline = "3m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
job "cadence-frontend" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 60

  group "cadence-frontend" {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task "cadence-frontend" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image        = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes      = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-frontend"
        port = "frontend"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS       = 4
        LOG_LEVEL                = "info"
        SERVICES                 = "frontend"
        BIND_ON_IP               = "${NOMAD_IP_frontend}"
        CASSANDRA_SEEDS          = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB                       = "cassandra"
        RF                       = 3
        KEYSPACE                 = "cadence"
        VISIBILITY_KEYSPACE      = "cadence_visibility"
        SKIP_SCHEMA_SETUP        = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE   = "dns"
        RINGPOP_SEEDS            = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT      = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data        = <<EOH
---
system.minRetentionDays:
- value: 0
  constraints: {}
system.historyArchivalStatus:
- value: "disabled"
  constraints: {}
system.visibilityArchivalStatus:
- value: "disabled"
  constraints: {}
frontend.enableClientVersionCheck:
- value: true
  constraints: {}
frontend.visibilityListMaxQPS:
- value: 100
  constraints: {}
EOH
      }

      resources {
        cpu    = 1500
        memory = 256

        network {
          mbits = 100

          port "frontend" {
            static = 7933
          }

          port "prometheus" {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay    = "5s"
      mode     = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "1m"
    progress_deadline = "2m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
job "cadence-history" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 60

  group "cadence-history" {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task "cadence-history" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image        = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes      = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-history"
        port = "history"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS       = 4
        LOG_LEVEL                = "info"
        SERVICES                 = "history"
        BIND_ON_IP               = "${NOMAD_IP_history}"
        CASSANDRA_SEEDS          = "cassandra-cluster1-node3.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node1.node.consul"
        DB                       = "cassandra"
        RF                       = 3
        KEYSPACE                 = "cadence"
        VISIBILITY_KEYSPACE      = "cadence_visibility"
        SKIP_SCHEMA_SETUP        = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE   = "dns"
        RINGPOP_SEEDS            = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT      = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data        = <<EOH
---
system.minRetentionDays:
- value: 0
  constraints: {}
system.historyArchivalStatus:
- value: "disabled"
  constraints: {}
system.visibilityArchivalStatus:
- value: "disabled"
  constraints: {}
history.EnableConsistentQueryByDomain:
- value: true
  constraints: {}
EOH
      }

      resources {
        cpu    = 1999
        memory = 1536

        network {
          mbits = 100

          port "history" {
            static = 7934
          }

          port "prometheus" {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay    = "5s"
      mode     = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "1m"
    progress_deadline = "2m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
job "cadence-matching" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 60

  group "cadence-matching" {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task "cadence-matching" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image        = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes      = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-matching"
        port = "matching"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS       = 4
        LOG_LEVEL                = "info"
        SERVICES                 = "matching"
        BIND_ON_IP               = "${NOMAD_IP_matching}"
        CASSANDRA_SEEDS          = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB                       = "cassandra"
        RF                       = 3
        KEYSPACE                 = "cadence"
        VISIBILITY_KEYSPACE      = "cadence_visibility"
        SKIP_SCHEMA_SETUP        = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE   = "dns"
        RINGPOP_SEEDS            = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT      = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data        = <<EOH
---
system.minRetentionDays:
- value: 0
  constraints: {}
system.historyArchivalStatus:
- value: "disabled"
  constraints: {}
system.visibilityArchivalStatus:
- value: "disabled"
  constraints: {}
EOH
      }

      resources {
        cpu    = 500
        memory = 256

        network {
          mbits = 100

          port "matching" {
            static = 7935
          }

          port "prometheus" {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay    = "5s"
      mode     = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "1m"
    progress_deadline = "2m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
job "cadence-worker" {
  datacenters = ["dc1"]
  type        = "service"
  priority    = 60

  group "cadence-worker" {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task "cadence-worker" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        image        = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes      = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-worker"
        port = "worker"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type           = "tcp"
          interval       = "5s"
          timeout        = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS       = 4
        LOG_LEVEL                = "info"
        SERVICES                 = "worker"
        BIND_ON_IP               = "${NOMAD_IP_worker}"
        CASSANDRA_SEEDS          = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB                       = "cassandra"
        RF                       = 3
        KEYSPACE                 = "cadence"
        VISIBILITY_KEYSPACE      = "cadence_visibility"
        SKIP_SCHEMA_SETUP        = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE   = "dns"
        RINGPOP_SEEDS            = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT      = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data        = <<EOH
---
system.minRetentionDays:
- value: 0
  constraints: {}
system.historyArchivalStatus:
- value: "disabled"
  constraints: {}
system.visibilityArchivalStatus:
- value: "disabled"
  constraints: {}
EOH
      }

      resources {
        cpu    = 500
        memory = 256

        network {
          mbits = 100

          port "worker" {
            static = 7939
          }

          port "prometheus" {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay    = "5s"
      mode     = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel      = 1
    min_healthy_time  = "15s"
    healthy_deadline  = "1m"
    progress_deadline = "2m"
    auto_revert       = true
    auto_promote      = true
    canary            = 1
    stagger           = "5s"
  }
}
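
The header comment mentions running the Cassandra schema migrations manually with the 0.11.0-auto-setup image. Below is a minimal sketch of what that one-off job could look like, assuming the auto-setup image applies the schema using the same CASSANDRA_SEEDS/KEYSPACE/RF/NUM_HISTORY_SHARDS variables as the jobs above before starting the listed services; the job name and resource numbers are illustrative, not part of the gist:

# one-off schema setup sketch: run once, then stop it and deploy the jobs
# above (which set SKIP_SCHEMA_SETUP = true) once the keyspaces exist.
job "cadence-schema-setup" {
  datacenters = ["dc1"]
  type        = "service"

  group "cadence-schema-setup" {
    count = 1

    task "cadence-schema-setup" {
      driver       = "docker"
      kill_timeout = "30s"

      config {
        # the -auto-setup variant runs the schema migrations on startup
        image        = "ubercadence/server:0.11.0-auto-setup"
        network_mode = "host"
      }

      env {
        NUM_HISTORY_SHARDS  = 4
        LOG_LEVEL           = "info"
        SERVICES            = "frontend"
        BIND_ON_IP          = "${NOMAD_IP_frontend}"
        CASSANDRA_SEEDS     = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB                  = "cassandra"
        RF                  = 3
        KEYSPACE            = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP   = false
      }

      resources {
        cpu    = 500
        memory = 512

        network {
          mbits = 10
          port "frontend" {}
        }
      }
    }
  }
}

Submit it with nomad job run, wait for the keyspaces and tables to show up in Cassandra, then stop it and run the regular jobs above.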
Gurpartap commented Apr 14, 2020

To have Prometheus collect metrics automatically for the above services, add this to your prometheus.yml:

scrape_configs:
  - job_name: 'consul'
    consul_sd_configs:
    - services: []
    relabel_configs:
      - source_labels: [__meta_consul_tags]
        regex: .*,metrics,.*
        action: keep
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: hostname
      - source_labels: ['__address__', '__meta_consul_tags']
        regex:         '(.*):.*;.*,metrics-port=(\d+),.*'
        target_label:  '__address__'
        replacement:   '$1:$2'

If you also use Nomad to run your Prometheus instance, here's prometheus.nomad:

# not configured to persist metrics data

job "prometheus" {
  datacenters = ["dc1"]
  type        = "service"

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator  = "set_contains"
  #   value     = "workerpool1"
  # }

  group "prometheus" {
    count = 1

    task "prometheus" {
      driver       = "docker"
      kill_timeout = "60s"

      config {
        image        = "prom/prometheus:latest"
        args         = [
          # defaults from https://github.com/prometheus/prometheus/blob/master/Dockerfile
          "--config.file=/etc/prometheus/prometheus.yml",
          "--storage.tsdb.path=/prometheus",
          "--web.console.libraries=/usr/share/prometheus/console_libraries",
          "--web.console.templates=/usr/share/prometheus/consoles",
          # custom overrides
          "--storage.tsdb.retention.size=1GB",
          "--storage.tsdb.wal-compression",
          "--web.enable-admin-api"
        ]
        volumes      = [
          "local/prometheus.yml:/etc/prometheus/prometheus.yml"
        ]
        network_mode = "host"
      }

      resources {
        cpu    = 350
        memory = 1536

        network {
          mbits = 10

          port "prometheus_ui" {
            static = 9090
          }
        }
      }

      service {
        name = "prometheus"
        port = "prometheus_ui"
        check {
          name     = "prometheus_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }

      template {
        change_mode = "noop"
        destination = "local/prometheus.yml"
        # language=yml
        data        = <<EOH
---
global:
  scrape_interval:     15s
  evaluation_interval: 5s

scrape_configs:
  # …
  # other configs
  # …

  - job_name: 'consul'
    consul_sd_configs:
    - services: []
    relabel_configs:
      - source_labels: [__meta_consul_tags]
        regex: .*,metrics,.*
        action: keep
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: node
      - source_labels: ['__address__', '__meta_consul_tags']
        regex:         '(.*):.*;.*,metrics-port=(\d+),.*'
        target_label:  '__address__'
        replacement:   '$1:$2'

EOH
      }
    }

    restart {
      attempts = 3
      delay    = "10s"
      interval = "3m"
      mode     = "delay"
    }
  }
}
