diff --git a/docker-compose.yml b/docker-compose.yml index 733507a..3a1cf6f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -247,7 +247,7 @@ services: deploy: resources: limits: - memory: 512M + memory: 786M reservations: memory: 256M healthcheck: diff --git a/s/lib.sh b/s/lib.sh index 885071b..d3dd891 100755 --- a/s/lib.sh +++ b/s/lib.sh @@ -345,7 +345,10 @@ get_resource_limits() { bench-sandbox) echo "CPU_LIMIT=0.5 MEMORY_LIMIT=1G CPU_RESERVE=0.2 MEMORY_RESERVE=512M" ;; - bench-api|server) + bench-api) + echo "CPU_LIMIT=0.5 MEMORY_LIMIT=786M CPU_RESERVE=0.2 MEMORY_RESERVE=256M" + ;; + server) echo "CPU_LIMIT=0.5 MEMORY_LIMIT=512M CPU_RESERVE=0.2 MEMORY_RESERVE=256M" ;; scheduler) @@ -380,6 +383,34 @@ is_main_branch() { [[ "$branch" == "main" || "$branch" == "master" ]] } +# True when the swarm has at most one node (typical prod EC2; an unreadable node list counts as zero); start-first rollouts +# there need 2× memory reservations and often fail with "insufficient resources". +swarm_is_single_node() { + local nodes + nodes="$(docker node ls -q 2>/dev/null | wc -l | tr -d ' ')" + [[ "${nodes:-0}" -le 1 ]] +} + +# bench-sandbox and server always use stop-first; bench-api and bench-worker use stop-first only on a single-node swarm; all other services use start-first. +swarm_update_order_for_service() { + local svc="$1" + case "$svc" in + bench-sandbox|server) + echo "stop-first" + ;; + bench-api|bench-worker) + if swarm_is_single_node; then + echo "stop-first" + else + echo "start-first" + fi + ;; + *) + echo "start-first" + ;; + esac +} + # Recover services left with no running tasks (e.g. after a failed stop-first rollout). swarm_recover_if_no_running_tasks() { local svc="$1" @@ -451,6 +482,21 @@ swarm_check_service_scheduling_failure() { fi } +# After a failed start-first rollout, Swarm can leave extra running tasks on one node. 
+swarm_reconcile_extra_running_tasks() { + local svc="$1" + local running_ps want_replicas + running_ps="$(docker service ps "$svc" --filter "desired-state=running" --format '{{.CurrentState}}' \ + | grep -c '^Running' || true)" + want_replicas="$(docker service inspect "$svc" --format '{{if .Spec.Mode.Replicated}}{{.Spec.Mode.Replicated.Replicas}}{{else}}1{{end}}' 2>/dev/null)" || want_replicas="1" + if [[ "$running_ps" -gt "$want_replicas" ]]; then + log WARN "Service $svc has ${running_ps} running tasks (want ${want_replicas}); forcing reconcile" + docker service update --force --update-order stop-first --detach "$svc" \ + || die "Failed to reconcile extra tasks on $svc" + wait_for_service "$svc" 120 + fi +} + wait_for_service() { local svc="$1" local max_attempts="${2:-30}" @@ -507,7 +553,7 @@ wait_for_service_rollout() { running_ps="$(docker service ps "$svc" --filter "desired-state=running" --format '{{.CurrentState}}' \ | grep -c '^Running' || true)" want_replicas="$(docker service inspect "$svc" --format '{{if .Spec.Mode.Replicated}}{{.Spec.Mode.Replicated.Replicas}}{{else}}1{{end}}' 2>/dev/null)" || want_replicas="1" - if [[ "$running_ps" -ge "$want_replicas" && "$running_ps" -gt 0 ]]; then + if [[ "$running_ps" -eq "$want_replicas" && "$running_ps" -gt 0 ]]; then log INFO "Service $svc rollout complete (${running_ps}/${want_replicas} tasks running via service ps)" return 0 fi diff --git a/s/ops/deploy.sh b/s/ops/deploy.sh index 216c9da..821bfcd 100755 --- a/s/ops/deploy.sh +++ b/s/ops/deploy.sh @@ -79,6 +79,7 @@ deploy_service() { if [[ "$service_exists" == "true" ]]; then swarm_recover_if_no_running_tasks "$svc" + swarm_reconcile_extra_running_tasks "$svc" if docker service inspect "$staging_name" &>/dev/null; then log WARN "Removing leftover staging service: $staging_name" @@ -90,10 +91,8 @@ deploy_service() { done fi - local update_order="start-first" - if [[ "$svc" == "bench-sandbox" || "$svc" == "server" ]]; then - update_order="stop-first" - fi + 
local update_order + update_order="$(swarm_update_order_for_service "$svc")" log INFO "Rolling update $svc (${update_order}; staging skipped on single-node swarm)" local -a update_args=( --image "$image" @@ -163,7 +162,7 @@ deploy_service() { --reserve-cpu "$CPU_RESERVE" \ --reserve-memory "$MEMORY_RESERVE" \ --restart-condition any \ - --update-order start-first \ + --update-order "$(swarm_update_order_for_service "$svc")" \ --update-delay 30s \ --with-registry-auth \ "${extra_args[@]}" \