Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ services:
deploy:
resources:
limits:
memory: 512M
memory: 786M
reservations:
memory: 256M
healthcheck:
Expand Down
50 changes: 48 additions & 2 deletions s/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ get_resource_limits() {
bench-sandbox)
echo "CPU_LIMIT=0.5 MEMORY_LIMIT=1G CPU_RESERVE=0.2 MEMORY_RESERVE=512M"
;;
bench-api|server)
bench-api)
echo "CPU_LIMIT=0.5 MEMORY_LIMIT=786M CPU_RESERVE=0.2 MEMORY_RESERVE=256M"
;;
server)
echo "CPU_LIMIT=0.5 MEMORY_LIMIT=512M CPU_RESERVE=0.2 MEMORY_RESERVE=256M"
;;
scheduler)
Expand Down Expand Up @@ -380,6 +383,34 @@ is_main_branch() {
[[ "$branch" == "main" || "$branch" == "master" ]]
}

# Succeeds (returns 0) when `docker node ls -q` lists at most one node.
# An empty listing (docker errors are silenced) counts as zero nodes, so it
# also succeeds in that case.
# Why it matters: on a single node (typical prod EC2) a start-first rollout
# needs 2× memory reservations and often fails with "insufficient resources".
swarm_is_single_node() {
  local node_count
  node_count="$(docker node ls -q 2>/dev/null | wc -l)"
  # Some wc implementations pad the count with spaces; drop them.
  node_count="${node_count// /}"
  if [[ "${node_count:-0}" -gt 1 ]]; then
    return 1
  fi
  return 0
}

# Prints the Swarm --update-order value to use for a service.
#   bench-sandbox, server   -> always "stop-first"
#   bench-api, bench-worker -> "stop-first" when swarm_is_single_node
#                              succeeds, otherwise "start-first"
#   anything else           -> "start-first"
# Arguments: $1 - service name
# Outputs:   the chosen order on stdout
swarm_update_order_for_service() {
  local service_name="$1"
  local order="start-first"

  if [[ "$service_name" == "bench-sandbox" || "$service_name" == "server" ]]; then
    order="stop-first"
  elif [[ "$service_name" == "bench-api" || "$service_name" == "bench-worker" ]]; then
    # Only these services consult the node count.
    if swarm_is_single_node; then
      order="stop-first"
    fi
  fi

  echo "$order"
}

# Recover services left with no running tasks (e.g. after a failed stop-first rollout).
swarm_recover_if_no_running_tasks() {
local svc="$1"
Expand Down Expand Up @@ -451,6 +482,21 @@ swarm_check_service_scheduling_failure() {
fi
}

# After a failed start-first rollout, Swarm can leave extra running tasks on
# one node. If the service has more tasks in the "Running" state than its
# configured replica count, force a stop-first update and wait for it.
# Arguments: $1 - service name
# Returns:   0 when nothing needs reconciling; otherwise the status of
#            wait_for_service. Calls die if the forced update fails.
# NOTE(review): non-replicated services are compared against 1 replica;
# confirm that is intended for global services on multi-node swarms.
swarm_reconcile_extra_running_tasks() {
  local svc="$1"
  local running_count desired_count
  local replicas_template='{{if .Spec.Mode.Replicated}}{{.Spec.Mode.Replicated.Replicas}}{{else}}1{{end}}'

  # grep -c exits non-zero on a zero count, hence the `|| true`.
  running_count="$(docker service ps "$svc" --filter "desired-state=running" --format '{{.CurrentState}}' \
    | grep -c '^Running' || true)"

  # Fall back to a single replica when the service cannot be inspected.
  if ! desired_count="$(docker service inspect "$svc" --format "$replicas_template" 2>/dev/null)"; then
    desired_count="1"
  fi

  # Nothing to do unless there are strictly more running tasks than wanted.
  [[ "$running_count" -gt "$desired_count" ]] || return 0

  log WARN "Service $svc has ${running_count} running tasks (want ${desired_count}); forcing reconcile"
  if ! docker service update --force --update-order stop-first --detach "$svc"; then
    die "Failed to reconcile extra tasks on $svc"
  fi
  wait_for_service "$svc" 120
}

wait_for_service() {
local svc="$1"
local max_attempts="${2:-30}"
Expand Down Expand Up @@ -507,7 +553,7 @@ wait_for_service_rollout() {
running_ps="$(docker service ps "$svc" --filter "desired-state=running" --format '{{.CurrentState}}' \
| grep -c '^Running' || true)"
want_replicas="$(docker service inspect "$svc" --format '{{if .Spec.Mode.Replicated}}{{.Spec.Mode.Replicated.Replicas}}{{else}}1{{end}}' 2>/dev/null)" || want_replicas="1"
if [[ "$running_ps" -ge "$want_replicas" && "$running_ps" -gt 0 ]]; then
if [[ "$running_ps" -eq "$want_replicas" && "$running_ps" -gt 0 ]]; then
log INFO "Service $svc rollout complete (${running_ps}/${want_replicas} tasks running via service ps)"
return 0
fi
Expand Down
9 changes: 4 additions & 5 deletions s/ops/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ deploy_service() {

if [[ "$service_exists" == "true" ]]; then
swarm_recover_if_no_running_tasks "$svc"
swarm_reconcile_extra_running_tasks "$svc"

if docker service inspect "$staging_name" &>/dev/null; then
log WARN "Removing leftover staging service: $staging_name"
Expand All @@ -90,10 +91,8 @@ deploy_service() {
done
fi

local update_order="start-first"
if [[ "$svc" == "bench-sandbox" || "$svc" == "server" ]]; then
update_order="stop-first"
fi
local update_order
update_order="$(swarm_update_order_for_service "$svc")"
log INFO "Rolling update $svc (${update_order}; staging skipped on single-node swarm)"
local -a update_args=(
--image "$image"
Expand Down Expand Up @@ -163,7 +162,7 @@ deploy_service() {
--reserve-cpu "$CPU_RESERVE" \
--reserve-memory "$MEMORY_RESERVE" \
--restart-condition any \
--update-order start-first \
--update-order "$(swarm_update_order_for_service "$svc")" \
--update-delay 30s \
--with-registry-auth \
"${extra_args[@]}" \
Expand Down
Loading