From bd2db232523cfbbe3517a76226bdf883e790b74b Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 4 Apr 2026 18:28:22 +0530 Subject: [PATCH 1/2] fix(docker): align HEALTHCHECK timing with liveness; robust healthcheck.js; deploy health debug Made-with: Cursor --- Dockerfile | 8 ++++++-- healthcheck.js | 47 +++++++++++++++++++++++++++++++++----------- scripts/deploy.sh | 20 ++++++++++++++++--- src/routes/health.ts | 6 ++++-- 4 files changed, 62 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6a5cd47..8c2d531 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,9 @@ # No shell, no package manager, no tar, no apt, no curl — the entire OS toolchain # CVE surface present in bookworm-slim is eliminated. # • :nonroot variant runs as uid 65532 (nobody) by default — no USER directive needed. -# • HEALTHCHECK uses Node built-in `http` module (curl unavailable in distroless). +# • HEALTHCHECK uses Node http (distroless has no curl). Equivalent to: +# curl -fsS http://127.0.0.1:3000/health || exit 1 +# Use /health (liveness) only — not /ready (Redis/DB); deploy gate matches this. # ---- Stage 1: Build -------------------------------------------------------- # Pinned to specific version to prevent supply chain attacks. @@ -80,7 +82,9 @@ COPY healthcheck.js ./healthcheck.js EXPOSE 3000 # Exec-form required — distroless has no shell to expand shell-form commands. -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ +# start-period must cover cold start (OTel, env, Fastify listen); interval allows +# timely transition starting → healthy once /health returns 200. +HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=5 \ CMD ["/nodejs/bin/node", "/app/healthcheck.js"] CMD ["dist/server.js"] diff --git a/healthcheck.js b/healthcheck.js index 9408ff7..c43844c 100644 --- a/healthcheck.js +++ b/healthcheck.js @@ -1,20 +1,43 @@ -// Lightweight health probe for distroless containers (no curl available). -// Runs as the HEALTHCHECK CMD: /nodejs/bin/node /app/healthcheck.js -// Exits 0 when /health returns HTTP 200, exits 1 on any error or non-200 response. -// -// CommonJS (not ESM) — this file is copied to /app/healthcheck.js in the -// container where the repo root package.json (no "type":"module") applies. +// Distroless-compatible liveness probe (no curl). Semantics align with: +// curl -fsS http://127.0.0.1:3000/health || exit 1 +// - 127.0.0.1 + IPv4 only (avoid ::1 / dual-stack quirks) +// - exit 0 only on HTTP 200; any other status or error → exit 1 +// - bounded wall time < Docker --timeout (5s) 'use strict'; + const http = require('http'); +const TIMEOUT_MS = 4500; +let settled = false; + +function finish(code) { + if (settled) { + return; + } + settled = true; + process.exit(code); +} + const req = http.request( - { host: '127.0.0.1', port: 3000, path: '/health', method: 'GET' }, + { + host: '127.0.0.1', + port: 3000, + path: '/health', + method: 'GET', + family: 4, + }, (res) => { - process.exitCode = res.statusCode === 200 ? 0 : 1; - res.resume(); // drain response so socket closes cleanly - } + res.on('data', () => {}); + res.on('end', () => { + finish(res.statusCode === 200 ? 0 : 1); + }); + res.on('error', () => finish(1)); + }, ); -req.on('error', () => { process.exitCode = 1; }); -req.setTimeout(4000, () => { req.destroy(); process.exitCode = 1; }); +req.on('error', () => finish(1)); +req.setTimeout(TIMEOUT_MS, () => { + req.destroy(); + finish(1); +}); req.end(); diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 8ef9170..da128bb 100644 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -217,19 +217,33 @@ _ft_final_state() { # --------------------------------------------------------------------------- # DOCKER HEALTH GATE # --------------------------------------------------------------------------- +_ft_dump_container_health_json() { + local name="$1" + local json + json=$(docker inspect "$name" --format '{{json .State.Health}}' 2>/dev/null || echo "{}") + _ft_log "msg='State.Health (docker inspect)' container=$name json=$json" +} + _ft_wait_docker_health() { local name="$1" i=1 STATUS - while [ "$i" -le 30 ]; do + # Allow start-period (30s) + several intervals (10s) + retries — 45×2s ≈ 90s + local max_attempts=45 + while [ "$i" -le "$max_attempts" ]; do STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$name" 2>/dev/null || echo "none") case "$STATUS" in healthy) _ft_log "msg='docker health check passed' container=$name"; return 0 ;; - unhealthy) _ft_error "msg='docker health check failed' container=$name status=unhealthy"; return 1 ;; + unhealthy) + _ft_error "msg='docker health check failed' container=$name status=unhealthy" + _ft_dump_container_health_json "$name" + return 1 + ;; none) _ft_error "msg='docker HEALTHCHECK not found — add HEALTHCHECK to Dockerfile; required for deploy gate' container=$name status=none"; return 1 ;; esac - [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/30 status=$STATUS container=$name" + [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/$max_attempts status=$STATUS container=$name" sleep 2; i=$(( i + 1 )) done _ft_error "msg='docker health timeout' container=$name last_status=$STATUS" + _ft_dump_container_health_json "$name" return 1 } diff --git a/src/routes/health.ts b/src/routes/health.ts index 434fbf3..5b924bc 100644 --- a/src/routes/health.ts +++ b/src/routes/health.ts @@ -4,8 +4,10 @@ import { shouldStartWorkers, areWorkersStarted, getExpectedWorkerCount } from ". // Bootstrap flag: set to true only after Fastify has fully initialised // (plugins registered, routes attached, app.listen() resolved). -// /health returns 503 until this is set — prevents the deploy gate from -// treating a partially-initialised process as healthy. +// /health returns 503 until this is set — prevents the deploy gate and Docker +// HEALTHCHECK (127.0.0.1:3000/health) from treating a partial boot as healthy. +// /ready is separate: deep checks (Redis, Supabase, queues) — never use it for +// Docker HEALTHCHECK or deploy.sh; workers must not block liveness. let isBootstrapped = false; export function setBootstrapped(): void { From 2d42c08f4e530acdf3a05f4ea4d96fe27091856a Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 4 Apr 2026 18:46:23 +0530 Subject: [PATCH 2/2] fix: ESM healthcheck + CI exec validation Made-with: Cursor --- .github/workflows/pr.yml | 10 ++++++ healthcheck.js | 70 +++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 66ed18e..bf25543 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -287,6 +287,12 @@ jobs: echo "✓ /health returned 200" + # Same binary + script path as Docker HEALTHCHECK (exec form); catches ESM/require + # regressions and confirms 127.0.0.1:3000/health from inside the container. + echo "Validating /app/healthcheck.js (ESM, distroless node)..." + docker exec api-ci-test /nodejs/bin/node /app/healthcheck.js + echo "✓ healthcheck.js exited 0" + # Smoke tests: admin endpoints must reject unauthenticated requests with 401 for ENDPOINT in /admin/audit-log /admin/webhook-dlq; do ECODE=$(docker run --rm \ @@ -541,6 +547,10 @@ jobs: exit 1 fi + echo "Validating /app/healthcheck.js (same as Docker HEALTHCHECK)..." + docker exec api-blue /nodejs/bin/node /app/healthcheck.js + echo "✓ healthcheck.js exited 0" + # Smoke: auth guards must reject unauthenticated requests with 401. for ENDPOINT in /admin/audit-log /admin/webhook-dlq; do CODE=$(docker run --rm \ diff --git a/healthcheck.js b/healthcheck.js index c43844c..7bbf775 100644 --- a/healthcheck.js +++ b/healthcheck.js @@ -1,11 +1,10 @@ // Distroless-compatible liveness probe (no curl). Semantics align with: // curl -fsS http://127.0.0.1:3000/health || exit 1 +// ESM: package.json has "type":"module"; this file must not use require(). // - 127.0.0.1 + IPv4 only (avoid ::1 / dual-stack quirks) // - exit 0 only on HTTP 200; any other status or error → exit 1 // - bounded wall time < Docker --timeout (5s) -'use strict'; - -const http = require('http'); +import http from 'node:http'; const TIMEOUT_MS = 4500; let settled = false; @@ -18,26 +17,51 @@ function finish(code) { process.exit(code); } -const req = http.request( - { - host: '127.0.0.1', - port: 3000, - path: '/health', - method: 'GET', - family: 4, - }, - (res) => { - res.on('data', () => {}); - res.on('end', () => { - finish(res.statusCode === 200 ? 0 : 1); - }); - res.on('error', () => finish(1)); - }, -); +function logErr(prefix, err) { + console.error(`[healthcheck] ${prefix}`, err); +} + +process.on('uncaughtException', (err) => { + logErr('uncaughtException', err); + finish(1); +}); -req.on('error', () => finish(1)); -req.setTimeout(TIMEOUT_MS, () => { - req.destroy(); +process.on('unhandledRejection', (reason) => { + logErr('unhandledRejection', reason); finish(1); }); -req.end(); + +try { + const req = http.request( + { + host: '127.0.0.1', + port: 3000, + path: '/health', + method: 'GET', + family: 4, + }, + (res) => { + res.on('data', () => {}); + res.on('end', () => { + finish(res.statusCode === 200 ? 0 : 1); + }); + res.on('error', (err) => { + logErr('response error', err); + finish(1); + }); + }, + ); + + req.on('error', (err) => { + logErr('request error', err); + finish(1); + }); + req.setTimeout(TIMEOUT_MS, () => { + req.destroy(); + finish(1); + }); + req.end(); +} catch (err) { + logErr('fatal', err); + finish(1); +}