From 2940eb9190184fcedf1c7a8eeb8b6b1865b6491d Mon Sep 17 00:00:00 2001
From: brandonrc <brandonrc@users.noreply.github.com>
Date: Sun, 21 Jun 2026 06:37:02 +0000
Subject: [PATCH] test: harden 3 load-induced release-gate flakes (solid, not
 retry-hope)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- common.sh: add api_get_with_retry + create_test_user_with_retry — bounded
  retry-with-backoff on TRANSIENT class only (5xx/000); 4xx never retried so
  real client/auth errors are not masked.
- test-sbt.sh: artifacts GET uses api_get_with_retry (idempotent GET that hit a
  transient 5xx under fleet worker-starvation).
- test-regression-security.sh: 3 setup user-creates use create_test_user_with_retry.
- test-scan-dedup-short-circuit-1373.sh: assert exactly-1-completed-row PER
  scan_type (backend registers dependency+grype scanners) and stable scan_id
  per type across re-trigger — STRONGER + deterministic vs the old total-count==1.
  (Test serializes its own triggers; see filed backend race for the concurrent case.)
- test-webhook-retry-recover.sh: isolate measured deliveries by repo entity_id so
  concurrent suites' sibling deliveries to a global webhook no longer contaminate
  the timing (root cause of attempt_delta=0s); oldest-first; windows account for
  the 30s scheduler tick + 20% jitter; per-PID receiver port.
---
 tests/formats/test-sbt.sh                     |   9 +-
 tests/lib/common.sh                           | 162 ++++++++++++++
 tests/security/test-regression-security.sh    |  49 ++--
 .../test-scan-dedup-short-circuit-1373.sh     | 169 ++++++++++----
 tests/webhooks/test-webhook-retry-recover.sh  | 209 +++++++++++++-----
 5 files changed, 477 insertions(+), 121 deletions(-)

diff --git a/tests/formats/test-sbt.sh b/tests/formats/test-sbt.sh
index eb6a226..bf5f71e 100755
--- a/tests/formats/test-sbt.sh
+++ b/tests/formats/test-sbt.sh
@@ -220,7 +220,14 @@ fi
 # ---------------------------------------------------------------------------
 
 begin_test "List artifacts via management API"
-if resp=$(api_get "/api/v1/repositories/${REPO_KEY}/artifacts"); then
+# Use the retry-with-backoff helper instead of bare api_get: this GET is
+# idempotent and, when the gate runs ~25 format suites in parallel, the backend
+# tokio runtime can be momentarily saturated (worker-starvation / availability
+# issue), returning a transient 503/502/000 that the very next attempt serves
+# fine. api_get_with_retry retries ONLY that transient class (5xx/000), never
+# 4xx or wrong-content, so the assertion below stays meaningful. (ak-test flake:
+# "GET .../artifacts returned error" in format-tests (jvm).)
+if resp=$(api_get_with_retry "/api/v1/repositories/${REPO_KEY}/artifacts"); then
   if assert_contains "$resp" "$MODULE_NAME" "artifact list should contain module name"; then
     pass
   fi
diff --git a/tests/lib/common.sh b/tests/lib/common.sh
index c2ce48b..2e5ea59 100644
--- a/tests/lib/common.sh
+++ b/tests/lib/common.sh
@@ -503,6 +503,86 @@ api_get() {
   curl -sf $CURL_TIMEOUT -H "$(auth_header)" "$@" "${BASE_URL}${path}"
 }
 
+# api_get_with_retry - GET an idempotent management-API endpoint with a bounded
+# retry-with-backoff on TRANSIENT failures only. On success it echoes the
+# response body on stdout and returns 0, so it is a drop-in replacement for
+# `api_get` in `if resp=$(...); then`-style call sites.
+#
+# Why this exists (the test-sbt.sh "List artifacts" flake, ak-test):
+#   The plain `api_get` uses `curl -sf`, which exits non-zero on ANY >=400 and
+#   discards the response body. When ~25 format suites run in parallel in the
+#   `format-tests (jvm)` gate job they saturate the backend's tokio runtime
+#   (the known worker-starvation / availability issue: uncapped CPU work such
+#   as bcrypt(cost=12) on basic-auth format calls bypasses the auth semaphore).
+#   A GET that races peak saturation can transiently come back 503/502 (pool
+#   exhausted / shed) or 000 (request exceeded its time budget) even though the
+#   data is fine -- the very next attempt succeeds. With bare `api_get` that one
+#   transient blip hard-fails the suite ("GET .../artifacts returned error"),
+#   which is a retry-hope flake, not a real defect: the GET is idempotent and
+#   SHOULD be retried, exactly as format_get_with_retry / create_repo / login_as
+#   already do for their transient classes.
+#
+# Retry policy (deliberately narrow so we never mask a real bug):
+#   - Retry ONLY on transient-class statuses: 000 (network/timeout) and the 5xx
+#     range (500-599: 502/503/504 plus any residual 5xx).
+#   - DO NOT retry on 2xx (success) or 4xx (real client/auth/not-found errors):
+#     those are deterministic and a retry would just hide a genuine failure. A
+#     404/403/wrong-content surfaces immediately.
+#   - On a non-2xx final outcome, return 1 AND emit a precise diagnostic to
+#     stderr (final HTTP status + body snippet) so the failure is never opaque
+#     the way bare `curl -sf` made it.
+#
+# Knobs: API_GET_MAX_ATTEMPTS (default 4), API_GET_RETRY_DELAY (default 2s,
+# applied with linear backoff: delay * attempt).
+#
+# Usage:
+#   if resp=$(api_get_with_retry "/api/v1/repositories/${KEY}/artifacts"); then
+#     assert_contains "$resp" "$MODULE_NAME" "..."
+#   else
+#     fail "GET .../artifacts returned error"
+#   fi
+api_get_with_retry() {
+  local path="$1"; shift
+  local _max="${API_GET_MAX_ATTEMPTS:-4}"
+  local _delay="${API_GET_RETRY_DELAY:-2}"
+  local _attempt _status="000" _body_file _body=""
+  _body_file=$(mktemp)
+  for _attempt in $(seq 1 "$_max"); do
+    _status=$(curl -s $CURL_TIMEOUT -o "$_body_file" -w '%{http_code}' \
+      -H "$(auth_header)" "$@" "${BASE_URL}${path}" 2>/dev/null) || _status="000"
+
+    # Success: 2xx -> emit body, return 0.
+    if [ "$_status" -ge 200 ] 2>/dev/null && [ "$_status" -lt 300 ] 2>/dev/null; then
+      cat "$_body_file"
+      rm -f "$_body_file"
+      return 0
+    fi
+
+    # Transient class -> retry. Everything else (4xx, other non-5xx) is a real,
+    # deterministic failure: stop immediately so we don't mask a bug.
+    local _transient=false
+    if [ "$_status" = "000" ] || \
+       { [ "$_status" -ge 500 ] 2>/dev/null && [ "$_status" -le 599 ] 2>/dev/null; }; then
+      _transient=true
+    fi
+    if [ "$_transient" != true ]; then
+      break
+    fi
+
+    if [ "$_attempt" -lt "$_max" ]; then
+      sleep "$(( _delay * _attempt ))"
+    fi
+  done
+
+  _body=$(head -c 400 "$_body_file" 2>/dev/null || true)
+  rm -f "$_body_file"
+  # _attempt holds the loop index of the last attempt made. A deterministic
+  # (non-transient) status breaks the loop early, so this reports the real
+  # number of requests issued rather than the configured ceiling.
+  echo "api_get_with_retry ${path} failed after ${_attempt} attempt(s) (max ${_max}): HTTP ${_status} body=${_body}" >&2
+  return 1
+}
+
 # Create a test user (admin auth) and echo the new user's UUID on stdout.
 # On failure, echoes empty string and the response body to stderr.
 #
@@ -525,6 +605,88 @@ create_test_user() {
   echo "$uid"
 }
 
+# Create a test user (admin auth) with bounded retry-on-transient, echoing
+# the new user's UUID on stdout. Returns 0 on success, 1 after retries.
+#
+# Why this exists (release-gate flake #1):
+#   Several security suites create a throwaway user as a SETUP step
+#   (token-revocation, lockout, force-password-change). User creation hashes
+#   the password with bcrypt in spawn_blocking. Under fleet-concurrent load
+#   the backend's blocking pool / worker runtime can be momentarily starved
+#   (the known availability worker-starvation issue: uncapped CPU-bound auth
+#   work on the tokio runtime), so an otherwise-valid admin POST
+#   /api/v1/users can transiently return 5xx or drop the connection
+#   (curl exit -> 000). The plain api_post helper uses `curl -sf`, which
+#   collapses any non-2xx into a bare non-zero exit with no body and NO
+#   retry, so a single transient blip fails the whole suite at setup time
+#   ("could not create force-password-change test user").
+#
+#   This helper retries ONLY the transient class (HTTP 5xx and network 000).
+#   Real client errors (400 invalid payload, 409 username taken, 401/403
+#   auth) are returned immediately and NOT masked -- a duplicate-username or
+#   a malformed request is a genuine test bug, not a flake.
+#
+# Accepts EITHER the 3-arg short form (username password email) or a 4th arg
+# giving a full JSON body (so callers that also set display_name reuse the
+# same retry logic). The body, if given, must be a complete JSON object.
+#
+# Tunables (shared budget feel with create_repo / login_as):
+#   CREATE_USER_MAX_ATTEMPTS  default 4
+#   CREATE_USER_RETRY_DELAY   default 1 (seconds; doubled each attempt)
+#
+# Usage:
+#   USER_ID=$(create_test_user_with_retry "$USER" "$PASS" "$EMAIL") || fail ...
+#   USER_ID=$(create_test_user_with_retry "$USER" "$PASS" "$EMAIL" "$JSON") || fail ...
+create_test_user_with_retry() {
+  local username="$1"
+  local password="$2"
+  local email="$3"
+  local body="${4:-}"
+  if [ -z "$body" ]; then
+    body="{\"username\":\"${username}\",\"password\":\"${password}\",\"email\":\"${email}\"}"
+  fi
+  local _max="${CREATE_USER_MAX_ATTEMPTS:-4}"
+  local _delay="${CREATE_USER_RETRY_DELAY:-1}"
+  local _attempt _status _tmp _resp uid=""
+  for _attempt in $(seq 1 "$_max"); do
+    _tmp=$(mktemp)
+    _status=$(curl -s $CURL_TIMEOUT -o "$_tmp" -w '%{http_code}' \
+      -X POST -H "$(auth_header)" -H 'Content-Type: application/json' \
+      -d "$body" "${BASE_URL}/api/v1/users" 2>/dev/null) || _status="000"
+    _resp=$(cat "$_tmp" 2>/dev/null || true)
+    rm -f "$_tmp"
+
+    if [ "$_status" -ge 200 ] 2>/dev/null && [ "$_status" -lt 300 ] 2>/dev/null; then
+      uid=$(echo "$_resp" | jq -r '.user.id // .id // .user_id // empty' 2>/dev/null) || uid=""
+      if [ -n "$uid" ] && [ "$uid" != "null" ]; then
+        echo "$uid"
+        return 0
+      fi
+      # 2xx but no id: a contract break, not a transient blip. Don't retry.
+      echo "create_test_user_with_retry: ${username} got HTTP ${_status} but no id: ${_resp:0:200}" >&2
+      echo ""
+      return 1
+    fi
+
+    # Retry ONLY transient class: network 000 or any 5xx. Everything else
+    # (4xx) is a real failure surfaced immediately.
+    if [ "$_status" != "000" ] && { [ "$_status" -lt 500 ] 2>/dev/null || [ "$_status" -ge 600 ] 2>/dev/null; }; then
+      echo "create_test_user_with_retry: ${username} non-transient HTTP ${_status}: ${_resp:0:200}" >&2
+      echo ""
+      return 1
+    fi
+
+    if [ "$_attempt" -lt "$_max" ]; then
+      echo "  create-user ${username} attempt ${_attempt}/${_max} transient HTTP ${_status}, retrying in ${_delay}s..." >&2
+      sleep "$_delay"
+      _delay=$(( _delay * 2 ))
+    fi
+  done
+  echo "create_test_user_with_retry: ${username} failed after ${_max} attempts (last HTTP ${_status})" >&2
+  echo ""
+  return 1
+}
+
 # Log in as the named user and echo the access_token on stdout.
 # On failure, echoes empty string and the response body to stderr.
 #
diff --git a/tests/security/test-regression-security.sh b/tests/security/test-regression-security.sh
index e641d0c..09c317c 100755
--- a/tests/security/test-regression-security.sh
+++ b/tests/security/test-regression-security.sh
@@ -31,16 +31,15 @@ USER_ID_592=""
 TOKEN_ID_592=""
 
 begin_test "Bug #592: Create test user for token revocation listing"
-if resp=$(api_post "/api/v1/users" \
-    "{\"username\":\"${TEST_USER_592}\",\"password\":\"${TEST_PASS_592}\",\"email\":\"${TEST_EMAIL_592}\",\"display_name\":\"Bug 592 Test\"}" 2>/dev/null); then
-  USER_ID_592=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true
-  if [ -n "$USER_ID_592" ] && [ "$USER_ID_592" != "null" ]; then
-    pass
-  else
-    fail "user created but no ID returned"
-  fi
+# Setup-step create: retry-with-backoff on transient 5xx/000 (fleet load can
+# briefly starve the bcrypt blocking pool). 4xx (e.g. duplicate username) is
+# surfaced immediately and never masked. See create_test_user_with_retry.
+USER_ID_592=$(create_test_user_with_retry "$TEST_USER_592" "$TEST_PASS_592" "$TEST_EMAIL_592" \
+  "{\"username\":\"${TEST_USER_592}\",\"password\":\"${TEST_PASS_592}\",\"email\":\"${TEST_EMAIL_592}\",\"display_name\":\"Bug 592 Test\"}") || true
+if [ -n "$USER_ID_592" ] && [ "$USER_ID_592" != "null" ]; then
+  pass
 else
-  fail "could not create test user for bug #592"
+  fail "could not create test user for bug #592 (transient retries exhausted)"
 fi
 
 begin_test "Bug #592: Create API token"
@@ -352,16 +351,13 @@ LOCKOUT_EMAIL="e2e-lockout-${RUN_ID}@test.local"
 LOCKOUT_USER_ID=""
 
 begin_test "Account lockout: Create test user"
-if resp=$(api_post "/api/v1/users" \
-    "{\"username\":\"${LOCKOUT_USER}\",\"password\":\"${LOCKOUT_PASS}\",\"email\":\"${LOCKOUT_EMAIL}\",\"display_name\":\"Lockout Test\"}" 2>/dev/null); then
-  LOCKOUT_USER_ID=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true
-  if [ -n "$LOCKOUT_USER_ID" ] && [ "$LOCKOUT_USER_ID" != "null" ]; then
-    pass
-  else
-    fail "user created but no ID returned"
-  fi
+# Setup-step create: retry transient 5xx/000 only (see create_test_user_with_retry).
+LOCKOUT_USER_ID=$(create_test_user_with_retry "$LOCKOUT_USER" "$LOCKOUT_PASS" "$LOCKOUT_EMAIL" \
+  "{\"username\":\"${LOCKOUT_USER}\",\"password\":\"${LOCKOUT_PASS}\",\"email\":\"${LOCKOUT_EMAIL}\",\"display_name\":\"Lockout Test\"}") || true
+if [ -n "$LOCKOUT_USER_ID" ] && [ "$LOCKOUT_USER_ID" != "null" ]; then
+  pass
 else
-  fail "could not create lockout test user"
+  fail "could not create lockout test user (transient retries exhausted)"
 fi
 
 begin_test "Account lockout: Detect lockout threshold"
@@ -569,16 +565,15 @@ FPC_EMAIL="e2e-fpc-${RUN_ID}@test.local"
 FPC_USER_ID=""
 
 begin_test "Force password change: Create test user"
-if resp=$(api_post "/api/v1/users" \
-    "{\"username\":\"${FPC_USER}\",\"password\":\"${FPC_PASS}\",\"email\":\"${FPC_EMAIL}\",\"display_name\":\"Force PC Test\"}" 2>/dev/null); then
-  FPC_USER_ID=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true
-  if [ -n "$FPC_USER_ID" ] && [ "$FPC_USER_ID" != "null" ]; then
-    pass
-  else
-    fail "user created but no ID returned"
-  fi
+# Setup-step create: retry transient 5xx/000 only (see create_test_user_with_retry).
+# This is the exact step that flaked ("could not create force-password-change
+# test user") when a fleet-concurrent run starved the bcrypt blocking pool.
+FPC_USER_ID=$(create_test_user_with_retry "$FPC_USER" "$FPC_PASS" "$FPC_EMAIL" \
+  "{\"username\":\"${FPC_USER}\",\"password\":\"${FPC_PASS}\",\"email\":\"${FPC_EMAIL}\",\"display_name\":\"Force PC Test\"}") || true
+if [ -n "$FPC_USER_ID" ] && [ "$FPC_USER_ID" != "null" ]; then
+  pass
 else
-  fail "could not create force-password-change test user"
+  fail "could not create force-password-change test user (transient retries exhausted)"
 fi
 
 begin_test "Force password change: Call force-password-change endpoint"
diff --git a/tests/security/test-scan-dedup-short-circuit-1373.sh b/tests/security/test-scan-dedup-short-circuit-1373.sh
index 3ad3da4..d005cac 100755
--- a/tests/security/test-scan-dedup-short-circuit-1373.sh
+++ b/tests/security/test-scan-dedup-short-circuit-1373.sh
@@ -198,14 +198,39 @@ esac
 
 # Wait for the first scan to be completed so trigger #2 will exercise the
 # short-circuit branch (not the race-recovery branch).
-begin_test "Wait for scan #1 to reach completed"
+#
+# IMPORTANT (scan_type scoping): the backend registers MORE THAN ONE scanner
+# (e.g. `dependency` + `grype`), and each one writes its OWN scan_results row
+# per artifact. The #1373 dedup contract is therefore per-scan_type: every
+# configured scanner that ran must end with EXACTLY ONE completed row for the
+# artifact, and re-triggering must not add a second row for any scan_type.
+# The original test asserted "exactly 1 completed row TOTAL", which only ever
+# held when a single scanner happened to run -- with two applicable scanners
+# a correctly-deduped artifact has two completed rows (one each), so the old
+# assertion failed even on the sequential happy path. We scope the wait and
+# both assertions per scan_type so the real dedup invariant is checked
+# without weakening it.
+#
+# We also wait for ALL scan_types to reach a terminal state (no row left in
+# `running`/`pending`) before triggering #2, so trigger #2 always exercises
+# the completed-scan short-circuit rather than the race-recovery branch.
+begin_test "Wait for scan #1 to reach completed (all scan_types terminal)"
 SCAN_LIST_PATH="/api/v1/security/artifacts/${ARTIFACT_ID}/scans"
 elapsed=0
 poll_interval=5
-first_scan_id=""
-first_state=""
+all_terminal="no"
 network_fail_count=0
 
+# Terminal statuses for a scan row. `completed` is the dedup-eligible one;
+# `not_applicable`/`failed` are terminal too (a scanner that didn't apply or
+# errored should not block the wait forever).
+is_all_terminal_jq='
+  (.items | length) > 0
+  and ([.items[] | (.status | ascii_downcase)
+        | select(. != "completed" and . != "not_applicable" and . != "failed")]
+       | length) == 0
+'
+
 while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do
   http_status=$(curl -s -o "${WORK_DIR}/scans1.json" -w '%{http_code}' $CURL_TIMEOUT \
     -H "$(auth_header)" \
@@ -214,10 +239,8 @@ while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do
 
   if [ "$http_status" = "200" ]; then
     network_fail_count=0
-    state=$(jq -er '.items[0].status // empty | ascii_downcase' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo "")
-    if [ "$state" = "completed" ]; then
-      first_state="$state"
-      first_scan_id=$(jq -er '.items[0].id // empty' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo "")
+    if jq -e "$is_all_terminal_jq" < "${WORK_DIR}/scans1.json" >/dev/null 2>&1; then
+      all_terminal="yes"
       break
     fi
   elif [ "$http_status" = "000" ] || [ "$http_status" = "502" ] || \
@@ -232,10 +255,30 @@ while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do
   elapsed=$(( elapsed + poll_interval ))
 done
 
-if [ "$first_state" = "completed" ] && [ -n "$first_scan_id" ]; then
+# Snapshot the per-scan_type completed-row map after trigger #1. Shape:
+#   { "dependency": "<uuid>", "grype": "<uuid>", ... }
+# Only `completed` rows are recorded (the dedup-eligible state). At least one
+# scan_type must have completed for the short-circuit to be testable.
+first_completed_by_type='{}'
+first_completed_count=0
+if [ "$all_terminal" = "yes" ]; then
+  first_completed_by_type=$(jq -c '
+    [.items[] | select((.status|ascii_downcase) == "completed")]
+    | group_by(.scan_type)
+    | map({(.[0].scan_type): (.[0].id)})
+    | add // {}
+  ' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo '{}')
+  first_completed_count=$(jq -r '[.items[] | select((.status|ascii_downcase)=="completed")] | length' \
+                          < "${WORK_DIR}/scans1.json" 2>/dev/null || echo 0)
+fi
+
+if [ "$all_terminal" = "yes" ] && [ "${first_completed_count:-0}" -ge 1 ] 2>/dev/null; then
   pass
 else
-  fail "scan #1 did not reach 'completed' within ${SCAN_TIMEOUT}s (last state: '${first_state:-none}')"
+  last_states=$(jq -c '[.items[] | {scan_type, status}]' < "${WORK_DIR}/scans1.json" 2>/dev/null | cut -c1-300)
+  fail "scan #1 did not reach a terminal state with >=1 completed row within ${SCAN_TIMEOUT}s" \
+"all_terminal=${all_terminal} completed_rows=${first_completed_count}
+states: ${last_states}"
 fi
 
 # -------------------------------------------------------------------------
@@ -269,60 +312,100 @@ fi
 # second trigger because the placeholder insert is synchronous.
 sleep 3
 
-# -------------------------------------------------------------------------
-# Load-bearing assertion #1: per-artifact list shows exactly 1 completed row.
-#
-# This is the symptom the customer saw: a second `completed` row appearing
-# for what should have been a single logical scan. Pin the count to exactly 1.
-# -------------------------------------------------------------------------
-
-begin_test "Per-artifact scans list has exactly 1 completed row (#1373 regression)"
+# Snapshot scan state after trigger #2 settles, then run the two assertions.
 list_status=$(curl -s -o "${WORK_DIR}/scans2.json" -w '%{http_code}' $CURL_TIMEOUT \
   -H "$(auth_header)" \
   -H "Accept: application/json" \
   "${BASE_URL}${SCAN_LIST_PATH}") || list_status="000"
 
+# -------------------------------------------------------------------------
+# Load-bearing assertion #1: EXACTLY ONE completed row PER scan_type.
+#
+# This is the precise #1373 dedup contract. The backend runs >=1 scanner and
+# each writes its own row, so "one completed row total" is wrong; the real
+# invariant is that no single scan_type accumulates a second completed row.
+# We assert the max completed-count across scan_types is exactly 1 (and that
+# at least one scan_type completed). A duplicate placeholder promoted to
+# 'completed' for ANY scan_type makes that scan_type's count >= 2 and fails
+# here -- which is exactly the symptom we must keep catching. We do NOT
+# loosen this to "<= N"; per-scan_type the bound stays hard at 1.
+# -------------------------------------------------------------------------
+begin_test "Each scan_type has exactly 1 completed row (#1373 regression)"
 if [ "$list_status" != "200" ]; then
   fail "scan-list returned HTTP ${list_status}"
 else
-  completed_count=$(jq '[.items[] | select((.status | ascii_downcase) == "completed")] | length' \
-                    < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "0")
-  total_count=$(jq '.items | length // 0' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "0")
-  if [ "$completed_count" = "1" ]; then
+  # Per-scan_type completed counts, e.g. [{"scan_type":"grype","completed":1},...]
+  per_type=$(jq -c '
+    [.items[] | select((.status|ascii_downcase) == "completed")]
+    | group_by(.scan_type)
+    | map({scan_type: .[0].scan_type, completed: length})
+  ' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo '[]')
+  num_types=$(echo "$per_type" | jq 'length' 2>/dev/null || echo 0)
+  max_per_type=$(echo "$per_type" | jq '[.[].completed] | max // 0' 2>/dev/null || echo 0)
+
+  if [ "${num_types:-0}" -ge 1 ] 2>/dev/null && [ "${max_per_type:-0}" = "1" ]; then
     pass
   else
-    snippet=$(jq -c '[.items[] | {id, status}]' < "${WORK_DIR}/scans2.json" 2>/dev/null | cut -c 1-500)
-    fail "expected exactly 1 completed scan for artifact ${ARTIFACT_ID}, got ${completed_count} (total rows: ${total_count})" \
-"This is the #1373 symptom: the second trigger_scan call inserted a new
-placeholder row that the worker promoted to 'completed', leaving 2+
-completed rows where the short-circuit should have returned the existing
+    snippet=$(jq -c '[.items[] | {id, scan_type, status, is_reused}]' < "${WORK_DIR}/scans2.json" 2>/dev/null | cut -c 1-700)
+    fail "dedup contract broken: a scan_type has >1 completed row for artifact ${ARTIFACT_ID} (scan_types=${num_types}, max completed/type=${max_per_type})" \
+"This is the #1373 symptom: a trigger_scan call inserted a new placeholder
+row that the worker promoted to 'completed', leaving 2+ completed rows for
+one scan_type where the short-circuit should have returned the existing
 scan_id.
 
+per-scan_type completed counts: ${per_type}
 scans (projected): ${snippet}
 endpoint: GET ${BASE_URL}${SCAN_LIST_PATH}
-first scan_id: ${first_scan_id}"
+completed-by-type after trigger #1: ${first_completed_by_type}
+
+NOTE: if this fails ONLY under concurrent load (and passes sequentially),
+the cause is the known check-then-act race in
+ScannerService::prepare_artifact_scan: find_existing_scan_for_artifact
+(SELECT) and create_scan_result_with_checksum (INSERT) are not atomic and
+there is no unique constraint on (artifact_id, scan_type, checksum_sha256),
+so concurrent triggers each insert a fresh placeholder. That is a REAL
+backend defect, not a test bug -- do NOT mask it by loosening this bound."
   fi
 fi
 
 # -------------------------------------------------------------------------
-# Load-bearing assertion #2: the second-most-recent (if present) is the
-# same scan_id as the first. Asserting this on top of the count-of-1
-# assertion catches a regression where a placeholder is inserted then
-# garbage-collected (count=1 again, but with a different id than expected).
-#
-# If the list has only one row, the id check trivially holds.
+# Load-bearing assertion #2: the completed scan_id for each scan_type is
+# UNCHANGED from the post-trigger-#1 snapshot. The short-circuit must return
+# the EXISTING id; a different id for any scan_type means a fresh placeholder
+# was inserted and promoted to 'completed' (the regression #1388 fixes), even
+# if some janitor later collapsed the count back to 1.
 # -------------------------------------------------------------------------
-
-begin_test "Latest completed scan_id equals first scan_id (no duplicate placeholder)"
-latest_completed_id=$(jq -er '[.items[] | select((.status | ascii_downcase) == "completed")][0].id // empty' \
-                      < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "")
-if [ -n "$latest_completed_id" ] && [ "$latest_completed_id" = "$first_scan_id" ]; then
-  pass
+begin_test "Completed scan_id per scan_type unchanged after re-trigger (no dup placeholder)"
+if [ "$list_status" != "200" ]; then
+  fail "scan-list returned HTTP ${list_status}"
 else
-  fail "latest completed scan_id (${latest_completed_id:-<empty>}) does not match first scan_id (${first_scan_id})" \
-"The short-circuit must return the existing scan_id; a different id here
-means a fresh placeholder was inserted and promoted to 'completed' (the
-exact regression #1388 fixes)."
+  second_completed_by_type=$(jq -c '
+    [.items[] | select((.status|ascii_downcase) == "completed")]
+    | group_by(.scan_type)
+    | map({(.[0].scan_type): (.[0].id)})
+    | add // {}
+  ' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo '{}')
+
+  # Every scan_type present in the trigger-#1 snapshot must map to the SAME
+  # completed id after trigger #2. New scan_types appearing is also a dup
+  # signal, so require the two maps to be equal.
+  if [ "$first_completed_by_type" = "$second_completed_by_type" ] && [ "$first_completed_by_type" != "{}" ]; then
+    pass
+  elif echo "$first_completed_by_type" | jq -e \
+        --argjson after "$second_completed_by_type" \
+        'to_entries | all(.value == ($after[.key])) and (length > 0) and (($after | length) == length)' \
+        >/dev/null 2>&1; then
+    # Robust equality independent of key ordering.
+    pass
+  else
+    fail "completed scan_id changed for some scan_type after re-trigger" \
+"The short-circuit must return the existing scan_id per scan_type; a changed
+or added id means a fresh placeholder was inserted and promoted to
+'completed' (the exact regression #1388 fixes).
+
+after trigger #1: ${first_completed_by_type}
+after trigger #2: ${second_completed_by_type}"
+  fi
 fi
 
 end_suite
diff --git a/tests/webhooks/test-webhook-retry-recover.sh b/tests/webhooks/test-webhook-retry-recover.sh
index 0b8f05c..03abd48 100755
--- a/tests/webhooks/test-webhook-retry-recover.sh
+++ b/tests/webhooks/test-webhook-retry-recover.sh
@@ -12,22 +12,56 @@
 # release-gate stays under WEBHOOK_RETRY_TIMEOUT (default 360s). The first
 # three intervals (30s, 60s, 120s base) plus +/-20% jitter give acceptable
 # windows of:
-#   attempt 1 ->  24-36s
-#   attempt 2 ->  48-72s   (cumulative 72-108s)
-#   attempt 3 ->  96-144s  (cumulative 168-252s)
+#   attempt 1 ->  24-36s   backoff
+#   attempt 2 ->  48-72s   backoff
+#   attempt 3 ->  96-144s  backoff
 # The full 12-attempt schedule is exercised in the dead-letter test.
 #
-# Receiver discovery:
-#   WEBHOOK_RECEIVER_PORT  - port the local mock listens on (default 18765).
+# IMPORTANT — what we actually measure, and the scheduler-tick quantization
+# that the (now fixed) windows MUST account for:
+#
+# The backend does NOT fire a retry at the exact instant next_retry_at
+# elapses. A background scheduler tick runs every 30s
+# (services/scheduler_service.rs: interval(Duration::from_secs(30))) and
+# only then drains rows whose next_retry_at <= NOW(). So the *observed* gap
+# between two consecutive POSTs at the receiver is:
+#
+#     observed_gap = backoff(attempt)  +  [0 .. 30s tick latency]  +  dial
+#
+# i.e. the backoff rounded UP to the next 30s scheduler tick. The previous
+# version of this test asserted the raw backoff window (24-36s etc.) and
+# IGNORED tick quantization, so a perfectly healthy backend whose 30s base
+# delay landed mid-tick was measured as e.g. a 55s gap and reported
+# "outside 24-36". The windows below add a +30s (one full tick) upper
+# allowance and a small floor.
+#
+# Note also: this test must KEEP the receiver FAILING through attempt 3.
+# If the mock recovers (returns 200) after the first failure the delivery
+# row is marked success/next_retry_at=NULL and NO further POSTs occur — you
+# can never observe a 2nd or 3rd retry interval. WEBHOOK_FAIL_FIRST_N is
+# therefore defaulted to EXPECT_ATTEMPTS so the receiver rejects attempts
+# 1..3 (producing the 3 spaced retry POSTs we measure) and accepts the 4th,
+# proving the recover path.
+#
+# Receiver discovery / isolation:
+#   WEBHOOK_RECEIVER_PORT  - port the local mock listens on. Defaults to a
+#                            per-PID port so concurrent webhook suites in the
+#                            same gate run do NOT share one receiver. A shared
+#                            receiver was the root cause of the "all deltas=0s"
+#                            flake: a second suite's deliveries drained in the
+#                            same 30s scheduler tick landed as near-simultaneous
+#                            POSTs in this suite's log, so every measured gap
+#                            collapsed to 0s.
 #   WEBHOOK_RECEIVER_URL   - URL the backend POSTs to. Defaults to the
 #                            local mock; override in environments where
 #                            the backend cannot reach loopback.
 #   WEBHOOK_FAIL_FIRST_N   - how many POSTs the mock should reject with
-#                            500 before flipping to 200 (default 1, so we
-#                            observe the recover path on the first retry).
-#   WEBHOOK_RETRY_TIMEOUT  - seconds to wait for the recover round-trip
-#                            (default 360 to cover up to attempt 3 with
-#                            jitter headroom).
+#                            500 before flipping to 200. Defaults to 3 so we
+#                            observe THREE spaced retries then recover on the
+#                            4th. (Must be >= EXPECT_ATTEMPTS.)
+#   WEBHOOK_RETRY_TIMEOUT  - seconds to wait for the full attempt-3 round-trip
+#                            (default 480 to cover backoff 30+60+120s plus up
+#                            to three 30s scheduler ticks plus jitter headroom).
 #
 # Companion backend PR: artifact-keeper#1140. Producer feature flag is
 # WEBHOOKS_V2_PRODUCER_ENABLED; harness sets it to 1.
@@ -38,10 +72,19 @@ source "$(dirname "$0")/../lib/common.sh"
 
 begin_suite "webhook-retry-recover"
 
-WEBHOOK_RECEIVER_PORT="${WEBHOOK_RECEIVER_PORT:-18765}"
+# Per-PID default port so concurrent webhook suites never share a receiver
+# (shared-receiver simultaneous drains were the "all deltas=0s" root cause).
+WEBHOOK_RECEIVER_PORT="${WEBHOOK_RECEIVER_PORT:-$(( 18700 + $$ % 200 ))}"
 WEBHOOK_RECEIVER_URL="${WEBHOOK_RECEIVER_URL:-http://${WEBHOOK_RECEIVER_HOST:-127.0.0.1}:${WEBHOOK_RECEIVER_PORT}/hook}"
-WEBHOOK_FAIL_FIRST_N="${WEBHOOK_FAIL_FIRST_N:-1}"
-WEBHOOK_RETRY_TIMEOUT="${WEBHOOK_RETRY_TIMEOUT:-360}"
+# Number of attempts we observe a spaced retry for (attempt 1, 2, 3).
+EXPECT_ATTEMPTS="${EXPECT_ATTEMPTS:-3}"
+# Keep failing through attempt 3 so all three retry intervals materialize,
+# then recover on the 4th POST. MUST be >= EXPECT_ATTEMPTS.
+WEBHOOK_FAIL_FIRST_N="${WEBHOOK_FAIL_FIRST_N:-${EXPECT_ATTEMPTS}}"
+# Backoff 30+60+120=210s base; with +20% jitter (~252s) plus up to three 30s
+# scheduler ticks (~90s) of quantization plus the ~15s scheduler warmup and
+# producer enqueue latency => ~360s worst case. 480s gives load headroom.
+WEBHOOK_RETRY_TIMEOUT="${WEBHOOK_RETRY_TIMEOUT:-480}"
 WEBHOOK_RECEIVER_LOG="${WEBHOOK_RECEIVER_LOG:-/tmp/mock-webhook-receiver-${RUN_ID}.log}"
 RECEIVER_PID=""
 
@@ -89,6 +132,8 @@ else
     sleep 0.2
   done
   if [ "$ready" = true ]; then
+    # Reset any stale state if the port happened to be reused by a prior run.
+    curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__reset" >/dev/null 2>&1 || true
     pass
   else
     fail "mock receiver did not come up on 127.0.0.1:${WEBHOOK_RECEIVER_PORT}"
@@ -131,26 +176,67 @@ fi
 # create a repository (-> repository.created -> producer inserts a delivery
 # row). The mock rejects the first WEBHOOK_FAIL_FIRST_N POSTs with 500, which
 # drives the retry-then-recover path we assert on below.
+#
+# Isolation against sibling deliveries: this webhook is global (repository_id
+# NULL), so the producer also enqueues a delivery for it on repository.created
+# events fired by OTHER suites running concurrently. Those sibling deliveries
+# would land as extra POSTs in this receiver's log and, because they drain in
+# the same 30s scheduler tick, collapse the measured retry gaps to 0s -- the
+# original flake. We defend against that by capturing THIS repository's id and
+# filtering the receiver log down to only the POSTs whose payload .entity_id
+# matches it (see select_repo_posts below). Every delivery payload carries
+# entity_id = the created repo's UUID (services/webhook_producer.rs
+# build_event_payload), so the filter isolates our own retry sequence exactly.
 # -------------------------------------------------------------------------
 
+REPO_ID=""
+
+# Echo, oldest-first, the .ts of every receiver-log POST whose payload
+# .entity_id matches our repo (falls back to all POSTs if entity_id could not
+# be determined, e.g. an older producer that omitted it).
+select_repo_posts() {
+  if [ -n "$REPO_ID" ]; then
+    jq -r --arg rid "$REPO_ID" \
+      'select((.body | fromjson? | .entity_id) == $rid) | .ts' \
+      "$WEBHOOK_RECEIVER_LOG" 2>/dev/null
+  else
+    jq -r '.ts' "$WEBHOOK_RECEIVER_LOG" 2>/dev/null
+  fi
+}
+
+# Echo, oldest-first, the full JSON record of every receiver-log POST whose
+# payload .entity_id matches our repo (one compact JSON object per line).
+select_repo_records() {
+  if [ -n "$REPO_ID" ]; then
+    jq -c --arg rid "$REPO_ID" \
+      'select((.body | fromjson? | .entity_id) == $rid)' \
+      "$WEBHOOK_RECEIVER_LOG" 2>/dev/null
+  else
+    jq -c '.' "$WEBHOOK_RECEIVER_LOG" 2>/dev/null
+  fi
+}
+
 begin_test "Trigger delivery via repository.created and observe initial POST at receiver"
 if [ "$SUITE_BLOCKED" = "true" ] || [ -z "$WEBHOOK_ID" ]; then
   skip "no webhook id"
 else
   REPO_CREATE_PAYLOAD="{\"key\":\"${REPO_KEY}\",\"name\":\"${REPO_KEY}\",\"format\":\"generic\",\"repo_type\":\"local\",\"is_public\":true}"
-  if api_post "/api/v1/repositories" "$REPO_CREATE_PAYLOAD" >/dev/null 2>&1; then
+  if repo_resp=$(api_post "/api/v1/repositories" "$REPO_CREATE_PAYLOAD" 2>/dev/null); then
+    REPO_ID=$(echo "$repo_resp" | jq -r '.id // empty')
     seen=0
     # First attempt is rejected (500) by the mock but still lands as a POST;
     # allow generous headroom for the producer's async enqueue + first dial.
+    # Count only POSTs that belong to OUR repo so a sibling suite's delivery
+    # cannot satisfy this gate.
     for _ in $(seq 1 30); do
-      seen=$(curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__count" 2>/dev/null || echo 0)
+      seen=$(select_repo_posts | grep -c . 2>/dev/null || echo 0)
       [ "$seen" -gt 0 ] && break
       sleep 1
     done
     if [ "$seen" -ge 1 ]; then
       pass
     else
-      fail "receiver saw 0 POSTs after repository.created event"
+      fail "receiver saw 0 POSTs for repo ${REPO_KEY} after repository.created event"
     fi
   else
     SUITE_BLOCKED=true
@@ -159,60 +245,79 @@ else
 fi
 
 # -------------------------------------------------------------------------
-# Retry windows for v2 schedule (with +/-20% jitter applied):
-#   attempt 1: 24..36s   after the failed try
-#   attempt 2: 48..72s   after attempt 1
-#   attempt 3: 96..144s  after attempt 2
-# We observe the timestamps of consecutive POSTs at the receiver and
-# assert each gap is within window. For the smoke profile we cap at 3
-# attempts (covers attempt 1, 2, 3 deltas).
+# Retry windows. We observe the wall-clock timestamps of consecutive POSTs
+# at the receiver and assert each gap is within window. The window is the
+# jittered backoff PLUS the scheduler-tick quantization (a retry only fires
+# on the next 30s tick after next_retry_at elapses):
+#
+#   observed_gap(attempt) in [ backoff_min , backoff_max + 30s_tick + slack ]
+#
+#   attempt 1: backoff 24..36   -> window 20..72   (36 + 30 tick + 6 slack)
+#   attempt 2: backoff 48..72   -> window 40..108  (72 + 30 tick + 6 slack)
+#   attempt 3: backoff 96..144  -> window 84..180  (144 + 30 tick + 6 slack)
+#
+# The lower bound is set slightly below backoff_min to tolerate a POST that
+# lands a hair early on a fast tick / clock skew, but stays WELL above 0 so a
+# collapsed/simultaneous-drain run (the original flake) still FAILS loudly:
+# this remains a meaningful assertion that retries ARE spaced by backoff.
 # -------------------------------------------------------------------------
 
-EXPECT_ATTEMPTS=3
-RETRY_MIN=(24 48 96)
-RETRY_MAX=(36 72 144)
-
-# Mark the receiver log so we can read deltas freshly.
-LOG_BASELINE_LINES=0
-if [ -f "$WEBHOOK_RECEIVER_LOG" ]; then
-  LOG_BASELINE_LINES=$(wc -l < "$WEBHOOK_RECEIVER_LOG" | tr -d ' ')
-fi
+RETRY_MIN=(20 40 84)
+RETRY_MAX=(72 108 180)
 
 begin_test "Retry intervals for attempts 1..3 fall within v2 schedule (+/-20% jitter)"
 if [ "$SUITE_BLOCKED" = "true" ] || [ -z "$WEBHOOK_ID" ]; then
   skip "no webhook id"
 else
-  # Wait for at least EXPECT_ATTEMPTS + 1 entries (the initial delivery
-  # plus the retry attempts). Bail at WEBHOOK_RETRY_TIMEOUT.
+  # We need EXPECT_ATTEMPTS+1 POSTs for our repo: the initial failing dial
+  # plus one POST per observed retry (attempts 1..N), where the (N+1)th POST
+  # is the one whose preceding gap closes attempt N. With FAIL_FIRST_N >=
+  # EXPECT_ATTEMPTS the receiver keeps rejecting through attempt N, so all N
+  # spaced retries materialize.
   expected_entries=$(( EXPECT_ATTEMPTS + 1 ))
+
+  # Poll on the count of OUR repo's POSTs (entity_id filtered), and require it
+  # to be STABLE for one extra poll before measuring -- this guards against
+  # reading mid-write (a half-written final line) and against a retry POST
+  # that is in-flight when we snapshot. Bail at WEBHOOK_RETRY_TIMEOUT.
   elapsed=0
   poll=5
+  prev_seen=-1
+  stable=0
+  observed=0
   while [ "$elapsed" -lt "$WEBHOOK_RETRY_TIMEOUT" ]; do
-    seen=$(curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__count" 2>/dev/null || echo 0)
-    if [ "$seen" -ge "$expected_entries" ]; then
-      break
+    observed=$(select_repo_posts | grep -c . 2>/dev/null || echo 0)
+    if [ "$observed" -ge "$expected_entries" ]; then
+      if [ "$observed" -eq "$prev_seen" ]; then
+        stable=$(( stable + 1 ))
+        [ "$stable" -ge 1 ] && break
+      else
+        stable=0
+      fi
     fi
+    prev_seen="$observed"
     sleep "$poll"
     elapsed=$(( elapsed + poll ))
   done
 
-  total_lines=$(wc -l < "$WEBHOOK_RECEIVER_LOG" 2>/dev/null | tr -d ' ' || echo 0)
-  new_lines=$(( total_lines - LOG_BASELINE_LINES ))
-  if [ "$new_lines" -lt "$expected_entries" ]; then
-    skip "only ${new_lines} delivery entries observed within ${WEBHOOK_RETRY_TIMEOUT}s; producer may not be running, or jitter pushed timing out of window"
+  if [ "$observed" -lt "$expected_entries" ]; then
+    fail "only ${observed} POST(s) for repo ${REPO_KEY} observed within ${WEBHOOK_RETRY_TIMEOUT}s (need ${expected_entries}); the retry scheduler did not deliver the expected attempts (FAIL_FIRST_N=${WEBHOOK_FAIL_FIRST_N})"
   else
-    # Pull the last `expected_entries` timestamps (in seconds, float).
-    mapfile -t ts_lines < <(tail -n "$expected_entries" "$WEBHOOK_RECEIVER_LOG" \
-      | jq -r '.ts' 2>/dev/null)
+    # Pull the FIRST `expected_entries` timestamps for our repo, oldest-first.
+    # (Oldest-first is the genuine attempt order; taking the tail could splice
+    # in a stray late POST.)
+    mapfile -t ts_lines < <(select_repo_posts | head -n "$expected_entries")
     if [ "${#ts_lines[@]}" -lt "$expected_entries" ]; then
-      fail "could not parse ${expected_entries} timestamps from receiver log"
+      fail "could not parse ${expected_entries} timestamps for repo ${REPO_KEY} from receiver log"
     else
       bad=""
+      deltas=""
       for i in $(seq 1 "$EXPECT_ATTEMPTS"); do
         prev="${ts_lines[$((i-1))]}"
         curr="${ts_lines[$i]}"
         # Integer delta (sec). awk handles the float subtraction.
         delta=$(awk -v a="$curr" -v b="$prev" 'BEGIN { printf "%d", (a - b) }')
+        deltas="${deltas} a${i}=${delta}s"
         min="${RETRY_MIN[$((i-1))]}"
         max="${RETRY_MAX[$((i-1))]}"
         if [ "$delta" -lt "$min" ] || [ "$delta" -gt "$max" ]; then
@@ -220,9 +325,11 @@ else
         fi
       done
       if [ -z "$bad" ]; then
+        echo "    observed retry gaps:${deltas}"
         pass
       else
-        fail "retry interval(s) outside v2 +/-20% window:${bad}"
+        fail "retry interval(s) outside v2 backoff+tick window:${bad}" \
+             "observed gaps:${deltas}; windows attempt1=${RETRY_MIN[0]}-${RETRY_MAX[0]} attempt2=${RETRY_MIN[1]}-${RETRY_MAX[1]} attempt3=${RETRY_MIN[2]}-${RETRY_MAX[2]}. delta=0 across the board => simultaneous drain (receiver contamination) not real backoff."
       fi
     fi
   fi
@@ -238,11 +345,13 @@ begin_test "X-ArtifactKeeper-Retry-Attempt header present on retry deliveries"
 if [ "$SUITE_BLOCKED" = "true" ] || [ ! -s "$WEBHOOK_RECEIVER_LOG" ]; then
   skip "no receiver log"
 else
-  total_lines=$(wc -l < "$WEBHOOK_RECEIVER_LOG" 2>/dev/null | tr -d ' ' || echo 0)
-  if [ "$total_lines" -lt 2 ]; then
-    skip "only ${total_lines} delivery entry/entries; cannot check retry header"
+  # Use the SECOND POST for OUR repo (entity_id filtered): the first POST is
+  # the initial delivery (no retry header), the second is retry attempt 1.
+  repo_post_count=$(select_repo_records | grep -c . 2>/dev/null || echo 0)
+  if [ "$repo_post_count" -lt 2 ]; then
+    fail "only ${repo_post_count} delivery POST(s) for repo ${REPO_KEY}; cannot check retry header (expected the first retry to have fired)"
   else
-    second_headers=$(sed -n '2p' "$WEBHOOK_RECEIVER_LOG" | jq -r '.headers')
+    second_headers=$(select_repo_records | sed -n '2p' | jq -r '.headers')
     retry_attempt=$(echo "$second_headers" | jq -r '
       (.["X-ArtifactKeeper-Retry-Attempt"] //
        .["x-artifactkeeper-retry-attempt"] //