From 2940eb9190184fcedf1c7a8eeb8b6b1865b6491d Mon Sep 17 00:00:00 2001 From: brandonrc Date: Sun, 21 Jun 2026 06:37:02 +0000 Subject: [PATCH] test: harden 3 load-induced release-gate flakes (solid, not retry-hope) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - common.sh: add api_get_with_retry + create_test_user_with_retry — bounded retry-with-backoff on TRANSIENT class only (5xx/000); 4xx never retried so real client/auth errors are not masked. - test-sbt.sh: artifacts GET uses api_get_with_retry (idempotent GET that hit a transient 5xx under fleet worker-starvation). - test-regression-security.sh: 3 setup user-creates use create_test_user_with_retry. - test-scan-dedup-short-circuit-1373.sh: assert exactly-1-completed-row PER scan_type (backend registers dependency+grype scanners) and stable scan_id per type across re-trigger — STRONGER + deterministic vs the old total-count==1. (Test serializes its own triggers; see filed backend race for the concurrent case.) - test-webhook-retry-recover.sh: isolate measured deliveries by repo entity_id so concurrent suites' sibling deliveries to a global webhook no longer contaminate the timing (root cause of attempt_delta=0s); oldest-first; windows account for the 30s scheduler tick + 20% jitter; per-PID receiver port. --- tests/formats/test-sbt.sh | 9 +- tests/lib/common.sh | 162 ++++++++++++++ tests/security/test-regression-security.sh | 49 ++-- .../test-scan-dedup-short-circuit-1373.sh | 169 ++++++++++---- tests/webhooks/test-webhook-retry-recover.sh | 209 +++++++++++++----- 5 files changed, 477 insertions(+), 121 deletions(-) diff --git a/tests/formats/test-sbt.sh b/tests/formats/test-sbt.sh index eb6a226..bf5f71e 100755 --- a/tests/formats/test-sbt.sh +++ b/tests/formats/test-sbt.sh @@ -220,7 +220,14 @@ fi # --------------------------------------------------------------------------- begin_test "List artifacts via management API" -if resp=$(api_get "/api/v1/repositories/${REPO_KEY}/artifacts"); then +# Use the retry-with-backoff helper instead of bare api_get: this GET is +# idempotent and, when the gate runs ~25 format suites in parallel, the backend +# tokio runtime can be momentarily saturated (worker-starvation / availability +# issue), returning a transient 503/502/000 that the very next attempt serves +# fine. api_get_with_retry retries ONLY that transient class (5xx/000), never +# 4xx or wrong-content, so the assertion below stays meaningful. (ak-test flake: +# "GET .../artifacts returned error" in format-tests (jvm).) +if resp=$(api_get_with_retry "/api/v1/repositories/${REPO_KEY}/artifacts"); then if assert_contains "$resp" "$MODULE_NAME" "artifact list should contain module name"; then pass fi diff --git a/tests/lib/common.sh b/tests/lib/common.sh index c2ce48b..2e5ea59 100644 --- a/tests/lib/common.sh +++ b/tests/lib/common.sh @@ -503,6 +503,86 @@ api_get() { curl -sf $CURL_TIMEOUT -H "$(auth_header)" "$@" "${BASE_URL}${path}" } +# api_get_with_retry - GET an idempotent management-API endpoint with a bounded +# retry-with-backoff on TRANSIENT failures only. On success it echoes the +# response body on stdout and returns 0, so it is a drop-in replacement for +# `api_get` in `if resp=$(...); then`-style call sites. +# +# Why this exists (the test-sbt.sh "List artifacts" flake, ak-test): +# The plain `api_get` uses `curl -sf`, which exits non-zero on ANY >=400 and +# discards the response body. When ~25 format suites run in parallel in the +# `format-tests (jvm)` gate job they saturate the backend's tokio runtime +# (the known worker-starvation / availability issue: uncapped CPU work such +# as bcrypt(cost=12) on basic-auth format calls bypasses the auth semaphore). +# A GET that races peak saturation can transiently come back 503/502 (pool +# exhausted / shed) or 000 (request exceeded its time budget) even though the +# data is fine -- the very next attempt succeeds. With bare `api_get` that one +# transient blip hard-fails the suite ("GET .../artifacts returned error"), +# which is a retry-hope flake, not a real defect: the GET is idempotent and +# SHOULD be retried, exactly as format_get_with_retry / create_repo / login_as +# already do for their transient classes. +# +# Retry policy (deliberately narrow so we never mask a real bug): +# - Retry ONLY on transient-class statuses: 000 (network/timeout) and the 5xx +# range (500-599: 502/503/504 plus any residual 5xx). +# - DO NOT retry on 2xx (success) or 4xx (real client/auth/not-found errors): +# those are deterministic and a retry would just hide a genuine failure. A +# 404/403/wrong-content surfaces immediately. +# - On a non-2xx final outcome, return 1 AND emit a precise diagnostic to +# stderr (final HTTP status + body snippet) so the failure is never opaque +# the way bare `curl -sf` made it. +# +# Knobs: API_GET_MAX_ATTEMPTS (default 4), API_GET_RETRY_DELAY (default 2s, +# applied with linear backoff: delay * attempt). +# +# Usage: +# if resp=$(api_get_with_retry "/api/v1/repositories/${KEY}/artifacts"); then +# assert_contains "$resp" "$MODULE_NAME" "..." +# else +# fail "GET .../artifacts returned error" +# fi +api_get_with_retry() { + local path="$1"; shift + local _max="${API_GET_MAX_ATTEMPTS:-4}" + local _delay="${API_GET_RETRY_DELAY:-2}" + local _attempt _status="000" _body_file _body="" + _body_file=$(mktemp) + for _attempt in $(seq 1 "$_max"); do + _status=$(curl -s $CURL_TIMEOUT -o "$_body_file" -w '%{http_code}' \ + -H "$(auth_header)" "$@" "${BASE_URL}${path}" 2>/dev/null) || _status="000" + + # Success: 2xx -> emit body, return 0. + if [ "$_status" -ge 200 ] 2>/dev/null && [ "$_status" -lt 300 ] 2>/dev/null; then + cat "$_body_file" + rm -f "$_body_file" + return 0 + fi + + # Transient class -> retry. Everything else (4xx, other non-5xx) is a real, + # deterministic failure: stop immediately so we don't mask a bug. + local _transient=false + if [ "$_status" = "000" ] || \ + { [ "$_status" -ge 500 ] 2>/dev/null && [ "$_status" -le 599 ] 2>/dev/null; }; then + _transient=true + fi + if [ "$_transient" != true ]; then + break + fi + + if [ "$_attempt" -lt "$_max" ]; then + sleep "$(( _delay * _attempt ))" + fi + done + + _body=$(head -c 400 "$_body_file" 2>/dev/null || true) + rm -f "$_body_file" + # _attempt holds the loop index of the last attempt made. A deterministic + # (non-transient) status breaks the loop early, so this reports the real + # number of requests issued rather than the configured ceiling. + echo "api_get_with_retry ${path} failed after ${_attempt} attempt(s) (max ${_max}): HTTP ${_status} body=${_body}" >&2 + return 1 +} + # Create a test user (admin auth) and echo the new user's UUID on stdout. # On failure, echoes empty string and the response body to stderr. # @@ -525,6 +605,88 @@ create_test_user() { echo "$uid" } +# Create a test user (admin auth) with bounded retry-on-transient, echoing +# the new user's UUID on stdout. Returns 0 on success, 1 after retries. +# +# Why this exists (release-gate flake #1): +# Several security suites create a throwaway user as a SETUP step +# (token-revocation, lockout, force-password-change). User creation hashes +# the password with bcrypt in spawn_blocking. Under fleet-concurrent load +# the backend's blocking pool / worker runtime can be momentarily starved +# (the known availability worker-starvation issue: uncapped CPU-bound auth +# work on the tokio runtime), so an otherwise-valid admin POST +# /api/v1/users can transiently return 5xx or drop the connection +# (curl exit -> 000). The plain api_post helper uses `curl -sf`, which +# collapses any non-2xx into a bare non-zero exit with no body and NO +# retry, so a single transient blip fails the whole suite at setup time +# ("could not create force-password-change test user"). +# +# This helper retries ONLY the transient class (HTTP 5xx and network 000). +# Real client errors (400 invalid payload, 409 username taken, 401/403 +# auth) are returned immediately and NOT masked -- a duplicate-username or +# a malformed request is a genuine test bug, not a flake. +# +# Accepts EITHER the 3-arg short form (username password email) or a 4th arg +# giving a full JSON body (so callers that also set display_name reuse the +# same retry logic). The body, if given, must be a complete JSON object. +# +# Tunables (shared budget feel with create_repo / login_as): +# CREATE_USER_MAX_ATTEMPTS default 4 +# CREATE_USER_RETRY_DELAY default 1 (seconds; doubled each attempt) +# +# Usage: +# USER_ID=$(create_test_user_with_retry "$USER" "$PASS" "$EMAIL") || fail ... +# USER_ID=$(create_test_user_with_retry "$USER" "$PASS" "$EMAIL" "$JSON") || fail ... +create_test_user_with_retry() { + local username="$1" + local password="$2" + local email="$3" + local body="${4:-}" + if [ -z "$body" ]; then + body="{\"username\":\"${username}\",\"password\":\"${password}\",\"email\":\"${email}\"}" + fi + local _max="${CREATE_USER_MAX_ATTEMPTS:-4}" + local _delay="${CREATE_USER_RETRY_DELAY:-1}" + local _attempt _status _tmp _resp uid="" + for _attempt in $(seq 1 "$_max"); do + _tmp=$(mktemp) + _status=$(curl -s $CURL_TIMEOUT -o "$_tmp" -w '%{http_code}' \ + -X POST -H "$(auth_header)" -H 'Content-Type: application/json' \ + -d "$body" "${BASE_URL}/api/v1/users" 2>/dev/null) || _status="000" + _resp=$(cat "$_tmp" 2>/dev/null || true) + rm -f "$_tmp" + + if [ "$_status" -ge 200 ] 2>/dev/null && [ "$_status" -lt 300 ] 2>/dev/null; then + uid=$(echo "$_resp" | jq -r '.user.id // .id // .user_id // empty' 2>/dev/null) || uid="" + if [ -n "$uid" ] && [ "$uid" != "null" ]; then + echo "$uid" + return 0 + fi + # 2xx but no id: a contract break, not a transient blip. Don't retry. + echo "create_test_user_with_retry: ${username} got HTTP ${_status} but no id: ${_resp:0:200}" >&2 + echo "" + return 1 + fi + + # Retry ONLY transient class: network 000 or any 5xx. Everything else + # (4xx) is a real failure surfaced immediately. + if [ "$_status" != "000" ] && { [ "$_status" -lt 500 ] 2>/dev/null || [ "$_status" -ge 600 ] 2>/dev/null; }; then + echo "create_test_user_with_retry: ${username} non-transient HTTP ${_status}: ${_resp:0:200}" >&2 + echo "" + return 1 + fi + + if [ "$_attempt" -lt "$_max" ]; then + echo " create-user ${username} attempt ${_attempt}/${_max} transient HTTP ${_status}, retrying in ${_delay}s..." >&2 + sleep "$_delay" + _delay=$(( _delay * 2 )) + fi + done + echo "create_test_user_with_retry: ${username} failed after ${_max} attempts (last HTTP ${_status})" >&2 + echo "" + return 1 +} + # Log in as the named user and echo the access_token on stdout. # On failure, echoes empty string and the response body to stderr. # diff --git a/tests/security/test-regression-security.sh b/tests/security/test-regression-security.sh index e641d0c..09c317c 100755 --- a/tests/security/test-regression-security.sh +++ b/tests/security/test-regression-security.sh @@ -31,16 +31,15 @@ USER_ID_592="" TOKEN_ID_592="" begin_test "Bug #592: Create test user for token revocation listing" -if resp=$(api_post "/api/v1/users" \ - "{\"username\":\"${TEST_USER_592}\",\"password\":\"${TEST_PASS_592}\",\"email\":\"${TEST_EMAIL_592}\",\"display_name\":\"Bug 592 Test\"}" 2>/dev/null); then - USER_ID_592=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true - if [ -n "$USER_ID_592" ] && [ "$USER_ID_592" != "null" ]; then - pass - else - fail "user created but no ID returned" - fi +# Setup-step create: retry-with-backoff on transient 5xx/000 (fleet load can +# briefly starve the bcrypt blocking pool). 4xx (e.g. duplicate username) is +# surfaced immediately and never masked. See create_test_user_with_retry. +USER_ID_592=$(create_test_user_with_retry "$TEST_USER_592" "$TEST_PASS_592" "$TEST_EMAIL_592" \ + "{\"username\":\"${TEST_USER_592}\",\"password\":\"${TEST_PASS_592}\",\"email\":\"${TEST_EMAIL_592}\",\"display_name\":\"Bug 592 Test\"}") || true +if [ -n "$USER_ID_592" ] && [ "$USER_ID_592" != "null" ]; then + pass else - fail "could not create test user for bug #592" + fail "could not create test user for bug #592 (transient retries exhausted)" fi begin_test "Bug #592: Create API token" @@ -352,16 +351,13 @@ LOCKOUT_EMAIL="e2e-lockout-${RUN_ID}@test.local" LOCKOUT_USER_ID="" begin_test "Account lockout: Create test user" -if resp=$(api_post "/api/v1/users" \ - "{\"username\":\"${LOCKOUT_USER}\",\"password\":\"${LOCKOUT_PASS}\",\"email\":\"${LOCKOUT_EMAIL}\",\"display_name\":\"Lockout Test\"}" 2>/dev/null); then - LOCKOUT_USER_ID=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true - if [ -n "$LOCKOUT_USER_ID" ] && [ "$LOCKOUT_USER_ID" != "null" ]; then - pass - else - fail "user created but no ID returned" - fi +# Setup-step create: retry transient 5xx/000 only (see create_test_user_with_retry). +LOCKOUT_USER_ID=$(create_test_user_with_retry "$LOCKOUT_USER" "$LOCKOUT_PASS" "$LOCKOUT_EMAIL" \ + "{\"username\":\"${LOCKOUT_USER}\",\"password\":\"${LOCKOUT_PASS}\",\"email\":\"${LOCKOUT_EMAIL}\",\"display_name\":\"Lockout Test\"}") || true +if [ -n "$LOCKOUT_USER_ID" ] && [ "$LOCKOUT_USER_ID" != "null" ]; then + pass else - fail "could not create lockout test user" + fail "could not create lockout test user (transient retries exhausted)" fi begin_test "Account lockout: Detect lockout threshold" @@ -569,16 +565,15 @@ FPC_EMAIL="e2e-fpc-${RUN_ID}@test.local" FPC_USER_ID="" begin_test "Force password change: Create test user" -if resp=$(api_post "/api/v1/users" \ - "{\"username\":\"${FPC_USER}\",\"password\":\"${FPC_PASS}\",\"email\":\"${FPC_EMAIL}\",\"display_name\":\"Force PC Test\"}" 2>/dev/null); then - FPC_USER_ID=$(echo "$resp" | jq -r '.user.id // .id // .user_id // empty') || true - if [ -n "$FPC_USER_ID" ] && [ "$FPC_USER_ID" != "null" ]; then - pass - else - fail "user created but no ID returned" - fi +# Setup-step create: retry transient 5xx/000 only (see create_test_user_with_retry). +# This is the exact step that flaked ("could not create force-password-change +# test user") when a fleet-concurrent run starved the bcrypt blocking pool. +FPC_USER_ID=$(create_test_user_with_retry "$FPC_USER" "$FPC_PASS" "$FPC_EMAIL" \ + "{\"username\":\"${FPC_USER}\",\"password\":\"${FPC_PASS}\",\"email\":\"${FPC_EMAIL}\",\"display_name\":\"Force PC Test\"}") || true +if [ -n "$FPC_USER_ID" ] && [ "$FPC_USER_ID" != "null" ]; then + pass else - fail "could not create force-password-change test user" + fail "could not create force-password-change test user (transient retries exhausted)" fi begin_test "Force password change: Call force-password-change endpoint" diff --git a/tests/security/test-scan-dedup-short-circuit-1373.sh b/tests/security/test-scan-dedup-short-circuit-1373.sh index 3ad3da4..d005cac 100755 --- a/tests/security/test-scan-dedup-short-circuit-1373.sh +++ b/tests/security/test-scan-dedup-short-circuit-1373.sh @@ -198,14 +198,39 @@ esac # Wait for the first scan to be completed so trigger #2 will exercise the # short-circuit branch (not the race-recovery branch). -begin_test "Wait for scan #1 to reach completed" +# +# IMPORTANT (scan_type scoping): the backend registers MORE THAN ONE scanner +# (e.g. `dependency` + `grype`), and each one writes its OWN scan_results row +# per artifact. The #1373 dedup contract is therefore per-scan_type: every +# configured scanner that ran must end with EXACTLY ONE completed row for the +# artifact, and re-triggering must not add a second row for any scan_type. +# The original test asserted "exactly 1 completed row TOTAL", which only ever +# held when a single scanner happened to run -- with two applicable scanners +# a correctly-deduped artifact has two completed rows (one each), so the old +# assertion failed even on the sequential happy path. We scope the wait and +# both assertions per scan_type so the real dedup invariant is checked +# without weakening it. +# +# We also wait for ALL scan_types to reach a terminal state (no row left in +# `running`/`pending`) before triggering #2, so trigger #2 always exercises +# the completed-scan short-circuit rather than the race-recovery branch. +begin_test "Wait for scan #1 to reach completed (all scan_types terminal)" SCAN_LIST_PATH="/api/v1/security/artifacts/${ARTIFACT_ID}/scans" elapsed=0 poll_interval=5 -first_scan_id="" -first_state="" +all_terminal="no" network_fail_count=0 +# Terminal statuses for a scan row. `completed` is the dedup-eligible one; +# `not_applicable`/`failed` are terminal too (a scanner that didn't apply or +# errored should not block the wait forever). +is_all_terminal_jq=' + (.items | length) > 0 + and ([.items[] | (.status | ascii_downcase) + | select(. != "completed" and . != "not_applicable" and . != "failed")] + | length) == 0 +' + while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do http_status=$(curl -s -o "${WORK_DIR}/scans1.json" -w '%{http_code}' $CURL_TIMEOUT \ -H "$(auth_header)" \ @@ -214,10 +239,8 @@ while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do if [ "$http_status" = "200" ]; then network_fail_count=0 - state=$(jq -er '.items[0].status // empty | ascii_downcase' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo "") - if [ "$state" = "completed" ]; then - first_state="$state" - first_scan_id=$(jq -er '.items[0].id // empty' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo "") + if jq -e "$is_all_terminal_jq" < "${WORK_DIR}/scans1.json" >/dev/null 2>&1; then + all_terminal="yes" break fi elif [ "$http_status" = "000" ] || [ "$http_status" = "502" ] || \ @@ -232,10 +255,30 @@ while [ "$elapsed" -lt "$SCAN_TIMEOUT" ]; do elapsed=$(( elapsed + poll_interval )) done -if [ "$first_state" = "completed" ] && [ -n "$first_scan_id" ]; then +# Snapshot the per-scan_type completed-row map after trigger #1. Shape: +# { "dependency": "", "grype": "", ... } +# Only `completed` rows are recorded (the dedup-eligible state). At least one +# scan_type must have completed for the short-circuit to be testable. +first_completed_by_type='{}' +first_completed_count=0 +if [ "$all_terminal" = "yes" ]; then + first_completed_by_type=$(jq -c ' + [.items[] | select((.status|ascii_downcase) == "completed")] + | group_by(.scan_type) + | map({(.[0].scan_type): (.[0].id)}) + | add // {} + ' < "${WORK_DIR}/scans1.json" 2>/dev/null || echo '{}') + first_completed_count=$(jq -r '[.items[] | select((.status|ascii_downcase)=="completed")] | length' \ + < "${WORK_DIR}/scans1.json" 2>/dev/null || echo 0) +fi + +if [ "$all_terminal" = "yes" ] && [ "${first_completed_count:-0}" -ge 1 ] 2>/dev/null; then pass else - fail "scan #1 did not reach 'completed' within ${SCAN_TIMEOUT}s (last state: '${first_state:-none}')" + last_states=$(jq -c '[.items[] | {scan_type, status}]' < "${WORK_DIR}/scans1.json" 2>/dev/null | cut -c1-300) + fail "scan #1 did not reach a terminal state with >=1 completed row within ${SCAN_TIMEOUT}s" \ +"all_terminal=${all_terminal} completed_rows=${first_completed_count} +states: ${last_states}" fi # ------------------------------------------------------------------------- @@ -269,60 +312,100 @@ fi # second trigger because the placeholder insert is synchronous. sleep 3 -# ------------------------------------------------------------------------- -# Load-bearing assertion #1: per-artifact list shows exactly 1 completed row. -# -# This is the symptom the customer saw: a second `completed` row appearing -# for what should have been a single logical scan. Pin the count to exactly 1. -# ------------------------------------------------------------------------- - -begin_test "Per-artifact scans list has exactly 1 completed row (#1373 regression)" +# Snapshot scan state after trigger #2 settles, then run the two assertions. list_status=$(curl -s -o "${WORK_DIR}/scans2.json" -w '%{http_code}' $CURL_TIMEOUT \ -H "$(auth_header)" \ -H "Accept: application/json" \ "${BASE_URL}${SCAN_LIST_PATH}") || list_status="000" +# ------------------------------------------------------------------------- +# Load-bearing assertion #1: EXACTLY ONE completed row PER scan_type. +# +# This is the precise #1373 dedup contract. The backend runs >=1 scanner and +# each writes its own row, so "one completed row total" is wrong; the real +# invariant is that no single scan_type accumulates a second completed row. +# We assert the max completed-count across scan_types is exactly 1 (and that +# at least one scan_type completed). A duplicate placeholder promoted to +# 'completed' for ANY scan_type makes that scan_type's count >= 2 and fails +# here -- which is exactly the symptom we must keep catching. We do NOT +# loosen this to "<= N"; per-scan_type the bound stays hard at 1. +# ------------------------------------------------------------------------- +begin_test "Each scan_type has exactly 1 completed row (#1373 regression)" if [ "$list_status" != "200" ]; then fail "scan-list returned HTTP ${list_status}" else - completed_count=$(jq '[.items[] | select((.status | ascii_downcase) == "completed")] | length' \ - < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "0") - total_count=$(jq '.items | length // 0' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "0") - if [ "$completed_count" = "1" ]; then + # Per-scan_type completed counts, e.g. [{"scan_type":"grype","completed":1},...] + per_type=$(jq -c ' + [.items[] | select((.status|ascii_downcase) == "completed")] + | group_by(.scan_type) + | map({scan_type: .[0].scan_type, completed: length}) + ' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo '[]') + num_types=$(echo "$per_type" | jq 'length' 2>/dev/null || echo 0) + max_per_type=$(echo "$per_type" | jq '[.[].completed] | max // 0' 2>/dev/null || echo 0) + + if [ "${num_types:-0}" -ge 1 ] 2>/dev/null && [ "${max_per_type:-0}" = "1" ]; then pass else - snippet=$(jq -c '[.items[] | {id, status}]' < "${WORK_DIR}/scans2.json" 2>/dev/null | cut -c 1-500) - fail "expected exactly 1 completed scan for artifact ${ARTIFACT_ID}, got ${completed_count} (total rows: ${total_count})" \ -"This is the #1373 symptom: the second trigger_scan call inserted a new -placeholder row that the worker promoted to 'completed', leaving 2+ -completed rows where the short-circuit should have returned the existing + snippet=$(jq -c '[.items[] | {id, scan_type, status, is_reused}]' < "${WORK_DIR}/scans2.json" 2>/dev/null | cut -c 1-700) + fail "dedup contract broken: a scan_type has >1 completed row for artifact ${ARTIFACT_ID} (scan_types=${num_types}, max completed/type=${max_per_type})" \ +"This is the #1373 symptom: a trigger_scan call inserted a new placeholder +row that the worker promoted to 'completed', leaving 2+ completed rows for +one scan_type where the short-circuit should have returned the existing scan_id. +per-scan_type completed counts: ${per_type} scans (projected): ${snippet} endpoint: GET ${BASE_URL}${SCAN_LIST_PATH} -first scan_id: ${first_scan_id}" +completed-by-type after trigger #1: ${first_completed_by_type} + +NOTE: if this fails ONLY under concurrent load (and passes sequentially), +the cause is the known check-then-act race in +ScannerService::prepare_artifact_scan: find_existing_scan_for_artifact +(SELECT) and create_scan_result_with_checksum (INSERT) are not atomic and +there is no unique constraint on (artifact_id, scan_type, checksum_sha256), +so concurrent triggers each insert a fresh placeholder. That is a REAL +backend defect, not a test bug -- do NOT mask it by loosening this bound." fi fi # ------------------------------------------------------------------------- -# Load-bearing assertion #2: the second-most-recent (if present) is the -# same scan_id as the first. Asserting this on top of the count-of-1 -# assertion catches a regression where a placeholder is inserted then -# garbage-collected (count=1 again, but with a different id than expected). -# -# If the list has only one row, the id check trivially holds. +# Load-bearing assertion #2: the completed scan_id for each scan_type is +# UNCHANGED from the post-trigger-#1 snapshot. The short-circuit must return +# the EXISTING id; a different id for any scan_type means a fresh placeholder +# was inserted and promoted to 'completed' (the regression #1388 fixes), even +# if some janitor later collapsed the count back to 1. # ------------------------------------------------------------------------- - -begin_test "Latest completed scan_id equals first scan_id (no duplicate placeholder)" -latest_completed_id=$(jq -er '[.items[] | select((.status | ascii_downcase) == "completed")][0].id // empty' \ - < "${WORK_DIR}/scans2.json" 2>/dev/null || echo "") -if [ -n "$latest_completed_id" ] && [ "$latest_completed_id" = "$first_scan_id" ]; then - pass +begin_test "Completed scan_id per scan_type unchanged after re-trigger (no dup placeholder)" +if [ "$list_status" != "200" ]; then + fail "scan-list returned HTTP ${list_status}" else - fail "latest completed scan_id (${latest_completed_id:-}) does not match first scan_id (${first_scan_id})" \ -"The short-circuit must return the existing scan_id; a different id here -means a fresh placeholder was inserted and promoted to 'completed' (the -exact regression #1388 fixes)." + second_completed_by_type=$(jq -c ' + [.items[] | select((.status|ascii_downcase) == "completed")] + | group_by(.scan_type) + | map({(.[0].scan_type): (.[0].id)}) + | add // {} + ' < "${WORK_DIR}/scans2.json" 2>/dev/null || echo '{}') + + # Every scan_type present in the trigger-#1 snapshot must map to the SAME + # completed id after trigger #2. New scan_types appearing is also a dup + # signal, so require the two maps to be equal. + if [ "$first_completed_by_type" = "$second_completed_by_type" ] && [ "$first_completed_by_type" != "{}" ]; then + pass + elif echo "$first_completed_by_type" | jq -e \ + --argjson after "$second_completed_by_type" \ + 'to_entries | all(.value == ($after[.key])) and (length > 0) and (($after | length) == length)' \ + >/dev/null 2>&1; then + # Robust equality independent of key ordering. + pass + else + fail "completed scan_id changed for some scan_type after re-trigger" \ +"The short-circuit must return the existing scan_id per scan_type; a changed +or added id means a fresh placeholder was inserted and promoted to +'completed' (the exact regression #1388 fixes). + +after trigger #1: ${first_completed_by_type} +after trigger #2: ${second_completed_by_type}" + fi fi end_suite diff --git a/tests/webhooks/test-webhook-retry-recover.sh b/tests/webhooks/test-webhook-retry-recover.sh index 0b8f05c..03abd48 100755 --- a/tests/webhooks/test-webhook-retry-recover.sh +++ b/tests/webhooks/test-webhook-retry-recover.sh @@ -12,22 +12,56 @@ # release-gate stays under WEBHOOK_RETRY_TIMEOUT (default 360s). The first # three intervals (30s, 60s, 120s base) plus +/-20% jitter give acceptable # windows of: -# attempt 1 -> 24-36s -# attempt 2 -> 48-72s (cumulative 72-108s) -# attempt 3 -> 96-144s (cumulative 168-252s) +# attempt 1 -> 24-36s backoff +# attempt 2 -> 48-72s backoff +# attempt 3 -> 96-144s backoff # The full 12-attempt schedule is exercised in the dead-letter test. # -# Receiver discovery: -# WEBHOOK_RECEIVER_PORT - port the local mock listens on (default 18765). +# IMPORTANT — what we actually measure, and the scheduler-tick quantization +# that the (now fixed) windows MUST account for: +# +# The backend does NOT fire a retry at the exact instant next_retry_at +# elapses. A background scheduler tick runs every 30s +# (services/scheduler_service.rs: interval(Duration::from_secs(30))) and +# only then drains rows whose next_retry_at <= NOW(). So the *observed* gap +# between two consecutive POSTs at the receiver is: +# +# observed_gap = backoff(attempt) + [0 .. 30s tick latency] + dial +# +# i.e. the backoff rounded UP to the next 30s scheduler tick. The previous +# version of this test asserted the raw backoff window (24-36s etc.) and +# IGNORED tick quantization, so a perfectly healthy backend whose 30s base +# delay landed mid-tick was measured as e.g. a 55s gap and reported +# "outside 24-36". The windows below add a +30s (one full tick) upper +# allowance and a small floor. +# +# Note also: this test must KEEP the receiver FAILING through attempt 3. +# If the mock recovers (returns 200) after the first failure the delivery +# row is marked success/next_retry_at=NULL and NO further POSTs occur — you +# can never observe a 2nd or 3rd retry interval. WEBHOOK_FAIL_FIRST_N is +# therefore defaulted to EXPECT_ATTEMPTS so the receiver rejects attempts +# 1..3 (producing the 3 spaced retry POSTs we measure) and accepts the 4th, +# proving the recover path. +# +# Receiver discovery / isolation: +# WEBHOOK_RECEIVER_PORT - port the local mock listens on. Defaults to a +# per-PID port so concurrent webhook suites in the +# same gate run do NOT share one receiver. A shared +# receiver was the root cause of the "all deltas=0s" +# flake: a second suite's deliveries drained in the +# same 30s scheduler tick landed as near-simultaneous +# POSTs in this suite's log, so every measured gap +# collapsed to 0s. # WEBHOOK_RECEIVER_URL - URL the backend POSTs to. Defaults to the # local mock; override in environments where # the backend cannot reach loopback. # WEBHOOK_FAIL_FIRST_N - how many POSTs the mock should reject with -# 500 before flipping to 200 (default 1, so we -# observe the recover path on the first retry). -# WEBHOOK_RETRY_TIMEOUT - seconds to wait for the recover round-trip -# (default 360 to cover up to attempt 3 with -# jitter headroom). +# 500 before flipping to 200. Defaults to 3 so we +# observe THREE spaced retries then recover on the +# 4th. (Must be >= EXPECT_ATTEMPTS.) +# WEBHOOK_RETRY_TIMEOUT - seconds to wait for the full attempt-3 round-trip +# (default 480 to cover backoff 30+60+120s plus up +# to three 30s scheduler ticks plus jitter headroom). # # Companion backend PR: artifact-keeper#1140. Producer feature flag is # WEBHOOKS_V2_PRODUCER_ENABLED; harness sets it to 1. @@ -38,10 +72,19 @@ source "$(dirname "$0")/../lib/common.sh" begin_suite "webhook-retry-recover" -WEBHOOK_RECEIVER_PORT="${WEBHOOK_RECEIVER_PORT:-18765}" +# Per-PID default port so concurrent webhook suites never share a receiver +# (shared-receiver simultaneous drains were the "all deltas=0s" root cause). +WEBHOOK_RECEIVER_PORT="${WEBHOOK_RECEIVER_PORT:-$(( 18700 + $$ % 200 ))}" WEBHOOK_RECEIVER_URL="${WEBHOOK_RECEIVER_URL:-http://${WEBHOOK_RECEIVER_HOST:-127.0.0.1}:${WEBHOOK_RECEIVER_PORT}/hook}" -WEBHOOK_FAIL_FIRST_N="${WEBHOOK_FAIL_FIRST_N:-1}" -WEBHOOK_RETRY_TIMEOUT="${WEBHOOK_RETRY_TIMEOUT:-360}" +# Number of attempts we observe a spaced retry for (attempt 1, 2, 3). +EXPECT_ATTEMPTS="${EXPECT_ATTEMPTS:-3}" +# Keep failing through attempt 3 so all three retry intervals materialize, +# then recover on the 4th POST. MUST be >= EXPECT_ATTEMPTS. +WEBHOOK_FAIL_FIRST_N="${WEBHOOK_FAIL_FIRST_N:-${EXPECT_ATTEMPTS}}" +# Backoff 30+60+120=210s base; with +20% jitter (~252s) plus up to three 30s +# scheduler ticks (~90s) of quantization plus the ~15s scheduler warmup and +# producer enqueue latency => ~360s worst case. 480s gives load headroom. +WEBHOOK_RETRY_TIMEOUT="${WEBHOOK_RETRY_TIMEOUT:-480}" WEBHOOK_RECEIVER_LOG="${WEBHOOK_RECEIVER_LOG:-/tmp/mock-webhook-receiver-${RUN_ID}.log}" RECEIVER_PID="" @@ -89,6 +132,8 @@ else sleep 0.2 done if [ "$ready" = true ]; then + # Reset any stale state if the port happened to be reused by a prior run. + curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__reset" >/dev/null 2>&1 || true pass else fail "mock receiver did not come up on 127.0.0.1:${WEBHOOK_RECEIVER_PORT}" @@ -131,26 +176,67 @@ fi # create a repository (-> repository.created -> producer inserts a delivery # row). The mock rejects the first WEBHOOK_FAIL_FIRST_N POSTs with 500, which # drives the retry-then-recover path we assert on below. +# +# Isolation against sibling deliveries: this webhook is global (repository_id +# NULL), so the producer also enqueues a delivery for it on repository.created +# events fired by OTHER suites running concurrently. Those sibling deliveries +# would land as extra POSTs in this receiver's log and, because they drain in +# the same 30s scheduler tick, collapse the measured retry gaps to 0s -- the +# original flake. We defend against that by capturing THIS repository's id and +# filtering the receiver log down to only the POSTs whose payload .entity_id +# matches it (see select_repo_posts below). Every delivery payload carries +# entity_id = the created repo's UUID (services/webhook_producer.rs +# build_event_payload), so the filter isolates our own retry sequence exactly. # ------------------------------------------------------------------------- +REPO_ID="" + +# Echo, oldest-first, the .ts of every receiver-log POST whose payload +# .entity_id matches our repo (falls back to all POSTs if entity_id could not +# be determined, e.g. an older producer that omitted it). +select_repo_posts() { + if [ -n "$REPO_ID" ]; then + jq -r --arg rid "$REPO_ID" \ + 'select((.body | fromjson? | .entity_id) == $rid) | .ts' \ + "$WEBHOOK_RECEIVER_LOG" 2>/dev/null + else + jq -r '.ts' "$WEBHOOK_RECEIVER_LOG" 2>/dev/null + fi +} + +# Echo, oldest-first, the full JSON record of every receiver-log POST whose +# payload .entity_id matches our repo (one compact JSON object per line). +select_repo_records() { + if [ -n "$REPO_ID" ]; then + jq -c --arg rid "$REPO_ID" \ + 'select((.body | fromjson? | .entity_id) == $rid)' \ + "$WEBHOOK_RECEIVER_LOG" 2>/dev/null + else + jq -c '.' "$WEBHOOK_RECEIVER_LOG" 2>/dev/null + fi +} + begin_test "Trigger delivery via repository.created and observe initial POST at receiver" if [ "$SUITE_BLOCKED" = "true" ] || [ -z "$WEBHOOK_ID" ]; then skip "no webhook id" else REPO_CREATE_PAYLOAD="{\"key\":\"${REPO_KEY}\",\"name\":\"${REPO_KEY}\",\"format\":\"generic\",\"repo_type\":\"local\",\"is_public\":true}" - if api_post "/api/v1/repositories" "$REPO_CREATE_PAYLOAD" >/dev/null 2>&1; then + if repo_resp=$(api_post "/api/v1/repositories" "$REPO_CREATE_PAYLOAD" 2>/dev/null); then + REPO_ID=$(echo "$repo_resp" | jq -r '.id // empty') seen=0 # First attempt is rejected (500) by the mock but still lands as a POST; # allow generous headroom for the producer's async enqueue + first dial. + # Count only POSTs that belong to OUR repo so a sibling suite's delivery + # cannot satisfy this gate. for _ in $(seq 1 30); do - seen=$(curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__count" 2>/dev/null || echo 0) + seen=$(select_repo_posts | grep -c . 2>/dev/null || echo 0) [ "$seen" -gt 0 ] && break sleep 1 done if [ "$seen" -ge 1 ]; then pass else - fail "receiver saw 0 POSTs after repository.created event" + fail "receiver saw 0 POSTs for repo ${REPO_KEY} after repository.created event" fi else SUITE_BLOCKED=true @@ -159,60 +245,79 @@ else fi # ------------------------------------------------------------------------- -# Retry windows for v2 schedule (with +/-20% jitter applied): -# attempt 1: 24..36s after the failed try -# attempt 2: 48..72s after attempt 1 -# attempt 3: 96..144s after attempt 2 -# We observe the timestamps of consecutive POSTs at the receiver and -# assert each gap is within window. For the smoke profile we cap at 3 -# attempts (covers attempt 1, 2, 3 deltas). +# Retry windows. We observe the wall-clock timestamps of consecutive POSTs +# at the receiver and assert each gap is within window. The window is the +# jittered backoff PLUS the scheduler-tick quantization (a retry only fires +# on the next 30s tick after next_retry_at elapses): +# +# observed_gap(attempt) in [ backoff_min , backoff_max + 30s_tick + slack ] +# +# attempt 1: backoff 24..36 -> window 20..72 (36 + 30 tick + 6 slack) +# attempt 2: backoff 48..72 -> window 40..108 (72 + 30 tick + 6 slack) +# attempt 3: backoff 96..144 -> window 84..180 (144 + 30 tick + 6 slack) +# +# The lower bound is set slightly below backoff_min to tolerate a POST that +# lands a hair early on a fast tick / clock skew, but stays WELL above 0 so a +# collapsed/simultaneous-drain run (the original flake) still FAILS loudly: +# this remains a meaningful assertion that retries ARE spaced by backoff. # ------------------------------------------------------------------------- -EXPECT_ATTEMPTS=3 -RETRY_MIN=(24 48 96) -RETRY_MAX=(36 72 144) - -# Mark the receiver log so we can read deltas freshly. -LOG_BASELINE_LINES=0 -if [ -f "$WEBHOOK_RECEIVER_LOG" ]; then - LOG_BASELINE_LINES=$(wc -l < "$WEBHOOK_RECEIVER_LOG" | tr -d ' ') -fi +RETRY_MIN=(20 40 84) +RETRY_MAX=(72 108 180) begin_test "Retry intervals for attempts 1..3 fall within v2 schedule (+/-20% jitter)" if [ "$SUITE_BLOCKED" = "true" ] || [ -z "$WEBHOOK_ID" ]; then skip "no webhook id" else - # Wait for at least EXPECT_ATTEMPTS + 1 entries (the initial delivery - # plus the retry attempts). Bail at WEBHOOK_RETRY_TIMEOUT. + # We need EXPECT_ATTEMPTS+1 POSTs for our repo: the initial failing dial + # plus one POST per observed retry (attempts 1..N), where the (N+1)th POST + # is the one whose preceding gap closes attempt N. With FAIL_FIRST_N >= + # EXPECT_ATTEMPTS the receiver keeps rejecting through attempt N, so all N + # spaced retries materialize. expected_entries=$(( EXPECT_ATTEMPTS + 1 )) + + # Poll on the count of OUR repo's POSTs (entity_id filtered), and require it + # to be STABLE for one extra poll before measuring -- this guards against + # reading mid-write (a half-written final line) and against a retry POST + # that is in-flight when we snapshot. Bail at WEBHOOK_RETRY_TIMEOUT. elapsed=0 poll=5 + prev_seen=-1 + stable=0 + observed=0 while [ "$elapsed" -lt "$WEBHOOK_RETRY_TIMEOUT" ]; do - seen=$(curl -sf --max-time 2 "http://127.0.0.1:${WEBHOOK_RECEIVER_PORT}/__count" 2>/dev/null || echo 0) - if [ "$seen" -ge "$expected_entries" ]; then - break + observed=$(select_repo_posts | grep -c . 2>/dev/null || echo 0) + if [ "$observed" -ge "$expected_entries" ]; then + if [ "$observed" -eq "$prev_seen" ]; then + stable=$(( stable + 1 )) + [ "$stable" -ge 1 ] && break + else + stable=0 + fi fi + prev_seen="$observed" sleep "$poll" elapsed=$(( elapsed + poll )) done - total_lines=$(wc -l < "$WEBHOOK_RECEIVER_LOG" 2>/dev/null | tr -d ' ' || echo 0) - new_lines=$(( total_lines - LOG_BASELINE_LINES )) - if [ "$new_lines" -lt "$expected_entries" ]; then - skip "only ${new_lines} delivery entries observed within ${WEBHOOK_RETRY_TIMEOUT}s; producer may not be running, or jitter pushed timing out of window" + if [ "$observed" -lt "$expected_entries" ]; then + fail "only ${observed} POST(s) for repo ${REPO_KEY} observed within ${WEBHOOK_RETRY_TIMEOUT}s (need ${expected_entries}); the retry scheduler did not deliver the expected attempts (FAIL_FIRST_N=${WEBHOOK_FAIL_FIRST_N})" else - # Pull the last `expected_entries` timestamps (in seconds, float). - mapfile -t ts_lines < <(tail -n "$expected_entries" "$WEBHOOK_RECEIVER_LOG" \ - | jq -r '.ts' 2>/dev/null) + # Pull the FIRST `expected_entries` timestamps for our repo, oldest-first. + # (Oldest-first is the genuine attempt order; taking the tail could splice + # in a stray late POST.) + mapfile -t ts_lines < <(select_repo_posts | head -n "$expected_entries") if [ "${#ts_lines[@]}" -lt "$expected_entries" ]; then - fail "could not parse ${expected_entries} timestamps from receiver log" + fail "could not parse ${expected_entries} timestamps for repo ${REPO_KEY} from receiver log" else bad="" + deltas="" for i in $(seq 1 "$EXPECT_ATTEMPTS"); do prev="${ts_lines[$((i-1))]}" curr="${ts_lines[$i]}" # Integer delta (sec). awk handles the float subtraction. delta=$(awk -v a="$curr" -v b="$prev" 'BEGIN { printf "%d", (a - b) }') + deltas="${deltas} a${i}=${delta}s" min="${RETRY_MIN[$((i-1))]}" max="${RETRY_MAX[$((i-1))]}" if [ "$delta" -lt "$min" ] || [ "$delta" -gt "$max" ]; then @@ -220,9 +325,11 @@ else fi done if [ -z "$bad" ]; then + echo " observed retry gaps:${deltas}" pass else - fail "retry interval(s) outside v2 +/-20% window:${bad}" + fail "retry interval(s) outside v2 backoff+tick window:${bad}" \ + "observed gaps:${deltas}; windows attempt1=${RETRY_MIN[0]}-${RETRY_MAX[0]} attempt2=${RETRY_MIN[1]}-${RETRY_MAX[1]} attempt3=${RETRY_MIN[2]}-${RETRY_MAX[2]}. delta=0 across the board => simultaneous drain (receiver contamination) not real backoff." fi fi fi @@ -238,11 +345,13 @@ begin_test "X-ArtifactKeeper-Retry-Attempt header present on retry deliveries" if [ "$SUITE_BLOCKED" = "true" ] || [ ! -s "$WEBHOOK_RECEIVER_LOG" ]; then skip "no receiver log" else - total_lines=$(wc -l < "$WEBHOOK_RECEIVER_LOG" 2>/dev/null | tr -d ' ' || echo 0) - if [ "$total_lines" -lt 2 ]; then - skip "only ${total_lines} delivery entry/entries; cannot check retry header" + # Use the SECOND POST for OUR repo (entity_id filtered): the first POST is + # the initial delivery (no retry header), the second is retry attempt 1. + repo_post_count=$(select_repo_records | grep -c . 2>/dev/null || echo 0) + if [ "$repo_post_count" -lt 2 ]; then + fail "only ${repo_post_count} delivery POST(s) for repo ${REPO_KEY}; cannot check retry header (expected the first retry to have fired)" else - second_headers=$(sed -n '2p' "$WEBHOOK_RECEIVER_LOG" | jq -r '.headers') + second_headers=$(select_repo_records | sed -n '2p' | jq -r '.headers') retry_attempt=$(echo "$second_headers" | jq -r ' (.["X-ArtifactKeeper-Retry-Attempt"] // .["x-artifactkeeper-retry-attempt"] //