From 3996c5430b6fe4c3fb325ef3c033025696acb8c7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Jun 2026 05:29:48 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20sponsor=20monitor=20hardening=20?= =?UTF-8?q?=E2=80=94=20freshness=20SLA,=20alerting,=20scheduler=20checks,?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/sponsor-monitor-cron.yml | 16 ++ client/src/components/HeroSection.tsx | 45 ++++- server/routes/__tests__/ops.test.ts | 121 ++++++++++++++ server/routes/health.ts | 97 ++++++++--- server/routes/ops.ts | 33 ++++ server/routes/sponsorPages.ts | 14 +- .../utils/__tests__/freshnessHealth.test.ts | 158 ++++++++++++++++++ server/utils/scheduler.ts | 21 ++- server/utils/sponsorMonitorJob.ts | 21 ++- 9 files changed, 495 insertions(+), 31 deletions(-) create mode 100644 server/utils/__tests__/freshnessHealth.test.ts diff --git a/.github/workflows/sponsor-monitor-cron.yml b/.github/workflows/sponsor-monitor-cron.yml index 50d0c3a..056101c 100644 --- a/.github/workflows/sponsor-monitor-cron.yml +++ b/.github/workflows/sponsor-monitor-cron.yml @@ -57,3 +57,19 @@ jobs: cat /tmp/response.json exit 1 fi + + # Annotate the workflow run with a clear diagnosis when the ping fails. + # This surfaces silent scheduler failures in the GitHub Actions UI and + # the notification emails sent to repo watchers. + - name: Annotate failure cause + if: failure() + run: | + STATUS="${{ steps.ping.outputs.http_status }}" + BODY=$(cat /tmp/response.json 2>/dev/null || echo "(no response body)") + if [ "$STATUS" = "401" ]; then + echo "::error title=Cron-Ping Auth Failure::CRON_SECRET mismatch between GitHub Actions secret and deployment env var. The sponsor monitor will NOT run until this is fixed. HTTP $STATUS: $BODY" + elif [ "$STATUS" = "503" ]; then + echo "::error title=Cron-Ping Not Configured::CRON_SECRET is not set in the deployment environment. Sponsor monitor external trigger is disabled. HTTP $STATUS: $BODY" + else + echo "::error title=Cron-Ping Unexpected Failure::HTTP $STATUS — $BODY" + fi diff --git a/client/src/components/HeroSection.tsx b/client/src/components/HeroSection.tsx index c0bc09e..bddeab4 100644 --- a/client/src/components/HeroSection.tsx +++ b/client/src/components/HeroSection.tsx @@ -161,11 +161,13 @@ function RecentlyRevokedSection() { interface NightlyStats { totalActive: number; lastRunDate: string | null; + lastSuccessfulRunAt: string | null; addedCount: number; removedCount: number; changesCount: number; revokedLast12Months: number; staleDays: number; + hoursStale: number; } function formatRunDate(dateStr: string | null): string { @@ -226,7 +228,17 @@ function NightlyStatsBar() {
{dateLabel}
-Register last checked
++ Register last checked + {!isLoading && data?.lastSuccessfulRunAt && ( + + {new Date(data.lastSuccessfulRunAt).toLocaleTimeString("en-GB", { hour: "2-digit", minute: "2-digit", timeZone: "UTC" })} UTC + + )} +
+ Register data is out of date — last checked {formatRunDate(lastRunDate)}.{" "} + No update in {hoursStale}h. The nightly pipeline may be experiencing an issue. +
+- Register data may be out of date — last checked {formatRunDate(lastRunDate)}. The nightly update may have been delayed. + Register data may be out of date — last checked {formatRunDate(lastRunDate)}.{" "} + The nightly update may have been delayed.
The external cron-ping endpoint received a request but CRON_SECRET is not set on this deployment.
The sponsor monitor will not be triggered by the GitHub Actions schedule until this env var is configured.
+Request IP: ${req.ip ?? "unknown"} — ${new Date().toISOString()}
`, + ).catch(() => {}); return res.status(503).json({ message: "External cron not configured." }); } @@ -817,9 +832,27 @@ export function registerOpsRoutes(app: Express): void { if (!provided || provided.length !== cronSecret.length || !crypto.timingSafeEqual(Buffer.from(provided), Buffer.from(cronSecret))) { log.warn({ ip: req.ip }, "[CronPing] Invalid or missing secret."); + cronPingAuthFailureCount += 1; + const now = Date.now(); + if ( + cronPingAuthFailureCount >= CRON_PING_AUTH_ALERT_THRESHOLD && + now - cronPingAuthAlertLastSentAt > CRON_PING_AUTH_ALERT_COOLDOWN_MS + ) { + cronPingAuthAlertLastSentAt = now; + sendAdminAlert( + "🔐 CheckByAI: Repeated cron-ping auth failures — possible config drift", + `${cronPingAuthFailureCount} consecutive cron-ping authentication failures have been detected.
+This usually means the CRON_SECRET in GitHub Actions secrets does not match the deployment env var.
The sponsor monitor will not run via the external schedule until this is fixed.
+Last failure: ${new Date().toISOString()} — IP: ${req.ip ?? "unknown"}
`, + ).catch(() => {}); + } return res.status(401).json({ message: "Unauthorized." }); } + // Successful auth — reset failure counter. + cronPingAuthFailureCount = 0; + const today = new Date().toISOString().split("T")[0]; const existing = await db .select({ status: monitorJobRuns.status }) diff --git a/server/routes/sponsorPages.ts b/server/routes/sponsorPages.ts index 887a92f..423bf03 100644 --- a/server/routes/sponsorPages.ts +++ b/server/routes/sponsorPages.ts @@ -279,7 +279,7 @@ export function registerSponsorPageRoutes(app: Express): void { // This powers the "Register last checked" stat — shows when the job actually ran, // independent of which digest is displayed. db - .select({ runDate: monitorJobRuns.runDate }) + .select({ runDate: monitorJobRuns.runDate, completedAt: monitorJobRuns.completedAt }) .from(monitorJobRuns) .where(eq(monitorJobRuns.status, "success")) .orderBy(desc(monitorJobRuns.runDate)) @@ -289,23 +289,33 @@ export function registerSponsorPageRoutes(app: Express): void { const totalActive = countResult[0]?.total ?? 0; const revokedLast12Months = revokedResult[0]?.total ?? 0; const latest = digestRows[0] ?? null; + const lastRun = lastRunRows[0] ?? null; // Use the last successful job run date as "Register last checked". // Fall back to the active digest's snapshot date if no run recorded yet. - const lastRunDate = lastRunRows[0]?.runDate ?? latest?.snapshotDate ?? null; + const lastRunDate = lastRun?.runDate ?? latest?.snapshotDate ?? null; + // Precise UTC timestamp of the last successful completion — used for <24h freshness check. + const lastSuccessfulRunAt = lastRun?.completedAt?.toISOString() ?? null; + const now = Date.now(); const today = new Date().toISOString().split("T")[0]; const staleDays = lastRunDate ? Math.round((Date.parse(today) - Date.parse(lastRunDate)) / 86400000) : 0; + // Hours since last successful run (precise, from completedAt if available). + const hoursStale = lastSuccessfulRunAt + ? Math.floor((now - Date.parse(lastSuccessfulRunAt)) / 3_600_000) + : staleDays * 24; const payload = { totalActive, lastRunDate, + lastSuccessfulRunAt, addedCount: latest?.addedCount ?? 0, removedCount: latest?.removedCount ?? 0, changesCount: latest?.updatedCount ?? 0, revokedLast12Months, staleDays, + hoursStale, }; // Cache for 5 minutes — balances freshness with DB load. // Flushed immediately by sponsorMonitorJob after each nightly run. diff --git a/server/utils/__tests__/freshnessHealth.test.ts b/server/utils/__tests__/freshnessHealth.test.ts new file mode 100644 index 0000000..27c7576 --- /dev/null +++ b/server/utils/__tests__/freshnessHealth.test.ts @@ -0,0 +1,158 @@ +/** + * Tests for sponsor-monitor freshness status classification logic. + * + * These tests cover the classification rules introduced to replace the + * old `staleDays >= 3` threshold with an hours-based tiered system: + * ≤24h → ok + * 24–48h → warn + * >48h → critical + */ + +import { describe, expect, it } from "vitest"; + +// ── Pure classification helper (mirrors the logic in health.ts) ────────────── + +type FreshnessStatus = "ok" | "warn" | "critical" | "running" | "unknown"; + +function classifyFreshness(opts: { + jobRunning: boolean; + hoursSinceSuccess: number | null; + lastRunFailed?: boolean; +}): { status: FreshnessStatus; staleReason: string | null } { + const { jobRunning, hoursSinceSuccess, lastRunFailed } = opts; + + if (jobRunning) { + return { status: "running", staleReason: null }; + } + if (hoursSinceSuccess !== null) { + if (hoursSinceSuccess <= 24) { + return { status: "ok", staleReason: null }; + } + if (hoursSinceSuccess <= 48) { + return { + status: "warn", + staleReason: `No successful run in ${hoursSinceSuccess}h (warn threshold: 24h).`, + }; + } + return { + status: "critical", + staleReason: `No successful run in ${hoursSinceSuccess}h (critical threshold: 48h).`, + }; + } + if (lastRunFailed) { + return { status: "warn", staleReason: "Last run failed." }; + } + return { status: "unknown", staleReason: null }; +} + +// ── Pure stale-banner helper (mirrors the logic in HeroSection.tsx) ────────── + +type BannerSeverity = "ok" | "warn" | "critical"; + +function staleBannerSeverity(hoursStale: number): BannerSeverity { + if (hoursStale > 48) return "critical"; + if (hoursStale > 24) return "warn"; + return "ok"; +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + +describe("classifyFreshness — health endpoint status", () => { + it("returns ok when job ran 1h ago", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 1 }); + expect(result.status).toBe("ok"); + expect(result.staleReason).toBeNull(); + }); + + it("returns ok when job ran exactly 24h ago (boundary)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 24 }); + expect(result.status).toBe("ok"); + }); + + it("returns warn when job ran 25h ago (just over 24h threshold)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 25 }); + expect(result.status).toBe("warn"); + expect(result.staleReason).toContain("warn threshold: 24h"); + }); + + it("returns warn when job ran exactly 48h ago (boundary)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 48 }); + expect(result.status).toBe("warn"); + }); + + it("returns critical when job ran 49h ago (just over 48h threshold)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 49 }); + expect(result.status).toBe("critical"); + expect(result.staleReason).toContain("critical threshold: 48h"); + }); + + it("returns critical when job ran 72h ago", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 72 }); + expect(result.status).toBe("critical"); + }); + + it("returns running when job is currently executing", () => { + const result = classifyFreshness({ jobRunning: true, hoursSinceSuccess: null }); + expect(result.status).toBe("running"); + expect(result.staleReason).toBeNull(); + }); + + it("returns running regardless of hoursSinceSuccess", () => { + const result = classifyFreshness({ jobRunning: true, hoursSinceSuccess: 100 }); + expect(result.status).toBe("running"); + }); + + it("returns warn when hoursSinceSuccess is null but last run failed", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: null, lastRunFailed: true }); + expect(result.status).toBe("warn"); + expect(result.staleReason).toBe("Last run failed."); + }); + + it("returns unknown when no data available", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: null }); + expect(result.status).toBe("unknown"); + expect(result.staleReason).toBeNull(); + }); +}); + +describe("staleBannerSeverity — HeroSection stale warning tier", () => { + it("returns ok for 0 hours stale (just ran)", () => { + expect(staleBannerSeverity(0)).toBe("ok"); + }); + + it("returns ok for exactly 24h stale", () => { + expect(staleBannerSeverity(24)).toBe("ok"); + }); + + it("returns warn for 25h stale", () => { + expect(staleBannerSeverity(25)).toBe("warn"); + }); + + it("returns warn for exactly 48h stale", () => { + expect(staleBannerSeverity(48)).toBe("warn"); + }); + + it("returns critical for 49h stale", () => { + expect(staleBannerSeverity(49)).toBe("critical"); + }); + + it("returns critical for 72h stale (3 days)", () => { + expect(staleBannerSeverity(72)).toBe("critical"); + }); + + it("returns critical for 168h stale (1 week)", () => { + expect(staleBannerSeverity(168)).toBe("critical"); + }); + + // Old threshold (3 calendar days = 72h) would not have warned at 2 days. + // New threshold (24h) correctly warns at 25h. + it("triggers warn at 25h where old 3-day threshold would have missed it", () => { + const OLD_STALE_DAYS_THRESHOLD = 3; // old: warned at 3 days + const hoursStale = 25; + const staleDays = Math.floor(hoursStale / 24); // 1 day + // Old logic would NOT show a warning (staleDays < 3) + expect(staleDays < OLD_STALE_DAYS_THRESHOLD).toBe(true); + // New logic DOES show a warning (hoursStale > 24) + expect(staleBannerSeverity(hoursStale)).toBe("warn"); + }); +}); diff --git a/server/utils/scheduler.ts b/server/utils/scheduler.ts index 61bff90..b5e2c58 100644 --- a/server/utils/scheduler.ts +++ b/server/utils/scheduler.ts @@ -142,10 +142,29 @@ export function startCentralScheduler(): void { log.info("Central scheduler: SPONSOR_MONITOR registered (30 0 * * 1-5 UTC)."); } - const active = getCutoverStatusSnapshot().filter((s) => s.cutover).map((s) => s.job); + const snapshot = getCutoverStatusSnapshot(); + const active = snapshot.filter((s) => s.cutover).map((s) => s.job); if (active.length === 0) { log.info("Central scheduler started: no jobs cut over yet (all inline-cron owned)."); } else { log.info({ active }, `Central scheduler started: ${active.length} job(s) cut over.`); } + + // ── Ownership sanity check ────────────────────────────────────────────────── + // The SPONSOR_MONITOR job must be owned by exactly one scheduler path. + // Log a clear warning if the cutover flag is true but the central scheduler + // somehow did NOT register the cron (e.g., code path skipped due to a bug). + // The inverse — cutover false, inline cron owns it — is the expected default. + // Note: we cannot detect if *neither* path registered without runtime state; + // this is a best-effort startup check for config drift. + const sponsorStatus = snapshot.find((s) => s.job === "SPONSOR_MONITOR"); + if (sponsorStatus?.cutover && !isCutover("SPONSOR_MONITOR")) { + // Should be unreachable but guard against future refactors that call this + // function with stale env state. + log.error( + { sponsorStatus }, + "Central scheduler: SPONSOR_MONITOR cutover flag is inconsistent — sponsor monitor may have NO active scheduler owner. " + + "Verify CUTOVER_SPONSOR_MONITOR env var and restart.", + ); + } } diff --git a/server/utils/sponsorMonitorJob.ts b/server/utils/sponsorMonitorJob.ts index c91120a..ea81e54 100644 --- a/server/utils/sponsorMonitorJob.ts +++ b/server/utils/sponsorMonitorJob.ts @@ -1225,10 +1225,29 @@ async function checkMissedJobsAndCatchUp(source: string = "startup-catchup"): Pr `[SponsorMonitorJob] ${isStartup ? "Startup catch-up" : "Backfill check"}: no successful run found for ${missed} (and possibly earlier). Triggering now.`, ); + // Compute how long ago the last successful run was (in hours) to choose + // the appropriate alert severity. + const lastSuccessDate = successfulRuns + .map((r) => r.runDate) + .sort() + .reverse()[0] ?? null; + const hoursSinceSuccess = lastSuccessDate + ? Math.floor((Date.now() - Date.parse(lastSuccessDate + "T00:30:00Z")) / 3_600_000) + : null; + const isP0 = hoursSinceSuccess !== null && hoursSinceSuccess > 48; + const isP1 = hoursSinceSuccess !== null && hoursSinceSuccess > 24 && !isP0; + + const alertPrefix = isP0 + ? "🚨 P0 CheckByAI: Sponsor monitor stale >48h" + : isP1 + ? "⚠️ P1 CheckByAI: Sponsor monitor stale >24h" + : `ℹ️ CheckByAI: ${isStartup ? "Startup" : "Periodic"} catch-up triggered`; + await sendAdminAlert( - `ℹ️ CheckByAI: ${isStartup ? "Startup" : "Periodic"} catch-up triggered`, + alertPrefix, `${isStartup ? "Server booted" : "Periodic 6-hour check"} detected a missed sponsor monitor job.
Most recent missed weekday: ${missed}
+Hours since last successful run: ${hoursSinceSuccess ?? "unknown"}
Successful runs found: ${[...successDates].join(", ") || "none in last 7 days"}
Running now to fetch the latest register CSV and apply any accumulated changes.
`, ).catch(() => {});