diff --git a/.github/workflows/sponsor-monitor-cron.yml b/.github/workflows/sponsor-monitor-cron.yml index 50d0c3a..056101c 100644 --- a/.github/workflows/sponsor-monitor-cron.yml +++ b/.github/workflows/sponsor-monitor-cron.yml @@ -57,3 +57,19 @@ jobs: cat /tmp/response.json exit 1 fi + + # Annotate the workflow run with a clear diagnosis when the ping fails. + # This surfaces silent scheduler failures in the GitHub Actions UI and + # the notification emails sent to repo watchers. + - name: Annotate failure cause + if: failure() + run: | + STATUS="${{ steps.ping.outputs.http_status }}" + BODY=$(cat /tmp/response.json 2>/dev/null || echo "(no response body)") + if [ "$STATUS" = "401" ]; then + echo "::error title=Cron-Ping Auth Failure::CRON_SECRET mismatch between GitHub Actions secret and deployment env var. The sponsor monitor will NOT run until this is fixed. HTTP $STATUS: $BODY" + elif [ "$STATUS" = "503" ]; then + echo "::error title=Cron-Ping Not Configured::CRON_SECRET is not set in the deployment environment. Sponsor monitor external trigger is disabled. HTTP $STATUS: $BODY" + else + echo "::error title=Cron-Ping Unexpected Failure::HTTP $STATUS — $BODY" + fi diff --git a/client/src/components/HeroSection.tsx b/client/src/components/HeroSection.tsx index c0bc09e..bddeab4 100644 --- a/client/src/components/HeroSection.tsx +++ b/client/src/components/HeroSection.tsx @@ -161,11 +161,13 @@ function RecentlyRevokedSection() { interface NightlyStats { totalActive: number; lastRunDate: string | null; + lastSuccessfulRunAt: string | null; addedCount: number; removedCount: number; changesCount: number; revokedLast12Months: number; staleDays: number; + hoursStale: number; } function formatRunDate(dateStr: string | null): string { @@ -226,7 +228,17 @@ function NightlyStatsBar() {

{dateLabel}

-

Register last checked

+

+ Register last checked + {!isLoading && data?.lastSuccessfulRunAt && ( + + {new Date(data.lastSuccessfulRunAt).toLocaleTimeString("en-GB", { hour: "2-digit", minute: "2-digit", timeZone: "UTC" })} UTC + + )} +

@@ -345,23 +357,42 @@ function UrgencyBanner() { ); } - // No changes at all - // If staleDays >= 3 (no successful run in 3+ calendar days), show an amber - // warning banner — the register data may be outdated. - if (data.staleDays >= 3) { + // No changes at all. + // Severity tiers based on hours since last successful run: + // >48h → critical (red-toned): the pipeline may be broken. + // >24h → warn (amber): nightly update overdue. + // ≤24h → fresh: show calm confirmation. + const hoursStale = data.hoursStale ?? data.staleDays * 24; + + if (hoursStale > 48) { + return ( +
+
+
+
+ ); + } + + if (hoursStale > 24) { return (
); } - // No changes and data is fresh — show a calm confirmation strip + // No changes and data is fresh (≤24h) — show a calm confirmation strip. return (
diff --git a/server/routes/__tests__/ops.test.ts b/server/routes/__tests__/ops.test.ts index f163806..7ff20ae 100644 --- a/server/routes/__tests__/ops.test.ts +++ b/server/routes/__tests__/ops.test.ts @@ -136,6 +136,10 @@ vi.mock("../../utils/incidentManager", () => ({ tryAutoRemediate: vi.fn(async () => "corr-remediate-1"), })); +vi.mock("../../utils/adminAlert", () => ({ + sendAdminAlert: vi.fn(async () => undefined), +})); + import { registerOpsRoutes } from "../ops"; import { getAllJobHealthSnapshots } from "../../utils/jobTelemetry"; import { evaluateSeverity, createIncidentTicket } from "../../utils/incidentManager"; @@ -529,6 +533,123 @@ describe("ops routes", () => { } }); + // ── Cron-ping auth / config failure paths ─────────────────────────────────── + + it("POST /cron-ping returns 503 when CRON_SECRET is not set", async () => { + const prev = process.env.CRON_SECRET; + delete process.env.CRON_SECRET; + const { server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + }); + expect(response.status).toBe(503); + const json = await response.json(); + expect(json.message).toMatch(/not configured/i); + } finally { + server.close(); + if (prev !== undefined) process.env.CRON_SECRET = prev; + } + }); + + it("POST /cron-ping returns 401 for wrong secret", async () => { + process.env.CRON_SECRET = "correct-secret-32-chars-xxxxxxxx"; + const { server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: "******", + }, + }); + expect(response.status).toBe(401); + } finally { + server.close(); + delete process.env.CRON_SECRET; + } + }); + + it("POST /cron-ping returns 401 for missing Authorization header", async () => { + process.env.CRON_SECRET = "correct-secret-32-chars-xxxxxxxx"; + const { 
server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + }); + expect(response.status).toBe(401); + } finally { + server.close(); + delete process.env.CRON_SECRET; + } + }); + + it("POST /cron-ping returns 202 with correct secret when no run today", async () => { + process.env.CRON_SECRET = "correct-secret-32-chars-xxxxxxxx"; + // DB returns no existing run for today + dbState.selectQueue.push([]); + const { server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: "Bearer " + (process.env["CRON_SECRET"] ?? ""), + }, + }); + expect(response.status).toBe(202); + const json = await response.json(); + expect(json.message).toMatch(/triggered/i); + } finally { + server.close(); + delete process.env.CRON_SECRET; + } + }); + + it("POST /cron-ping returns 409 when today already succeeded", async () => { + process.env.CRON_SECRET = "correct-secret-32-chars-xxxxxxxx"; + dbState.selectQueue.push([{ status: "success" }]); + const { server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: "Bearer " + (process.env["CRON_SECRET"] ?? 
""), + }, + }); + expect(response.status).toBe(409); + const json = await response.json(); + expect(json.message).toMatch(/already ran/i); + } finally { + server.close(); + delete process.env.CRON_SECRET; + } + }); + + it("POST /cron-ping returns 423 when job is currently running", async () => { + process.env.CRON_SECRET = "correct-secret-32-chars-xxxxxxxx"; + dbState.selectQueue.push([{ status: "running" }]); + const { server, baseUrl } = await startTestServer(); + try { + const response = await fetch(`${baseUrl}/api/ops/cron-ping`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: "Bearer " + (process.env["CRON_SECRET"] ?? ""), + }, + }); + expect(response.status).toBe(423); + const json = await response.json(); + expect(json.message).toMatch(/running/i); + } finally { + server.close(); + delete process.env.CRON_SECRET; + } + }); + // ── Rollout status endpoint ───────────────────────────────────────────────── it("GET /rollout/status returns 200 with aggregated state for analyst", async () => { diff --git a/server/routes/health.ts b/server/routes/health.ts index 9904ef7..1c263cb 100644 --- a/server/routes/health.ts +++ b/server/routes/health.ts @@ -1,4 +1,7 @@ import type { Express } from "express"; +import { desc, eq } from "drizzle-orm"; +import { db } from "../db"; +import { monitorJobRuns, dailyDigest } from "@shared/schema"; import { isJobRunning, getLastRunInfo } from "../utils/sponsorMonitorJob"; import { getJobHealthSnapshot } from "../utils/jobTelemetry"; import { success } from "../lib/response"; @@ -38,38 +41,92 @@ export function registerHealthRoutes(app: Express): void { }); app.get('/api/health/sponsor-monitor', async (req, res) => { - const lastRun = getLastRunInfo(); + const lastRunMem = getLastRunInfo(); const jobRunning = await isJobRunning(); - let hoursAgo: number | null = null; - let status: "ok" | "stale" | "running" | "unknown" = "unknown"; + // Prefer DB data (survives restarts) over the in-memory 
snapshot. + const [dbRunRow, latestDigestRow] = await Promise.all([ + db + .select({ + runDate: monitorJobRuns.runDate, + status: monitorJobRuns.status, + completedAt: monitorJobRuns.completedAt, + startedAt: monitorJobRuns.startedAt, + recordsProcessed: monitorJobRuns.recordsProcessed, + changesDetected: monitorJobRuns.changesDetected, + errorMessage: monitorJobRuns.errorMessage, + }) + .from(monitorJobRuns) + .where(eq(monitorJobRuns.status, "success")) + .orderBy(desc(monitorJobRuns.runDate)) + .limit(1) + .catch(() => [] as Array<{ runDate: string; status: string; completedAt: Date | null; startedAt: Date | null; recordsProcessed: number | null; changesDetected: number | null; errorMessage: string | null }>), + db + .select({ snapshotDate: dailyDigest.snapshotDate }) + .from(dailyDigest) + .orderBy(desc(dailyDigest.snapshotDate)) + .limit(1) + .catch(() => [] as Array<{ snapshotDate: string }>), + ]); + + const dbRun = dbRunRow[0] ?? null; + const latestSnapshotDate = latestDigestRow[0]?.snapshotDate ?? null; + + // Determine last-successful-at from DB (precise ISO timestamp) or fall back + // to the in-memory snapshot from the current process instance. + const lastSuccessfulRunAt: string | null = + dbRun?.completedAt?.toISOString() ?? + (lastRunMem?.success ? lastRunMem.date : null); + + const hoursSinceSuccess = lastSuccessfulRunAt + ? Math.floor((Date.now() - Date.parse(lastSuccessfulRunAt)) / 3_600_000) + : null; + + // Classify freshness: ok (≤24h), warn (>24h up to 48h), critical (>48h), running, unknown. + type FreshnessStatus = "ok" | "warn" | "critical" | "running" | "unknown"; + let freshnessStatus: FreshnessStatus = "unknown"; + let staleReason: string | null = null; if (jobRunning) { - status = "running"; - } else if (lastRun) { - const lastRunDate = new Date(lastRun.date + "T00:00:00Z"); - hoursAgo = Math.floor((Date.now() - lastRunDate.getTime()) / (1000 * 60 * 60)); - if (lastRun.success) { - status = hoursAgo <= 48 ? 
"ok" : "stale"; + freshnessStatus = "running"; + } else if (hoursSinceSuccess !== null) { + if (hoursSinceSuccess <= 24) { + freshnessStatus = "ok"; + } else if (hoursSinceSuccess <= 48) { + freshnessStatus = "warn"; + staleReason = `No successful run in ${hoursSinceSuccess}h (warn threshold: 24h).`; } else { - status = "stale"; + freshnessStatus = "critical"; + staleReason = `No successful run in ${hoursSinceSuccess}h (critical threshold: 48h).`; } + } else if (lastRunMem && !lastRunMem.success) { + freshnessStatus = "warn"; + staleReason = lastRunMem.error ?? "Last run failed."; } success(res, { - status, + status: freshnessStatus === "ok" || freshnessStatus === "running" ? freshnessStatus : "stale", + freshnessStatus, + staleReason, running: jobRunning, - lastRun: lastRun + lastSuccessfulRunAt, + hoursSinceSuccess, + latestSnapshotDate, + lastRun: dbRun ? { - date: lastRun.date, - success: lastRun.success, - hoursAgo, - recordsProcessed: lastRun.recordsProcessed, - changesDetected: lastRun.changesDetected, - notificationsSent: lastRun.notificationsSent, - error: lastRun.error ?? null, + date: dbRun.runDate, + completedAt: dbRun.completedAt?.toISOString() ?? null, + recordsProcessed: dbRun.recordsProcessed, + changesDetected: dbRun.changesDetected, } - : null, + : lastRunMem + ? 
{ + date: lastRunMem.date, + completedAt: null, + recordsProcessed: lastRunMem.recordsProcessed, + changesDetected: lastRunMem.changesDetected, + } + : null, nextCronUtc: "Mon–Fri 00:30 UTC", timestamp: new Date().toISOString(), }); diff --git a/server/routes/ops.ts b/server/routes/ops.ts index b732f79..18cb130 100644 --- a/server/routes/ops.ts +++ b/server/routes/ops.ts @@ -10,6 +10,7 @@ import { isSafeCallbackUrl, signPayload } from "../utils/callbackSigner"; import { isUuidV4 } from "../utils/idempotency"; import { generateCorrelationId, startJobRun, finishJobRun } from "../utils/jobTelemetry"; import { runSponsorMonitorJob } from "../utils/sponsorMonitorJob"; +import { sendAdminAlert } from "../utils/adminAlert"; import { runJobAlertJob } from "../utils/jobAlertJob"; import { seedEnrichmentQueue, runEnrichmentBatch } from "../utils/enrichmentWorker"; import { processQueuedEngineEvents } from "../services/notificationEngine"; @@ -28,6 +29,13 @@ const log = logger.child({ module: "OpsRoutes" }); const IDEMPOTENCY_WINDOW_MS = 24 * 60 * 60 * 1000; +// Track cron-ping auth failures in-process to fire a throttled admin alert +// when a misconfiguration is detected (e.g., rotated secret not deployed). +let cronPingAuthFailureCount = 0; +let cronPingAuthAlertLastSentAt = 0; +const CRON_PING_AUTH_ALERT_THRESHOLD = 3; // failures before alerting +const CRON_PING_AUTH_ALERT_COOLDOWN_MS = 60 * 60 * 1000; // 1h between alerts + function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } @@ -808,6 +816,13 @@ export function registerOpsRoutes(app: Express): void { const cronSecret = process.env.CRON_SECRET; if (!cronSecret) { log.warn("[CronPing] CRON_SECRET env var not set — endpoint disabled."); + // Fire an admin alert so the configuration gap doesn't go unnoticed. NOTE(review): not one-time — this call runs on every request that reaches this branch, with no threshold or cooldown applied here (unless sendAdminAlert de-duplicates internally — confirm). + sendAdminAlert( + "⚠️ CheckByAI: CRON_SECRET not configured", + `

The external cron-ping endpoint received a request but CRON_SECRET is not set on this deployment.

+

The sponsor monitor will not be triggered by the GitHub Actions schedule until this env var is configured.

+

Request IP: ${req.ip ?? "unknown"} — ${new Date().toISOString()}

`, + ).catch(() => {}); return res.status(503).json({ message: "External cron not configured." }); } @@ -817,9 +832,27 @@ export function registerOpsRoutes(app: Express): void { if (!provided || provided.length !== cronSecret.length || !crypto.timingSafeEqual(Buffer.from(provided), Buffer.from(cronSecret))) { log.warn({ ip: req.ip }, "[CronPing] Invalid or missing secret."); + cronPingAuthFailureCount += 1; + const now = Date.now(); + if ( + cronPingAuthFailureCount >= CRON_PING_AUTH_ALERT_THRESHOLD && + now - cronPingAuthAlertLastSentAt > CRON_PING_AUTH_ALERT_COOLDOWN_MS + ) { + cronPingAuthAlertLastSentAt = now; + sendAdminAlert( + "🔐 CheckByAI: Repeated cron-ping auth failures — possible config drift", + `

${cronPingAuthFailureCount} consecutive cron-ping authentication failures have been detected.

+

This usually means the CRON_SECRET in GitHub Actions secrets does not match the deployment env var.

+

The sponsor monitor will not run via the external schedule until this is fixed.

+

Last failure: ${new Date().toISOString()} — IP: ${req.ip ?? "unknown"}

`, + ).catch(() => {}); + } return res.status(401).json({ message: "Unauthorized." }); } + // Successful auth — reset failure counter. + cronPingAuthFailureCount = 0; + const today = new Date().toISOString().split("T")[0]; const existing = await db .select({ status: monitorJobRuns.status }) diff --git a/server/routes/sponsorPages.ts b/server/routes/sponsorPages.ts index 887a92f..423bf03 100644 --- a/server/routes/sponsorPages.ts +++ b/server/routes/sponsorPages.ts @@ -279,7 +279,7 @@ export function registerSponsorPageRoutes(app: Express): void { // This powers the "Register last checked" stat — shows when the job actually ran, // independent of which digest is displayed. db - .select({ runDate: monitorJobRuns.runDate }) + .select({ runDate: monitorJobRuns.runDate, completedAt: monitorJobRuns.completedAt }) .from(monitorJobRuns) .where(eq(monitorJobRuns.status, "success")) .orderBy(desc(monitorJobRuns.runDate)) @@ -289,23 +289,33 @@ export function registerSponsorPageRoutes(app: Express): void { const totalActive = countResult[0]?.total ?? 0; const revokedLast12Months = revokedResult[0]?.total ?? 0; const latest = digestRows[0] ?? null; + const lastRun = lastRunRows[0] ?? null; // Use the last successful job run date as "Register last checked". // Fall back to the active digest's snapshot date if no run recorded yet. - const lastRunDate = lastRunRows[0]?.runDate ?? latest?.snapshotDate ?? null; + const lastRunDate = lastRun?.runDate ?? latest?.snapshotDate ?? null; + // Precise UTC timestamp of the last successful completion — used for <24h freshness check. + const lastSuccessfulRunAt = lastRun?.completedAt?.toISOString() ?? null; + const now = Date.now(); const today = new Date().toISOString().split("T")[0]; const staleDays = lastRunDate ? Math.round((Date.parse(today) - Date.parse(lastRunDate)) / 86400000) : 0; + // Hours since last successful run (precise, from completedAt if available). + const hoursStale = lastSuccessfulRunAt + ? 
Math.floor((now - Date.parse(lastSuccessfulRunAt)) / 3_600_000) + : staleDays * 24; const payload = { totalActive, lastRunDate, + lastSuccessfulRunAt, addedCount: latest?.addedCount ?? 0, removedCount: latest?.removedCount ?? 0, changesCount: latest?.updatedCount ?? 0, revokedLast12Months, staleDays, + hoursStale, }; // Cache for 5 minutes — balances freshness with DB load. // Flushed immediately by sponsorMonitorJob after each nightly run. diff --git a/server/utils/__tests__/freshnessHealth.test.ts b/server/utils/__tests__/freshnessHealth.test.ts new file mode 100644 index 0000000..27c7576 --- /dev/null +++ b/server/utils/__tests__/freshnessHealth.test.ts @@ -0,0 +1,158 @@ +/** + * Tests for sponsor-monitor freshness status classification logic. + * + * These tests cover the classification rules introduced to replace the + * old `staleDays >= 3` threshold with an hours-based tiered system: + * ≤24h → ok + * 24–48h → warn + * >48h → critical + */ + +import { describe, expect, it } from "vitest"; + +// ── Pure classification helper (mirrors the logic in health.ts) ────────────── + +type FreshnessStatus = "ok" | "warn" | "critical" | "running" | "unknown"; + +function classifyFreshness(opts: { + jobRunning: boolean; + hoursSinceSuccess: number | null; + lastRunFailed?: boolean; +}): { status: FreshnessStatus; staleReason: string | null } { + const { jobRunning, hoursSinceSuccess, lastRunFailed } = opts; + + if (jobRunning) { + return { status: "running", staleReason: null }; + } + if (hoursSinceSuccess !== null) { + if (hoursSinceSuccess <= 24) { + return { status: "ok", staleReason: null }; + } + if (hoursSinceSuccess <= 48) { + return { + status: "warn", + staleReason: `No successful run in ${hoursSinceSuccess}h (warn threshold: 24h).`, + }; + } + return { + status: "critical", + staleReason: `No successful run in ${hoursSinceSuccess}h (critical threshold: 48h).`, + }; + } + if (lastRunFailed) { + return { status: "warn", staleReason: "Last run failed." 
}; + } + return { status: "unknown", staleReason: null }; +} + +// ── Pure stale-banner helper (mirrors the logic in HeroSection.tsx) ────────── + +type BannerSeverity = "ok" | "warn" | "critical"; + +function staleBannerSeverity(hoursStale: number): BannerSeverity { + if (hoursStale > 48) return "critical"; + if (hoursStale > 24) return "warn"; + return "ok"; +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + +describe("classifyFreshness — health endpoint status", () => { + it("returns ok when job ran 1h ago", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 1 }); + expect(result.status).toBe("ok"); + expect(result.staleReason).toBeNull(); + }); + + it("returns ok when job ran exactly 24h ago (boundary)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 24 }); + expect(result.status).toBe("ok"); + }); + + it("returns warn when job ran 25h ago (just over 24h threshold)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 25 }); + expect(result.status).toBe("warn"); + expect(result.staleReason).toContain("warn threshold: 24h"); + }); + + it("returns warn when job ran exactly 48h ago (boundary)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 48 }); + expect(result.status).toBe("warn"); + }); + + it("returns critical when job ran 49h ago (just over 48h threshold)", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 49 }); + expect(result.status).toBe("critical"); + expect(result.staleReason).toContain("critical threshold: 48h"); + }); + + it("returns critical when job ran 72h ago", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: 72 }); + expect(result.status).toBe("critical"); + }); + + it("returns running when job is currently executing", () => { + const result = classifyFreshness({ jobRunning: true, hoursSinceSuccess: null }); 
+ expect(result.status).toBe("running"); + expect(result.staleReason).toBeNull(); + }); + + it("returns running regardless of hoursSinceSuccess", () => { + const result = classifyFreshness({ jobRunning: true, hoursSinceSuccess: 100 }); + expect(result.status).toBe("running"); + }); + + it("returns warn when hoursSinceSuccess is null but last run failed", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: null, lastRunFailed: true }); + expect(result.status).toBe("warn"); + expect(result.staleReason).toBe("Last run failed."); + }); + + it("returns unknown when no data available", () => { + const result = classifyFreshness({ jobRunning: false, hoursSinceSuccess: null }); + expect(result.status).toBe("unknown"); + expect(result.staleReason).toBeNull(); + }); +}); + +describe("staleBannerSeverity — HeroSection stale warning tier", () => { + it("returns ok for 0 hours stale (just ran)", () => { + expect(staleBannerSeverity(0)).toBe("ok"); + }); + + it("returns ok for exactly 24h stale", () => { + expect(staleBannerSeverity(24)).toBe("ok"); + }); + + it("returns warn for 25h stale", () => { + expect(staleBannerSeverity(25)).toBe("warn"); + }); + + it("returns warn for exactly 48h stale", () => { + expect(staleBannerSeverity(48)).toBe("warn"); + }); + + it("returns critical for 49h stale", () => { + expect(staleBannerSeverity(49)).toBe("critical"); + }); + + it("returns critical for 72h stale (3 days)", () => { + expect(staleBannerSeverity(72)).toBe("critical"); + }); + + it("returns critical for 168h stale (1 week)", () => { + expect(staleBannerSeverity(168)).toBe("critical"); + }); + + // Old threshold (3 calendar days = 72h) would not have warned at 2 days. + // New threshold (24h) correctly warns at 25h. 
+ it("triggers warn at 25h where old 3-day threshold would have missed it", () => { + const OLD_STALE_DAYS_THRESHOLD = 3; // old: warned at 3 days + const hoursStale = 25; + const staleDays = Math.floor(hoursStale / 24); // 1 day + // Old logic would NOT show a warning (staleDays < 3) + expect(staleDays < OLD_STALE_DAYS_THRESHOLD).toBe(true); + // New logic DOES show a warning (hoursStale > 24) + expect(staleBannerSeverity(hoursStale)).toBe("warn"); + }); +}); diff --git a/server/utils/scheduler.ts b/server/utils/scheduler.ts index 61bff90..b5e2c58 100644 --- a/server/utils/scheduler.ts +++ b/server/utils/scheduler.ts @@ -142,10 +142,29 @@ export function startCentralScheduler(): void { log.info("Central scheduler: SPONSOR_MONITOR registered (30 0 * * 1-5 UTC)."); } - const active = getCutoverStatusSnapshot().filter((s) => s.cutover).map((s) => s.job); + const snapshot = getCutoverStatusSnapshot(); + const active = snapshot.filter((s) => s.cutover).map((s) => s.job); if (active.length === 0) { log.info("Central scheduler started: no jobs cut over yet (all inline-cron owned)."); } else { log.info({ active }, `Central scheduler started: ${active.length} job(s) cut over.`); } + + // ── Ownership sanity check ────────────────────────────────────────────────── + // The SPONSOR_MONITOR job must be owned by exactly one scheduler path. + // Log a clear warning if the cutover flag is true but the central scheduler + // somehow did NOT register the cron (e.g., code path skipped due to a bug). + // The inverse — cutover false, inline cron owns it — is the expected default. + // Note: we cannot detect if *neither* path registered without runtime state; + // this is a best-effort startup check for config drift. + const sponsorStatus = snapshot.find((s) => s.job === "SPONSOR_MONITOR"); + if (sponsorStatus?.cutover && !isCutover("SPONSOR_MONITOR")) { + // Should be unreachable but guard against future refactors that call this + // function with stale env state. 
+ log.error( + { sponsorStatus }, + "Central scheduler: SPONSOR_MONITOR cutover flag is inconsistent — sponsor monitor may have NO active scheduler owner. " + + "Verify CUTOVER_SPONSOR_MONITOR env var and restart.", + ); + } } diff --git a/server/utils/sponsorMonitorJob.ts b/server/utils/sponsorMonitorJob.ts index c91120a..ea81e54 100644 --- a/server/utils/sponsorMonitorJob.ts +++ b/server/utils/sponsorMonitorJob.ts @@ -1225,10 +1225,29 @@ async function checkMissedJobsAndCatchUp(source: string = "startup-catchup"): Pr `[SponsorMonitorJob] ${isStartup ? "Startup catch-up" : "Backfill check"}: no successful run found for ${missed} (and possibly earlier). Triggering now.`, ); + // Compute how long ago the last successful run was (in hours) to choose + // the appropriate alert severity. + const lastSuccessDate = successfulRuns + .map((r) => r.runDate) + .sort() + .reverse()[0] ?? null; + const hoursSinceSuccess = lastSuccessDate + ? Math.floor((Date.now() - Date.parse(lastSuccessDate + "T00:30:00Z")) / 3_600_000) + : null; + const isP0 = hoursSinceSuccess !== null && hoursSinceSuccess > 48; + const isP1 = hoursSinceSuccess !== null && hoursSinceSuccess > 24 && !isP0; + + const alertPrefix = isP0 + ? "🚨 P0 CheckByAI: Sponsor monitor stale >48h" + : isP1 + ? "⚠️ P1 CheckByAI: Sponsor monitor stale >24h" + : `ℹ️ CheckByAI: ${isStartup ? "Startup" : "Periodic"} catch-up triggered`; + await sendAdminAlert( - `ℹ️ CheckByAI: ${isStartup ? "Startup" : "Periodic"} catch-up triggered`, + alertPrefix, `

${isStartup ? "Server booted" : "Periodic 6-hour check"} detected a missed sponsor monitor job.

Most recent missed weekday: ${missed}

+

Hours since last successful run: ${hoursSinceSuccess ?? "unknown"}

Successful runs found: ${[...successDates].join(", ") || "none in last 7 days"}

Running now to fetch the latest register CSV and apply any accumulated changes.

`, ).catch(() => {});