From f213ae7ed31d0abe5cf0200d1cdabe3eec1cf3b8 Mon Sep 17 00:00:00 2001 From: clagentic <10177887+akuehner@users.noreply.github.com> Date: Sat, 13 Jun 2026 12:44:53 -0400 Subject: [PATCH] harden external-trigger watcher: polling backstop + periodic re-arm + health export fs.watch on Linux can silently die (inotify queue overflow, inode rebind after dir recreation) with no error event and no observable signal. Three layers added: Layer 1 (pollInterval): setInterval at 30 s calls scanExisting(). The dispatched map makes re-scanning idempotent. Uses .unref() so it does not keep the process alive. Cleared in stopWatcher(). Layer 2 (rearmTimer): setInterval at 5 min closes and re-opens the fs.watch binding via a shared armWatcher() helper. Guarantees the inotify registration is never more than 5 min stale. Uses .unref(). Cleared in stopWatcher(). Layer 3 (getHealth): watcherLastEventMs tracks the timestamp of the last watcher callback (updated before the .json guard). getHealth() returns watcherAlive, lastEventMs, and pollActive for external monitoring. No new dependencies. All 309 existing tests pass. --- lib/project-external-trigger.js | 69 +++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/lib/project-external-trigger.js b/lib/project-external-trigger.js index 51e340d..0ce1420 100644 --- a/lib/project-external-trigger.js +++ b/lib/project-external-trigger.js @@ -3,6 +3,8 @@ var path = require("path"); // How long (ms) to keep processed trigger files before pruning on startup. var PROCESSED_MAX_AGE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days +var POLL_INTERVAL_MS = 30 * 1000; // 30 s backstop scan +var WATCHER_REARM_INTERVAL_MS = 5 * 60 * 1000; // 5 min inotify rebind /** * External trigger watcher — global singleton. @@ -35,6 +37,14 @@ var PROCESSED_MAX_AGE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days * Daemon-down recovery: unprocessed files that predate the current process * start are picked up via a startup scan (scanExisting). No file is lost * if the daemon is restarted while triggers are pending. + * + * Watcher hardening (three layers): + * Layer 1 — 30 s polling backstop: scanExisting() on an interval so files + * are never permanently missed if fs.watch silently dies. + * Layer 2 — 5 min periodic re-arm: close and re-open the fs.watch binding + * so the inotify registration is never more than 5 min stale. + * Layer 3 — health export: getHealth() returns watcher liveness, last-event + * timestamp, and poll-interval presence for external monitoring. */ function attachExternalTrigger(ctx) { var triggersDir = ctx.triggersDir; @@ -47,6 +57,11 @@ function attachExternalTrigger(ctx) { // from the initial scan + watcher race. var dispatched = {}; + // Hardening state + var pollInterval = null; + var rearmTimer = null; + var watcherLastEventMs = 0; + // --- Directory setup --- function ensureDirs() { @@ -231,12 +246,16 @@ function attachExternalTrigger(ctx) { }, 200); } - function startWatcher() { - ensureDirs(); - pruneOldProcessed(); - scanExisting(); // pick up files dropped while daemon was down + // --- Layer 2: periodic watcher re-arm --- + + function armWatcher() { + if (watcher) { + try { watcher.close(); } catch (e) {} + watcher = null; + } try { watcher = fs.watch(triggersDir, function (eventType, filename) { + watcherLastEventMs = Date.now(); if (filename && !filename.endsWith(".json")) return; onDirChange(); }); @@ -244,18 +263,47 @@ function attachExternalTrigger(ctx) { console.error("[external-trigger] Watcher error:", e.message || e); stopWatcher(); }); - console.log("[external-trigger] Watching:", triggersDir); } catch (e) { - console.error("[external-trigger] Failed to start watcher:", e.message || e); + console.error("[external-trigger] Failed to arm watcher:", e.message || e); } } + function startWatcher() { + ensureDirs(); + pruneOldProcessed(); + scanExisting(); // pick up files dropped while daemon was down + + armWatcher(); + console.log("[external-trigger] Watching:", triggersDir); + + // Layer 1: polling backstop — catches files if fs.watch silently dies + pollInterval = setInterval(function () { + scanExisting(); + }, POLL_INTERVAL_MS); + pollInterval.unref(); + + // Layer 2: periodic re-arm — keeps inotify registration fresh + rearmTimer = setInterval(function () { + armWatcher(); + console.log("[external-trigger] Watcher re-armed"); + }, WATCHER_REARM_INTERVAL_MS); + rearmTimer.unref(); + } + function stopWatcher() { clearTimeout(debounce); if (watcher) { try { watcher.close(); } catch (e) {} watcher = null; } + if (pollInterval) { + clearInterval(pollInterval); + pollInterval = null; + } + if (rearmTimer) { + clearInterval(rearmTimer); + rearmTimer = null; + } } // --- Startup scan (daemon-down recovery) --- @@ -289,8 +337,15 @@ function attachExternalTrigger(ctx) { return { startWatcher: startWatcher, stopWatcher: stopWatcher, + // Layer 3: health export for external monitoring + getHealth: function () { + return { + watcherAlive: !!watcher, + lastEventMs: watcherLastEventMs, + pollActive: !!pollInterval, + }; + }, }; } module.exports = { attachExternalTrigger: attachExternalTrigger }; -