diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts index 0f11cfa9..d207e7e3 100644 --- a/tests/packaging/lock-helper.test.ts +++ b/tests/packaging/lock-helper.test.ts @@ -22,6 +22,18 @@ describe("withPackagingLock", () => { .mockImplementationOnce(() => 123); const closeSpy = vi.spyOn(fs, "closeSync").mockImplementation(() => undefined); const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined); + vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined); + // Keep the staleness probe hermetic: a transient contention error is not a + // stale lock, so model "no lock file on disk" regardless of any real + // .packaging-test.lock a concurrent packaging test may hold on CI. This + // exercises the not-stale path, where the retry simply succeeds once + // contention clears, instead of letting the test depend on shared FS state. + vi.spyOn(fs, "readFileSync").mockImplementation(() => { + throw createFsError("ENOENT"); + }); + vi.spyOn(fs, "statSync").mockImplementation(() => { + throw createFsError("ENOENT"); + }); const result = await withPackagingLock(async () => "ok"); @@ -31,6 +43,32 @@ describe("withPackagingLock", () => { expect(rmSpy).toHaveBeenCalledTimes(1); }); + it("reclaims a stale lock whose owning process is dead", async () => { + const staleLock = JSON.stringify({ pid: 999999, timestamp: Date.now() }); + const openSpy = vi + .spyOn(fs, "openSync") + .mockImplementationOnce(() => { + throw createFsError("EEXIST"); + }) + .mockImplementationOnce(() => 456); + vi.spyOn(fs, "readFileSync").mockReturnValue(staleLock); + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => { + throw createFsError("ESRCH"); + }); + const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined); + vi.spyOn(fs, "closeSync").mockImplementation(() => undefined); + vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined); + + const result = await withPackagingLock(async () => "reclaimed"); + + expect(result).toBe("reclaimed"); + // First open hits the orphan; after reclaim the retry acquires it. + expect(openSpy).toHaveBeenCalledTimes(2); + expect(killSpy).toHaveBeenCalledWith(999999, 0); + // Once to reclaim the orphan, once to release in the finally block. + expect(rmSpy).toHaveBeenCalledTimes(2); + }); + it("throws for non-retryable filesystem errors", async () => { vi.spyOn(fs, "openSync").mockImplementation(() => { throw createFsError("ENOENT"); diff --git a/tests/packaging/lock.ts b/tests/packaging/lock.ts index 7061a133..6a67ceb4 100644 --- a/tests/packaging/lock.ts +++ b/tests/packaging/lock.ts @@ -3,18 +3,130 @@ import path from "node:path"; const PACKAGING_LOCK_PATH = path.resolve(process.cwd(), ".packaging-test.lock"); const LOCK_RETRY_INTERVAL_MS = 25; +// A lock holder writes its {pid,timestamp} within milliseconds of creating the +// file, so a lock that is still empty after this grace window lost its writer +// (a 0-byte lock left by a previous run, or a holder that crashed mid-create). +const EMPTY_LOCK_GRACE_MS = 5_000; +// TTL backstop for a lock whose owning pid still appears alive — e.g. after pid +// reuse on a long-lived runner. It is deliberately far larger than any plausible +// single packaging hold (build + npm pack run in seconds, not minutes) because +// those builds run via blocking execSync: a slow-but-live build must never be +// reclaimed out from under itself, which would let two builds race the same +// dist/. Dead-pid detection, not this TTL, is the primary staleness signal. +const STALE_LOCK_TTL_MS = 120_000; +// Bound the acquire wait so a lock held by a live, non-stale process surfaces an +// actionable error instead of an opaque "Test timed out". Kept just under the +// 30s per-test budget: every packaging critical section is a multi-second build, +// so a waiter that has not acquired the lock by here cannot finish before the +// per-test timeout anyway — this only ever turns a would-be timeout into a clear +// message, never failing a waiter that would otherwise have acquired in time. +const LOCK_ACQUIRE_TIMEOUT_MS = 28_000; const RETRYABLE_LOCK_ERROR_CODES = new Set(["EEXIST", "EPERM", "EACCES"]); +/** + * On-disk lock ownership record. Written into `.packaging-test.lock` when the + * lock is acquired so that a later waiter can decide whether an existing lock is + * held by a live process or is an orphan left behind by a holder that died (or + * by a vitest test that timed out before its `finally` could release the lock). + */ +interface LockMetadata { + pid: number; + timestamp: number; +} + function sleep(ms: number): Promise { return new Promise((resolve) => { setTimeout(resolve, ms); }); } +/** + * Returns true if a process with the given pid is still running. Uses the + * signal-0 probe, which performs the permission/existence check without actually + * delivering a signal. An EPERM means the process exists but is owned by another + * user, so it is treated as alive; any other error (notably ESRCH) means dead. + */ +function isProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (error) { + return error instanceof Error && "code" in error && error.code === "EPERM"; + } +} + +/** + * Reads the ownership metadata of the current lock file, or undefined if the + * lock is absent, empty, or not yet written (a holder that created the file but + * crashed before writing its metadata). + */ +function readLockMetadata(): LockMetadata | undefined { + let raw: string; + try { + raw = fs.readFileSync(PACKAGING_LOCK_PATH, "utf8"); + } catch { + return undefined; + } + + try { + const parsed = JSON.parse(raw) as Partial; + if (typeof parsed.pid === "number" && typeof parsed.timestamp === "number") { + return { pid: parsed.pid, timestamp: parsed.timestamp }; + } + } catch { + // Fall through: an unparseable lock is handled by the staleness fallback. + } + return undefined; +} + +/** + * Age of the lock file on disk in milliseconds, used to judge staleness of a + * lock that has no parseable owner metadata. Returns undefined if the file no + * longer exists. + */ +function lockFileAgeMs(): number | undefined { + try { + return Date.now() - fs.statSync(PACKAGING_LOCK_PATH).mtimeMs; + } catch { + return undefined; + } +} + +/** + * Reclaims the lock if it is an orphan, returning true when one was removed. + * + * A lock is treated as stale when its owning process is dead, when it exceeds + * the (large) TTL backstop, or — for a lock with no parseable metadata — once it + * is older than the short empty-lock grace window. A live holder is never + * reclaimed: it writes its metadata immediately on acquire, and its blocking + * build completes far inside the TTL. + */ +function reclaimStaleLock(): boolean { + const metadata = readLockMetadata(); + + let isStale: boolean; + if (metadata === undefined) { + const ageMs = lockFileAgeMs(); + isStale = ageMs !== undefined && ageMs > EMPTY_LOCK_GRACE_MS; + } else { + isStale = + !isProcessAlive(metadata.pid) || + Date.now() - metadata.timestamp > STALE_LOCK_TTL_MS; + } + + if (!isStale) { + return false; + } + + fs.rmSync(PACKAGING_LOCK_PATH, { force: true }); + return true; +} + export async function withPackagingLock( callback: () => Promise | T ): Promise { let lockFd: number | undefined; + const deadline = Date.now() + LOCK_ACQUIRE_TIMEOUT_MS; while (lockFd === undefined) { try { @@ -29,11 +141,29 @@ export async function withPackagingLock( throw error; } + // The lock is held. If it is an orphan (dead holder, or a stale 0-byte + // lock left by a previous run), reclaim it and retry immediately; + // otherwise back off and wait for the live holder to release it. + if (reclaimStaleLock()) { + continue; + } + + if (Date.now() >= deadline) { + throw new Error( + `Timed out after ${LOCK_ACQUIRE_TIMEOUT_MS}ms acquiring packaging lock ` + + `at ${PACKAGING_LOCK_PATH}. It is held by a live, non-stale process; ` + + `if it is orphaned, delete the file and re-run.`, + { cause: error } + ); + } + await sleep(LOCK_RETRY_INTERVAL_MS); } } try { + const metadata: LockMetadata = { pid: process.pid, timestamp: Date.now() }; + fs.writeFileSync(lockFd, JSON.stringify(metadata)); return await callback(); } finally { fs.closeSync(lockFd);