From 3b3894ed7fbb1133d5d6d88fa53429300b1b4f11 Mon Sep 17 00:00:00 2001 From: Chisanan232 Date: Fri, 26 Jun 2026 12:00:37 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E2=9C=A8=20(test):=20Write=20pid+timestamp?= =?UTF-8?q?=20metadata=20into=20packaging=20lock?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Record the holder's pid and acquisition time inside .packaging-test.lock so a later waiter can distinguish a live holder from an orphaned lock. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf --- tests/packaging/lock-helper.test.ts | 1 + tests/packaging/lock.ts | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts index 0f11cfa9..c4fed0ff 100644 --- a/tests/packaging/lock-helper.test.ts +++ b/tests/packaging/lock-helper.test.ts @@ -22,6 +22,7 @@ describe("withPackagingLock", () => { .mockImplementationOnce(() => 123); const closeSpy = vi.spyOn(fs, "closeSync").mockImplementation(() => undefined); const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined); + vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined); const result = await withPackagingLock(async () => "ok"); diff --git a/tests/packaging/lock.ts b/tests/packaging/lock.ts index 7061a133..11e8c1aa 100644 --- a/tests/packaging/lock.ts +++ b/tests/packaging/lock.ts @@ -5,6 +5,17 @@ const PACKAGING_LOCK_PATH = path.resolve(process.cwd(), ".packaging-test.lock"); const LOCK_RETRY_INTERVAL_MS = 25; const RETRYABLE_LOCK_ERROR_CODES = new Set(["EEXIST", "EPERM", "EACCES"]); +/** + * On-disk lock ownership record. Written into `.packaging-test.lock` when the + * lock is acquired so that a later waiter can decide whether an existing lock is + * held by a live process or is an orphan left behind by a holder that died (or + * by a vitest test that timed out before its `finally` could release the lock). + */ +interface LockMetadata { + pid: number; + timestamp: number; +} + function sleep(ms: number): Promise { return new Promise((resolve) => { setTimeout(resolve, ms); @@ -34,6 +45,8 @@ export async function withPackagingLock( } try { + const metadata: LockMetadata = { pid: process.pid, timestamp: Date.now() }; + fs.writeFileSync(lockFd, JSON.stringify(metadata)); return await callback(); } finally { fs.closeSync(lockFd); From 95bb1e46959951751c26efd14b77ad5237135bf4 Mon Sep 17 00:00:00 2001 From: Chisanan232 Date: Fri, 26 Jun 2026 12:19:29 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=F0=9F=90=9B=20(test):=20Detect=20and=20rec?= =?UTF-8?q?laim=20a=20stale=20packaging=20lock=20with=20a=20bounded=20wait?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the unbounded retry loop with stale-lock detection: an orphaned .packaging-test.lock (dead owner pid, or a 0-byte lock left by a previous run past a short grace) is reclaimed and retried, and a large TTL backs that up against pid reuse without ever evicting a live, slow-but-legitimate build. Also bound the acquire wait just under the 30s per-test timeout so a lock held by a live, non-stale process fails with an actionable error instead of an opaque "Test timed out". Previously a leaked lock made every later packaging test spin until its own timeout, cascading unrelated failures. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf --- tests/packaging/lock.ts | 117 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/tests/packaging/lock.ts b/tests/packaging/lock.ts index 11e8c1aa..6a67ceb4 100644 --- a/tests/packaging/lock.ts +++ b/tests/packaging/lock.ts @@ -3,6 +3,24 @@ import path from "node:path"; const PACKAGING_LOCK_PATH = path.resolve(process.cwd(), ".packaging-test.lock"); const LOCK_RETRY_INTERVAL_MS = 25; +// A lock holder writes its {pid,timestamp} within milliseconds of creating the +// file, so a lock that is still empty after this grace window lost its writer +// (a 0-byte lock left by a previous run, or a holder that crashed mid-create). +const EMPTY_LOCK_GRACE_MS = 5_000; +// TTL backstop for a lock whose owning pid still appears alive — e.g. after pid +// reuse on a long-lived runner. It is deliberately far larger than any plausible +// single packaging hold (build + npm pack run in seconds, not minutes) because +// those builds run via blocking execSync: a slow-but-live build must never be +// reclaimed out from under itself, which would let two builds race the same +// dist/. Dead-pid detection, not this TTL, is the primary staleness signal. +const STALE_LOCK_TTL_MS = 120_000; +// Bound the acquire wait so a lock held by a live, non-stale process surfaces an +// actionable error instead of an opaque "Test timed out". Kept just under the +// 30s per-test budget: every packaging critical section is a multi-second build, +// so a waiter that has not acquired the lock by here cannot finish before the +// per-test timeout anyway — this only ever turns a would-be timeout into a clear +// message, never failing a waiter that would otherwise have acquired in time. +const LOCK_ACQUIRE_TIMEOUT_MS = 28_000; const RETRYABLE_LOCK_ERROR_CODES = new Set(["EEXIST", "EPERM", "EACCES"]); /** @@ -22,10 +40,93 @@ function sleep(ms: number): Promise { }); } +/** + * Returns true if a process with the given pid is still running. Uses the + * signal-0 probe, which performs the permission/existence check without actually + * delivering a signal. An EPERM means the process exists but is owned by another + * user, so it is treated as alive; any other error (notably ESRCH) means dead. + */ +function isProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (error) { + return error instanceof Error && "code" in error && error.code === "EPERM"; + } +} + +/** + * Reads the ownership metadata of the current lock file, or undefined if the + * lock is absent, empty, or not yet written (a holder that created the file but + * crashed before writing its metadata). + */ +function readLockMetadata(): LockMetadata | undefined { + let raw: string; + try { + raw = fs.readFileSync(PACKAGING_LOCK_PATH, "utf8"); + } catch { + return undefined; + } + + try { + const parsed = JSON.parse(raw) as Partial; + if (typeof parsed.pid === "number" && typeof parsed.timestamp === "number") { + return { pid: parsed.pid, timestamp: parsed.timestamp }; + } + } catch { + // Fall through: an unparseable lock is handled by the staleness fallback. + } + return undefined; +} + +/** + * Age of the lock file on disk in milliseconds, used to judge staleness of a + * lock that has no parseable owner metadata. Returns undefined if the file no + * longer exists. + */ +function lockFileAgeMs(): number | undefined { + try { + return Date.now() - fs.statSync(PACKAGING_LOCK_PATH).mtimeMs; + } catch { + return undefined; + } +} + +/** + * Reclaims the lock if it is an orphan, returning true when one was removed. + * + * A lock is treated as stale when its owning process is dead, when it exceeds + * the (large) TTL backstop, or — for a lock with no parseable metadata — once it + * is older than the short empty-lock grace window. A live holder is never + * reclaimed: it writes its metadata immediately on acquire, and its blocking + * build completes far inside the TTL. + */ +function reclaimStaleLock(): boolean { + const metadata = readLockMetadata(); + + let isStale: boolean; + if (metadata === undefined) { + const ageMs = lockFileAgeMs(); + isStale = ageMs !== undefined && ageMs > EMPTY_LOCK_GRACE_MS; + } else { + isStale = + !isProcessAlive(metadata.pid) || + Date.now() - metadata.timestamp > STALE_LOCK_TTL_MS; + } + + if (!isStale) { + return false; + } + + fs.rmSync(PACKAGING_LOCK_PATH, { force: true }); + return true; +} + export async function withPackagingLock( callback: () => Promise | T ): Promise { let lockFd: number | undefined; + const deadline = Date.now() + LOCK_ACQUIRE_TIMEOUT_MS; while (lockFd === undefined) { try { @@ -40,6 +141,22 @@ export async function withPackagingLock( throw error; } + // The lock is held. If it is an orphan (dead holder, or a stale 0-byte + // lock left by a previous run), reclaim it and retry immediately; + // otherwise back off and wait for the live holder to release it. + if (reclaimStaleLock()) { + continue; + } + + if (Date.now() >= deadline) { + throw new Error( + `Timed out after ${LOCK_ACQUIRE_TIMEOUT_MS}ms acquiring packaging lock ` + + `at ${PACKAGING_LOCK_PATH}. It is held by a live, non-stale process; ` + + `if it is orphaned, delete the file and re-run.`, + { cause: error } + ); + } + await sleep(LOCK_RETRY_INTERVAL_MS); } } From aa0526a029a6f9f3e3a85e08b110f023214a50db Mon Sep 17 00:00:00 2001 From: Chisanan232 Date: Fri, 26 Jun 2026 12:19:29 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E2=9C=85=20(test):=20Add=20stale-lock=20re?= =?UTF-8?q?claim=20regression=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cover the path where the lock is held by a dead owner: the helper probes pid liveness, reclaims the orphan, and acquires the lock so the callback runs. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf --- tests/packaging/lock-helper.test.ts | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts index c4fed0ff..377d5ba3 100644 --- a/tests/packaging/lock-helper.test.ts +++ b/tests/packaging/lock-helper.test.ts @@ -32,6 +32,32 @@ describe("withPackagingLock", () => { expect(rmSpy).toHaveBeenCalledTimes(1); }); + it("reclaims a stale lock whose owning process is dead", async () => { + const staleLock = JSON.stringify({ pid: 999999, timestamp: Date.now() }); + const openSpy = vi + .spyOn(fs, "openSync") + .mockImplementationOnce(() => { + throw createFsError("EEXIST"); + }) + .mockImplementationOnce(() => 456); + vi.spyOn(fs, "readFileSync").mockReturnValue(staleLock); + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => { + throw createFsError("ESRCH"); + }); + const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined); + vi.spyOn(fs, "closeSync").mockImplementation(() => undefined); + vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined); + + const result = await withPackagingLock(async () => "reclaimed"); + + expect(result).toBe("reclaimed"); + // First open hits the orphan; after reclaim the retry acquires it. + expect(openSpy).toHaveBeenCalledTimes(2); + expect(killSpy).toHaveBeenCalledWith(999999, 0); + // Once to reclaim the orphan, once to release in the finally block. + expect(rmSpy).toHaveBeenCalledTimes(2); + }); + it("throws for non-retryable filesystem errors", async () => { vi.spyOn(fs, "openSync").mockImplementation(() => { throw createFsError("ENOENT"); From 0e6de6fbe7f6d078cb938f98737c54d81f9bdf51 Mon Sep 17 00:00:00 2001 From: Chisanan232 Date: Fri, 26 Jun 2026 13:07:59 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=90=9B=20(test):=20Make=20contention-?= =?UTF-8?q?retry=20lock=20test=20hermetic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new stale-detection path reads the lock file and probes the owner pid, so the existing contention-retry test became dependent on whatever real .packaging-test.lock a concurrent packaging test holds on CI — a stale orphan left by a sibling test made reclaim fire, breaking the assertions (and on the slower CI Node 18 runner, timing the test out). Model "no lock on disk" by mocking readFileSync/statSync so the test deterministically exercises the not-stale fast-retry path independent of shared filesystem state. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf --- tests/packaging/lock-helper.test.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts index 377d5ba3..d207e7e3 100644 --- a/tests/packaging/lock-helper.test.ts +++ b/tests/packaging/lock-helper.test.ts @@ -23,6 +23,17 @@ describe("withPackagingLock", () => { const closeSpy = vi.spyOn(fs, "closeSync").mockImplementation(() => undefined); const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined); vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined); + // Keep the staleness probe hermetic: a transient contention error is not a + // stale lock, so model "no lock file on disk" regardless of any real + // .packaging-test.lock a concurrent packaging test may hold on CI. This + // exercises the not-stale path, where the retry simply succeeds once + // contention clears, instead of letting the test depend on shared FS state. + vi.spyOn(fs, "readFileSync").mockImplementation(() => { + throw createFsError("ENOENT"); + }); + vi.spyOn(fs, "statSync").mockImplementation(() => { + throw createFsError("ENOENT"); + }); const result = await withPackagingLock(async () => "ok");