From 3b3894ed7fbb1133d5d6d88fa53429300b1b4f11 Mon Sep 17 00:00:00 2001
From: Chisanan232 <bulls23mj1991@gmail.com>
Date: Fri, 26 Jun 2026 12:00:37 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E2=9C=A8=20(test):=20Write=20pid+timestamp?=
 =?UTF-8?q?=20metadata=20into=20packaging=20lock?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Record the holder's pid and acquisition time inside .packaging-test.lock so
a later waiter can distinguish a live holder from an orphaned lock.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf
---
 tests/packaging/lock-helper.test.ts |  1 +
 tests/packaging/lock.ts             | 13 +++++++++++++
 2 files changed, 14 insertions(+)
diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts
index 0f11cfa9..c4fed0ff 100644
--- a/tests/packaging/lock-helper.test.ts
+++ b/tests/packaging/lock-helper.test.ts
@@ -22,6 +22,7 @@ describe("withPackagingLock", () => {
       .mockImplementationOnce(() => 123);
     const closeSpy = vi.spyOn(fs, "closeSync").mockImplementation(() => undefined);
     const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined);
+    vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined);
 
     const result = await withPackagingLock(async () => "ok");
 
diff --git a/tests/packaging/lock.ts b/tests/packaging/lock.ts
index 7061a133..11e8c1aa 100644
--- a/tests/packaging/lock.ts
+++ b/tests/packaging/lock.ts
@@ -5,6 +5,17 @@ const PACKAGING_LOCK_PATH = path.resolve(process.cwd(), ".packaging-test.lock");
 const LOCK_RETRY_INTERVAL_MS = 25;
 const RETRYABLE_LOCK_ERROR_CODES = new Set(["EEXIST", "EPERM", "EACCES"]);
 
+/**
+ * On-disk lock ownership record. Written into `.packaging-test.lock` when the
+ * lock is acquired so that a later waiter can decide whether an existing lock is
+ * held by a live process or is an orphan left behind by a holder that died (or
+ * by a vitest test that timed out before its `finally` could release the lock).
+ */
+interface LockMetadata {
+  pid: number;
+  timestamp: number;
+}
+
 function sleep(ms: number): Promise<void> {
   return new Promise((resolve) => {
     setTimeout(resolve, ms);
@@ -34,6 +45,8 @@ export async function withPackagingLock<T>(
   }
 
   try {
+    const metadata: LockMetadata = { pid: process.pid, timestamp: Date.now() };
+    fs.writeFileSync(lockFd, JSON.stringify(metadata));
     return await callback();
   } finally {
     fs.closeSync(lockFd);

From 95bb1e46959951751c26efd14b77ad5237135bf4 Mon Sep 17 00:00:00 2001
From: Chisanan232 <bulls23mj1991@gmail.com>
Date: Fri, 26 Jun 2026 12:19:29 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=F0=9F=90=9B=20(test):=20Detect=20and=20rec?=
 =?UTF-8?q?laim=20a=20stale=20packaging=20lock=20with=20a=20bounded=20wait?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the unbounded retry loop with stale-lock detection: an orphaned
.packaging-test.lock (dead owner pid, or a 0-byte lock left by a previous run
past a short grace) is reclaimed and retried, and a large TTL backs that up
against pid reuse without ever evicting a live, slow-but-legitimate build.

Also bound the acquire wait just under the 30s per-test timeout so a lock held
by a live, non-stale process fails with an actionable error instead of an
opaque "Test timed out". Previously a leaked lock made every later packaging
test spin until its own timeout, cascading unrelated failures.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf
---
 tests/packaging/lock.ts | 117 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/tests/packaging/lock.ts b/tests/packaging/lock.ts
index 11e8c1aa..6a67ceb4 100644
--- a/tests/packaging/lock.ts
+++ b/tests/packaging/lock.ts
@@ -3,6 +3,24 @@ import path from "node:path";
 
 const PACKAGING_LOCK_PATH = path.resolve(process.cwd(), ".packaging-test.lock");
 const LOCK_RETRY_INTERVAL_MS = 25;
+// A lock holder writes its {pid,timestamp} within milliseconds of creating the
+// file, so a lock that is still empty after this grace window lost its writer
+// (a 0-byte lock left by a previous run, or a holder that crashed mid-create).
+const EMPTY_LOCK_GRACE_MS = 5_000;
+// TTL backstop for a lock whose owning pid still appears alive — e.g. after pid
+// reuse on a long-lived runner. It is deliberately far larger than any plausible
+// single packaging hold (build + npm pack run in seconds, not minutes) because
+// those builds run via blocking execSync: a slow-but-live build must never be
+// reclaimed out from under itself, which would let two builds race the same
+// dist/. Dead-pid detection, not this TTL, is the primary staleness signal.
+const STALE_LOCK_TTL_MS = 120_000;
+// Bound the acquire wait so a lock held by a live, non-stale process surfaces an
+// actionable error instead of an opaque "Test timed out". Kept just under the
+// 30s per-test budget: every packaging critical section is a multi-second build,
+// so a waiter that has not acquired the lock by here cannot finish before the
+// per-test timeout anyway — this only ever turns a would-be timeout into a clear
+// message, never failing a waiter that would otherwise have acquired in time.
+const LOCK_ACQUIRE_TIMEOUT_MS = 28_000;
 const RETRYABLE_LOCK_ERROR_CODES = new Set(["EEXIST", "EPERM", "EACCES"]);
 
 /**
@@ -22,10 +40,93 @@ function sleep(ms: number): Promise<void> {
   });
 }
 
+/**
+ * Returns true if a process with the given pid is still running. Uses the
+ * signal-0 probe, which performs the permission/existence check without actually
+ * delivering a signal. An EPERM means the process exists but is owned by another
+ * user, so it is treated as alive; any other error (notably ESRCH) means dead.
+ */
+function isProcessAlive(pid: number): boolean {
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch (error) {
+    return error instanceof Error && "code" in error && error.code === "EPERM";
+  }
+}
+
+/**
+ * Reads the ownership metadata of the current lock file, or undefined if the
+ * lock is absent, empty, or not yet written (a holder that created the file but
+ * crashed before writing its metadata).
+ */
+function readLockMetadata(): LockMetadata | undefined {
+  let raw: string;
+  try {
+    raw = fs.readFileSync(PACKAGING_LOCK_PATH, "utf8");
+  } catch {
+    return undefined;
+  }
+
+  try {
+    const parsed = JSON.parse(raw) as Partial<LockMetadata>;
+    if (typeof parsed.pid === "number" && typeof parsed.timestamp === "number") {
+      return { pid: parsed.pid, timestamp: parsed.timestamp };
+    }
+  } catch {
+    // Fall through: an unparseable lock is handled by the staleness fallback.
+  }
+  return undefined;
+}
+
+/**
+ * Age of the lock file on disk in milliseconds, used to judge staleness of a
+ * lock that has no parseable owner metadata. Returns undefined if the file no
+ * longer exists.
+ */
+function lockFileAgeMs(): number | undefined {
+  try {
+    return Date.now() - fs.statSync(PACKAGING_LOCK_PATH).mtimeMs;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Reclaims the lock if it is an orphan, returning true when one was removed.
+ *
+ * A lock is treated as stale when its owning process is dead, when it exceeds
+ * the (large) TTL backstop, or — for a lock with no parseable metadata — once it
+ * is older than the short empty-lock grace window. A live holder is never
+ * reclaimed: it writes its metadata immediately on acquire, and its blocking
+ * build completes far inside the TTL.
+ */
+function reclaimStaleLock(): boolean {
+  const metadata = readLockMetadata();
+
+  let isStale: boolean;
+  if (metadata === undefined) {
+    const ageMs = lockFileAgeMs();
+    isStale = ageMs !== undefined && ageMs > EMPTY_LOCK_GRACE_MS;
+  } else {
+    isStale =
+      !isProcessAlive(metadata.pid) ||
+      Date.now() - metadata.timestamp > STALE_LOCK_TTL_MS;
+  }
+
+  if (!isStale) {
+    return false;
+  }
+
+  fs.rmSync(PACKAGING_LOCK_PATH, { force: true });
+  return true;
+}
+
 export async function withPackagingLock<T>(
   callback: () => Promise<T> | T
 ): Promise<T> {
   let lockFd: number | undefined;
+  const deadline = Date.now() + LOCK_ACQUIRE_TIMEOUT_MS;
 
   while (lockFd === undefined) {
     try {
@@ -40,6 +141,22 @@ export async function withPackagingLock<T>(
         throw error;
       }
 
+      // The lock is held. If it is an orphan (dead holder, or a stale 0-byte
+      // lock left by a previous run), reclaim it and retry immediately;
+      // otherwise back off and wait for the live holder to release it.
+      if (reclaimStaleLock()) {
+        continue;
+      }
+
+      if (Date.now() >= deadline) {
+        throw new Error(
+          `Timed out after ${LOCK_ACQUIRE_TIMEOUT_MS}ms acquiring packaging lock ` +
+            `at ${PACKAGING_LOCK_PATH}. It is held by a live, non-stale process; ` +
+            `if it is orphaned, delete the file and re-run.`,
+          { cause: error }
+        );
+      }
+
       await sleep(LOCK_RETRY_INTERVAL_MS);
     }
   }

From aa0526a029a6f9f3e3a85e08b110f023214a50db Mon Sep 17 00:00:00 2001
From: Chisanan232 <bulls23mj1991@gmail.com>
Date: Fri, 26 Jun 2026 12:19:29 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E2=9C=85=20(test):=20Add=20stale-lock=20re?=
 =?UTF-8?q?claim=20regression=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cover the path where the lock is held by a dead owner: the helper probes pid
liveness, reclaims the orphan, and acquires the lock so the callback runs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf
---
 tests/packaging/lock-helper.test.ts | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts
index c4fed0ff..377d5ba3 100644
--- a/tests/packaging/lock-helper.test.ts
+++ b/tests/packaging/lock-helper.test.ts
@@ -32,6 +32,32 @@ describe("withPackagingLock", () => {
     expect(rmSpy).toHaveBeenCalledTimes(1);
   });
 
+  it("reclaims a stale lock whose owning process is dead", async () => {
+    const staleLock = JSON.stringify({ pid: 999999, timestamp: Date.now() });
+    const openSpy = vi
+      .spyOn(fs, "openSync")
+      .mockImplementationOnce(() => {
+        throw createFsError("EEXIST");
+      })
+      .mockImplementationOnce(() => 456);
+    vi.spyOn(fs, "readFileSync").mockReturnValue(staleLock);
+    const killSpy = vi.spyOn(process, "kill").mockImplementation(() => {
+      throw createFsError("ESRCH");
+    });
+    const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined);
+    vi.spyOn(fs, "closeSync").mockImplementation(() => undefined);
+    vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined);
+
+    const result = await withPackagingLock(async () => "reclaimed");
+
+    expect(result).toBe("reclaimed");
+    // First open hits the orphan; after reclaim the retry acquires it.
+    expect(openSpy).toHaveBeenCalledTimes(2);
+    expect(killSpy).toHaveBeenCalledWith(999999, 0);
+    // Once to reclaim the orphan, once to release in the finally block.
+    expect(rmSpy).toHaveBeenCalledTimes(2);
+  });
+
   it("throws for non-retryable filesystem errors", async () => {
     vi.spyOn(fs, "openSync").mockImplementation(() => {
       throw createFsError("ENOENT");

From 0e6de6fbe7f6d078cb938f98737c54d81f9bdf51 Mon Sep 17 00:00:00 2001
From: Chisanan232 <bulls23mj1991@gmail.com>
Date: Fri, 26 Jun 2026 13:07:59 +0800
Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=90=9B=20(test):=20Make=20contention-?=
 =?UTF-8?q?retry=20lock=20test=20hermetic?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new stale-detection path reads the lock file and probes the owner pid, so
the existing contention-retry test became dependent on whatever real
.packaging-test.lock a concurrent packaging test holds on CI — a stale orphan
left by a sibling test made reclaim fire, breaking the assertions (and on the
slower CI Node 18 runner, timing the test out). Model "no lock on disk" by
mocking readFileSync/statSync so the test deterministically exercises the
not-stale fast-retry path independent of shared filesystem state.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019mSz31RysZF6DYToUoBWLf
---
 tests/packaging/lock-helper.test.ts | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/packaging/lock-helper.test.ts b/tests/packaging/lock-helper.test.ts
index 377d5ba3..d207e7e3 100644
--- a/tests/packaging/lock-helper.test.ts
+++ b/tests/packaging/lock-helper.test.ts
@@ -23,6 +23,17 @@ describe("withPackagingLock", () => {
     const closeSpy = vi.spyOn(fs, "closeSync").mockImplementation(() => undefined);
     const rmSpy = vi.spyOn(fs, "rmSync").mockImplementation(() => undefined);
     vi.spyOn(fs, "writeFileSync").mockImplementation(() => undefined);
+    // Keep the staleness probe hermetic: a transient contention error is not a
+    // stale lock, so model "no lock file on disk" regardless of any real
+    // .packaging-test.lock a concurrent packaging test may hold on CI. This
+    // exercises the not-stale path, where the retry simply succeeds once
+    // contention clears, instead of letting the test depend on shared FS state.
+    vi.spyOn(fs, "readFileSync").mockImplementation(() => {
+      throw createFsError("ENOENT");
+    });
+    vi.spyOn(fs, "statSync").mockImplementation(() => {
+      throw createFsError("ENOENT");
+    });
 
     const result = await withPackagingLock(async () => "ok");