Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0ac35f2
fix(store): add realpath:false to prevent ENOENT after stale lock cle…
jlin53882 Apr 21, 2026
af18dab
fix(store): handle C1 TOCTOU race with ELOCKED retry-and-cleanup
jlin53882 Apr 21, 2026
d993cc6
test: add ELOCKED cleanup edge case tests for PR #674
jlin53882 Apr 21, 2026
fba399f
ci: add lock-recovery.test.mjs to core-regression group (Issue #670/P…
jlin53882 Apr 21, 2026
d4426b3
ci: add lock-recovery.test.mjs to verify baseline (Issue #670/PR #674)
jlin53882 Apr 21, 2026
8428447
fix(store): add lockfilePath + nested ELOCKED retry + reduce second r…
jlin53882 Apr 22, 2026
59b148c
fix(store): remove proactive cleanup noise logs (W3, PR#674)
jlin53882 Apr 22, 2026
8da1374
fix(store): guard release() in finally to prevent undefined TypeError…
jlin53882 Apr 28, 2026
5b55307
test: mark cleanup-failure test as it.skip — inert, cannot trigger EP…
jlin53882 Apr 28, 2026
fda4362
fix(store): ELOCKED safety + ENOTDIR recovery + stale artifact age ch…
Apr 29, 2026
6eb4f27
fix(test): remove git conflict marker left in lock-recovery.test.mjs …
Apr 29, 2026
abe77ea
test: improve cleanup-failure test — parent-dir chmod strategy, full …
Apr 29, 2026
9463ced
docs(store): 補充 proactive cleanup(5min) vs ELOCKED handler(10s) 雙重 th…
May 1, 2026
23682fb
Merge master into fix/issue670-clean: resolve package.json test scrip…
jlin53882 May 3, 2026
7cd0f54
fix(store): sync comment with actual retry params (~3s, not ~151s)
jlin53882 May 3, 2026
6ad0f73
fix(store): restore PR#415 retry params + add legacy .lock artifact c…
jlin53882 May 4, 2026
55fe730
fix(test): restore W2 concurrent write tests + fix comment drift (W2 …
jlin53882 May 4, 2026
3c623bf
fix(store): address all rwmjhb review issues (M1+M2+W3)
jlin53882 May 4, 2026
5a9ab61
fix(F1+F4 review fixes): align ELOCKED threshold to 5min; fix stale a…
jlin53882 May 4, 2026
91e64e3
fix(F6): downgrade ELOCKED handler console.warn to console.debug
jlin53882 May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"author": "win4r",
"license": "MIT",
"scripts": {
"test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node --test test/per-agent-auto-recall.test.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs && node --test test/command-reflection-guard.test.mjs",
"test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node --test test/per-agent-auto-recall.test.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-update-metadata-refresh.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs && node --test test/command-reflection-guard.test.mjs && node --test test/lock-recovery.test.mjs",
"test:cli-smoke": "node scripts/run-ci-tests.mjs --group cli-smoke",
"test:core-regression": "node scripts/run-ci-tests.mjs --group core-regression",
"test:storage-and-schema": "node scripts/run-ci-tests.mjs --group storage-and-schema",
Expand Down
2 changes: 2 additions & 0 deletions scripts/ci-test-manifest.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ export const CI_TEST_MANIFEST = [
{ group: "storage-and-schema", runner: "node", file: "test/cross-process-lock.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/lock-stress-test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/lock-release-on-error.test.mjs", args: ["--test"] },
// Issue #670 / PR #674: ELOCKED cleanup + TOCTOU race recovery
{ group: "core-regression", runner: "node", file: "test/lock-recovery.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/preference-slots.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/is-latest-auto-supersede.test.mjs" },
{ group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
Expand Down
2 changes: 2 additions & 0 deletions scripts/verify-ci-test-manifest.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ const EXPECTED_BASELINE = [
{ group: "storage-and-schema", runner: "node", file: "test/cross-process-lock.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/lock-stress-test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/lock-release-on-error.test.mjs", args: ["--test"] },
// Issue #670 / PR #674: ELOCKED cleanup + TOCTOU race recovery
{ group: "core-regression", runner: "node", file: "test/lock-recovery.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/preference-slots.test.mjs", args: ["--test"] },
{ group: "core-regression", runner: "node", file: "test/is-latest-auto-supersede.test.mjs" },
{ group: "core-regression", runner: "node", file: "test/temporal-awareness.test.mjs", args: ["--test"] },
Expand Down
224 changes: 188 additions & 36 deletions src/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
mkdirSync,
realpathSync,
lstatSync,
rmSync,
statSync,
unlinkSync,
} from "node:fs";
Expand Down Expand Up @@ -212,49 +213,197 @@ export class MemoryStore {
private async runWithFileLock<T>(fn: () => Promise<T>): Promise<T> {
const lockfile = await loadLockfile();
const lockPath = join(this.config.dbPath, ".memory-write.lock");
if (!existsSync(lockPath)) {
try { mkdirSync(dirname(lockPath), { recursive: true }); } catch {}
try { const { writeFileSync } = await import("node:fs"); writeFileSync(lockPath, "", { flag: "wx" }); } catch {}
// Proactive cleanup threshold: 5 minutes —保守設定
// 只清理明顯過時(>5 分鐘)的 artifact,避免誤刪還在工作中的 lock holder
// 5 分鐘比 proper-lockfile 內部 stale threshold(10 秒)更寬鬆,因為:
// - proactive cleanup 是「預防性」清理(還沒有人抱怨),保守為上
// - ELOCKED retry 才是「復原性」處理(有人已經抱怨了),可以更積極
const staleThresholdMs = 5 * 60 * 1000;

// Ensure parent directory exists for lock artifact
if (!existsSync(this.config.dbPath)) {
try { mkdirSync(this.config.dbPath, { recursive: true }); } catch {}
}
// 【修復 #415】調整 retries:max wait 從 ~3100ms → ~151秒
// 指數退避:1s, 2s, 4s, 8s, 16s, 30s×5,總計約 151 秒

// Helper: proactively cleanup stale lock artifacts
// Cleans up both FILE artifacts (old v3) and DIRECTORY artifacts (v4)
// Also cleans legacy ${lockPath}.lock (v3 default artifact) for rolling upgrades.
// F3 NOTE: cleanup only checks lockPath + ${lockPath}.lock. If proper-lockfile
// is configured with a custom lockfilePath, this cleanup will miss it. Custom
// lockfilePath support would require storing the artifact path in config.
const cleanupStaleArtifact = () => {
// Primary artifact (v4 default: ${lockPath}.lock/ — directory)
// Note: we check lockPath directly; if artifact is at a custom lockfilePath
// (not supported in current impl), this cleanup will not detect it.
if (existsSync(lockPath)) {
try {
const stat = statSync(lockPath);
const ageMs = Date.now() - stat.mtimeMs;
if (ageMs > staleThresholdMs) {
if (stat.isDirectory()) {
try { rmSync(lockPath, { recursive: true, force: true }); } catch {}
} else {
try { unlinkSync(lockPath); } catch {}
}
}
} catch {}
}
// Legacy artifact (v3 default: ${lockPath}.lock) — rolling upgrade compatibility
const legacyPath = lockPath + ".lock";
if (existsSync(legacyPath)) {
try {
const stat = statSync(legacyPath);
const ageMs = Date.now() - stat.mtimeMs;
if (ageMs > staleThresholdMs) {
try { unlinkSync(legacyPath); } catch {}
}
} catch {} // intentionally swallow — proactive, non-critical
}
};

// Proactive cleanup before first lock attempt
cleanupStaleArtifact();

// 【修復 #415】保守設定:支撐 event loop 阻塞的高負載場景
// max wait ~151秒:指數退避 1s+2s+4s+8s+16s+30s×4,total ~151s
// ECOMPROMISED 透過 onCompromised callback 觸發(非 throw),使用 flag 機制正確處理
let isCompromised = false;
let compromisedErr: unknown = null;
let fnSucceeded = false;
let fnError: unknown = null;

// Proactive cleanup of stale lock artifacts(from PR #626)
// 根本避免 >5 分鐘的 lock artifact 導致 ECOMPROMISED
if (existsSync(lockPath)) {
try {
const stat = statSync(lockPath);
const ageMs = Date.now() - stat.mtimeMs;
const staleThresholdMs = 5 * 60 * 1000;
if (ageMs > staleThresholdMs) {
try { unlinkSync(lockPath); } catch {}
console.warn(`[memory-lancedb-pro] cleared stale lock: ${lockPath} ageMs=${ageMs}`);
const doLock = (retryOptions?: { retries: number; factor: number; minTimeout: number; maxTimeout: number }) =>
lockfile.lock(lockPath, {
// M2 FIX: 不再指定 lockfilePath,保持 v4 預設 artifact 行為。
// 預設 artifact = ${lockPath}.lock(目錄),與 legacy v3/v4 代碼一致。
// 移除 lockfilePath: lockPath 的原因:
// 新版設定 lockfilePath=lockPath 時,artifact = lockPath 本身(與 legacy 隔離),
// 混合版本環境下新舊進程可能各自持有不同的 lock 而不自知,破壞 mutual exclusion。
// v4 的 artifact 是 ${lockPath}.lock/ 目錄(裡面放微文件)。
// v3 的 artifact 是 ${lockPath}.lock/ FILE。
// cleanupStaleArtifact 和 ELOCKED handler 必須同時支援 FILE 和 DIR 兩種 artifact。
realpath: false,
retries: retryOptions ?? {
// 【修復 #415 保守設定】支撐 event loop 阻塞的高負載場景
// max wait ~151秒:指數退避 1s+2s+4s+8s+16s+30s×4,total ~151s
retries: 10,
factor: 2,
minTimeout: 1000, // James 保守設定:避免高負載下過度密集重試
maxTimeout: 30000, // James 保守設定:支撐更久的 event loop 阻塞
// IMPORTANT: This gives active-holder time to resolve (concurrent writes).
// If retries are exhausted, we check artifact age to decide cleanup vs propagate.
},
stale: 10000, // 10 秒後視為 stale,觸發 ECOMPROMISED callback
// 注意:ECOMPROMISED 是 ambiguous degradation 訊號,mtime 無法區分
// "holder 崩潰" vs "holder event loop 阻塞",所以不嘗試區分
onCompromised: (err: unknown) => {
// 【修復 #415 關鍵】必須是同步 callback
// setLockAsCompromised() 不等待 Promise,async throw 無法傳回 caller
isCompromised = true;
compromisedErr = err;
},
});

// F1 FIX: ELOCKED/ENOTDIR handler threshold — 統一用 proactive cleanup 的 5min threshold
// 避免不一致:若用 10s threshold,30s old 的 active holder artifact 會被
// ELOCKED handler 刪除(>10s),但 proactive cleanup 不會(<5min)——破壞 mutual exclusion。
// 統一用 5min:artifact age > 5min 才視為 stale,低於此值不刪,保持與 proactive cleanup 一致。
const STALE_THRESHOLD_MS = staleThresholdMs;

let release: (() => Promise<void>) | undefined;
try {
release = await doLock();
} catch (err: unknown) {
// C1: TOCTOU race — artifact created between proactive cleanup and lock().
// C2: ENOTDIR — proper-lockfile tried to create DIR artifact but found a FILE
// (legacy v3 FILE artifact lying around, or path collision).
// M2: Legacy rolling upgrade — old v3 code uses ${lockPath}.lock as artifact.
// Only retry if the artifact is confirmed STALE (another process died).
// If artifact is NOT stale, it belongs to an ACTIVE holder — we must NOT
// delete it, otherwise two processes enter the critical section simultaneously.
const errCode = (err as NodeJS.ErrnoException).code;

if (errCode === "ELOCKED" || errCode === "ENOTDIR") {
console.debug(`[memory-lancedb-pro] ${errCode} on first attempt, checking artifact age: ${lockPath}`);

// Helper: check + cleanup artifact at a given path, return whether retry should proceed
const tryCleanup = (artifactPath: string): boolean => {
if (!existsSync(artifactPath)) return false; // not found, caller handles
try {
const stat = statSync(artifactPath);
const age = Date.now() - stat.mtimeMs;
if (age > STALE_THRESHOLD_MS) {
// Artifact is STALE — the previous holder crashed or hung. Safe to delete.
// W3 FIX: wrap rmSync to handle race where another process recreates
// the artifact between statSync and rmSync (EOF/ENOENT means already gone).
try {
rmSync(artifactPath, { recursive: true, force: true });
console.debug(`[memory-lancedb-pro] removed stale ${artifactPath} (age=${age}ms>${STALE_THRESHOLD_MS}ms), retrying`);
return true; // proceed to retry
} catch (rmErr: unknown) {
const rmCode = (rmErr as NodeJS.ErrnoException).code;
if (rmCode === "ENOENT" || rmCode === "EBUSY") {
// Race: artifact was recreated or already deleted — treat as gone, proceed to retry
console.debug(`[memory-lancedb-pro] ${errCode} cleanup: rmSync ${rmCode} (artifact changed during cleanup), retrying`);
return true;
}
// Genuine cleanup failure
const wrapped = new Error(`${errCode} cleanup rm failed (${rmCode}): ${rmErr}`, { cause: rmErr });
(wrapped as NodeJS.ErrnoException).code = rmCode;
throw wrapped;
}
} else {
// Artifact is NOT stale — belongs to an ACTIVE holder.
// MR2 FIX: do not claim "NOT stale" for ENOTDIR — staleness is ELOCKED-specific.
const wrapped = new Error(`${errCode}: ${artifactPath} exists and is NOT stale (age=${age}ms≤${STALE_THRESHOLD_MS}ms); active holder present, not removing`, { cause: err });
(wrapped as NodeJS.ErrnoException).code = errCode;
throw wrapped;
}
} catch (statErr: unknown) {
const statCode = (statErr as NodeJS.ErrnoException).code;
if (statCode === "ENOENT") {
// TOCTOU: artifact disappeared between existsSync and statSync (another
// process released). Proceed to retry.
console.debug(`[memory-lancedb-pro] ${errCode} cleanup: statSync ENOENT (artifact gone), retrying`);
return true;
} else {
const errMsg = statErr instanceof Error ? (statErr as Error).message : String(statErr);
const wrapped = new Error(`${errCode} cleanup stat failed (${statCode}): ${errMsg}`, { cause: statErr });
(wrapped as NodeJS.ErrnoException).code = statCode;
throw wrapped;
}
}
};

let shouldRetry = tryCleanup(lockPath);
// M2: If primary path didn't trigger retry, check legacy v3 artifact for rolling upgrades
if (!shouldRetry) {
const legacyPath = lockPath + ".lock";
shouldRetry = tryCleanup(legacyPath);
}
} catch {}
}

const release = await lockfile.lock(lockPath, {
retries: {
retries: 10,
factor: 2,
minTimeout: 1000, // James 保守設定:避免高負載下過度密集重試
maxTimeout: 30000, // James 保守設定:支撐更久的 event loop 阻塞
},
stale: 10000, // 10 秒後視為 stale,觸發 ECOMPROMISED callback
// 注意:ECOMPROMISED 是 ambiguous degradation 訊號,mtime 無法區分
// "holder 崩潰" vs "holder event loop 阻塞",所以不嘗試區分
onCompromised: (err: unknown) => {
// 【修復 #415 關鍵】必須是同步 callback
// setLockAsCompromised() 不等待 Promise,async throw 無法傳回 caller
isCompromised = true;
compromisedErr = err;
},
});
// M1 FIX: must never execute fn() without a confirmed lock acquisition.
// shouldRetry=false means an ACTIVE holder is present (age <= STALE_THRESHOLD_MS).
// Fall-through would run fn() without holding any lock — breaks mutual exclusion.
if (shouldRetry) {
// FIX_W2: second attempt — use minimal retries since artifact was confirmed stale
try {
release = await doLock({ retries: 2, factor: 1, minTimeout: 100, maxTimeout: 500 });
} catch (retryErr: unknown) {
const errMsg = retryErr instanceof Error ? (retryErr as Error).message : String(retryErr);
const retryCode = (retryErr as NodeJS.ErrnoException).code || "UNKNOWN";
throw new Error(`${errCode} retry failed (${retryCode}): ${errMsg}`, { cause: retryErr });
}
} else {
// shouldRetry=false means active holder present; propagate ELOCKED instead.
const wrapped = new Error(`ELOCKED: active lock holder present on both ${lockPath} and ${lockPath + ".lock"}; not removing, propagating`, { cause: err });
(wrapped as NodeJS.ErrnoException).code = "ELOCKED";
throw wrapped;
}
} else {
throw err;
}
}

try {
const result = await fn();
Expand All @@ -266,8 +415,12 @@ export class MemoryStore {
} finally {
// 【修復 #415 BUG】release() 必須在 isCompromised 判斷之前呼叫
// 否則當 fnError !== null 且 isCompromised === true 時,release() 不會被呼叫,lock 永久洩漏
// 【修復 #415 Must Fix #1】若 release 未被賦值(non-ELOCKED error 在賦值前就 throw),
// 直接跳過 release,避免 TypeError: release is not a function
try {
await release();
if (release) {
await release();
}
} catch (e: unknown) {
if ((e as NodeJS.ErrnoException).code === 'ERELEASED') {
// ERELEASED 是預期行為(compromised lock release),忽略
Expand Down Expand Up @@ -295,7 +448,6 @@ export class MemoryStore {
}
}
}

get dbPath(): string {
return this.config.dbPath;
}
Expand Down
Loading
Loading