From db0da2ef1c3e8b2caf986a62280921c60663fb92 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 20 May 2026 07:43:34 -0700 Subject: [PATCH 1/3] fix(markdown): YAML-aware NESTED_QUOTES validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The validator at src/core/markdown.ts:219-238 was a syntactic count-of-quotes heuristic that flagged any frontmatter line with 3+ unescaped " characters. That heuristic is too dumb: valid YAML flow sequences like `tags: ["yc", "w2025"]` and single-quoted scalars like `title: 'a: "b" "c"'` both have 3+ unescaped " by design. Fix: keep the count fast path, then disambiguate with js-yaml.safeLoad on the value. Only flag lines that genuinely fail to parse. The full-frontmatter YAML_PARSE check (check 6) still catches structural failures. Closes the 6,981-error class on Garry's 105K-page brain in one ~10 LOC change — existing data on disk was already valid YAML; the validator was wrong about it. No `gbrain frontmatter generate --fix` sweep needed. js-yaml@3.14.2 promoted from transitive (via gray-matter) to direct dependency. @types/js-yaml@3.12.10 added to devDependencies. 5 new YAML-aware test cases in test/markdown-validation.test.ts: - flow sequence with quoted tags does NOT trigger (6,981 regression guard) - single-quoted scalar with literal inner double quotes does NOT trigger - escaped-as-'' quotes inside flow seq do NOT trigger - genuinely broken nested quotes STILL trigger - unclosed bracket STILL surfaces NESTED_QUOTES or YAML_PARSE Closes PR #1217 — outside-voice (codex) review caught that the bug was the validator, not the emitter. Original 6,981-error signal from @garrytan-agents. --- bun.lock | 4 +++ package.json | 4 ++- src/core/markdown.ts | 26 ++++++++++++++++--- test/markdown-validation.test.ts | 44 ++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 4 deletions(-) diff --git a/bun.lock b/bun.lock index 86f62516e..1f5e9dbe7 100644 --- a/bun.lock +++ b/bun.lock @@ -25,6 +25,7 @@ "express-rate-limit": "^7.5.0", "gray-matter": "^4.0.3", "heic-decode": "^2.1.0", + "js-yaml": "^3.14.2", "marked": "^18.0.0", "openai": "^4.0.0", "pgvector": "^0.2.0", @@ -38,6 +39,7 @@ "@types/cookie-parser": "^1.4.7", "@types/cors": "^2.8.19", "@types/express": "^5.0.6", + "@types/js-yaml": "^3.12.10", "bun-types": "^1.3.13", "typescript": "^5.6.0", }, @@ -275,6 +277,8 @@ "@types/http-errors": ["@types/http-errors@2.0.5", "", {}, "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg=="], + "@types/js-yaml": ["@types/js-yaml@3.12.10", "", {}, "sha512-/Mtaq/wf+HxXpvhzFYzrzCqNRcA958sW++7JOFC8nPrZcvfi/TrzOaaGbvt27ltJB2NQbHVAg5a1wUCsyMH7NA=="], + "@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="], "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], diff --git a/package.json b/package.json index 1735a0685..924d6f6d1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gbrain", - "version": "0.37.1.0", + "version": "0.40.0.1", "description": "Postgres-native personal knowledge brain with hybrid RAG search", "type": "module", "main": "src/core/index.ts", @@ -98,6 +98,7 @@ "express-rate-limit": "^7.5.0", "gray-matter": "^4.0.3", "heic-decode": "^2.1.0", + "js-yaml": "^3.14.2", "marked": "^18.0.0", "openai": "^4.0.0", "pgvector": "^0.2.0", @@ -111,6 +112,7 @@ "@types/cookie-parser": "^1.4.7", "@types/cors": "^2.8.19", "@types/express": "^5.0.6", + "@types/js-yaml": "^3.12.10", "bun-types": "^1.3.13", "typescript": "^5.6.0" }, diff --git a/src/core/markdown.ts b/src/core/markdown.ts index 1d697f26d..d0f61ef4b 100644 --- a/src/core/markdown.ts +++ b/src/core/markdown.ts @@ -1,4 +1,5 @@ import matter from 'gray-matter'; +import { safeLoad as yamlSafeLoad } from 'js-yaml'; import type { PageType } from './types.ts'; import { slugifyPath } from './sync.ts'; @@ -217,8 +218,14 @@ function collectValidationErrors( } // 5. NESTED_QUOTES — common breakage pattern: `title: "Name "Nick" Last"`. - // Detect any frontmatter `key: ...` line whose value contains 3 or more - // unescaped double-quote characters. A clean quoted value has 2. + // The heuristic: a frontmatter `key: value` line with 3+ unescaped + // double-quote characters is suspicious. But raw quote-counting is + // too dumb: a YAML flow sequence like `tags: ["yc", "w2025"]` has + // 4 unescaped `"` by design (valid), and a single-quoted scalar + // like `title: 'a: "b" "c"'` has literal inner `"` (also valid). + // Disambiguate by running js-yaml on just the value; only flag + // lines that genuinely fail to parse. The full-frontmatter YAML + // parse error is caught separately by check 6 (YAML_PARSE) below. for (let i = firstNonEmpty + 1; i < closeLine; i++) { const line = lines[i]; const m = line.match(/^\s*[A-Za-z_][\w-]*\s*:\s*(.*)$/); @@ -228,7 +235,20 @@ function collectValidationErrors( for (let j = 0; j < value.length; j++) { if (value[j] === '"' && (j === 0 || value[j - 1] !== '\\')) count++; } - if (count >= 3) { + if (count < 3) continue; + + // 3+ unescaped quotes — could be valid YAML (flow seq, single-quoted + // scalar with inner quotes, bare scalar with embedded quotes) or + // genuinely broken. Parse the value to disambiguate. + let isValidYaml = false; + try { + yamlSafeLoad(value); + isValidYaml = true; + } catch { + // YAML parse failed — line is genuinely broken + } + + if (!isValidYaml) { errors.push({ code: 'NESTED_QUOTES', message: 'Nested double quotes in YAML value (use single quotes for the outer)', diff --git a/test/markdown-validation.test.ts b/test/markdown-validation.test.ts index 0b2b03d0e..9e5f8aab0 100644 --- a/test/markdown-validation.test.ts +++ b/test/markdown-validation.test.ts @@ -135,6 +135,50 @@ describe('parseMarkdown validation surface', () => { }); }); + // The validator's count-of-quotes heuristic is too dumb: it flagged + // valid YAML flow sequences (the v0.x 6,981-error class on Garry's + // brain) and single-quoted scalars with literal inner quotes. The + // fallback runs js-yaml.safeLoad on suspicious values; only flags + // genuinely unparseable lines. + describe('NESTED_QUOTES — YAML-aware fallback', () => { + test('flow sequence with quoted tags does NOT trigger (6,981-error regression guard)', () => { + const md = `${fence}\ntype: concept\ntitle: x\ntags: ["yc", "w2025", "ai"]\n${fence}\n\nbody`; + const parsed = parseMarkdown(md, undefined, { validate: true }); + expect(parsed.errors!.filter(e => e.code === 'NESTED_QUOTES')).toHaveLength(0); + }); + + test('single-quoted scalar with literal inner double quotes does NOT trigger', () => { + // value: 'a: "b" "c" "d"' — 6 unescaped " by raw count, but valid YAML + const md = `${fence}\ntype: concept\ntitle: 'a: "b" "c" "d"'\n${fence}\n\nbody`; + const parsed = parseMarkdown(md, undefined, { validate: true }); + expect(parsed.errors!.filter(e => e.code === 'NESTED_QUOTES')).toHaveLength(0); + }); + + test('escaped-as-single-pair quotes inside flow seq do NOT trigger', () => { + const md = `${fence}\ntype: concept\ntitle: x\ntags: ["Men''s Fashion", "yc"]\n${fence}\n\nbody`; + const parsed = parseMarkdown(md, undefined, { validate: true }); + expect(parsed.errors!.filter(e => e.code === 'NESTED_QUOTES')).toHaveLength(0); + }); + + test('genuinely broken nested quotes STILL trigger', () => { + // Outer " followed by stray inner " — yaml.safeLoad throws. + const md = `${fence}\ntype: concept\ntitle: "Foo "bar" baz "qux" end"\n${fence}\n\nbody`; + const parsed = parseMarkdown(md, undefined, { validate: true }); + expect(parsed.errors!.map(e => e.code)).toContain('NESTED_QUOTES'); + }); + + test('unclosed bracket on a suspicious line STILL surfaces some parse error', () => { + // Either NESTED_QUOTES (line-level parse fail) or YAML_PARSE + // (whole-frontmatter parse fail) — never silent. + const md = `${fence}\ntype: concept\ntitle: x\ntags: ["yc", "w2025"\n${fence}\n\nbody`; + const parsed = parseMarkdown(md, undefined, { validate: true }); + const broken = parsed.errors!.filter( + e => e.code === 'NESTED_QUOTES' || e.code === 'YAML_PARSE' + ); + expect(broken.length).toBeGreaterThan(0); + }); + }); + describe('EMPTY_FRONTMATTER', () => { test('--- --- with nothing between', () => { const md = `${fence}\n${fence}\n\nbody`; From b0dd4d2bf71b52fa9dd9507aceaffa4921a33568 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 20 May 2026 07:43:41 -0700 Subject: [PATCH 2/3] chore: bump version and changelog (v0.40.0.1) Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ TODOS.md | 5 +++++ VERSION | 2 +- 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6643310e8..f8132dcc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,55 @@ All notable changes to GBrain will be documented in this file. +## [0.40.0.1] - 2026-05-20 + +**`gbrain doctor` stops flagging your tags as broken when they're not.** + +If you have a tag line like `tags: ["yc", "w2025", "ai"]` in your frontmatter, that's perfectly valid YAML. The doctor used to flag it anyway. One user with a 105K-page brain saw 6,981 of these flagged at once. The fix: the validator now parses suspicious values with a real YAML parser before complaining. Valid YAML stops getting flagged. Genuinely broken titles like `title: "Foo "bar" baz"` still get caught. + +**How to use it** + +```bash +gbrain upgrade +gbrain doctor --json | jq '.checks[] + | select(.name=="frontmatter_integrity") + | .breakdown.NESTED_QUOTES' +``` + +The NESTED_QUOTES count on your brain should drop toward zero on next `gbrain doctor`. No data rewrite needed. No `gbrain frontmatter generate --fix` sweep. The existing files are already valid YAML. + +**Why this is the right layer** + +`@garrytan-agents` opened PR #1217 with a one-line fix to the emitter side: switch tag serialization from `JSON.stringify` (double-quoted) to single-quoted YAML. That made the headline 6,981-error case go away by changing what new writes look like. But Codex's outside-voice review during planning caught a deeper bug: even with a perfect emitter, the validator at `src/core/markdown.ts:219-238` was a raw quote counter (`count(unescaped ") >= 3 => error`) that doesn't understand YAML at all. It would still flag a clean single-quoted scalar like `title: 'a: "b" "c"'` (6 unescaped `"` characters, but valid YAML). The fix had to land on the dumb side, not the emitter side. + +PR #1217 was closed; thanks to @garrytan-agents for the 6,981-error signal that exposed the underlying class. + +**What's safe to know about** + +- The fix is additive. Lines with `< 3` unescaped quotes still pass instantly (existing fast path). Only suspicious lines pay the per-line YAML parse, and only when count >= 3 (rare on healthy data). +- `js-yaml@3.14.2` is now a direct dependency (was transitive via gray-matter). Adding a direct pin so a future gray-matter major bump can't yank the import. +- The frontmatter emitter at `src/core/frontmatter-inference.ts` is unchanged. Existing tag style (`tags: ["yc"]`) is now correctly recognized as valid; the cosmetic consistency with `brain-writer.ts:184`'s single-quote repair style is a follow-up TODO, not part of this fix. + +### Itemized changes + +#### Fixed + +- `src/core/markdown.ts:219-238` — NESTED_QUOTES validator now disambiguates via `js-yaml.safeLoad`. The count-of-quotes heuristic stays as a fast path; suspicious lines (count >= 3) are parsed before being flagged. Closes the 6,981-error class for any brain whose frontmatter has been valid all along. +- `js-yaml` declared as a direct dependency in `package.json`; `@types/js-yaml` added to devDependencies. `bun.lock` re-resolves the transitive entry to a top-level pin (no version change). + +#### Added + +- 5 new YAML-aware regression cases in `test/markdown-validation.test.ts`: + - flow sequence with quoted tags does NOT trigger (6,981-error regression guard) + - single-quoted scalar with literal inner double quotes does NOT trigger + - escaped-as-`''` quotes inside flow seq do NOT trigger + - genuinely broken nested quotes STILL trigger + - unclosed bracket STILL surfaces NESTED_QUOTES or YAML_PARSE (never silent) + +#### For contributors + +- `js-yaml` is now an explicit direct dep. New code that needs YAML emission/parsing should import from it directly rather than relying on gray-matter's transitive resolution. + ## [0.37.1.0] - 2026-05-19 **Your brain can now collide its own ideas.** diff --git a/TODOS.md b/TODOS.md index 9e9731cca..35791697e 100644 --- a/TODOS.md +++ b/TODOS.md @@ -1,6 +1,11 @@ # TODOS +## v0.40.0.1 NESTED_QUOTES validator follow-up + +- [ ] **v0.40.x+: unify `serializeFrontmatter` tag/title quoting with `brain-writer.ts:184`'s single-quote-with-`''`-escape style for consistency.** Cosmetic only now that the validator at `src/core/markdown.ts:219-238` is YAML-aware (v0.40.0.1). Today the emitter still produces `tags: ["yc"]` (double-quoted via `JSON.stringify`) while the repair path produces `tags: ['yc']` (single-quoted). Both are valid YAML and the validator accepts both, so this is cosmetic — but new writes drifting from repair-side output reads as inconsistency. Original signal: PR #1217 by @garrytan-agents (closed in favor of the validator fix). Touch `src/core/frontmatter-inference.ts:391-416` only; should be ~5 LOC + the existing test at `test/frontmatter-inference.test.ts:239` updated. + + ## v0.35.6.0 floor-ratio gate follow-ups (v0.36.x+) - [ ] **v0.36.x: Run gbrain-side floor-ratio ablation before flipping any mode-bundle default.** v0.35.6.0 ships the gate default-off (`MODE_BUNDLES[*].floor_ratio = undefined`) because the SkyTwin labeled-retrieval ablation that surfaced the regression isn't reproducible on gbrain's own eval surfaces from outside. Before any mode-bundle default flip, run the gate at `floor_ratio: undefined`, 0.85, 0.90, 0.95 across `gbrain eval longmemeval`, `gbrain eval whoknows`, `gbrain eval suspected-contradictions`, and the BrainBench-Real replay (sibling gbrain-evals repo). Quantify per-mode P@k / R@k / nDCG@k / top-1 stability deltas. Look for: regression on queries that genuinely need the long-tail boost (specific entity lookups, low-frequency topics) vs improvement on queries where weak-overlap pages were leapfrogging. The corpus-level finding determines whether tokenmax (most exposure to the failure mode) should flip first, or whether the gate stays a per-call opt-in indefinitely. Filed during v0.35.6.0 codex outside-voice review. diff --git a/VERSION b/VERSION index 5f878d2ab..ddaa694e8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.37.1.0 \ No newline at end of file +0.40.0.1 \ No newline at end of file From d38441195187d27adb22a22a0fc45577e9c78ab8 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 20 May 2026 18:28:44 -0700 Subject: [PATCH 3/3] chore: retarget version slot v0.40.0.1 -> v0.37.5.0 Same content, different slot in the version queue. v0.40.0.1 was the queue allocator's default safe slot (bumped past PR #1128's claimed 0.40.0.0). v0.37.5.0 is a PATCH above #1228's claimed 0.37.4.0 and sits closer to current master (0.37.1.0) in CHANGELOG ordering. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 2 +- TODOS.md | 4 ++-- VERSION | 2 +- package.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8132dcc2..562926a87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to GBrain will be documented in this file. -## [0.40.0.1] - 2026-05-20 +## [0.37.5.0] - 2026-05-20 **`gbrain doctor` stops flagging your tags as broken when they're not.** diff --git a/TODOS.md b/TODOS.md index 35791697e..c7085dd56 100644 --- a/TODOS.md +++ b/TODOS.md @@ -1,9 +1,9 @@ # TODOS -## v0.40.0.1 NESTED_QUOTES validator follow-up +## v0.37.5.0 NESTED_QUOTES validator follow-up -- [ ] **v0.40.x+: unify `serializeFrontmatter` tag/title quoting with `brain-writer.ts:184`'s single-quote-with-`''`-escape style for consistency.** Cosmetic only now that the validator at `src/core/markdown.ts:219-238` is YAML-aware (v0.40.0.1). Today the emitter still produces `tags: ["yc"]` (double-quoted via `JSON.stringify`) while the repair path produces `tags: ['yc']` (single-quoted). Both are valid YAML and the validator accepts both, so this is cosmetic — but new writes drifting from repair-side output reads as inconsistency. Original signal: PR #1217 by @garrytan-agents (closed in favor of the validator fix). Touch `src/core/frontmatter-inference.ts:391-416` only; should be ~5 LOC + the existing test at `test/frontmatter-inference.test.ts:239` updated. +- [ ] **v0.37.x+: unify `serializeFrontmatter` tag/title quoting with `brain-writer.ts:184`'s single-quote-with-`''`-escape style for consistency.** Cosmetic only now that the validator at `src/core/markdown.ts:219-238` is YAML-aware (v0.37.5.0). Today the emitter still produces `tags: ["yc"]` (double-quoted via `JSON.stringify`) while the repair path produces `tags: ['yc']` (single-quoted). Both are valid YAML and the validator accepts both, so this is cosmetic — but new writes drifting from repair-side output reads as inconsistency. Original signal: PR #1217 by @garrytan-agents (closed in favor of the validator fix). Touch `src/core/frontmatter-inference.ts:391-416` only; should be ~5 LOC + the existing test at `test/frontmatter-inference.test.ts:239` updated. ## v0.35.6.0 floor-ratio gate follow-ups (v0.36.x+) diff --git a/VERSION b/VERSION index ddaa694e8..62767a28d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.40.0.1 \ No newline at end of file +0.37.5.0 \ No newline at end of file diff --git a/package.json b/package.json index 924d6f6d1..7672b1e3d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gbrain", - "version": "0.40.0.1", + "version": "0.37.5.0", "description": "Postgres-native personal knowledge brain with hybrid RAG search", "type": "module", "main": "src/core/index.ts",