From 8baa953d0ae9ba9c0f32fd8861b25332f35f293c Mon Sep 17 00:00:00 2001 From: garrytan-agents Date: Wed, 20 May 2026 19:07:13 +0000 Subject: [PATCH] fix(frontmatter): defense-in-depth against JSON-style arrays in YAML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four layers to stop NESTED_QUOTES from recurring: 1. **autoFixFrontmatter (brain-writer.ts):** New step 3a detects and rewrites JSON-style arrays (`["x", "y"]` → `['x', 'y']`) before the existing nested-quote scalar fix. Handles apostrophes in values by falling back to double quotes. Runs on `frontmatter validate --fix` and `writeBrainPage({autoFix: true})`. 2. **Validator (markdown.ts):** NESTED_QUOTES detection now has two sub-patterns — 5a catches JSON-style arrays specifically (with a clearer error message: "use single quotes") and 5b catches the original nested scalar quotes. 3. **put_page normalization (operations.ts):** Every `put_page` call now runs `autoFixFrontmatter()` on incoming content before import. Non-blocking — if normalization throws, original content is used. This means agent-written pages with JSON arrays are silently fixed on write instead of accumulating thousands of validation errors. 4. **Agent guidance (frontmatter-guard SKILL.md):** New "Prevention" section with correct/incorrect YAML examples, explaining WHY JSON.stringify causes the bug and what to do instead. Agents that read this skill before writing frontmatter will avoid the pattern. Root cause: LLMs and ingestion code use JSON.stringify for YAML array items, producing `tags: ["yc", "w2025"]` which breaks YAML parsing. This caused 6,981 errors across a 105K-page brain. Companion to PR #1217 (serializer fix in frontmatter-inference.ts). 
--- skills/frontmatter-guard/SKILL.md | 47 ++++++++++++++++++++++ src/core/brain-writer.ts | 67 ++++++++++++++++++++++++++++--- src/core/markdown.ts | 20 +++++++-- src/core/operations.ts | 14 ++++++- 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/skills/frontmatter-guard/SKILL.md b/skills/frontmatter-guard/SKILL.md index 61e5e4ec2..7567a9fbf 100644 --- a/skills/frontmatter-guard/SKILL.md +++ b/skills/frontmatter-guard/SKILL.md @@ -167,6 +167,53 @@ JSON envelope (when `--json` is passed): `gbrain frontmatter validate --json` returns a similar envelope keyed on per-file results instead of per-source. +## Prevention — Writing Valid Frontmatter + +**This is the most important section.** Fixing broken frontmatter is good. Not breaking it is better. + +### YAML arrays (the #1 error source) + +```yaml +# ✅ CORRECT — single-quoted YAML flow +tags: ['yc', 'w2025', 'ai'] + +# ✅ CORRECT — unquoted (if values have no special chars) +tags: [yc, w2025, ai] + +# ✅ CORRECT — block style +tags: + - yc + - w2025 + +# ❌ WRONG — JSON-style double quotes (causes NESTED_QUOTES) +tags: ["yc", "w2025"] + +# ❌ WRONG — mixed JSON objects and strings +tags: [{"name": "sports"}, "posterous"] +``` + +**Why this happens:** `JSON.stringify()` wraps strings in double quotes. When code does `tags: [${items.map(t => JSON.stringify(t)).join(', ')}]`, it produces the broken pattern. Use single quotes instead: `tags: [${items.map(t => "'" + t + "'").join(', ')}]` (with apostrophe fallback to double quotes). + +### Quoted scalars + +```yaml +# ✅ CORRECT — single quotes for values with special chars +title: 'My "Quoted" Title' + +# ✅ CORRECT — double quotes when value has apostrophes +title: "Men's Fashion Guide" + +# ❌ WRONG — double quotes wrapping inner double quotes +title: "My "Quoted" Title" +``` + +### When to quote at all + +- **Unquoted** is fine for simple values: `type: person`, `batch: w2025` +- **Quote** when the value contains `: " ' # [ ] { } | > & * ! ? 
,` or starts with `@` +- **Single quotes** are the default safe choice +- **Double quotes** only when the value itself contains apostrophes + ## Anti-Patterns **Don't auto-fix `MISSING_OPEN` or `EMPTY_FRONTMATTER` without user input.** These usually mean a human author started a page and didn't finish — silently inserting `---` markers around an unfinished draft is wrong. diff --git a/src/core/brain-writer.ts b/src/core/brain-writer.ts index 6a25eb510..6967172ec 100644 --- a/src/core/brain-writer.ts +++ b/src/core/brain-writer.ts @@ -152,7 +152,67 @@ export function autoFixFrontmatter( } } - // 3. NESTED_QUOTES — rewrite `key: "...inner..."` lines that have 3+ unescaped + // 3a. JSON_ARRAY_IN_YAML — rewrite `key: ["x", "y"]` to `key: ['x', 'y']`. + // This is the #1 source of NESTED_QUOTES errors. LLMs and ingestion + // scripts serialize YAML arrays with JSON.stringify, producing double- + // quoted items that break YAML parsing. Fix: single-quote each item, + // falling back to double quotes only when the value itself contains + // an apostrophe. 
+ { + const lines = working.split('\n'); + let firstNonEmpty = -1; + for (let i = 0; i < lines.length; i++) { + if (lines[i].trim().length > 0) { firstNonEmpty = i; break; } + } + if (firstNonEmpty >= 0 && lines[firstNonEmpty].trim() === '---') { + let closeIdx = lines.length; + for (let i = firstNonEmpty + 1; i < lines.length; i++) { + if (lines[i].trim() === '---') { closeIdx = i; break; } + } + let fixedAny = false; + for (let i = firstNonEmpty + 1; i < closeIdx; i++) { + // Detect JSON-style arrays: key: ["val1", "val2"] + const arrMatch = lines[i].match(/^(\s*[A-Za-z_][\w-]*\s*:\s*)\[(.*)\]\s*$/); + if (arrMatch && arrMatch[2].includes('"')) { + const [, prefix, inner] = arrMatch; + // Parse the items: split on ", " boundaries respecting quotes + const items: string[] = []; + let current = ''; + let inQuote = false; + for (let j = 0; j < inner.length; j++) { + const ch = inner[j]; + if (ch === '"' && (j === 0 || inner[j - 1] !== '\\')) { + inQuote = !inQuote; + } else if (ch === ',' && !inQuote) { + items.push(current.trim()); + current = ''; + } else { + current += ch; + } + } + if (current.trim()) items.push(current.trim()); + + // Re-quote each item with single quotes (double if it contains apostrophe) + const reQuoted = items.map(v => { + const clean = v.replace(/^"|"$/g, '').trim(); + if (!clean) return "''"; + return clean.includes("'") ? `"${clean}"` : `'${clean}'`; + }); + lines[i] = `${prefix}[${reQuoted.join(', ')}]`; + fixedAny = true; + } + } + if (fixedAny) { + working = lines.join('\n'); + fixes.push({ + code: 'NESTED_QUOTES', + description: 'Rewrote JSON-style double-quoted arrays to single-quoted YAML', + }); + } + } + } + + // 3b. NESTED_QUOTES — rewrite `key: "...inner..."` lines that have 3+ unescaped // double-quotes by switching the outer wrapper to single quotes and // leaving inner quotes alone. 
{ @@ -175,12 +235,7 @@ export function autoFixFrontmatter( for (let j = 0; j < inner.length; j++) { if (inner[j] === '"' && (j === 0 || inner[j - 1] !== '\\')) count++; } - // Total " on the line includes the two outer quotes the regex - // captured, plus whatever's in inner. We need 3+ to trigger. if (count >= 1) { - // Inner already has unescaped " — outer wrap is causing the YAML - // parse failure. Rewrite to 'single-quoted'. YAML escapes `'` inside - // a single-quoted string by doubling it. const escapedInner = inner.replace(/'/g, "''"); lines[i] = `${prefix}'${escapedInner}'${trailing ? ' ' + trailing : ''}`.replace(/\s+$/, ''); fixedAny = true; diff --git a/src/core/markdown.ts b/src/core/markdown.ts index 1d697f26d..39ef9d904 100644 --- a/src/core/markdown.ts +++ b/src/core/markdown.ts @@ -216,14 +216,28 @@ function collectValidationErrors( }); } - // 5. NESTED_QUOTES — common breakage pattern: `title: "Name "Nick" Last"`. - // Detect any frontmatter `key: ...` line whose value contains 3 or more - // unescaped double-quote characters. A clean quoted value has 2. + // 5. NESTED_QUOTES — two sub-patterns: + // 5a. JSON-style arrays: `tags: ["yc", "w2025"]` — the #1 source. + // LLMs and ingestion scripts use JSON.stringify for array items. + // 5b. Nested scalar quotes: `title: "Name "Nick" Last"` — 3+ unescaped + // double-quote characters in a scalar value. for (let i = firstNonEmpty + 1; i < closeLine; i++) { const line = lines[i]; const m = line.match(/^\s*[A-Za-z_][\w-]*\s*:\s*(.*)$/); if (!m) continue; const value = m[1]; + + // 5a. JSON-style array: ["...", "..."] + if (/^\[.*".*".*\]$/.test(value.trim())) { + errors.push({ + code: 'NESTED_QUOTES', + message: 'JSON-style double-quoted array in YAML (use single quotes: [\'val1\', \'val2\'])', + line: i + 1, + }); + continue; + } + + // 5b. Nested scalar quotes: 3+ unescaped double-quote chars. 
let count = 0; for (let j = 0; j < value.length; j++) { if (value[j] === '"' && (j === 0 || value[j - 1] !== '\\')) count++; diff --git a/src/core/operations.ts b/src/core/operations.ts index eb733ea67..427db16e9 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -585,7 +585,19 @@ const put_page: Operation = { // default-source clobber path. importFromContent already accepts // opts.sourceId (PR #707/#757 engine work); previously the op handler // just didn't pass it. - const result = await importFromContent(ctx.engine, slug, p.content as string, { + // Pre-write normalization: auto-fix mechanical frontmatter issues + // (JSON-style arrays, nested quotes) before import. Non-blocking — + // if autoFixFrontmatter throws, fall through with original content. + let normalizedContent = p.content as string; + try { + const { autoFixFrontmatter } = await import('./brain-writer.ts'); + const { content: fixed } = autoFixFrontmatter(normalizedContent); + normalizedContent = fixed; + } catch { + // Non-fatal; proceed with original content. + } + + const result = await importFromContent(ctx.engine, slug, normalizedContent, { noEmbed, ...(ctx.sourceId ? { sourceId: ctx.sourceId } : {}), });