diff --git a/docs/proposals/takes-bootstrap.md b/docs/proposals/takes-bootstrap.md new file mode 100644 index 000000000..ec7e7bf74 --- /dev/null +++ b/docs/proposals/takes-bootstrap.md @@ -0,0 +1,132 @@ +# Proposal: Takes Bootstrap from Existing Content + +## Problem + +The takes system in gbrain — typed claims with weights, calibration tracking, and attribution — has full infrastructure but zero data in production. Although the system is fully supported in the schema and has CLI commands, no agent or workflow ever populates it because there's no automated bootstrap path. + +The brain contains thousands of concept pages, atom pages, and lore entries that are rich with claims, opinions, and predictions. These claims exist only as unstructured text and aren't captured as takes. + +### Scale of Impact + +| Metric | Value | +|--------|-------| +| Total pages | ~165,000 | +| Takes in the brain | 0 | +| Concept/atom/lore pages (estimated) | ~2,000+ | +| Claims embedded in those pages | Thousands | + +## Proposed Solution + +### Takes Extraction from Existing Pages + +Add `gbrain takes extract --from-pages` that scans content-rich pages and extracts structured claims. + +### How It Works + +1. **Scan eligible pages**: concept, atom, lore, and analysis page types +2. **Identify claims**: Statements that express a position, prediction, observation, or fact +3. **Classify each claim** by kind: + - `fact`: Verifiable statement ("Acme has 500 customers") + - `take`: Opinion or analysis ("Remote work will become the default") + - `bet`: Prediction with implicit timeline ("AI will replace 30% of coding by 2026") + - `hunch`: Low-confidence intuition ("Something feels off about this market") +4. **Extract metadata**: + - Claim text + - Attribution (who said/wrote it, if identifiable) + - Source page + - Optional weight (0.0-1.0 confidence) + - Tags/topics +5. 
**Store as takes** in the brain's takes system + +### CLI Interface + +```bash +# Bootstrap takes from all concept/atom/lore pages +gbrain takes extract --from-pages + +# Extract from specific page types +gbrain takes extract --from-pages --type concept,atom + +# Dry run to preview extractions +gbrain takes extract --from-pages --dry-run + +# Extract with a specific confidence threshold +gbrain takes extract --from-pages --min-confidence 0.6 + +# Extract takes from a specific page +gbrain takes extract --from-page "concepts/remote-work-thesis" +``` + +### Schema Pack Integration + +Schema packs should be able to declare: +1. **Custom takes kinds** (already supported) +2. **Extraction rules per type**: which page types to scan, what patterns indicate claims + +```yaml +takes: + kinds: + - fact + - take + - bet + - hunch + - thesis # custom kind + extraction: + eligible_types: + - concept + - atom + - lore + - analysis + patterns: + bet: ["will", "by 20\\d{2}", "predict", "expect"] + take: ["should", "believe", "think", "argue"] + hunch: ["might", "could", "feels like", "wonder if"] +``` + +### Dream Cycle Integration + +Add a takes extraction step to the dream cycle for recently-modified pages: + +``` +dream cycle: + ... + 6. extract takes (new) — only for recently modified concept/atom/lore pages +``` + +## Agent Onboarding + +### Features Detection + +`gbrain features` should detect zero takes: + +``` +ℹ Takes system: 0 takes recorded + Your brain has ~2,000 concept/atom/lore pages with extractable claims. + Run `gbrain takes extract --from-pages` to bootstrap the claims system. +``` + +### Migration Prompt + +``` +Your brain has 2,000+ concept/atom pages but 0 takes. +Run `gbrain takes extract --from-pages` to bootstrap the claims system? [y/N] +``` + +## Evidence + +The production brain has a fully functional takes system — the schema supports it, the CLI commands exist, the storage is ready. But zero takes have been recorded because: +1. 
/**
 * Adjust a cut position so that slicing `str` there never splits a UTF-16
 * surrogate pair.
 *
 * The cut is moved back by one code unit only when it falls strictly inside
 * a pair, i.e. the unit just before `index` is a high surrogate
 * (U+D800–U+DBFF) AND the unit at `index` is a low surrogate
 * (U+DC00–U+DFFF). The whole pair then ends up on the right-hand side of
 * the cut. A lone (already unpaired) high surrogate does not move the cut,
 * because backing up cannot repair it and would only shift the boundary.
 *
 * @param index - Proposed cut position, in UTF-16 code units.
 * @param str - The string that will be sliced at the returned position.
 * @returns `index` unchanged, or `index - 1` when `index` lies inside a
 *   surrogate pair. Positions `<= 0` or `>= str.length` are returned as-is.
 *   Note the result can be `0` when `index` is `1` and the string starts
 *   with a surrogate pair.
 */
function safeSliceEnd(index: number, str: string): number {
  // A cut at or beyond either end of the string cannot be inside a pair.
  if (index <= 0 || index >= str.length) return index;

  const unitBefore = str.charCodeAt(index - 1);
  const unitAfter = str.charCodeAt(index);
  const endsWithHigh = unitBefore >= 0xd800 && unitBefore <= 0xdbff;
  const startsWithLow = unitAfter >= 0xdc00 && unitAfter <= 0xdfff;

  // Only a genuine high+low pair straddling the cut needs protecting.
  return endsWithHigh && startsWithLow ? index - 1 : index;
}
const nlIdx = window.lastIndexOf('\n'); if (nlIdx >= 0) return searchStart + nlIdx; - // No boundary fits; hard-split at maxChars (deterministic). - return maxChars; + // No boundary fits; hard-split at maxChars (deterministic), + // but avoid splitting a UTF-16 surrogate pair. + return safeSliceEnd(maxChars, text); } /** @@ -659,7 +676,7 @@ export async function judgeSignificance( // doesn't need the full body; the opening + closing sections are usually // representative of significance. const trimmed = t.content.length > 8000 - ? t.content.slice(0, 4000) + '\n[...truncated...]\n' + t.content.slice(-4000) + ? t.content.slice(0, safeSliceEnd(4000, t.content)) + '\n[...truncated...]\n' + t.content.slice(safeSliceEnd(t.content.length - 4000, t.content)) : t.content; const sys = `You judge whether a conversation transcript is worth synthesizing into a personal knowledge brain.