diff --git a/src/commands/extract.ts b/src/commands/extract.ts index b2f6c1a6c..c1e81a9a5 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -24,11 +24,14 @@ import { parseMarkdown } from '../core/markdown.ts'; import { extractPageLinks, parseTimelineEntries, inferLinkType, makeResolver, extractFrontmatterLinks, + WikilinkAliasIndex, + deriveTitleFromContent, + type PageAliasFields, type UnresolvedFrontmatterRef, } from '../core/link-extraction.ts'; import { createProgress } from '../core/progress.ts'; import { getCliOptions, cliOptsToProgressOptions } from '../core/cli-options.ts'; -import { pathToSlug, pruneDir, isSyncable } from '../core/sync.ts'; +import { pathToSlug, pruneDir, isSyncable, slugifyPath } from '../core/sync.ts'; // Batch size for addLinksBatch / addTimelineEntriesBatch. // Postgres bind-parameter limit is 65535. Links use 4 cols/row → 16K hard ceiling; @@ -44,6 +47,13 @@ export interface ExtractedLink { to_slug: string; link_type: string; context: string; + /** + * v0.36.0 (TIM-28): which resolver pinned the target slug at extraction + * time. See LinkBatchInput.resolution_type for the enum. NULL/undefined + * when the caller can't attribute the resolution (e.g. frontmatter edges + * that don't go through the wikilink fallback chain). + */ + resolution_type?: string; } export interface ExtractedTimelineEntry { @@ -79,7 +89,14 @@ export function walkMarkdownFiles(dir: string): { path: string; relPath: string if (st.isDirectory()) { if (!pruneDir(entry)) continue; walk(full); - } else if (entry.endsWith('.md') && !entry.startsWith('_')) { + } else if (entry.endsWith('.md')) { + // v0.36.0 (TIM-28): dropped the `!entry.startsWith('_')` guard. + // The pre-fix filter skipped Obsidian hub pages like + // `_Team-Training.md` — but `gbrain sync` (canonical ingestion) + // happily ingests them via `isSyncable`, so they live in the DB + // and are legitimate wikilink targets. 
Walking them here keeps + // FS-source extract aligned with the DB and lets the wikilink + // alias index see the basenames Obsidian users actually link to. const rel = relative(dir, full); if (!isSyncable(rel, { strategy: 'markdown' })) continue; files.push({ path: full, relPath: rel }); @@ -104,15 +121,17 @@ export function walkMarkdownFiles(dir: string): { path: string; relPath: string * (containing ://) are always skipped. For wikilinks, the .md suffix is added * if absent and section anchors (#heading) are stripped. */ -export function extractMarkdownLinks(content: string): { name: string; relTarget: string }[] { - const results: { name: string; relTarget: string }[] = []; +export function extractMarkdownLinks(content: string): { name: string; relTarget: string; rawTarget: string }[] { + const results: { name: string; relTarget: string; rawTarget: string }[] = []; const mdPattern = /\[([^\]]+)\]\(([^)]+\.md)\)/g; let match; while ((match = mdPattern.exec(content)) !== null) { const target = match[2]; if (target.includes('://')) continue; - results.push({ name: match[1], relTarget: target }); + // Strip the .md extension to give fallback resolvers a clean alias key. + const rawTarget = target.replace(/\.mdx?$/i, ''); + results.push({ name: match[1], relTarget: target, rawTarget }); } const wikiPattern = /\[\[([^|\]]+?)(?:\|[^\]]*?)?\]\]/g; @@ -125,7 +144,9 @@ export function extractMarkdownLinks(content: string): { name: string; relTarget const relTarget = pagePath.endsWith('.md') ? pagePath : pagePath + '.md'; const pipeIdx = match[0].indexOf('|'); const displayName = pipeIdx >= 0 ? match[0].slice(pipeIdx + 1, -2).trim() : rawPath; - results.push({ name: displayName, relTarget }); + // rawTarget preserves the wikilink target as authored (no .md, no anchor) + // so WikilinkAliasIndex.tryResolve can normalize it as an alias key. 
+ results.push({ name: displayName, relTarget, rawTarget: pagePath }); } return results; @@ -146,13 +167,19 @@ export function extractMarkdownLinks(content: string): { name: string; relTarget export function resolveSlug(fileDir: string, relTarget: string, allSlugs: Set): string | null { const targetNoExt = relTarget.endsWith('.md') ? relTarget.slice(0, -3) : relTarget; - const s1 = join(fileDir, targetNoExt); + // Obsidian vaults frequently mix case (`10_Projects/...` vs the canonical + // lowercased `10_projects/...` DB slug), so the candidate must be slugified + // before lookup or every PascalCase path falls through to the alias + // resolver — which works, but at the cost of a true 'path' resolution. + const norm = (s: string) => slugifyPath(s); + + const s1 = norm(join(fileDir, targetNoExt)); if (allSlugs.has(s1)) return s1; const parts = fileDir.split('/').filter(Boolean); for (let strip = 1; strip <= parts.length; strip++) { const ancestor = parts.slice(0, parts.length - strip).join('/'); - const candidate = ancestor ? join(ancestor, targetNoExt) : targetNoExt; + const candidate = norm(ancestor ? join(ancestor, targetNoExt) : targetNoExt); if (allSlugs.has(candidate)) return candidate; } @@ -205,21 +232,39 @@ function parseFrontmatterFromContent(content: string, relPath: string): Record, - opts?: { includeFrontmatter?: boolean }, + opts?: { includeFrontmatter?: boolean; aliasIndex?: WikilinkAliasIndex }, ): Promise { const links: ExtractedLink[] = []; const slug = pathToSlug(relPath); const fileDir = dirname(relPath); const fm = parseFrontmatterFromContent(content, relPath); - for (const { name, relTarget } of extractMarkdownLinks(content)) { - const resolved = resolveSlug(fileDir, relTarget, allSlugs); - if (resolved !== null) { + for (const { name, relTarget, rawTarget } of extractMarkdownLinks(content)) { + // 1. Exact path match against the local slug set. 
+ const pathHit = resolveSlug(fileDir, relTarget, allSlugs); + if (pathHit !== null) { links.push({ - from_slug: slug, to_slug: resolved, - link_type: inferTypeByDir(fileDir, dirname(resolved), fm), + from_slug: slug, to_slug: pathHit, + link_type: inferTypeByDir(fileDir, dirname(pathHit), fm), context: `markdown link: [${name}]`, + resolution_type: 'path', }); + continue; + } + + // 2. Fallback resolvers (TIM-28): frontmatter aliases, H1 title, + // filename basename. Skipped when no index was provided (e.g. + // legacy callers that haven't built one). + if (opts?.aliasIndex) { + const hit = opts.aliasIndex.tryResolve(rawTarget, name); + if (hit) { + links.push({ + from_slug: slug, to_slug: hit.slug, + link_type: inferTypeByDir(fileDir, dirname(hit.slug), fm), + context: `markdown link: [${name}] (${hit.resolutionType})`, + resolution_type: hit.resolutionType, + }); + } } } @@ -313,6 +358,14 @@ export interface ExtractOpts { * Pass undefined or omit for a full walk (CLI / first-run path). */ slugs?: string[]; + /** + * v0.36.0 (TIM-28) — explicit source id for the JOIN in `addLinksBatch`. + * When omitted, the engine JOINs against `source_id='default'`. In a + * multi-source brain whose pages live under a non-default source (e.g. + * `timelycare-vault`), the JOIN drops every row silently. CLI callers + * should resolve this from `sources.local_path` and pass it through. + */ + sourceId?: string; } /** @@ -340,7 +393,7 @@ export async function runExtractCore(engine: BrainEngine, opts: ExtractOpts): Pr // Nothing changed — skip entirely. 
return result; } - const r = await extractForSlugs(engine, opts.dir, opts.slugs, opts.mode, dryRun, jsonMode); + const r = await extractForSlugs(engine, opts.dir, opts.slugs, opts.mode, dryRun, jsonMode, opts.sourceId); result.links_created = r.links_created; result.timeline_entries_created = r.timeline_created; result.pages_processed = r.pages; @@ -349,7 +402,7 @@ export async function runExtractCore(engine: BrainEngine, opts: ExtractOpts): Pr // Full walk path: CLI `gbrain extract` or first-run. if (opts.mode === 'links' || opts.mode === 'all') { - const r = await extractLinksFromDir(engine, opts.dir, dryRun, jsonMode); + const r = await extractLinksFromDir(engine, opts.dir, dryRun, jsonMode, opts.sourceId); result.links_created = r.created; result.pages_processed = r.pages; } @@ -455,11 +508,20 @@ export async function runExtract(engine: BrainEngine, args: string[]) { result.pages_processed = Math.max(result.pages_processed, r.pages); } } else { + // v0.36.0 (TIM-28): resolve the source id for the JOIN in + // addLinksBatch. Pre-fix, FS extract always wrote with the implicit + // 'default' source — silently dropping every row in a brain whose + // pages live under another source (e.g. `timelycare-vault`). The + // resolver walks: explicit --source > GBRAIN_SOURCE env > .gbrain-source + // dotfile > sources(local_path) match > brain default > 'default'. 
+ const { resolveSourceId } = await import('../core/source-resolver.ts'); + const sourceId = await resolveSourceId(engine, null, brainDir); result = await runExtractCore(engine, { mode: subcommand as 'links' | 'timeline' | 'all', dir: brainDir, dryRun, jsonMode, + sourceId, }); } } catch (e) { @@ -491,6 +553,7 @@ async function extractForSlugs( mode: 'links' | 'timeline' | 'all', dryRun: boolean, jsonMode: boolean, + sourceId?: string, ): Promise<{ links_created: number; timeline_created: number; pages: number }> { // Build the full slug set for link resolution (fast: just readdir, no file reads) const allFiles = walkMarkdownFiles(brainDir); @@ -549,7 +612,16 @@ async function extractForSlugs( if (!jsonMode) console.log(` ${link.from_slug} → ${link.to_slug} (${link.link_type})`); linksCreated++; } else { - linkBatch.push(link); + linkBatch.push({ + from_slug: link.from_slug, + to_slug: link.to_slug, + link_type: link.link_type, + context: link.context, + resolution_type: link.resolution_type, + from_source_id: sourceId, + to_source_id: sourceId, + origin_source_id: sourceId, + }); if (linkBatch.length >= BATCH_SIZE) await flushLinks(); } } @@ -563,7 +635,11 @@ async function extractForSlugs( if (!jsonMode) console.log(` ${entry.slug}: ${entry.date} — ${entry.summary}`); timelineCreated++; } else { - timelineBatch.push({ slug: entry.slug, date: entry.date, source: entry.source, summary: entry.summary, detail: entry.detail }); + timelineBatch.push({ + slug: entry.slug, date: entry.date, source: entry.source, + summary: entry.summary, detail: entry.detail, + source_id: sourceId, + }); if (timelineBatch.length >= BATCH_SIZE) await flushTimeline(); } } @@ -587,11 +663,29 @@ async function extractForSlugs( } async function extractLinksFromDir( - engine: BrainEngine, brainDir: string, dryRun: boolean, jsonMode: boolean, + engine: BrainEngine, brainDir: string, dryRun: boolean, jsonMode: boolean, sourceId?: string, ): Promise<{ created: number; pages: number }> { const 
files = walkMarkdownFiles(brainDir); const allSlugs = new Set(files.map(f => pathToSlug(f.relPath))); + // v0.36.0 (TIM-28): build the wikilink alias index by parsing each file's + // frontmatter + H1 once up-front. Cost: one extra readFileSync per file in + // this directory walk. For a 1K-page Obsidian vault that's ~tens of ms, + // worth it to lift wikilink → edge recall from single digits to >>50%. + const aliasEntries: PageAliasFields[] = []; + for (const f of files) { + try { + const content = readFileSync(f.path, 'utf-8'); + const fm = parseFrontmatterFromContent(content, f.relPath); + aliasEntries.push({ + slug: pathToSlug(f.relPath), + aliases: fm.aliases, + title: deriveTitleFromContent(content), + }); + } catch { /* skip unreadable */ } + } + const aliasIndex = new WikilinkAliasIndex(aliasEntries); + // Progress stream on stderr (separate from the action-events --json writes // to stdout, which tests grep for). Rate-gated; respects global --quiet / // --progress-json flags. @@ -623,16 +717,27 @@ async function extractLinksFromDir( for (let i = 0; i < files.length; i++) { try { const content = readFileSync(files[i].path, 'utf-8'); - const links = await extractLinksFromFile(content, files[i].relPath, allSlugs); + const links = await extractLinksFromFile(content, files[i].relPath, allSlugs, { aliasIndex }); for (const link of links) { if (dryRunSeen) { const key = `${link.from_slug}::${link.to_slug}::${link.link_type}`; if (dryRunSeen.has(key)) continue; dryRunSeen.add(key); - if (!jsonMode) console.log(` ${link.from_slug} → ${link.to_slug} (${link.link_type})`); + if (!jsonMode) console.log(` ${link.from_slug} → ${link.to_slug} (${link.link_type})${link.resolution_type ? 
` [${link.resolution_type}]` : ''}`); created++; } else { - batch.push(link); + batch.push({ + from_slug: link.from_slug, + to_slug: link.to_slug, + link_type: link.link_type, + context: link.context, + resolution_type: link.resolution_type, + // v0.36.0 (TIM-28): thread source_id through so the batch JOIN + // matches the right page rows in a non-default-source brain. + from_source_id: sourceId, + to_source_id: sourceId, + origin_source_id: sourceId, + }); if (batch.length >= BATCH_SIZE) await flush(); } } @@ -644,7 +749,8 @@ async function extractLinksFromDir( if (!jsonMode) { const label = dryRun ? '(dry run) would create' : 'created'; - console.log(`Links: ${label} ${created} from ${files.length} pages`); + const sizes = aliasIndex.size(); + console.log(`Links: ${label} ${created} from ${files.length} pages (alias index: ${sizes.aliases} alias / ${sizes.titles} title / ${sizes.basenames} basename)`); } return { created, pages: files.length }; } @@ -802,6 +908,27 @@ async function extractLinksFromDB( list.push(ref.source_id); slugToSources.set(ref.slug, list); } + + // v0.36.0 (TIM-28): wikilink alias index. Built from each page's + // frontmatter `aliases:` + title (first H1 of `compiled_truth` when the title is empty). Used when an extracted candidate's + // `targetSlug` isn't in allSlugs (i.e. the wikilink path drifted). + // Two passes over pages here: one to build the index, one to extract + // links. The brain might have 46K pages, but the index pass only + // reads `frontmatter` + `title` (plus `compiled_truth` as the title fallback; cheap getPage already loads + // them) and the iteration cost is dwarfed by the per-page link-type + // inference work. 
+ const aliasEntries: PageAliasFields[] = []; + for (const { slug: s, source_id } of allRefs) { + const p = await engine.getPage(s, { sourceId: source_id }); + if (!p) continue; + aliasEntries.push({ + slug: s, + aliases: (p.frontmatter as Record)?.aliases, + title: p.title || deriveTitleFromContent(p.compiled_truth), + }); + } + const aliasIndex = new WikilinkAliasIndex(aliasEntries); + let processed = 0, created = 0; const progress = createProgress(cliOptsToProgressOptions(getCliOptions())); @@ -852,7 +979,31 @@ async function extractLinksFromDB( // fromSlug !== the page being processed; we need that page to exist // too or the JOIN drops the row anyway. const fromSlug = c.fromSlug ?? slug; - if (!allSlugs.has(c.targetSlug)) continue; + // v0.36.0 (TIM-28): when the candidate target isn't a known slug, + // try the wikilink alias index. Only applied to markdown-derived + // candidates — frontmatter edges already went through `makeResolver` + // which has its own pg_trgm fallback. Re-pinning a frontmatter + // candidate's target here would silently override that resolver's + // confidence threshold. + let resolvedTarget = c.targetSlug; + let resolutionType: string | undefined; + if (!allSlugs.has(resolvedTarget)) { + if (c.linkSource !== 'frontmatter') { + const hit = aliasIndex.tryResolve(c.targetSlug); + if (hit && allSlugs.has(hit.slug)) { + resolvedTarget = hit.slug; + resolutionType = hit.resolutionType; + } else { + continue; + } + } else { + continue; + } + } else { + // Path-equality win — mirrors the FS-source 'path' resolution_type + // (but only set for markdown edges; frontmatter edges keep NULL). + if (c.linkSource !== 'frontmatter') resolutionType = 'path'; + } if (!allSlugs.has(fromSlug)) continue; // v0.32.8 F10: cross-source link resolution. @@ -863,7 +1014,7 @@ async function extractLinksFromDB( const fromSources = slugToSources.get(fromSlug) ?? []; const fromSourceId = fromSources.includes(source_id) ? 
source_id : (fromSources.includes('default') ? 'default' : fromSources[0]); - const targetSources = slugToSources.get(c.targetSlug) ?? []; + const targetSources = slugToSources.get(resolvedTarget) ?? []; let toSourceId: string; if (targetSources.includes(fromSourceId)) { toSourceId = fromSourceId; @@ -879,23 +1030,24 @@ async function extractLinksFromDB( } if (dryRunSeen) { - const key = `${fromSourceId}::${fromSlug}::${toSourceId}::${c.targetSlug}::${c.linkType}::${c.linkSource ?? 'markdown'}`; + const key = `${fromSourceId}::${fromSlug}::${toSourceId}::${resolvedTarget}::${c.linkType}::${c.linkSource ?? 'markdown'}`; if (dryRunSeen.has(key)) continue; dryRunSeen.add(key); if (jsonMode) { process.stdout.write(JSON.stringify({ action: 'add_link', from: fromSlug, from_source_id: fromSourceId, - to: c.targetSlug, to_source_id: toSourceId, + to: resolvedTarget, to_source_id: toSourceId, type: c.linkType, context: c.context, link_source: c.linkSource, + resolution_type: resolutionType ?? null, }) + '\n'); } else { - console.log(` ${fromSlug} → ${c.targetSlug} (${c.linkType})${c.linkSource === 'frontmatter' ? ' [fm]' : ''}`); + console.log(` ${fromSlug} → ${resolvedTarget} (${c.linkType})${c.linkSource === 'frontmatter' ? ' [fm]' : ''}${resolutionType ? 
` [${resolutionType}]` : ''}`); } created++; } else { batch.push({ from_slug: fromSlug, - to_slug: c.targetSlug, + to_slug: resolvedTarget, link_type: c.linkType, context: c.context, link_source: c.linkSource, @@ -907,6 +1059,7 @@ async function extractLinksFromDB( from_source_id: fromSourceId, to_source_id: toSourceId, origin_source_id: source_id, + resolution_type: resolutionType, }); if (batch.length >= BATCH_SIZE) await flush(); } diff --git a/src/core/engine.ts b/src/core/engine.ts index 34001a401..75ae538dd 100644 --- a/src/core/engine.ts +++ b/src/core/engine.ts @@ -84,6 +84,18 @@ export interface LinkBatchInput { from_source_id?: string; to_source_id?: string; origin_source_id?: string; + /** + * v0.36.0 (TIM-28): which extraction-time resolver pinned `to_slug`. + * - 'qualified' — `[[source-id:slug]]` literal source prefix + * - 'unqualified' — bare `[[slug]]` exact-path match + * - 'path' — markdown `[Name](path)` or wikilink path that resolved + * against the local-source slug set + * - 'alias' — fell through to a page's frontmatter `aliases:` entry + * - 'title' — fell through to a page's H1 heading text + * - 'basename' — fell through to a slug's last `/`-segment + * NULL for frontmatter/manual edges (they aren't subject to wikilink drift). + */ + resolution_type?: string; } /** Input row for addTimelineEntriesBatch. Optional fields default to '' (matches NOT NULL DDL). 
*/ diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index bcd9d4430..3e8f5b7e4 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -12,6 +12,7 @@ */ import type { BrainEngine } from './engine.ts'; +import { slugifyPath } from './sync.ts'; import type { PageType } from './types.ts'; // ─── Entity references ────────────────────────────────────────── @@ -42,8 +43,18 @@ export type LinkResolutionType = 'qualified' | 'unqualified'; * - Gbrain canonical: people, companies, meetings, concepts, deal, civic, project, source, media, yc, projects * - Our domain extensions: tech, finance, personal, openclaw (domain-organized wikis) * - Our entity prefix: entities (we kept some legacy entities/projects/ pages) + * - [PARA-PATCH] PARA-numbered Obsidian dirs: `\d+_word` (e.g. `10_projects`, + * `20_meetings`, `30_resources`, `40_areas`, `50_pulse`, `80_archived`). + * Without this, vaults that follow the Tiago-Forte PARA layout (numeric + * prefix to force sidebar order) extract 0 wikilinks even though the source + * has hundreds. The PARA alternative uses [A-Za-z] so PascalCase + * `10_Projects/...` matches without forcing an `i` flag on the whole + * regex (which would also relax the kebab-case source-id grammar in + * QUALIFIED_WIKILINK_RE). Extracted slugs are normalized via `slugifyPath` + * so PascalCase / spaced segments match the lowercased DB slug. + * Tracked upstream: see TIM-27 in the Paperclip TimelyCare project. */ -const DIR_PATTERN = '(?:people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)'; +const DIR_PATTERN = '(?:\\d+_[A-Za-z][A-Za-z0-9_-]*|people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)'; /** * Match `[Name](path)` markdown links pointing to entity directories. 
@@ -90,6 +101,218 @@ const QUALIFIED_WIKILINK_RE = new RegExp( 'g', ); +// ─── Alias / title fallback index (TIM-28) ───────────────────── +// +// Wikilink target → canonical slug fallback resolvers, used when path-equality +// against the DB slug fails. Obsidian links commonly use: +// +// - short form: `[[_Team-Training]]` (no path; just a filename basename) +// - aliased-display: `[[10_Projects/.../_User-Research|User Research]]` +// where the path exists but the display name is the human-friendly anchor +// - title-only links to renamed/moved pages where the slug has drifted +// +// On a typical Obsidian-style vault this lifts the wikilink → edge ratio from +// single digits to >>50% with no false-positives because every alias map is +// derived from authored content (frontmatter `aliases:`, the page's H1, the +// filename basename) rather than guesswork. +// +// Build the index once per extract run and pass it down to the resolvers. +// The maps are first-write-wins so vaults that accidentally share an alias +// across two pages stay deterministic — the first ingested page owns it. + +/** Which fallback resolver pinned a wikilink target. Mirrors links.resolution_type. */ +export type WikilinkResolutionType = 'path' | 'alias' | 'title' | 'basename'; + +/** Result of an alias-index resolution attempt. */ +export interface WikilinkResolution { + /** Canonical page slug for the resolved target. */ + slug: string; + /** Which resolver fired. Persisted into links.resolution_type for audit. */ + resolutionType: WikilinkResolutionType; +} + +/** Per-page authored fields the alias index consumes. */ +export interface PageAliasFields { + /** Canonical slug (lowercased, hyphenated, e.g. `10_projects/team/_team`). */ + slug: string; + /** Frontmatter `aliases:` values (any shape — strings, single string, mixed). */ + aliases?: unknown; + /** First H1 heading text (without the leading `#`). 
*/ + title?: string; +} + +/** + * Lowercase + hyphenate an Obsidian-style alias / title / basename so it can + * be looked up against the DB slug grammar. Mirrors `slugifySegment` but + * tolerant of unicode — `'_Team-Assessment'` → `'_team-assessment'`, + * `'Rapid Research Framework'` → `'rapid-research-framework'`. + */ +function normalizeAliasKey(input: string): string { + if (!input) return ''; + return input + .trim() + .toLowerCase() + .replace(/\.mdx?$/i, '') + .replace(/\s+/g, '-') + .replace(/-+/g, '-') + .replace(/^-|-$/g, ''); +} + +/** Strip a leading `#` heading marker and any markdown formatting. */ +function extractH1Title(content: string): string | undefined { + // First H1 only. Two-state machine: skip the YAML frontmatter block + // entirely (delimited by `---` on its own line, top-of-file only), then + // return the first H1 heading we see. + const lines = content.split('\n'); + let inFrontmatter = false; + let sawFirstLine = false; + for (const line of lines) { + if (!sawFirstLine && line.trim() === '---') { + inFrontmatter = true; + sawFirstLine = true; + continue; + } + sawFirstLine = true; + if (inFrontmatter) { + if (line.trim() === '---') inFrontmatter = false; + continue; + } + const m = line.match(/^#\s+(.+?)\s*$/); + if (m) return m[1].replace(/[*_`]/g, '').trim(); + } + return undefined; +} + +/** Public helper: derive a page's H1 title from content. Used by both FS and DB paths. */ +export function deriveTitleFromContent(content: string): string | undefined { + return extractH1Title(content); +} + +/** + * Wikilink fallback resolver. Given an Obsidian-style wikilink target that + * did not exact-match a known slug, try (in order): + * 1. Frontmatter `aliases:` declared by any page. + * 2. First H1 heading of any page. + * 3. Last path segment (filename basename) of any known slug. + * + * First-write-wins on alias collisions, so a vault that accidentally shares + * an alias across two pages stays deterministic. 
+ */ +export class WikilinkAliasIndex { + /** alias-key → canonical slug. Keys are normalized via `normalizeAliasKey`. */ + private aliasMap = new Map(); + /** title-key → canonical slug. */ + private titleMap = new Map(); + /** basename-key → canonical slug. */ + private basenameMap = new Map(); + /** Number of pages that contributed to the index (debug). */ + public readonly pageCount: number; + + constructor(entries: PageAliasFields[]) { + this.pageCount = entries.length; + for (const entry of entries) { + const slug = entry.slug; + if (!slug) continue; + + // Frontmatter aliases — accept string, string[], or anything coercible. + const aliasValues = collectAliasStrings(entry.aliases); + for (const a of aliasValues) { + const key = normalizeAliasKey(a); + if (key && !this.aliasMap.has(key)) this.aliasMap.set(key, slug); + } + + // H1 title. + if (entry.title) { + const key = normalizeAliasKey(entry.title); + if (key && !this.titleMap.has(key)) this.titleMap.set(key, slug); + } + + // Filename basename — last `/`-segment of the canonical slug. + const lastSlash = slug.lastIndexOf('/'); + const basename = lastSlash >= 0 ? slug.slice(lastSlash + 1) : slug; + const baseKey = normalizeAliasKey(basename); + if (baseKey && !this.basenameMap.has(baseKey)) this.basenameMap.set(baseKey, slug); + } + } + + /** + * Resolve a wikilink target via fallback resolvers. Returns null when no + * fallback fires. The `displayName` (right side of `[[path|Display]]`) is + * used as a secondary alias key — Obsidian users frequently use the human + * name as an alias for the slug-cased target. + */ + tryResolve(rawTarget: string, displayName?: string): WikilinkResolution | null { + const candidates: string[] = []; + + // Last segment of the (possibly path-shaped) wikilink target. + const lastSlash = rawTarget.lastIndexOf('/'); + const lastSeg = lastSlash >= 0 ? 
rawTarget.slice(lastSlash + 1) : rawTarget; + + candidates.push(lastSeg); + if (lastSlash >= 0) candidates.push(rawTarget); + if (displayName && displayName !== rawTarget) candidates.push(displayName); + + // Pass 1: aliases (most specific — authored intent). + for (const c of candidates) { + const key = normalizeAliasKey(c); + if (!key) continue; + const slug = this.aliasMap.get(key); + if (slug) return { slug, resolutionType: 'alias' }; + } + + // Pass 2: H1 title. + for (const c of candidates) { + const key = normalizeAliasKey(c); + if (!key) continue; + const slug = this.titleMap.get(key); + if (slug) return { slug, resolutionType: 'title' }; + } + + // Pass 3: filename basename. + for (const c of candidates) { + const key = normalizeAliasKey(c); + if (!key) continue; + const slug = this.basenameMap.get(key); + if (slug) return { slug, resolutionType: 'basename' }; + } + + return null; + } + + /** Total alias keys held by the index (debug / smoke). */ + size(): { aliases: number; titles: number; basenames: number } { + return { + aliases: this.aliasMap.size, + titles: this.titleMap.size, + basenames: this.basenameMap.size, + }; + } +} + +/** + * Coerce frontmatter `aliases:` into a flat string array. YAML lets users + * write any of: + * aliases: foo + * aliases: [foo, bar] + * aliases: + * - foo + * - bar + * Array entries that are objects with a string `name` contribute that name; all other non-string entries are silently dropped. + */ +function collectAliasStrings(value: unknown): string[] { + if (value == null) return []; + if (typeof value === 'string') return [value]; + if (!Array.isArray(value)) return []; + const out: string[] = []; + for (const v of value) { + if (typeof v === 'string') out.push(v); + else if (v && typeof v === 'object' && typeof (v as Record).name === 'string') { + out.push((v as Record).name); + } + } + return out; +} + /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, * replacing them with whitespace of equivalent length. 
Preserves byte offsets @@ -239,8 +462,11 @@ export function extractEntityRefs(content: string): EntityRef[] { while ((match = mdPattern.exec(stripped)) !== null) { const name = match[1]; const fullPath = match[2]; - const slug = fullPath; - const dir = fullPath.split('/')[0]; + // [PARA-PATCH] Normalize via slugifyPath so PascalCase / spaced segments + // (e.g. `10_Projects/User-Research`) match the lowercased DB slug. No-op + // for already-canonical refs like `people/alice-chen`. + const slug = slugifyPath(fullPath); + const dir = slug.split('/')[0]; refs.push({ name, slug, dir }); } @@ -256,6 +482,8 @@ export function extractEntityRefs(content: string): EntityRef[] { if (slug.includes('://')) continue; if (slug.endsWith('.md')) slug = slug.slice(0, -3); const displayName = (match[3] || slug).trim(); + // [PARA-PATCH] Match wikilink path against DB slug grammar. + slug = slugifyPath(slug); const dir = slug.split('/')[0]; refs.push({ name: displayName, slug, dir, sourceId }); qualifiedRanges.push([match.index, match.index + match[0].length]); @@ -271,6 +499,8 @@ export function extractEntityRefs(content: string): EntityRef[] { if (slug.includes('://')) continue; if (slug.endsWith('.md')) slug = slug.slice(0, -3); const displayName = (match[2] || slug).trim(); + // [PARA-PATCH] Match wikilink path against DB slug grammar. + slug = slugifyPath(slug); const dir = slug.split('/')[0]; refs.push({ name: displayName, slug, dir }); } diff --git a/src/core/migrate.ts b/src/core/migrate.ts index 1748234f4..291619d32 100644 --- a/src/core/migrate.ts +++ b/src/core/migrate.ts @@ -3241,6 +3241,37 @@ export const MIGRATIONS: Migration[] = [ WHERE claim_metric IS NOT NULL; `, }, + { + version: 68, + name: 'links_resolution_type_widen', + // v0.36.0 (TIM-28) — widen links.resolution_type to record which + // wikilink-fallback resolver pinned each edge. Before this migration the + // column was limited to 'qualified'/'unqualified', mirroring the v0.17 + // source-id resolver. 
TIM-28 adds four more values: + // - 'path' — exact path-equality against the local-source slug set + // (the normal happy path for `[Name](path)` markdown + // refs and wikilinks whose target is already a slug) + // - 'alias' — fell through to a page's frontmatter `aliases:` entry + // - 'title' — fell through to a page's H1 heading text + // - 'basename' — fell through to a slug's last `/`-segment + // + // We drop the old constraint and add a wider one. The column itself stays + // nullable so frontmatter/manual edges keep their current shape. + idempotent: true, + sql: ` + ALTER TABLE links DROP CONSTRAINT IF EXISTS links_resolution_type_check; + DO $$ BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint WHERE conname = 'links_resolution_type_check' + ) THEN + ALTER TABLE links ADD CONSTRAINT links_resolution_type_check + CHECK (resolution_type IS NULL OR resolution_type IN ( + 'qualified', 'unqualified', 'path', 'alias', 'title', 'basename' + )); + END IF; + END $$; + `, + }, ]; export const LATEST_VERSION = MIGRATIONS.length > 0 diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 428427521..65efcdc11 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -1599,17 +1599,26 @@ export class PGLiteEngine implements BrainEngine { const fromSourceIds = links.map(l => l.from_source_id || 'default'); const toSourceIds = links.map(l => l.to_source_id || 'default'); const originSourceIds = links.map(l => l.origin_source_id || 'default'); + // v0.36.0 (TIM-28): pin which extraction-time resolver wrote each edge. + const resolutionTypes = links.map(l => l.resolution_type || null); + // ON CONFLICT DO NOTHING (not DO UPDATE) is mandatory here: a batch can + // contain duplicate (from, to, type, source, origin) tuples — e.g. when + // the same edge is extracted from both compiled_truth and timeline on + // the same page. 
Postgres refuses to let `DO UPDATE` affect the same row + // twice in one statement, while `DO NOTHING` quietly drops the dupes. + // The cost: resolution_type can't be back-filled on existing rows from + // here. Callers that need to refresh it should DELETE + re-INSERT. const result = await this.db.query( - `INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field) - SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field - FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], $8::text[], $9::text[], $10::text[]) - AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id) + `INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field, resolution_type) + SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field, v.resolution_type + FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], $8::text[], $9::text[], $10::text[], $11::text[]) + AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id, resolution_type) JOIN pages f ON f.slug = v.from_slug AND f.source_id = v.from_source_id JOIN pages t ON t.slug = v.to_slug AND t.source_id = v.to_source_id LEFT JOIN pages o ON o.slug = v.origin_slug AND o.source_id = v.origin_source_id ON CONFLICT (from_page_id, to_page_id, link_type, link_source, origin_page_id) DO NOTHING RETURNING 1`, - [fromSlugs, toSlugs, linkTypes, contexts, linkSources, originSlugs, originFields, fromSourceIds, toSourceIds, originSourceIds] + [fromSlugs, toSlugs, linkTypes, contexts, linkSources, originSlugs, originFields, fromSourceIds, toSourceIds, originSourceIds, resolutionTypes] ); return result.rows.length; } diff --git a/src/core/pglite-schema.ts b/src/core/pglite-schema.ts 
index 3d05dff70..3853ae5ee 100644 --- a/src/core/pglite-schema.ts +++ b/src/core/pglite-schema.ts @@ -147,8 +147,10 @@ CREATE TABLE IF NOT EXISTS links ( link_source TEXT CHECK (link_source IS NULL OR link_source IN ('markdown', 'frontmatter', 'manual')), origin_page_id INTEGER REFERENCES pages(id) ON DELETE SET NULL, origin_field TEXT, - -- v0.18.0 Step 4: see src/schema.sql. - resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ('qualified', 'unqualified')), + -- v0.18.0 Step 4 + v0.36.0 (TIM-28): see src/schema.sql. + resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ( + 'qualified', 'unqualified', 'path', 'alias', 'title', 'basename' + )), created_at TIMESTAMPTZ NOT NULL DEFAULT now(), CONSTRAINT links_from_to_type_source_origin_unique UNIQUE NULLS NOT DISTINCT (from_page_id, to_page_id, link_type, link_source, origin_page_id) diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index a6118c6ed..f31869edd 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -1618,15 +1618,23 @@ export class PostgresEngine implements BrainEngine { const fromSourceIds = links.map(l => l.from_source_id || 'default'); const toSourceIds = links.map(l => l.to_source_id || 'default'); const originSourceIds = links.map(l => l.origin_source_id || 'default'); + // v0.36.0 (TIM-28): pin which extraction-time resolver wrote each edge. + const resolutionTypes = links.map(l => l.resolution_type || null); + // ON CONFLICT DO NOTHING (not DO UPDATE): batches commonly contain + // duplicate (from, to, type, source, origin) tuples (e.g. same edge + // mentioned in both compiled_truth and timeline). Postgres rejects + // `DO UPDATE` when one statement would affect the same row twice; + // `DO NOTHING` quietly drops the dupes. See pglite-engine for the + // matching note. 
const result = await sql` - INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field) - SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field + INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field, resolution_type) + SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field, v.resolution_type FROM unnest( ${fromSlugs}::text[], ${toSlugs}::text[], ${linkTypes}::text[], ${contexts}::text[], ${linkSources}::text[], ${originSlugs}::text[], ${originFields}::text[], ${fromSourceIds}::text[], ${toSourceIds}::text[], - ${originSourceIds}::text[] - ) AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id) + ${originSourceIds}::text[], ${resolutionTypes}::text[] + ) AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id, resolution_type) JOIN pages f ON f.slug = v.from_slug AND f.source_id = v.from_source_id JOIN pages t ON t.slug = v.to_slug AND t.source_id = v.to_source_id LEFT JOIN pages o ON o.slug = v.origin_slug AND o.source_id = v.origin_source_id diff --git a/src/core/schema-embedded.ts b/src/core/schema-embedded.ts index dbfb70708..eec22bd5d 100644 --- a/src/core/schema-embedded.ts +++ b/src/core/schema-embedded.ts @@ -275,7 +275,13 @@ CREATE TABLE IF NOT EXISTS links ( -- [[source:slug]] (target source pinned). 'unqualified' when written -- as bare [[slug]] and resolved via local-first fallback at -- extraction time. NULL for legacy/manual/frontmatter edges. 
- resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ('qualified', 'unqualified')), + -- + -- v0.36.0 (TIM-28): widened to record which wikilink-fallback resolver + -- pinned each edge — 'path' (exact slug), 'alias' (frontmatter aliases), + -- 'title' (H1 heading), 'basename' (filename last segment). + resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ( + 'qualified', 'unqualified', 'path', 'alias', 'title', 'basename' + )), created_at TIMESTAMPTZ NOT NULL DEFAULT now(), -- NULLS NOT DISTINCT (PG15+) so two rows with link_source IS NULL or -- origin_page_id IS NULL collide as expected. Without this, every row with diff --git a/src/schema.sql b/src/schema.sql index c19221def..1a69767e9 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -271,7 +271,14 @@ CREATE TABLE IF NOT EXISTS links ( -- [[source:slug]] (target source pinned). 'unqualified' when written -- as bare [[slug]] and resolved via local-first fallback at -- extraction time. NULL for legacy/manual/frontmatter edges. - resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ('qualified', 'unqualified')), + -- + -- v0.36.0 (TIM-28) — widen to record which wikilink-fallback resolver + -- pinned each edge: 'path' (exact slug match), 'alias' (frontmatter + -- aliases hit), 'title' (H1 heading hit), 'basename' (filename + -- last-segment hit). + resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ( + 'qualified', 'unqualified', 'path', 'alias', 'title', 'basename' + )), created_at TIMESTAMPTZ NOT NULL DEFAULT now(), -- NULLS NOT DISTINCT (PG15+) so two rows with link_source IS NULL or -- origin_page_id IS NULL collide as expected. 
Without this, every row with diff --git a/test/extract-wikilink-aliases.test.ts b/test/extract-wikilink-aliases.test.ts new file mode 100644 index 000000000..1f2aaee26 --- /dev/null +++ b/test/extract-wikilink-aliases.test.ts @@ -0,0 +1,264 @@ +/** + * TIM-28: wikilink alias / title / basename fallback resolution. + * + * Pre-fix the FS extractor resolved wikilink targets via path-equality only, + * which on Obsidian-style vaults yielded a ~5% wikilink → edge ratio (45/812 + * on the TimelyCare vault). This suite locks the four resolution paths added + * in v0.36.0: + * + * 1. `path` — `[Name](path)` markdown ref or `[[a/b/c]]` wikilink whose + * slugified target lives in the local slug set. + * 2. `alias` — frontmatter `aliases: [...]` resolver hit. + * 3. `title` — first H1 heading hit. + * 4. `basename` — last `/`-segment of a known slug hit. + * + * The link `resolution_type` column is the audit trail — it records which + * resolver fired so we can grade recall later. + */ + +import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'bun:test'; +import { mkdtempSync, writeFileSync, mkdirSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { runExtract } from '../src/commands/extract.ts'; +import { + WikilinkAliasIndex, + deriveTitleFromContent, + type PageAliasFields, +} from '../src/core/link-extraction.ts'; +import type { PageInput } from '../src/core/types.ts'; + +let engine: PGLiteEngine; +let brainDir: string; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); +}, 60_000); + +afterAll(async () => { + await engine.disconnect(); +}); + +async function truncateAll() { + for (const t of ['content_chunks', 'links', 'tags', 'raw_data', 'timeline_entries', 'page_versions', 'ingest_log', 'pages']) { + await (engine as unknown as { db: { exec: (s: string) => Promise<unknown> } }).db.exec(`DELETE FROM 
${t}`); + } +} + +const page = (title: string, body = '', extraFrontmatter: Record<string, unknown> = {}): PageInput => ({ + type: 'concept', title, compiled_truth: body, timeline: '', frontmatter: extraFrontmatter, +}); + +beforeEach(async () => { + await truncateAll(); + brainDir = mkdtempSync(join(tmpdir(), 'gbrain-extract-aliases-')); +}, 15_000); + +function writeFile(rel: string, content: string) { + const full = join(brainDir, rel); + mkdirSync(join(full, '..'), { recursive: true }); + writeFileSync(full, content); +} + +// ─── Unit tests: WikilinkAliasIndex ───────────────────────────── + +describe('WikilinkAliasIndex (unit)', () => { + const entries: PageAliasFields[] = [ + { + slug: '10_projects/team-training/_team-training', + aliases: ['Team Training', 'TT', 'training-hub'], + title: 'Team Training', + }, + { + slug: '10_projects/user-research/_user-research', + aliases: 'User Research', // string, not array — also valid YAML + title: 'User Research', + }, + { + slug: '30_resources/rapid-research-framework-v9', + // no aliases — title-only resolution + title: 'Rapid Research Framework', + }, + ]; + + test('alias-string and alias-array both populate the index', () => { + const idx = new WikilinkAliasIndex(entries); + expect(idx.tryResolve('User Research')?.slug) + .toBe('10_projects/user-research/_user-research'); + expect(idx.tryResolve('Team Training')?.slug) + .toBe('10_projects/team-training/_team-training'); + }); + + test('alias resolver beats title and basename', () => { + const idx = new WikilinkAliasIndex(entries); + // 'TT' only matches via aliases. 
+ expect(idx.tryResolve('TT')).toEqual({ + slug: '10_projects/team-training/_team-training', + resolutionType: 'alias', + }); + }); + + test('title resolver matches when no alias is declared', () => { + const idx = new WikilinkAliasIndex(entries); + expect(idx.tryResolve('Rapid Research Framework')).toEqual({ + slug: '30_resources/rapid-research-framework-v9', + resolutionType: 'title', + }); + }); + + test('basename resolver matches short-form wikilinks like [[_Team-Training]]', () => { + const idx = new WikilinkAliasIndex(entries); + // Obsidian short form. The slug `10_projects/team-training/_team-training` + // has basename `_team-training` — basename map keys are lowercased. + expect(idx.tryResolve('_Team-Training')).toEqual({ + slug: '10_projects/team-training/_team-training', + resolutionType: 'basename', + }); + }); + + test('display-aliased form: pipe display name is consulted as an alias key', () => { + const idx = new WikilinkAliasIndex(entries); + // `[[some/missing/path|User Research]]` — path part misses but display + // matches the frontmatter alias. + const r = idx.tryResolve('some/missing/path', 'User Research'); + expect(r?.slug).toBe('10_projects/user-research/_user-research'); + expect(r?.resolutionType).toBe('alias'); + }); + + test('returns null when no resolver matches', () => { + const idx = new WikilinkAliasIndex(entries); + expect(idx.tryResolve('totally-unknown-target')).toBeNull(); + }); + + test('first-write-wins on basename collisions stays deterministic', () => { + const idx = new WikilinkAliasIndex([ + { slug: 'a/notes', title: 'Notes' }, + { slug: 'b/notes', title: 'Notes' }, + ]); + // First page registered owns 'notes' for both basename and title. 
+ expect(idx.tryResolve('Notes')?.slug).toBe('a/notes'); + }); + + test('size() reports per-map cardinality', () => { + const idx = new WikilinkAliasIndex(entries); + const s = idx.size(); + // aliases: TT, team-training, training-hub, user-research + expect(s.aliases).toBeGreaterThanOrEqual(4); + expect(s.titles).toBe(3); + expect(s.basenames).toBe(3); + }); +}); + +describe('deriveTitleFromContent', () => { + test('returns first H1 text', () => { + expect(deriveTitleFromContent('# Hello World\n\nbody')).toBe('Hello World'); + }); + test('skips frontmatter and finds the first H1 below', () => { + expect(deriveTitleFromContent('---\nfoo: bar\n---\n\n# Real Title\n')) + .toBe('Real Title'); + }); + test('strips inline markdown decoration from the heading', () => { + expect(deriveTitleFromContent('# **My** _Title_')) + .toBe('My Title'); + }); + test('returns undefined when there is no H1', () => { + expect(deriveTitleFromContent('## not h1\nbody')).toBeUndefined(); + }); +}); + +// ─── Integration: extract --source fs ────────────────────────── + +describe('gbrain extract links --source fs uses alias/title fallback (TIM-28)', () => { + test('aliased-display wikilink resolves via path when slug matches', async () => { + await engine.putPage('10_projects/user-research/_user-research', page('User Research')); + await engine.putPage('10_projects/team-training/_team-training', page('Team Training')); + + writeFile('10_Projects/user-research/_User-Research.md', '---\ntitle: User Research\n---\n# User Research\n'); + writeFile('10_Projects/team-training/_Team-Training.md', + '---\ntitle: Team Training\n---\n# Team Training\n\n' + + 'See [[10_Projects/user-research/_User-Research|User Research]] for context.\n'); + + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('10_projects/team-training/_team-training'); + const userRefs = links.filter(l => l.to_slug === '10_projects/user-research/_user-research'); + 
expect(userRefs.length).toBe(1); + // PascalCase path slugifies + matches the canonical slug → 'path'. + // The resolution_type is persisted; verify via the engine. + const rows = await (engine as unknown as { db: { query: (s: string) => Promise<{ rows: { resolution_type: string | null }[] }> } }).db + .query("SELECT resolution_type FROM links WHERE to_page_id = (SELECT id FROM pages WHERE slug = '10_projects/user-research/_user-research')"); + expect(rows.rows.length).toBeGreaterThan(0); + expect(rows.rows[0].resolution_type).toBe('path'); + }); + + test('short-form wikilink resolves via basename fallback', async () => { + await engine.putPage('10_projects/team-assessment/_team-assessment', page('Team Assessment')); + await engine.putPage('10_projects/foo/_foo', page('Foo')); + + writeFile('10_Projects/team-assessment/_Team-Assessment.md', '---\ntitle: Team Assessment\n---\n'); + writeFile('10_Projects/foo/_Foo.md', + '---\ntitle: Foo\n---\nLink to [[_Team-Assessment]].\n'); + + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('10_projects/foo/_foo'); + const hit = links.find(l => l.to_slug === '10_projects/team-assessment/_team-assessment'); + expect(hit).toBeDefined(); + + const rows = await (engine as unknown as { db: { query: (s: string) => Promise<{ rows: { resolution_type: string | null }[] }> } }).db + .query("SELECT resolution_type FROM links WHERE to_page_id = (SELECT id FROM pages WHERE slug = '10_projects/team-assessment/_team-assessment')"); + expect(rows.rows[0].resolution_type).toBe('basename'); + }); + + test('frontmatter aliases resolve a renamed page', async () => { + // Page lives at a versioned slug, but exposes the canonical name as an alias. 
+ await engine.putPage('30_resources/rapid-research-framework-v9', page('Rapid Research Framework v9')); + await engine.putPage('80_archived/some-old-page', page('Old Page')); + + writeFile('30_Resources/rapid-research-framework-v9.md', + '---\ntitle: Rapid Research Framework v9\naliases:\n - rapid-research-framework\n - rrf\n---\n# Rapid Research Framework v9\n'); + writeFile('80_Archived/some-old-page.md', + '---\ntitle: Old Page\n---\n' + + 'Refers to [[rapid-research-framework]].\n'); + + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('80_archived/some-old-page'); + const hit = links.find(l => l.to_slug === '30_resources/rapid-research-framework-v9'); + expect(hit).toBeDefined(); + + const rows = await (engine as unknown as { db: { query: (s: string) => Promise<{ rows: { resolution_type: string | null }[] }> } }).db + .query("SELECT resolution_type FROM links WHERE to_page_id = (SELECT id FROM pages WHERE slug = '30_resources/rapid-research-framework-v9')"); + expect(rows.rows[0].resolution_type).toBe('alias'); + }); + + test('H1 title falls through when no alias is declared', async () => { + await engine.putPage('30_resources/some-heading-page', page('Some Heading Page')); + await engine.putPage('30_resources/referrer', page('Referrer')); + + writeFile('30_Resources/some-heading-page.md', + '---\ntitle: Some Heading Page\n---\n# Cool Topic\n\nbody\n'); + writeFile('30_Resources/referrer.md', + '---\ntitle: Referrer\n---\nSee [[Cool Topic]].\n'); + + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('30_resources/referrer'); + const hit = links.find(l => l.to_slug === '30_resources/some-heading-page'); + expect(hit).toBeDefined(); + + const rows = await (engine as unknown as { db: { query: (s: string) => Promise<{ rows: { resolution_type: string | null }[] }> } }).db + .query("SELECT resolution_type FROM links WHERE to_page_id = (SELECT id FROM pages WHERE slug = 
'30_resources/some-heading-page')"); + expect(rows.rows[0].resolution_type).toBe('title'); + }); + + test('a truly dangling wikilink stays dangling (no false-positive)', async () => { + await engine.putPage('10_projects/foo/_foo', page('Foo')); + + writeFile('10_Projects/foo/_Foo.md', + '---\ntitle: Foo\n---\nDangling [[this-page-does-not-exist]] reference.\n'); + + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('10_projects/foo/_foo'); + expect(links.length).toBe(0); + }); +}); diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index 6829ffeca..88da2392d 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -109,6 +109,69 @@ describe('extractEntityRefs', () => { expect(refs.length).toBe(1); expect(refs[0].dir).toBe('meetings'); }); + + // [PARA-PATCH] Obsidian + PARA layout (Tiago Forte numeric-prefix dirs). + // Without DIR_PATTERN's `\d+_word` alternative + slugifyPath normalization, + // a 947-wikilink TimelyCare vault extracts zero edges (see TIM-27). + describe('PARA-numbered Obsidian dirs', () => { + test('extracts lowercase PARA-numbered wikilink', () => { + const refs = extractEntityRefs('See [[10_projects/user-research/_user-research|User Research]]'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('10_projects/user-research/_user-research'); + expect(refs[0].dir).toBe('10_projects'); + expect(refs[0].name).toBe('User Research'); + }); + + test('extracts PascalCase PARA wikilink and normalizes slug to DB form', () => { + const refs = extractEntityRefs('See [[10_Projects/user-research/_User-Research|User Research]]'); + expect(refs.length).toBe(1); + // slugifyPath lowercases segments so the extracted slug matches what + // slugifyPath(filePath) produced when the vault was imported. 
+ expect(refs[0].slug).toBe('10_projects/user-research/_user-research'); + expect(refs[0].dir).toBe('10_projects'); + }); + + test('normalizes spaced segments to hyphens to match DB slug', () => { + const refs = extractEntityRefs( + 'See [[20_Meetings/30_Meeting Transcripts/Onboarding/Gregg Intro|Gregg]]', + ); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe( + '20_meetings/30_meeting-transcripts/onboarding/gregg-intro', + ); + }); + + test('extracts each canonical PARA top-level dir (projects/meetings/resources/areas/pulse/archived)', () => { + const samples = [ + ['[[10_Projects/foo]]', '10_projects/foo'], + ['[[20_Meetings/bar]]', '20_meetings/bar'], + ['[[30_Resources/baz]]', '30_resources/baz'], + ['[[40_Areas/qux]]', '40_areas/qux'], + ['[[50_Pulse/zed]]', '50_pulse/zed'], + ['[[80_Archived/old]]', '80_archived/old'], + ]; + for (const [src, expected] of samples) { + const refs = extractEntityRefs(src); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe(expected); + } + }); + + test('extracts PARA-style markdown link as well as wikilink', () => { + const refs = extractEntityRefs('See [Plan](10_Projects/foo/_plan.md) details.'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('10_projects/foo/_plan'); + expect(refs[0].dir).toBe('10_projects'); + }); + + test('still skips bare unmatched dirs like 10_unknown that are not present as pages (just extracts; downstream filters)', () => { + // The regex is permissive — any `\d+_word/...` path matches. Validation + // happens at extract.ts via allSlugs.has(). This test pins the contract. + const refs = extractEntityRefs('See [[99_random/whatever]]'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('99_random/whatever'); + }); + }); }); // ─── extractPageLinks ──────────────────────────────────────────