Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 182 additions & 29 deletions src/commands/extract.ts

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions src/core/engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,18 @@ export interface LinkBatchInput {
from_source_id?: string;
to_source_id?: string;
origin_source_id?: string;
/**
* v0.36.0 (TIM-28): which extraction-time resolver pinned `to_slug`.
* - 'qualified' — `[[source-id:slug]]` literal source prefix
* - 'unqualified' — bare `[[slug]]` exact-path match
* - 'path' — markdown `[Name](path)` or wikilink path that resolved
* against the local-source slug set
* - 'alias' — fell through to a page's frontmatter `aliases:` entry
* - 'title' — fell through to a page's H1 heading text
* - 'basename' — fell through to a slug's last `/`-segment
* NULL for frontmatter/manual edges (they aren't subject to wikilink drift).
*/
resolution_type?: string;
}

/** Input row for addTimelineEntriesBatch. Optional fields default to '' (matches NOT NULL DDL). */
Expand Down
236 changes: 233 additions & 3 deletions src/core/link-extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
*/

import type { BrainEngine } from './engine.ts';
import { slugifyPath } from './sync.ts';
import type { PageType } from './types.ts';

// ─── Entity references ──────────────────────────────────────────
Expand Down Expand Up @@ -42,8 +43,18 @@ export type LinkResolutionType = 'qualified' | 'unqualified';
* - Gbrain canonical: people, companies, meetings, concepts, deal, civic, project, source, media, yc, projects
* - Our domain extensions: tech, finance, personal, openclaw (domain-organized wikis)
* - Our entity prefix: entities (we kept some legacy entities/projects/ pages)
* - [PARA-PATCH] PARA-numbered Obsidian dirs: `\d+_word` (e.g. `10_projects`,
* `20_meetings`, `30_resources`, `40_areas`, `50_pulse`, `80_archived`).
* Without this, vaults that follow the Tiago-Forte PARA layout (numeric
* prefix to force sidebar order) extract 0 wikilinks even though the source
* has hundreds. The PARA alternative uses [A-Za-z] so PascalCase
* `10_Projects/...` matches without forcing an `i` flag on the whole
* regex (which would also relax the kebab-case source-id grammar in
* QUALIFIED_WIKILINK_RE). Extracted slugs are normalized via `slugifyPath`
* so PascalCase / spaced segments match the lowercased DB slug.
* Tracked upstream: see TIM-27 in the Paperclip TimelyCare project.
*/
const DIR_PATTERN = '(?:people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)';
const DIR_PATTERN = '(?:\\d+_[A-Za-z][A-Za-z0-9_-]*|people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)';

/**
* Match `[Name](path)` markdown links pointing to entity directories.
Expand Down Expand Up @@ -90,6 +101,218 @@ const QUALIFIED_WIKILINK_RE = new RegExp(
'g',
);

// ─── Alias / title fallback index (TIM-28) ─────────────────────
//
// Wikilink target → canonical slug fallback resolvers, used when path-equality
// against the DB slug fails. Obsidian links commonly use:
//
// - short form: `[[_Team-Training]]` (no path; just a filename basename)
// - aliased-display: `[[10_Projects/.../_User-Research|User Research]]`
// where the path exists but the display name is the human-friendly anchor
// - title-only links to renamed/moved pages where the slug has drifted
//
// On a typical Obsidian-style vault this lifts the wikilink → edge ratio from
// single digits to >>50% with no false-positives because every alias map is
// derived from authored content (frontmatter `aliases:`, the page's H1, the
// filename basename) rather than guesswork.
//
// Build the index once per extract run and pass it down to the resolvers.
// The maps are first-write-wins so vaults that accidentally share an alias
// across two pages stay deterministic — the first ingested page owns it.

/**
 * Which resolver pinned a wikilink target.
 *
 * `'alias'`, `'title'` and `'basename'` are the values produced by
 * `WikilinkAliasIndex.tryResolve`; `'path'` denotes an exact path match made
 * before any fallback runs. These strings are persisted into
 * `links.resolution_type`, whose CHECK constraint additionally allows
 * `'qualified'` and `'unqualified'` — so this union is a subset of the column's
 * legal values, not the full set.
 */
export type WikilinkResolutionType = 'path' | 'alias' | 'title' | 'basename';

/**
 * Successful outcome of an alias-index resolution attempt.
 * `WikilinkAliasIndex.tryResolve` returns `null` instead of this shape when no
 * resolver matches.
 */
export interface WikilinkResolution {
/** Canonical page slug the wikilink target resolved to. */
slug: string;
/** Resolver that produced the match. Persisted into links.resolution_type for audit. */
resolutionType: WikilinkResolutionType;
}

/**
 * Per-page authored fields consumed by `WikilinkAliasIndex`.
 * One entry is supplied per page; entries with an empty `slug` are ignored by
 * the index constructor.
 */
export interface PageAliasFields {
/** Canonical slug (lowercased, hyphenated, e.g. `10_projects/team/_team`). */
slug: string;
/**
 * Raw frontmatter `aliases:` value. Typed `unknown` because YAML may yield a
 * single string, an array of strings, or mixed content; only strings (and
 * array items shaped `{ name: string }`) are picked up, everything else is
 * dropped.
 */
aliases?: unknown;
/** First H1 heading text (without the leading `#`). */
title?: string;
}

/**
 * Turn an Obsidian-style alias / title / basename into the lookup key used by
 * the alias index.
 *
 * Steps, in order: trim, lowercase, Unicode-normalize to NFC, drop a trailing
 * `.md` / `.mdx` extension, turn each whitespace run into `-`, collapse
 * repeated `-`, and strip one leading / trailing `-`.
 *
 *   `'_Team-Assessment'`          → `'_team-assessment'`
 *   `'Rapid Research Framework'`  → `'rapid-research-framework'`
 *
 * NFC normalization matters because the same visible name can arrive in two
 * byte forms: decomposed (`e` + U+0301, typical for paths read from some file
 * systems) and precomposed (`é`, typical for typed link text). Without it the
 * two forms produce different keys and the lookup silently misses.
 *
 * @param input Raw alias, title, basename or wikilink target.
 * @returns The normalized key, or `''` when nothing usable remains.
 */
function normalizeAliasKey(input: string): string {
  if (!input) return '';
  return input
    .trim()
    .toLowerCase()
    // Canonical composition so visually identical names compare equal.
    .normalize('NFC')
    // Already lowercased above, so no case-insensitive flag is needed.
    .replace(/\.mdx?$/, '')
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-|-$/g, '');
}

/**
 * Return the text of the first level-1 ATX heading (`# Title`) in a markdown
 * document, with `*`, `_` and backtick characters removed.
 *
 * Skipped while scanning:
 *   - a YAML frontmatter block, recognised only when the very first line is
 *     `---`, and running until the next `---` line;
 *   - the contents of fenced code blocks (``` or ~~~), so a shell comment such
 *     as `# install deps` inside a fence is never mistaken for the page title.
 *
 * @param content Full markdown source of a page.
 * @returns The cleaned heading text, or `undefined` when no H1 is found.
 */
function extractH1Title(content: string): string | undefined {
  const lines = content.split('\n');
  let inFrontmatter = false;
  let sawFirstLine = false;
  // Marker run (e.g. "```" or "~~~~") of the fence we are inside, if any.
  let openFence: string | undefined;

  for (const line of lines) {
    // Frontmatter may only open on the first line of the file.
    if (!sawFirstLine && line.trim() === '---') {
      inFrontmatter = true;
      sawFirstLine = true;
      continue;
    }
    sawFirstLine = true;
    if (inFrontmatter) {
      if (line.trim() === '---') inFrontmatter = false;
      continue;
    }

    const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
    if (openFence !== undefined) {
      // A fence closes only on the same marker character, a run at least as
      // long as the opener, and nothing but whitespace after it.
      if (
        fence &&
        fence[1].charAt(0) === openFence.charAt(0) &&
        fence[1].length >= openFence.length &&
        line.slice(fence[0].length).trim() === ''
      ) {
        openFence = undefined;
      }
      continue;
    }
    if (fence) {
      openFence = fence[1];
      continue;
    }

    const m = line.match(/^#\s+(.+?)\s*$/);
    if (m) return m[1].replace(/[*_`]/g, '').trim();
  }
  return undefined;
}

/**
 * Exported entry point for obtaining a page's H1 title from its markdown
 * content. Shared by the filesystem and database extraction paths.
 *
 * @param content Full markdown source of a page.
 * @returns The first H1 heading text, or `undefined` when the page has none.
 */
export function deriveTitleFromContent(content: string): string | undefined {
  const title = extractH1Title(content);
  return title;
}

/**
 * Fallback index for wikilink targets that did not exact-match a known slug.
 *
 * Three lookup tables are built from the supplied pages and consulted in
 * priority order:
 *   1. frontmatter `aliases:` declared by any page,
 *   2. the first H1 heading of any page,
 *   3. the last path segment (filename basename) of any known slug.
 *
 * Every table is first-write-wins: when two pages claim the same key, the page
 * that appears earlier in the input keeps it, which keeps resolution
 * deterministic for vaults with accidental alias collisions.
 */
export class WikilinkAliasIndex {
  /** alias-key → canonical slug. Keys are normalized via `normalizeAliasKey`. */
  private aliasMap = new Map<string, string>();
  /** title-key → canonical slug. */
  private titleMap = new Map<string, string>();
  /** basename-key → canonical slug. */
  private basenameMap = new Map<string, string>();
  /** Number of entries handed to the constructor (debug). */
  public readonly pageCount: number;

  /**
   * @param entries One record per page. Records with an empty slug still count
   *   towards `pageCount` but contribute no keys.
   */
  constructor(entries: PageAliasFields[]) {
    this.pageCount = entries.length;

    // Register `raw` under its normalized key unless that key is already owned.
    const claim = (table: Map<string, string>, raw: string, owner: string): void => {
      const key = normalizeAliasKey(raw);
      if (key && !table.has(key)) table.set(key, owner);
    };

    for (const { slug, aliases, title } of entries) {
      if (!slug) continue;

      // Frontmatter aliases, whatever YAML shape they arrived in.
      for (const alias of collectAliasStrings(aliases)) {
        claim(this.aliasMap, alias, slug);
      }

      // H1 title.
      if (title) claim(this.titleMap, title, slug);

      // Basename: everything after the last `/` (the whole slug when there is
      // no `/`, because lastIndexOf returns -1 and slice(0) keeps it all).
      claim(this.basenameMap, slug.slice(slug.lastIndexOf('/') + 1), slug);
    }
  }

  /**
   * Resolve a wikilink target through the fallback tables.
   *
   * Candidate strings, in order: the target's last path segment, the full
   * target (only when it contains a `/`), and the display name (right side of
   * `[[path|Display]]`) when it is given and differs from the target. All
   * candidates are tried against aliases first, then titles, then basenames.
   *
   * @param rawTarget Wikilink target exactly as written.
   * @param displayName Optional display text of the wikilink.
   * @returns The matched slug and the resolver that matched, or `null`.
   */
  tryResolve(rawTarget: string, displayName?: string): WikilinkResolution | null {
    const cut = rawTarget.lastIndexOf('/');
    const hasPath = cut >= 0;

    const candidates: string[] = [hasPath ? rawTarget.slice(cut + 1) : rawTarget];
    if (hasPath) candidates.push(rawTarget);
    if (displayName && displayName !== rawTarget) candidates.push(displayName);

    // Normalize each candidate once; empty keys can never match.
    const keys = candidates.map((c) => normalizeAliasKey(c)).filter((k) => k !== '');

    // Most specific (authored intent) first.
    const tiers: Array<[Map<string, string>, WikilinkResolution['resolutionType']]> = [
      [this.aliasMap, 'alias'],
      [this.titleMap, 'title'],
      [this.basenameMap, 'basename'],
    ];

    for (const [table, resolutionType] of tiers) {
      for (const key of keys) {
        const slug = table.get(key);
        if (slug) return { slug, resolutionType };
      }
    }

    return null;
  }

  /** Number of keys held in each table (debug / smoke). */
  size(): { aliases: number; titles: number; basenames: number } {
    const aliases = this.aliasMap.size;
    const titles = this.titleMap.size;
    const basenames = this.basenameMap.size;
    return { aliases, titles, basenames };
  }
}

/**
 * Flatten a frontmatter `aliases:` value into a list of strings.
 *
 * YAML allows several spellings, all of which are handled:
 *   aliases: foo
 *   aliases: [foo, bar]
 *   aliases:
 *     - foo
 *     - bar
 *
 * Inside an array, a string is kept as-is and an object carrying a string
 * `name` property contributes that name. Every other item — and any
 * non-string, non-array top-level value — is silently dropped.
 *
 * @param value Raw frontmatter value, of unknown shape.
 * @returns The collected alias strings, in source order.
 */
function collectAliasStrings(value: unknown): string[] {
  if (typeof value === 'string') return [value];
  if (!Array.isArray(value)) return [];

  return value.flatMap((item: unknown): string[] => {
    if (typeof item === 'string') return [item];
    if (item && typeof item === 'object') {
      const name = (item as Record<string, unknown>).name;
      if (typeof name === 'string') return [name];
    }
    return [];
  });
}

/**
* Strip fenced code blocks (```...```) and inline code (`...`) from markdown,
* replacing them with whitespace of equivalent length. Preserves byte offsets
Expand Down Expand Up @@ -239,8 +462,11 @@ export function extractEntityRefs(content: string): EntityRef[] {
while ((match = mdPattern.exec(stripped)) !== null) {
const name = match[1];
const fullPath = match[2];
const slug = fullPath;
const dir = fullPath.split('/')[0];
// [PARA-PATCH] Normalize via slugifyPath so PascalCase / spaced segments
// (e.g. `10_Projects/User-Research`) match the lowercased DB slug. No-op
// for already-canonical refs like `people/alice-chen`.
const slug = slugifyPath(fullPath);
const dir = slug.split('/')[0];
refs.push({ name, slug, dir });
}

Expand All @@ -256,6 +482,8 @@ export function extractEntityRefs(content: string): EntityRef[] {
if (slug.includes('://')) continue;
if (slug.endsWith('.md')) slug = slug.slice(0, -3);
const displayName = (match[3] || slug).trim();
// [PARA-PATCH] Match wikilink path against DB slug grammar.
slug = slugifyPath(slug);
const dir = slug.split('/')[0];
refs.push({ name: displayName, slug, dir, sourceId });
qualifiedRanges.push([match.index, match.index + match[0].length]);
Expand All @@ -271,6 +499,8 @@ export function extractEntityRefs(content: string): EntityRef[] {
if (slug.includes('://')) continue;
if (slug.endsWith('.md')) slug = slug.slice(0, -3);
const displayName = (match[2] || slug).trim();
// [PARA-PATCH] Match wikilink path against DB slug grammar.
slug = slugifyPath(slug);
const dir = slug.split('/')[0];
refs.push({ name: displayName, slug, dir });
}
Expand Down
31 changes: 31 additions & 0 deletions src/core/migrate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3241,6 +3241,37 @@ export const MIGRATIONS: Migration[] = [
WHERE claim_metric IS NOT NULL;
`,
},
{
version: 68,
name: 'links_resolution_type_widen',
// v0.36.0 (TIM-28) — widen links.resolution_type to record which
// wikilink-fallback resolver pinned each edge. Before this migration the
// column was limited to 'qualified'/'unqualified', mirroring the v0.17
// source-id resolver. TIM-28 adds three more values:
// - 'path' — exact path-equality against the local-source slug set
// (the normal happy path for `[Name](path)` markdown
// refs and wikilinks whose target is already a slug)
// - 'alias' — fell through to a page's frontmatter `aliases:` entry
// - 'title' — fell through to a page's H1 heading text
// - 'basename' — fell through to a slug's last `/`-segment
//
// We drop the old constraint and add a wider one. The column itself stays
// nullable so frontmatter/manual edges keep their current shape.
idempotent: true,
sql: `
ALTER TABLE links DROP CONSTRAINT IF EXISTS links_resolution_type_check;
DO $$ BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint WHERE conname = 'links_resolution_type_check'
) THEN
ALTER TABLE links ADD CONSTRAINT links_resolution_type_check
CHECK (resolution_type IS NULL OR resolution_type IN (
'qualified', 'unqualified', 'path', 'alias', 'title', 'basename'
));
END IF;
END $$;
`,
},
];

export const LATEST_VERSION = MIGRATIONS.length > 0
Expand Down
19 changes: 14 additions & 5 deletions src/core/pglite-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1599,17 +1599,26 @@ export class PGLiteEngine implements BrainEngine {
const fromSourceIds = links.map(l => l.from_source_id || 'default');
const toSourceIds = links.map(l => l.to_source_id || 'default');
const originSourceIds = links.map(l => l.origin_source_id || 'default');
// v0.36.0 (TIM-28): pin which extraction-time resolver wrote each edge.
const resolutionTypes = links.map(l => l.resolution_type || null);
// ON CONFLICT DO NOTHING (not DO UPDATE) is mandatory here: a batch can
// contain duplicate (from, to, type, source, origin) tuples — e.g. when
// the same edge is extracted from both compiled_truth and timeline on
// the same page. Postgres refuses to let `DO UPDATE` affect the same row
// twice in one statement, while `DO NOTHING` quietly drops the dupes.
// The cost: resolution_type can't be back-filled on existing rows from
// here. Callers that need to refresh it should DELETE + re-INSERT.
const result = await this.db.query(
`INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field)
SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field
FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], $8::text[], $9::text[], $10::text[])
AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id)
`INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field, resolution_type)
SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field, v.resolution_type
FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], $8::text[], $9::text[], $10::text[], $11::text[])
AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id, resolution_type)
JOIN pages f ON f.slug = v.from_slug AND f.source_id = v.from_source_id
JOIN pages t ON t.slug = v.to_slug AND t.source_id = v.to_source_id
LEFT JOIN pages o ON o.slug = v.origin_slug AND o.source_id = v.origin_source_id
ON CONFLICT (from_page_id, to_page_id, link_type, link_source, origin_page_id) DO NOTHING
RETURNING 1`,
[fromSlugs, toSlugs, linkTypes, contexts, linkSources, originSlugs, originFields, fromSourceIds, toSourceIds, originSourceIds]
[fromSlugs, toSlugs, linkTypes, contexts, linkSources, originSlugs, originFields, fromSourceIds, toSourceIds, originSourceIds, resolutionTypes]
);
return result.rows.length;
}
Expand Down
6 changes: 4 additions & 2 deletions src/core/pglite-schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,10 @@ CREATE TABLE IF NOT EXISTS links (
link_source TEXT CHECK (link_source IS NULL OR link_source IN ('markdown', 'frontmatter', 'manual')),
origin_page_id INTEGER REFERENCES pages(id) ON DELETE SET NULL,
origin_field TEXT,
-- v0.18.0 Step 4: see src/schema.sql.
resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN ('qualified', 'unqualified')),
-- v0.18.0 Step 4 + v0.36.0 (TIM-28): see src/schema.sql.
resolution_type TEXT CHECK (resolution_type IS NULL OR resolution_type IN (
'qualified', 'unqualified', 'path', 'alias', 'title', 'basename'
)),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
CONSTRAINT links_from_to_type_source_origin_unique
UNIQUE NULLS NOT DISTINCT (from_page_id, to_page_id, link_type, link_source, origin_page_id)
Expand Down
Loading