diff --git a/.charter/config.json b/.charter/config.json index 25f7f34..b877fbb 100644 --- a/.charter/config.json +++ b/.charter/config.json @@ -44,5 +44,19 @@ } ] } + }, + "ontology": { + "ignoreAliases": [ + "token", + "tokens", + "key", + "keys", + "usage", + "audit", + "tier", + "plan", + "limit", + "limits" + ] } } diff --git a/packages/cli/src/commands/validate-ontology.ts b/packages/cli/src/commands/validate-ontology.ts new file mode 100644 index 0000000..e56a0f1 --- /dev/null +++ b/packages/cli/src/commands/validate-ontology.ts @@ -0,0 +1,394 @@ +/** + * charter validate --policy typed-data-access + * + * Ontology policy check: loads a data-registry YAML file (or uses an + * explicit path) and scans the current diff for references to registered + * business concepts. Flags non-canonical alias usage in new code as WARN. + * + * Delegates the detection logic to @stackbilt/validate's ontology module; + * this file handles only the CLI surface: flag parsing, config lookup, + * registry loading, git diff extraction, output formatting. 
+ */ + +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { + parseOntologyRegistry, + checkOntologyDiff, + normalizeToken, + type OntologyRegistry, + type OntologyChangedLine, + type OntologyCheckResult, + type OntologyReference, + type OntologyViolation, +} from '@stackbilt/validate'; +import type { CLIOptions } from '../index'; +import { CLIError, EXIT_CODE } from '../index'; +import { getFlag } from '../flags'; +import { runGit } from '../git-helpers'; +import { loadConfig } from '../config'; + +// ============================================================================ +// Types +// ============================================================================ + +interface OntologyValidateOutput { + status: 'PASS' | 'WARN' | 'FAIL'; + summary: string; + registryPath: string; + registrySource: 'explicit-flag' | 'config' | 'default'; + conceptCount: number; + changedLineCount: number; + scannedFileCount: number; + referencedConceptSummary: Array<{ + canonical: string; + owner: string; + sensitivity: string; + count: number; + }>; + violations: OntologyViolation[]; + references: OntologyReference[]; + suggestions: string[]; +} + +// ============================================================================ +// Entry Point +// ============================================================================ + +export function runOntologyPolicyCheck(options: CLIOptions, args: string[]): number { + const ciMode = options.ciMode; + const config = loadConfig(options.configPath); + + // ---- Load registry -------------------------------------------------------- + + const registryInfo = resolveRegistryPath(args, options, config); + let registry: OntologyRegistry; + try { + const yamlText = fs.readFileSync(registryInfo.path, 'utf-8'); + registry = parseOntologyRegistry(yamlText); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + throw new CLIError( + `Ontology registry load failed at ${registryInfo.path}: ${msg}\n Configure with .charter/config.json → ontology.registry or pass --registry <path>.`, + EXIT_CODE.RUNTIME_ERROR + ); + } + + // ---- Collect changed lines ------------------------------------------------ + + const range = getDiffRange(args); + let changedLines: OntologyChangedLine[]; + const scannedFiles = new Set<string>(); + try { + changedLines = collectChangedLines(range, scannedFiles, args); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + throw new CLIError( + `Ontology diff extraction failed for range ${range}: ${msg}`, + EXIT_CODE.RUNTIME_ERROR + ); + } + + // ---- Resolve per-repo alias ignore list ----------------------------------- + + const ignoredAliasTokens = new Set( + (config.ontology?.ignoreAliases ?? []).map(normalizeToken) + ); + + // ---- Run the check -------------------------------------------------------- + + const result = checkOntologyDiff(changedLines, registry, { + ignoredAliasTokens: ignoredAliasTokens.size > 0 ? 
ignoredAliasTokens : undefined, + }); + + // ---- Format output -------------------------------------------------------- + + const output: OntologyValidateOutput = buildOutputPayload( + result, + registry, + registryInfo, + changedLines, + scannedFiles, + range + ); + + if (options.format === 'json') { + console.log(JSON.stringify(output, null, 2)); + } else { + printTextOutput(output, range); + } + + // ---- Decide exit code ----------------------------------------------------- + + const hasFailViolation = result.violations.some(v => v.severity === 'FAIL'); + const hasWarnViolation = result.violations.some(v => v.severity === 'WARN'); + + if (hasFailViolation) return EXIT_CODE.POLICY_VIOLATION; + if (ciMode && hasWarnViolation) return EXIT_CODE.POLICY_VIOLATION; + return EXIT_CODE.SUCCESS; +} + +// ============================================================================ +// Registry Path Resolution +// ============================================================================ + +function resolveRegistryPath( + args: string[], + options: CLIOptions, + config: ReturnType<typeof loadConfig> +): { path: string; source: 'explicit-flag' | 'config' | 'default' } { + const explicitFlag = getFlag(args, '--registry'); + if (explicitFlag) { + return { path: path.resolve(explicitFlag), source: 'explicit-flag' }; + } + + const configured = config.ontology?.registry; + if (configured && configured.length > 0) { + // Resolve relative to the .charter/ config dir + const configDir = path.resolve(options.configPath); + const resolved = path.isAbsolute(configured) + ? 
configured + : path.resolve(configDir, configured); + return { path: resolved, source: 'config' }; + } + + // Default: look for .charter/data-registry.yaml + const defaultPath = path.resolve(options.configPath, 'data-registry.yaml'); + return { path: defaultPath, source: 'default' }; +} + +// ============================================================================ +// Diff Extraction +// ============================================================================ + +function getDiffRange(args: string[]): string { + const explicit = getFlag(args, '--range'); + if (explicit) return explicit; + // Default: diff HEAD against the tip of main/master (two-dot range). NOTE(review): + // a merge-base `${mainRef}...HEAD` range may be intended; falls back to HEAD~1..HEAD. + try { + const mainRef = (() => { + try { + return runGit(['rev-parse', '--verify', 'main']).trim(); + } catch { + return runGit(['rev-parse', '--verify', 'master']).trim(); + } + })(); + const head = runGit(['rev-parse', 'HEAD']).trim(); + if (mainRef && mainRef !== head) { + return `${mainRef}..HEAD`; + } + } catch { + // Fall through + } + return 'HEAD~1..HEAD'; +} + +/** + * Default skip patterns: test files and fixture directories whose alias + * content is expected and shouldn't trigger violations. Pass the + * `--scan-tests` flag to opt back in to scanning these files. + */ +const DEFAULT_SKIP_PATTERNS = [ + /(?:^|\/)__tests__\//, + /(?:^|\/)__fixtures__\//, + /(?:^|\/)__mocks__\//, + /\.test\.(?:ts|tsx|js|jsx|mjs|cjs)$/, + /\.spec\.(?:ts|tsx|js|jsx|mjs|cjs)$/, + /(?:^|\/)fixtures\//, + /(?:^|\/)test-fixtures\//, +]; + +function shouldSkipFile(filePath: string, args: string[]): boolean { + if (args.includes('--scan-tests')) return false; + return DEFAULT_SKIP_PATTERNS.some(re => re.test(filePath)); +} + +/** + * Run git diff --unified=0 for the given range and return added lines + * (lines starting with + in the hunk body, excluding the +++ file header). 
+ * + * Files matching DEFAULT_SKIP_PATTERNS (test files, fixture dirs) are + * filtered out unless `--scan-tests` is passed. Test fixtures intentionally + * contain alias strings (e.g., `aliases: [workspace, organization]` in a + * registry YAML fixture) and should not count as production alias usage. + */ +function collectChangedLines( + range: string, + scannedFilesOut: Set<string>, + args: string[] = [] +): OntologyChangedLine[] { + // --unified=0 yields hunks with no context lines, so every +line is a real + // added line. Exclude the "+++ b/file" header line explicitly. + const diffOutput = runGit(['diff', '--unified=0', range]); + + const lines: OntologyChangedLine[] = []; + let currentFile: string | null = null; + let currentAddLine = 0; + let skipCurrentFile = false; + + for (const raw of diffOutput.split(/\r?\n/)) { + // File header: "+++ b/path/to/file.ts" + if (raw.startsWith('+++ ')) { + const match = raw.match(/^\+\+\+ (?:b\/)?(.+)$/); + if (match && match[1] !== '/dev/null') { + currentFile = match[1]; + skipCurrentFile = shouldSkipFile(currentFile, args); + if (!skipCurrentFile) { + scannedFilesOut.add(currentFile); + } + } else { + currentFile = null; + skipCurrentFile = false; + } + continue; + } + // Skip other file headers: "--- a/...", "diff --git", "index ...", binary markers, etc. + if ( + raw.startsWith('--- ') || + raw.startsWith('diff --git') || + raw.startsWith('index ') || + raw.startsWith('new file') || + raw.startsWith('deleted file') || + raw.startsWith('rename ') || + raw.startsWith('similarity ') || + raw.startsWith('Binary files ') + ) { + continue; + } + + // Hunk header: "@@ -a,b +c,d @@" — advance line counter + const hunkMatch = raw.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? 
@@/); + if (hunkMatch) { + currentAddLine = parseInt(hunkMatch[1], 10); + continue; + } + + // Added line: "+content" (not "+++" which is caught above) + if (raw.startsWith('+') && currentFile && !skipCurrentFile) { + const text = raw.slice(1); + lines.push({ file: currentFile, line: currentAddLine, text }); + currentAddLine++; + continue; + } + + // Removed lines (-prefix) don't affect the add counter in --unified=0 + } + + return lines; +} + +// ============================================================================ +// Output Formatting +// ============================================================================ + +function buildOutputPayload( + result: OntologyCheckResult, + registry: OntologyRegistry, + registryInfo: { path: string; source: 'explicit-flag' | 'config' | 'default' }, + changedLines: OntologyChangedLine[], + scannedFiles: Set<string>, + range: string +): OntologyValidateOutput { + const hasWarn = result.violations.some(v => v.severity === 'WARN'); + const hasFail = result.violations.some(v => v.severity === 'FAIL'); + const status: 'PASS' | 'WARN' | 'FAIL' = hasFail ? 'FAIL' : hasWarn ? 'WARN' : 'PASS'; + + const referencedConceptSummary = Array.from(result.referencedConcepts.entries()) + .map(([canonical, count]) => { + const concept = registry.concepts.get(canonical)!; + return { + canonical, + owner: concept.owner, + sensitivity: concept.sensitivity, + count, + }; + }) + .sort((a, b) => b.count - a.count); + + const suggestions: string[] = []; + if (result.violations.length > 0) { + const uniqueAliases = new Set( + result.violations + .filter(v => v.type === 'NON_CANONICAL_ALIAS') + .map(v => `${v.identifier} → ${v.canonical}`) + ); + if (uniqueAliases.size > 0) { + suggestions.push( + `Prefer canonical forms in new code: ${Array.from(uniqueAliases).slice(0, 5).join(', ')}${uniqueAliases.size > 5 ? 
` (+${uniqueAliases.size - 5} more)` : ''}` + ); + } + suggestions.push('Aliases are acceptable in user-facing copy only; rename variable/type identifiers to the canonical name.'); + } + if (result.passed && result.references.length === 0) { + suggestions.push('No registered concepts were referenced in this diff — ontology check surfaced nothing to validate.'); + } + + const plural = (n: number, singular: string, plural: string): string => `${n} ${n === 1 ? singular : plural}`; + const summary = + status === 'FAIL' + ? `${plural(result.violations.filter(v => v.severity === 'FAIL').length, 'policy violation', 'policy violations')} in ontology check (range: ${range}).` + : status === 'WARN' + ? `${plural(result.violations.length, 'non-canonical alias', 'non-canonical aliases')} found in ${plural(scannedFiles.size, 'changed file', 'changed files')}.` + : result.references.length === 0 + ? `No registered concepts referenced in ${plural(scannedFiles.size, 'changed file', 'changed files')}.` + : `${plural(result.referencedConcepts.size, 'registered concept', 'registered concepts')} referenced cleanly across ${plural(scannedFiles.size, 'changed file', 'changed files')}.`; + + return { + status, + summary, + registryPath: registryInfo.path, + registrySource: registryInfo.source, + conceptCount: registry.concepts.size, + changedLineCount: changedLines.length, + scannedFileCount: scannedFiles.size, + referencedConceptSummary, + violations: result.violations, + references: result.references, + suggestions, + }; +} + +function printTextOutput(output: OntologyValidateOutput, range: string): void { + const icon = output.status === 'PASS' ? '[ok]' : output.status === 'WARN' ? 
'[warn]' : '[fail]'; + console.log(''); + console.log(` ${icon} Ontology policy: ${output.status}`); + console.log(` ${output.summary}`); + console.log(` Registry: ${output.registryPath} (${output.registrySource})`); + console.log(` Concepts loaded: ${output.conceptCount}`); + console.log(` Range: ${range} — ${output.changedLineCount} added line(s) across ${output.scannedFileCount} file(s)`); + + if (output.referencedConceptSummary.length > 0) { + console.log(''); + console.log(' Referenced concepts:'); + for (const entry of output.referencedConceptSummary.slice(0, 15)) { + console.log(` - ${entry.canonical.padEnd(20)} ${String(entry.count).padStart(3)}× | ${entry.owner} | ${entry.sensitivity}`); + } + if (output.referencedConceptSummary.length > 15) { + console.log(` (+${output.referencedConceptSummary.length - 15} more)`); + } + } + + if (output.violations.length > 0) { + console.log(''); + console.log(` Violations (${output.violations.length}):`); + for (const v of output.violations.slice(0, 20)) { + const loc = v.file ? 
`${v.file}:${v.line}` : '(unknown location)'; + console.log(` - [${v.severity}] ${loc}`); + console.log(` ${v.message}`); + } + if (output.violations.length > 20) { + console.log(` (+${output.violations.length - 20} more)`); + } + } + + if (output.suggestions.length > 0) { + console.log(''); + console.log(' Suggestions:'); + for (const s of output.suggestions) { + console.log(` - ${s}`); + } + } + + console.log(''); +} diff --git a/packages/cli/src/commands/validate.ts b/packages/cli/src/commands/validate.ts index e5b9580..1e54f4e 100644 --- a/packages/cli/src/commands/validate.ts +++ b/packages/cli/src/commands/validate.ts @@ -6,12 +6,13 @@ */ import type { CLIOptions } from '../index'; -import { EXIT_CODE } from '../index'; +import { CLIError, EXIT_CODE } from '../index'; import type { GitCommit } from '@stackbilt/types'; import { loadConfig } from '../config'; import { parseAllTrailers } from '@stackbilt/git'; import { assessCommitRisk, generateSuggestions } from '@stackbilt/git'; import { runGit, hasCommits, getGitErrorMessage, parseCommitMetadata, parseChangedFilesByCommit, getRecentCommitRange } from '../git-helpers'; +import { getFlag } from '../flags'; interface LocalValidationResult { status: 'PASS' | 'WARN' | 'FAIL'; @@ -66,6 +67,19 @@ interface GitCommitLoadResult { } export async function validateCommand(options: CLIOptions, args: string[]): Promise<number> { + // Policy dispatch: --policy typed-data-access runs the ontology check + // instead of the default trailer validation. + const policy = getFlag(args, '--policy'); + if (policy === 'typed-data-access') { + const { runOntologyPolicyCheck } = await import('./validate-ontology'); + return runOntologyPolicyCheck(options, args); + } + if (policy && policy !== 'typed-data-access') { + throw new CLIError( + `Unknown policy: ${policy}. 
Supported: typed-data-access` + ); + } + const config = loadConfig(options.configPath); if (!hasCommits()) { diff --git a/packages/cli/src/config.ts b/packages/cli/src/config.ts index cb1ab3f..c251bec 100644 --- a/packages/cli/src/config.ts +++ b/packages/cli/src/config.ts @@ -14,7 +14,7 @@ import type { Pattern } from '@stackbilt/types'; // Config Types // ============================================================================ -export interface CharterConfig { +export interface CharterConfig { /** Project name */ project: string; /** Version of config schema */ @@ -40,35 +40,61 @@ export interface CharterConfig { exclude: string[]; }; - /** Validation settings */ - validation: { - /** Citation strictness: FAIL/STRICT, WARN, PERMISSIVE */ - citationStrictness: 'FAIL' | 'STRICT' | 'WARN' | 'PERMISSIVE'; - }; + /** Validation settings */ + validation: { + /** Citation strictness: FAIL/STRICT, WARN, PERMISSIVE */ + citationStrictness: 'FAIL' | 'STRICT' | 'WARN' | 'PERMISSIVE'; + }; - /** CI behavior */ - ci: { + /** CI behavior */ + ci: { /** Fail CI on WARN (default: only fail on FAIL) */ failOnWarn: boolean; /** Post PR comments (requires GitHub token) */ - postComments: boolean; - }; - - /** Audit scoring behavior */ - audit: { - policyCoverage: { - enabled: boolean; - requiredSections: Array<{ - id: string; - title: string; - match: string[]; - }>; - }; - }; -} - -const DEFAULT_CONFIG: CharterConfig = { - project: 'my-project', + postComments: boolean; + }; + + /** Audit scoring behavior */ + audit: { + policyCoverage: { + enabled: boolean; + requiredSections: Array<{ + id: string; + title: string; + match: string[]; + }>; + }; + }; + + /** + * Ontology / typed-data-access policy settings. + * Used by `charter validate --policy typed-data-access` to load a data + * registry file that declares canonical business concepts, sensitivity + * tiers, and aliases. See Stackbilt-dev/charter#69. + */ + ontology?: { + /** + * Path to a data-registry YAML file. 
Absolute, or relative to the + * .charter/ config directory. When unset, defaults to + * `.charter/data-registry.yaml`. + */ + registry?: string; + /** + * Alias tokens to suppress from violation reporting in this repo. + * Useful when a generic alias (e.g., `token`, `key`, `usage`) collides + * with common programming vocabulary in this codebase. Each entry is + * matched against normalized tokens (lowercased, underscores removed). + * + * Does not affect canonical name matching — only silences the specific + * alias-form collision. Prefer fixing the registry upstream when the + * alias is globally noisy; use this list for repo-local overrides. + */ + ignoreAliases?: string[]; + }; +} + +const DEFAULT_CONFIG: CharterConfig = { + project: 'my-project', version: '0.1', git: { requireTrailers: true, @@ -80,41 +106,41 @@ const DEFAULT_CONFIG: CharterConfig = { include: ['**/*.ts', '**/*.js', '**/*.tsx', '**/*.jsx'], exclude: ['node_modules/**', 'dist/**', '.git/**', 'coverage/**'], }, - validation: { - citationStrictness: 'FAIL', - }, - ci: { - failOnWarn: false, - postComments: false, - }, - audit: { - policyCoverage: { - enabled: true, - requiredSections: [ - { - id: 'commit_trailers', - title: 'Commit Trailers', - match: ['commit trailers', 'governed-by', 'resolves-request'], - }, - { - id: 'change_classification', - title: 'Change Classification', - match: ['change classification', 'surface', 'local', 'cross_cutting'], - }, - { - id: 'exception_path', - title: 'Exception Path', - match: ['exception', 'waiver', 'override'], - }, - { - id: 'escalation_approval', - title: 'Escalation & Approval', - match: ['escalation', 'approval', 'architectural review'], - }, - ], - }, - }, -}; + validation: { + citationStrictness: 'FAIL', + }, + ci: { + failOnWarn: false, + postComments: false, + }, + audit: { + policyCoverage: { + enabled: true, + requiredSections: [ + { + id: 'commit_trailers', + title: 'Commit Trailers', + match: ['commit trailers', 'governed-by', 
'resolves-request'], + }, + { + id: 'change_classification', + title: 'Change Classification', + match: ['change classification', 'surface', 'local', 'cross_cutting'], + }, + { + id: 'exception_path', + title: 'Exception Path', + match: ['exception', 'waiver', 'override'], + }, + { + id: 'escalation_approval', + title: 'Escalation & Approval', + match: ['escalation', 'approval', 'architectural review'], + }, + ], + }, + }, +}; // ============================================================================ // Config Loading @@ -138,19 +164,20 @@ export function loadConfig(configPath: string): CharterConfig { return { project: parsed.project || DEFAULT_CONFIG.project, version: parsed.version || DEFAULT_CONFIG.version, - git: { ...DEFAULT_CONFIG.git, ...parsed.git }, - drift: { ...DEFAULT_CONFIG.drift, ...parsed.drift }, - validation: { ...DEFAULT_CONFIG.validation, ...parsed.validation }, - ci: { ...DEFAULT_CONFIG.ci, ...parsed.ci }, - audit: { - policyCoverage: { - ...DEFAULT_CONFIG.audit.policyCoverage, - ...(parsed.audit?.policyCoverage || {}), - requiredSections: parsed.audit?.policyCoverage?.requiredSections || DEFAULT_CONFIG.audit.policyCoverage.requiredSections, - }, - }, - }; - } catch (err) { + git: { ...DEFAULT_CONFIG.git, ...parsed.git }, + drift: { ...DEFAULT_CONFIG.drift, ...parsed.drift }, + validation: { ...DEFAULT_CONFIG.validation, ...parsed.validation }, + ci: { ...DEFAULT_CONFIG.ci, ...parsed.ci }, + audit: { + policyCoverage: { + ...DEFAULT_CONFIG.audit.policyCoverage, + ...(parsed.audit?.policyCoverage || {}), + requiredSections: parsed.audit?.policyCoverage?.requiredSections || DEFAULT_CONFIG.audit.policyCoverage.requiredSections, + }, + }, + ontology: parsed.ontology, + }; + } catch (err) { console.warn(`Warning: Failed to parse ${configFile}, using defaults`); return DEFAULT_CONFIG; } @@ -159,7 +186,7 @@ export function loadConfig(configPath: string): CharterConfig { /** * Load blessed patterns from .charter/patterns/*.json */ -export function 
loadPatterns(configPath: string): Pattern[] { +export function loadPatterns(configPath: string): Pattern[] { const patternsDir = path.join(configPath, 'patterns'); if (!fs.existsSync(patternsDir)) { @@ -171,19 +198,19 @@ export function loadPatterns(configPath: string): Pattern[] { for (const file of files) { try { - const raw = fs.readFileSync(path.join(patternsDir, file), 'utf-8'); - const parsed = JSON.parse(raw); - const normalized = !Array.isArray(parsed) - && parsed - && typeof parsed === 'object' - && Array.isArray((parsed as { patterns?: unknown[] }).patterns) - ? (parsed as { patterns: unknown[] }).patterns - : parsed; - - // Support both single pattern and array of patterns - const items = Array.isArray(normalized) ? normalized : [normalized]; - for (const item of items) { - patterns.push({ + const raw = fs.readFileSync(path.join(patternsDir, file), 'utf-8'); + const parsed = JSON.parse(raw); + const normalized = !Array.isArray(parsed) + && parsed + && typeof parsed === 'object' + && Array.isArray((parsed as { patterns?: unknown[] }).patterns) + ? (parsed as { patterns: unknown[] }).patterns + : parsed; + + // Support both single pattern and array of patterns + const items = Array.isArray(normalized) ? 
normalized : [normalized]; + for (const item of items) { + patterns.push({ id: item.id || `local-${file}-${patterns.length}`, name: item.name, category: item.category || 'COMPUTE', @@ -202,41 +229,41 @@ export function loadPatterns(configPath: string): Pattern[] { } } - return patterns; -} - -export function getPatternCustomizationStatus(configPath: string): boolean | null { - const patternsDir = path.join(configPath, 'patterns'); - if (!fs.existsSync(patternsDir)) return null; - - const files = fs.readdirSync(patternsDir).filter(f => f.endsWith('.json')); - const flaggedValues: boolean[] = []; - - for (const file of files) { - try { - const raw = fs.readFileSync(path.join(patternsDir, file), 'utf-8'); - const parsed = JSON.parse(raw) as { customized?: unknown }; - if (typeof parsed?.customized === 'boolean') { - flaggedValues.push(parsed.customized); - } - } catch { - // ignore malformed pattern file metadata - } - } - - if (flaggedValues.length === 0) return null; - return flaggedValues.every(Boolean); -} + return patterns; +} + +export function getPatternCustomizationStatus(configPath: string): boolean | null { + const patternsDir = path.join(configPath, 'patterns'); + if (!fs.existsSync(patternsDir)) return null; + + const files = fs.readdirSync(patternsDir).filter(f => f.endsWith('.json')); + const flaggedValues: boolean[] = []; + + for (const file of files) { + try { + const raw = fs.readFileSync(path.join(patternsDir, file), 'utf-8'); + const parsed = JSON.parse(raw) as { customized?: unknown }; + if (typeof parsed?.customized === 'boolean') { + flaggedValues.push(parsed.customized); + } + } catch { + // ignore malformed pattern file metadata + } + } + + if (flaggedValues.length === 0) return null; + return flaggedValues.every(Boolean); +} /** * Get the default config as JSON string (for init command). 
*/ -export function getDefaultConfigJSON(projectName?: string): string { - const config = { - ...DEFAULT_CONFIG, - project: projectName || DEFAULT_CONFIG.project, - }; - return JSON.stringify(config, null, 2); -} +export function getDefaultConfigJSON(projectName?: string): string { + const config = { + ...DEFAULT_CONFIG, + project: projectName || DEFAULT_CONFIG.project, + }; + return JSON.stringify(config, null, 2); +} export { DEFAULT_CONFIG }; diff --git a/packages/validate/src/__tests__/ontology.test.ts b/packages/validate/src/__tests__/ontology.test.ts new file mode 100644 index 0000000..a05f593 --- /dev/null +++ b/packages/validate/src/__tests__/ontology.test.ts @@ -0,0 +1,473 @@ +import { describe, it, expect } from 'vitest'; +import { + parseOntologyRegistry, + parseInlineFlowSequence, + extractIdentifiersFromLine, + stripCommentsAndStrings, + checkOntologyDiff, + normalizeToken, + type OntologyChangedLine, +} from '../ontology'; + +// ============================================================================ +// Fixture +// ============================================================================ + +const FIXTURE_REGISTRY_YAML = `# Test registry — minimal subset of real data-registry.yaml shape +concepts: + + # ─── edge-auth ────────────────────────── + + tenant: + owner: edge-auth + table: tenants + sensitivity: cross_service_rpc + definition: User workspace or account boundary. + aliases: [tenants, workspace, workspaces, organization] + rpc_method: getTenant + mcp_tool: edge-auth + + user: + owner: edge-auth + table: users + sensitivity: pii_scoped + definition: Authenticated user identity. + aliases: [users, account, accounts] + rpc_method: getUser + + quota: + owner: edge-auth + table: quotas + sensitivity: cross_service_rpc + definition: Resource usage limits and current balance per tenant. 
+ aliases: [credits, credit, stackbilt_credits, usage, limits] + rpc_method: checkQuota + + subscription: + owner: edge-auth + table: tenants.tier + sensitivity: billing_critical + definition: Subscription level. + aliases: [tier, tiers, plan] +`; + +// ============================================================================ +// Helpers +// ============================================================================ + +function buildFixtureRegistry() { + return parseOntologyRegistry(FIXTURE_REGISTRY_YAML); +} + +function line(text: string, file = 'src/handler.ts', lineNumber = 1): OntologyChangedLine { + return { file, line: lineNumber, text }; +} + +// ============================================================================ +// normalizeToken +// ============================================================================ + +describe('normalizeToken', () => { + it('lowercases', () => { + expect(normalizeToken('Tenant')).toBe('tenant'); + expect(normalizeToken('TENANT')).toBe('tenant'); + }); + + it('strips underscores', () => { + expect(normalizeToken('tenant_id')).toBe('tenantid'); + expect(normalizeToken('api_key_hash')).toBe('apikeyhash'); + }); + + it('strips spaces', () => { + expect(normalizeToken('api key')).toBe('apikey'); + }); + + it('strips hyphens', () => { + expect(normalizeToken('edge-auth')).toBe('edgeauth'); + }); +}); + +// ============================================================================ +// parseInlineFlowSequence +// ============================================================================ + +describe('parseInlineFlowSequence', () => { + it('parses a simple flow sequence', () => { + expect(parseInlineFlowSequence('[a, b, c]')).toEqual(['a', 'b', 'c']); + }); + + it('handles whitespace', () => { + expect(parseInlineFlowSequence('[ tenant, workspace ,organization ]')).toEqual([ + 'tenant', + 'workspace', + 'organization', + ]); + }); + + it('handles items with underscores', () => { + expect(parseInlineFlowSequence('[credits, 
stackbilt_credits, usage]')).toEqual([ + 'credits', + 'stackbilt_credits', + 'usage', + ]); + }); + + it('returns empty array for non-sequence input', () => { + expect(parseInlineFlowSequence('not a sequence')).toEqual([]); + expect(parseInlineFlowSequence('[]')).toEqual([]); + }); +}); + +// ============================================================================ +// parseOntologyRegistry +// ============================================================================ + +describe('parseOntologyRegistry', () => { + it('loads all 4 concepts from the fixture', () => { + const registry = buildFixtureRegistry(); + expect(registry.concepts.size).toBe(4); + expect(registry.concepts.has('tenant')).toBe(true); + expect(registry.concepts.has('user')).toBe(true); + expect(registry.concepts.has('quota')).toBe(true); + expect(registry.concepts.has('subscription')).toBe(true); + }); + + it('populates concept fields correctly', () => { + const registry = buildFixtureRegistry(); + const tenant = registry.concepts.get('tenant')!; + expect(tenant.owner).toBe('edge-auth'); + expect(tenant.table).toBe('tenants'); + expect(tenant.sensitivity).toBe('cross_service_rpc'); + expect(tenant.definition).toContain('User workspace'); + expect(tenant.rpcMethod).toBe('getTenant'); + expect(tenant.mcpTool).toBe('edge-auth'); + expect(tenant.aliases).toContain('workspace'); + }); + + it('handles concepts without optional fields', () => { + const registry = buildFixtureRegistry(); + const user = registry.concepts.get('user')!; + expect(user.rpcMethod).toBe('getUser'); + expect(user.mcpTool).toBeUndefined(); + }); + + it('indexes aliases to canonical names', () => { + const registry = buildFixtureRegistry(); + expect(registry.aliasIndex.get('workspace')).toBe('tenant'); + expect(registry.aliasIndex.get('credits')).toBe('quota'); + expect(registry.aliasIndex.get('credit')).toBe('quota'); + expect(registry.aliasIndex.get('tiers')).toBe('subscription'); + }); + + it('populates canonical token 
index', () => { + const registry = buildFixtureRegistry(); + expect(registry.canonicalTokens.has('tenant')).toBe(true); + expect(registry.canonicalTokens.has('subscription')).toBe(true); + }); + + it('populates alias token set', () => { + const registry = buildFixtureRegistry(); + expect(registry.aliasTokens.has('workspace')).toBe(true); + expect(registry.aliasTokens.has('stackbiltcredits')).toBe(true); + }); + + it('skips comment lines and blank lines', () => { + const withComments = `# top comment +concepts: + # section header + + tenant: + owner: edge-auth + sensitivity: cross_service_rpc + definition: test + aliases: [workspace] +`; + const registry = parseOntologyRegistry(withComments); + expect(registry.concepts.size).toBe(1); + expect(registry.concepts.get('tenant')!.owner).toBe('edge-auth'); + }); + + it('throws on empty/malformed input', () => { + expect(() => parseOntologyRegistry('')).toThrow(/no concepts found/); + expect(() => parseOntologyRegistry('# just a comment')).toThrow(/no concepts found/); + }); + + it('handles table: null for derived concepts', () => { + const withNullTable = `concepts: + flow: + owner: edgestack-v2 + table: null + sensitivity: cross_service_rpc + definition: Derived flow + aliases: [flows] +`; + const registry = parseOntologyRegistry(withNullTable); + expect(registry.concepts.get('flow')!.table).toBeNull(); + }); +}); + +// ============================================================================ +// extractIdentifiersFromLine +// ============================================================================ + +describe('extractIdentifiersFromLine', () => { + it('extracts simple identifiers', () => { + const tokens = extractIdentifiersFromLine('const tenant = getTenant(id);'); + expect(tokens).toContain('tenant'); + expect(tokens).toContain('gettenant'); + expect(tokens).toContain('id'); + }); + + it('extracts snake_case identifiers as normalized tokens', () => { + const tokens = extractIdentifiersFromLine('const tenant_id 
= row.tenant_id;'); + // The identifier regex [a-zA-Z_][a-zA-Z0-9_]* treats _ as a word character, + // so tenant_id is extracted as one token rather than split into 'tenant' + // and 'id'; normalizeToken then strips the underscore, yielding 'tenantid'. + expect(tokens.some(t => t === 'tenantid')).toBe(true); + }); + + it('handles SQL-style statements', () => { + const tokens = extractIdentifiersFromLine("SELECT * FROM tenants WHERE user_id = ?"); + expect(tokens).toContain('tenants'); + expect(tokens).toContain('userid'); + }); + + it('ignores punctuation and numbers', () => { + const tokens = extractIdentifiersFromLine('const PI = 3.14;'); + expect(tokens).toContain('pi'); + expect(tokens).not.toContain('3'); + expect(tokens).not.toContain('14'); + }); + + it('strips line comments before tokenizing', () => { + const tokens = extractIdentifiersFromLine('const quota = 1; // alias usage!'); + expect(tokens).toContain('quota'); + // 'usage' is in the comment — should not be extracted + expect(tokens).not.toContain('usage'); + }); + + it('strips # comments before tokenizing', () => { + const tokens = extractIdentifiersFromLine('quota: 1 # credits and usage notes'); + expect(tokens).toContain('quota'); + expect(tokens).not.toContain('credits'); + expect(tokens).not.toContain('usage'); + }); + + it('strips SQL -- comments before tokenizing', () => { + const tokens = extractIdentifiersFromLine('SELECT quota FROM tenants -- credits table'); + expect(tokens).toContain('quota'); + expect(tokens).toContain('tenants'); + expect(tokens).not.toContain('credits'); + }); + + it('strips string literals before tokenizing', () => { + const tokens = extractIdentifiersFromLine('const label = "credit balance usage";'); + expect(tokens).toContain('label'); + // credits, usage inside string literal — should not be extracted + expect(tokens).not.toContain('credit'); + expect(tokens).not.toContain('usage'); + expect(tokens).not.toContain('balance'); + }); + + it('strips single-quoted strings', () => { 
const tokens = extractIdentifiersFromLine("throw new Error('tenant not found');"); + expect(tokens).toContain('error'); + expect(tokens).toContain('throw'); + // 'tenant' is inside the string literal + expect(tokens).not.toContain('tenant'); + }); + + it('strips template literals', () => { + const tokens = extractIdentifiersFromLine('log(`tenant credits updated`);'); + expect(tokens).toContain('log'); + expect(tokens).not.toContain('tenant'); + expect(tokens).not.toContain('credits'); + }); + + it('does not strip URLs starting with http://', () => { + const tokens = extractIdentifiersFromLine('const url = http://api.example.com;'); + // http://api.example.com is not a // line comment because of the : prefix + expect(tokens).toContain('url'); + expect(tokens).toContain('http'); + }); + + it('preserves block comment stripping for /* inline */ spans', () => { + const tokens = extractIdentifiersFromLine('const tenant = /* credits TODO */ null;'); + expect(tokens).toContain('tenant'); + expect(tokens).not.toContain('credits'); + expect(tokens).not.toContain('todo'); + }); +}); + +describe('stripCommentsAndStrings', () => { + it('leaves plain code unchanged', () => { + const result = stripCommentsAndStrings('const tenant = getTenant(id);'); + expect(result).toContain('tenant'); + expect(result).toContain('getTenant'); + }); + + it('strips trailing line comments', () => { + const result = stripCommentsAndStrings('const x = 1; // comment with credits'); + expect(result).not.toContain('credits'); + }); + + it('strips YAML-style # comments with space guard', () => { + const result = stripCommentsAndStrings('key: value # comment'); + expect(result).not.toContain('comment'); + expect(result).toContain('key'); + }); + + it('preserves # directives without space (guarded)', () => { + const result = stripCommentsAndStrings('#include
'); + expect(result).toContain('#include'); + }); + + it('strips JSDoc interior lines starting with * ', () => { + const result = stripCommentsAndStrings(' * tiers and sensitivity levels.'); + // Whole line is treated as comment + const tokens = extractIdentifiersFromLine(' * tiers and sensitivity levels.'); + expect(tokens).not.toContain('tiers'); + expect(tokens).not.toContain('sensitivity'); + }); + + it('strips JSDoc opener lines (/** ...)', () => { + const tokens = extractIdentifiersFromLine('/** Describes tenant and quota.'); + expect(tokens).not.toContain('tenant'); + expect(tokens).not.toContain('quota'); + }); + + it('does not strip multiplication lines like 2 * tier', () => { + // Leading `*` only triggers if it's the first non-whitespace char + const tokens = extractIdentifiersFromLine('const result = 2 * tier;'); + expect(tokens).toContain('tier'); + expect(tokens).toContain('result'); + }); + + it('handles block comment closer on same line as content', () => { + const result = stripCommentsAndStrings(' * close comment */ const tenant = 1;'); + // Content after */ is preserved + const tokens = extractIdentifiersFromLine(' * close comment */ const tenant = 1;'); + expect(tokens).toContain('tenant'); + // The pre-*/ text is stripped + expect(tokens).not.toContain('close'); + expect(tokens).not.toContain('comment'); + }); +}); + +// ============================================================================ +// checkOntologyDiff +// ============================================================================ + +describe('checkOntologyDiff', () => { + it('flags alias usage as WARN violation', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const credits = await checkCredits(tenantId);')], + registry + ); + + // 'credits' is an alias for 'quota' + const aliasViolations = result.violations.filter(v => v.type === 'NON_CANONICAL_ALIAS'); + expect(aliasViolations.length).toBeGreaterThan(0); + 
expect(aliasViolations.some(v => v.identifier === 'credits' && v.canonical === 'quota')).toBe(true); + expect(result.passed).toBe(false); + }); + + it('does not flag canonical usage', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const quota = await checkQuota(tenantId);')], + registry + ); + expect(result.violations.filter(v => v.severity === 'WARN').length).toBe(0); + expect(result.passed).toBe(true); + // quota is referenced canonically + const references = result.references.filter(r => r.canonical === 'quota'); + expect(references.length).toBeGreaterThan(0); + expect(references.every(r => !r.isAlias)).toBe(true); + }); + + it('reports informational references even on a clean diff', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const t = await getTenant(tenantId);')], + registry + ); + // 'tenant' canonical is referenced via 'tenantid' normalized token + // Actually: getTenant normalizes to 'gettenant', tenantId → 'tenantid' + // Neither matches 'tenant' exactly. We only flag exact token matches. + // So this diff should have no references if no exact 'tenant' token. 
+ // Add a line with bare 'tenant' to prove the reference flow: + const result2 = checkOntologyDiff( + [line('async function handleTenant(tenant: Tenant) {}')], + registry + ); + expect(result2.references.some(r => r.canonical === 'tenant' && !r.isAlias)).toBe(true); + expect(result2.passed).toBe(true); + }); + + it('summarizes referenced concept counts', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [ + line('const tenant = await getTenant();', 'a.ts', 1), // canonical + line('const user = await getUser();', 'b.ts', 1), // canonical + line('const workspace = tenant;', 'c.ts', 1), // alias AND canonical + ], + registry + ); + // tenant: line 1 (canonical) + line 3 (alias via 'workspace') + line 3 (canonical via 'tenant') = 3 + expect(result.referencedConcepts.get('tenant')).toBe(3); + expect(result.referencedConcepts.get('user')).toBe(1); + }); + + it('does not double-count the same token on the same line', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const tenant = tenant.workspace;', 'a.ts', 1)], + registry + ); + const tenantRefs = result.references.filter(r => r.canonical === 'tenant' && !r.isAlias); + expect(tenantRefs.length).toBe(1); + }); + + it('suppresses alias violations when ignoreAliasViolations is set', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const credits = checkCredits();')], + registry, + { ignoreAliasViolations: true } + ); + expect(result.violations.filter(v => v.type === 'NON_CANONICAL_ALIAS').length).toBe(0); + expect(result.passed).toBe(true); + // But still reports it as a reference + expect(result.references.some(r => r.canonical === 'quota' && r.isAlias)).toBe(true); + }); + + it('flags each changed line independently', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [ + line('const credits = 1;', 'a.ts', 10), + line('const credit = 2;', 'a.ts', 11), + 
], + registry + ); + const violations = result.violations.filter(v => v.type === 'NON_CANONICAL_ALIAS'); + expect(violations.length).toBe(2); + expect(violations.every(v => v.canonical === 'quota')).toBe(true); + expect(violations.map(v => v.line).sort()).toEqual([10, 11]); + }); + + it('includes canonical concept metadata in violation messages', () => { + const registry = buildFixtureRegistry(); + const result = checkOntologyDiff( + [line('const tier = plan.tier;')], // 'tier' is alias for 'subscription', 'plan' too + registry + ); + const violation = result.violations.find(v => v.identifier === 'tier' || v.identifier === 'plan'); + expect(violation).toBeDefined(); + expect(violation!.owner).toBe('edge-auth'); + expect(violation!.sensitivity).toBe('billing_critical'); + }); +}); diff --git a/packages/validate/src/index.ts b/packages/validate/src/index.ts index 862f39c..f07b000 100644 --- a/packages/validate/src/index.ts +++ b/packages/validate/src/index.ts @@ -14,3 +14,18 @@ export { type MessageIntent, type DudePhase, } from './message-classifier'; + +export { + parseOntologyRegistry, + parseInlineFlowSequence, + extractIdentifiersFromLine, + checkOntologyDiff, + normalizeToken, + type OntologySensitivityTier, + type OntologyConcept, + type OntologyRegistry, + type OntologyChangedLine, + type OntologyViolation, + type OntologyReference, + type OntologyCheckResult, +} from './ontology'; diff --git a/packages/validate/src/ontology.ts b/packages/validate/src/ontology.ts new file mode 100644 index 0000000..99fbe13 --- /dev/null +++ b/packages/validate/src/ontology.ts @@ -0,0 +1,428 @@ +/** + * Ontology Policy Validator + * + * Validates that changed code references business concepts by their canonical + * registered names rather than arbitrary aliases. Consumes a data-registry + * YAML file (e.g., stackbilt_llc/policies/data-registry.yaml) that declares + * each concept's canonical name, owner service, sensitivity tier, and aliases. 
 *
 * Pure logic — no filesystem, no network, no external dependencies. Callers
 * read the registry file and the diff lines, then pass them in.
 *
 * Related: Stackbilt-dev/charter#69 — typed data access policy umbrella.
 */

// ============================================================================
// Types
// ============================================================================

/** Access-control tier attached to each registered concept. */
export type OntologySensitivityTier =
  | 'public'
  | 'service_internal'
  | 'cross_service_rpc'
  | 'pii_scoped'
  | 'billing_critical'
  | 'secrets';

/** A single registered business concept from the data-registry YAML. */
export interface OntologyConcept {
  /** Canonical name, e.g. 'tenant', 'subscription', 'quota' */
  name: string;
  /** Owning service, e.g. 'edge-auth' */
  owner: string;
  /** D1 table name (or null for derived concepts) */
  table: string | null;
  /** Sensitivity tier controlling access patterns */
  sensitivity: OntologySensitivityTier;
  /** Human-readable definition */
  definition: string;
  /** Non-canonical synonyms that refer to the same concept */
  aliases: string[];
  /** RPC method on the owning service, if the concept is exposed via RPC */
  rpcMethod?: string;
  /** MCP tool name for the owning service, if applicable */
  mcpTool?: string;
}

/** Parsed registry plus the token indexes used for diff matching. */
export interface OntologyRegistry {
  /** Map of canonical name → concept */
  concepts: Map<string, OntologyConcept>;
  /** Index of alias (normalized via normalizeToken) → canonical name */
  aliasIndex: Map<string, string>;
  /** Set of all known alias tokens (normalized) */
  aliasTokens: Set<string>;
  /** Set of all canonical name tokens (normalized) */
  canonicalTokens: Set<string>;
}

/** One line of the diff under inspection. */
export interface OntologyChangedLine {
  /** Path of the changed file, as reported by the diff source. */
  file: string;
  // Line number within the file — presumably the post-change (new-file)
  // number; the checker passes it through verbatim. TODO(review): confirm
  // against the diff extractor that populates this.
  line: number;
  /** Line content (without any diff +/- prefix). */
  text: string;
}

/** A policy finding produced by the ontology check. */
export interface OntologyViolation {
  type: 'NON_CANONICAL_ALIAS' | 'REGISTRY_PARSE_ERROR';
  severity: 'INFO' | 'WARN' | 'FAIL';
  /** The offending (normalized) token as found in the diff. */
  identifier: string;
  /** Canonical concept name the identifier resolves to, when known. */
  canonical?: string;
  owner?: string;
  sensitivity?: OntologySensitivityTier;
  file?: string;
  line?: number;
  /** Human-readable explanation, suitable for CLI output. */
  message: string;
}

/** Informational record of a registered concept referenced in the diff. */
export interface OntologyReference {
  /** The (normalized) token as found in the diff. */
  identifier: string;
  /** Canonical concept name the token resolves to. */
  canonical: string;
  owner: string;
  sensitivity: OntologySensitivityTier;
  /** True when the token matched an alias rather than the canonical name. */
  isAlias: boolean;
  file: string;
  line: number;
}

export interface OntologyCheckResult {
  /** Whether the check passed overall (no WARN or FAIL violations) */
  passed: boolean;
  /** All registered-concept references found in the diff (informational) */
  references: OntologyReference[];
  /** Violations (alias usage, etc.) */
  violations: OntologyViolation[];
  /** Summary counts by canonical name */
  referencedConcepts: Map<string, number>;
}

// ============================================================================
// Registry Loading
// ============================================================================

/**
 * Normalize an identifier for token matching: lowercased, with underscores,
 * hyphens, and whitespace removed. Other punctuation is left untouched.
 *
 * Examples:
 *   'tenantId'  → 'tenantid'
 *   'tenant_id' → 'tenantid'
 *   'TENANTS'   → 'tenants'
 *   'api key'   → 'apikey'
 */
export function normalizeToken(raw: string): string {
  return raw.toLowerCase().replace(/[_\s\-]/g, '');
}

/**
 * Parse a data-registry YAML file (the format used by
 * stackbilt_llc/policies/data-registry.yaml) into an OntologyRegistry.
 *
 * This is a minimal YAML subset parser tailored to the registry format.
 * It does NOT support: anchors, multi-line strings, complex flow mappings,
 * quoted strings, or tags.
For the specific registry shape it handles: + * - Comments (`# ...`) + * - Nested scalar maps (2-space indent) + * - Inline flow sequences (`[a, b, c]`) + * - Bare string values + * + * @throws Error if the registry structure is malformed + */ +export function parseOntologyRegistry(yamlText: string): OntologyRegistry { + const lines = yamlText.split(/\r?\n/); + const concepts = new Map(); + const aliasIndex = new Map(); + const aliasTokens = new Set(); + const canonicalTokens = new Set(); + + let inConcepts = false; + let currentConceptName: string | null = null; + let currentConcept: Partial | null = null; + + const flushConcept = (): void => { + if (currentConceptName && currentConcept && currentConcept.owner && currentConcept.sensitivity) { + const concept: OntologyConcept = { + name: currentConceptName, + owner: currentConcept.owner, + table: currentConcept.table ?? null, + sensitivity: currentConcept.sensitivity as OntologySensitivityTier, + definition: currentConcept.definition ?? '', + aliases: currentConcept.aliases ?? [], + rpcMethod: currentConcept.rpcMethod, + mcpTool: currentConcept.mcpTool, + }; + concepts.set(currentConceptName, concept); + canonicalTokens.add(normalizeToken(currentConceptName)); + for (const alias of concept.aliases) { + const token = normalizeToken(alias); + if (token.length > 0) { + aliasTokens.add(token); + aliasIndex.set(token, currentConceptName); + } + } + } + }; + + for (let i = 0; i < lines.length; i++) { + const rawLine = lines[i]; + // Strip comments + const commentIdx = rawLine.indexOf('#'); + const line = commentIdx >= 0 ? rawLine.slice(0, commentIdx) : rawLine; + if (!line.trim()) continue; + + // Top-level 'concepts:' marker + if (/^concepts:\s*$/.test(line)) { + inConcepts = true; + continue; + } + if (!inConcepts) continue; + + // Concept name at 2-space indent followed by a colon (e.g. 
' tenant:') + const conceptHeader = line.match(/^ {2}([a-zA-Z_][a-zA-Z0-9_]*):\s*$/); + if (conceptHeader) { + flushConcept(); + currentConceptName = conceptHeader[1]; + currentConcept = {}; + continue; + } + + // Field at 4-space indent (e.g. ' owner: edge-auth') + const fieldMatch = line.match(/^ {4}([a-zA-Z_][a-zA-Z0-9_]*):\s*(.*)$/); + if (fieldMatch && currentConcept) { + const [, key, rawValue] = fieldMatch; + const value = rawValue.trim(); + + switch (key) { + case 'owner': + currentConcept.owner = value; + break; + case 'table': + currentConcept.table = value === 'null' || value === '' ? null : value; + break; + case 'sensitivity': + currentConcept.sensitivity = value as OntologySensitivityTier; + break; + case 'definition': + currentConcept.definition = value; + break; + case 'rpc_method': + currentConcept.rpcMethod = value; + break; + case 'mcp_tool': + currentConcept.mcpTool = value; + break; + case 'aliases': + currentConcept.aliases = parseInlineFlowSequence(value); + break; + // Silently ignore unknown fields for forward compatibility + } + } + } + + flushConcept(); + + if (concepts.size === 0) { + throw new Error('Ontology registry parse error: no concepts found. Expected top-level "concepts:" key with indented concept entries.'); + } + + return { concepts, aliasIndex, aliasTokens, canonicalTokens }; +} + +/** + * Parse an inline YAML flow-sequence literal: [a, b, c] → ['a', 'b', 'c']. + * Handles bare strings with spaces and underscores but not quoted strings. 
 */
export function parseInlineFlowSequence(raw: string): string[] {
  const trimmed = raw.trim();
  // Anything that is not a bracketed [...] literal parses as an empty list.
  if (!trimmed.startsWith('[') || !trimmed.endsWith(']')) {
    return [];
  }
  const inner = trimmed.slice(1, -1);
  if (!inner.trim()) return [];
  // Split on commas; items are trimmed, and empties (e.g. from a trailing
  // comma) are dropped.
  return inner
    .split(',')
    .map(s => s.trim())
    .filter(s => s.length > 0);
}

// ============================================================================
// Diff Checking
// ============================================================================

/**
 * Strip common comment syntax from a line before token extraction.
 * Supports:
 *  - `// ...` (JS/TS/C)
 *  - `# ...` (YAML/Python/shell/TOML)
 *  - `-- ...` (SQL)
 *  - Inline `/* ... *\/` (JS/C)
 *  - JSDoc/block-comment interior lines starting with ` * ` or `/** `
 *
 * Also strips string literals (single, double, backtick-quoted) to avoid
 * matching business terms that happen to appear in user-facing copy.
 *
 * Note: multi-line block comments without a leading `*` on each line are
 * not fully handled — we process lines independently, so content between
 * an opening `/*` and closing `*\/` on separate lines without interior
 * leading stars will still be tokenized. In practice this is rare since
 * most block-comment conventions (JSDoc, TSDoc, JavaDoc) prefix continuation
 * lines with `*`.
 *
 * NOTE(review): comment stripping runs BEFORE string stripping below, so a
 * `//`, `# `, or `-- ` sequence INSIDE a string literal is treated as a
 * comment start and truncates the line. Acceptable for this heuristic
 * scanner, but worth knowing when reading surprising results.
 */
export function stripCommentsAndStrings(line: string): string {
  // JSDoc/block-comment interior line: leading whitespace + `*` + content.
  // Also catches the `/**` opening line. Matches `\t* text`, ` * text`,
  // `/** text`, and similar patterns. Does not match bare `*`
  // (multiplication), because the star must be the first non-whitespace
  // character AND be followed by whitespace or end-of-line.
  if (/^\s*(?:\/\*+|\*+)(?:\s|$)/.test(line)) {
    // Keep any non-comment content after a closing */ on the same line;
    // otherwise the whole line is comment and collapses to ''.
    const closeIdx = line.indexOf('*/');
    return closeIdx >= 0 ? line.slice(closeIdx + 2) : '';
  }

  // Remove inline block comments /* ... */
  let result = line.replace(/\/\*[^*]*\*+(?:[^/*][^*]*\*+)*\//g, ' ');
  // Remove trailing // ... line comments (must not be inside a URL like http://)
  result = result.replace(/(^|[^:])\/\/.*$/, '$1');
  // Remove trailing # ... comments (YAML/Python/shell/TOML).
  // Guarded (a space or EOL must follow the '#') so we don't strip
  // `#region` markers or `#include` directives.
  result = result.replace(/(^|\s)#(?:\s|$).*$/, '$1');
  // Remove trailing -- ... comments (SQL); the required trailing space
  // keeps decrement operators like `x--;` intact.
  result = result.replace(/(^|\s)--\s.*$/, '$1');
  // Remove string literals to avoid matching words inside user-facing copy.
  result = result
    .replace(/"(?:[^"\\]|\\.)*"/g, ' ')
    .replace(/'(?:[^'\\]|\\.)*'/g, ' ')
    .replace(/`(?:[^`\\]|\\.)*`/g, ' ');
  return result;
}

/**
 * Extract candidate identifier tokens from a line of source code.
 * Returns normalized tokens (lowercased, with underscores/hyphens/whitespace
 * removed — see normalizeToken) suitable for matching against the registry's
 * alias/canonical indexes.
 *
 * Strips comments and string literals first so that natural-language
 * prose (comments, user-facing strings) doesn't trigger false positives
 * on alias words that appear in English sentences (e.g. "usage" or
 * "account" in a TODO comment).
 *
 * The extractor is language-agnostic: a single regex yields every
 * word-like token in the remaining code. This works across TypeScript,
 * JavaScript, SQL, YAML, and markdown source.
 */
export function extractIdentifiersFromLine(line: string): string[] {
  const stripped = stripCommentsAndStrings(line);
  const matches = stripped.match(/[a-zA-Z_][a-zA-Z0-9_]*/g);
  if (!matches) return [];
  return matches.map(normalizeToken);
}

/**
 * Check a set of changed lines against the ontology registry. Returns
 * registered-concept references (informational) and any alias violations
 * (non-canonical usage of a known alias in new code).
 *
 * Caller is responsible for filtering the diff to NEW lines only if that's
 * the desired scope. This function treats every input line as in-scope.
+ * + * `options.ignoreAliasViolations` suppresses ALL alias warnings (reports + * references but produces no violations). `options.ignoredAliasTokens` + * suppresses specific alias tokens — normalized form, same as the + * aliasTokens set. Useful for repo-local overrides of generic aliases + * (token, key, usage) that collide with common programming vocabulary. + */ +export function checkOntologyDiff( + changedLines: OntologyChangedLine[], + registry: OntologyRegistry, + options: { ignoreAliasViolations?: boolean; ignoredAliasTokens?: Set } = {} +): OntologyCheckResult { + const references: OntologyReference[] = []; + const violations: OntologyViolation[] = []; + const referencedConcepts = new Map(); + + for (const line of changedLines) { + const tokens = extractIdentifiersFromLine(line.text); + const seenOnLine = new Set(); + + for (const token of tokens) { + // Avoid double-reporting if the same token appears twice on a line + if (seenOnLine.has(token)) continue; + seenOnLine.add(token); + + // Canonical match? Report as clean reference. + if (registry.canonicalTokens.has(token)) { + const canonicalName = findCanonicalByToken(token, registry); + if (canonicalName) { + const concept = registry.concepts.get(canonicalName)!; + references.push({ + identifier: token, + canonical: canonicalName, + owner: concept.owner, + sensitivity: concept.sensitivity, + isAlias: false, + file: line.file, + line: line.line, + }); + referencedConcepts.set(canonicalName, (referencedConcepts.get(canonicalName) ?? 0) + 1); + } + continue; + } + + // Alias match? Report as reference AND violation unless suppressed. + if (registry.aliasTokens.has(token)) { + const canonicalName = registry.aliasIndex.get(token); + if (!canonicalName) continue; + const concept = registry.concepts.get(canonicalName); + if (!concept) continue; + + // An alias that is also the canonical name's own lowercase form + // doesn't count as a violation — it's the canonical itself. 
+ if (normalizeToken(canonicalName) === token) continue; + + // Per-repo ignore list: suppress noisy aliases that collide with + // common programming vocabulary (token, key, usage, etc.) + const isIgnored = options.ignoredAliasTokens?.has(token) ?? false; + + if (!isIgnored) { + references.push({ + identifier: token, + canonical: canonicalName, + owner: concept.owner, + sensitivity: concept.sensitivity, + isAlias: true, + file: line.file, + line: line.line, + }); + referencedConcepts.set(canonicalName, (referencedConcepts.get(canonicalName) ?? 0) + 1); + } + + if (!options.ignoreAliasViolations && !isIgnored) { + violations.push({ + type: 'NON_CANONICAL_ALIAS', + severity: 'WARN', + identifier: token, + canonical: canonicalName, + owner: concept.owner, + sensitivity: concept.sensitivity, + file: line.file, + line: line.line, + message: `Uses alias '${token}' for concept '${canonicalName}' (owned by ${concept.owner}, ${concept.sensitivity}). Prefer the canonical form in new code; aliases are acceptable in user-facing copy only.`, + }); + } + } + } + } + + const passed = violations.every(v => v.severity !== 'WARN' && v.severity !== 'FAIL'); + + return { + passed, + references, + violations, + referencedConcepts, + }; +} + +/** Walk the concepts map to find the canonical name whose normalized form matches. */ +function findCanonicalByToken(token: string, registry: OntologyRegistry): string | null { + for (const [name] of registry.concepts) { + if (normalizeToken(name) === token) return name; + } + return null; +}