From 78eb6768e30092e2eea77bd685f176af0135a353 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Fri, 5 Jun 2026 17:07:21 +0100 Subject: [PATCH] fix(incremental): use --name-status -M to detect renames and prune old paths (#366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Incremental `/understand` runs build the changed-file list via `git diff ..HEAD --name-only`. Git's default rename detection collapses a rename into a single new-path line, so the prune step never sees the old path. The pre-rename node (and every edge that touched it) survives in the graph as an orphan after the rename commit is analyzed. Switch to `git diff ..HEAD --name-status -M` and route each status: - `M`, `A`, `T`, `U` → analyze - `D` → prune-only - `R\told\tnew` → prune `old`, analyze `new` - `C\told\tnew` → analyze `new` only (source survives) Parsing happens in a new shared helper, `parse-git-changes.mjs`, which writes two newline-delimited files (`changed-files.txt`, `prune-files.txt`) that SKILL.md feeds to `compute-batches.mjs --changed-files=…` and to the prune step. The prune step now takes the union of both sets so renamed/deleted files no longer leave stale nodes + edges behind. Adds `tests/skill/understand/test_parse_git_changes.test.mjs` covering the parser's branches (M, A, D, R, C, T, U, unknown status, empty input) plus a CLI end-to-end check that the two output files contain the expected sorted paths. No real git invocation needed — the fixtures are raw `--name-status` strings. Addresses Bug 1 of 3 from #366. Bugs 2 (inbound-edge pruning) and 3 (importMap recovery) are covered by separate follow-up PRs. Refs #366 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_parse_git_changes.test.mjs | 198 ++++++++++++++++++ .../skills/understand/SKILL.md | 46 ++-- .../skills/understand/parse-git-changes.mjs | 146 +++++++++++++ 3 files changed, 377 insertions(+), 13 deletions(-) create mode 100644 tests/skill/understand/test_parse_git_changes.test.mjs create mode 100644 understand-anything-plugin/skills/understand/parse-git-changes.mjs diff --git a/tests/skill/understand/test_parse_git_changes.test.mjs b/tests/skill/understand/test_parse_git_changes.test.mjs new file mode 100644 index 00000000..d2376e07 --- /dev/null +++ b/tests/skill/understand/test_parse_git_changes.test.mjs @@ -0,0 +1,198 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { mkdtempSync, writeFileSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, dirname, resolve } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +import { parseGitNameStatus } + from '../../../understand-anything-plugin/skills/understand/parse-git-changes.mjs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const SCRIPT = resolve( + __dirname, + '../../../understand-anything-plugin/skills/understand/parse-git-changes.mjs', +); + +describe('parseGitNameStatus — module API', () => { + it('returns empty sets for empty input', () => { + const { changedSet, pruneSet } = parseGitNameStatus(''); + expect([...changedSet]).toEqual([]); + expect([...pruneSet]).toEqual([]); + }); + + it('returns empty sets for null/undefined input', () => { + expect(parseGitNameStatus(null).changedSet.size).toBe(0); + expect(parseGitNameStatus(undefined).pruneSet.size).toBe(0); + }); + + it('classifies M / A as changed-only', () => { + const diff = ['M\tsrc/a.ts', 'A\tsrc/b.ts'].join('\n'); + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet].sort()).toEqual(['src/a.ts', 'src/b.ts']); + expect([...pruneSet]).toEqual([]); + }); + + it('classifies D as prune-only (no analyze)', () => { + const diff = 'D\tsrc/gone.ts'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual([]); + expect([...pruneSet]).toEqual(['src/gone.ts']); + }); + + it('classifies R as BOTH prune-old AND analyze-new (the Bug 1 fix)', () => { + // The exact repro from issue #366: `git mv docs/a.md docs/b.md`. + const diff = 'R100\tdocs/a.md\tdocs/b.md'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['docs/b.md']); + expect([...pruneSet]).toEqual(['docs/a.md']); + }); + + it('classifies R with a non-100 similarity score (rename + small edit)', () => { + // Renamed AND modified — git emits R with score < 100. + const diff = 'R087\tsrc/old/util.ts\tsrc/new/util.ts'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['src/new/util.ts']); + expect([...pruneSet]).toEqual(['src/old/util.ts']); + }); + + it('classifies C as analyze-new only (source file still exists)', () => { + // Copy detection: original survives, must NOT be pruned. + const diff = 'C50\tsrc/template.ts\tsrc/copy.ts'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['src/copy.ts']); + expect([...pruneSet]).toEqual([]); + }); + + it('classifies T (type change) as changed-only', () => { + // T appears when a file flips between regular file / symlink / submodule. + const diff = 'T\tsrc/link.ts'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['src/link.ts']); + expect([...pruneSet]).toEqual([]); + }); + + it('classifies U (unmerged) as changed so user must resolve it', () => { + const diff = 'U\tsrc/conflict.ts'; + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['src/conflict.ts']); + expect([...pruneSet]).toEqual([]); + }); + + it('skips blank lines silently', () => { + const diff = ['', 'M\tsrc/a.ts', '', '', 'A\tsrc/b.ts', ''].join('\n'); + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet].sort()).toEqual(['src/a.ts', 'src/b.ts']); + expect([...pruneSet]).toEqual([]); + }); + + it('handles a realistic mixed diff (M + A + D + R + C)', () => { + const diff = [ + 'M\tpackages/core/src/foo.ts', + 'A\tpackages/core/src/new.ts', + 'D\tpackages/core/src/dead.ts', + 'R100\tdocs/a.md\tdocs/b.md', + 'R087\tsrc/util.ts\tsrc/utils/index.ts', + 'C75\tsrc/template.ts\tsrc/copy.ts', + ].join('\n'); + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet].sort()).toEqual([ + 'docs/b.md', + 'packages/core/src/foo.ts', + 'packages/core/src/new.ts', + 'src/copy.ts', + 'src/utils/index.ts', + ]); + expect([...pruneSet].sort()).toEqual([ + 'docs/a.md', + 'packages/core/src/dead.ts', + 'src/util.ts', + ]); + // No path may appear in BOTH sets — they MUST be disjoint, otherwise the + // prune step would delete the freshly-re-analyzed nodes. + for (const p of changedSet) expect(pruneSet.has(p)).toBe(false); + }); + + it('emits a warning to stderr for unknown status codes (does not throw)', () => { + // Use a status code git would never emit today (e.g. 'X' = unknown). + // We don't actually capture stderr from inside the test, but we can at + // least assert the parser is robust enough to skip and continue. + const diff = ['X\tsomething/weird.ts', 'M\tsrc/a.ts'].join('\n'); + const { changedSet, pruneSet } = parseGitNameStatus(diff); + expect([...changedSet]).toEqual(['src/a.ts']); + expect([...pruneSet]).toEqual([]); + }); +}); + +describe('parse-git-changes.mjs — CLI', () => { + let tmp; + + afterEach(() => { + if (tmp) rmSync(tmp, { recursive: true, force: true }); + }); + + it('writes sorted changed + prune sets to the two output files', () => { + tmp = mkdtempSync(join(tmpdir(), 'ua-pgc-')); + const diffPath = join(tmp, 'diff.txt'); + const changedPath = join(tmp, 'changed.txt'); + const prunePath = join(tmp, 'prune.txt'); + + // Mixed diff covering every branch the parser handles. + const diff = [ + 'M\tsrc/a.ts', + 'A\tsrc/b.ts', + 'D\tsrc/dead.ts', + 'R100\tdocs/old.md\tdocs/new.md', + 'C50\tsrc/template.ts\tsrc/copy.ts', + ].join('\n'); + writeFileSync(diffPath, diff); + + const r = spawnSync('node', [SCRIPT, diffPath, changedPath, prunePath], { + encoding: 'utf-8', + }); + expect(r.status).toBe(0); + // Stderr summary line is informational, not a Warning: + expect(r.stderr).toMatch(/parse-git-changes: 4 changed, 2 prune-only/); + + // Outputs are sorted, newline-terminated. + expect(readFileSync(changedPath, 'utf-8')).toBe( + ['docs/new.md', 'src/a.ts', 'src/b.ts', 'src/copy.ts'].join('\n') + '\n', + ); + expect(readFileSync(prunePath, 'utf-8')).toBe( + ['docs/old.md', 'src/dead.ts'].join('\n') + '\n', + ); + }); + + it('CLI handles empty diff (no commits between base..HEAD)', () => { + tmp = mkdtempSync(join(tmpdir(), 'ua-pgc-empty-')); + const diffPath = join(tmp, 'diff.txt'); + const changedPath = join(tmp, 'changed.txt'); + const prunePath = join(tmp, 'prune.txt'); + writeFileSync(diffPath, ''); + + const r = spawnSync('node', [SCRIPT, diffPath, changedPath, prunePath], { + encoding: 'utf-8', + }); + expect(r.status).toBe(0); + // Empty outputs — empty strings, not "missing files". + expect(readFileSync(changedPath, 'utf-8')).toBe(''); + expect(readFileSync(prunePath, 'utf-8')).toBe(''); + }); + + it('CLI exits non-zero with no args', () => { + const r = spawnSync('node', [SCRIPT], { encoding: 'utf-8' }); + expect(r.status).toBe(1); + expect(r.stderr).toMatch(/Usage: node parse-git-changes\.mjs/); + }); + + it('CLI exits non-zero when diff file is missing', () => { + tmp = mkdtempSync(join(tmpdir(), 'ua-pgc-miss-')); + const r = spawnSync( + 'node', + [SCRIPT, join(tmp, 'no-such.txt'), join(tmp, 'c.txt'), join(tmp, 'p.txt')], + { encoding: 'utf-8' }, + ); + expect(r.status).toBe(1); + expect(r.stderr).toMatch(/cannot read diff input/); + }); +}); diff --git a/understand-anything-plugin/skills/understand/SKILL.md b/understand-anything-plugin/skills/understand/SKILL.md index 35c0b112..4954a7ab 100644 --- a/understand-anything-plugin/skills/understand/SKILL.md +++ b/understand-anything-plugin/skills/understand/SKILL.md @@ -173,11 +173,25 @@ Determine whether to run a full analysis or incremental update. **Review-only path:** Copy the existing `knowledge-graph.json` to `$PROJECT_ROOT/.understand-anything/intermediate/assembled-graph.json`, then jump directly to Phase 6 step 3. - For incremental updates, get the changed file list: + For incremental updates, get the changed file list using `--name-status -M` so renames are reported as **both** the old and new path (a plain `--name-only` only reports the new path, leaving the pre-rename node + every edge that referenced it orphaned in the graph — issue #366, Bug 1): + ```bash - git diff ..HEAD --name-only + git diff ..HEAD --name-status -M \ + > $PROJECT_ROOT/.understand-anything/tmp/git-changes.txt + node /parse-git-changes.mjs \ + $PROJECT_ROOT/.understand-anything/tmp/git-changes.txt \ + $PROJECT_ROOT/.understand-anything/tmp/changed-files.txt \ + $PROJECT_ROOT/.understand-anything/tmp/prune-files.txt ``` - If this returns no files, report "Graph is up to date" and STOP. + + The parser splits each `--name-status` line into two disjoint sets: + + - `changed-files.txt` — newline-delimited paths to **analyze** (statuses `M`, `A`, `T`, `U`, plus the new path of `R` renames and `C` copies). + - `prune-files.txt` — newline-delimited paths to **prune-only** (statuses `D`, plus the old path of `R` renames). These are never re-analyzed; their nodes and edges are dropped from the existing graph so renamed/deleted files don't leave orphans. + + Copies (`C`) intentionally do **not** prune the source path because the source file still exists on disk. + + If both files are empty, report "Graph is up to date" and STOP. (mkdir `$PROJECT_ROOT/.understand-anything/tmp` first if it does not already exist — Phase 0 step 3 creates it as part of normal flow.) 8. **Collect project context for subagent injection:** - Read `README.md` (or `README.rst`, `readme.md`) from `$PROJECT_ROOT` if it exists. Store as `$README_CONTENT` (first 3000 characters). @@ -362,12 +376,14 @@ Include the script's warnings in `$PHASE_WARNINGS` for the reviewer. ### Incremental update path -Write the changed-files list (one path per line) to a temp file: -```bash -git diff ..HEAD --name-only > $PROJECT_ROOT/.understand-anything/tmp/changed-files.txt -``` +Reuse the two files produced by Phase 0 step 7's `parse-git-changes.mjs`: + +- `$PROJECT_ROOT/.understand-anything/tmp/changed-files.txt` — paths to re-analyze (`M`, `A`, `T`, `U`, plus `R` new paths and `C` new paths). +- `$PROJECT_ROOT/.understand-anything/tmp/prune-files.txt` — paths to prune-only (`D` deletions plus `R` old paths). These are NOT re-analyzed; their existing nodes and edges are removed from the graph so renamed/deleted files don't leave orphans. + +If Phase 0 did not write these files yet (e.g. when this path is reached out of order), re-run the parser now using the same commands as Phase 0 step 7. -Run compute-batches with `--changed-files`: +Run compute-batches with `--changed-files` (only the changed-set — the prune-only set has nothing to analyze): ```bash node /compute-batches.mjs $PROJECT_ROOT \ --changed-files=$PROJECT_ROOT/.understand-anything/tmp/changed-files.txt @@ -377,15 +393,19 @@ This produces a `batches.json` that contains only batches with changed files, bu Then dispatch file-analyzer subagents per the same template as the full path. -After batches complete: -1. Remove old nodes whose `filePath` matches any changed file from the existing graph -2. Remove old edges whose `source` or `target` references a removed node -3. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory -4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files: +After batches complete, **prune using the union of both sets** (`changed-files.txt` ∪ `prune-files.txt`) so renamed/deleted files no longer appear in the graph: + +1. Build the prune-path set: read `$PROJECT_ROOT/.understand-anything/tmp/changed-files.txt` AND `$PROJECT_ROOT/.understand-anything/tmp/prune-files.txt`, take the union. +2. Remove old nodes whose `filePath` matches any path in the prune-path set from the existing graph. +3. Remove old edges whose `source` or `target` references a removed node. +4. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory. +5. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files: ```bash python /merge-batch-graphs.py $PROJECT_ROOT ``` +**Why the union?** Re-analyzed files (`changed-files.txt`) need their old nodes/edges removed before the fresh batches add the new ones, otherwise stale function/class nodes survive alongside the rewritten file. Prune-only files (`prune-files.txt`) need removal without re-analysis — there is no replacement node to merge in. + --- ## Phase 3 — ASSEMBLE REVIEW diff --git a/understand-anything-plugin/skills/understand/parse-git-changes.mjs b/understand-anything-plugin/skills/understand/parse-git-changes.mjs new file mode 100644 index 00000000..3bb3ab29 --- /dev/null +++ b/understand-anything-plugin/skills/understand/parse-git-changes.mjs @@ -0,0 +1,146 @@ +#!/usr/bin/env node +/** + * parse-git-changes.mjs — parse `git diff ..HEAD --name-status -M` + * output into two disjoint sets: + * + * - changedSet — files to re-analyze (M, A, R-new, C-new) + * - pruneSet — files whose nodes/edges must be dropped from the existing + * graph (D, R-old). Subset of operations that drop content. + * + * Why `-M` (rename detection): + * Without it, `git mv old new` is reported only as the new path on a single + * line, so the existing graph keeps a stale node + edges keyed off `old` + * even after re-analysis adds `new` (issue #366, Bug 1). + * + * With `-M`, the same rename appears as: + * R\t\t + * which lets us prune `old` AND analyze `new`. + * + * Copy (`C\t\t`) is treated as "analyze new, do NOT prune + * old" — the source file still exists, so its nodes must survive. + * + * Modes: + * - CLI: `parse-git-changes.mjs ` + * Reads the diff text from , writes newline-delimited + * paths to and . + * - Importable: `parseGitNameStatus(rawDiff)` returns + * `{ changedSet: Set, pruneSet: Set }`. + */ + +import { readFileSync, writeFileSync, realpathSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; + +/** + * Parse the raw output of `git diff ..HEAD --name-status -M`. + * + * Each line is tab-separated: + * M\t → modified, add to changedSet + * A\t → added, add to changedSet + * D\t → deleted, add to pruneSet (no analysis) + * T\t → type change (e.g. file ↔ symlink), + * treat as modified — add to changedSet + * R\t\t → rename, prune old, analyze new + * C\t\t → copy, analyze new only (old still exists) + * U\t → unmerged, add to changedSet so the user + * is forced to resolve it during analysis + * X\t... → "unknown" status; surface a warning and + * skip rather than silently dropping data + * + * Blank lines are ignored. Unknown statuses are skipped (with a stderr + * warning) rather than dropped silently so regressions surface. + */ +export function parseGitNameStatus(rawDiff) { + const changedSet = new Set(); + const pruneSet = new Set(); + if (!rawDiff) return { changedSet, pruneSet }; + + const lines = rawDiff.split('\n'); + for (const line of lines) { + if (!line.trim()) continue; + const parts = line.split('\t'); + const status = parts[0]; + if (!status) continue; + + // First char drives semantics; rename/copy carry a similarity score + // (e.g. R100, C75) we don't need for routing. + const op = status[0]; + + if (op === 'M' || op === 'A' || op === 'T' || op === 'U') { + if (parts[1]) changedSet.add(parts[1]); + } else if (op === 'D') { + if (parts[1]) pruneSet.add(parts[1]); + } else if (op === 'R') { + // R\t\t — prune old, analyze new + if (parts[1]) pruneSet.add(parts[1]); + if (parts[2]) changedSet.add(parts[2]); + } else if (op === 'C') { + // C\t\t — copy: source still exists, don't prune. + if (parts[2]) changedSet.add(parts[2]); + } else { + // Unknown / future status (e.g. X = "unknown changes"). Surface it + // rather than silently dropping the line. + process.stderr.write( + `Warning: parse-git-changes: unknown git --name-status code '${status}' ` + + `on line '${line}' — skipped\n`, + ); + } + } + return { changedSet, pruneSet }; +} + +function writeLines(path, items) { + // Sort for determinism — tests + repro runs need byte-identical output. + const sorted = [...items].sort(); + writeFileSync(path, sorted.length ? sorted.join('\n') + '\n' : '', 'utf-8'); +} + +async function main() { + const [diffPath, changedOutPath, pruneOutPath] = process.argv.slice(2); + if (!diffPath || !changedOutPath || !pruneOutPath) { + process.stderr.write( + 'Usage: node parse-git-changes.mjs \n', + ); + process.exit(1); + } + + let raw; + try { + raw = readFileSync(diffPath, 'utf-8'); + } catch (err) { + process.stderr.write( + `Error: parse-git-changes: cannot read diff input at ${diffPath} (${err.message})\n`, + ); + process.exit(1); + } + + const { changedSet, pruneSet } = parseGitNameStatus(raw); + writeLines(changedOutPath, changedSet); + writeLines(pruneOutPath, pruneSet); + + process.stderr.write( + `parse-git-changes: ${changedSet.size} changed, ${pruneSet.size} prune-only\n`, + ); +} + +// Run only when executed directly as a CLI (see compute-batches.mjs for the +// canonicalize-via-realpath rationale — symlinked plugin caches break a raw +// equality check on import.meta.url vs process.argv[1]). +function isCliEntry() { + if (!process.argv[1]) return false; + try { + const modulePath = realpathSync(fileURLToPath(import.meta.url)); + const argvPath = realpathSync(process.argv[1]); + return modulePath === argvPath; + } catch { + return false; + } +} + +if (isCliEntry()) { + try { + await main(); + } catch (err) { + process.stderr.write(`parse-git-changes.mjs failed: ${err.message}\n${err.stack}\n`); + process.exit(1); + } +}