From 9895f1bec27ddf58ffc9d5b87d09b122839ec36d Mon Sep 17 00:00:00 2001 From: tpateeq Date: Sat, 4 Jul 2026 23:31:57 +0530 Subject: [PATCH 1/2] fix(detect): source-content fingerprint for Office sidecars + cached word counts (#1649, #1656) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit convert_office_file now fingerprints the SOURCE by its raw bytes (md5, which does NOT unzip/parse the OOXML container, so it stays cheap) and records it in the sidecar header. On a later run it re-parses and rewrites the sidecar only when that fingerprint differs — so an edited .docx/.xlsx re-enters --update (#1649), while an unchanged one is never re-parsed and its mtime never churns (#1656, preserving the #1226 no-churn guarantee). The single fingerprint resolves both issues, which previously pulled in opposite directions (the #1226 early-return skipped the write but still parsed every run, and keyed on the source PATH, not its CONTENT, so edits were silently frozen). Word counts are now cached in the manifest entry and reused for unchanged files, so PDFs are no longer re-parsed for their word count on every incremental run (#1656). save_manifest seeds/refreshes the count (reusing the previous value when the content hash is unchanged); detect() reuses it when the file's mtime is unchanged. total_words stays correct, so the benchmark command's corpus_words keeps working. Legacy manifests without word_count recompute once (backward compatible via the existing dict passthrough in _normalise_entry). Tests: edited/unchanged .docx and .xlsx via convert_office_file; legacy sidecar upgrade; edited Office file re-entering detect_incremental; and an unchanged PDF not re-parsed on a second incremental run. Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/detect.py | 123 ++++++++++++++++++++++++++++++------ tests/test_detect.py | 146 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+), 20 deletions(-) diff --git a/graphify/detect.py b/graphify/detect.py index 0e1c4ba30..b41ca8610 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -601,6 +601,26 @@ def _edge(src: str, tgt: str, relation: str) -> None: return {"nodes": nodes, "edges": edges} +# Sidecar header records the source's raw-byte fingerprint so a later run can +# tell whether the Office file changed without re-parsing it (#1649, #1656). +_SIDECAR_SOURCE_FP_RE = re.compile(r"source-md5:\s*([0-9a-f]+)") + + +def _read_sidecar_source_fingerprint(out_path: Path) -> str | None: + """Read the source fingerprint stored in an existing sidecar's header. + + Returns ``None`` for legacy sidecars written before the fingerprint was + added, so they are treated as needing a refresh on the next run. + """ + try: + with out_path.open("r", encoding="utf-8") as fh: + first_line = fh.readline() + except OSError: + return None + m = _SIDECAR_SOURCE_FP_RE.search(first_line) + return m.group(1) if m else None + + def convert_office_file(path: Path, out_dir: Path) -> Path | None: """Convert a .docx or .xlsx to a markdown sidecar in out_dir. @@ -608,14 +628,7 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None: or the required library is not installed. """ ext = path.suffix.lower() - if ext == ".docx": - text = docx_to_markdown(path) - elif ext == ".xlsx": - text = xlsx_to_markdown(path) - else: - return None - - if not text.strip(): + if ext not in (".docx", ".xlsx"): return None out_dir.mkdir(parents=True, exist_ok=True) @@ -630,13 +643,31 @@ def convert_office_file(path: Path, out_dir: Path) -> Path | None: normalized_path = unicodedata.normalize("NFC", str(path.resolve())) name_hash = hashlib.sha256(normalized_path.encode()).hexdigest()[:8] out_path = out_dir / f"{path.stem}_{name_hash}.md" - # Once the hash is stable the sidecar name is deterministic; skip re-writing - # an existing sidecar so an unchanged source never churns its mtime (which - # would still flag it as changed in detect_incremental). + + # Fingerprint the SOURCE by its raw bytes (md5). Reading the bytes does NOT + # unzip/parse the OOXML container, so this is cheap — that is the whole + # point. If a sidecar already exists and records the same fingerprint the + # source is unchanged: return it WITHOUT parsing (avoids the per-run + # re-parse of #1656) and WITHOUT rewriting (so an unchanged source never + # churns its mtime, which detect_incremental would otherwise flag as + # changed — #1226). If the fingerprint differs (or is missing, e.g. a legacy + # sidecar) the source was edited: re-parse and rewrite so the change flows + # through detect_incremental via the sidecar's new mtime/md5 (#1649). + source_fp = _md5_file(path) if out_path.exists(): - return out_path + if not source_fp or _read_sidecar_source_fingerprint(out_path) == source_fp: + return out_path + + if ext == ".docx": + text = docx_to_markdown(path) + else: + text = xlsx_to_markdown(path) + + if not text.strip(): + return None + out_path.write_text( - f"\n\n{text}", + f"\n\n{text}", encoding="utf-8", ) return out_path @@ -1015,7 +1046,14 @@ def _resolves_under_root(path: Path, root: Path) -> bool: return True -def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: bool | None = None, extra_excludes: list[str] | None = None) -> dict: +def detect( + root: Path, + *, + follow_symlinks: bool | None = None, + google_workspace: bool | None = None, + extra_excludes: list[str] | None = None, + prior_word_counts: dict[str, tuple[float, int]] | None = None, +) -> dict: root = root.resolve() if follow_symlinks is None: follow_symlinks = False @@ -1029,6 +1067,21 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: } total_words = 0 + # Word counting parses PDFs (extract_pdf_text) and reads sidecars on every + # file; re-doing it for unchanged files on each incremental run is pure waste + # (#1656). When the caller supplies the previous run's per-file counts, reuse + # the cached count for any file whose mtime is unchanged instead of parsing. + def _count_words(fp: Path) -> int: + if prior_word_counts is not None: + cached = prior_word_counts.get(str(fp)) + if cached is not None: + try: + if fp.stat().st_mtime == cached[0]: + return cached[1] + except OSError: + pass + return count_words(fp) + skipped_sensitive: list[str] = [] ignore_patterns = _load_graphifyignore(root) ignore_cache: dict[Path, bool] = {} # shared across all _is_ignored calls in this scan @@ -1133,7 +1186,7 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache): continue files[ftype].append(str(md_path)) - total_words += count_words(md_path) + total_words += _count_words(md_path) else: skipped_sensitive.append(str(p) + " [Google Workspace export produced no readable text]") continue @@ -1144,14 +1197,14 @@ def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: if _is_ignored(md_path, root, ignore_patterns, _cache=ignore_cache): continue files[ftype].append(str(md_path)) - total_words += count_words(md_path) + total_words += _count_words(md_path) else: # Conversion failed (library not installed) - skip with note skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]") continue files[ftype].append(str(p)) if ftype != FileType.VIDEO: - total_words += count_words(p) + total_words += _count_words(p) for ftype in files: files[ftype].sort() @@ -1323,6 +1376,9 @@ def _normalise_entry(entry): continue all_files = [f for file_list in files.values() for f in file_list] + # Video files are never word-counted (mirrors detect(), and avoids reading + # large media in as text). + video_set = set(files.get("video", [])) with ThreadPoolExecutor() as pool: raw = pool.map(_stat_and_hash, all_files) hashed: dict[str, tuple[float, str]] = { @@ -1334,6 +1390,7 @@ def _normalise_entry(entry): continue # file deleted between detect() and manifest write mtime, h = hashed[f] prev = _normalise_entry(existing.get(f, {})) or {} + content_unchanged = h == prev.get("ast_hash", "") entry: dict = {"mtime": mtime} if kind in ("ast", "both"): entry["ast_hash"] = h @@ -1343,7 +1400,17 @@ def _normalise_entry(entry): entry["semantic_hash"] = h else: # Preserve semantic_hash only when content is unchanged - entry["semantic_hash"] = prev.get("semantic_hash", "") if h == prev.get("ast_hash", "") else "" + entry["semantic_hash"] = prev.get("semantic_hash", "") if content_unchanged else "" + # Cache the word count so detect() can reuse it for unchanged files + # rather than re-parsing (esp. PDFs) on every incremental run (#1656). + # Reuse the previous count whenever the content hash is unchanged; only + # (re)parse a genuinely new or changed file. + if f in video_set: + entry["word_count"] = 0 + elif content_unchanged and isinstance(prev.get("word_count"), int): + entry["word_count"] = prev["word_count"] + else: + entry["word_count"] = count_words(Path(f)) manifest[f] = entry if root is not None: # Persist in portable form: forward-slash relative paths. Keys outside @@ -1386,11 +1453,27 @@ def detect_incremental( runs. ``None`` (default) does not follow symlinked directories; callers must opt in explicitly, and resolved targets outside the scan root are skipped. """ - full = detect(root, follow_symlinks=follow_symlinks, google_workspace=google_workspace, extra_excludes=extra_excludes) # Pass ``root`` so a manifest written with relative keys (post-#777) is # re-anchored to the absolute form the rest of this function compares - # against. Legacy absolute-keyed manifests pass through unchanged. + # against. Legacy absolute-keyed manifests pass through unchanged. Loaded + # before detect() so the cached per-file word counts can be handed down and + # reused for unchanged files (avoids re-parsing every PDF each run — #1656). manifest = load_manifest(manifest_path, root=root) + prior_word_counts: dict[str, tuple[float, int]] = {} + for key, entry in manifest.items(): + if isinstance(entry, dict): + wc = entry.get("word_count") + mt = entry.get("mtime") + if isinstance(wc, int) and isinstance(mt, (int, float)): + prior_word_counts[key] = (mt, wc) + + full = detect( + root, + follow_symlinks=follow_symlinks, + google_workspace=google_workspace, + extra_excludes=extra_excludes, + prior_word_counts=prior_word_counts, + ) if not manifest: # No previous run - treat everything as new diff --git a/tests/test_detect.py b/tests/test_detect.py index 458bb6a84..227892385 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -1538,3 +1538,149 @@ def test_convert_office_file_does_not_rewrite_existing_sidecar(tmp_path, monkeyp second = detect_mod.convert_office_file(src, out_dir) assert second == first assert second.stat().st_mtime_ns == mtime_before + + +def _flat(files: dict) -> list: + return [p for bucket in files.values() for p in bucket] + + +def test_convert_office_file_reparses_when_source_changes(tmp_path, monkeypatch): + """A source fingerprint in the sidecar header lets convert_office_file skip + re-parsing an unchanged .docx (#1656) yet re-parse + rewrite an edited one so + the change flows through detect_incremental (#1649).""" + calls = {"n": 0} + + def fake_docx(path): + calls["n"] += 1 + return path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx) + + out_dir = tmp_path / "converted" + src = tmp_path / "report.docx" + src.write_bytes(b"version one") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None + assert calls["n"] == 1 + assert "version one" in first.read_text(encoding="utf-8") + mtime_before = first.stat().st_mtime_ns + + # Source unchanged: fingerprint matches -> no re-parse, no rewrite (#1656/#1226). + second = detect_mod.convert_office_file(src, out_dir) + assert second == first + assert calls["n"] == 1 + assert second.stat().st_mtime_ns == mtime_before + + # Source edited: fingerprint differs -> re-parse + rewrite the sidecar (#1649). + src.write_bytes(b"version two, now considerably longer") + third = detect_mod.convert_office_file(src, out_dir) + assert third == first # deterministic sidecar name is unchanged + assert calls["n"] == 2 + assert "version two" in third.read_text(encoding="utf-8") + + +def test_convert_office_file_reparses_xlsx_on_edit(tmp_path, monkeypatch): + """Same source-change detection for .xlsx sources (#1649/#1656).""" + calls = {"n": 0} + + def fake_xlsx(path): + calls["n"] += 1 + return "sheet: " + path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "xlsx_to_markdown", fake_xlsx) + + out_dir = tmp_path / "converted" + src = tmp_path / "budget.xlsx" + src.write_bytes(b"q1") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None and calls["n"] == 1 + + detect_mod.convert_office_file(src, out_dir) # unchanged + assert calls["n"] == 1 + + src.write_bytes(b"q1 q2 q3 q4") # edited + detect_mod.convert_office_file(src, out_dir) + assert calls["n"] == 2 + + +def test_convert_office_file_upgrades_legacy_sidecar(tmp_path, monkeypatch): + """A sidecar written before source fingerprints existed (no source-md5 in the + header) is treated as stale and rewritten once, then reused thereafter.""" + monkeypatch.setattr(detect_mod, "docx_to_markdown", lambda p: "hello world") + + out_dir = tmp_path / "converted" + src = tmp_path / "legacy.docx" + src.write_bytes(b"body") + + # First conversion writes a fingerprinted sidecar. + sidecar = detect_mod.convert_office_file(src, out_dir) + assert sidecar is not None + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None + + # Simulate an old (pre-fingerprint) sidecar header and confirm a run refreshes it. + sidecar.write_text("\n\nhello world", encoding="utf-8") + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is None + refreshed = detect_mod.convert_office_file(src, out_dir) + assert refreshed == sidecar + assert detect_mod._read_sidecar_source_fingerprint(sidecar) is not None + + +def test_detect_incremental_reenters_edited_office_file(tmp_path, monkeypatch): + """End-to-end: after an Office source is edited, detect_incremental must + surface its sidecar as changed so `--update` re-extracts it (#1649). An + unchanged source stays out of the changed set (#1226/#1656).""" + monkeypatch.setattr( + detect_mod, + "docx_to_markdown", + lambda p: p.read_bytes().decode("latin-1", "ignore"), + ) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + src = tmp_path / "spec.docx" + src.write_bytes(b"first draft") + + r1 = detect_incremental(tmp_path, manifest_path) + sidecars = r1["files"]["document"] + assert sidecars, "office sidecar should be classified as a document" + save_manifest(r1["files"], manifest_path, root=tmp_path) + + # Unchanged run: sidecar is not re-queued. + r2 = detect_incremental(tmp_path, manifest_path) + assert not r2["new_files"]["document"] + assert r2["unchanged_files"]["document"] == sidecars + save_manifest(r2["files"], manifest_path, root=tmp_path) + + # Edit the source: the sidecar is rewritten and re-queued as changed. + src.write_bytes(b"second draft, substantially revised and longer") + r3 = detect_incremental(tmp_path, manifest_path) + assert r3["new_files"]["document"] == sidecars + + +def test_detect_incremental_pdf_not_reparsed_when_unchanged(tmp_path, monkeypatch): + """An unchanged PDF must not be re-parsed for its word count on a later + incremental run — the count is cached in the manifest and reused (#1656).""" + calls = {"n": 0} + + def fake_pdf(path): + calls["n"] += 1 + return "alpha beta gamma delta" + + monkeypatch.setattr(detect_mod, "extract_pdf_text", fake_pdf) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + pdf = tmp_path / "paper.pdf" + pdf.write_bytes(b"%PDF-1.4 stub content") + + r1 = detect_incremental(tmp_path, manifest_path) + assert any(p.endswith("paper.pdf") for p in _flat(r1["files"])) + save_manifest(r1["files"], manifest_path, root=tmp_path) + after_run1 = calls["n"] + assert after_run1 >= 1 # parsed at least once to seed the cached word count + + r2 = detect_incremental(tmp_path, manifest_path) + save_manifest(r2["files"], manifest_path, root=tmp_path) + assert calls["n"] == after_run1 # #1656: unchanged PDF is not re-parsed + assert not any(p.endswith("paper.pdf") for p in _flat(r2["new_files"])) + assert any(p.endswith("paper.pdf") for p in _flat(r2["unchanged_files"])) From 00fffd5d3a8504219d0fa1953153c15dec8414ed Mon Sep 17 00:00:00 2001 From: tpateeq Date: Sat, 4 Jul 2026 23:49:54 +0530 Subject: [PATCH 2/2] fix(detect): review follow-ups for Office source-change detection (#1649, #1656) Address code-review findings on the source-fingerprint / word-count-cache work without altering the reviewed-correct core design: - Anchor the sidecar fingerprint regex to the trailing ` | source-md5: -->` delimiter/terminator so a source filename that itself contains a "source-md5: " substring can no longer be captured as the fingerprint (which would make the real fingerprint never match, so the file re-parsed + rewrote + re-queued on every run). Regression test with a pathological filename asserts an unchanged source is parsed once. - Future-proof the save_manifest word_count cache: content_unchanged now keys off the prior hash of the matching kind, so a semantic-only manifest (ast_hash never populated) can actually reuse its cached count. The kind="both"/"ast" paths are unchanged (still key off ast_hash), so a real content change still recomputes. - Add the missing CHANGELOG `## Unreleased` entry covering both issues. Preserves the #1226 no-churn guarantee and one-time-double-parse behavior. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 1 + graphify/detect.py | 15 ++++++-- tests/test_detect.py | 81 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..949be565b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Fix: an edited Office source now re-enters `--update`, and unchanged PDFs/Office files are no longer re-parsed on every run (#1649 / #1656, thanks @Ns2384-star). A modified `.docx`/`.xlsx` reused a byte-identical, deterministically-named sidecar whose content was never refreshed, so `graphify update` never re-extracted it (#1649); meanwhile every incremental run re-unzipped each Office file and re-parsed each PDF just to recount words (#1656). The `.docx`/`.xlsx` sidecar header now records a raw-byte `source-md5` fingerprint of the source — reading the bytes doesn't unzip the OOXML container, so it's cheap. A matching fingerprint returns the existing sidecar without re-parsing or rewriting it (preserving the #1226 unchanged-mtime no-churn guarantee), while a differing or legacy-missing fingerprint re-parses and rewrites so the new sidecar mtime/hash flows the edit through `detect_incremental`. Per-file word counts are also cached in the manifest and reused for any file whose mtime is unchanged, so an unchanged PDF/Office file is parsed once rather than on every run. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/detect.py b/graphify/detect.py index b41ca8610..ffc1d2512 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -603,7 +603,11 @@ def _edge(src: str, tgt: str, relation: str) -> None: # Sidecar header records the source's raw-byte fingerprint so a later run can # tell whether the Office file changed without re-parsing it (#1649, #1656). -_SIDECAR_SOURCE_FP_RE = re.compile(r"source-md5:\s*([0-9a-f]+)") +# Anchor to the ` | source-md5: -->` delimiter/terminator at the END of the +# header line so a source *filename* that happens to contain a "source-md5: ..." +# substring can't be captured as the fingerprint — otherwise the real fingerprint +# would never match and the file would re-parse+rewrite (and re-queue) every run. +_SIDECAR_SOURCE_FP_RE = re.compile(r"\| source-md5:\s*([0-9a-f]+)\s*-->\s*$") def _read_sidecar_source_fingerprint(out_path: Path) -> str | None: @@ -1390,7 +1394,14 @@ def _normalise_entry(entry): continue # file deleted between detect() and manifest write mtime, h = hashed[f] prev = _normalise_entry(existing.get(f, {})) or {} - content_unchanged = h == prev.get("ast_hash", "") + # Compare the file's content hash (h) against the prior hash of the SAME + # kind so an unchanged file is recognised regardless of which pipeline + # wrote the manifest. ast/both key off the prior ast_hash (unchanged + # behaviour); a semantic-only manifest never populates ast_hash, so key + # off the prior semantic_hash there — otherwise content_unchanged would + # always be False and the word_count cache below could never be reused. + prior_hash = prev.get("semantic_hash", "") if kind == "semantic" else prev.get("ast_hash", "") + content_unchanged = h == prior_hash entry: dict = {"mtime": mtime} if kind in ("ast", "both"): entry["ast_hash"] = h diff --git a/tests/test_detect.py b/tests/test_detect.py index 227892385..7e71e5566 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -1684,3 +1684,84 @@ def fake_pdf(path): assert calls["n"] == after_run1 # #1656: unchanged PDF is not re-parsed assert not any(p.endswith("paper.pdf") for p in _flat(r2["new_files"])) assert any(p.endswith("paper.pdf") for p in _flat(r2["unchanged_files"])) + + +def test_convert_office_file_pathological_source_filename_fingerprint(tmp_path, monkeypatch): + """A source *filename* that itself contains a ``source-md5: `` substring + must not fool the sidecar fingerprint reader. The header interpolates the + filename before the real trailing fingerprint, so a naive un-anchored regex + would capture the filename's fake hash — the real fingerprint would then + never match and the file would re-parse + rewrite (and re-queue) on EVERY + run. The reader anchors to the trailing delimiter so only the real + fingerprint matches and an unchanged source is parsed exactly once (#1656).""" + calls = {"n": 0} + + def fake_docx(path): + calls["n"] += 1 + return path.read_bytes().decode("latin-1", "ignore") + + monkeypatch.setattr(detect_mod, "docx_to_markdown", fake_docx) + + out_dir = tmp_path / "converted" + # Filename embeds a decoy "source-md5: deadbeef" substring. + src = tmp_path / "report source-md5: deadbeef.docx" + src.write_bytes(b"payload one") + + first = detect_mod.convert_office_file(src, out_dir) + assert first is not None + assert calls["n"] == 1 + # The stored fingerprint is the real source md5, not the filename's decoy. + stored = detect_mod._read_sidecar_source_fingerprint(first) + assert stored is not None and stored != "deadbeef" + + # Unchanged source: must NOT re-parse despite the misleading filename. + second = detect_mod.convert_office_file(src, out_dir) + assert second == first + assert calls["n"] == 1 + + # An actual edit still re-parses. + src.write_bytes(b"payload two, now different") + third = detect_mod.convert_office_file(src, out_dir) + assert third == first + assert calls["n"] == 2 + + +def test_save_manifest_semantic_kind_reuses_cached_word_count(tmp_path, monkeypatch): + """For a semantic-only manifest ``ast_hash`` is never populated, so keying the + word_count cache off it would always miss. save_manifest keys the reuse check + off the prior hash of the matching kind, so an unchanged file's cached count + is reused rather than recomputed on the next kind='semantic' write (#1656).""" + from graphify.detect import load_manifest + + calls = {"n": 0} + real_count_words = detect_mod.count_words + + def counting(path): + calls["n"] += 1 + return real_count_words(path) + + monkeypatch.setattr(detect_mod, "count_words", counting) + + manifest_path = str(tmp_path / "graphify-out" / "manifest.json") + doc = tmp_path / "notes.md" + doc.write_text("alpha beta gamma delta epsilon", encoding="utf-8") + files = {"document": [str(doc)]} + + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 1 # parsed once to seed the cache + stored = load_manifest(manifest_path, root=tmp_path) + (entry,) = stored.values() + assert entry["word_count"] == 5 + assert entry.get("ast_hash", "") == "" # semantic-only: ast_hash stays empty + + # Second semantic write, content unchanged: cached count is reused. + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 1 + + # A real content change must still recompute. + doc.write_text("one two three", encoding="utf-8") + save_manifest(files, manifest_path, kind="semantic", root=tmp_path) + assert calls["n"] == 2 + stored = load_manifest(manifest_path, root=tmp_path) + (entry,) = stored.values() + assert entry["word_count"] == 3