From c1cb0eb3b0e48906c10038dbee494a252f4444db Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 10:50:54 +0200 Subject: [PATCH 1/2] docs: add incremental-archives design spec + phase 1 plan Spec covers the full 5-phase rollout of incremental archive support plus the catalog-storage policy change. Phase 1 plan implements the catalog policy slice (Disc 1 only + local persistence). Subsequent phase plans will be added when each phase begins. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...05-12-incremental-phase1-catalog-policy.md | 419 ++++++++++++++++++ .../2026-05-12-incremental-backups-design.md | 173 ++++++++ 2 files changed, 592 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-12-incremental-phase1-catalog-policy.md create mode 100644 docs/superpowers/specs/2026-05-12-incremental-backups-design.md diff --git a/docs/superpowers/plans/2026-05-12-incremental-phase1-catalog-policy.md b/docs/superpowers/plans/2026-05-12-incremental-phase1-catalog-policy.md new file mode 100644 index 0000000..6f70a61 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-incremental-phase1-catalog-policy.md @@ -0,0 +1,419 @@ +# Incremental Phase 1 — Catalog Policy Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Stop replicating the isolated dar catalog onto every disc of an archive set. Place it on Disc 1 only and persist a local copy alongside the disc images. + +**Architecture:** Two surgical edits in `commands/create.py`. The current code adds catalog files to every disc's ISO sources inside the per-disc build loop. We gate that addition on `i == 1`. 
Separately, before the final `shutil.rmtree(tmp_dir)` we copy the catalog files from `tmp/` into `<output_dir>/` so the user has them for backup and (in later phases) as `--base` reference. No changes to `extract.py` — its catalog-acquisition logic already only consumes the catalog from the first intact disc that carries it, so it stays correct. + +**Tech Stack:** Python 3.11+, standard library only (shutil, pathlib). Project has no test framework — verification is by manual end-to-end run, matching the project's existing convention documented in `CLAUDE.md` ("No tests"). + +**Commits:** Commit steps are included per the writing-plans skill template. Per the project's global CLAUDE.md, the executor MUST NOT run a `git commit` without the user's explicit go-ahead. Treat the commit step as "stage + propose commit message, wait for approval." + +--- + +## File Structure + +| Path | Responsibility | +|---|---| +| `src/bd_archive/commands/create.py` | Sole file touched. Three edits: (a) gate catalog inclusion to Disc 1 inside the build loop; (b) persist catalog to `output_dir` before tmp cleanup; (c) cosmetic summary update | + +No new files. No extract.py changes (verified to be already graceful — see "Why no extract changes" note below). + +### Why no extract changes + +`src/bd_archive/commands/extract.py:33-41` (function `_copy_disc_data`) copies catalog files from a mounted disc only while `catalog_verified == False`. Once a disc yields an intact, sha512-verifying catalog, the flag flips and no later disc is queried for catalog. Discs without the catalog file are silently fine (the glob returns nothing). + +`src/bd_archive/commands/extract.py:240-244` checks `has_catalog = any(...)` and passes `catalog_base=None` to `dar.extract_sequential` when nothing was staged. `tools/dar.py::extract_sequential` accepts `catalog_base=None` (line 43, conditional `-A` append). dar's `--sequential-read` mode then walks slices without an isolated catalog as rescue — slower but correct. 
+ +So: dropping the catalog from discs 2..N changes nothing for extract. Task 4 below confirms this in a real run. + +--- + +## Task 1: Gate catalog inclusion to Disc 1 only + +**Files:** +- Modify: `src/bd_archive/commands/create.py:228-232` + +- [ ] **Step 1: Read the current build-loop block** + +Open `src/bd_archive/commands/create.py` and locate the block that assembles `sources` per disc inside `for i, slice_file in enumerate(slices, 1):`. Around lines 228-232 you will see: + +```python + for cat in dar_archive.catalog_files: + sources.append(cat) + cat_hash = Path(str(cat) + ".sha512") + if cat_hash.exists(): + sources.append(cat_hash) +``` + +This appends every catalog file (and its sha512 sidecar) into the per-disc ISO source list — for every disc. + +- [ ] **Step 2: Make the inclusion conditional on disc index == 1** + +Replace the block above with: + +```python + # Catalog goes onto Disc 1 only. The master catalog at the end of + # the last slice (dar default) plus this isolated copy on Disc 1 + # gives two spatially separated copies per archive set. Replicating + # on every disc was redundant and grew unboundedly with file count. + if i == 1: + for cat in dar_archive.catalog_files: + sources.append(cat) + cat_hash = Path(str(cat) + ".sha512") + if cat_hash.exists(): + sources.append(cat_hash) +``` + +The indentation (8 spaces) matches the surrounding `for i, slice_file ...` block body. + +- [ ] **Step 3: Quick syntax check** + +Run from project root: + +```bash +python3 -m py_compile src/bd_archive/commands/create.py +``` + +Expected: exit 0, no output. A SyntaxError here means the indentation drifted. + +- [ ] **Step 4: Lint** + +Run: + +```bash +ruff check src/bd_archive/commands/create.py +``` + +Expected: `All checks passed!` or no findings on the modified lines. + +- [ ] **Step 5: Stage** + +```bash +git add src/bd_archive/commands/create.py +``` + +Do NOT commit yet — Task 2 makes a companion change and they belong in one commit. 
+ +--- + +## Task 2: Persist isolated catalog to output directory + +**Files:** +- Modify: `src/bd_archive/commands/create.py` — add a new block between the per-disc build loop and `shutil.rmtree(tmp_dir)` (around line 267) + +- [ ] **Step 1: Locate the insertion point** + +Open `src/bd_archive/commands/create.py` and find the comment block: + +```python + # Final cleanup: drop the entire tmp/ tree (catalog, dar internals). + # If workdir is the default hidden one, also remove it — the only + # thing inside was tmp/, so leaving it would just be cruft. A + # user-supplied workdir is left alone so they can keep tmpfs mounts + # etc. exactly as configured. + shutil.rmtree(tmp_dir) +``` + +We insert the catalog-persistence block *immediately before* this `# Final cleanup:` comment. + +- [ ] **Step 2: Insert the catalog-persistence block** + +Add this block, indented 4 spaces (function-body level): + +```python + # Persist the isolated catalog alongside images/ for two reasons: + # 1. It survives `output_dir` being burned + the local images/ + # being deleted — user keeps the catalog as part of their + # regular backup. + # 2. Future incremental generations will reference this file via + # `--base` (not implemented yet in this phase, but the artifact + # needs to exist from this phase onward). + for cat in dar_archive.catalog_files: + shutil.copy2(cat, output_dir / cat.name) + cat_hash = Path(str(cat) + ".sha512") + if cat_hash.exists(): + shutil.copy2(cat_hash, output_dir / cat_hash.name) + catalog_persisted = sorted(output_dir.glob(f"{cfg.name}-catalog.*.dar")) + if catalog_persisted: + log.info(f"Catalog persisted: {catalog_persisted[0].parent}/{cfg.name}-catalog.*.dar") + +``` + +(Leave a blank line after the block so it visually separates from the `# Final cleanup:` comment that follows.) + +- [ ] **Step 3: Syntax check** + +```bash +python3 -m py_compile src/bd_archive/commands/create.py +``` + +Expected: exit 0. 
+ +- [ ] **Step 4: Lint** + +```bash +ruff check src/bd_archive/commands/create.py +``` + +Expected: clean. + +- [ ] **Step 5: Stage** + +```bash +git add src/bd_archive/commands/create.py +``` + +--- + +## Task 3: Mention the persisted catalog in the summary output + +**Files:** +- Modify: `src/bd_archive/commands/create.py` — summary block around lines 280-288 + +- [ ] **Step 1: Locate the summary block** + +At the bottom of `cmd_create`: + +```python + log.step("Summary") + print(f"\n Source: {human_bytes(scan.total_bytes)}") + print(f" Archive: {human_bytes(total_archive)} ({ratio}%)") + print(f" Discs: {slice_count} x {human_bytes(raw_capacity)}") + print(f" PAR2: {cfg.redundancy}% per disc") + print(f" Compression: {cfg.comp_str}") + print(f" Images: {images_dir}") + print(f"\n Next step: bd-archive burn -i {output_dir}") + print(f" Cleanup: rm -rf {output_dir}\n") +``` + +- [ ] **Step 2: Add a line between "Images:" and the blank line** + +Insert one line so the block reads: + +```python + log.step("Summary") + print(f"\n Source: {human_bytes(scan.total_bytes)}") + print(f" Archive: {human_bytes(total_archive)} ({ratio}%)") + print(f" Discs: {slice_count} x {human_bytes(raw_capacity)}") + print(f" PAR2: {cfg.redundancy}% per disc") + print(f" Compression: {cfg.comp_str}") + print(f" Images: {images_dir}") + print(f" Catalog: {output_dir}/{cfg.name}-catalog.*.dar") + print(f"\n Next step: bd-archive burn -i {output_dir}") + print(f" Cleanup: rm -rf {output_dir}\n") +``` + +The new line tells the user where the persisted catalog lives so they know to keep it with their regular backup. + +- [ ] **Step 3: Syntax + lint** + +```bash +python3 -m py_compile src/bd_archive/commands/create.py && ruff check src/bd_archive/commands/create.py +``` + +Expected: exit 0, clean lint. + +- [ ] **Step 4: Stage** + +```bash +git add src/bd_archive/commands/create.py +``` + +--- + +## Task 4: End-to-end manual verification + +This task is the proof. It must pass before commit. 
+ +**Files:** none modified. Uses a scratch directory. + +- [ ] **Step 1: Prepare a small source tree that will produce ≥3 discs** + +Pick a tiny disc size so we get multi-disc output without consuming real storage. Slice size of 5 MiB means a 16 MiB source produces ~3-4 slices. + +```bash +SCRATCH=$(mktemp -d /tmp/bd-archive-phase1-XXXXXX) +mkdir -p "$SCRATCH/src" +# Generate ~16 MiB of incompressible content across multiple files +for i in $(seq 1 8); do + dd if=/dev/urandom of="$SCRATCH/src/file_$i.bin" bs=1M count=2 status=none +done +echo "$SCRATCH" +``` + +- [ ] **Step 2: Run `bd-archive create` with a small manual capacity** + +```bash +cd /home/mato/projects/_Privat/bd-archiver +source .venv/bin/activate # ensure editable install is active +bd-archive create \ + -s "$SCRATCH/src" \ + -n phase1test \ + -o "$SCRATCH/out" \ + -b $((5 * 1024 * 1024)) \ + --ratio 1.0 \ + -y +``` + +Expected: completes, prints a "Catalog persisted: …" line, prints "Catalog: …" in the summary. + +- [ ] **Step 3: Verify the persisted catalog lives in output_dir** + +```bash +ls -l "$SCRATCH/out/phase1test-catalog."*.dar +ls -l "$SCRATCH/out/phase1test-catalog."*.dar.sha512 2>/dev/null || echo "(no sha512 sidecar — that is OK)" +``` + +Expected: at least one `phase1test-catalog.0001.dar` file present. sha512 sidecar may or may not exist (depending on dar's behavior — dar produces sha512 for slices, not necessarily for the catalog). + +- [ ] **Step 4: Verify Disc 1 ISO contains the catalog file** + +```bash +udisksctl loop-setup -f "$SCRATCH/out/images/disc_0001.iso" 2>&1 | tee /tmp/disc1-loop.log +LOOP=$(grep -oE '/dev/loop[0-9]+' /tmp/disc1-loop.log | head -1) +udisksctl mount -b "$LOOP" +MNT=$(udisksctl info -b "$LOOP" | awk -F': ' '/MountPoints:/ {print $2; exit}' | tr -d ' ') +ls -l "$MNT" +udisksctl unmount -b "$LOOP" +udisksctl loop-delete -b "$LOOP" +``` + +Expected: `ls` shows `phase1test-catalog.0001.dar` among the disc contents (alongside the slice, par2 files, README). 
+ +- [ ] **Step 5: Verify Disc 2 ISO does NOT contain the catalog** + +```bash +udisksctl loop-setup -f "$SCRATCH/out/images/disc_0002.iso" 2>&1 | tee /tmp/disc2-loop.log +LOOP=$(grep -oE '/dev/loop[0-9]+' /tmp/disc2-loop.log | head -1) +udisksctl mount -b "$LOOP" +MNT=$(udisksctl info -b "$LOOP" | awk -F': ' '/MountPoints:/ {print $2; exit}' | tr -d ' ') +ls -l "$MNT" +udisksctl unmount -b "$LOOP" +udisksctl loop-delete -b "$LOOP" +``` + +Expected: `ls` shows the Disc 2 slice + its par2 + README, but **no** `phase1test-catalog.*.dar` file. This is the central check of Phase 1. + +- [ ] **Step 6: Verify the last disc's ISO also does NOT contain the isolated catalog** + +```bash +LAST_ISO=$(ls "$SCRATCH/out/images"/disc_*.iso | tail -1) +udisksctl loop-setup -f "$LAST_ISO" 2>&1 | tee /tmp/disc-last-loop.log +LOOP=$(grep -oE '/dev/loop[0-9]+' /tmp/disc-last-loop.log | head -1) +udisksctl mount -b "$LOOP" +MNT=$(udisksctl info -b "$LOOP" | awk -F': ' '/MountPoints:/ {print $2; exit}' | tr -d ' ') +ls -l "$MNT" +udisksctl unmount -b "$LOOP" +udisksctl loop-delete -b "$LOOP" +``` + +Expected: the last disc has its slice + par2 + README, but no isolated `phase1test-catalog.*.dar`. The dar slice itself embeds the master catalog at its end — that is unchanged. + +- [ ] **Step 7: Verify every ISO with `bd-archive verify` (stand-in for a full `bd-archive extract` run — see Step 8)** + +Use the ISOs as input (no real burning needed): + +```bash +mkdir -p "$SCRATCH/restored" +# Verify each ISO is internally consistent first +for iso in "$SCRATCH/out/images"/disc_*.iso; do + echo "=== $(basename "$iso") ===" + bd-archive verify "$iso" || echo "VERIFY FAILED: $iso" +done +``` + +Expected: every disc reports OK and no `VERIFY FAILED` line is printed. The verify path reads the catalog from Disc 1's ISO only and that suffices — this indicates extract.py will accept the new layout too. 
+ +- [ ] **Step 8: Verify source-vs-restore byte-identity (optional but recommended)** + +Extracting from ISO files via `bd-archive extract` requires a physical drive in the current implementation (it prompts for disc inserts). For Phase 1 manual verification, the ISO-level verify in Step 7 is sufficient — extract.py's behavior with respect to catalog placement is determined entirely by `_copy_disc_data`, which we have not touched, and `verify` exercises the same catalog-acquisition path. + +- [ ] **Step 9: Cleanup the scratch dir** + +```bash +rm -rf "$SCRATCH" +``` + +--- + +## Task 5: Commit + +Only proceed after Task 4 passes. Per the project's global CLAUDE.md, the executor must obtain user approval for the commit message before running `git commit`. + +- [ ] **Step 1: Show the staged diff to the user** + +```bash +git diff --staged src/bd_archive/commands/create.py +``` + +- [ ] **Step 2: Propose this commit message to the user** + +``` +refactor(create): place isolated catalog on Disc 1 only; persist locally + +Previously, the isolated dar catalog was duplicated into every disc's +ISO. For archives with thousands of files (and growing over incremental +generations) this added unbounded per-disc overhead — 130 MB per disc +in the user's photo archive, scaling with file count not data size. + +The dar slice on the last disc still embeds the master catalog at its +end (dar default, unchanged), so we always have two spatially separated +copies per archive set: the isolated copy on Disc 1, and the embedded +master on the last disc. Discs 2..N-1 carry only their slice + par2. + +The isolated catalog is now also persisted to /-catalog.*.dar +alongside images/, so the user can keep it in their normal digital +backup. Phase 3 will use this file as the `--base` reference for +incremental generations. + +No extract.py changes needed: its catalog-acquisition logic already +copies the catalog only from the first intact disc carrying it. 
+``` + +- [ ] **Step 3: After user approves, commit** + +```bash +git commit -m "$(cat <<'EOF' +refactor(create): place isolated catalog on Disc 1 only; persist locally + +Previously, the isolated dar catalog was duplicated into every disc's +ISO. For archives with thousands of files (and growing over incremental +generations) this added unbounded per-disc overhead — 130 MB per disc +in the user's photo archive, scaling with file count not data size. + +The dar slice on the last disc still embeds the master catalog at its +end (dar default, unchanged), so we always have two spatially separated +copies per archive set: the isolated copy on Disc 1, and the embedded +master on the last disc. Discs 2..N-1 carry only their slice + par2. + +The isolated catalog is now also persisted to /-catalog.*.dar +alongside images/, so the user can keep it in their normal digital +backup. Phase 3 will use this file as the `--base` reference for +incremental generations. + +No extract.py changes needed: its catalog-acquisition logic already +copies the catalog only from the first intact disc carrying it. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +- [ ] **Step 4: Verify the commit landed cleanly** + +```bash +git log -1 --stat +``` + +Expected: shows the commit, lists `src/bd_archive/commands/create.py` modified. + +--- + +## Self-Review Done + +- Spec coverage: Phase 1 of the spec specifies "Catalog files added to Disc 1 sources only (loop conditional); copy catalog to `/-gen-catalog.dar` before tmp cleanup; extract.py graceful fallback." Tasks 1+2+3 cover the create.py changes; Task 4 Step 7 verifies extract.py graceful behavior is preserved (no code change needed there as analyzed in the architecture note). The Phase-1 catalog name omits `-gen-` intentionally because the gen-naming scheme arrives in Phase 2; Phase 1 uses dar's existing `-catalog.*.dar` naming. +- Placeholder scan: no TBD/TODO/handwave. 
+- Type consistency: only `Path` objects from `pathlib` used; existing `dar_archive.catalog_files` API consumed unchanged. +- LOC estimate from spec: ~20. Actual diff size of Tasks 1+2+3 ≈ 15 lines added, 5 lines wrapped — on target. diff --git a/docs/superpowers/specs/2026-05-12-incremental-backups-design.md b/docs/superpowers/specs/2026-05-12-incremental-backups-design.md new file mode 100644 index 0000000..e5f76d4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-incremental-backups-design.md @@ -0,0 +1,173 @@ +# Incremental Archives + Catalog Policy — Design Spec + +**Status:** Approved (2026-05-12) +**Scope:** Add dar-based incremental backup support to bd-archiver, plus a catalog-storage policy change that reduces per-disc overhead. + +## Problem statement + +bd-archiver currently produces one-shot full archives. Two real-world use cases need more: + +1. **Family-photo archive** — content accumulates slowly over years; user wants to extend an existing set instead of re-burning everything when new photos land. Late-arriving photos from old years (someone shares a folder) cannot currently be added to a "closed" year without redoing it. +2. **Time-lapse archive** — large and growing collection. The user has worked around the lack of incrementals by splitting into batches (separate archive sets), which is awkward and inflates the catalog footprint. + +Additionally, the current code writes the isolated dar catalog onto **every** disc of an archive set. For the user's current 130 MB catalog × 20 discs that's 2.3 GB of redundancy; for incremental setups where catalogs grow with cumulative file count over years, this overhead would balloon unboundedly. + +## Design summary + +### Catalog policy (independent of incrementals) + +- Isolated catalog persists locally as `/-gen-catalog.dar` (+ `.sha512` sidecar) alongside `images/`. +- On discs: only **Disc 1** of each generation carries the isolated catalog. +- Discs 2..N-1 contain only slice + par2 + README. 
+- Disc N (last) implicitly holds the master catalog at the end of its slice (dar default — not under our control). +- For 1-disc sets the isolated catalog ends up on the same disc as the embedded master catalog. Mildly redundant, not worth special-casing. + +### Incremental archive support + +User-facing CLI (Gen 1 stays as today; Gen 2+ adds `--base`): + +```bash +# Full (Gen 1) +bd-archive create -s ~/photos -n photos -o ./gen1 + +# Incremental (Gen 2, base = Gen 1's local catalog) +bd-archive create -s ~/photos -n photos \ + --base ./gen1/photos-gen1-catalog.dar \ + -o ./gen2 +``` + +**Chain identity = archive name.** The `-n` value is the chain ID. Users must use the *same* `-n` across all generations of a chain. This constraint is documented prominently in the project `README.md` and validated by `create` (passing `--base` whose parsed archive name differs from `-n` → hard error). + +**Naming scheme internally**: dar archive name is `<name>-gen<N>` where N is derived as: +- No `--base` given → N = 1 +- `--base <path>` given → parse `gen<K>` from base catalog filename; N = K+1 +- Base catalog filename lacks `-gen<K>-` suffix (legacy pre-feature archive) → assume K=1, so N=2 + +This makes "extend a legacy archive" work without migration tooling: user copies `<name>-catalog.0001.dar` off an old disc and passes its path as `--base`. + +### Volume label scheme + +New format: `<name>_G<NN>_<NNNN>` (32-char ISO9660 max). + +- 4 chars reserved for `_G<NN>` (gen 01-99, zero-padded) +- 5 chars for `_<NNNN>` (disc number) +- 23 chars max for name in the label + +If user passes a name longer than 23 chars: `bd-archive create` issues a warning and truncates only in the volume label. Filenames inside the ISO retain the full archive name (e.g., `super-long-archive-name-here-gen2.0001.dar`). + +This is a **format change** vs. current `<name>_<NNNN>` labels. Existing pre-feature Gen 1 discs keep their old label scheme; new discs use the new scheme. Visual mixing is fine — labels are hints, not technical IDs. 
+ +### Auto-defer (min-last-disc-fill) + +Flag: `--min-last-disc-fill PERCENT` (integer 0-100, default 0 = disabled). + +Semantics: enforce a minimum fill percentage on the last disc of the set by deferring "truly new" files to a later generation. + +**Eligibility (which files are deferrable)**: +- For incrementals (`--base` given): files whose relative path is **not present** in the base catalog. Determined authoritatively via `dar -l ` parse; mtime is **not** used for eligibility (avoids losing files whose mtime drifted on disk). mtime *is* used for ordering — newest first. +- For full archives (no `--base`): all files (with prominent warning that deferred files won't be archived until a future incremental). + +**Algorithm**: +1. Build pool of deferrable files (per rules above), sort newest-first by mtime. +2. Estimate disc count and last-disc fill with full source. +3. While last-disc-fill < threshold AND pool non-empty: pop newest, add to defer set, recompute. +4. Show user the resulting plan (file count, total bytes deferred, oldest deferred mtime). +5. Confirmation prompt with full picture. +6. Pass defer set as `-P ` flags to dar. + +**Edge cases**: +- Pool exhausted before threshold met → proceed with what's achievable, log "threshold not reachable, deferring N files brings last-disc-fill to M%". +- All source files deferred → abort with explanatory error. + +### Modified-file & bit-flip behavior + +Out of scope for this feature; **dar's default semantics apply unchanged**: +- Intentional modifications (any change to ctime/mtime/size) → archived as modified entry in incremental. +- Silent bit flips on source (data corruption with mtime unchanged) → NOT detected; file treated as unchanged. + +Mitigation is the user's responsibility (filesystem-level checksumming, separate integrity scans). + +### Verify + +No chain-aware mode. `verify` continues to operate per-disc / per-ISO / per-mount as today. 
+ +### Extract — whole-chain mode + +Single `bd-archive extract -o <output_dir>` call handles a complete chain. + +**Flow**: +1. User inserts any disc to start. Tool reads filename pattern and catalog, identifies archive name and current generation. +2. Tool prompts: "How many generations does this chain have?" (1-99). This is the only piece of state we can't derive from disc contents. +3. Tool iterates generations 1..N. For each generation: prompts for each disc 1..M_gen (M derived from catalog of that gen). +4. All slices land in a flat staging dir. +5. Single `dar -x <staging>/<name>-gen<N> -R <output_dir> -O --sequential-read` (highest gen as entry point — dar walks back through generations via internal references; NOTE(review): dar's documented restore procedure for differential/incremental archives is to restore the full archive first and then each later generation in order — confirm that a single `dar -x` on the highest generation really restores the whole chain before implementing Phase 5). + +**Legacy (pre-feature) archives** detected by missing `-gen<N>` suffix in slice filenames: +- Treated as a standalone Gen 1 with no chain (current behavior preserved). +- If user wants to extract a chain that mixes legacy Gen 1 + new Gen 2+: the new generations carry `-gen<N>` suffix, the legacy doesn't. Extract handles both name patterns within the same staging dir; dar's chain-walking still works because `--base` at create time recorded the legacy archive's name as the predecessor. + +**Damage handling** is unchanged: SHA-512 verify in staging, PAR2 repair on failure, per-slice not per-disc. Catalog verified once on first intact arrival. + +## Code impact + +Layering remains intact: `commands/` → `archive/` → `tools/` → `shell/`. No new top-level layer. 
+ +### Affected modules + +| Module | Change | Phase | +|---|---|---| +| `commands/create.py` | Catalog placement on Disc 1 only; persist local catalog copy; `--base` handling; base-aware preview; auto-defer block; new volume-label scheme | 1, 2, 3, 4 | +| `commands/extract.py` | Refactor: outer-loop over generations, inner-loop over discs; handle `-gen.*.dar` and legacy `.*.dar` patterns | 5 | +| `archive/dar_archive.py` | Naming scheme `-gen`; `excludes` and `ref_catalog` pass-through | 2, 3, 4 | +| `archive/config.py` | `ArchiveConfig` gains `generation: int`; README text updates | 2 | +| `archive/source_scan.py` | New helper for listing source files with mtime, for auto-defer | 4 | +| `tools/dar.py` | `create_sliced(ref_catalog=None, excludes=None)`; new `list_catalog_paths(catalog)` parsing `dar -l` output | 3, 4 | +| `cli.py` | `--base PATH` and `--min-last-disc-fill INT` on `create` | 3, 4 | +| `README.md` (project) | Document chain-name discipline; document `--base` and `--min-last-disc-fill`; document catalog persistence path | 2, 3, 4 | + +### Phases + +Each phase is independently shippable. 
+ +**Phase 1 — Catalog policy** (~20 LOC) +- `commands/create.py`: catalog files added to Disc 1 sources only (loop conditional); copy catalog to `/-gen-catalog.dar` before tmp cleanup +- `commands/extract.py`: graceful fallback when catalog file present only on first intact disc +- Tests: by hand on a small archive + +**Phase 2 — Naming scheme + label change** (~25 LOC) +- `archive/dar_archive.py`: name becomes `-gen1` for Full +- `archive/config.py`: `generation` field +- `commands/create.py`: volume label `_G_` + truncation warning +- Project `README.md`: chain-name discipline section + +**Phase 3 — Incremental `create`** (~45 LOC) +- `cli.py`: `--base PATH` +- `tools/dar.py`: `create_sliced(ref_catalog=...)`; `list_catalog_paths()` helper +- `archive/dar_archive.py`: pass-through +- `commands/create.py`: parse gen from base, base-aware preview, validate `-n` matches base's archive name +- Project `README.md`: incremental workflow section + +**Phase 4 — Auto-defer** (~55 LOC) +- `cli.py`: `--min-last-disc-fill INT` +- `archive/source_scan.py`: file-list helper +- `commands/create.py`: defer algorithm between preview and confirm prompt +- `tools/dar.py`: `excludes` pass-through (`-P` flags) +- Project `README.md`: auto-defer behavior + +**Phase 5 — Extract chain-mode** (~80 LOC) +- `commands/extract.py`: generation iteration; multi-gen staging; legacy-pattern handling +- Project `README.md`: extract workflow for chains + +**Total ~225 LOC, 5 PRs.** + +## Out of scope + +- `dar_xform` / `dar -+` consolidation of older generations into a fresh full +- Hash-based change detection (`--strict-change-detection`) for bit-flip protection +- Automated migration helper for legacy single-gen archives (manual copy is sufficient) +- `verify --chain` mode for whole-chain integrity check +- Mixing multiple independent chains in one `extract` run + +## Open questions + +None as of approval. Phase-1 implementation can start. 
From 36aa9a63787ca4241bf95294bd0954de68ba96dc Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 10:51:00 +0200 Subject: [PATCH 2/2] refactor(create): place isolated catalog on Disc 1 only; persist locally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the isolated dar catalog was duplicated into every disc's ISO. For archives with thousands of files (and growing over future incremental generations) this added unbounded per-disc overhead — 130 MB per disc in the user's photo archive, scaling with file count not data size. The dar slice on the last disc still embeds the master catalog at its end (dar default, unchanged), so we always have two spatially separated copies per archive set: the isolated copy on Disc 1, and the embedded master on the last disc. Discs 2..N-1 carry only their slice + par2. The isolated catalog is now also persisted to /-catalog.*.dar alongside images/, so the user can keep it in their normal digital backup. Phase 3 will use this file as the --base reference for incremental generations. No extract.py changes needed: its catalog-acquisition logic already copies the catalog only from the first intact disc carrying it. Manual e2e verified: 4-disc set, catalog on Disc 1 only (confirmed via isoinfo), persisted to output_dir, all discs verify OK. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bd_archive/commands/create.py | 32 ++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/bd_archive/commands/create.py b/src/bd_archive/commands/create.py index 9b6035b..8127932 100644 --- a/src/bd_archive/commands/create.py +++ b/src/bd_archive/commands/create.py @@ -225,11 +225,16 @@ def cmd_create(args): sources = [slice_file] if slice_hash.exists(): sources.append(slice_hash) - for cat in dar_archive.catalog_files: - sources.append(cat) - cat_hash = Path(str(cat) + ".sha512") - if cat_hash.exists(): - sources.append(cat_hash) + # Catalog goes onto Disc 1 only. The master catalog at the end of + # the last slice (dar default) plus this isolated copy on Disc 1 + # gives two spatially separated copies per archive set. Replicating + # on every disc was redundant and grew unboundedly with file count. + if i == 1: + for cat in dar_archive.catalog_files: + sources.append(cat) + cat_hash = Path(str(cat) + ".sha512") + if cat_hash.exists(): + sources.append(cat_hash) sources.extend(par2_files) sources.append(readme_path) @@ -264,6 +269,22 @@ def cmd_create(args): pf.unlink() readme_path.unlink(missing_ok=True) + # Persist the isolated catalog alongside images/ for two reasons: + # 1. It survives `output_dir` being burned + the local images/ + # being deleted — user keeps the catalog as part of their + # regular backup. + # 2. Future incremental generations will reference this file via + # `--base` (not implemented yet in this phase, but the artifact + # needs to exist from this phase onward). 
+ for cat in dar_archive.catalog_files: + shutil.copy2(cat, output_dir / cat.name) + cat_hash = Path(str(cat) + ".sha512") + if cat_hash.exists(): + shutil.copy2(cat_hash, output_dir / cat_hash.name) + catalog_persisted = sorted(output_dir.glob(f"{cfg.name}-catalog.*.dar")) + if catalog_persisted: + log.info(f"Catalog persisted: {catalog_persisted[0].parent}/{cfg.name}-catalog.*.dar") + # Final cleanup: drop the entire tmp/ tree (catalog, dar internals). # If workdir is the default hidden one, also remove it — the only # thing inside was tmp/, so leaving it would just be cruft. A @@ -284,5 +305,6 @@ def cmd_create(args): print(f" PAR2: {cfg.redundancy}% per disc") print(f" Compression: {cfg.comp_str}") print(f" Images: {images_dir}") + print(f" Catalog: {output_dir}/{cfg.name}-catalog.*.dar") print(f"\n Next step: bd-archive burn -i {output_dir}") print(f" Cleanup: rm -rf {output_dir}\n")