From 3720f15408e4f0cdacdf6255915e1eeaa57ac949 Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 18:18:59 +0200 Subject: [PATCH 1/7] feat(create): generation-aware naming + new volume label format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of incremental-archives. Internal dar archive name is now <name>-gen<N> (every new full is Gen 1; Phase 3 derives higher N from --base). Volume labels switch to <name>_G<NN>_<NNNN> — the gen suffix lives in the label, the human-meaningful name truncates to 23 chars if longer. Filenames inside the ISO keep the full name. Pre-Phase-2 (legacy) Gen 1 archives are unaffected: their old labels and naming stay on the burned discs. New archives produced from this phase onward carry the new scheme. Why the truncation tradeoff: physically distinguishing Gen 1 Disc 1 from Gen 2 Disc 1 of the same chain is more useful than seeing the last few characters of an already-known archive name. The archive name acts as the chain identity (see project README, updated in a later phase), a discipline the user enforces by keeping `-n` constant across generations. README on disc gains a Generation line and a CHAIN: hint explaining the name-consistency rule. Manual e2e verified: phase2test_G01_0001 / phase2test_G01_0002 labels on a 2-disc set; phase2test-gen1.NNNN.dar slices on the discs via UDF; phase2test-gen1-catalog.0001.dar persisted to output_dir. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bd_archive/archive/config.py | 19 +++++++++++++++-- src/bd_archive/commands/create.py | 35 ++++++++++++++++++++++--------- src/bd_archive/constants.py | 8 +++++-- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/bd_archive/archive/config.py b/src/bd_archive/archive/config.py index 7a63232..d245c31 100644 --- a/src/bd_archive/archive/config.py +++ b/src/bd_archive/archive/config.py @@ -12,23 +12,38 @@ class ArchiveConfig: redundancy: int compression: str comp_level: str | None + generation: int = 1 @property def comp_str(self) -> str: return self.compression + (f" ({self.comp_level})" if self.comp_level else "") + @property + def dar_name(self) -> str: + """Internal dar archive name including generation suffix. + + File naming uses `-gen` so slices from different + generations of the same chain coexist in one staging dir during + extract. The user-facing `name` (from `-n`) is the chain + identity — see project README for the rule that name must stay + identical across all generations of one chain. 
+ """ + return f"{self.name}-gen{self.generation}" + def write_readme( readme_path: Path, cfg: ArchiveConfig, disc_num: int, total_discs: int, slice_name: str ): ts = datetime.now().strftime("%Y-%m-%d %H:%M") readme_path.write_text( - f"BD-ARCHIVE | {cfg.name} | Disc {disc_num}/{total_discs}" + f"BD-ARCHIVE | {cfg.name} | Gen {cfg.generation} | Disc {disc_num}/{total_discs}" f" | {ts} | Capacity {human_bytes(cfg.disc_bytes)}" f" | PAR2 {cfg.redundancy}% | {cfg.comp_str}\n\n" - f"RESTORE: dar -x {cfg.name} -R /target\n" + f"RESTORE: dar -x {cfg.dar_name} -R /target\n" f"VERIFY: sha512sum -c {slice_name}.sha512\n" f" par2 verify {slice_name}.par2\n" f"REPAIR: par2 repair {slice_name}.par2\n" f"DEPENDS: pacman -S dar par2cmdline | apt install dar par2\n" + f"\nCHAIN: Name '{cfg.name}' identifies this archive chain.\n" + f" Future incremental generations must use the same name.\n" ) diff --git a/src/bd_archive/commands/create.py b/src/bd_archive/commands/create.py index 8127932..2a5a61a 100644 --- a/src/bd_archive/commands/create.py +++ b/src/bd_archive/commands/create.py @@ -11,6 +11,7 @@ from bd_archive.archive.source_scan import scan_source from bd_archive.constants import ( DISC_END_MARGIN, + ISO9660_LABEL_NAME_MAX, ISO9660_VOLUME_LABEL_MAX, PAR2_AND_MISC_OVERHEAD, ) @@ -26,14 +27,23 @@ def cmd_create(args): check_deps("dar", "par2", "mkisofs", "dvd+rw-mediainfo") - max_name_len = ISO9660_VOLUME_LABEL_MAX - 5 # "_NNNN" suffix - if len(args.name) > max_name_len: + # Hard cap matches the pre-Phase-2 label format (32 - 5) so existing + # archive names that lived right up against the old limit still work. + # Names longer than ISO9660_LABEL_NAME_MAX (23) get truncated in the + # volume label only; filenames inside the ISO keep the full name. 
+ legacy_max_name_len = ISO9660_VOLUME_LABEL_MAX - 5 + if len(args.name) > legacy_max_name_len: log.error( f"--name '{args.name}' is {len(args.name)} chars; " - f"max {max_name_len} (ISO9660 volume label limit " - f"{ISO9660_VOLUME_LABEL_MAX} minus 5-char disc suffix)" + f"max {legacy_max_name_len}" ) sys.exit(1) + if len(args.name) > ISO9660_LABEL_NAME_MAX: + log.warn( + f"--name '{args.name}' is {len(args.name)} chars; " + f"volume labels will be truncated to {ISO9660_LABEL_NAME_MAX} chars " + f"('{args.name[:ISO9660_LABEL_NAME_MAX]}'). Filenames on disc keep the full name." + ) source = Path(args.source).resolve() if not source.is_dir(): @@ -109,12 +119,15 @@ def cmd_create(args): last_disc_free_raw = int(last_disc_free / max(ratio, 0.001)) par2_est = slice_bytes * args.redundancy // 100 + # Phase 2: every archive starts as Gen 1. Phase 3 lets `--base` + # derive higher generation numbers from a predecessor catalog. cfg = ArchiveConfig( name=args.name, disc_bytes=raw_capacity, redundancy=args.redundancy, compression=args.compression, comp_level=args.level, + generation=1, ) log.step("Source") @@ -156,7 +169,7 @@ def cmd_create(args): output_dir.rmdir() sys.exit(0) - dar_archive = DarArchive(cfg.name, work_dir) + dar_archive = DarArchive(cfg.dar_name, work_dir) tmp_dir = dar_archive.tmp_dir # ── Create dar archive ────────────────────────────────────────────── @@ -238,8 +251,10 @@ def cmd_create(args): sources.extend(par2_files) sources.append(readme_path) - # Build ISO directly from in-place files (no staging copies) - volume_label = f"{cfg.name}_{i:04d}" + # Build ISO directly from in-place files (no staging copies). + # Label is "_G_" — name truncated to + # ISO9660_LABEL_NAME_MAX (23) so gen + disc suffix always fit. 
+ volume_label = f"{cfg.name[:ISO9660_LABEL_NAME_MAX]}_G{cfg.generation:02d}_{i:04d}" iso_path = images_dir / f"disc_{i:04d}.iso" log.info(f" building {iso_path.name}...") mkisofs.build(iso_path, sources, volume_label, publisher) @@ -281,9 +296,9 @@ def cmd_create(args): cat_hash = Path(str(cat) + ".sha512") if cat_hash.exists(): shutil.copy2(cat_hash, output_dir / cat_hash.name) - catalog_persisted = sorted(output_dir.glob(f"{cfg.name}-catalog.*.dar")) + catalog_persisted = sorted(output_dir.glob(f"{cfg.dar_name}-catalog.*.dar")) if catalog_persisted: - log.info(f"Catalog persisted: {catalog_persisted[0].parent}/{cfg.name}-catalog.*.dar") + log.info(f"Catalog persisted: {catalog_persisted[0].parent}/{cfg.dar_name}-catalog.*.dar") # Final cleanup: drop the entire tmp/ tree (catalog, dar internals). # If workdir is the default hidden one, also remove it — the only @@ -305,6 +320,6 @@ def cmd_create(args): print(f" PAR2: {cfg.redundancy}% per disc") print(f" Compression: {cfg.comp_str}") print(f" Images: {images_dir}") - print(f" Catalog: {output_dir}/{cfg.name}-catalog.*.dar") + print(f" Catalog: {output_dir}/{cfg.dar_name}-catalog.*.dar") print(f"\n Next step: bd-archive burn -i {output_dir}") print(f" Cleanup: rm -rf {output_dir}\n") diff --git a/src/bd_archive/constants.py b/src/bd_archive/constants.py index 05e33a2..6f36c33 100644 --- a/src/bd_archive/constants.py +++ b/src/bd_archive/constants.py @@ -25,9 +25,13 @@ # ISO9660 caps the Primary Volume Descriptor's Volume Identifier at 32 # bytes. mkisofs/growisofs reject longer labels outright. Volume labels -# here are "_NNNN", so archive_name must leave room for -# the 5-char disc suffix. +# here are "_G_" — 9 fixed chars for the gen + +# disc suffixes, leaving 23 chars for the (possibly truncated) name. +# Filenames *inside* the ISO keep the untruncated archive name, so the +# label is purely a human hint, not a technical identifier. 
ISO9660_VOLUME_LABEL_MAX = 32 +ISO9660_LABEL_SUFFIX_LEN = 9 # "_G_" +ISO9660_LABEL_NAME_MAX = ISO9660_VOLUME_LABEL_MAX - ISO9660_LABEL_SUFFIX_LEN # 23 # PAR2 recovery volumes are named ".volNNN+NN.par2"; the index file # is plain ".par2". This pattern matches recovery volumes only. From aeebee871f0092c4adf33034c81de008120610f8 Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 18:23:39 +0200 Subject: [PATCH 2/7] feat(create): incremental archives via --base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 of incremental-archives. New `--base ` flag on `bd-archive create` makes the run produce an incremental archive against the supplied isolated catalog. dar's `-A` flag does the actual work; this commit wires it up end-to-end. The base catalog filename encodes the predecessor generation (`-gen-catalog.NNNN.dar`), so the new gen number is derived without any sidecar metadata file. Legacy catalogs (pre-Phase-2, filename `-catalog.NNNN.dar`) are treated as Gen 1; the new gen becomes Gen 2. The pre-archive preview is now base-aware: when --base is given, the estimated archive size reflects only files that are new or modified since the base catalog (tools.dar.list_catalog_paths parses `dar -l` output; mtime > catalog-mtime catches modifications heuristically). Disc-count and last-disc-fill estimates use this delta, not the full source — without this, an incremental's preview would massively overstate. Chain identity is the archive name: --base whose embedded archive name disagrees with -n fails with a clear error pointing at the mismatch. Same name across generations is the user's discipline. archive/dar_archive.py gains parse_dar_filename(), a single regex that handles both Phase-2+ generational filenames and legacy ones. Used here for --base validation and reusable by Phase 5's chain detection in extract. Manual e2e: Gen 1 full of 50 MiB source → 2 discs. 
Adding 15 MiB of new files and running Gen 2 with --base produced a single-disc incremental containing only the delta (phase3test-gen2.0001.dar of 15 MiB, plus its catalog). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bd_archive/archive/dar_archive.py | 39 +++++++++++- src/bd_archive/archive/source_scan.py | 25 ++++++++ src/bd_archive/cli.py | 9 +++ src/bd_archive/commands/create.py | 85 ++++++++++++++++++++++++--- src/bd_archive/tools/dar.py | 33 +++++++++++ 5 files changed, 182 insertions(+), 9 deletions(-) diff --git a/src/bd_archive/archive/dar_archive.py b/src/bd_archive/archive/dar_archive.py index c460422..e5a0c09 100644 --- a/src/bd_archive/archive/dar_archive.py +++ b/src/bd_archive/archive/dar_archive.py @@ -1,7 +1,37 @@ +import re from pathlib import Path from bd_archive.tools import dar +# Matches both Phase-2+ generational filenames and legacy ones: +# photos-gen3.0001.dar → ('photos', 3, False) +# photos-gen3-catalog.0001.dar → ('photos', 3, True) +# photos.0001.dar → ('photos', 1, False) [legacy] +# photos-catalog.0001.dar → ('photos', 1, True) [legacy] +# The non-greedy archive-name group keeps `-gen` and `-catalog` +# detection deterministic when the archive name itself contains +# hyphens. +_DAR_FILENAME_RE = re.compile( + r"^(?P.+?)(?:-gen(?P\d+))?(?P-catalog)?\.\d+\.dar$" +) + + +def parse_dar_filename(filename: str) -> tuple[str, int, bool] | None: + """Parse a dar slice or catalog filename. + + Returns ``(archive_name, generation, is_catalog)`` or ``None`` if the + name does not look like a dar slice/catalog file. Generation + defaults to 1 for legacy (pre-Phase-2) filenames that lack the + ``-gen`` segment. 
+ """ + m = _DAR_FILENAME_RE.match(filename) + if not m: + return None + name = m.group("name") + gen = int(m.group("gen")) if m.group("gen") else 1 + is_catalog = m.group("catalog") is not None + return name, gen, is_catalog + class DarArchive: def __init__(self, name: str, work_dir: Path): @@ -27,9 +57,16 @@ def create( compression: str, comp_level: str | None, par2_hook: str | None = None, + ref_catalog: Path | None = None, ): dar.create_sliced( - self.base_path, source, slice_bytes, compression, comp_level, execute_hook=par2_hook + self.base_path, + source, + slice_bytes, + compression, + comp_level, + execute_hook=par2_hook, + ref_catalog=ref_catalog, ) def isolate_catalog(self): diff --git a/src/bd_archive/archive/source_scan.py b/src/bd_archive/archive/source_scan.py index 814260a..7f3ec29 100644 --- a/src/bd_archive/archive/source_scan.py +++ b/src/bd_archive/archive/source_scan.py @@ -31,3 +31,28 @@ def scan_source(source: Path) -> SourceScan: except (OSError, ValueError): catalog += PER_ENTRY + 256 return SourceScan(total_bytes=total, entry_count=count, catalog_est=catalog) + + +def scan_delta_bytes(source: Path, known_paths: set[str], base_mtime: float) -> int: + """Sum sizes of files that are either new or modified vs. a base catalog. + + Approximates the data payload size of an incremental archive for + preview purposes. A file is counted when either its relative path + is not in known_paths (truly new) or its mtime exceeds base_mtime + (likely modified since base). mtime is a heuristic — dar's actual + diff uses ctime/size/hash and may include or exclude slightly + different files; the estimate is good enough for disc-count + planning. 
+ """ + total = 0 + for p in source.rglob("*"): + try: + if not p.is_file() or p.is_symlink(): + continue + rel = p.relative_to(source).as_posix() + st = p.stat() + if rel not in known_paths or st.st_mtime > base_mtime: + total += st.st_size + except (OSError, ValueError): + pass + return total diff --git a/src/bd_archive/cli.py b/src/bd_archive/cli.py index c6ab118..d0de7a4 100644 --- a/src/bd_archive/cli.py +++ b/src/bd_archive/cli.py @@ -58,6 +58,15 @@ def build_parser() -> argparse.ArgumentParser: help="Compression algorithm (default: zstd)", ) cr.add_argument("-l", "--level", help="Compression level") + cr.add_argument( + "--base", + default=None, + help="Path to the isolated catalog of a previous generation " + "(e.g. /-gen-catalog.0001.dar). When set, " + "this run produces an incremental archive (Gen N+1) containing " + "only files new or changed since that catalog. Archive name " + "(-n) must match the predecessor — chain identity is the name.", + ) ratio_group = cr.add_mutually_exclusive_group() ratio_group.add_argument( "--ratio", diff --git a/src/bd_archive/commands/create.py b/src/bd_archive/commands/create.py index 2a5a61a..340dffc 100644 --- a/src/bd_archive/commands/create.py +++ b/src/bd_archive/commands/create.py @@ -1,4 +1,5 @@ import contextlib +import re import shlex import shutil import sys @@ -6,9 +7,9 @@ from bd_archive import __version__ from bd_archive.archive.config import ArchiveConfig, write_readme -from bd_archive.archive.dar_archive import DarArchive +from bd_archive.archive.dar_archive import DarArchive, parse_dar_filename from bd_archive.archive.sizing import compute_slice_bytes, measure_compression_ratio -from bd_archive.archive.source_scan import scan_source +from bd_archive.archive.source_scan import scan_delta_bytes, scan_source from bd_archive.constants import ( DISC_END_MARGIN, ISO9660_LABEL_NAME_MAX, @@ -18,11 +19,47 @@ from bd_archive.shell.deps import check_deps from bd_archive.shell.format import human_bytes from 
bd_archive.tools import mkisofs +from bd_archive.tools.dar import list_catalog_paths from bd_archive.tools.mediainfo import detect_disc_capacity from bd_archive.tools.optical import resolve_device from bd_archive.ui.logger import log from bd_archive.ui.prompts import prompt_yn +# Catalog slice files end in ".NNNN.dar"; strip that to get the dar +# basename suitable for `-A`. dar resolves the actual slice file(s) +# from the basename, so we never hand it the raw filename. +_CATALOG_SLICE_SUFFIX_RE = re.compile(r"\.\d+\.dar$") + + +def _resolve_base(base_arg: str, archive_name: str) -> tuple[Path, int]: + """Validate and unpack a --base argument. + + Returns ``(catalog_basename_path, base_generation)``. Raises + SystemExit with a user-readable error if the path is missing, the + filename doesn't look like a dar catalog slice, or the embedded + archive name disagrees with ``-n``. + """ + base_path = Path(base_arg).resolve() + if not base_path.is_file(): + log.error(f"--base path does not exist: {base_path}") + sys.exit(1) + parsed = parse_dar_filename(base_path.name) + if parsed is None or not parsed[2]: + log.error( + f"--base must point to a dar catalog slice " + f"([-gen]-catalog.NNNN.dar); got '{base_path.name}'" + ) + sys.exit(1) + base_name, base_gen, _ = parsed + if base_name != archive_name: + log.error( + f"--base belongs to archive '{base_name}' but -n is '{archive_name}'. " + f"Chain identity is the archive name; keep it consistent across generations." + ) + sys.exit(1) + base_stem = _CATALOG_SLICE_SUFFIX_RE.sub("", base_path.name) + return base_path.parent / base_stem, base_gen + def cmd_create(args): check_deps("dar", "par2", "mkisofs", "dvd+rw-mediainfo") @@ -45,6 +82,15 @@ def cmd_create(args): f"('{args.name[:ISO9660_LABEL_NAME_MAX]}'). Filenames on disc keep the full name." ) + # --base: parse and validate. Sets `ref_catalog` (dar -A argument) + # and `generation` (current run's gen number = base_gen + 1). 
+ ref_catalog: Path | None = None + generation = 1 + if args.base is not None: + ref_catalog, base_gen = _resolve_base(args.base, args.name) + generation = base_gen + 1 + log.info(f"Incremental against: {ref_catalog.name} (Gen {base_gen}) → new Gen {generation}") + source = Path(args.source).resolve() if not source.is_dir(): log.error(f"Does not exist: {source}") @@ -107,7 +153,21 @@ def cmd_create(args): ratio = 1.0 ratio_source = "default (no compression assumed)" - archive_est = int(scan.total_bytes * ratio) + # For an incremental, the data payload is only new/changed files; + # estimating against the full source overstates disc count and + # last-disc fill. Re-scan the source against the base catalog to + # get a delta-aware payload size. mtime is a heuristic — see + # scan_delta_bytes for why it's good enough for previews. + if ref_catalog is not None: + base_paths = list_catalog_paths(ref_catalog) + # Stat the user-supplied catalog slice file directly — its mtime + # is the timestamp dar wrote the catalog at, which we use as the + # cutoff for "modified since base". + base_mtime = Path(args.base).resolve().stat().st_mtime + delta_bytes = scan_delta_bytes(source, base_paths, base_mtime) + archive_est = int(delta_bytes * ratio) + else: + archive_est = int(scan.total_bytes * ratio) n_discs = max(1, (archive_est + slice_bytes - 1) // slice_bytes) last_slice = archive_est - (n_discs - 1) * slice_bytes if last_slice == 0: @@ -119,15 +179,13 @@ def cmd_create(args): last_disc_free_raw = int(last_disc_free / max(ratio, 0.001)) par2_est = slice_bytes * args.redundancy // 100 - # Phase 2: every archive starts as Gen 1. Phase 3 lets `--base` - # derive higher generation numbers from a predecessor catalog. 
cfg = ArchiveConfig( name=args.name, disc_bytes=raw_capacity, redundancy=args.redundancy, compression=args.compression, comp_level=args.level, - generation=1, + generation=generation, ) log.step("Source") @@ -140,7 +198,8 @@ def cmd_create(args): log.info(f"Slice size: {human_bytes(slice_bytes)}") log.info(f"PAR2 redundancy: {cfg.redundancy}% (~{human_bytes(par2_est)})") log.info(f"Compression: {cfg.comp_str} (ratio {ratio:.3f}, {ratio_source})") - log.info(f"Estimated archive: {human_bytes(archive_est)}") + archive_kind = "delta vs base" if ref_catalog is not None else "full source" + log.info(f"Estimated archive: {human_bytes(archive_est)} ({archive_kind})") log.step("Estimate") fill_pct = last_disc_content * 100 // sizing_target @@ -157,6 +216,9 @@ def cmd_create(args): log.info(f"Source: {source}") log.info(f"Output: {output_dir}") log.info(f"Workdir: {work_dir}{' (default)' if workdir_is_default else ' (custom)'}") + log.info(f"Generation: {cfg.generation} ({'incremental' if ref_catalog else 'full'})") + if ref_catalog is not None: + log.info(f"Base catalog: {args.base}") if not args.yes and not prompt_yn("Proceed with creation?"): log.warn("Cancelled by user") @@ -184,7 +246,14 @@ def cmd_create(args): par2_hook = ( f'{shlex.quote(sys.executable)} -m bd_archive._par2_helper "%p" "%b" %N {cfg.redundancy}' ) - dar_archive.create(source, slice_bytes, cfg.compression, cfg.comp_level, par2_hook=par2_hook) + dar_archive.create( + source, + slice_bytes, + cfg.compression, + cfg.comp_level, + par2_hook=par2_hook, + ref_catalog=ref_catalog, + ) slices = dar_archive.slices slice_count = len(slices) diff --git a/src/bd_archive/tools/dar.py b/src/bd_archive/tools/dar.py index 303c624..6d16b07 100644 --- a/src/bd_archive/tools/dar.py +++ b/src/bd_archive/tools/dar.py @@ -20,6 +20,7 @@ def create_sliced( compression: str, comp_level: str | None, execute_hook: str | None = None, + ref_catalog: Path | None = None, ): """Create a sliced dar archive with sha512 hashes. 
@@ -27,6 +28,12 @@ def create_sliced( been completed (verified against dar 2.7.17). This is used by cmd_create to run par2 on each slice while its bytes are still in the OS page cache. + + If ref_catalog is set, dar runs in incremental mode (`-A `): + only files new or changed relative to that reference catalog are + archived. Pass the basename of the catalog without the + ``.NNNN.dar`` suffix (dar accepts the catalog basename and finds + the slice files itself). """ cmd = [ "dar", @@ -47,11 +54,37 @@ def create_sliced( if comp_level: flag += f":{comp_level}" cmd += [flag, "-am"] + if ref_catalog is not None: + cmd += ["-A", str(ref_catalog)] if execute_hook is not None: cmd += ["-E", execute_hook] run(cmd, label="dar") +def list_catalog_paths(catalog_base: Path) -> set[str]: + """Return the set of relative paths stored in a dar catalog. + + Runs ``dar -l -as`` and parses the listing. dar's + entry lines use tab separators between the user, group, size, date, + and filename columns — the filename is always the last tab-separated + field. Header and separator lines lack tabs entirely, so the + "contains a tab" filter is sufficient to discard them. + + Directories are included; the consumer (auto-defer pool filter) + treats the set as "anything dar already knows about", which keeps + the filter conservative. 
+ """ + r = run(["dar", "-l", str(catalog_base), "-as", "-Q"], capture=True, check=True) + paths: set[str] = set() + for line in r.stdout.splitlines(): + if "\t" not in line: + continue + path = line.split("\t")[-1].rstrip() + if path: + paths.add(path) + return paths + + def isolate_catalog(base_path: Path): """Isolate the catalog into a separate dar archive with sha512 hashes.""" run( From 180d9ed76561ec4935db4b089bd894987aa879bf Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 18:27:10 +0200 Subject: [PATCH 3/7] feat(create): --min-last-disc-fill auto-defers newest files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 of incremental-archives. When the projected last-disc fill is below `--min-last-disc-fill PERCENT`, bd-archive automatically defers the newest files until enough has been removed to either drop a disc from the set or empty the candidate pool. Pool selection is deliberately conservative: - With `--base`: only files whose relative path is not in the base catalog (truly new). Determined via `dar -l ` parse, so files that have merely had their mtime touched on disk stay in the archive (no silent loss across generations). - Without `--base` (Full): all files are candidates, with a loud warning that deferred files won't be archived until a future incremental run picks them up. The preview block now shows what would be deferred (file count, byte count, oldest mtime, sample paths) BEFORE the confirm prompt, so the user can abort if the plan looks wrong. When the threshold is unreachable (entire pool deferred without ever crossing the fill threshold), the run still proceeds with the partial deferral — the user gets a warning, not an abort. The only fatal case is "deferring everything would archive zero bytes", which exits 1. 
archive/source_scan.py grows a SourceFile dataclass and list_source_files() walker — separate from scan_source's aggregate view because the defer algorithm needs per-file rel_path/size/mtime. tools/dar.py::create_sliced grows an `excludes` parameter that turns each entry into a `-P ` flag, with dar -P being the relative-subpath exclude operator. Manual e2e: Gen 1 of 50 MiB, then 60 MiB delta. Without --min-last- disc-fill: 2 discs, last disc 46%. With --min-last-disc-fill 50: 20 MiB deferred (2 files), single disc, last fill 94%. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bd_archive/archive/dar_archive.py | 2 + src/bd_archive/archive/source_scan.py | 29 ++++++++ src/bd_archive/cli.py | 11 +++ src/bd_archive/commands/create.py | 103 +++++++++++++++++++++++--- src/bd_archive/tools/dar.py | 8 ++ 5 files changed, 144 insertions(+), 9 deletions(-) diff --git a/src/bd_archive/archive/dar_archive.py b/src/bd_archive/archive/dar_archive.py index e5a0c09..23040d8 100644 --- a/src/bd_archive/archive/dar_archive.py +++ b/src/bd_archive/archive/dar_archive.py @@ -58,6 +58,7 @@ def create( comp_level: str | None, par2_hook: str | None = None, ref_catalog: Path | None = None, + excludes: list[str] | None = None, ): dar.create_sliced( self.base_path, @@ -67,6 +68,7 @@ def create( comp_level, execute_hook=par2_hook, ref_catalog=ref_catalog, + excludes=excludes, ) def isolate_catalog(self): diff --git a/src/bd_archive/archive/source_scan.py b/src/bd_archive/archive/source_scan.py index 7f3ec29..fbde050 100644 --- a/src/bd_archive/archive/source_scan.py +++ b/src/bd_archive/archive/source_scan.py @@ -9,6 +9,35 @@ class SourceScan: catalog_est: int # estimated isolated dar catalog size +@dataclass(frozen=True) +class SourceFile: + """Per-file metadata used by the auto-defer pool.""" + + rel_path: str # POSIX-style relative path from source root (matches dar) + size: int + mtime: float + + +def list_source_files(source: Path) -> list[SourceFile]: + """Walk source, return 
regular-file entries with size + mtime. + + Used by the auto-defer pool builder. Skips directories, symlinks, + and anything we can't stat. The rel_path uses POSIX separators so + it compares directly against dar's catalog path listing. + """ + files: list[SourceFile] = [] + for p in source.rglob("*"): + try: + if not p.is_file() or p.is_symlink(): + continue + rel = p.relative_to(source).as_posix() + st = p.stat() + files.append(SourceFile(rel_path=rel, size=st.st_size, mtime=st.st_mtime)) + except (OSError, ValueError): + pass + return files + + def scan_source(source: Path) -> SourceScan: """Walk source once; return size, entry count, and catalog estimate. diff --git a/src/bd_archive/cli.py b/src/bd_archive/cli.py index d0de7a4..cbffcdd 100644 --- a/src/bd_archive/cli.py +++ b/src/bd_archive/cli.py @@ -67,6 +67,17 @@ def build_parser() -> argparse.ArgumentParser: "only files new or changed since that catalog. Archive name " "(-n) must match the predecessor — chain identity is the name.", ) + cr.add_argument( + "--min-last-disc-fill", + type=int, + default=0, + metavar="PERCENT", + help="Auto-defer newest files until the last disc of the set is " + "at least PERCENT full (0-100). With --base, defers only files " + "not already in the base catalog. Without --base (full archive), " + "defers any files — and they will NOT be archived until a future " + "incremental run picks them up. 
Default 0 = no deferral.", + ) ratio_group = cr.add_mutually_exclusive_group() ratio_group.add_argument( "--ratio", diff --git a/src/bd_archive/commands/create.py b/src/bd_archive/commands/create.py index 340dffc..fe851f5 100644 --- a/src/bd_archive/commands/create.py +++ b/src/bd_archive/commands/create.py @@ -9,7 +9,12 @@ from bd_archive.archive.config import ArchiveConfig, write_readme from bd_archive.archive.dar_archive import DarArchive, parse_dar_filename from bd_archive.archive.sizing import compute_slice_bytes, measure_compression_ratio -from bd_archive.archive.source_scan import scan_delta_bytes, scan_source +from bd_archive.archive.source_scan import ( + SourceFile, + list_source_files, + scan_delta_bytes, + scan_source, +) from bd_archive.constants import ( DISC_END_MARGIN, ISO9660_LABEL_NAME_MAX, @@ -158,6 +163,7 @@ def cmd_create(args): # last-disc fill. Re-scan the source against the base catalog to # get a delta-aware payload size. mtime is a heuristic — see # scan_delta_bytes for why it's good enough for previews. 
+ base_paths: set[str] = set() if ref_catalog is not None: base_paths = list_catalog_paths(ref_catalog) # Stat the user-supplied catalog slice file directly — its mtime @@ -168,13 +174,77 @@ def cmd_create(args): archive_est = int(delta_bytes * ratio) else: archive_est = int(scan.total_bytes * ratio) - n_discs = max(1, (archive_est + slice_bytes - 1) // slice_bytes) - last_slice = archive_est - (n_discs - 1) * slice_bytes - if last_slice == 0: - last_slice = slice_bytes - last_disc_content = ( - last_slice + last_slice * args.redundancy // 100 + scan.catalog_est + PAR2_AND_MISC_OVERHEAD - ) + + def _layout(est: int) -> tuple[int, int, int]: + """(n_discs, last_disc_content, last_fill_pct) for a given archive size.""" + n = max(1, (est + slice_bytes - 1) // slice_bytes) + last_sl = est - (n - 1) * slice_bytes + if last_sl == 0: + last_sl = slice_bytes + last_content = ( + last_sl + last_sl * args.redundancy // 100 + + scan.catalog_est + PAR2_AND_MISC_OVERHEAD + ) + return n, last_content, last_content * 100 // sizing_target + + n_discs, last_disc_content, fill_pct = _layout(archive_est) + + # ── Auto-defer (--min-last-disc-fill) ─────────────────────────────── + # When the last disc would be too empty, push newest files to a + # future generation so this set "rounds down" to fewer discs with + # higher fill. Pool is "files truly new vs. base catalog" when + # incremental, "all files" when full (with warning — those files + # won't be archived anywhere until a later incremental run picks + # them up). 
+ deferred_files: list[SourceFile] = [] + if args.min_last_disc_fill > 0 and fill_pct < args.min_last_disc_fill: + if ref_catalog is not None: + pool = [f for f in list_source_files(source) if f.rel_path not in base_paths] + pool_kind = "files not in base catalog" + else: + pool = list_source_files(source) + pool_kind = "all source files" + log.warn( + "--min-last-disc-fill on a Full archive defers files that will " + "NOT be archived until a future incremental run picks them up." + ) + pool.sort(key=lambda f: f.mtime, reverse=True) + + cum_size = 0 + reached = False + for f in pool: + cum_size += f.size + new_est = max(0, archive_est - int(cum_size * ratio)) + new_n, new_last, new_fill = _layout(new_est) if new_est > 0 else (0, 0, 0) + deferred_files.append(f) + if new_est == 0: + # Pool would empty the archive entirely — stop here. + break + if new_fill >= args.min_last_disc_fill: + archive_est, n_discs, last_disc_content, fill_pct = ( + new_est, new_n, new_last, new_fill + ) + reached = True + break + + if not reached: + log.warn( + f"--min-last-disc-fill {args.min_last_disc_fill}% not reachable; " + f"pool ({len(pool)} candidate file(s), {pool_kind}) exhausted " + f"after deferring {human_bytes(cum_size)}. Proceeding with " + f"what we have." + ) + if archive_est - int(cum_size * ratio) > 0: + archive_est, n_discs, last_disc_content, fill_pct = ( + archive_est - int(cum_size * ratio), new_n, new_last, new_fill + ) + else: + log.error( + "Deferring all candidates would leave 0 bytes to archive. " + "Lower --min-last-disc-fill or skip the run." 
+ ) + sys.exit(1) + last_disc_free = max(0, sizing_target - last_disc_content) last_disc_free_raw = int(last_disc_free / max(ratio, 0.001)) @@ -202,7 +272,6 @@ def cmd_create(args): log.info(f"Estimated archive: {human_bytes(archive_est)} ({archive_kind})") log.step("Estimate") - fill_pct = last_disc_content * 100 // sizing_target log.info(f"Discs needed: {n_discs}") log.info( f"Last disc fill: {human_bytes(last_disc_content)} / " @@ -212,6 +281,21 @@ def cmd_create(args): if abs(ratio - 1.0) > 0.001: log.info(f" ~{human_bytes(last_disc_free_raw)} raw (at ratio {ratio:.3f})") + if deferred_files: + defer_bytes = sum(f.size for f in deferred_files) + oldest_deferred = min(f.mtime for f in deferred_files) + from datetime import datetime as _dt + + log.step(f"Auto-defer (--min-last-disc-fill {args.min_last_disc_fill}%)") + log.info(f"Files deferred: {len(deferred_files)}") + log.info(f"Bytes deferred: {human_bytes(defer_bytes)} (raw)") + log.info(f"Oldest deferred: mtime {_dt.fromtimestamp(oldest_deferred):%Y-%m-%d %H:%M}") + sample = deferred_files[:3] + for f in sample: + log.info(f" - {f.rel_path}") + if len(deferred_files) > len(sample): + log.info(f" - ... and {len(deferred_files) - len(sample)} more") + log.step("Configuration") log.info(f"Source: {source}") log.info(f"Output: {output_dir}") @@ -253,6 +337,7 @@ def cmd_create(args): cfg.comp_level, par2_hook=par2_hook, ref_catalog=ref_catalog, + excludes=[f.rel_path for f in deferred_files] if deferred_files else None, ) slices = dar_archive.slices diff --git a/src/bd_archive/tools/dar.py b/src/bd_archive/tools/dar.py index 6d16b07..1769d8c 100644 --- a/src/bd_archive/tools/dar.py +++ b/src/bd_archive/tools/dar.py @@ -21,6 +21,7 @@ def create_sliced( comp_level: str | None, execute_hook: str | None = None, ref_catalog: Path | None = None, + excludes: list[str] | None = None, ): """Create a sliced dar archive with sha512 hashes. @@ -34,6 +35,10 @@ def create_sliced( archived. 
Pass the basename of the catalog without the ``.NNNN.dar`` suffix (dar accepts the catalog basename and finds the slice files itself). + + If excludes is set, each entry is passed to dar as ``-P <path>``, + excluding that exact relative subpath from the archive. Used by + auto-defer to push specific files to a later generation. """ cmd = [ "dar", @@ -56,6 +61,9 @@ def create_sliced( cmd += [flag, "-am"] if ref_catalog is not None: cmd += ["-A", str(ref_catalog)] + if excludes: + for path in excludes: + cmd += ["-P", path] if execute_hook is not None: cmd += ["-E", execute_hook] run(cmd, label="dar") From df3c8d9172d740e70392039ade14aff70d270440 Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 18:31:00 +0200 Subject: [PATCH 4/7] feat(extract): chain-aware whole-restore in one run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5 of incremental-archives. `bd-archive extract` now restores an entire incremental chain (Gen 1 + all subsequent gens) in a single invocation. Previously it could only restore one archive set. User flow: - User runs `bd-archive extract -o ./restored`. - Tool prompts for discs one at a time. Each disc's filenames are parsed (via archive.dar_archive.parse_dar_filename) to detect the chain name and which generation that disc belongs to. Order doesn't matter; discs from any gen, any order, all accepted. - All slices land in one flat staging dir. Different generations have different dar basenames (photos-gen1, photos-gen2, …), so they coexist without collision. - When the user says "no more discs", the tool runs `dar -x` once per generation in order. The first gen extracts into the clean output; later gens use dar's -wa flag to overwrite files that earlier gens already wrote (later gens carry the newer content). 
The current archive_name variable is replaced with two pieces of state: chain_name (the -n value, identical across all gens) and a gen→dar_basename mapping (because legacy pre-Phase-2 gen 1 archives have basename "photos" while new ones have "photos-gen1"). Per-generation catalog verification: each gen has its own catalog file with a different basename, so the "verified" flag is now a dict keyed by gen number rather than a single bool. A catalog slice that fails its sha512 check is dropped from staging so the next disc of the same gen can re-supply it — same convergence logic as before, just generation-scoped. The damage path (par2 repair) is unchanged in mechanics. tools/dar.py::extract_sequential grows an `overwrite` parameter that toggles dar's `-wa` flag. Required for chain extracts where gen N's data replaces gen N-1's; no effect on the first gen which extracts into an empty output dir. Smoketest: built a 2-gen chain (5 original files + 1 sub-dir file, then 1 modified + 1 new in gen 2), invoked extract_sequential for each gen in order against staged slices. diff -rq between source and restored output: byte-identical, no differences. Disc-mounting flow (prompt/mount/copy/verify) is preserved from the previous implementation; refactored to track per-gen state but the per-disc UX is the same. A full e2e against a real optical drive remains a manual user verification step. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bd_archive/commands/extract.py | 219 +++++++++++++++++------------ src/bd_archive/tools/dar.py | 8 ++ 2 files changed, 138 insertions(+), 89 deletions(-) diff --git a/src/bd_archive/commands/extract.py b/src/bd_archive/commands/extract.py index 93fbba3..a776ea6 100644 --- a/src/bd_archive/commands/extract.py +++ b/src/bd_archive/commands/extract.py @@ -1,10 +1,12 @@ import contextlib +import re import shutil import sys import tempfile from pathlib import Path from bd_archive.archive.checksums import verify_slice +from bd_archive.archive.dar_archive import parse_dar_filename from bd_archive.archive.disc import DiscIO from bd_archive.shell.deps import check_deps from bd_archive.shell.format import human_bytes @@ -15,6 +17,17 @@ from bd_archive.ui.progress import Progress, copy_with_progress from bd_archive.ui.prompts import prompt_disc, prompt_yn +# A dar slice or catalog filename ends in ".NNNN.dar"; stripping that +# off yields the dar archive basename (e.g. "photos-gen1" or, on legacy +# pre-Phase-2 archives, just "photos"). That basename is what dar -x +# wants as input, and what we use to group files by generation in +# staging. +_SLICE_SUFFIX_RE = re.compile(r"\.\d+\.dar$") + + +def _dar_basename(filename: str) -> str: + return _SLICE_SUFFIX_RE.sub("", filename) + def _mount_with_prompt(dio: DiscIO, mount_dir: Path, prompt_msg: str) -> Path | None: while True: @@ -28,23 +41,29 @@ def _mount_with_prompt(dio: DiscIO, mount_dir: Path, prompt_msg: str) -> Path | def _copy_disc_data( - mounted: Path, archive_name: str, staging: Path, catalog_verified: bool + mounted: Path, disc_basename: str, staging: Path, catalog_verified: bool ) -> list[Path]: - """Copy slices + sha512 sidecars (and catalog if not yet verified) from - disc to staging. par2 files are NOT copied — fetched lazily on damage. 
- Returns list of slice paths in staging for this disc.""" + """Copy slices + sha512 sidecars (and the catalog of this disc's + generation, if not yet verified) from disc to staging. par2 files + are NOT copied — fetched lazily on damage. + + Returns the list of slice paths in staging that came from this disc. + """ + catalog_basename = f"{disc_basename}-catalog" if not catalog_verified: - for cat in mounted.glob(f"{archive_name}-catalog.*.dar"): + for cat in mounted.glob(f"{catalog_basename}.*.dar"): dest = staging / cat.name if not dest.exists(): shutil.copy2(cat, dest) - for cat_hash in mounted.glob(f"{archive_name}-catalog.*.dar.sha512"): + for cat_hash in mounted.glob(f"{catalog_basename}.*.dar.sha512"): dest = staging / cat_hash.name if not dest.exists(): shutil.copy2(cat_hash, dest) slices = sorted( - p for p in mounted.glob(f"{archive_name}.[0-9]*.dar") if "-catalog" not in p.name + p + for p in mounted.glob(f"{disc_basename}.[0-9]*.dar") + if "-catalog" not in p.name ) copied: list[Path] = [] for sp in slices: @@ -61,17 +80,16 @@ def _copy_disc_data( return copied -def _verify_catalog_on_staging(staging: Path, archive_name: str) -> bool: - """Verify every catalog slice currently in staging. Drop any that - fail sha512. Return True only when all present slices verified. +def _verify_catalog_on_staging(staging: Path, catalog_basename: str) -> bool: + """Verify every catalog slice currently in staging for one generation. + Drop any that fail sha512 so the next disc carrying them can refetch. - Iterates all slices (no early return) so multi-slice catalogs with - multiple failures get every corrupt slice flagged + deleted in a - single pass. The next disc's _copy_disc_data re-fetches anything - missing, so the loop converges in fewer disc-iterations than the - naive 'stop at first failure' variant. 
+ Returns True only when every present slice verified — a single pass + flags every corrupt slice (no early return), so multi-slice catalogs + converge in one fewer disc-iteration than a 'stop at first failure' + variant would. """ - catalog_files = sorted(staging.glob(f"{archive_name}-catalog.*.dar")) + catalog_files = sorted(staging.glob(f"{catalog_basename}.*.dar")) if not catalog_files: return False all_ok = True @@ -147,16 +165,18 @@ def cmd_extract(args): log.info(f"Device: {device}") log.info(f"Output: {output_dir}") log.info(f"Staging: {staging}") - - archive_name: str | None = None - catalog_verified = False - disc_num = 0 - # Slices that sha512 + par2 both failed on. Files coming from them - # may end up corrupt in the output — we collect this so the final - # corrupted-files.txt explains which disc to blame even when dar's - # per-file error parser couldn't pinpoint individual files (e.g. - # archive-metadata corruption). + log.info("Insert discs from any generation, in any order. The tool") + log.info("detects generations from filenames and extracts the chain") + log.info("in order at the end.") + + # Per-generation state. Catalog verification and dar basename live + # under each gen because the chain may mix legacy (gen 1 without + # -gen suffix) and new-format generations. + chain_name: str | None = None + catalogs_verified: dict[int, bool] = {} + gen_basenames: dict[int, str] = {} unrepairable_slices: list[str] = [] + disc_num = 0 while True: target = disc_num + 1 @@ -169,19 +189,37 @@ def cmd_extract(args): sys.exit(1) try: - if archive_name is None: - dar_files = [p for p in mounted.glob("*.dar") if "-catalog" not in p.name] - if not dar_files: - log.error("No dar files found on disc — try another") - continue - archive_name = dar_files[0].stem.rsplit(".", 1)[0] - log.info(f"Archive detected: {archive_name}") + # Detect chain name + generation from any slice filename. 
+ dar_files = [p for p in mounted.glob("*.dar") if "-catalog" not in p.name] + if not dar_files: + log.error("No dar files found on disc — try another") + continue + parsed = parse_dar_filename(dar_files[0].name) + if parsed is None: + log.error(f"Unrecognised dar filename: {dar_files[0].name}") + continue + disc_name, disc_gen, _ = parsed + disc_basename = _dar_basename(dar_files[0].name) + + if chain_name is None: + chain_name = disc_name + log.info(f"Chain: {chain_name}") + elif disc_name != chain_name: + log.error( + f"Disc belongs to chain '{disc_name}', but this run is for " + f"chain '{chain_name}'. Eject and insert a matching disc." + ) + continue + + log.info(f"Disc {target}: Gen {disc_gen} ({disc_basename})") + gen_basenames.setdefault(disc_gen, disc_basename) + catalog_verified = catalogs_verified.get(disc_gen, False) disc_num = target # ── 2. Copy data (no par2) ──────────────────────────────────── log.info(f"Copying disc {disc_num}...") - copied = _copy_disc_data(mounted, archive_name, staging, catalog_verified) + copied = _copy_disc_data(mounted, disc_basename, staging, catalog_verified) log.ok(f" {len(copied)} slice(s) staged") finally: dio.umount(mounted) @@ -189,11 +227,11 @@ def cmd_extract(args): mount_dir.rmdir() dio.eject() - # ── 3. Verify catalog (only first time it lands intact) ────────── - if not catalog_verified: - log.info("Verifying catalog on staging...") - if _verify_catalog_on_staging(staging, archive_name): - catalog_verified = True + # ── 3. Verify catalog for this generation (first time it lands) ─── + if not catalogs_verified.get(disc_gen, False): + log.info(f"Verifying Gen {disc_gen} catalog on staging...") + if _verify_catalog_on_staging(staging, f"{disc_basename}-catalog"): + catalogs_verified[disc_gen] = True # ── 4. 
Verify slices on staging via sha512 ─────────────────────── log.info(f"Verifying disc {disc_num} slices on staging...") @@ -222,9 +260,8 @@ def cmd_extract(args): continue log.error(f" {sp.name}: unrecoverable damage") log.warn( - f" {sp.name}: keeping as-is — files from " - f"this slice may be corrupt; will be listed " - f"in corrupted-files.txt" + f" {sp.name}: keeping as-is — files from this slice may " + f"be corrupt; will be listed in corrupted-files.txt" ) unrepairable_slices.append(sp.name) finally: @@ -234,56 +271,61 @@ def cmd_extract(args): dio.eject() _cleanup_par2(staging) - collected = sorted(staging.glob(f"{archive_name}.[0-9]*.dar")) - collected = [c for c in collected if "-catalog" not in c.name] - log.info(f"Collected: {len(collected)} slice(s)") + # Report current chain collection state. + gens_collected = sorted(gen_basenames) + log.info(f"Chain so far: Gen {gens_collected} ({disc_num} disc(s) total)") if not prompt_yn("Insert another disc?"): break - # ── Extract ───────────────────────────────────────────────────────── - log.step("Extracting archive") - collected = [ - c for c in sorted(staging.glob(f"{archive_name}.[0-9]*.dar")) if "-catalog" not in c.name - ] - log.info(f"Slices: {len(collected)}") - log.info(f"Output: {output_dir}") - - dar_base = staging / archive_name - catalog_base = staging / f"{archive_name}-catalog" - has_catalog = any(staging.glob(f"{archive_name}-catalog.*.dar")) - - rc, corrupted_files = dar.extract_sequential( - dar_base, - output_dir, - catalog_base=catalog_base if has_catalog else None, - ) + if chain_name is None: + log.error("No discs processed") + sys.exit(1) - if rc == 0 and not corrupted_files and not unrepairable_slices: + # ── Extract: one dar -x per generation in order ────────────────────── + log.step("Extracting archive chain") + sorted_gens = sorted(gen_basenames) + log.info(f"Chain: {chain_name}") + log.info(f"Generations: {sorted_gens}") + + all_corrupted: list[str] = [] + for i, gen in 
enumerate(sorted_gens): + basename = gen_basenames[gen] + log.info(f"Gen {gen}: dar -x {basename}") + catalog_basename = f"{basename}-catalog" + has_catalog = any(staging.glob(f"{catalog_basename}.*.dar")) + # Subsequent generations must overwrite earlier ones (later gens + # carry the newer file contents). The first generation extracted + # in this run has nothing earlier to replace, so overwrite is + # enabled only for the generations after it (i > 0). + rc, corrupted = dar.extract_sequential( + staging / basename, + output_dir, + catalog_base=staging / catalog_basename if has_catalog else None, + overwrite=i > 0, + ) + all_corrupted.extend(corrupted) + if rc != 0: + log.error(f"Gen {gen} dar extract failed (exit {rc})") + log.info(f"Slices remain in: {staging}") + log.info( + f"Manual retry: dar -x {staging / basename} -R {output_dir} --sequential-read -wa" + ) + sys.exit(1) + + if not all_corrupted and not unrepairable_slices: log.ok("Extraction complete!") - elif rc == 0: - # dar exited cleanly but reported per-file CRC errors and/or - # we already know slices were unrepairable. Tell the user. + else: log.warn( f"Extraction finished with corruption: " - f"{len(corrupted_files)} file(s) reported by dar, " + f"{len(all_corrupted)} file(s) reported by dar, " f"{len(unrepairable_slices)} slice(s) unrepairable" ) - else: - log.error(f"dar extraction failed (exit {rc})") - log.info(f"Slices are in: {staging}") - if has_catalog: - log.info( - f"Retry without rescue catalog: dar -x {dar_base} -R {output_dir} --sequential-read" - ) - else: - log.info(f"Manual: dar -x {dar_base} -R {output_dir} --sequential-read") - sys.exit(1) # Write corrupted-files.txt manifest into output_dir (NOT into the # workdir, which may be auto-cleaned) when anything went sideways. 
manifest_path: Path | None = None - if corrupted_files or unrepairable_slices: + if all_corrupted or unrepairable_slices: manifest_path = output_dir / "corrupted-files.txt" lines = [ "# bd-archive: corrupted-files manifest", @@ -293,9 +335,9 @@ def cmd_extract(args): "# them with intact data if the par2 recovery succeeds.", "", ] - if corrupted_files: - lines.append(f"## {len(corrupted_files)} file(s) reported by dar with bad CRC:") - for fp in corrupted_files: + if all_corrupted: + lines.append(f"## {len(all_corrupted)} file(s) reported by dar with bad CRC:") + for fp in all_corrupted: try: rel = str(Path(fp).resolve().relative_to(output_dir.resolve())) except ValueError: @@ -325,19 +367,18 @@ def cmd_extract(args): shutil.rmtree(work_dir, ignore_errors=True) log.step("Restore complete") - print(f"\n Archive: {archive_name}") - print(f" Slices: {len(collected)}") - print(f" Discs: {disc_num}") - print(f" Output: {output_dir}") - print(f" Size: {human_bytes(total)}") + print(f"\n Chain: {chain_name}") + print(f" Generations: {sorted_gens}") + print(f" Discs: {disc_num}") + print(f" Output: {output_dir}") + print(f" Size: {human_bytes(total)}") if manifest_path is not None: - print(f" CORRUPT: {manifest_path}") + print(f" CORRUPT: {manifest_path}") if not workdir_is_default: print(f"\n Cleanup staging: rm -rf {work_dir}") print() # Non-zero exit when corruption was detected so scripts know the - # restore was not fully clean. The output is still useful (best- - # effort restore), but callers should consult corrupted-files.txt. - if corrupted_files or unrepairable_slices: + # restore was not fully clean. 
+ if all_corrupted or unrepairable_slices: sys.exit(1) diff --git a/src/bd_archive/tools/dar.py b/src/bd_archive/tools/dar.py index 1769d8c..75b1b28 100644 --- a/src/bd_archive/tools/dar.py +++ b/src/bd_archive/tools/dar.py @@ -128,6 +128,7 @@ def extract_sequential( base_path: Path, output_dir: Path, catalog_base: Path | None = None, + overwrite: bool = False, ) -> tuple[int, list[str]]: """Extract a dar archive with --sequential-read. @@ -137,6 +138,11 @@ def extract_sequential( With a complete slice set, no prompts fire and the ESC stream goes unused. + Set overwrite=True to make dar replace existing files without + prompting (`-wa`). Required when extracting an incremental on + top of a previously-extracted generation, where later gens + update files that earlier gens already restored. + Returns (exit_code, corrupted_files). corrupted_files contains the paths dar reported as "Bad CRC" during extract — these files were (partially) written to output and need attention. @@ -144,6 +150,8 @@ def extract_sequential( the caller must check this list, not just the exit code. """ cmd = ["dar", "-x", str(base_path), "-R", str(output_dir), "-O", "--sequential-read"] + if overwrite: + cmd.append("-wa") if catalog_base is not None: # -A uses the isolated catalog as rescue source — handles # corruption of the in-archive catalog (PAR2 covers slice From a251eef797259b861f997b0f01de0bf668dd215e Mon Sep 17 00:00:00 2001 From: Xitee1 <59659167+Xitee1@users.noreply.github.com> Date: Tue, 12 May 2026 18:34:18 +0200 Subject: [PATCH 5/7] docs: README + AGENTS sync with incremental-archives feature set README gains an "Adding an incremental generation" section with the --base workflow and --min-last-disc-fill explanation, plus updates the extract section to describe whole-chain restore. Adds a "Chain identity = archive name" callout near the top to explain the discipline of keeping -n constant across generations. 
AGENTS.md create / extract architecture descriptions are rewritten to cover the new naming scheme, --base flow, list_catalog_paths / scan_delta_bytes, auto-defer pool semantics, per-gen catalog state, and dar -x -wa chain restore. Layout section notes the new constants (ISO9660_LABEL_NAME_MAX / _SUFFIX_LEN), new helpers (parse_dar_filename, list_source_files / SourceFile, scan_delta_bytes), and the extended dar wrapper surface. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 18 +++++++++-------- README.md | 59 ++++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index b0c0ce2..baa784e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -19,7 +19,7 @@ PYTHONPATH=src python3 -m bd_archive ... ``` ```bash -bd-archive create -s -n -o [-w ] [-D /dev/srN] [-b BYTES] [-r %] [-c zstd|lzma|...] [-l ] [--ratio | --sample ] [-y] +bd-archive create -s -n -o [-w ] [-D /dev/srN] [-b BYTES] [-r %] [-c zstd|lzma|...] 
[-l ] [--ratio | --sample ] [--base ] [--min-last-disc-fill PERCENT] [-y] bd-archive burn -i [-D /dev/srN] [--start N] [--no-verify] [--skip-fit-check] [-S ] bd-archive verify [] bd-archive extract -o [-D /dev/srN] [-w ] @@ -47,11 +47,11 @@ src/bd_archive/ ├── __main__.py # entry point for `python -m bd_archive` ├── _par2_helper.py # dar -E hook: invoked as `python -m bd_archive._par2_helper ...` ├── cli.py # argparse + dispatch + top-level exception handling (uniform cancel/error output) -├── constants.py # MiB, DISC_OVERSIZE_TOLERANCE, PAR2_AND_MISC_OVERHEAD, DISC_END_MARGIN, POST_BURN_MOUNT_TIMEOUT, ISO9660_VOLUME_LABEL_MAX, PAR2_RECOVERY_RE +├── constants.py # MiB, DISC_OVERSIZE_TOLERANCE, PAR2_AND_MISC_OVERHEAD, DISC_END_MARGIN, POST_BURN_MOUNT_TIMEOUT, ISO9660_VOLUME_LABEL_MAX, ISO9660_LABEL_NAME_MAX, ISO9660_LABEL_SUFFIX_LEN, PAR2_RECOVERY_RE ├── ui/ # logger, prompts (interactive), progress (byte-counted, TTY-aware) ├── shell/ # runner.py: run() (+ SIGINT handling); deps.py: check_deps(); format.py: human_bytes() ├── tools/ # one thin wrapper per external CLI -│ ├── dar.py # dar create_sliced/isolate_catalog/compress/extract_sequential (Bad-CRC parser) +│ ├── dar.py # dar create_sliced (incl. 
-A ref, -P excludes, -E hook) / isolate_catalog / compress / extract_sequential (-wa overwrite for chain restore, Bad-CRC parser) / list_catalog_paths (`dar -l` parse) │ ├── par2.py # par2 create/verify/repair (+ VerifyResult, is_par2_index) │ ├── mkisofs.py # ISO9660+UDF image build (`-iso-level 3 -udf -V -publisher -input-charset utf-8 -graft-points`) │ ├── growisofs.py # burn (+ DeviceBusyError on sg lock, SIGINT double-press abort with BURN_ABORT_GRACE_S=5s) @@ -63,11 +63,11 @@ src/bd_archive/ │ └── lsof.py # find_device_holders (optional — no-op if lsof absent) ├── archive/ # domain logic over tools/ │ ├── checksums.py # sha512 verify (verify_slice per-file, used by extract on staging) -│ ├── config.py # ArchiveConfig, write_readme -│ ├── dar_archive.py # DarArchive (slices, catalog, work-dir layout) +│ ├── config.py # ArchiveConfig (incl. generation, dar_name), write_readme +│ ├── dar_archive.py # DarArchive (slices, catalog, work-dir layout) + parse_dar_filename (chain/gen detection from filename) │ ├── disc.py # DiscIO (mount/mount_with_retry/umount/eject/close_tray_if_open/burn) + find_sg_device │ ├── sizing.py # compute_slice_bytes, measure_compression_ratio -│ ├── source_scan.py # SourceScan + scan_source +│ ├── source_scan.py # SourceScan + scan_source; SourceFile + list_source_files (auto-defer pool); scan_delta_bytes (incremental preview) │ └── verify.py # verify_disc() └── commands/ # one file per subcommand ├── create.py @@ -82,10 +82,12 @@ Layering: `commands/` → `archive/` → `tools/` → `shell/`. Lower layers nev Four subcommands form a pipeline. `create` previews disc count + last-disc fill before prompting for confirmation, so users can dry-run sizing without committing. -1. **`create`** (`commands/create.py`) reads disc capacity via `tools.mediainfo.detect_disc_capacity` (or `args.bytes`), scans the source, and computes slice sizing plus a disc-count estimate (optionally measuring the compression ratio via `--sample`). 
The user confirms via `prompt_yn` before any heavy work begins (skip with `-y`). Then runs `tools.dar.create_sliced` with `--hash sha512 --min-digits 4 -Q` (plus `-z[:level] -am` when compression is enabled) to slice the source into per-disc-sized `.dar` files in `/tmp/`. par2 is generated **inline** via dar's `-E` hook (`bd_archive._par2_helper`) — the hook fires after each slice is fully written, so par2 reads the slice while it is still hot in the OS page cache, eliminating most SSD read traffic of the create phase. After dar completes, the catalog is isolated. For each slice in order: regenerate `README.txt` with the right disc number and call `tools.mkisofs.build` (mkisofs `-iso-level 3 -udf -V