From 3fca5ad6924829f56f15a491ed8e9058282e868c Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:47:02 +0900 Subject: [PATCH 1/9] feat(pipeline): add stage-level timings and a benchmark harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nothing in the pipeline measured where a run's time went: ProgressEvents carry no timestamps and the stage logs no durations, so optimization work had no baseline to argue against. This adds the measurement layer: - --timings: a StageTimingSink (composed in the CLI shell) prints a stable, machine-parseable per-stage breakdown to stderr when a run ends ("timing: = s (%)"), including the still-open stage on failure. - PipelineRunner logs each stage directory's byte total, making the intermediate I/O of every stage visible. - benchPipeline: a Gradle task driving the installDist launcher with --timings (PipelineBenchmark, test sources), measuring E2E wall, the per-stage medians, peak RSS via /proc VmHWM, and output size, over a -Pjobs sweep; writes pipeline/docs/perf-baseline.md. - createSampleScan: a deterministic synthetic 600-dpi A5 scan book (specks for despeckle, ±0.5° skew for deskew) so the benchmark needs no copyrighted input and stays comparable across machines. Baseline on the 200-page fixture (8 CPUs): conv 14.48s at -j8 — despeckle 68%, register 22.6%, extract 7.9%, spread 1.5% — and a 3.44x scale-up from -j1, recorded in pipeline/docs/perf-baseline.md. Co-Authored-By: Claude Fable 5 --- pipeline/README.md | 1 + pipeline/app/build.gradle.kts | 47 ++ .../p4suta/pipeline/cli/PipelineCommand.java | 32 +- .../p4suta/pipeline/cli/StageTimingSink.java | 93 ++++ .../pipeline/cli/StageTimingSinkTest.java | 75 +++ .../pipeline/tools/PipelineBenchmark.java | 498 ++++++++++++++++++ .../pipeline/tools/SampleScanGenerator.java | 135 +++++ .../pipeline/application/PipelineRunner.java | 35 +- pipeline/docs/perf-baseline.md | 35 ++ 9 files changed, 947 insertions(+), 4 deletions(-) create mode 100644 pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java create mode 100644 pipeline/docs/perf-baseline.md diff --git a/pipeline/README.md b/pipeline/README.md index b8d47ef..12d15ef 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -33,6 +33,7 @@ never stops the rest; existing outputs are skipped unless `--force`). | `--pdf-a` | off | emit PDF/A-2b conformance | | `--force` | off | overwrite an existing output (batch: regenerate, don't skip) | | `--progress-file ` | — | write machine-readable JSONL progress events (single input only) | +| `--timings` | off | print a per-stage wall-clock breakdown to stderr when each run ends | | `-i, --interactive` | off | guided mode: prompt for the input, options and output | | `-h, --help` | — | show help and exit | | `-V, --version` | — | print version and exit | diff --git a/pipeline/app/build.gradle.kts b/pipeline/app/build.gradle.kts index 9764cfe..5c2572d 100644 --- a/pipeline/app/build.gradle.kts +++ b/pipeline/app/build.gradle.kts @@ -34,6 +34,10 @@ dependencies { implementation(libs.commons.cli) implementation(libs.slf4j.api) runtimeOnly(libs.slf4j.simple) + + // The benchmark fixture generator (test sources, never shipped — mirroring register's + // createSamplePdf) draws synthetic scan pages with PDFBox directly. + testImplementation(libs.pdfbox) } // The one place native access is granted to the launched app; run, test and JavaExec inherit it. @@ -85,3 +89,46 @@ selfContainedApp { // jbig2 (its register stage writes TIFF-G4; the spread pack embeds CCITT G4). bundleQpdf(this, libs.versions.qpdf.get()) } + +// ---- Stage-level benchmark (see pipeline/docs/perf-baseline.md) --------------------------------- + +// Deterministic synthetic scan book for the benchmark: an existing output is reused, so the +// generation cost (a minute at 200 pages × 600 dpi) is paid once. Knob: -Ppages=N (default 200). +tasks.register("createSampleScan") { + group = "verification" + description = "Generate the synthetic bitonal scan book the benchmark converts (cached)" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.SampleScanGenerator" + val pages = providers.gradleProperty("pages").getOrElse("200") + args = listOf("build/test-data/sample-scan-${pages}p.pdf", pages, "600") +} + +// Stage-level runtime + memory benchmark (the pdfbook counterpart of tate's benchRuntime): runs the +// installDist launcher in-container with --timings, parses the per-stage breakdown, samples peak +// RSS from /proc, and writes pipeline/docs/perf-baseline.md. Knobs: -Pruns=N (warm runs, default +// 3), -Pjobs=1,8 (comma-separated -j sweep; default auto = the launcher's CPU-count default), +// -Ppages=N (fixture size, default 200), -Pinputs="a.pdf b.pdf" (real books instead of the +// fixture; resolved against the repo root). +tasks.register("benchPipeline") { + group = "verification" + description = "Benchmark pdfbook stage timings + peak memory; writes pipeline/docs/perf-baseline.md" + dependsOn(tasks.named("installDist"), tasks.named("createSampleScan")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.PipelineBenchmark" + workingDir = rootDir + val runs = providers.gradleProperty("runs").getOrElse("3") + val jobs = providers.gradleProperty("jobs").getOrElse("auto") + val pages = providers.gradleProperty("pages").getOrElse("200") + val extraInputs = + providers + .gradleProperty("inputs") + .orNull + ?.split(Regex("\\s+")) + ?.filter { it.isNotBlank() } + ?: emptyList() + val launcher = "pipeline/app/build/install/pdfbook/bin/pdfbook" + val inputs = + extraInputs.ifEmpty { listOf("pipeline/app/build/test-data/sample-scan-${pages}p.pdf") } + args = listOf(launcher, "qpdf", "pipeline/docs/perf-baseline.md", runs, jobs) + inputs +} diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index 99e453c..f29449d 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -153,6 +153,13 @@ private static Options buildOptions() { "Write machine-readable JSONL progress events to this file (single" + " input only); used by front ends to report progress.") .get()); + options.addOption( + Option.builder() + .longOpt("timings") + .desc( + "Print a per-stage wall-clock breakdown to stderr when each run" + + " ends.") + .get()); CliDocs.options(options); return options; } @@ -351,6 +358,7 @@ record Plan(Path input, Path output, Config config) {} deskew, scale, pdfA, + false, force); return new Plan(input, output, config); } @@ -380,14 +388,32 @@ private static String defaultOutput(Path input) { private static void runOne(Path input, Path output, Config config, @Nullable Path progressFile) throws IOException { if (progressFile == null) { - runWith(input, output, config, ProgressSink.NO_OP); + runWith(input, output, config, withTimings(config, ProgressSink.NO_OP)); } else { try (JsonlFileProgressSink progress = new JsonlFileProgressSink(progressFile)) { - runWith(input, output, config, progress); + runWith(input, output, config, withTimings(config, progress)); } } } + /** + * Wraps {@code sink} with a fresh {@link StageTimingSink} when {@code --timings} is set, so + * each run (every book of a batch separately) prints its own per-stage breakdown to stderr. + */ + private static ProgressSink withTimings(Config config, ProgressSink sink) { + if (!config.timings()) { + return sink; + } + StageTimingSink timings = new StageTimingSink(System.err); + if (sink == ProgressSink.NO_OP) { + return timings; + } + return event -> { + sink.emit(event); + timings.emit(event); + }; + } + // Resolves the progress sink first so the stages and sink report page-level PageProcessed // events into the same sink PipelineRunner reports stage boundaries into. With no // --progress-file the sink is NO_OP and every emit is a no-op. @@ -447,6 +473,7 @@ private static Config parseConfig(CommandLine cmd) throws ParseException { !cmd.hasOption("no-deskew"), !cmd.hasOption("no-scale"), cmd.hasOption("pdf-a"), + cmd.hasOption("timings"), cmd.hasOption("force")); } @@ -475,5 +502,6 @@ record Config( boolean deskew, boolean scale, boolean pdfA, + boolean timings, boolean force) {} } diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java new file mode 100644 index 0000000..496b0e4 --- /dev/null +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java @@ -0,0 +1,93 @@ +package io.github.p4suta.pipeline.cli; + +import io.github.p4suta.shared.progress.ProgressEvent; +import io.github.p4suta.shared.progress.ProgressSink; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.jspecify.annotations.Nullable; + +/** + * Measures each stage's wall clock from its {@link ProgressEvent.StageStarted}/{@link + * ProgressEvent.StageCompleted} boundaries and prints a per-stage breakdown when the run ends — the + * {@code --timings} flag's implementation. One line per stage in completion order, then the + * run-wide total: + * + *
{@code
+ * timing: extract = 4.21s (18.3%)
+ * timing: despeckle = 9.87s (42.9%)
+ * timing: total = 23.01s
+ * }
+ * + *

The {@code timing: = s} shape is a stable contract the {@code benchPipeline} + * harness parses; keep it machine-readable. A stage still open when the run fails is reported with + * its elapsed-so-far, so a failed run still shows where the time went. Thread-safe like every + * {@link ProgressSink}: events are handled under one lock. + */ +final class StageTimingSink implements ProgressSink { + + private final PrintStream out; + private final Object lock = new Object(); + private final List stages = new ArrayList<>(); + private final List stageNanos = new ArrayList<>(); + private @Nullable String openStage; + private long openedAtNanos; + private long runStartedAtNanos; + private boolean runStarted; + + StageTimingSink(PrintStream out) { + this.out = out; + } + + @Override + public void emit(ProgressEvent event) { + synchronized (lock) { + switch (event) { + case ProgressEvent.RunStarted ignored -> markRunStarted(); + case ProgressEvent.StageStarted s -> { + // Defensive: a sink wired mid-run still measures from the first boundary. + markRunStarted(); + openStage = s.stage(); + openedAtNanos = System.nanoTime(); + } + case ProgressEvent.StageCompleted ignored -> closeOpenStage(); + case ProgressEvent.PageProcessed ignored -> { + // Stage boundaries carry all the timing information. + } + case ProgressEvent.RunCompleted ignored -> report(); + case ProgressEvent.RunFailed ignored -> report(); + } + } + } + + private void markRunStarted() { + if (!runStarted) { + runStartedAtNanos = System.nanoTime(); + runStarted = true; + } + } + + private void closeOpenStage() { + @Nullable String stage = openStage; + if (stage != null) { + stages.add(stage); + stageNanos.add(System.nanoTime() - openedAtNanos); + openStage = null; + } + } + + private void report() { + closeOpenStage(); + long totalNanos = runStarted ? System.nanoTime() - runStartedAtNanos : 0; + for (int i = 0; i < stages.size(); i++) { + out.printf( + Locale.ROOT, + "timing: %s = %.2fs (%.1f%%)%n", + stages.get(i), + stageNanos.get(i) / 1e9, + totalNanos > 0 ? stageNanos.get(i) * 100.0 / totalNanos : 0.0); + } + out.printf(Locale.ROOT, "timing: total = %.2fs%n", totalNanos / 1e9); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java new file mode 100644 index 0000000..3fbb3f7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java @@ -0,0 +1,75 @@ +package io.github.p4suta.pipeline.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.progress.ProgressEvent; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import org.junit.jupiter.api.Test; + +/** + * Pins the {@code --timings} report: one machine-parseable {@code timing: = s} + * line per completed stage (in completion order, percentages attached) plus a {@code timing: total} + * line, printed only when the run ends — and on failure, the still-open stage is reported with its + * elapsed-so-far. The line shape is the contract the {@code benchPipeline} harness parses. + */ +final class StageTimingSinkTest { + + private final ByteArrayOutputStream buf = new ByteArrayOutputStream(); + private final StageTimingSink sink = + new StageTimingSink(new PrintStream(buf, true, StandardCharsets.UTF_8)); + + private String output() { + return buf.toString(StandardCharsets.UTF_8); + } + + @Test + void completedRunReportsEachStageInOrderAndATotal() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.PageProcessed("extract", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("spread", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("spread")); + sink.emit(new ProgressEvent.RunCompleted()); + + assertThat(output().lines()) + .hasSize(3) + .satisfiesExactly( + extract -> + assertThat(extract) + .matches( + "timing: extract = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + spread -> + assertThat(spread) + .matches( + "timing: spread = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + total -> assertThat(total).matches("timing: total = \\d+\\.\\d{2}s")); + } + + @Test + void nothingIsPrintedBeforeTheRunEnds() { + sink.emit(new ProgressEvent.RunStarted(1)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 1)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + + assertThat(output()).isEmpty(); + } + + @Test + void failedRunReportsTheStillOpenStage() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("register", 1, 2)); + sink.emit(new ProgressEvent.RunFailed("INTERNAL", "boom")); + + assertThat(output()) + .contains("timing: extract = ") + .contains("timing: register = ") + .contains("timing: total = "); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java new file mode 100644 index 0000000..1b59397 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java @@ -0,0 +1,498 @@ +package io.github.p4suta.pipeline.tools; + +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Stage-level runtime + memory benchmark for the installed {@code pdfbook} launcher — the pdfbook + * counterpart of tate's {@code RuntimeBenchmark}, with per-stage attribution as the addition. + * + *

Drives {@code pdfbook -o /out.pdf --force --timings [-j N]} as a child process, + * measuring end-to-end wall around the process ({@link System#nanoTime()}), peak RSS by sampling + * the child's {@code /proc//status} {@code VmHWM} (Linux-only; {@code n/a} elsewhere), and the + * per-stage wall by parsing the stable {@code timing: = s} lines {@code + * StageTimingSink} prints. Writes a Markdown report. + * + *

Test-sources tool (driven by the {@code benchPipeline} Gradle task): it never ships in the + * production launcher, and it expects the dev container's native toolchain (pdfimages, Leptonica, + * qpdf) on PATH — the installDist launcher, not the jpackage image, is what it measures. + * + *

Usage: {@code PipelineBenchmark + * ...} — {@code jobsCsv} is a comma-separated {@code -j} sweep ({@code auto} = omit + * {@code -j}, i.e. the launcher's CPU-count default). + */ +public final class PipelineBenchmark { + + private static final Pattern TIMING = + Pattern.compile("^timing: (\\S+) = ([0-9.]+)s", Pattern.MULTILINE); + private static final Pattern VM_HWM = Pattern.compile("VmHWM:\\s*([0-9]+)"); + private static final long POLL_MILLIS = 5; + private static final long PROCESS_TIMEOUT_NANOS = TimeUnit.MINUTES.toNanos(30); + private static final long MIB = 1024L * 1024L; + + private final Path launcher; + private final String qpdf; + private final Path outDoc; + private final int runs; + private final List jobsSweep; + + private PipelineBenchmark( + Path launcher, String qpdf, Path outDoc, int runs, List jobsSweep) { + this.launcher = launcher; + this.qpdf = qpdf; + this.outDoc = outDoc; + this.runs = runs; + this.jobsSweep = jobsSweep; + } + + public static void main(String[] args) throws IOException, InterruptedException { + if (args.length < 6) { + System.err.println( + "usage: PipelineBenchmark " + + " ..."); + System.exit(2); + return; + } + var benchmark = + new PipelineBenchmark( + Path.of(args[0]), + args[1], + Path.of(args[2]), + Integer.parseInt(args[3]), + Arrays.stream(args[4].split(",")).map(String::trim).toList()); + List inputs = Arrays.stream(args).skip(5).map(Path::of).toList(); + benchmark.run(inputs); + } + + // Result records + + /** One measured child run: wall seconds, peak RSS (KiB, -1 if unavailable), merged output. */ + private record Timed(double elapsedSeconds, long maxRssKib, String output) {} + + /** A finished input × jobs measurement, ready to render. */ + private record Row( + String name, + String jobs, + int pages, + long inputBytes, + double wallMedian, + double coldWall, + Map stageMedians, + long rssMedianKib, + long outputBytes) {} + + // Orchestration + + private void run(List inputs) throws IOException, InterruptedException { + requireExecutable(launcher, "pdfbook launcher", "build it first: just pdfbook-install"); + + List rows = new ArrayList<>(); + for (Path input : inputs) { + if (!Files.isRegularFile(input)) { + System.err.println("skip (not found): " + input); + continue; + } + for (String jobs : jobsSweep) { + rows.add(measure(input, jobs)); + } + } + + String report = render(rows); + Files.createDirectories(requireParent(outDoc)); + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } + + private Row measure(Path input, String jobs) throws IOException, InterruptedException { + int pages = pageCount(input); + long inputBytes = Files.size(input); + System.err.printf( + Locale.ROOT, + "Measuring: %s (%dp, %s MiB, jobs=%s)…%n", + fileName(input), + pages, + mib(inputBytes), + jobs); + + Path work = Files.createTempDirectory("pdfbook-bench"); + try { + Path out = work.resolve("out.pdf"); + List convert = new ArrayList<>(); + convert.add(launcher.toString()); + convert.add(input.toString()); + convert.add("-o"); + convert.add(out.toString()); + convert.add("--force"); + convert.add("--timings"); + if (!"auto".equals(jobs)) { + convert.add("-j"); + convert.add(jobs); + } + + // Cold run (fresh page cache for the input is not guaranteed, but a fresh JVM is) — + // recorded separately from the warm median. + Timed cold = timed(convert); + + double[] walls = new double[runs]; + long[] rsss = new long[runs]; + Map> stages = new LinkedHashMap<>(); + for (int r = 0; r < runs; r++) { + Timed t = timed(convert); + walls[r] = t.elapsedSeconds(); + rsss[r] = t.maxRssKib(); + parseTimings(t.output()) + .forEach( + (stage, seconds) -> + stages.computeIfAbsent(stage, ignored -> new ArrayList<>()) + .add(seconds)); + } + Map stageMedians = new LinkedHashMap<>(); + stages.forEach((stage, seconds) -> stageMedians.put(stage, median(seconds))); + + long outputBytes = Files.isRegularFile(out) ? Files.size(out) : -1; + return new Row( + fileName(input), + jobs, + pages, + inputBytes, + median(walls), + cold.elapsedSeconds(), + stageMedians, + medianLong(rsss), + outputBytes); + } finally { + deleteTree(work); + } + } + + /** + * The per-stage seconds of one run, keyed by stage label in print order ({@code total} + * included). Repeated labels (a batch run) sum, though the harness always converts one book. + */ + private static Map parseTimings(String output) { + Map timings = new LinkedHashMap<>(); + Matcher m = TIMING.matcher(output); + while (m.find()) { + timings.merge(m.group(1), Double.parseDouble(m.group(2)), Double::sum); + } + return timings; + } + + // Subprocess measurement + + /** + * Runs {@code command}, returning its wall time, peak RSS (sampled from {@code + * /proc//status} {@code VmHWM}; -1 where unavailable), and its merged stdout+stderr (so + * the {@code timing:} lines can be parsed). Output is drained on a separate thread so a chatty + * child cannot deadlock on a full pipe. + */ + private static Timed timed(List command) throws IOException, InterruptedException { + long start = System.nanoTime(); + Process process = new ProcessBuilder(command).redirectErrorStream(true).start(); + + var captured = new AtomicReference<>(""); + Thread drainer = + Thread.ofVirtual() + .start( + () -> { + try (var in = process.getInputStream()) { + captured.set( + new String( + in.readAllBytes(), StandardCharsets.UTF_8)); + } catch (IOException ignored) { + // Process gone; whatever was read is lost — acceptable for + // a benchmark. + } + }); + + Path status = Path.of("/proc", Long.toString(process.pid()), "status"); + long peakRssKib = -1; + while (process.isAlive()) { + if (System.nanoTime() - start > PROCESS_TIMEOUT_NANOS) { + process.destroyForcibly(); + throw new IOException("timed command did not finish: " + command); + } + peakRssKib = Math.max(peakRssKib, readVmHwmKib(status)); + Thread.sleep(POLL_MILLIS); + } + double elapsed = (System.nanoTime() - start) / 1.0e9; + int exit = process.waitFor(); + drainer.join(); + if (exit != 0) { + throw new IOException( + "benchmark child failed with exit " + exit + ": " + captured.get()); + } + return new Timed(elapsed, peakRssKib, captured.get()); + } + + /** Peak RSS (KiB) from {@code /proc//status}, or -1 if unreadable / non-Linux. */ + private static long readVmHwmKib(Path status) { + try { + Matcher m = VM_HWM.matcher(Files.readString(status, StandardCharsets.UTF_8)); + return m.find() ? Long.parseLong(m.group(1)) : -1; + } catch (IOException | RuntimeException e) { + return -1; // process already exited, or /proc not present + } + } + + /** Page count via {@code qpdf --show-npages} (PATH or absolute), or -1 when unavailable. */ + private int pageCount(Path pdf) throws InterruptedException { + try { + Process process = + new ProcessBuilder(qpdf, "--show-npages", pdf.toString()) + .redirectErrorStream(true) + .start(); + String output; + try (var in = process.getInputStream()) { + output = new String(in.readAllBytes(), StandardCharsets.UTF_8); + } + if (!process.waitFor(1, TimeUnit.MINUTES)) { + process.destroyForcibly(); + return -1; + } + // The count is the one digits-only line; qpdf may surround it with warning lines. + return output.lines() + .map(String::strip) + .filter(line -> line.matches("\\d+")) + .findFirst() + .map(Integer::parseInt) + .orElse(-1); + } catch (IOException e) { + return -1; // qpdf not installed — page count is cosmetic here + } + } + + // Numeric helpers + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + if (n == 0) { + return 0; + } + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + private static double median(List values) { + return median(values.stream().mapToDouble(Double::doubleValue).toArray()); + } + + private static long medianLong(long[] values) { + return Math.round(median(Arrays.stream(values).asDoubleStream().toArray())); + } + + private static String mib(long bytes) { + return bytes < 0 ? "n/a" : String.format(Locale.ROOT, "%.1f", bytes / (double) MIB); + } + + // Rendering + + private String render(List rows) { + // Stage columns: the union of stage labels across rows in first-appearance order, with + // "total" (the launcher's in-process conversion time) pulled out as its own column. + Set stageNames = new LinkedHashSet<>(); + for (Row row : rows) { + stageNames.addAll(row.stageMedians().keySet()); + } + stageNames.remove("total"); + + var sb = new StringBuilder(); + sb.append("# pdfbook runtime baseline (stage-level)\n\n") + .append("Generated by `PipelineBenchmark`") + .append(" (`./gradlew :pipeline:app:benchPipeline`, in the dev container).\n") + .append( + "Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak" + + " memory**\n") + .append( + "of the installDist `pdfbook` launcher. Re-run after any change to the" + + " pipeline\n") + .append( + "and compare against the previous run before merging (acceptance: ≥5%" + + " median\n") + .append( + "total-wall improvement, or an explicit RSS/disk win, with output" + + " validated).\n\n"); + appendHostInfo(sb); + sb.append("\n## Stage breakdown (warm median of ").append(runs).append(" runs)\n\n"); + sb.append( + "`conv` is the launcher's in-process total (`timing: total`);" + + " `startup+init` = E2E wall − conv\n") + .append( + "(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's" + + " CPU-count default.\n\n"); + sb.append("| Input | Jobs | Pages | E2E wall | conv |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append(" startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) |\n"); + sb.append("|---|---|---:|---:|---:|"); + sb.append("---:|".repeat(stageNames.size())); + sb.append("---:|---:|---:|---:|\n"); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ") + .append(row.name()) + .append(" | ") + .append(row.jobs()) + .append(" | ") + .append(pages(row.pages())) + .append(" | ") + .append(secs(row.wallMedian())) + .append(" | ") + .append(conv > 0 ? secs(conv) : "n/a") + .append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ').append(seconds == null ? "n/a" : secs(seconds)).append(" |"); + } + sb.append(' ') + .append(conv > 0 ? secs(Math.max(0, row.wallMedian() - conv)) : "n/a") + .append(" | ") + .append(secs(row.coldWall())) + .append(" | ") + .append(rssMib(row.rssMedianKib())) + .append(" | ") + .append(mib(row.outputBytes())) + .append(" |\n"); + } + sb.append("\n## Stage shares (of conv, warm median)\n\n") + .append( + "The shares that decide where optimization effort goes: a stage that is" + + " ~5% of conv\n") + .append("cannot pay for a parallelization rewrite no matter how elegant.\n\n"); + sb.append("| Input | Jobs |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append('\n').append("|---|---|").append("---:|".repeat(stageNames.size())).append('\n'); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ").append(row.name()).append(" | ").append(row.jobs()).append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ') + .append( + seconds == null || conv <= 0 + ? "n/a" + : String.format( + Locale.ROOT, "%.1f%%", seconds * 100.0 / conv)) + .append(" |"); + } + sb.append('\n'); + } + return sb.toString(); + } + + private void appendHostInfo(StringBuilder sb) { + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + long totalRamBytes = totalPhysicalMemoryBytes(); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.version", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs, RAM ") + .append(totalRamBytes > 0 ? Math.round(totalRamBytes / 1.073741824e9) + "Gi" : "?") + .append('\n'); + sb.append("- Launcher: `").append(launcher).append("`\n"); + sb.append("- Samples per measurement: cold (1st run) + warm median of ") + .append(runs) + .append(".\n"); + sb.append( + "- The default input is the deterministic synthetic fixture" + + " (`createSampleScan`,\n") + .append( + " seeded, so identical across machines). Real books are pluggable via" + + " `-Pinputs=\"…\"`;\n") + .append(" only their page count and byte size are reported.\n"); + } + + private static long totalPhysicalMemoryBytes() { + if (ManagementFactory.getOperatingSystemMXBean() + instanceof com.sun.management.OperatingSystemMXBean os) { + return os.getTotalMemorySize(); + } + return -1; + } + + private static String secs(double seconds) { + return String.format(Locale.ROOT, "%.2fs", seconds); + } + + private static String pages(int pages) { + return pages < 0 ? "?" : Integer.toString(pages); + } + + private static String rssMib(long rssKib) { + return rssKib < 0 ? "n/a" : Long.toString(Math.round(rssKib / 1024.0)); + } + + // Small utilities + + private static String fileName(Path path) { + Path name = path.getFileName(); + return name != null ? name.toString() : path.toString(); + } + + private static void requireExecutable(Path path, String what, String hint) { + if (!Files.isExecutable(path)) { + System.err.println("error: " + what + " not found at " + path); + System.err.println(" " + hint); + System.exit(1); + } + } + + private static Path requireParent(Path path) { + Path parent = path.getParent(); + if (parent == null) { + throw new IllegalArgumentException("output path has no parent: " + path); + } + return parent; + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java new file mode 100644 index 0000000..8b111a7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java @@ -0,0 +1,135 @@ +package io.github.p4suta.pipeline.tools; + +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Random; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +/** + * Generates the synthetic, copyright-free bitonal "scan" book the {@code benchPipeline} harness + * converts: A5 pages at the requested dpi carrying vertical text-like columns with per-page + * position jitter, a small per-page skew of up to ±0.5° (so the register stage's deskew has real + * work) and salt-and-pepper specks (so despeckle has real work), embedded as CCITT G4 so {@code + * pdfimages} extracts them exactly like a real scan. A fixed seed keeps every generation + * byte-identical, so benchmark runs stay comparable across machines and branches. + * + *

This deliberately lives in test sources (driven by the {@code createSampleScan} Gradle task), + * mirroring register's {@code SamplePdfGenerator}: the dev tool never ships in the production + * launcher. An existing output is reused, so repeated benchmark runs skip the generation cost. + * + *

Usage: {@code SampleScanGenerator [pages] [dpi]} + */ +public final class SampleScanGenerator { + + private SampleScanGenerator() {} + + /** {@code SampleScanGenerator [pages] [dpi]} — writes the synthetic scan book. */ + public static void main(String[] args) throws IOException { + Path out = Path.of(args.length > 0 ? args[0] : "sample-scan.pdf"); + int pages = args.length > 1 ? Integer.parseInt(args[1]) : 200; + int dpi = args.length > 2 ? Integer.parseInt(args[2]) : 600; + if (Files.isRegularFile(out)) { + System.out.println("reusing existing " + out + " (delete it to regenerate)"); + return; + } + long start = System.nanoTime(); + write(out, pages, dpi); + System.out.printf( + Locale.ROOT, + "wrote %s: %d page(s) at %d dpi, %.1f MiB in %.1fs%n", + out, + pages, + dpi, + Files.size(out) / (1024.0 * 1024.0), + (System.nanoTime() - start) / 1e9); + } + + /** Writes a {@code pages}-page synthetic bitonal scan book to {@code out} (A5 geometry). */ + public static void write(Path out, int pages, int dpi) throws IOException { + int width = Math.round(148f * dpi / 25.4f); // A5 portrait: 148 mm × 210 mm + int height = Math.round(210f * dpi / 25.4f); + Random random = new Random(42); + Path parent = out.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + try (PDDocument doc = new PDDocument()) { + float widthPt = width * 72f / dpi; + float heightPt = height * 72f / dpi; + for (int i = 0; i < pages; i++) { + PDImageXObject image = + CCITTFactory.createFromImage(doc, page(width, height, random)); + PDPage page = new PDPage(new PDRectangle(widthPt, heightPt)); + doc.addPage(page); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, widthPt, heightPt); + } + } + doc.save(out.toFile()); + } + } + + /** One page: slightly skewed text-like columns plus unrotated scanner-dust specks. */ + private static BufferedImage page(int width, int height, Random random) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + double skew = Math.toRadians(random.nextDouble() - 0.5); // ±0.5° + g.rotate(skew, width / 2.0, height / 2.0); + drawColumns(g, width, height, random); + g.rotate(-skew, width / 2.0, height / 2.0); + drawSpecks(g, width, height, random); + } finally { + g.dispose(); + } + return img; + } + + /** + * Vertical "text" columns right-to-left (Japanese book layout): stacked glyph-sized blocks with + * per-page jitter so registration has a real column position to detect and correct, and random + * early line breaks so the texture resembles prose rather than a solid block. + */ + private static void drawColumns(Graphics2D g, int width, int height, Random random) { + int margin = width / 10; + int glyph = Math.max(4, width / 60); + int leading = glyph / 2; + int jitterX = random.nextInt(glyph + 1) - glyph / 2; + int top = height / 12 + random.nextInt(glyph + 1); + int bottom = height - height / 12; + for (int x = width - margin - glyph + jitterX; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + // ~8% of glyph slots end the "sentence" early, leaving prose-like white runs. + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + g.fillRect(x, y, glyph - 2, glyph - 2); + y += glyph; + } + } + } + + /** Salt-and-pepper dust: ~1 speck of 1–3 px per 25k pixels, what despeckle exists to remove. */ + private static void drawSpecks(Graphics2D g, int width, int height, Random random) { + int specks = width * height / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + g.fillRect(random.nextInt(width - size), random.nextInt(height - size), size, size); + } + } +} diff --git a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java index 0810505..8ae99b8 100644 --- a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java +++ b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java @@ -70,7 +70,11 @@ public void run( progress.emit(new ProgressEvent.StageStarted(source.name(), position, total)); Corpus corpus = source.open(stageDir(work, 0, source.name())); - log.info("source: {} page(s) at {} dpi", corpus.pageCount(), corpus.dpi()); + log.info( + "source: {} page(s) at {} dpi, {}", + corpus.pageCount(), + corpus.dpi(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(source.name())); position++; @@ -78,7 +82,12 @@ public void run( for (Stage stage : stages) { progress.emit(new ProgressEvent.StageStarted(stage.name(), position, total)); corpus = stage.apply(corpus, stageDir(work, dirIndex, stage.name())); - log.info("stage {} ({}): {} page(s)", dirIndex, stage.name(), corpus.pageCount()); + log.info( + "stage {} ({}): {} page(s), {}", + dirIndex, + stage.name(), + corpus.pageCount(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(stage.name())); position++; dirIndex++; @@ -111,6 +120,28 @@ private static Path stageDir(Path work, int index, String name) throws IOExcepti work.resolve(String.format(Locale.ROOT, "%02d-%s", index, name))); } + /** + * The stage directory's total file bytes rendered as MiB — visibility into how much + * intermediate I/O each stage produces (best-effort: {@code ?} when the walk fails). + */ + private static String intermediatesSize(Path dir) { + try (Stream files = Files.walk(dir)) { + long bytes = + files.filter(Files::isRegularFile).mapToLong(PipelineRunner::sizeQuietly).sum(); + return String.format(Locale.ROOT, "%.1f MiB", bytes / (1024.0 * 1024.0)); + } catch (IOException e) { + return "? MiB"; + } + } + + private static long sizeQuietly(Path file) { + try { + return Files.size(file); + } catch (IOException e) { + return 0L; + } + } + private static void deleteRecursively(Path dir) { try (Stream walk = Files.walk(dir)) { walk.sorted(Comparator.reverseOrder()).forEach(PipelineRunner::deleteQuietly); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md new file mode 100644 index 0000000..9727575 --- /dev/null +++ b/pipeline/docs/perf-baseline.md @@ -0,0 +1,35 @@ +# pdfbook runtime baseline (stage-level) + +Generated by `PipelineBenchmark` (`./gradlew :pipeline:app:benchPipeline`, in the dev container). +Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak memory** +of the installDist `pdfbook` launcher. Re-run after any change to the pipeline +and compare against the previous run before merging (acceptance: ≥5% median +total-wall improvement, or an explicit RSS/disk win, with output validated). + +- Date (UTC): 2026-06-10 04:46:07 +- Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi +- Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` +- Samples per measurement: cold (1st run) + warm median of 3. +- The default input is the deterministic synthetic fixture (`createSampleScan`, + seeded, so identical across machines). Real books are pluggable via `-Pinputs="…"`; + only their page count and byte size are reported. + +## Stage breakdown (warm median of 3 runs) + +`conv` is the launcher's in-process total (`timing: total`); `startup+init` = E2E wall − conv +(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's CPU-count default. + +| Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | + +## Stage shares (of conv, warm median) + +The shares that decide where optimization effort goes: a stage that is ~5% of conv +cannot pay for a parallelization rewrite no matter how elegant. + +| Input | Jobs | extract | despeckle | register | spread | +|---|---|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | From 62c37bf10271edca106bb2e61bf997ecb0eb7fb8 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:12:25 +0900 Subject: [PATCH 2/9] perf(extract): remux all-CCITT sources instead of decoding, finer chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pdfimages -tiff decodes every embedded G4 image into an uncompressed TIFF (~2.2 MB per 600-dpi page; ~434 MB of transient intermediates for a 200-page book) even though the typical self-scanned source is CCITT G4 end to end. (The originally planned `-tiffcompression g4` flag does not exist on pdfimages — it is a pdftoppm option.) The extractor now picks its mode from one pdfimages -list pass: when every embedded image is 1-bpp CCITT, each chunk dumps the raw G4 streams (-ccitt) and CcittTiffs wraps them verbatim into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates drop ~60x, and the image's true ppi is stamped instead of pdfimages' default 72 dpi. Because PDF's EncodedByteAlign never reaches the dumped .params file, every wrapped page is decoded back once through Leptonica as verification; a chunk that deviates in any way (params shape, count, or a wrap that fails to decode) is re-extracted decoded, which is also the whole-run mode for any non-CCITT source. The photometric mapping (-B -> WhiteIsZero, -W -> BlackIsZero) is pinned empirically by a pixel-identical round trip test. Extraction chunks also shrink from total/jobs to ~12 pages (capped at 4*jobs): fast finishers free their pool slot early, and a future streaming source can consume pages chunk by chunk. Benchmark (200-page fixture, warm median of 3, vs the PR #28 baseline): extract 1.15s -> 0.46s at -j8 (4.57s -> 0.88s at -j1), conv 49.85s -> 45.98s (-7.8%) at -j1, intermediates ~434 MB -> ~7 MB. Output validated with qpdf --check (100 spreads, linearized, no errors). Co-Authored-By: Claude Fable 5 --- .../p4suta/pipeline/cli/PipelineCommand.java | 8 +- pipeline/docs/perf-baseline.md | 10 +- .../infrastructure/G4EncodeStage.java | 13 +- .../github/p4suta/shared/pdf/CcittTiffs.java | 159 +++++++++++++++ .../shared/pdf/PdfImagesCliExtractor.java | 190 ++++++++++++++++-- .../p4suta/shared/pdf/PdfListingParser.java | 68 +++++-- .../p4suta/shared/pdf/CcittTiffsTest.java | 160 +++++++++++++++ .../shared/pdf/PdfImagesCliExtractorTest.java | 63 ++++++ .../shared/pdf/PdfListingParserTest.java | 23 +++ 9 files changed, 650 insertions(+), 44 deletions(-) create mode 100644 shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java create mode 100644 shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index f29449d..25f8125 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -427,9 +427,11 @@ private static void runWith(Path input, Path output, Config config, ProgressSink stages.add(new RegisterStage(config.jobs(), config.deskew(), config.scale(), progress)); } if (stages.isEmpty()) { - // --no-despeckle --no-register: the raw pdfimages TIFFs are not CCITT G4, which the - // spread sink's pass-through embedding requires; despeckle/register each re-encode G4 - // themselves, so only the no-stage path needs this normalization. + // --no-despeckle --no-register: a non-CCITT source extracts as decoded TIFFs that are + // not the single-strip CCITT G4 the spread sink's pass-through embedding requires; + // despeckle/register each re-encode G4 themselves, so only the no-stage path needs + // this normalization (an all-CCITT source arrives already G4 — then this is a cheap + // lossless re-encode that keeps the path uniform). stages.add(new G4EncodeStage(config.jobs(), progress)); } Source source = new PdfExtractSource(input, config.jobs()); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md index 9727575..06ea504 100644 --- a/pipeline/docs/perf-baseline.md +++ b/pipeline/docs/perf-baseline.md @@ -6,7 +6,7 @@ of the installDist `pdfbook` launcher. Re-run after any change to the pipeline and compare against the previous run before merging (acceptance: ≥5% median total-wall improvement, or an explicit RSS/disk win, with output validated). -- Date (UTC): 2026-06-10 04:46:07 +- Date (UTC): 2026-06-10 05:11:24 - Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi - Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` - Samples per measurement: cold (1st run) + warm median of 3. @@ -21,8 +21,8 @@ total-wall improvement, or an explicit RSS/disk win, with output validated). | Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | |---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | -| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | +| sample-scan-200p.pdf | 1 | 200 | 46.18s | 45.98s | 0.88s | 31.95s | 12.94s | 0.20s | 0.20s | 47.85s | 173 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.43s | 14.23s | 0.46s | 10.19s | 3.28s | 0.20s | 0.20s | 14.16s | 322 | 6.4 | ## Stage shares (of conv, warm median) @@ -31,5 +31,5 @@ cannot pay for a parallelization rewrite no matter how elegant. | Input | Jobs | extract | despeckle | register | spread | |---|---|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | -| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | +| sample-scan-200p.pdf | 1 | 1.9% | 69.5% | 28.1% | 0.4% | +| sample-scan-200p.pdf | 8 | 3.2% | 71.6% | 23.0% | 1.4% | diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java index bcd1008..bb3ad78 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java @@ -18,11 +18,14 @@ /** * The G4-normalization {@link Stage}: re-encodes each extracted page as single-strip CCITT G4 TIFF - * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. {@code - * pdfimages -tiff} writes poppler's default (non-G4) TIFF compression, so the raw extract output - * cannot be embedded directly; despeckle and register each re-encode their output as G4 themselves, - * so the composition root inserts this stage only when neither of them runs. The corpus dpi is - * stamped on every page, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. The + * extractor's decoded mode ({@code pdfimages -tiff}, used for any source that is not all-CCITT) + * writes poppler's default (non-G4) TIFF compression at a default 72 dpi, so that output cannot be + * embedded directly; despeckle and register each re-encode their output as G4 themselves, so the + * composition root inserts this stage only when neither of them runs. The corpus dpi is stamped on + * every page. (For an all-CCITT source the extractor's remux already produces stamped single-strip + * G4 — this stage then re-encodes losslessly, a small constant cost that keeps the no-stage path + * uniform.) */ public final class G4EncodeStage implements Stage { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java new file mode 100644 index 0000000..1bfd7c3 --- /dev/null +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java @@ -0,0 +1,159 @@ +package io.github.p4suta.shared.pdf; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import org.jspecify.annotations.Nullable; + +/** + * Wraps the raw CCITT G4 stream {@code pdfimages -ccitt} dumps into a single-strip TIFF — the + * pass-through half of the extractor's remux mode: the scan's embedded G4 bytes become a readable + * TIFF without ever being decoded and re-encoded. + * + *

poppler writes a fax2tiff-style {@code .params} file beside each dump ({@code -4} G4 / {@code + * -1 -2} G3, {@code -A} EOL markers / {@code -P} none, {@code -X }, {@code -W} BlackIs1 / + * {@code -B} not, {@code -M} MSB-first). Only the plain shape TIFF's T.6 compression can represent + * verbatim is {@linkplain #supported supported}: G4, no EOL markers, MSB-first. Crucially, PDF's + * {@code EncodedByteAlign} never reaches the params file, so a wrapped stream is only trusted after + * the caller decodes it back successfully (see the extractor's read-back verification). + */ +final class CcittTiffs { + + private static final short TYPE_SHORT = 3; + private static final short TYPE_LONG = 4; + private static final short TYPE_RATIONAL = 5; + + private CcittTiffs() {} + + /** + * The decode parameters poppler records beside a {@code .ccitt} dump. + * + * @param kind the coding scheme flag: {@code -4} (G4), {@code -2} (G3 2D) or {@code -1} (G3 1D) + * @param endOfLine whether rows are prefixed with EOL markers ({@code -A}) + * @param columns the row width in pixels ({@code -X}) + * @param blackIs1 whether decoded 1-bits are black ({@code -W}) or 0-bits are ({@code -B}) + */ + record Params(String kind, boolean endOfLine, int columns, boolean blackIs1) {} + + /** Parse a {@code .params} file's text, or {@code null} when any token is unrecognized. */ + static @Nullable Params parseParams(String text) { + @Nullable String kind = null; + @Nullable Boolean endOfLine = null; + @Nullable Integer columns = null; + @Nullable Boolean blackIs1 = null; + boolean msbFirst = false; + String[] tokens = text.trim().split("\\s+", -1); + for (int i = 0; i < tokens.length; i++) { + switch (tokens[i]) { + case "-4", "-2", "-1" -> kind = tokens[i]; + case "-A" -> endOfLine = true; + case "-P" -> endOfLine = false; + case "-W" -> blackIs1 = true; + case "-B" -> blackIs1 = false; + case "-M" -> msbFirst = true; + case "-X" -> { + i++; + if (i >= tokens.length) { + return null; + } + try { + columns = Integer.parseInt(tokens[i]); + } catch (NumberFormatException e) { + return null; + } + } + default -> { + return null; + } + } + } + if (kind == null || endOfLine == null || columns == null || blackIs1 == null || !msbFirst) { + return null; + } + return new Params(kind, endOfLine, columns, blackIs1); + } + + /** + * Whether {@code params} describes a stream TIFF T.6 represents verbatim: Group 4, no EOL + * markers, and a width agreeing with the listing row the dump corresponds to. + */ + static boolean supported(Params params, int expectedWidth) { + return "-4".equals(params.kind()) + && !params.endOfLine() + && params.columns() == expectedWidth; + } + + /** + * Write {@code g4} as a little-endian, single-strip CCITT-G4 TIFF — header, the verbatim stream + * as the one strip, then the IFD. + * + * @param out the TIFF to write + * @param g4 the raw G4 (T.6) stream, verbatim + * @param width the row width in pixels + * @param height the row count + * @param blackIs1 the params' photometric hint: decoded 1-bits are black ({@code -W}) + * @param dpi the resolution to stamp, or {@code <= 0} to omit the resolution tags + */ + static void writeSingleStripG4( + Path out, byte[] g4, int width, int height, boolean blackIs1, int dpi) + throws IOException { + boolean withResolution = dpi > 0; + int entryCount = withResolution ? 14 : 11; + int stripOffset = 8; + int padding = g4.length % 2; // IFD offsets must be word-aligned + int ifdOffset = stripOffset + g4.length + padding; + int rationalOffset = ifdOffset + 2 + entryCount * 12 + 4; + ByteBuffer buf = + ByteBuffer.allocate(rationalOffset + (withResolution ? 16 : 0)) + .order(ByteOrder.LITTLE_ENDIAN); + + buf.put((byte) 'I').put((byte) 'I').putShort((short) 42).putInt(ifdOffset); + buf.put(g4); + if (padding == 1) { + buf.put((byte) 0); + } + + buf.putShort((short) entryCount); // entries below stay sorted by tag id + entry(buf, 256, TYPE_LONG, width); // ImageWidth + entry(buf, 257, TYPE_LONG, height); // ImageLength + entryShort(buf, 258, 1); // BitsPerSample + entryShort(buf, 259, 4); // Compression: CCITT T.6 (Group 4) + // The G4 stream encodes white/black runs; this tag tells readers which sense to + // materialize them in. The PDF default (-B, BlackIs1=false) is the standard fax sense — + // TIFF WhiteIsZero (0); -W (BlackIs1=true) is the inverted sense, BlackIsZero (1). + // Pinned empirically by CcittTiffsTest's pixel-identical round trip. + entryShort(buf, 262, blackIs1 ? 1 : 0); // PhotometricInterpretation + entryShort(buf, 266, 1); // FillOrder: MSB first (params -M) + entry(buf, 273, TYPE_LONG, stripOffset); // StripOffsets + entryShort(buf, 277, 1); // SamplesPerPixel + entry(buf, 278, TYPE_LONG, height); // RowsPerStrip: the single strip + entry(buf, 279, TYPE_LONG, g4.length); // StripByteCounts + if (withResolution) { + entry(buf, 282, TYPE_RATIONAL, rationalOffset); // XResolution + entry(buf, 283, TYPE_RATIONAL, rationalOffset + 8); // YResolution + } + entry(buf, 293, TYPE_LONG, 0); // T6Options: none + if (withResolution) { + entryShort(buf, 296, 2); // ResolutionUnit: inch + } + buf.putInt(0); // no next IFD + + if (withResolution) { + buf.putInt(dpi).putInt(1).putInt(dpi).putInt(1); + } + Files.write(out, buf.array()); + } + + /** One IFD entry holding an inline LONG (or a RATIONAL's value offset). */ + private static void entry(ByteBuffer buf, int tag, short type, int value) { + buf.putShort((short) tag).putShort(type).putInt(1).putInt(value); + } + + /** One IFD entry holding an inline SHORT (left-justified in the 4-byte value field). */ + private static void entryShort(ByteBuffer buf, int tag, int value) { + buf.putShort((short) tag).putShort(TYPE_SHORT).putInt(1); + buf.putShort((short) value).putShort((short) 0); + } +} diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 35c8b77..2c4bfac 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -1,24 +1,32 @@ package io.github.p4suta.shared.pdf; +import io.github.p4suta.shared.imaging.Pix; import io.github.p4suta.shared.process.ProcessRunner; import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeoutException; +import java.util.stream.Stream; +import org.jspecify.annotations.Nullable; /** * Extracts a PDF's embedded bitonal images as TIFFs by driving {@code pdfimages}. The page range is * split across the worker pool (one {@code pdfimages -f/-l} per chunk) with distinct zero-padded - * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. The - * dominant scan DPI is read from {@code pdfimages -list} and passed to the clean step as an - * explicit DPI, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. An + * all-CCITT source is remuxed — the raw embedded G4 streams pass through into single-strip TIFFs + * with their true ppi stamped (see {@link #extract}); any other source is extracted decoded, where + * {@code pdfimages} tags the TIFFs at a default 72 dpi, so the dominant scan DPI from {@code + * pdfimages -list} is passed downstream explicitly either way. * *

The textual {@code pdfinfo}/{@code pdfimages -list} reports are parsed by the pure {@link * PdfListingParser}; this adapter only drives the external processes via {@link @@ -93,13 +101,33 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks of about {@link #CHUNK_PAGES} pages on {@code pool} (at most {@code 4 * jobs} chunks). + * + *

One {@code pdfimages -list} pass picks the mode: when every embedded image is 1-bpp CCITT + * (the usual self-scanned book), each chunk dumps the raw G4 streams ({@code -ccitt}) and wraps + * them into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates tens + * of KB per page instead of the decoded megabytes, and the image's true ppi stamped instead of + * {@code pdfimages}' default 72 dpi. Every wrapped page is decoded back once as verification + * (PDF's {@code EncodedByteAlign} never reaches the dumped params, so trust requires a decode); + * a chunk whose dump or wrap deviates in any way is re-extracted decoded ({@code -tiff}), which + * is also the whole-run mode for any other source. */ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { int total = pageCount(pdf); - int chunks = Math.max(1, Math.min(jobs, total)); - int per = (total + chunks - 1) / chunks; String pdfimages = resolve("pdfimages", pdfimagesPropertyKey); + List rows = + PdfListingParser.parseImageRows( + capture(List.of(pdfimages, "-list", pdf.toString()), INFO_TIMEOUT)); + boolean rawCcitt = + !rows.isEmpty() + && rows.stream().allMatch(r -> r.bpc() == 1 && "ccitt".equals(r.enc())); + + // Chunks of ~CHUNK_PAGES rather than total/jobs: fast finishers free their pool slot early + // (the straggler tail shrinks from total/jobs to ~CHUNK_PAGES pages), and a streaming + // consumer can take pages chunk by chunk. Capped so a small book is not all process spawns. + int chunkCap = (int) Math.min(4L * jobs, total); + int chunks = Math.clamp(Math.ceilDiv(total, CHUNK_PAGES), 1, Math.max(1, chunkCap)); + int per = Math.ceilDiv(total, chunks); List> tasks = new ArrayList<>(); int chunk = 0; for (int first = 1; first <= total; first += per) { @@ -108,18 +136,11 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw outDir.resolve(String.format(Locale.ROOT, "page-c%03d-", chunk)).toString(); int from = first; int to = last; + List chunkRows = + rawCcitt ? rowsInRange(rows, from, to) : List.of(); tasks.add( () -> { - runDiscarding( - List.of( - pdfimages, - "-tiff", - "-f", - Integer.toString(from), - "-l", - Integer.toString(to), - pdf.toString(), - prefix)); + extractChunk(pdfimages, pdf, from, to, prefix, chunkRows); return null; }); chunk++; @@ -127,6 +148,143 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw Tasks.awaitAll(pool, tasks, "pdfimages extract interrupted", "pdfimages extract failed"); } + /** Pages per extraction chunk; see {@link #extract}. */ + private static final int CHUNK_PAGES = 12; + + /** The listing rows for pages {@code from..to}, in listing (= dump) order. */ + private static List rowsInRange( + List rows, int from, int to) { + return rows.stream().filter(r -> r.page() >= from && r.page() <= to).toList(); + } + + /** + * Extract one page-range chunk: raw-CCITT remux when {@code ccittRows} describes it, decoded + * {@code -tiff} otherwise — and the {@code -tiff} rerun as the fallback when the dump deviates + * from the listing in any way (count, params shape, or a wrap that does not decode back). + */ + private void extractChunk( + String pdfimages, + Path pdf, + int from, + int to, + String prefix, + List ccittRows) + throws IOException { + if (ccittRows.isEmpty()) { + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + return; + } + runDiscarding(extractCommand(pdfimages, "-ccitt", from, to, pdf, prefix)); + if (!wrapChunk(prefix, ccittRows)) { + deleteByPrefix(prefix); + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + } + } + + private static List extractCommand( + String pdfimages, String format, int from, int to, Path pdf, String prefix) { + return List.of( + pdfimages, + format, + "-f", + Integer.toString(from), + "-l", + Integer.toString(to), + pdf.toString(), + prefix); + } + + /** + * Wrap every {@code .ccitt} dump under {@code prefix} into a single-strip G4 TIFF, verifying + * each by decoding it back. Returns {@code false} (without cleaning up) on any deviation; the + * caller then discards the chunk's artifacts and falls back to a decoded extract. + */ + private static boolean wrapChunk(String prefix, List rows) + throws IOException { + List dumps = filesByPrefix(prefix, ".ccitt"); + if (dumps.size() != rows.size()) { + return false; + } + for (int i = 0; i < dumps.size(); i++) { + Path ccitt = dumps.get(i); + PdfListingParser.ImageRow row = rows.get(i); + Path paramsFile = withExtension(ccitt, ".params"); + if (!Files.isRegularFile(paramsFile)) { + return false; + } + CcittTiffs.@Nullable Params params = + CcittTiffs.parseParams(Files.readString(paramsFile, StandardCharsets.UTF_8)); + if (params == null || !CcittTiffs.supported(params, row.width())) { + return false; + } + Path out = withExtension(ccitt, ".tif"); + CcittTiffs.writeSingleStripG4( + out, + Files.readAllBytes(ccitt), + row.width(), + row.height(), + params.blackIs1(), + Math.max(row.xPpi(), 0)); + if (!decodesBack(out, row)) { + return false; + } + Files.delete(ccitt); + Files.delete(paramsFile); + } + return true; + } + + /** + * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that + * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the + * params file, fails or mis-sizes here). + */ + private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { + try (Pix pix = Pix.read(tif)) { + return pix.width() == row.width() && pix.height() == row.height(); + } catch (IllegalStateException e) { + return false; + } + } + + /** The files starting with {@code prefix}'s file name and ending in {@code suffix}, sorted. */ + private static List filesByPrefix(String prefix, String suffix) throws IOException { + Path prefixPath = Path.of(prefix); + Path dir = prefixPath.getParent(); + String name = String.valueOf(prefixPath.getFileName()); + if (dir == null) { + throw new IOException("extract prefix has no parent directory: " + prefix); + } + try (Stream entries = Files.list(dir)) { + return entries.filter( + p -> { + String fileName = String.valueOf(p.getFileName()); + return fileName.startsWith(name) && fileName.endsWith(suffix); + }) + .sorted(Comparator.comparing(p -> String.valueOf(p.getFileName()))) + .toList(); + } + } + + /** + * Delete every artifact of one chunk ({@code .ccitt}, {@code .params}, partial {@code .tif}). + */ + private static void deleteByPrefix(String prefix) throws IOException { + for (String suffix : List.of(".ccitt", ".params", ".tif")) { + for (Path file : filesByPrefix(prefix, suffix)) { + Files.deleteIfExists(file); + } + } + } + + /** A sibling of {@code file} with its extension replaced by {@code extension}. */ + private static Path withExtension(Path file, String extension) { + String name = String.valueOf(file.getFileName()); + int dot = name.lastIndexOf('.'); + String base = dot < 0 ? name : name.substring(0, dot); + return file.resolveSibling(base + extension); + } + /** Run an extraction command, discarding its (file-producing) output. */ private static void runDiscarding(List command) throws IOException { try { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java index efe12aa..0a2bb1b 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java @@ -1,6 +1,8 @@ package io.github.p4suta.shared.pdf; +import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; /** @@ -36,27 +38,17 @@ public static int parsePageCount(String pdfinfoOutput) { } /** - * The most common rounded x-ppi (column 13, 0-based 12) across the {@code image} rows of a - * {@code pdfimages -list} report, skipping the two header rows. Ties resolve to the first value - * seen and a non-positive winner falls back to {@link #DEFAULT_DPI}. + * The most common rounded x-ppi across the {@code image} rows of a {@code pdfimages -list} + * report. Ties resolve to the first value seen and a non-positive winner falls back to {@link + * #DEFAULT_DPI}. * * @param listOutput the full text {@code pdfimages -list} printed * @return the dominant rounded x-ppi, or {@link #DEFAULT_DPI} when none is usable */ public static int parseDominantDpi(String listOutput) { - String[] lines = listOutput.split("\n", -1); Map counts = new LinkedHashMap<>(); - for (int i = 2; i < lines.length; i++) { - String[] fields = lines[i].trim().split("\\s+", -1); - if (fields.length < 13 || !"image".equals(fields[2])) { - continue; - } - try { - int ppi = (int) Math.round(Double.parseDouble(fields[12])); - counts.merge(ppi, 1, Integer::sum); - } catch (NumberFormatException ignored) { - // Non-numeric x-ppi cell: skip this row. - } + for (ImageRow row : parseImageRows(listOutput)) { + counts.merge(row.xPpi(), 1, Integer::sum); } if (counts.isEmpty()) { return DEFAULT_DPI; @@ -71,4 +63,50 @@ public static int parseDominantDpi(String listOutput) { } return best > 0 ? best : DEFAULT_DPI; } + + /** + * One {@code image} row of a {@code pdfimages -list} report — the columns the extractor needs + * to pick its mode and to wrap raw CCITT dumps. + * + * @param page the 1-based page the image sits on + * @param width the image width in pixels + * @param height the image height in pixels + * @param bpc bits per component ({@code 1} for bitonal) + * @param enc the embedded encoding token ({@code ccitt}, {@code jbig2}, {@code jpeg}, {@code + * image}, …) + * @param xPpi the rounded x-ppi the image is placed at (0 when the cell is unusable) + */ + public record ImageRow(int page, int width, int height, int bpc, String enc, int xPpi) {} + + /** + * Parse the {@code image} rows of a {@code pdfimages -list} report, in listing order (the same + * order {@code pdfimages} dumps the images in), skipping the two header rows and any row with + * unparsable numeric cells. + * + * @param listOutput the full text {@code pdfimages -list} printed + * @return the parsed rows, possibly empty + */ + public static List parseImageRows(String listOutput) { + String[] lines = listOutput.split("\n", -1); + List rows = new ArrayList<>(); + for (int i = 2; i < lines.length; i++) { + String[] fields = lines[i].trim().split("\\s+", -1); + if (fields.length < 13 || !"image".equals(fields[2])) { + continue; + } + try { + rows.add( + new ImageRow( + Integer.parseInt(fields[0]), + Integer.parseInt(fields[3]), + Integer.parseInt(fields[4]), + Integer.parseInt(fields[7]), + fields[8], + (int) Math.round(Double.parseDouble(fields[12])))); + } catch (NumberFormatException ignored) { + // A non-numeric cell: skip this row. + } + } + return rows; + } } diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java new file mode 100644 index 0000000..63e93d6 --- /dev/null +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java @@ -0,0 +1,160 @@ +package io.github.p4suta.shared.pdf; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.imaging.Pix; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import javax.imageio.ImageIO; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * The CCITT remux building blocks: the fax2tiff-style params parser (pure) and the single-strip G4 + * TIFF writer, verified by wrapping a real G4 stream (PDFBox's CCITT encoder, the same encoding a + * scanner PDF embeds) and decoding it back through Leptonica pixel-for-pixel. + */ +final class CcittTiffsTest { + + // ---- params parsing ---- + + @Test + void parsesTheUsualScannerShape() { + CcittTiffs.Params params = parsed("-4 -P -X 3496 -B -M\n"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", false, 3496, false)); + assertThat(CcittTiffs.supported(params, 3496)).isTrue(); + } + + @Test + void eolMarkersAreUnsupported() { + CcittTiffs.Params params = parsed("-4 -A -X 100 -W -M"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", true, 100, true)); + // EOL markers are not representable in TIFF T.6. + assertThat(CcittTiffs.supported(params, 100)).isFalse(); + } + + @Test + void group3IsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-2 -P -X 100 -B -M"), 100)).isFalse(); + } + + @Test + void widthMismatchIsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-4 -P -X 100 -B -M"), 200)).isFalse(); + } + + /** Parse params the test asserts are well-formed, made non-null for NullAway. */ + private static CcittTiffs.Params parsed(String text) { + return java.util.Objects.requireNonNull(CcittTiffs.parseParams(text)); + } + + @Test + void unknownTokensAndMissingFlagsAreUnparsable() { + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B -M -Z")).isNull(); // unknown flag + assertThat(CcittTiffs.parseParams("-4 -P -B -M")).isNull(); // no -X + assertThat(CcittTiffs.parseParams("-4 -P -X nope -B -M")).isNull(); // bad width + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B")).isNull(); // no -M + assertThat(CcittTiffs.parseParams("")).isNull(); + } + + // ---- TIFF wrapping ---- + + /** + * Round trip: draw a known bitonal pattern, encode it to a raw G4 stream with PDFBox's CCITT + * encoder (the very encoding a scanner PDF embeds and {@code pdfimages -ccitt} dumps), wrap the + * stream with {@link CcittTiffs#writeSingleStripG4}, and assert it decodes back through + * Leptonica pixel-identical to the original, with the stamped resolution. + */ + @Test + void wrappedStreamDecodesBackPixelIdentical(@TempDir Path tmp) throws Exception { + int width = 200; + int height = 150; + BufferedImage img = pattern(width, height); + G4Stream g4 = encodeG4(img); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 450); + + Path referencePng = tmp.resolve("reference.png"); + ImageIO.write(img, "png", referencePng.toFile()); + try (Pix expected = Pix.read(referencePng); + Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.height()).isEqualTo(height); + assertThat(actual.resolution()).isEqualTo(450); + assertThat(actual.blackPixels()).isPositive(); + assertThat(actual.pixelsEqual(expected)).isTrue(); + } + } + + @Test + void omitsResolutionTagsWhenDpiUnknown(@TempDir Path tmp) throws Exception { + int width = 64; + int height = 48; + G4Stream g4 = encodeG4(pattern(width, height)); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 0); + + try (Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.resolution()).isZero(); + } + } + + /** A raw G4 (T.6) stream and the {@code BlackIs1} convention its encoder declared. */ + private static final class G4Stream { + final byte[] bytes; + final boolean blackIs1; + + G4Stream(byte[] bytes, boolean blackIs1) { + this.bytes = bytes; + this.blackIs1 = blackIs1; + } + } + + /** + * The raw CCITT G4 stream PDFBox's {@link CCITTFactory} encodes {@code img} to (lifted verbatim + * from the image XObject, exactly the bytes {@code pdfimages -ccitt} would dump), along with + * the {@code BlackIs1} decode parameter it declared. + */ + private static G4Stream encodeG4(BufferedImage img) throws IOException { + try (PDDocument doc = new PDDocument()) { + PDImageXObject image = CCITTFactory.createFromImage(doc, img); + COSDictionary decodeParms = + (COSDictionary) image.getCOSObject().getDictionaryObject(COSName.DECODE_PARMS); + boolean blackIs1 = + decodeParms != null && decodeParms.getBoolean(COSName.BLACK_IS_1, false); + try (InputStream in = image.getCOSObject().createRawInputStream()) { + return new G4Stream(in.readAllBytes(), blackIs1); + } + } + } + + /** A deterministic bitonal pattern with structure (bars + a block). */ + private static BufferedImage pattern(int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + for (int x = 4; x < width - 8; x += 12) { + g.fillRect(x, 8, 6, height - 16); + } + g.fillRect(width / 3, height / 3, width / 3, height / 3); + } finally { + g.dispose(); + } + return img; + } +} diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java index c9032d2..9b7d1c5 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java @@ -94,6 +94,69 @@ void dominantDpiReturnsAPositiveResolution(@TempDir Path tmp) throws Exception { assertThat(dpi).isPositive(); } + /** Build a multi-page PDF embedding CCITT-G4 bitonal images, one per page, at ~200 ppi. */ + private static void writeCcittPdf(Path pdf, int pages, int imgW, int imgH) throws IOException { + try (PDDocument doc = new PDDocument()) { + for (int i = 0; i < pages; i++) { + BufferedImage bitonal = + new BufferedImage(imgW, imgH, BufferedImage.TYPE_BYTE_BINARY); + java.awt.Graphics2D g = bitonal.createGraphics(); + try { + g.setColor(java.awt.Color.WHITE); + g.fillRect(0, 0, imgW, imgH); + g.setColor(java.awt.Color.BLACK); + g.fillRect(10 + i, 10, imgW / 3, imgH / 2); + } finally { + g.dispose(); + } + float wPt = imgW * 72f / 200; + float hPt = imgH * 72f / 200; + PDPage page = new PDPage(new PDRectangle(wPt, hPt)); + doc.addPage(page); + PDImageXObject image = + org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory.createFromImage( + doc, bitonal); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, wPt, hPt); + } + } + doc.save(pdf.toFile()); + } + } + + @Test + @EnabledIf("io.github.p4suta.shared.pdf.PdfImagesCliExtractorTest#toolsOnPath") + void remuxesAnAllCcittSourceIntoStampedG4Tiffs(@TempDir Path tmp) throws Exception { + Path pdf = tmp.resolve("scan.pdf"); + writeCcittPdf(pdf, 3, 240, 180); + Path outDir = Files.createDirectory(tmp.resolve("out")); + + ExecutorService pool = Executors.newFixedThreadPool(2); + try { + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); + } finally { + pool.shutdownNow(); + } + + try (Stream entries = Files.list(outDir)) { + List files = entries.sorted().toList(); + // The remux leaves exactly one .tif per page — no .ccitt/.params residue. + assertThat(files).hasSize(3); + assertThat(files).allSatisfy(p -> assertThat(p.toString()).endsWith(".tif")); + for (Path tif : files) { + try (io.github.p4suta.shared.imaging.Pix pix = + io.github.p4suta.shared.imaging.Pix.read(tif)) { + assertThat(pix.width()).isEqualTo(240); + assertThat(pix.height()).isEqualTo(180); + // The remux stamps the image's true ppi instead of pdfimages' default 72. + assertThat(pix.resolution()).isEqualTo(200); + // Black ink, not inverted: the drawn block is ~1/6 of the page. + assertThat(pix.blackPixels()).isGreaterThan(0).isLessThan(240L * 180 / 2); + } + } + } + } + @Test void missingToolFailsWithAClearMessage(@TempDir Path tmp) throws Exception { Path pdf = tmp.resolve("doc.pdf"); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java index f228032..2bba8d0 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java @@ -120,4 +120,27 @@ void parseDominantDpiSkipsNonImageRowsWithTooFewFields() { PdfListingParser.DEFAULT_DPI, PdfListingParser.parseDominantDpi("hdr\n----\n 1 0 smask\n")); } + + @Test + void parseImageRowsReadsTheColumnsTheExtractorNeeds() { + var rows = PdfListingParser.parseImageRows(LIST); + assertEquals(3, rows.size()); + assertEquals(new PdfListingParser.ImageRow(1, 2480, 3508, 1, "ccitt", 300), rows.get(0)); + assertEquals(new PdfListingParser.ImageRow(3, 1240, 1754, 1, "ccitt", 150), rows.get(2)); + } + + @Test + void parseImageRowsSkipsMalformedAndNonImageRows() { + String mixed = + """ + page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio + -------------------------------------------------------------------------------------------- + 1 0 smask 2480 3508 gray 1 1 ccitt no 7 0 300 300 101K 1.2% + 2 1 image bad 3508 gray 1 1 ccitt no 11 0 300 300 99K 1.1% + 3 2 image 2480 3508 rgb 3 8 jpeg no 14 0 150 150 40K 1.0% + """; + var rows = PdfListingParser.parseImageRows(mixed); + assertEquals(1, rows.size()); + assertEquals(new PdfListingParser.ImageRow(3, 2480, 3508, 8, "jpeg", 150), rows.get(0)); + } } From 86074bf872345b48a50f0e4f80748b57fc6533b9 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:49:25 +0900 Subject: [PATCH 3/9] fix(concurrency): fail-fast page fan-out that keeps exception identity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shared Tasks.awaitAll (and its two private copies in despeckle/ register) had three real defects: - No fail-fast: invokeAll waits for every page even after the first failure — a corrupt page at position 1 of 600 still ran the other 599. - Error kinds were destroyed at the pool boundary: a worker's domain exception (DespeckleException, RegisterException — RuntimeExceptions carrying their ErrorKind) was wrapped into a generic IOException, so ExceptionMapper saw INTERNAL instead of the real kind: wrong exit code, wrong RunFailed token in the webapp's SSE stream. - ProcessRunner leaked the child on interrupt: an InterruptedException from waitFor propagated without destroyForcibly, leaving the drainer close() hanging on a live child's pipes. Tasks is rewritten around a batch-owned executor (try-with-resources) with StructuredTaskScope.join() semantics on final-Java features: fail-fast with sibling interruption plus a gate that stops queued tasks a freed worker dequeues in the shutdownNow race window, quiescence before the failure propagates (so finally-deletes never race in-flight writers), and exception identity preserved (IOException/ RuntimeException unchanged, UncheckedIOException unwrapped). Two scheduling modes: Workers.platform(jobs) for CPU-bound Leptonica/FFM work (long downcalls pin virtual threads' carriers) and Workers.virtual(maxInFlight) for subprocess waits (pdfimages chunks, per-page jbig2), semaphore-bounded. ItemProgress now fires on the orchestrating thread in completion order, so per-page progress events are strictly ordered and the AtomicInteger counters at every call site are gone. All seven call sites migrate (G4EncodeStage, DespeckleService, RegistrationService both passes, PdfExtractSource, the shared extractor/assembler, Jbig2PackService, both PdfPipelineServices — whose redundant outer pools, idle through the whole middle step, are removed); the ExecutorService parameter disappears from the PdfImageExtractor/Jbig2Assembler ports; the dead NativeTools.awaitAll copies (zero callers) are deleted. pdfbook's Main also gains a shutdown hook that interrupts the run and waits (bounded 15s) for it to unwind, so Ctrl-C now kills the children, quiesces the workers, and deletes the p4suta-pipeline- work area instead of leaking it. Verified in-container: SIGINT to the process group aborts the run (exit 130), no work dir remains. Benchmark: no regression (conv 14.29s vs 14.23s at -j8 on the 200-page fixture, within noise). Co-Authored-By: Claude Fable 5 --- despeckle/application/build.gradle.kts | 4 + .../application/DespeckleService.java | 76 ++-- .../application/Jbig2PackService.java | 6 +- .../application/PdfPipelineService.java | 62 ++- .../p4suta/despeckle/application/Fakes.java | 6 +- .../pdf/PdfBoxJbig2Assembler.java | 7 +- .../pdf/PdfImagesCliExtractor.java | 7 +- .../infrastructure/process/NativeTools.java | 50 +-- .../p4suta/despeckle/port/Jbig2Assembler.java | 7 +- .../despeckle/port/PdfImageExtractor.java | 9 +- .../p4suta/despeckle/port/package-info.java | 6 +- .../java/io/github/p4suta/pipeline/Main.java | 48 ++- .../infrastructure/G4EncodeStage.java | 44 +-- .../infrastructure/PdfExtractSource.java | 9 +- .../infrastructure/G4EncodeStageTest.java | 12 +- register/application/build.gradle.kts | 4 + .../application/PdfPipelineService.java | 61 ++- .../application/RegistrationService.java | 128 ++----- .../p4suta/register/application/Fakes.java | 6 +- .../application/RegistrationServiceTest.java | 7 +- .../pdf/PdfBoxJbig2Assembler.java | 5 +- .../pdf/PdfImagesCliExtractor.java | 7 +- .../infrastructure/process/NativeTools.java | 25 +- .../p4suta/register/port/Jbig2Assembler.java | 3 +- .../register/port/PdfImageExtractor.java | 7 +- .../shared/pdf/PdfBoxJbig2Assembler.java | 10 +- .../shared/pdf/PdfImagesCliExtractor.java | 10 +- .../shared/pdf/PdfBoxJbig2AssemblerTest.java | 20 +- .../shared/pdf/PdfImagesCliExtractorTest.java | 16 +- shared/process/build.gradle.kts | 19 +- .../p4suta/shared/process/ProcessRunner.java | 12 +- .../github/p4suta/shared/process/Tasks.java | 237 ++++++++++-- .../shared/process/ProcessRunnerTest.java | 34 ++ .../p4suta/shared/process/TasksTest.java | 356 +++++++++++++++--- 34 files changed, 827 insertions(+), 493 deletions(-) diff --git a/despeckle/application/build.gradle.kts b/despeckle/application/build.gradle.kts index a6f1593..165103b 100644 --- a/despeckle/application/build.gradle.kts +++ b/despeckle/application/build.gradle.kts @@ -25,6 +25,10 @@ dependencies { // implementations are no longer duplicated here. Static calls returning java.base types only, so // implementation (not api) is the right scope. implementation(project(":shared:io")) + // Tasks.awaitAll(Workers...): the shared fail-fast page fan-out (batch-owned executor, + // sibling interruption, quiescence before the failure propagates) DespeckleService runs its + // per-page work on. + implementation(project(":shared:process")) implementation(libs.slf4j.api) } diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java index 062e974..034f2e2 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java @@ -9,18 +9,13 @@ import io.github.p4suta.shared.io.CorpusFiles; import io.github.p4suta.shared.io.OutputDirs; import io.github.p4suta.shared.kernel.PageProgressListener; +import io.github.p4suta.shared.process.Tasks; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -88,8 +83,8 @@ public Summary run(Config config) throws IOException { /** * Execute a run, reporting each finished page to {@code progress}. * - * @param progress called once per page as it completes (1-based count, total page count); may - * be invoked from the worker threads, so it must be thread-safe + * @param progress called once per page as it completes (1-based count, total page count), + * always on the calling thread and in order */ public Summary run(Config config, PageProgressListener progress) throws IOException { OutputDirs.prepare(config.outputDir(), config.force()); @@ -106,27 +101,25 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept ? Reporter.noOp() : reporterFactory.create(config.reportDir(), config.flipbook()); - AtomicInteger done = new AtomicInteger(); - List outcomes; - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - try { - List> tasks = new ArrayList<>(files.size()); - for (Path src : files) { - tasks.add( - () -> { - PageOutcome outcome = processOne(src, config, report); - int n = done.incrementAndGet(); - progress.onPage(n, files.size()); - if (n % PROGRESS_EVERY == 0 || n == files.size()) { - LOG.info("{}/{}", n, files.size()); + List> tasks = new ArrayList<>(files.size()); + for (Path src : files) { + tasks.add(() -> processOne(src, config, report)); + } + // Platform workers: each page is CPU-bound Leptonica work (FFM downcalls pin virtual + // threads' carriers). The fan-out fails fast and quiesces before throwing, so a failed + // run never leaves workers writing into the output directory. Progress and the human + // log arrive on this thread, ordered. + List outcomes = + Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "despeckle", + (done, total) -> { + progress.onPage(done, total); + if (done % PROGRESS_EVERY == 0 || done == total) { + LOG.info("{}/{}", done, total); } - return outcome; }); - } - outcomes = invokeAll(pool, tasks); - } finally { - pool.shutdown(); - } report.finish(); @@ -152,7 +145,7 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept private record PageOutcome(Path source, ProcessResult result) {} - private PageOutcome processOne(Path src, Config config, Reporter report) { + private PageOutcome processOne(Path src, Config config, Reporter report) throws IOException { Path dest = CorpusFiles.mirrorDestination( src, config.inputDir(), config.outputDir(), config.format().extension()); @@ -166,30 +159,9 @@ private PageOutcome processOne(Path src, Config config, Reporter report) { report.addPage(stem, src, dest, result); return new PageOutcome(src, result); } catch (IOException e) { - throw new UncheckedIOException("failed to process " + src, e); - } - } - - private static List invokeAll( - ExecutorService pool, List> tasks) throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("despeckle run interrupted", e); - } - List outcomes = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - outcomes.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("despeckle run interrupted", e); - } catch (ExecutionException e) { - throw new IOException("page processing failed", e.getCause()); - } + // Re-throw with the page named: Tasks surfaces a task's IOException unchanged, so + // this context reaches the user instead of being lost in a generic wrapper. + throw new IOException("failed to process " + src, e); } - return outcomes; } } diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java index c82f4cb..665badd 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java @@ -9,8 +9,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -77,17 +75,15 @@ public void run(Config config) throws IOException { } Path jb2Dir = createWorkDir(config.outPdf()); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); try { assembler.assemble( config.imageDir(), config.outPdf(), config.source(), config.dpi(), - pool, + config.jobs(), jb2Dir); } finally { - pool.shutdown(); deleteRecursively(jb2Dir); } linearizer.linearize(config.outPdf()); diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java index e90a508..16357c6 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java @@ -12,8 +12,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -96,41 +94,37 @@ public DespeckleService.Summary run(Config config) throws IOException { Path extracted = Files.createDirectories(work.resolve("in")); Path cleaned = Files.createDirectories(work.resolve("clean")); Path jbig2Dir = Files.createDirectories(work.resolve("jb2")); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - DespeckleService.Summary summary; - try { - int dpi = - config.options().dpi().isPresent() - ? config.options().dpi().getAsInt() - : extractor.dominantDpi(config.inputPdf()); - LOG.info( - "pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); + int dpi = + config.options().dpi().isPresent() + ? config.options().dpi().getAsInt() + : extractor.dominantDpi(config.inputPdf()); + LOG.info("pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); - extractor.extract(config.inputPdf(), extracted, config.jobs(), pool); + // Each step fans out on its own batch-owned workers bounded by the same jobs budget, + // so the steps never hold idle threads for each other (the old shared outer pool sat + // idle through the whole despeckle step). + extractor.extract(config.inputPdf(), extracted, config.jobs()); - DespeckleService.Config clean = - new DespeckleService.Config( - extracted, - cleaned, - OutputFormat.TIFF, - "*.tif", - config.jobs(), - true, - config.options().withDpi(dpi), - config.reportDir(), - config.flipbook()); - summary = despeckleService.run(clean); + DespeckleService.Config clean = + new DespeckleService.Config( + extracted, + cleaned, + OutputFormat.TIFF, + "*.tif", + config.jobs(), + true, + config.options().withDpi(dpi), + config.reportDir(), + config.flipbook()); + DespeckleService.Summary summary = despeckleService.run(clean); - assembler.assemble( - cleaned, - config.outputPdf(), - config.inputPdf(), - OptionalInt.of(dpi), - pool, - jbig2Dir); - } finally { - pool.shutdown(); - } + assembler.assemble( + cleaned, + config.outputPdf(), + config.inputPdf(), + OptionalInt.of(dpi), + config.jobs(), + jbig2Dir); linearizer.linearize(config.outputPdf()); LOG.info("wrote {}", config.outputPdf()); return summary; diff --git a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java index a970181..aaf9f49 100644 --- a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java +++ b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java @@ -17,7 +17,6 @@ import java.nio.file.Path; import java.util.List; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; @@ -111,8 +110,7 @@ public int dominantDpi(Path pdf) { } @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) - throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { if (failOn != null && pdf.getFileName().toString().contains(failOn)) { throw new IOException("fake extract failed for " + pdf); } @@ -135,7 +133,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { Files.writeString(outPdf, "%PDF-fake", StandardCharsets.UTF_8); diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java index 907e663..a1a7a7b 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java @@ -4,7 +4,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -34,7 +33,7 @@ public PdfBoxJbig2Assembler() {} * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page {@code jbig2} encodes run on + * @param jobs how many {@code jbig2} children may encode at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -45,9 +44,9 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { - delegate.assemble(imageDir, outPdf, source, forcedDpi, pool, scratchDir); + delegate.assemble(imageDir, outPdf, source, forcedDpi, jobs, scratchDir); } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java index 044a175..ab186b5 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java @@ -3,7 +3,6 @@ import io.github.p4suta.despeckle.port.PdfImageExtractor; import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Despeckle's {@link PdfImageExtractor} adapter — a wrapper over the shared {@code @@ -34,10 +33,10 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { - delegate.extract(pdf, outDir, jobs, pool); + public void extract(Path pdf, Path outDir, int jobs) throws IOException { + delegate.extract(pdf, outDir, jobs); } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java index 5812478..dfdf3c5 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java @@ -6,12 +6,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; -import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; /** @@ -26,10 +21,10 @@ *

The launch helpers stay local rather than routing through {@code :shared:process}'s {@code * ProcessRunner}/{@code Tasks}: {@link #capture} returns the raw {@code byte[]} a JBIG2 stream * needs (the shared runner decodes to a UTF-8 {@code String}, which is lossy for binary output), - * and {@link #run}/{@link #awaitAll} raise/propagate the tagged {@link DespeckleException} kind - * (the shared {@code Tasks} flattens any non-{@code IOException} cause, losing the kind and its - * exit code). The {@code qpdf} call site, whose output is text and whose exit-3 tolerance the - * shared runner models directly, uses {@code ProcessRunner} instead. + * and {@link #run} raises/propagates the tagged {@link DespeckleException} kind (the shared {@code + * Tasks} flattens any non-{@code IOException} cause, losing the kind and its exit code). The {@code + * qpdf} call site, whose output is text and whose exit-3 tolerance the shared runner models + * directly, uses {@code ProcessRunner} instead. * *

Public so the sibling {@code infrastructure.pdf} adapters can call it; it never leaves the * {@code :infrastructure} module (the application and CLI layers depend only on the {@code :port} @@ -121,41 +116,4 @@ private static void awaitExit(Process process, List command, long timeou null); } } - - /** - * Run every task on {@code pool}, in submission order, surfacing the first failure as - * IOException. - */ - public static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("pipeline interrupted", e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - results.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("pipeline interrupted", e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - // Preserve a tagged domain failure (e.g. NATIVE_TOOL_FAILED / IMAGE_UNREADABLE - // raised inside a worker) so its kind — and exit code — survives the pool boundary - // instead of being flattened to a generic IOException. - if (cause instanceof DespeckleException de) { - throw de; - } - if (cause instanceof IOException io) { - throw io; - } - throw new IOException("pipeline task failed", cause); - } - } - return results; - } } diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java index 7db4260..a7549e5 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java @@ -3,13 +3,12 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** * Packs a directory of cleaned bitonal pages into a lossless-JBIG2 PDF. The abstraction over the * {@code jbig2} encoder + PDFBox container; the implementation ({@code - * infrastructure.pdf.PdfBoxJbig2Assembler}) runs the per-page encodes on the supplied pool. + * infrastructure.pdf.PdfBoxJbig2Assembler}) runs at most {@code jobs} per-page encodes at once. */ public interface Jbig2Assembler { @@ -20,7 +19,7 @@ public interface Jbig2Assembler { * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page encodes run on + * @param jobs how many per-page encodes may run at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -30,7 +29,7 @@ void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException; } diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java index 77a4969..f80348b 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java @@ -2,13 +2,12 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Extracts a PDF's embedded bitonal images, and reports its dominant scan resolution. The * abstraction over the {@code pdfimages}/{@code pdfinfo} drivers; the implementation ({@code - * infrastructure.pdf.PdfImagesCliExtractor}) splits the page range across the supplied pool, one - * {@code pdfimages} invocation per chunk. + * infrastructure.pdf.PdfImagesCliExtractor}) splits the page range into bounded chunks, one {@code + * pdfimages} invocation per chunk. */ public interface PdfImageExtractor { @@ -17,7 +16,7 @@ public interface PdfImageExtractor { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ - void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException; + void extract(Path pdf, Path outDir, int jobs) throws IOException; } diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java index 4c8bdf4..23f5510 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java @@ -2,9 +2,9 @@ * The despeckle ports: the narrow interfaces the application layer drives and the infrastructure * layer implements. Every signature is expressed purely in terms of {@code domain.model} value * types and JDK types ({@link java.nio.file.Path}, primitives, {@link java.util.OptionalInt}, - * {@link java.util.List}, {@link java.util.concurrent.ExecutorService}, {@link - * java.io.IOException}); no Leptonica {@code Pix}, PDFBox or AWT type ever crosses a port boundary, - * so the dependency is kept unidirectional and the adapters stay swappable. + * {@link java.util.List}, {@link java.io.IOException}); no Leptonica {@code Pix}, PDFBox or AWT + * type ever crosses a port boundary, so the dependency is kept unidirectional and the adapters stay + * swappable. */ @NullMarked package io.github.p4suta.despeckle.port; diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java index 67b1db9..81ccc68 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java @@ -2,20 +2,62 @@ import io.github.p4suta.pipeline.cli.PipelineCommand; import io.github.p4suta.shared.observability.FatalUncaughtHandler; +import java.time.Duration; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; /** Process entry point for the unified pipeline tool ({@code pdfbook}). */ public final class Main { + /** + * How long the shutdown hook waits for the interrupted run to unwind. Generous: the page + * fan-outs quiesce in at most one page's worth of native work per worker, and the extractor + * kills its children on interrupt. + */ + private static final Duration SHUTDOWN_GRACE = Duration.ofSeconds(15); + private Main() {} /** - * Installs the fatal uncaught-exception handler, runs the CLI, and translates the result into a - * process exit code. All argument parsing and terminal output live in {@link PipelineCommand}. + * Installs the fatal uncaught-exception handler and the Ctrl-C cleanup hook, runs the CLI, and + * translates the result into a process exit code. All argument parsing and terminal output live + * in {@link PipelineCommand}. + * + *

On Ctrl-C (or any normal-termination signal) the shutdown hook interrupts the main thread + * and waits — bounded by {@link #SHUTDOWN_GRACE} — for the run to unwind: the page fan-outs + * stop their workers and quiesce, subprocesses are killed, and the run's {@code finally} blocks + * delete the temp work area. A Ctrl-C therefore no longer leaks a {@code p4suta-pipeline-} + * directory full of intermediates. On a normal exit the latch is already down, so the hook + * returns immediately. * * @param args command-line arguments */ public static void main(String[] args) { Thread.setDefaultUncaughtExceptionHandler(new FatalUncaughtHandler()); - System.exit(new PipelineCommand().run(args)); + CountDownLatch unwound = new CountDownLatch(1); + Thread main = Thread.currentThread(); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + main.interrupt(); + try { + boolean clean = + unwound.await( + SHUTDOWN_GRACE.toMillis(), + TimeUnit.MILLISECONDS); + if (!clean) { + System.err.println( + "pdfbook: shutdown grace expired; temp files" + + " may remain"); + } + } catch (InterruptedException ignored) { + // The JVM is exiting anyway; stop waiting. + } + }, + "pdfbook-shutdown")); + int code = new PipelineCommand().run(args); + unwound.countDown(); + System.exit(code); } } diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java index bb3ad78..4fea0dc 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java @@ -12,9 +12,6 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; /** * The G4-normalization {@link Stage}: re-encodes each extracted page as single-strip CCITT G4 TIFF @@ -57,29 +54,26 @@ public String name() { @Override public Corpus apply(Corpus input, Path workDir) throws IOException { List pages = CorpusFiles.collect(input.dir(), input.glob()); - AtomicInteger done = new AtomicInteger(); - ExecutorService pool = Executors.newFixedThreadPool(jobs); - try { - List> tasks = new ArrayList<>(pages.size()); - for (Path src : pages) { - tasks.add( - () -> { - Path out = - CorpusFiles.mirrorDestination(src, input.dir(), workDir, "tif"); - try (Pix page = Pix.read(src)) { - page.setResolution(input.dpi()); - page.writeTiffG4(out); - } - progress.emit( - new ProgressEvent.PageProcessed( - name(), done.incrementAndGet(), pages.size())); - return null; - }); - } - Tasks.awaitAll(pool, tasks, "G4 encode interrupted", "G4 encode failed"); - } finally { - pool.shutdown(); + List> tasks = new ArrayList<>(pages.size()); + for (Path src : pages) { + tasks.add( + () -> { + Path out = CorpusFiles.mirrorDestination(src, input.dir(), workDir, "tif"); + try (Pix page = Pix.read(src)) { + page.setResolution(input.dpi()); + page.writeTiffG4(out); + } + return null; + }); } + // Platform workers: each task is a Leptonica decode + G4 encode (CPU-bound FFM downcalls, + // which would pin virtual threads' carriers). Progress arrives on this thread, ordered. + Tasks.awaitAll( + Tasks.Workers.platform(jobs), + tasks, + "G4 encode", + (done, total) -> + progress.emit(new ProgressEvent.PageProcessed(name(), done, total))); return input.movedTo(workDir, OUTPUT_GLOB); } } diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java index 1a11456..c40694d 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java @@ -7,8 +7,6 @@ import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; /** * The pipeline {@link Source}: extracts a scan PDF's bitonal pages once, via the shared {@code @@ -45,12 +43,7 @@ public String name() { @Override public Corpus open(Path workDir) throws IOException { int dpi = extractor.dominantDpi(sourcePdf); - ExecutorService pool = Executors.newFixedThreadPool(jobs); - try { - extractor.extract(sourcePdf, workDir, jobs, pool); - } finally { - pool.shutdown(); - } + extractor.extract(sourcePdf, workDir, jobs); return new Corpus(workDir, EXTRACT_GLOB, dpi, count(workDir, EXTRACT_GLOB)); } diff --git a/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java b/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java index 4ee3222..16a149c 100644 --- a/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java +++ b/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java @@ -1,7 +1,7 @@ package io.github.p4suta.pipeline.infrastructure; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatIOException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import io.github.p4suta.pipeline.domain.Corpus; import io.github.p4suta.shared.imaging.Pix; @@ -79,15 +79,17 @@ void reEncodesUncompressedTiffsAsCcittG4() throws Exception { } @Test - void surfacesAnUnreadablePageAsIoException() throws Exception { + void surfacesAnUnreadablePageWithItsOriginalException() throws Exception { Path inputDir = Files.createDirectories(tmp.resolve("in")); Path workDir = Files.createDirectories(tmp.resolve("out")); Files.writeString(inputDir.resolve("page-000.tif"), "not a tiff"); Corpus input = new Corpus(inputDir, "*.tif", DPI, 1); - assertThatIOException() - .isThrownBy(() -> new G4EncodeStage(2).apply(input, workDir)) - .withMessage("G4 encode failed"); + // The fan-out preserves the failure's identity (no generic IOException wrapper), so the + // shared exception mapper still sees the original type and the message names the page. + assertThatThrownBy(() -> new G4EncodeStage(2).apply(input, workDir)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("could not read image"); } /** diff --git a/register/application/build.gradle.kts b/register/application/build.gradle.kts index 631d833..11a5a77 100644 --- a/register/application/build.gradle.kts +++ b/register/application/build.gradle.kts @@ -25,6 +25,10 @@ dependencies { // caller-supplied extension). Register passes its own glob/OutputFormat.extension() so this layer // imports neither the matcher nor the OutputFormat enum. implementation(project(":shared:io")) + // Tasks.awaitAll(Workers...): the shared fail-fast page fan-out (batch-owned executor, + // sibling interruption, quiescence before the failure propagates) both registration passes + // run their per-page work on. + implementation(project(":shared:process")) implementation(libs.slf4j.api) } diff --git a/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java b/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java index 0a826b4..6672142 100644 --- a/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java +++ b/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java @@ -11,8 +11,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -80,40 +78,37 @@ public void run(Config config) throws IOException { Path extracted = Files.createDirectories(work.resolve("in")); Path registered = Files.createDirectories(work.resolve("reg")); Path jbig2Dir = Files.createDirectories(work.resolve("jb2")); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - try { - int dpi = - config.options().dpi().isPresent() - ? config.options().dpi().getAsInt() - : extractor.dominantDpi(config.inputPdf()); - LOG.info( - "pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); + int dpi = + config.options().dpi().isPresent() + ? config.options().dpi().getAsInt() + : extractor.dominantDpi(config.inputPdf()); + LOG.info("pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); - extractor.extract(config.inputPdf(), extracted, config.jobs(), pool); + // Each step fans out on its own batch-owned workers bounded by the same jobs budget, + // so the steps never hold idle threads for each other (the old shared outer pool sat + // idle through the whole registration step). + extractor.extract(config.inputPdf(), extracted, config.jobs()); - RegistrationService.Config registration = - new RegistrationService.Config( - extracted, - registered, - OutputFormat.TIFF, - "*.tif", - config.jobs(), - true, - withDpi(config.options(), dpi), - null, - false); - registrationService.run(registration); + RegistrationService.Config registration = + new RegistrationService.Config( + extracted, + registered, + OutputFormat.TIFF, + "*.tif", + config.jobs(), + true, + withDpi(config.options(), dpi), + null, + false); + registrationService.run(registration); - assembler.assemble( - registered, - config.outputPdf(), - config.inputPdf(), - OptionalInt.of(dpi), - pool, - jbig2Dir); - } finally { - pool.shutdown(); - } + assembler.assemble( + registered, + config.outputPdf(), + config.inputPdf(), + OptionalInt.of(dpi), + config.jobs(), + jbig2Dir); LOG.info("wrote {}", config.outputPdf()); } finally { deleteRecursively(work); diff --git a/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java b/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java index a6210c3..378a6a1 100644 --- a/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java +++ b/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java @@ -17,6 +17,7 @@ import io.github.p4suta.shared.io.OutputDirs; import io.github.p4suta.shared.kernel.Medians; import io.github.p4suta.shared.kernel.PageProgressListener; +import io.github.p4suta.shared.process.Tasks; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -26,11 +27,6 @@ import java.util.Locale; import java.util.OptionalInt; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -109,8 +105,8 @@ public Summary run(Config requested) throws IOException { * * @param requested run configuration (its empty {@code --dpi} is resolved from the inputs * before rendering) - * @param progress called once per finished page in each pass (1-based count over {@code 2N}); - * invoked from the worker threads, so it must be thread-safe + * @param progress called once per finished page in each pass (1-based count over {@code 2N}), + * always on the calling thread and in order * @throws IOException on filesystem failure */ public Summary run(Config requested, PageProgressListener progress) throws IOException { @@ -130,7 +126,6 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc // Per-page progress spans both passes (analyze 1..N, render N+1..2N), so each PageProcessed // the composing stage emits carries this 2N denominator. int progressTotal = files.size() * 2; - AtomicInteger pageProgress = new AtomicInteger(); Path diagDir = config.diagDir(); boolean recordDiagnostics = diagDir != null; @@ -140,21 +135,15 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc : Reporter.noOp(); // The analysis pass writes each deskewed page here; the render pass reads it back, so the - // costly deskew and detection run once per page, not again in pass two. Removed at the end. + // costly deskew and detection run once per page, not again in pass two. Removed at the + // end — safely: each pass's fan-out quiesces before returning or throwing, so no worker + // is still writing into the scratch when the finally-delete runs. Path scratchDir = createBeside(config.outputDir(), ".register-deskewed-"); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); try { // Pass 1: deskew once, detect, cache the deskewed page for pass two List pages = analyzePass( - files, - scratchDir, - config, - recordDiagnostics, - pool, - progress, - pageProgress, - progressTotal); + files, scratchDir, config, recordDiagnostics, progress, progressTotal); List observations = toObservations(pages); int analyzed = observations.size(); @@ -189,9 +178,7 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc config, recordDiagnostics, reporter, - pool, progress, - pageProgress, progressTotal); if (recordDiagnostics) { @@ -221,7 +208,6 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc analyzed); return new Summary(files.size(), analyzed); } finally { - pool.shutdown(); deleteRecursively(scratchDir); } } @@ -235,9 +221,7 @@ private List analyzePass( Path scratchDir, Config config, boolean recordDiagnostics, - ExecutorService pool, PageProgressListener progress, - AtomicInteger pageProgress, int progressTotal) throws IOException { List> tasks = new ArrayList<>(files.size()); @@ -246,18 +230,23 @@ private List analyzePass( Path src = files.get(i); Path scratch = scratchDir.resolve(String.format(Locale.ROOT, "%06d.tif", index)); tasks.add( - () -> { - PageAnalysis analysis = - pageRegistrar.analyze( - src, scratch, config.options(), recordDiagnostics); - AnalyzedPage analyzed = - new AnalyzedPage(index, Parity.of(index), src, scratch, analysis); - // The analyze pass occupies the first N of the 2N progress units. - progress.onPage(pageProgress.incrementAndGet(), progressTotal); - return analyzed; - }); + () -> + new AnalyzedPage( + index, + Parity.of(index), + src, + scratch, + pageRegistrar.analyze( + src, scratch, config.options(), recordDiagnostics))); } - return awaitAll(pool, tasks); + // Platform workers: deskew + detection are CPU-bound Leptonica work (FFM downcalls pin + // virtual threads' carriers). The analyze pass occupies the first N of the 2N progress + // units; the callback arrives on this thread, ordered. + return Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "register analyze", + (done, total) -> progress.onPage(done, progressTotal)); } /** The detected main-column boxes from pass 1, one per page that had a detectable column. */ @@ -286,36 +275,27 @@ private List renderPass( Config config, boolean recordDiagnostics, Reporter reporter, - ExecutorService pool, PageProgressListener progress, - AtomicInteger pageProgress, int progressTotal) throws IOException { - AtomicInteger done = new AtomicInteger(); - int total = pages.size(); - List> tasks = new ArrayList<>(total); + List> tasks = new ArrayList<>(pages.size()); for (AnalyzedPage page : pages) { tasks.add( - () -> { - Path dest = - renderOne( - page, - reference, - canvas, - config, - recordDiagnostics, - reporter); - int n = done.incrementAndGet(); - // The render pass occupies the second N of the 2N progress units; the local - // count drives the human log at its own N denominator. - progress.onPage(pageProgress.incrementAndGet(), progressTotal); - if (n % PROGRESS_EVERY == 0 || n == total) { - LOG.info("{}/{}", n, total); - } - return dest; - }); + () -> renderOne(page, reference, canvas, config, recordDiagnostics, reporter)); } - return awaitAll(pool, tasks); + // Platform workers: placement is CPU-bound Leptonica work. The render pass occupies the + // second N of the 2N progress units; the local count drives the human log at its own N + // denominator. Both arrive on this thread, ordered. + return Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "register render", + (done, total) -> { + progress.onPage(pages.size() + done, progressTotal); + if (done % PROGRESS_EVERY == 0 || done == total) { + LOG.info("{}/{}", done, total); + } + }); } /** @@ -431,38 +411,6 @@ private Path renderOne( return dest; } - /** - * Run every task on {@code pool} in submission order, surfacing the first failure: a task's own - * {@link IOException} is re-thrown unchanged, any other failure is wrapped, and an interruption - * restores the interrupt flag. - */ - private static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("register run interrupted", e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - results.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("register run interrupted", e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - if (cause instanceof IOException io) { - throw io; - } - throw new IOException("page processing failed", cause); - } - } - return results; - } - /** * Create a temporary directory beside {@code sibling} (under its parent), so the scratch space * shares a filesystem with the output it serves. Falls back to the working directory when diff --git a/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java b/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java index 28d7ad4..08f2a4e 100644 --- a/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java +++ b/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.Optional; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; @@ -194,8 +193,7 @@ public int dominantDpi(Path pdf) { } @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) - throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { extractCalls.incrementAndGet(); if (failOn != null && pdf.getFileName().toString().contains(failOn)) { throw new IOException("fake extract failed for " + pdf); @@ -220,7 +218,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { lastForcedDpi = forcedDpi; diff --git a/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java b/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java index cb028db..5b16165 100644 --- a/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java +++ b/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java @@ -181,11 +181,12 @@ void writesDiagnosticsWhenADiagDirIsGiven(@TempDir Path tmp) throws IOException } @Test - void propagatesAWorkerFailureAsAnIoException(@TempDir Path tmp) throws IOException { + void propagatesAWorkerFailureWithItsIdentity(@TempDir Path tmp) throws IOException { Path in = Files.createDirectories(tmp.resolve("in")); Path out = tmp.resolve("out"); writePages(in, 2); - // A registrar whose analyze pass throws: the service must surface it, not swallow it. + // A registrar whose analyze pass throws: the service must surface the original exception + // unchanged (the fan-out preserves identity, so domain kinds reach the exception mapper). PageRegistrar failing = new FakePageRegistrar(true, 600, 1000, 1500) { @Override @@ -201,7 +202,7 @@ public PageAnalysis analyze( new RegistrationService(failing, new RecordingReporterFactory()); assertThrows( - IOException.class, + IllegalStateException.class, () -> service.run( config( diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java index a5a1411..85224b2 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java @@ -4,7 +4,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -40,9 +39,9 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { - delegate.assemble(imageDir, outPdf, source, forcedDpi, pool, scratchDir); + delegate.assemble(imageDir, outPdf, source, forcedDpi, jobs, scratchDir); } } diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java index 231229a..07c55e5 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java @@ -3,7 +3,6 @@ import io.github.p4suta.register.port.PdfImageExtractor; import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Register's {@link PdfImageExtractor} adapter: a thin binding onto the cross-app {@link @@ -43,10 +42,10 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { - delegate.extract(pdf, outDir, jobs, pool); + public void extract(Path pdf, Path outDir, int jobs) throws IOException { + delegate.extract(pdf, outDir, jobs); } } diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java index c254061..347a21b 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java @@ -3,7 +3,6 @@ import io.github.p4suta.register.domain.exception.RegisterErrorKind; import io.github.p4suta.register.domain.exception.RegisterException; import io.github.p4suta.shared.process.ProcessRunner; -import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; import java.io.InputStream; @@ -11,8 +10,6 @@ import java.time.Duration; import java.util.List; import java.util.Objects; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -22,12 +19,11 @@ * io.github.p4suta.shared.process} plumbing: discovery is delegated to {@link * io.github.p4suta.shared.process.ToolPath} (passing register's per-tool {@code * -Dregister..path} override key — this is how the packaged app-image points at its bundled - * binaries, else the tool is looked up on {@code PATH}), the discard-output run is delegated to - * {@link io.github.p4suta.shared.process.ProcessRunner}, and the worker-pool fan-out to {@link - * io.github.p4suta.shared.process.Tasks}. Unlike the optional flip-book, a missing pipeline tool is - * fatal — the pipeline cannot proceed — so resolution throws, and the shared layer's neutral {@code - * IOException}/{@code TimeoutException} failures are re-mapped to register's {@code - * RegisterException} (NATIVE_TOOL_FAILED) so the pipeline keeps its own error wording. + * binaries, else the tool is looked up on {@code PATH}), and the discard-output run is delegated to + * {@link io.github.p4suta.shared.process.ProcessRunner}. Unlike the optional flip-book, a missing + * pipeline tool is fatal — the pipeline cannot proceed — so resolution throws, and the shared + * layer's neutral {@code IOException}/{@code TimeoutException} failures are re-mapped to register's + * {@code RegisterException} (NATIVE_TOOL_FAILED) so the pipeline keeps its own error wording. */ public final class NativeTools { @@ -128,15 +124,4 @@ private static void awaitExit(Process process, List command, long timeou null); } } - - /** - * Run every task on {@code pool}, in submission order, surfacing the first failure as an {@link - * IOException}. Thin wrapper over the shared {@link - * io.github.p4suta.shared.process.Tasks#awaitAll} that supplies the pipeline's messages, so the - * await semantics live in one place while the pipeline keeps its own error wording. - */ - public static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - return Tasks.awaitAll(pool, tasks, "pipeline interrupted", "pipeline task failed"); - } } diff --git a/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java b/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java index 466e95a..95902d0 100644 --- a/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java +++ b/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -28,7 +27,7 @@ void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException; } diff --git a/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java b/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java index 7114285..23089b4 100644 --- a/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java +++ b/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java @@ -2,7 +2,6 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Extracts a scanned PDF's embedded bitonal images as TIFFs. The abstraction over {@code pdfimages} @@ -20,10 +19,10 @@ public interface PdfImageExtractor { /** * Extract every page of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks across the worker pool. + * chunks, at most {@code jobs} at once. * - * @param jobs the worker thread count (bounds the chunk count and pool slots used) + * @param jobs how many chunks may extract at once * @throws IOException if the external tool fails */ - void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException; + void extract(Path pdf, Path outDir, int jobs) throws IOException; } diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java index 8fc2306..3113420 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java @@ -14,7 +14,6 @@ import java.util.Locale; import java.util.OptionalInt; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; import org.apache.pdfbox.Loader; @@ -85,7 +84,7 @@ private record ImageInfo(int width, int height, int resolution) {} * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page {@code jbig2} encodes run on + * @param jobs how many {@code jbig2} children may encode at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -95,7 +94,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { List images = sortedImages(imageDir); @@ -109,8 +108,9 @@ public void assemble( int index = i; tasks.add(() -> encode(jbig2, image, scratchDir, index, forcedDpi)); } - List pages = - Tasks.awaitAll(pool, tasks, "jbig2 encode interrupted", "jbig2 encode failed"); + // Virtual workers: each task is one subprocess wait plus a brief Pix header read; the + // semaphore-bounded admission keeps at most `jobs` jbig2 children alive at once. + List pages = Tasks.awaitAll(Tasks.Workers.virtual(jobs), tasks, "jbig2 encode"); try (PDDocument doc = new PDDocument()) { for (Page page : pages) { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 2c4bfac..833b01d 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -14,7 +14,6 @@ import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeoutException; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; @@ -101,7 +100,8 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks of about {@link #CHUNK_PAGES} pages on {@code pool} (at most {@code 4 * jobs} chunks). + * chunks of about {@link #CHUNK_PAGES} pages, at most {@code jobs} running at once (and at most + * {@code 4 * jobs} chunks). * *

One {@code pdfimages -list} pass picks the mode: when every embedded image is 1-bpp CCITT * (the usual self-scanned book), each chunk dumps the raw G4 streams ({@code -ccitt}) and wraps @@ -112,7 +112,7 @@ public int dominantDpi(Path pdf) throws IOException { * a chunk whose dump or wrap deviates in any way is re-extracted decoded ({@code -tiff}), which * is also the whole-run mode for any other source. */ - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { int total = pageCount(pdf); String pdfimages = resolve("pdfimages", pdfimagesPropertyKey); List rows = @@ -145,7 +145,9 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw }); chunk++; } - Tasks.awaitAll(pool, tasks, "pdfimages extract interrupted", "pdfimages extract failed"); + // Virtual workers: a chunk is one subprocess wait plus (in remux mode) a brief wrap; the + // semaphore-bounded admission keeps at most `jobs` pdfimages children alive at once. + Tasks.awaitAll(Tasks.Workers.virtual(jobs), tasks, "pdfimages extract"); } /** Pages per extraction chunk; see {@link #extract}. */ diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java index 3bd7edf..2bae0ec 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java @@ -8,8 +8,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import javax.imageio.ImageIO; import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; @@ -57,13 +55,10 @@ void assemblesABitonalDirectoryIntoAJbig2Pdf(@TempDir Path tmp) throws Exception writeBitonalPage(imageDir.resolve("page-000.png"), 64, 48); writeBitonalPage(imageDir.resolve("page-001.png"), 80, 60); Path outPdf = tmp.resolve("out.pdf"); - - ExecutorService pool = Executors.newFixedThreadPool(2); try { new PdfBoxJbig2Assembler(JBIG2_KEY) - .assemble(imageDir, outPdf, null, OptionalInt.of(300), pool, scratch); + .assemble(imageDir, outPdf, null, OptionalInt.of(300), 2, scratch); } finally { - pool.shutdownNow(); } assertThat(Files.exists(outPdf)).isTrue(); @@ -103,13 +98,10 @@ void inheritsInfoAndVersionFromTheSourcePdf(@TempDir Path tmp) throws Exception Path scratch = Files.createDirectory(tmp.resolve("scratch")); writeBitonalPage(imageDir.resolve("page-000.png"), 64, 48); Path outPdf = tmp.resolve("out.pdf"); - - ExecutorService pool = Executors.newSingleThreadExecutor(); try { new PdfBoxJbig2Assembler(JBIG2_KEY) - .assemble(imageDir, outPdf, source, OptionalInt.empty(), pool, scratch); + .assemble(imageDir, outPdf, source, OptionalInt.empty(), 2, scratch); } finally { - pool.shutdownNow(); } try (PDDocument doc = Loader.loadPDF(outPdf.toFile())) { @@ -125,7 +117,6 @@ void emptyDirectoryFailsBeforeTouchingTheToolchain(@TempDir Path tmp) throws Exc // runs before tool resolution). Works on any host. Path imageDir = Files.createDirectory(tmp.resolve("empty")); Path scratch = Files.createDirectory(tmp.resolve("scratch")); - ExecutorService pool = Executors.newSingleThreadExecutor(); try { PdfBoxJbig2Assembler assembler = new PdfBoxJbig2Assembler(JBIG2_KEY); assertThatThrownBy( @@ -135,12 +126,11 @@ void emptyDirectoryFailsBeforeTouchingTheToolchain(@TempDir Path tmp) throws Exc tmp.resolve("out.pdf"), null, OptionalInt.empty(), - pool, + 1, scratch)) .isInstanceOf(IOException.class) .hasMessageContaining("no cleaned images"); } finally { - pool.shutdownNow(); } } @@ -156,7 +146,6 @@ void missingJbig2BinaryFailsWithAClearMessage(@TempDir Path tmp) throws Exceptio String bogusKey = "shared.pdf.test.bogus.jbig2.path"; System.setProperty(bogusKey, tmp.resolve("definitely-not-jbig2").toString()); try { - ExecutorService pool = Executors.newSingleThreadExecutor(); try { PdfBoxJbig2Assembler assembler = new PdfBoxJbig2Assembler(bogusKey); assertThatThrownBy( @@ -166,11 +155,10 @@ void missingJbig2BinaryFailsWithAClearMessage(@TempDir Path tmp) throws Exceptio tmp.resolve("out.pdf"), null, OptionalInt.of(300), - pool, + 1, scratch)) .isInstanceOf(IOException.class); } finally { - pool.shutdownNow(); } } finally { System.clearProperty(bogusKey); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java index 9b7d1c5..4d17e4d 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java @@ -10,8 +10,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import javax.imageio.ImageIO; import org.apache.pdfbox.pdmodel.PDDocument; @@ -68,12 +66,7 @@ void extractsEmbeddedImagesAsTiffs(@TempDir Path tmp) throws Exception { writePdfWithImage(pdf, 120, 90); Path outDir = Files.createDirectory(tmp.resolve("out")); - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); - } finally { - pool.shutdownNow(); - } + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2); try (Stream entries = Files.list(outDir)) { List tiffs = @@ -131,12 +124,7 @@ void remuxesAnAllCcittSourceIntoStampedG4Tiffs(@TempDir Path tmp) throws Excepti writeCcittPdf(pdf, 3, 240, 180); Path outDir = Files.createDirectory(tmp.resolve("out")); - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); - } finally { - pool.shutdownNow(); - } + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2); try (Stream entries = Files.list(outDir)) { List files = entries.sorted().toList(); diff --git a/shared/process/build.gradle.kts b/shared/process/build.gradle.kts index 5ca5418..f34fc9a 100644 --- a/shared/process/build.gradle.kts +++ b/shared/process/build.gradle.kts @@ -20,8 +20,11 @@ plugins { // (exitCode + stdout + stderr + elapsed Duration; tate's typed Result is the shape donor) and // throwing TimeoutException on timeout. The caller passes the set of acceptable non-zero exits // (generalizing despeckle's hardcoded qpdf-exit-3 tolerance) — a non-acceptable exit throws. -// * Tasks — awaitAll: a parameterized parallel fan-out over a caller-owned pool (register's), -// collecting results in submission order and aggregating the first failure as an IOException. +// * Tasks — awaitAll: a parallel fan-out over a batch-owned executor (Workers.platform for +// CPU-bound work, Workers.virtual for subprocess waits), collecting results in submission +// order with fail-fast sibling interruption, quiescence before the failure propagates, and +// exception identity preserved (domain RuntimeExceptions keep their error kind). Mirrors +// StructuredTaskScope.join() semantics on final-Java features. // // No domain exceptions are reachable here (that is the point of generalizing): a launch failure or // an unacceptable exit surfaces as a plain IOException, a timeout as a TimeoutException — so the @@ -37,13 +40,11 @@ dependencies { // Coverage floor: the same realistic infra-like 0.75 line / 0.60 branch the despeckle // :infrastructure and :shared:imaging modules use — this is process/exec plumbing, not branch-rich -// domain logic, so the domain-grade 0.95/0.90 the kernel/observability use would be dishonest. The -// only branches a unit test cannot drive are the InterruptedException catches (you cannot -// deterministically interrupt the waiting thread mid-waitFor) and a stray launch-failure path; at -// CLASS granularity the imaging-style exclusion would throw away ProcessRunner's genuinely covered -// happy/timeout/exit paths too, so NO class is excluded — the lenient floor absorbs the few -// untestable catch arms. The same self-contained block the other shared modules carry, since the -// floor is not applied by any convention plugin. +// domain logic, so the domain-grade 0.95/0.90 the kernel/observability use would be dishonest. +// (The interrupt arms are now deterministically covered — TasksTest and ProcessRunnerTest drive +// them with latch-coordinated interrupts — but a stray launch-failure path remains; the lenient +// floor absorbs it.) The same self-contained block the other shared modules carry, since the floor +// is not applied by any convention plugin. tasks.named("jacocoTestCoverageVerification") { dependsOn(tasks.named("test")) violationRules { diff --git a/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java b/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java index f1349fa..8ecfefe 100644 --- a/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java +++ b/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java @@ -81,9 +81,17 @@ public static Result run( try (ExecutorService drainers = Executors.newVirtualThreadPerTaskExecutor()) { Future out = drainers.submit(() -> process.getInputStream().readAllBytes()); Future err = drainers.submit(() -> process.getErrorStream().readAllBytes()); - if (!process.waitFor(timeout.toMillis(), TimeUnit.MILLISECONDS)) { + try { + if (!process.waitFor(timeout.toMillis(), TimeUnit.MILLISECONDS)) { + process.destroyForcibly(); + throw new TimeoutException(command.get(0) + " timed out after " + timeout); + } + } catch (InterruptedException e) { + // Kill the child before propagating, exactly like the timeout path: the drainers + // then reach EOF, so the try-with-resources close() returns instead of waiting on + // a live child's pipes — and no orphaned process outlives the interrupted caller. process.destroyForcibly(); - throw new TimeoutException(command.get(0) + " timed out after " + timeout); + throw e; } // Decode leniently (replace malformed bytes), matching the long-standing behavior. String stdout = new String(await(out), StandardCharsets.UTF_8); diff --git a/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java b/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java index 725d3aa..7212886 100644 --- a/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java +++ b/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java @@ -1,65 +1,236 @@ package io.github.p4suta.shared.process; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.Semaphore; +import java.util.concurrent.atomic.AtomicBoolean; +import org.jspecify.annotations.Nullable; /** - * Runs a batch of {@link Callable}s on a caller-owned {@link ExecutorService} and collects their - * results in submission order, turning the executor's checked machinery into a single {@link - * IOException}. + * Runs a batch of {@link Callable}s on a batch-owned executor with fail-fast semantics: the first + * failure interrupts the remaining workers, the batch waits until every worker has actually stopped + * (quiescence), and only then does the failure propagate — so a caller's {@code finally} can safely + * delete the directories the workers were writing into. + * + *

The propagated failure keeps its identity: a task's own {@link IOException} or {@link + * RuntimeException} (the apps' domain exceptions are runtime exceptions) is re-thrown unchanged, so + * the shared exception mapper still sees the original kind and maps the right exit code and {@code + * RunFailed} token; an {@link UncheckedIOException} is unwrapped to its cause. This deliberately + * mirrors {@code StructuredTaskScope.join()}'s contract (fail-fast, sibling interruption, + * quiescence before propagation) on final-Java features, so a future swap to structured concurrency + * stays internal to this class. */ public final class Tasks { private Tasks() {} /** - * Submit every task, wait for all of them, and return their results in the order {@code tasks} - * was given. The first failure stops collection and surfaces as an {@link IOException}: a - * task's own {@link IOException} is re-thrown unchanged (so callers see the real cause), any - * other failure is wrapped with {@code failureMessage}, and an interruption restores the - * thread's interrupt flag and is reported with {@code interruptedMessage}. + * How one batch schedules its workers. * - * @param pool the executor to run the tasks on (the caller owns its lifecycle) + *

{@link #platform(int)} is for CPU-bound work (Leptonica/FFM pixel ops, TIFF G4 encoding): + * a fixed pool of platform threads, because a long native downcall pins a virtual thread's + * carrier, which would silently cap concurrency at the carrier count. {@link #virtual(int)} is + * for subprocess-bound work (per-page {@code jbig2}, {@code pdfimages} chunks): one virtual + * thread per task, with at most {@code maxInFlight} admitted past an internal semaphore so the + * spawned child processes stay bounded. + */ + public static final class Workers { + + private final int limit; + private final boolean virtualThreads; + + private Workers(int limit, boolean virtualThreads) { + if (limit < 1) { + throw new IllegalArgumentException("worker limit must be >= 1, got " + limit); + } + this.limit = limit; + this.virtualThreads = virtualThreads; + } + + /** {@code jobs} platform threads — CPU-bound work. */ + public static Workers platform(int jobs) { + return new Workers(jobs, false); + } + + /** + * A virtual thread per task, at most {@code maxInFlight} running at once — work that mostly + * waits on a subprocess. + */ + public static Workers virtual(int maxInFlight) { + return new Workers(maxInFlight, true); + } + + private ExecutorService newExecutor() { + return virtualThreads + ? Executors.newVirtualThreadPerTaskExecutor() + : Executors.newFixedThreadPool(limit); + } + + private @Nullable Semaphore newPermits() { + return virtualThreads ? new Semaphore(limit) : null; + } + } + + /** + * Per-item completion callback, invoked on the orchestrating thread in completion order: {@code + * done} runs {@code 1..total}, strictly increasing, successes only — so progress events built + * on it are ordered without any thread-safety burden on the listener. + */ + @FunctionalInterface + public interface ItemProgress { + + /** The no-op listener. */ + ItemProgress NONE = (done, total) -> {}; + + /** + * One more item finished. + * + * @param done how many items have finished so far (1-based, strictly increasing) + * @param total how many items the batch runs + */ + void onItem(int done, int total); + } + + /** {@link #awaitAll(Workers, List, String, ItemProgress)} without progress reporting. */ + public static List awaitAll(Workers workers, List> tasks, String label) + throws IOException { + return awaitAll(workers, tasks, label, ItemProgress.NONE); + } + + /** + * Run every task on a batch-owned executor and return their results in submission order. + * + *

On the first failure the remaining workers are interrupted ({@code shutdownNow}) and the + * batch waits for full quiescence before the failure propagates with its identity intact: + * {@link IOException} and {@link RuntimeException} unchanged, {@link UncheckedIOException} + * unwrapped to its cause, anything else wrapped as {@code IOException(label + " failed")}. + * Caller interruption likewise stops and joins the workers, restores the interrupt flag, and + * surfaces as {@code IOException(label + " interrupted")}. + * + * @param workers the scheduling mode (see {@link Workers}) * @param tasks the work to run, one result per task - * @param interruptedMessage the message for an {@link IOException} raised on interruption - * @param failureMessage the message wrapping a non-{@code IOException} task failure + * @param label the batch's name in failure/interruption messages (e.g. {@code "G4 encode"}) + * @param progress called on this thread after each successful item * @param the task result type * @return the task results, in submission order * @throws IOException if any task fails or the wait is interrupted */ public static List awaitAll( - ExecutorService pool, - List> tasks, - String interruptedMessage, - String failureMessage) + Workers workers, List> tasks, String label, ItemProgress progress) throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException(interruptedMessage, e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { + int total = tasks.size(); + List<@Nullable T> results = new ArrayList<>(Collections.nCopies(total, null)); + try (ExecutorService pool = workers.newExecutor()) { + ExecutorCompletionService> completion = + new ExecutorCompletionService<>(pool); + @Nullable Semaphore permits = workers.newPermits(); + // The fail-fast gate: set by the first worker whose task throws, checked by every + // worker before it starts a task. shutdownNow() alone cannot stop a queued task a + // newly-freed worker dequeues in the instant before the orchestrator reacts; the gate + // closes that window (a gated task completes with the BatchCancelled marker instead + // of running user code). + AtomicBoolean failed = new AtomicBoolean(); + for (int i = 0; i < total; i++) { + int index = i; + Callable task = tasks.get(i); + completion.submit( + () -> { + if (failed.get()) { + throw new BatchCancelled(); + } + // Acquired inside the worker so submission never blocks and an + // interrupt during the wait cancels cleanly. + if (permits != null) { + permits.acquire(); + } + try { + return new IndexedResult<>(index, task.call()); + } catch (Throwable t) { + failed.set(true); + throw t; + } finally { + if (permits != null) { + permits.release(); + } + } + }); + } try { - results.add(future.get()); + int done = 0; + while (done < total) { + Future> future = completion.take(); + try { + IndexedResult result = future.get(); + results.set(result.index(), result.value()); + done++; + progress.onItem(done, total); + } catch (ExecutionException e) { + if (e.getCause() instanceof BatchCancelled) { + // A gated sibling, not the root cause — keep draining: the failure + // that set the gate already ran, so its completion is on its way. + continue; + } + pool.shutdownNow(); + throw failure(e.getCause(), label); + } + } } catch (InterruptedException e) { + pool.shutdownNow(); Thread.currentThread().interrupt(); - throw new IOException(interruptedMessage, e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - if (cause instanceof IOException io) { - throw io; - } - throw new IOException(failureMessage, cause); + throw new IOException(label + " interrupted", e); } + // The try-with-resources close() awaits full termination on every path, so by the + // time a result or an exception reaches the caller, no worker is still running (or + // still writing into directories the caller is about to delete). } - return results; + return castNonNull(results); + } + + /** One task's result tagged with its submission index, for submission-order assembly. */ + private record IndexedResult(int index, @Nullable T value) {} + + /** The internal marker a gated (skipped-after-failure) task completes with; never user code. */ + private static final class BatchCancelled extends RuntimeException { + private static final long serialVersionUID = 1L; + + BatchCancelled() { + super("cancelled by an earlier failure in the batch", null, false, false); + } + } + + /** The failure to propagate for a task that threw {@code cause}; see {@link #awaitAll}. */ + private static IOException failure(@Nullable Throwable cause, String label) { + switch (cause) { + case IOException io -> { + return io; + } + case UncheckedIOException unchecked -> { + // UncheckedIOException's constructor requires a non-null cause. + return java.util.Objects.requireNonNull(unchecked.getCause()); + } + case RuntimeException runtime -> throw runtime; + case Error error -> throw error; + case null, default -> { + return new IOException(label + " failed", cause); + } + } + } + + /** + * {@return the results list with every slot filled} Every slot was set exactly once (one + * completion per submitted task), so the nullable build-up list is safe to expose as non-null. + */ + @SuppressWarnings({"unchecked", "NullAway"}) // each index 0..total-1 was set by its task + private static List castNonNull(List<@Nullable T> results) { + return (List) (List) results; } } diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java index 5514e24..c826d9d 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java @@ -111,4 +111,38 @@ void missingBinaryFailsWithIoException() { Duration.ofSeconds(10))) .isInstanceOf(IOException.class); } + + @Test + @org.junit.jupiter.api.Timeout(15) + void interruptedWaiterKillsTheChildAndUnwindsPromptly() throws Exception { + // Regression: an interrupt during waitFor used to propagate WITHOUT killing the child, so + // the drainer close() hung on the live child's pipes (here: up to the sleep's 30s) and the + // child leaked. Now the child is killed first, so the call unwinds promptly — the @Timeout + // and the bounded join are the proof. + var thrown = new java.util.concurrent.atomic.AtomicReference(); + var flagRestoredPathTaken = new java.util.concurrent.atomic.AtomicBoolean(); + var started = new java.util.concurrent.CountDownLatch(1); + Thread caller = + new Thread( + () -> { + try { + started.countDown(); + ProcessRunner.run(List.of("sleep", "30"), Duration.ofMinutes(5)); + } catch (InterruptedException e) { + thrown.set(e); + flagRestoredPathTaken.set(true); + } catch (Exception e) { + thrown.set(e); + } + }); + caller.start(); + assertThat(started.await(10, java.util.concurrent.TimeUnit.SECONDS)).isTrue(); + // Give the child a beat to actually spawn before interrupting the waiter. + Thread.sleep(300); + caller.interrupt(); + caller.join(10_000); + assertThat(caller.isAlive()).as("the interrupted run must unwind promptly").isFalse(); + assertThat(thrown.get()).isInstanceOf(InterruptedException.class); + assertThat(flagRestoredPathTaken).isTrue(); + } } diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java index 0b689d1..f50253b 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java @@ -1,74 +1,340 @@ package io.github.p4suta.shared.process; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +/** + * The batch contract of {@link Tasks#awaitAll(Tasks.Workers, List, String, Tasks.ItemProgress)}: + * submission-order results, fail-fast with sibling interruption, quiescence before the failure + * propagates, exception identity, orchestrator-thread progress, and the virtual mode's in-flight + * bound. All coordination is latch-based — no sleeps as logic. + */ +@Timeout(30) class TasksTest { + // ---- happy path ---- + @Test void returnsResultsInSubmissionOrder() throws IOException { - ExecutorService pool = Executors.newFixedThreadPool(4); - try { - List> tasks = new ArrayList<>(); - for (int i = 0; i < 16; i++) { - int value = i; - tasks.add(() -> value); - } - List results = Tasks.awaitAll(pool, tasks, "interrupted", "failed"); - for (int i = 0; i < 16; i++) { - assertEquals(i, results.get(i)); - } - } finally { - pool.shutdown(); + List> tasks = new ArrayList<>(); + for (int i = 0; i < 16; i++) { + int value = i; + tasks.add(() -> value); + } + List results = Tasks.awaitAll(Tasks.Workers.platform(4), tasks, "batch"); + for (int i = 0; i < 16; i++) { + assertEquals(i, results.get(i)); } } @Test - void surfacesATasksIOExceptionUnchanged() { - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - IOException boom = new IOException("boom"); - Callable task = - () -> { - throw boom; - }; - IOException thrown = - assertThrows( - IOException.class, - () -> Tasks.awaitAll(pool, List.of(task), "interrupted", "failed")); - // The task's own IOException is re-thrown as-is, not wrapped with the failure message. - assertSame(boom, thrown); - } finally { - pool.shutdown(); + void progressRunsOnTheCallingThreadInCompletionOrder() throws IOException { + Thread caller = Thread.currentThread(); + List seen = new ArrayList<>(); // safe: appended on the calling thread only + AtomicBoolean wrongThread = new AtomicBoolean(); + List> tasks = new ArrayList<>(); + for (int i = 0; i < 8; i++) { + int value = i; + tasks.add(() -> value); } + Tasks.awaitAll( + Tasks.Workers.platform(4), + tasks, + "batch", + (done, total) -> { + if (Thread.currentThread() != caller) { + wrongThread.set(true); + } + seen.add(new int[] {done, total}); + }); + assertFalse(wrongThread.get(), "progress must run on the orchestrating thread"); + assertEquals(8, seen.size()); + for (int i = 0; i < 8; i++) { + assertEquals(i + 1, seen.get(i)[0], "done is strictly increasing from 1"); + assertEquals(8, seen.get(i)[1]); + } + } + + // ---- failure semantics ---- + + @Test + void surfacesATasksIOExceptionByIdentity() { + IOException boom = new IOException("boom"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(boom)), + "batch")); + assertSame(boom, thrown); + } + + @Test + void surfacesARuntimeExceptionByIdentitySoErrorKindsSurvive() { + IllegalStateException boom = new IllegalStateException("domain kind carrier"); + IllegalStateException thrown = + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(boom)), + "batch")); + assertSame(boom, thrown); + } + + @Test + void unwrapsAnUncheckedIOExceptionToItsCause() { + IOException cause = new IOException("disk"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(new UncheckedIOException(cause))), + "batch")); + assertSame(cause, thrown); + } + + @Test + void wrapsAnUnknownCheckedFailureWithTheLabel() { + Exception checked = new Exception("odd"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(checked)), + "G4 encode")); + assertEquals("G4 encode failed", thrown.getMessage()); + assertSame(checked, thrown.getCause()); + } + + // ---- fail-fast, sibling interruption, quiescence ---- + + @Test + void firstFailureInterruptsSiblingsAndSkipsQueuedWork() throws Exception { + CountDownLatch blockerStarted = new CountDownLatch(1); + AtomicBoolean blockerInterrupted = new AtomicBoolean(); + AtomicBoolean queuedRan = new AtomicBoolean(); + IllegalStateException boom = new IllegalStateException("first failure"); + + Callable blocker = + () -> { + blockerStarted.countDown(); + try { + new CountDownLatch(1).await(); // blocks until interrupted + } catch (InterruptedException e) { + blockerInterrupted.set(true); + } + return null; + }; + Callable failer = + () -> { + blockerStarted.await(); // fail only once the sibling is provably running + throw boom; + }; + Callable queued = + () -> { + queuedRan.set(true); + return null; + }; + + IllegalStateException thrown = + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(blocker, failer, queued), + "batch")); + assertSame(boom, thrown); + assertTrue(blockerInterrupted.get(), "the running sibling must be interrupted"); + assertFalse(queuedRan.get(), "queued work must be discarded on failure"); } @Test - void wrapsANonIoFailureWithTheFailureMessage() { - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - Callable task = + void quiescesBeforeTheFailurePropagates() { + AtomicInteger inFlight = new AtomicInteger(); + CountDownLatch blockerStarted = new CountDownLatch(1); + + Callable blocker = + () -> { + inFlight.incrementAndGet(); + try { + blockerStarted.countDown(); + new CountDownLatch(1).await(); // until interrupted + return null; + } catch (InterruptedException e) { + return null; + } finally { + inFlight.decrementAndGet(); + } + }; + Callable failer = + () -> { + blockerStarted.await(); + throw new IllegalStateException("boom"); + }; + + assertThrows( + IllegalStateException.class, + () -> Tasks.awaitAll(Tasks.Workers.platform(2), List.of(blocker, failer), "batch")); + // The contract callers' finally-blocks rely on: when awaitAll throws, no worker is still + // running (still writing into directories about to be deleted). + assertEquals(0, inFlight.get()); + } + + @Test + void progressCountsOnlySuccessesContiguously() { + CountDownLatch twoDone = new CountDownLatch(2); + List dones = new ArrayList<>(); // appended on the calling thread only + Callable ok = + () -> { + twoDone.countDown(); + return null; + }; + Callable failer = + () -> { + twoDone.await(); // both successes complete first + throw new IllegalStateException("boom"); + }; + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(3), + List.of(ok, ok, failer), + "batch", + (done, total) -> dones.add(done))); + assertEquals(List.of(1, 2), dones); + } + + // ---- caller interruption ---- + + @Test + void callerInterruptionStopsWorkersAndRestoresTheFlag() throws Exception { + CountDownLatch workerStarted = new CountDownLatch(1); + AtomicBoolean workerInterrupted = new AtomicBoolean(); + AtomicInteger inFlight = new AtomicInteger(); + AtomicReference thrown = new AtomicReference<>(); + AtomicBoolean flagRestored = new AtomicBoolean(); + + Callable blocker = + () -> { + inFlight.incrementAndGet(); + try { + workerStarted.countDown(); + new CountDownLatch(1).await(); + return null; + } catch (InterruptedException e) { + workerInterrupted.set(true); + return null; + } finally { + inFlight.decrementAndGet(); + } + }; + + Thread caller = + new Thread( + () -> { + try { + Tasks.awaitAll( + Tasks.Workers.platform(1), List.of(blocker), "register"); + } catch (IOException e) { + thrown.set(e); + flagRestored.set(Thread.currentThread().isInterrupted()); + } + }); + caller.start(); + assertTrue(workerStarted.await(10, TimeUnit.SECONDS)); + caller.interrupt(); + caller.join(10_000); + assertFalse(caller.isAlive(), "the interrupted batch must unwind promptly"); + + assertNotNull(thrown.get()); + assertInstanceOf(IOException.class, thrown.get()); + assertEquals("register interrupted", thrown.get().getMessage()); + assertTrue(flagRestored.get(), "the interrupt flag must be restored"); + assertTrue(workerInterrupted.get(), "the worker must be stopped"); + assertEquals(0, inFlight.get(), "quiescent before the exception reached the caller"); + } + + // ---- virtual mode ---- + + @Test + void virtualModeBoundsInFlightWork() throws Exception { + int bound = 3; + AtomicInteger inFlight = new AtomicInteger(); + AtomicInteger highWater = new AtomicInteger(); + CountDownLatch gate = new CountDownLatch(1); + CountDownLatch boundRunning = new CountDownLatch(bound); + + List> tasks = new ArrayList<>(); + for (int i = 0; i < 12; i++) { + tasks.add( () -> { - throw new IllegalStateException("oops"); - }; - IOException thrown = - assertThrows( - IOException.class, - () -> Tasks.awaitAll(pool, List.of(task), "interrupted", "failed")); - assertEquals("failed", thrown.getMessage()); - assertInstanceOf(IllegalStateException.class, thrown.getCause()); - } finally { - pool.shutdown(); + int now = inFlight.incrementAndGet(); + highWater.accumulateAndGet(now, Math::max); + boundRunning.countDown(); + try { + gate.await(); + return null; + } finally { + inFlight.decrementAndGet(); + } + }); } + Thread opener = + new Thread( + () -> { + try { + // Open the gate only once `bound` workers are provably in flight, + // so the high-water assertion is meaningful, then let all pass. + boundRunning.await(); + gate.countDown(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + }); + opener.start(); + Tasks.awaitAll(Tasks.Workers.virtual(bound), tasks, "jbig2 encode"); + opener.join(10_000); + assertEquals(bound, highWater.get(), "no more than the bound may run at once"); + } + + @Test + void rejectsANonPositiveWorkerLimit() { + assertThrows(IllegalArgumentException.class, () -> Tasks.Workers.platform(0)); + assertThrows(IllegalArgumentException.class, () -> Tasks.Workers.virtual(-1)); + } + + /** A task that throws {@code failure} once called. */ + private static Callable failing(Exception failure) { + return () -> { + throw failure; + }; } } From b7c20be7c93d04d06e4d99ea96e8a355f695158a Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 15:25:45 +0900 Subject: [PATCH 4/9] test(despeckle): add an op-level micro-benchmark for the page cleaner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit despeckle dominates pdfbook conversions (~72% of conv), but nothing measured WHERE inside clean() the ~50ms/page/core goes. benchCleaner times each Leptonica primitive the cleaner composes — read, the four selectBySize shapes (incl. the inverted-page variant whose giant background component is rendered back), the 43x43 dilate, the 7x7 open, the boolean ops, both counting passes, the G4 write — plus clean() end-to-end, on a deterministic synthetic 600-dpi A5 page, and writes the table to despeckle/docs/cleaner-baseline.md. The committed baseline (174.9ms clean(), sigma row covers 92.5%): dilate 43x43 = 21.5%, selectBySize on the inverted page x2 = 25.3% (22.2ms vs 15.2ms on the normal page — the background-component re-render penalty, measured), metrics-only countConnComp x2 = 13.5%, page selectBySize x2 = 17.4%, open 7x7 = 7.0%. Every following optimization is judged against this table. Co-Authored-By: Claude Fable 5 --- despeckle/docs/cleaner-baseline.md | 31 ++ despeckle/infrastructure/build.gradle.kts | 18 + .../tools/CleanerBenchmark.java | 341 ++++++++++++++++++ 3 files changed, 390 insertions(+) create mode 100644 despeckle/docs/cleaner-baseline.md create mode 100644 despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java diff --git a/despeckle/docs/cleaner-baseline.md b/despeckle/docs/cleaner-baseline.md new file mode 100644 index 0000000..96c422a --- /dev/null +++ b/despeckle/docs/cleaner-baseline.md @@ -0,0 +1,31 @@ +# despeckle cleaner op-level baseline + +Generated by `CleanerBenchmark` (`./gradlew :despeckle:infrastructure:benchCleaner`, dev container). +Times each Leptonica primitive the page cleaner composes on a synthetic +600-dpi A5 page (3496x4961 px, fixed seed). Re-run after any change to the +cleaner or the imaging bindings and compare before merging. + +- Date (UTC): 2026-06-10 06:25:05 +- Host: Linux amd64, 8 CPUs +- Samples: median of 10 reps after 2 warmups; single-threaded. + +| op | median (ms) | min (ms) | calls/clean() | est. share of clean() | +|---|---:|---:|---:|---:| +| read TIFF-G4 | 2.59 | 2.52 | 1 | 1.5% | +| selectBySize k=6 (page) | 15.22 | 14.98 | 1 | 8.7% | +| selectBySize 15 (page) | 15.19 | 14.80 | 1 | 8.7% | +| selectBySize k=6 (inverted) | 22.16 | 21.89 | 2 | 25.3% | +| dilate 43x43 (text mask) | 37.52 | 37.01 | 1 | 21.5% | +| open 7x7 (page) | 12.18 | 12.04 | 1 | 7.0% | +| invert | 0.26 | 0.25 | 2 | 0.3% | +| subtract | 0.38 | 0.36 | 5 | 1.1% | +| and | 0.40 | 0.38 | 1 | 0.2% | +| or | 0.33 | 0.32 | 3 | 0.6% | +| countConnComp | 11.82 | 11.46 | 2 | 13.5% | +| countPixels | 0.41 | 0.41 | 2 | 0.5% | +| write TIFF-G4 | 6.44 | 6.29 | 1 | 3.7% | +| **Σ(median × calls)** | 161.74 | | | 92.5% | +| **clean() end-to-end** | 174.91 | 173.28 | 1 | 100% | + +The Σ row landing near 100% means the table accounts for clean()'s real cost; +a large gap points at untimed work (allocation churn, codec internals). diff --git a/despeckle/infrastructure/build.gradle.kts b/despeckle/infrastructure/build.gradle.kts index 90f7db5..c957b23 100644 --- a/despeckle/infrastructure/build.gradle.kts +++ b/despeckle/infrastructure/build.gradle.kts @@ -78,3 +78,21 @@ tasks.named("jacocoTestCoverageVerification") { } tasks.named("check") { dependsOn("jacocoTestCoverageVerification") } + +// ---- Cleaner op-level micro-benchmark (see despeckle/docs/cleaner-baseline.md) ------------------ +// Times each Leptonica primitive the page cleaner composes, plus end-to-end clean(), on a +// deterministic synthetic 600-dpi A5 page — the measurement every cleaner/imaging optimization is +// judged against. Knob: -Preps=N (default 10). Dev-container only (needs the native Leptonica). +tasks.register("benchCleaner") { + group = "verification" + description = "Micro-benchmark cleaner ops + end-to-end clean(); writes despeckle/docs/cleaner-baseline.md" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.despeckle.infrastructure.tools.CleanerBenchmark" + workingDir = rootDir + args = + listOf( + "despeckle/docs/cleaner-baseline.md", + providers.gradleProperty("reps").getOrElse("10"), + ) +} diff --git a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java new file mode 100644 index 0000000..d1ff679 --- /dev/null +++ b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java @@ -0,0 +1,341 @@ +package io.github.p4suta.despeckle.infrastructure.tools; + +import io.github.p4suta.despeckle.domain.model.OutputFormat; +import io.github.p4suta.despeckle.domain.model.ProcessOptions; +import io.github.p4suta.despeckle.infrastructure.leptonica.LeptonicaPageCleaner; +import io.github.p4suta.despeckle.testsupport.TestImages; +import io.github.p4suta.shared.imaging.Pix; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Random; + +/** + * Op-level micro-benchmark for the despeckle page cleaner: times each Leptonica primitive {@link + * LeptonicaPageCleaner} composes, plus the end-to-end {@code clean()}, on a deterministic synthetic + * 600-dpi A5 page (3496×4961 px — the pdfbook pipeline's real page geometry). Writes a Markdown + * table to {@code despeckle/docs/cleaner-baseline.md} so every optimization claim is judged against + * a committed measurement, mirroring the pipeline's {@code benchPipeline} convention. + * + *

Each op row reports the median and minimum of N timed reps (after warmup), the number of times + * {@code clean()} invokes that op per page with default options, and the estimated share of the + * end-to-end time the op therefore accounts for. The Σ(median × calls) sanity row should land near + * the {@code clean()} median; a large gap means an untimed cost (allocation, G4 codec internals). + * + *

Test-sources tool (driven by the {@code benchCleaner} Gradle task): never ships, and needs the + * dev container's Leptonica. Usage: {@code CleanerBenchmark [reps]}. + */ +public final class CleanerBenchmark { + + private static final int DPI = 600; + private static final int WIDTH = Math.round(148f * DPI / 25.4f); // A5 portrait + private static final int HEIGHT = Math.round(210f * DPI / 25.4f); + private static final int WARMUP_REPS = 2; + + // The cleaner's literal Leptonica selection flags (pix.h values, see LeptonicaPageCleaner). + private static final int CONN_8 = 8; + private static final int IF_EITHER = 5; + private static final int IF_GT = 2; + + // Default-derived sizes at 600 dpi: speck k=6, isolated dust 15, proximity 21, stroke 3. + private static final int SPECK = 6; + private static final int DUST = 15; + private static final int PROXIMITY = 21; + private static final int STROKE = 3; + + private CleanerBenchmark() {} + + /** One measured op: its label, how often clean() runs it, and the timed reps. */ + private record Row(String op, int callsPerClean, double medianMs, double minMs) {} + + public static void main(String[] args) throws IOException { + Path outDoc = Path.of(args.length > 0 ? args[0] : "despeckle/docs/cleaner-baseline.md"); + int reps = args.length > 1 ? Integer.parseInt(args[1]) : 10; + + Path work = Files.createTempDirectory("cleaner-bench"); + try { + Path pbm = work.resolve("page.pbm"); + TestImages.writePbm(pbm, syntheticPage()); + Path g4 = work.resolve("page.tif"); + try (Pix page = Pix.read(pbm)) { + page.setResolution(DPI); + page.writeTiffG4(g4); + } + + List rows = new ArrayList<>(); + try (Pix page = Pix.read(g4); + Pix inverted = page.inverted(); + Pix text6 = page.selectBySize(SPECK, SPECK, CONN_8, IF_EITHER, IF_GT); + Pix text15 = page.selectBySize(DUST, DUST, CONN_8, IF_EITHER, IF_GT)) { + + rows.add(timePix("read TIFF-G4", 1, reps, () -> Pix.read(g4))); + rows.add( + timePix( + "selectBySize k=6 (page)", + 1, + reps, + () -> page.selectBySize(SPECK, SPECK, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "selectBySize 15 (page)", + 1, + reps, + () -> page.selectBySize(DUST, DUST, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "selectBySize k=6 (inverted)", + 2, + reps, + () -> + inverted.selectBySize( + SPECK, SPECK, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "dilate 43x43 (text mask)", + 1, + reps, + () -> text15.dilated(PROXIMITY))); + rows.add(timePix("open 7x7 (page)", 1, reps, () -> page.opened(STROKE))); + rows.add(timePix("invert", 2, reps, page::inverted)); + rows.add(timePix("subtract", 5, reps, () -> page.subtract(text6))); + rows.add(timePix("and", 1, reps, () -> page.and(text6))); + rows.add(timePix("or", 3, reps, () -> page.or(text6))); + rows.add(timeVoid("countConnComp", 2, reps, page::connectedComponents)); + rows.add(timeVoid("countPixels", 2, reps, page::blackPixels)); + Path scratchTif = work.resolve("write.tif"); + rows.add(timeVoid("write TIFF-G4", 1, reps, () -> page.writeTiffG4(scratchTif))); + } + + LeptonicaPageCleaner cleaner = new LeptonicaPageCleaner(); + Path cleaned = work.resolve("cleaned.tif"); + Row clean = + timeVoid( + "clean() end-to-end", + 1, + reps, + () -> cleaner.clean(g4, cleaned, OutputFormat.TIFF, options())); + + String report = render(rows, clean, reps); + Path parent = outDoc.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } finally { + deleteTree(work); + } + } + + private static ProcessOptions options() { + // The pipeline's exact configuration: defaults (fillHoles + isolatedDust on) at a known + // dpi, so the derived sizes match the op rows above. + return ProcessOptions.defaults().withDpi(DPI); + } + + // ---- timing ---- + + @FunctionalInterface + private interface PixOp { + Pix run() throws IOException; + } + + @FunctionalInterface + private interface VoidOp { + void run() throws IOException; + } + + /** Times an op returning a Pix; the result's close() happens outside the timed window. */ + private static Row timePix(String op, int calls, int reps, PixOp body) throws IOException { + double[] samples = new double[reps]; + for (int i = -WARMUP_REPS; i < reps; i++) { + long start = System.nanoTime(); + Pix result = body.run(); + long elapsed = System.nanoTime() - start; + result.close(); + if (i >= 0) { + samples[i] = elapsed / 1e6; + } + } + return new Row(op, calls, median(samples), Arrays.stream(samples).min().orElse(0)); + } + + private static Row timeVoid(String op, int calls, int reps, VoidOp body) throws IOException { + double[] samples = new double[reps]; + for (int i = -WARMUP_REPS; i < reps; i++) { + long start = System.nanoTime(); + body.run(); + long elapsed = System.nanoTime() - start; + if (i >= 0) { + samples[i] = elapsed / 1e6; + } + } + return new Row(op, calls, median(samples), Arrays.stream(samples).min().orElse(0)); + } + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + // ---- the synthetic page ---- + + /** + * A deterministic 600-dpi A5 "scan": right-to-left glyph columns (despeckle's protected text), + * 1–3 px dust everywhere (the speck filter's work), isolated 8–12 px blots in the margins (the + * isolated-dust pass's work), and pin-holes punched into glyph blocks (the fill-holes pass's + * work). Fixed seed, so the baseline is reproducible. + */ + private static boolean[][] syntheticPage() { + boolean[][] img = TestImages.blank(WIDTH, HEIGHT); + Random random = new Random(42); + + int margin = WIDTH / 10; + int glyph = Math.max(4, WIDTH / 60); + int leading = glyph / 2; + int top = HEIGHT / 12; + int bottom = HEIGHT - HEIGHT / 12; + int glyphIndex = 0; + for (int x = WIDTH - margin - glyph; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + TestImages.fillRect(img, x, y, x + glyph - 3, y + glyph - 3); + // Every ~40th glyph carries a pin-hole, so fillHoles has real work. + if (glyphIndex % 40 == 0) { + int hx = x + glyph / 2; + int hy = y + glyph / 2; + img[hy][hx] = false; + img[hy][hx + 1] = false; + img[hy + 1][hx] = false; + img[hy + 1][hx + 1] = false; + } + glyphIndex++; + y += glyph; + } + } + + // Salt-and-pepper dust (1–3 px), what the main speck filter removes. + int specks = WIDTH * HEIGHT / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + int x = random.nextInt(WIDTH - size); + int y = random.nextInt(HEIGHT - size); + TestImages.fillRect(img, x, y, x + size - 1, y + size - 1); + } + + // Isolated 8–12 px blots out in the margins, what the isolated-dust pass removes. + for (int i = 0; i < 20; i++) { + int size = 8 + random.nextInt(5); + int x = + random.nextBoolean() + ? random.nextInt(margin - size) + : WIDTH - margin + random.nextInt(margin - size); + int y = top + random.nextInt(bottom - top - size); + TestImages.fillRect(img, x, y, x + size - 1, y + size - 1); + } + return img; + } + + // ---- rendering ---- + + private static String render(List rows, Row clean, int reps) { + var sb = new StringBuilder(); + sb.append("# despeckle cleaner op-level baseline\n\n") + .append("Generated by `CleanerBenchmark`") + .append(" (`./gradlew :despeckle:infrastructure:benchCleaner`, dev container).\n") + .append("Times each Leptonica primitive the page cleaner composes on a synthetic\n") + .append("600-dpi A5 page (") + .append(WIDTH) + .append("x") + .append(HEIGHT) + .append(" px, fixed seed). Re-run after any change to the\n") + .append("cleaner or the imaging bindings and compare before merging.\n\n"); + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs\n"); + sb.append("- Samples: median of ") + .append(reps) + .append(" reps after ") + .append(WARMUP_REPS) + .append(" warmups; single-threaded.\n\n"); + + sb.append("| op | median (ms) | min (ms) | calls/clean() | est. share of clean() |\n"); + sb.append("|---|---:|---:|---:|---:|\n"); + double estimatedTotal = 0; + for (Row row : rows) { + double estimated = row.medianMs() * row.callsPerClean(); + estimatedTotal += estimated; + sb.append( + String.format( + Locale.ROOT, + "| %s | %.2f | %.2f | %d | %.1f%% |%n", + row.op(), + row.medianMs(), + row.minMs(), + row.callsPerClean(), + estimated * 100.0 / clean.medianMs())); + } + sb.append( + String.format( + Locale.ROOT, + "| **Σ(median × calls)** | %.2f | | | %.1f%% |%n", + estimatedTotal, + estimatedTotal * 100.0 / clean.medianMs())); + sb.append( + String.format( + Locale.ROOT, + "| **clean() end-to-end** | %.2f | %.2f | 1 | 100%% |%n", + clean.medianMs(), + clean.minMs())); + sb.append( + "\nThe Σ row landing near 100% means the table accounts for clean()'s real" + + " cost;\n") + .append( + "a large gap points at untimed work (allocation churn, codec" + + " internals).\n"); + return sb.toString(); + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} From d3111761c892dba586f7b1cb929f25970407ce2a Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 15:37:15 +0900 Subject: [PATCH 5/9] perf(despeckle): skip the metrics-only component-counting passes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clean() counted 8-connected components before and after every page — two full connected-component labelings, 13.5% of the page's time per the committed baseline — yet the counts feed nothing but the HTML report's charts and the summary log line. The pdfbook pipeline never sets a report dir, so its hot path paid for numbers nobody read. ProcessResult's component counts become OptionalInt (a counted 4-int convenience constructor keeps every existing construction compiling unchanged; componentsRemoved() stays present-or-zero int, documented). ProcessOptions grows collectComponentStats (default true via the delegating 5-knob constructor) plus a withoutComponentStats() wither. DespeckleService turns counting off exactly when no reportDir is set — the report path is byte-identical, including its charts and totals. The black-pixel counts still run unconditionally: they feed the 3% over-removal guard. Summary logs switch to always-measured black-pixel terms when counting was skipped. Measured: clean() 161.8ms -> 139.5ms single-threaded (-13.8%, matching the baseline's countConnComp share); pipeline at -j8 despeckle stage 10.19s -> 9.57s (-6.1%, the saving partially absorbed by memory- bandwidth saturation), conv -4.4%. The benchmark gains a "clean() without component stats" row pinning the pipeline-path number. Co-Authored-By: Claude Fable 5 --- .../application/DespeckleService.java | 44 +++++++++++----- .../application/PdfBatchService.java | 23 +++++--- .../application/DespeckleServiceTest.java | 27 ++++++++++ .../p4suta/despeckle/application/Fakes.java | 2 + despeckle/docs/cleaner-baseline.md | 31 +++++------ .../domain/model/ProcessOptions.java | 27 +++++++++- .../despeckle/domain/model/ProcessResult.java | 52 +++++++++++++++++-- .../domain/model/ProcessOptionsTest.java | 17 ++++++ .../domain/model/ProcessResultTest.java | 17 ++++++ .../leptonica/LeptonicaPageCleaner.java | 16 ++++-- .../infrastructure/report/HtmlReporter.java | 6 ++- .../leptonica/LeptonicaPageCleanerTest.java | 34 ++++++++++++ .../tools/CleanerBenchmark.java | 23 +++++++- 13 files changed, 270 insertions(+), 49 deletions(-) diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java index 034f2e2..4bd632b 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java @@ -70,7 +70,8 @@ public record Config( * Aggregate outcome of a run. * * @param pages number of pages processed - * @param componentsRemoved total components removed across all pages + * @param componentsRemoved total components removed across all pages; {@code 0} when no report + * consumed component stats (counting is skipped for speed without a report) * @param overRemovalWarnings number of pages flagged for possible over-removal */ public record Summary(int pages, long componentsRemoved, int overRemovalWarnings) {} @@ -96,14 +97,20 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept } LOG.info("despeckling {} page(s) with {} thread(s)", files.size(), config.jobs()); + @Nullable Path reportDir = config.reportDir(); + boolean reporting = reportDir != null; Reporter report = - config.reportDir() == null - ? Reporter.noOp() - : reporterFactory.create(config.reportDir(), config.flipbook()); + reportDir != null + ? reporterFactory.create(reportDir, config.flipbook()) + : Reporter.noOp(); + // The report is the only consumer of per-page component counts, and counting is a full + // connected-component labeling twice per page — skip it when no report will be written. + ProcessOptions options = + reporting ? config.options() : config.options().withoutComponentStats(); List> tasks = new ArrayList<>(files.size()); for (Path src : files) { - tasks.add(() -> processOne(src, config, report)); + tasks.add(() -> processOne(src, config, options, report)); } // Platform workers: each page is CPU-bound Leptonica work (FFM downcalls pin virtual // threads' carriers). The fan-out fails fast and quiesces before throwing, so a failed @@ -124,9 +131,11 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept report.finish(); long totalRemoved = 0; + long blackRemoved = 0; int warnings = 0; for (PageOutcome outcome : outcomes) { totalRemoved += outcome.result().componentsRemoved(); + blackRemoved += outcome.result().blackPixelsRemoved(); if (outcome.result().isOverRemoval()) { warnings++; LOG.warn( @@ -135,17 +144,28 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept Math.round(outcome.result().removedBlackPixelRatio() * 100)); } } - LOG.info( - "done: {} page(s), {} component(s) removed, {} over-removal warning(s)", - files.size(), - totalRemoved, - warnings); + if (reporting) { + LOG.info( + "done: {} page(s), {} component(s) removed, {} over-removal warning(s)", + files.size(), + totalRemoved, + warnings); + } else { + // Without a report nothing counted components (the counting passes are skipped for + // speed), so the summary speaks in the always-measured black-pixel terms. + LOG.info( + "done: {} page(s), {} black pixel(s) removed, {} over-removal warning(s)", + files.size(), + blackRemoved, + warnings); + } return new Summary(files.size(), totalRemoved, warnings); } private record PageOutcome(Path source, ProcessResult result) {} - private PageOutcome processOne(Path src, Config config, Reporter report) throws IOException { + private PageOutcome processOne(Path src, Config config, ProcessOptions options, Reporter report) + throws IOException { Path dest = CorpusFiles.mirrorDestination( src, config.inputDir(), config.outputDir(), config.format().extension()); @@ -154,7 +174,7 @@ private PageOutcome processOne(Path src, Config config, Reporter report) throws if (parent != null) { Files.createDirectories(parent); } - ProcessResult result = pageCleaner.clean(src, dest, config.format(), config.options()); + ProcessResult result = pageCleaner.clean(src, dest, config.format(), options); Path stem = config.inputDir().relativize(src); report.addPage(stem, src, dest, result); return new PageOutcome(src, result); diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java index c42cf03..d1c5da2 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java @@ -148,14 +148,23 @@ public Summary run(Config config) throws IOException { if (config.reportParent() != null) { batchReporter.write(config.reportParent(), books); + LOG.info( + "done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed", + ok, + skipped, + failed, + totalPages, + totalComponentsRemoved); + } else { + // Without reports the runs skip component counting (an expensive labeling, twice per + // page), so a component total would always read 0 — leave it out of the line. + LOG.info( + "done: {} ok, {} skipped, {} failed, {} page(s)", + ok, + skipped, + failed, + totalPages); } - LOG.info( - "done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed", - ok, - skipped, - failed, - totalPages, - totalComponentsRemoved); return new Summary(ok, skipped, failed, totalPages, totalComponentsRemoved); } diff --git a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java index 4bb8187..ba543df 100644 --- a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java +++ b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java @@ -1,6 +1,7 @@ package io.github.p4suta.despeckle.application; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -141,4 +142,30 @@ void drivesTheReporterWhenAReportDirIsGiven(@TempDir Path tmp) throws IOExceptio assertEquals(2, factory.reporter.pages.get(), "every page is reported"); assertEquals(1, factory.reporter.finished.get(), "the report is finalized once"); } + + @Test + void skipsComponentCountingWithoutAReport(@TempDir Path tmp) throws IOException { + Path in = tmp.resolve("in"); + writeInputs(in, 1); + FakePageCleaner cleaner = new FakePageCleaner(LIGHT); + + new DespeckleService(cleaner, new RecordingReporterFactory()) + .run(config(in, tmp.resolve("out"), false, null)); + + ProcessOptions seen = java.util.Objects.requireNonNull(cleaner.lastOptions); + assertFalse(seen.collectComponentStats(), "no report -> counting passes are skipped"); + } + + @Test + void keepsComponentCountingForTheReportPath(@TempDir Path tmp) throws IOException { + Path in = tmp.resolve("in"); + writeInputs(in, 1); + FakePageCleaner cleaner = new FakePageCleaner(LIGHT); + + new DespeckleService(cleaner, new RecordingReporterFactory()) + .run(config(in, tmp.resolve("out"), false, tmp.resolve("report"))); + + ProcessOptions seen = java.util.Objects.requireNonNull(cleaner.lastOptions); + assertTrue(seen.collectComponentStats(), "the report consumes the counts"); + } } diff --git a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java index aaf9f49..a061822 100644 --- a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java +++ b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java @@ -33,6 +33,7 @@ private Fakes() {} /** A {@link PageCleaner} that writes a stub output file and returns a fixed result. */ static final class FakePageCleaner implements PageCleaner { final AtomicInteger calls = new AtomicInteger(); + volatile @Nullable ProcessOptions lastOptions; private final ProcessResult result; FakePageCleaner(ProcessResult result) { @@ -48,6 +49,7 @@ public ProcessResult clean( throw new java.io.UncheckedIOException(e); } calls.incrementAndGet(); + lastOptions = options; return result; } } diff --git a/despeckle/docs/cleaner-baseline.md b/despeckle/docs/cleaner-baseline.md index 96c422a..1304db0 100644 --- a/despeckle/docs/cleaner-baseline.md +++ b/despeckle/docs/cleaner-baseline.md @@ -5,27 +5,28 @@ Times each Leptonica primitive the page cleaner composes on a synthetic 600-dpi A5 page (3496x4961 px, fixed seed). Re-run after any change to the cleaner or the imaging bindings and compare before merging. -- Date (UTC): 2026-06-10 06:25:05 +- Date (UTC): 2026-06-10 06:36:38 - Host: Linux amd64, 8 CPUs - Samples: median of 10 reps after 2 warmups; single-threaded. | op | median (ms) | min (ms) | calls/clean() | est. share of clean() | |---|---:|---:|---:|---:| -| read TIFF-G4 | 2.59 | 2.52 | 1 | 1.5% | -| selectBySize k=6 (page) | 15.22 | 14.98 | 1 | 8.7% | -| selectBySize 15 (page) | 15.19 | 14.80 | 1 | 8.7% | -| selectBySize k=6 (inverted) | 22.16 | 21.89 | 2 | 25.3% | -| dilate 43x43 (text mask) | 37.52 | 37.01 | 1 | 21.5% | -| open 7x7 (page) | 12.18 | 12.04 | 1 | 7.0% | -| invert | 0.26 | 0.25 | 2 | 0.3% | -| subtract | 0.38 | 0.36 | 5 | 1.1% | -| and | 0.40 | 0.38 | 1 | 0.2% | -| or | 0.33 | 0.32 | 3 | 0.6% | -| countConnComp | 11.82 | 11.46 | 2 | 13.5% | +| read TIFF-G4 | 2.53 | 2.52 | 1 | 1.6% | +| selectBySize k=6 (page) | 16.04 | 14.90 | 1 | 9.9% | +| selectBySize 15 (page) | 15.02 | 14.84 | 1 | 9.3% | +| selectBySize k=6 (inverted) | 22.14 | 21.84 | 2 | 27.4% | +| dilate 43x43 (text mask) | 38.37 | 36.97 | 1 | 23.7% | +| open 7x7 (page) | 12.15 | 12.01 | 1 | 7.5% | +| invert | 0.25 | 0.25 | 2 | 0.3% | +| subtract | 0.40 | 0.38 | 5 | 1.2% | +| and | 0.41 | 0.40 | 1 | 0.3% | +| or | 0.39 | 0.35 | 3 | 0.7% | +| countConnComp | 11.53 | 11.46 | 2 | 14.3% | | countPixels | 0.41 | 0.41 | 2 | 0.5% | -| write TIFF-G4 | 6.44 | 6.29 | 1 | 3.7% | -| **Σ(median × calls)** | 161.74 | | | 92.5% | -| **clean() end-to-end** | 174.91 | 173.28 | 1 | 100% | +| write TIFF-G4 | 6.34 | 6.26 | 1 | 3.9% | +| **Σ(median × calls)** | 162.69 | | | 100.6% | +| **clean() end-to-end** | 161.78 | 159.91 | 1 | 100% | +| **clean() without component stats** | 139.46 | 137.07 | 1 | 86.2% | The Σ row landing near 100% means the table accounts for clean()'s real cost; a large gap points at untimed work (allocation churn, codec internals). diff --git a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java index a565b1f..2051160 100644 --- a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java +++ b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java @@ -27,13 +27,16 @@ * @param removeIsolatedDust whether to run the isolated-medium-dust pass * @param isolatedDustSizePx an explicit max size for an isolated speck, or empty to derive it; a * present value also implies {@code removeIsolatedDust} + * @param collectComponentStats whether to count 8-connected components before and after — two of + * the most expensive full-page scans, so off when nothing consumes the counts (no report) */ public record ProcessOptions( OptionalInt dpi, OptionalInt speckSizePx, boolean fillHoles, boolean removeIsolatedDust, - OptionalInt isolatedDustSizePx) { + OptionalInt isolatedDustSizePx, + boolean collectComponentStats) { /** Resolution assumed for the speck filter when neither a flag nor the image supplies one. */ public static final int DEFAULT_DPI = 300; @@ -50,6 +53,16 @@ public record ProcessOptions( } } + /** The five-knob shape every direct caller uses; component counting defaults to on. */ + public ProcessOptions( + OptionalInt dpi, + OptionalInt speckSizePx, + boolean fillHoles, + boolean removeIsolatedDust, + OptionalInt isolatedDustSizePx) { + this(dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, true); + } + /** Options with the isolated-dust pass off — the common case. */ public static ProcessOptions of(OptionalInt dpi, OptionalInt speckSizePx, boolean fillHoles) { return new ProcessOptions(dpi, speckSizePx, fillHoles, false, OptionalInt.empty()); @@ -66,7 +79,17 @@ public ProcessOptions withDpi(int dpi) { speckSizePx, fillHoles, removeIsolatedDust, - isolatedDustSizePx); + isolatedDustSizePx, + collectComponentStats); + } + + /** + * A copy with component counting disabled — for runs where nothing consumes the counts (no + * report), saving two full connected-component labelings per page. + */ + public ProcessOptions withoutComponentStats() { + return new ProcessOptions( + dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, false); } /** diff --git a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java index f6efbf9..8e6fd25 100644 --- a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java +++ b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java @@ -1,15 +1,26 @@ package io.github.p4suta.despeckle.domain.model; +import java.util.OptionalInt; + /** * The outcome of despeckling one page. * - * @param componentsBefore 8-connected component count of the input - * @param componentsAfter 8-connected component count of the output + *

The black-pixel counts are always measured — they feed the over-removal guard. The component + * counts are measured only when something consumes them (the HTML report): counting 8-connected + * components is a full connected-component labeling of the page, one of the most expensive scans in + * the whole clean, so a run with no report skips both counting passes and carries empty components + * here. + * + * @param componentsBefore 8-connected component count of the input, when counted + * @param componentsAfter 8-connected component count of the output, when counted * @param blackPixelsBefore foreground pixel count of the input * @param blackPixelsAfter foreground pixel count of the output */ public record ProcessResult( - int componentsBefore, int componentsAfter, long blackPixelsBefore, long blackPixelsAfter) { + OptionalInt componentsBefore, + OptionalInt componentsAfter, + long blackPixelsBefore, + long blackPixelsAfter) { /** * A black-pixel removal ratio above this flags a possibly over-cleaned page. The single domain @@ -17,12 +28,38 @@ public record ProcessResult( */ public static final double OVER_REMOVAL_WARN_RATIO = 0.03; + /** A counted result — the shape the report path consumes. */ + public ProcessResult( + int componentsBefore, + int componentsAfter, + long blackPixelsBefore, + long blackPixelsAfter) { + this( + OptionalInt.of(componentsBefore), + OptionalInt.of(componentsAfter), + blackPixelsBefore, + blackPixelsAfter); + } + + /** The result of a run that skipped component counting (nothing consumes the counts). */ + public static ProcessResult withoutComponentStats( + long blackPixelsBefore, long blackPixelsAfter) { + return new ProcessResult( + OptionalInt.empty(), OptionalInt.empty(), blackPixelsBefore, blackPixelsAfter); + } + + /** Whether component counting ran (true on the report path). */ + public boolean hasComponentStats() { + return componentsBefore.isPresent(); + } + /** * Net drop in 8-connected components — dust removed minus any holes filled back in. Summed into - * the run total and plotted per page in the report. + * the run total and plotted per page in the report. {@code 0} when counting was skipped (see + * {@link #hasComponentStats()}). */ public int componentsRemoved() { - return componentsBefore - componentsAfter; + return componentsBefore.orElse(0) - componentsAfter.orElse(0); } /** @@ -36,6 +73,11 @@ public double removedBlackPixelRatio() { return (double) (blackPixelsBefore - blackPixelsAfter) / blackPixelsBefore; } + /** Black pixels removed from the page — the cost-free removal measure every run carries. */ + public long blackPixelsRemoved() { + return blackPixelsBefore - blackPixelsAfter; + } + /** * Whether {@link #removedBlackPixelRatio()} exceeds {@link #OVER_REMOVAL_WARN_RATIO}, the * threshold the runner logs on and the report flags. diff --git a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java index 700035a..6d70e42 100644 --- a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java +++ b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java @@ -163,4 +163,21 @@ void isolatedDustSizeStaysAboveTheSpeckSizeAtVeryLowResolution() { assertEquals( 2, options.isolatedDustSize(noImg()), "isolated-dust size floors at speckSize + 1"); } + + @Test + void componentStatsDefaultOnAndWitherTurnsThemOff() { + ProcessOptions defaults = ProcessOptions.defaults(); + assertTrue(defaults.collectComponentStats(), "counting is on unless turned off"); + + ProcessOptions off = defaults.withoutComponentStats(); + assertFalse(off.collectComponentStats()); + // Every other knob survives the wither. + assertEquals(defaults.dpi(), off.dpi()); + assertEquals(defaults.speckSizePx(), off.speckSizePx()); + assertEquals(defaults.fillHoles(), off.fillHoles()); + assertEquals(defaults.removeIsolatedDust(), off.removeIsolatedDust()); + assertEquals(defaults.isolatedDustSizePx(), off.isolatedDustSizePx()); + // ...including through withDpi. + assertFalse(off.withDpi(600).collectComponentStats()); + } } diff --git a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java index fac55e0..7e2508b 100644 --- a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java +++ b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java @@ -60,4 +60,21 @@ void isOverRemovalIsFalseBelowTheThreshold() { void isOverRemovalIsFalseForABlankInputPage() { assertFalse(new ProcessResult(0, 0, 0, 0).isOverRemoval()); } + + @Test + void withoutComponentStatsCarriesNoCountsButFullPixelMath() { + ProcessResult result = ProcessResult.withoutComponentStats(1000, 950); + assertFalse(result.hasComponentStats()); + assertEquals(0, result.componentsRemoved(), "absent counts read as zero, documented"); + assertEquals(50L, result.blackPixelsRemoved()); + assertTrue(result.isOverRemoval(), "the over-removal guard works without counts"); + } + + @Test + void countedConstructorHasComponentStats() { + ProcessResult result = new ProcessResult(120, 95, 1000, 990); + assertTrue(result.hasComponentStats()); + assertEquals(25, result.componentsRemoved()); + assertEquals(10L, result.blackPixelsRemoved()); + } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java index 7f8434c..6fd7d77 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java @@ -52,7 +52,11 @@ public ProcessResult clean( int raw = source.resolution(); Optional img = raw > 0 ? Optional.of(Resolution.of(raw)) : Optional.empty(); int k = options.speckSize(img); - int componentsBefore = source.connectedComponents(); + // Component counting is a full connected-component labeling — one of the most + // expensive scans here — so it runs only when something consumes the counts (the + // report). The black-pixel counts always run: they feed the over-removal guard. + boolean countComponents = options.collectComponentStats(); + int componentsBefore = countComponents ? source.connectedComponents() : 0; long blackBefore = source.blackPixels(); int sourceFormat = source.inputFormat(); @@ -78,14 +82,18 @@ public ProcessResult clean( current = filled; } - int componentsAfter = current.connectedComponents(); long blackAfter = current.blackPixels(); // Stamp the honored resolution so a TIFF/PNG output carries an accurate tag. Only a // known resolution is written; an unknown one is left untouched. options.resolution(img).map(Resolution::dpi).ifPresent(current::setResolution); writeIn(current, output, format, sourceFormat); - return new ProcessResult( - componentsBefore, componentsAfter, blackBefore, blackAfter); + return countComponents + ? new ProcessResult( + componentsBefore, + current.connectedComponents(), + blackBefore, + blackAfter) + : ProcessResult.withoutComponentStats(blackBefore, blackAfter); } finally { current.close(); } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java index 2024be8..32d243f 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java @@ -88,11 +88,13 @@ public void addPage(Path relativeStem, Path inputImage, Path outputImage, Proces } writeOverlayAndAccumulate(beforeImg, afterImg, overlayWebp); + // The report path always runs with component counting on (DespeckleService enables it + // whenever a reportDir is set), so the counts are present here; orElse(0) is type-driven. stats.add( new PageStat( stem, - result.componentsBefore(), - result.componentsAfter(), + result.componentsBefore().orElse(0), + result.componentsAfter().orElse(0), result.removedBlackPixelRatio())); } diff --git a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java index 4bd4523..15d3cef 100644 --- a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java +++ b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java @@ -1,6 +1,7 @@ package io.github.p4suta.despeckle.infrastructure.leptonica; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import io.github.p4suta.despeckle.domain.model.OutputFormat; @@ -231,4 +232,37 @@ void fillHolesSparesTheGapInsideAThinWalledGlyph(@TempDir Path dir) throws Excep ring, cleaned.blackPixels(), "the thin-walled gap is preserved, not filled"); } } + + @Test + void skippedComponentCountingStillCleansAndGuardsOverRemoval(@TempDir Path dir) + throws Exception { + // The same speck-removal scenario as removesSpecksButPreservesGlyph, with counting off: + // the output is identical, the result carries no component stats, and the always-measured + // black-pixel math still drives the over-removal guard. + Path src = dir.resolve("page.pbm"); + Path out = dir.resolve("page-out.pbm"); + boolean[][] img = TestImages.blank(40, 40); + TestImages.fillRect(img, 8, 8, 19, 25); + TestImages.dot(img, 2, 2); + TestImages.dot(img, 35, 30); + TestImages.dot(img, 30, 4); + TestImages.writePbm(src, img); + + ProcessResult result = + cleaner.clean( + src, + out, + OutputFormat.PBM, + ProcessOptions.of(OptionalInt.of(300), OptionalInt.of(3), false) + .withoutComponentStats()); + + assertFalse(result.hasComponentStats()); + assertEquals(0, result.componentsRemoved(), "absent counts read as zero"); + assertEquals(3L, result.blackPixelsRemoved(), "three 1px specks gone"); + assertFalse(result.isOverRemoval(), "3 of 219 black pixels is under the 3% threshold"); + try (Pix cleaned = Pix.read(out)) { + assertEquals(1, cleaned.connectedComponents()); + assertEquals(12L * 18L, cleaned.blackPixels(), "the whole glyph survives intact"); + } + } } diff --git a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java index d1ff679..6811533 100644 --- a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java +++ b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java @@ -122,8 +122,20 @@ public static void main(String[] args) throws IOException { 1, reps, () -> cleaner.clean(g4, cleaned, OutputFormat.TIFF, options())); + // The pipeline's actual configuration (no report -> component counting skipped). + Row cleanNoStats = + timeVoid( + "clean() without component stats", + 1, + reps, + () -> + cleaner.clean( + g4, + cleaned, + OutputFormat.TIFF, + options().withoutComponentStats())); - String report = render(rows, clean, reps); + String report = render(rows, clean, cleanNoStats, reps); Path parent = outDoc.toAbsolutePath().getParent(); if (parent != null) { Files.createDirectories(parent); @@ -254,7 +266,7 @@ private static boolean[][] syntheticPage() { // ---- rendering ---- - private static String render(List rows, Row clean, int reps) { + private static String render(List rows, Row clean, Row cleanNoStats, int reps) { var sb = new StringBuilder(); sb.append("# despeckle cleaner op-level baseline\n\n") .append("Generated by `CleanerBenchmark`") @@ -312,6 +324,13 @@ private static String render(List rows, Row clean, int reps) { "| **clean() end-to-end** | %.2f | %.2f | 1 | 100%% |%n", clean.medianMs(), clean.minMs())); + sb.append( + String.format( + Locale.ROOT, + "| **clean() without component stats** | %.2f | %.2f | 1 | %.1f%% |%n", + cleanNoStats.medianMs(), + cleanNoStats.minMs(), + cleanNoStats.medianMs() * 100.0 / clean.medianMs())); sb.append( "\nThe Σ row landing near 100% means the table accounts for clean()'s real" + " cost;\n") From 97b3db2045fda868854f03331985d1e738324f13 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:06:57 +0900 Subject: [PATCH 6/9] chore(lint): teach typos the PDF spec's DecodeParms, reword "mis-sizes" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI spell check flags PDFBox's COSName.DECODE_PARMS (the PDF spec's own key name, which the remux test must name verbatim) and a hyphenated coinage in the extractor's javadoc. Allowlist the spec identifier — the same precedent as the veraPDF en-GB names — and use plain words. Co-Authored-By: Claude Fable 5 --- _typos.toml | 4 ++++ .../io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index df533d6..b9800fd 100644 --- a/_typos.toml +++ b/_typos.toml @@ -45,3 +45,7 @@ Pagent = "Pagent" Flavour = "Flavour" flavours = "flavours" initialise = "initialise" +# The PDF spec's own key name `DecodeParms` (and PDFBox's COSName.DECODE_PARMS +# constant mirroring it) — third-party/spec identifiers we must name verbatim. +Parms = "Parms" +PARMS = "PARMS" diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 2c4bfac..77ed395 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -237,7 +237,7 @@ private static boolean wrapChunk(String prefix, List /** * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the - * params file, fails or mis-sizes here). + * params file, fails to decode or comes back the wrong size here). */ private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { try (Pix pix = Pix.read(tif)) { From 8601937e5b5f8290bc2006b1223fd28da6da967f Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:09:35 +0900 Subject: [PATCH 7/9] test(process): make the contiguous-progress test deterministic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test released its gate from inside the ok tasks, leaving a window where the failing task's completion could overtake an ok task's still-being-enqueued completion (countDown happens inside call(), the queue add after it returns) — the recorded sequence then read [1] instead of [1, 2]. The gate now opens from the progress callback on the orchestrating thread, so both successes are provably consumed before the failure is thrown. Also renames `failer` to `failing` for the spell check. Co-Authored-By: Claude Fable 5 --- .../p4suta/shared/process/TasksTest.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java index f50253b..0b70e1b 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java @@ -151,7 +151,7 @@ void firstFailureInterruptsSiblingsAndSkipsQueuedWork() throws Exception { } return null; }; - Callable failer = + Callable failing = () -> { blockerStarted.await(); // fail only once the sibling is provably running throw boom; @@ -168,7 +168,7 @@ void firstFailureInterruptsSiblingsAndSkipsQueuedWork() throws Exception { () -> Tasks.awaitAll( Tasks.Workers.platform(2), - List.of(blocker, failer, queued), + List.of(blocker, failing, queued), "batch")); assertSame(boom, thrown); assertTrue(blockerInterrupted.get(), "the running sibling must be interrupted"); @@ -193,7 +193,7 @@ void quiescesBeforeTheFailurePropagates() { inFlight.decrementAndGet(); } }; - Callable failer = + Callable failing = () -> { blockerStarted.await(); throw new IllegalStateException("boom"); @@ -201,7 +201,7 @@ void quiescesBeforeTheFailurePropagates() { assertThrows( IllegalStateException.class, - () -> Tasks.awaitAll(Tasks.Workers.platform(2), List.of(blocker, failer), "batch")); + () -> Tasks.awaitAll(Tasks.Workers.platform(2), List.of(blocker, failing), "batch")); // The contract callers' finally-blocks rely on: when awaitAll throws, no worker is still // running (still writing into directories about to be deleted). assertEquals(0, inFlight.get()); @@ -209,16 +209,15 @@ void quiescesBeforeTheFailurePropagates() { @Test void progressCountsOnlySuccessesContiguously() { - CountDownLatch twoDone = new CountDownLatch(2); + // Released from the PROGRESS callback (the orchestrating thread), not from the ok tasks: + // a task-side latch would leave a window where the failing task's completion overtakes an + // ok task's still-being-enqueued completion, making the recorded sequence racy. + CountDownLatch twoConsumed = new CountDownLatch(2); List dones = new ArrayList<>(); // appended on the calling thread only - Callable ok = + Callable ok = () -> null; + Callable failing = () -> { - twoDone.countDown(); - return null; - }; - Callable failer = - () -> { - twoDone.await(); // both successes complete first + twoConsumed.await(); // both successes are consumed first, provably throw new IllegalStateException("boom"); }; assertThrows( @@ -226,9 +225,12 @@ void progressCountsOnlySuccessesContiguously() { () -> Tasks.awaitAll( Tasks.Workers.platform(3), - List.of(ok, ok, failer), + List.of(ok, ok, failing), "batch", - (done, total) -> dones.add(done))); + (done, total) -> { + dones.add(done); + twoConsumed.countDown(); + })); assertEquals(List.of(1, 2), dones); } From efac0ad50a38a8234fc4a4e7cfe0220334618140 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:13:57 +0900 Subject: [PATCH 8/9] style: rewrap the line the failing-task rename pushed past 100 columns Co-Authored-By: Claude Fable 5 --- .../test/java/io/github/p4suta/shared/process/TasksTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java index 0b70e1b..3356f83 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java @@ -201,7 +201,9 @@ void quiescesBeforeTheFailurePropagates() { assertThrows( IllegalStateException.class, - () -> Tasks.awaitAll(Tasks.Workers.platform(2), List.of(blocker, failing), "batch")); + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), List.of(blocker, failing), "batch")); // The contract callers' finally-blocks rely on: when awaitAll throws, no worker is still // running (still writing into directories about to be deleted). assertEquals(0, inFlight.get()); From f03170b069cee58404b685eb9c0607b14caf8cea Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:16:39 +0900 Subject: [PATCH 9/9] docs: align the generated baseline docs with main Squash merges orphan the stack's ancestry, so the benchmark documents (regenerated on every bench run) collide as add/add between this branch and main. Align them to main's version; the round's closing PR commits the final regenerated baselines, so no information is lost from the final state. The measured numbers this PR contributed remain in its commit message and PR description. Co-Authored-By: Claude Fable 5 --- despeckle/docs/cleaner-baseline.md | 31 +++++++++++++++--------------- pipeline/docs/perf-baseline.md | 10 +++++----- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/despeckle/docs/cleaner-baseline.md b/despeckle/docs/cleaner-baseline.md index 1304db0..96c422a 100644 --- a/despeckle/docs/cleaner-baseline.md +++ b/despeckle/docs/cleaner-baseline.md @@ -5,28 +5,27 @@ Times each Leptonica primitive the page cleaner composes on a synthetic 600-dpi A5 page (3496x4961 px, fixed seed). Re-run after any change to the cleaner or the imaging bindings and compare before merging. -- Date (UTC): 2026-06-10 06:36:38 +- Date (UTC): 2026-06-10 06:25:05 - Host: Linux amd64, 8 CPUs - Samples: median of 10 reps after 2 warmups; single-threaded. | op | median (ms) | min (ms) | calls/clean() | est. share of clean() | |---|---:|---:|---:|---:| -| read TIFF-G4 | 2.53 | 2.52 | 1 | 1.6% | -| selectBySize k=6 (page) | 16.04 | 14.90 | 1 | 9.9% | -| selectBySize 15 (page) | 15.02 | 14.84 | 1 | 9.3% | -| selectBySize k=6 (inverted) | 22.14 | 21.84 | 2 | 27.4% | -| dilate 43x43 (text mask) | 38.37 | 36.97 | 1 | 23.7% | -| open 7x7 (page) | 12.15 | 12.01 | 1 | 7.5% | -| invert | 0.25 | 0.25 | 2 | 0.3% | -| subtract | 0.40 | 0.38 | 5 | 1.2% | -| and | 0.41 | 0.40 | 1 | 0.3% | -| or | 0.39 | 0.35 | 3 | 0.7% | -| countConnComp | 11.53 | 11.46 | 2 | 14.3% | +| read TIFF-G4 | 2.59 | 2.52 | 1 | 1.5% | +| selectBySize k=6 (page) | 15.22 | 14.98 | 1 | 8.7% | +| selectBySize 15 (page) | 15.19 | 14.80 | 1 | 8.7% | +| selectBySize k=6 (inverted) | 22.16 | 21.89 | 2 | 25.3% | +| dilate 43x43 (text mask) | 37.52 | 37.01 | 1 | 21.5% | +| open 7x7 (page) | 12.18 | 12.04 | 1 | 7.0% | +| invert | 0.26 | 0.25 | 2 | 0.3% | +| subtract | 0.38 | 0.36 | 5 | 1.1% | +| and | 0.40 | 0.38 | 1 | 0.2% | +| or | 0.33 | 0.32 | 3 | 0.6% | +| countConnComp | 11.82 | 11.46 | 2 | 13.5% | | countPixels | 0.41 | 0.41 | 2 | 0.5% | -| write TIFF-G4 | 6.34 | 6.26 | 1 | 3.9% | -| **Σ(median × calls)** | 162.69 | | | 100.6% | -| **clean() end-to-end** | 161.78 | 159.91 | 1 | 100% | -| **clean() without component stats** | 139.46 | 137.07 | 1 | 86.2% | +| write TIFF-G4 | 6.44 | 6.29 | 1 | 3.7% | +| **Σ(median × calls)** | 161.74 | | | 92.5% | +| **clean() end-to-end** | 174.91 | 173.28 | 1 | 100% | The Σ row landing near 100% means the table accounts for clean()'s real cost; a large gap points at untimed work (allocation churn, codec internals). diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md index 06ea504..9727575 100644 --- a/pipeline/docs/perf-baseline.md +++ b/pipeline/docs/perf-baseline.md @@ -6,7 +6,7 @@ of the installDist `pdfbook` launcher. Re-run after any change to the pipeline and compare against the previous run before merging (acceptance: ≥5% median total-wall improvement, or an explicit RSS/disk win, with output validated). -- Date (UTC): 2026-06-10 05:11:24 +- Date (UTC): 2026-06-10 04:46:07 - Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi - Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` - Samples per measurement: cold (1st run) + warm median of 3. @@ -21,8 +21,8 @@ total-wall improvement, or an explicit RSS/disk win, with output validated). | Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | |---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 200 | 46.18s | 45.98s | 0.88s | 31.95s | 12.94s | 0.20s | 0.20s | 47.85s | 173 | 6.4 | -| sample-scan-200p.pdf | 8 | 200 | 14.43s | 14.23s | 0.46s | 10.19s | 3.28s | 0.20s | 0.20s | 14.16s | 322 | 6.4 | +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | ## Stage shares (of conv, warm median) @@ -31,5 +31,5 @@ cannot pay for a parallelization rewrite no matter how elegant. | Input | Jobs | extract | despeckle | register | spread | |---|---|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 1.9% | 69.5% | 28.1% | 0.4% | -| sample-scan-200p.pdf | 8 | 3.2% | 71.6% | 23.0% | 1.4% | +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% |