From 3fca5ad6924829f56f15a491ed8e9058282e868c Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:47:02 +0900 Subject: [PATCH 1/4] feat(pipeline): add stage-level timings and a benchmark harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nothing in the pipeline measured where a run's time went: ProgressEvents carry no timestamps and the stage logs no durations, so optimization work had no baseline to argue against. This adds the measurement layer: - --timings: a StageTimingSink (composed in the CLI shell) prints a stable, machine-parseable per-stage breakdown to stderr when a run ends ("timing: <stage> = <seconds>s (<percent>%)"), including the still-open stage on failure. - PipelineRunner logs each stage directory's byte total, making the intermediate I/O of every stage visible. - benchPipeline: a Gradle task driving the installDist launcher with --timings (PipelineBenchmark, test sources), measuring E2E wall, the per-stage medians, peak RSS via /proc VmHWM, and output size, over a -Pjobs sweep; writes pipeline/docs/perf-baseline.md. - createSampleScan: a deterministic synthetic 600-dpi A5 scan book (specks for despeckle, ±0.5° skew for deskew) so the benchmark needs no copyrighted input and stays comparable across machines. Baseline on the 200-page fixture (8 CPUs): conv 14.48s at -j8 — despeckle 68%, register 22.6%, extract 7.9%, spread 1.5% — and a 3.44x scale-up from -j1, recorded in pipeline/docs/perf-baseline.md. 
Co-Authored-By: Claude Fable 5 --- pipeline/README.md | 1 + pipeline/app/build.gradle.kts | 47 ++ .../p4suta/pipeline/cli/PipelineCommand.java | 32 +- .../p4suta/pipeline/cli/StageTimingSink.java | 93 ++++ .../pipeline/cli/StageTimingSinkTest.java | 75 +++ .../pipeline/tools/PipelineBenchmark.java | 498 ++++++++++++++++++ .../pipeline/tools/SampleScanGenerator.java | 135 +++++ .../pipeline/application/PipelineRunner.java | 35 +- pipeline/docs/perf-baseline.md | 35 ++ 9 files changed, 947 insertions(+), 4 deletions(-) create mode 100644 pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java create mode 100644 pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java create mode 100644 pipeline/docs/perf-baseline.md diff --git a/pipeline/README.md b/pipeline/README.md index b8d47ef..12d15ef 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -33,6 +33,7 @@ never stops the rest; existing outputs are skipped unless `--force`). 
| `--pdf-a` | off | emit PDF/A-2b conformance | | `--force` | off | overwrite an existing output (batch: regenerate, don't skip) | | `--progress-file ` | — | write machine-readable JSONL progress events (single input only) | +| `--timings` | off | print a per-stage wall-clock breakdown to stderr when each run ends | | `-i, --interactive` | off | guided mode: prompt for the input, options and output | | `-h, --help` | — | show help and exit | | `-V, --version` | — | print version and exit | diff --git a/pipeline/app/build.gradle.kts b/pipeline/app/build.gradle.kts index 9764cfe..5c2572d 100644 --- a/pipeline/app/build.gradle.kts +++ b/pipeline/app/build.gradle.kts @@ -34,6 +34,10 @@ dependencies { implementation(libs.commons.cli) implementation(libs.slf4j.api) runtimeOnly(libs.slf4j.simple) + + // The benchmark fixture generator (test sources, never shipped — mirroring register's + // createSamplePdf) draws synthetic scan pages with PDFBox directly. + testImplementation(libs.pdfbox) } // The one place native access is granted to the launched app; run, test and JavaExec inherit it. @@ -85,3 +89,46 @@ selfContainedApp { // jbig2 (its register stage writes TIFF-G4; the spread pack embeds CCITT G4). bundleQpdf(this, libs.versions.qpdf.get()) } + +// ---- Stage-level benchmark (see pipeline/docs/perf-baseline.md) --------------------------------- + +// Deterministic synthetic scan book for the benchmark: an existing output is reused, so the +// generation cost (a minute at 200 pages × 600 dpi) is paid once. Knob: -Ppages=N (default 200). 
+tasks.register("createSampleScan") { + group = "verification" + description = "Generate the synthetic bitonal scan book the benchmark converts (cached)" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.SampleScanGenerator" + val pages = providers.gradleProperty("pages").getOrElse("200") + args = listOf("build/test-data/sample-scan-${pages}p.pdf", pages, "600") +} + +// Stage-level runtime + memory benchmark (the pdfbook counterpart of tate's benchRuntime): runs the +// installDist launcher in-container with --timings, parses the per-stage breakdown, samples peak +// RSS from /proc, and writes pipeline/docs/perf-baseline.md. Knobs: -Pruns=N (warm runs, default +// 3), -Pjobs=1,8 (comma-separated -j sweep; default auto = the launcher's CPU-count default), +// -Ppages=N (fixture size, default 200), -Pinputs="a.pdf b.pdf" (real books instead of the +// fixture; resolved against the repo root). +tasks.register("benchPipeline") { + group = "verification" + description = "Benchmark pdfbook stage timings + peak memory; writes pipeline/docs/perf-baseline.md" + dependsOn(tasks.named("installDist"), tasks.named("createSampleScan")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.PipelineBenchmark" + workingDir = rootDir + val runs = providers.gradleProperty("runs").getOrElse("3") + val jobs = providers.gradleProperty("jobs").getOrElse("auto") + val pages = providers.gradleProperty("pages").getOrElse("200") + val extraInputs = + providers + .gradleProperty("inputs") + .orNull + ?.split(Regex("\\s+")) + ?.filter { it.isNotBlank() } + ?: emptyList() + val launcher = "pipeline/app/build/install/pdfbook/bin/pdfbook" + val inputs = + extraInputs.ifEmpty { listOf("pipeline/app/build/test-data/sample-scan-${pages}p.pdf") } + args = listOf(launcher, "qpdf", "pipeline/docs/perf-baseline.md", runs, jobs) + inputs +} diff --git 
a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index 99e453c..f29449d 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -153,6 +153,13 @@ private static Options buildOptions() { "Write machine-readable JSONL progress events to this file (single" + " input only); used by front ends to report progress.") .get()); + options.addOption( + Option.builder() + .longOpt("timings") + .desc( + "Print a per-stage wall-clock breakdown to stderr when each run" + + " ends.") + .get()); CliDocs.options(options); return options; } @@ -351,6 +358,7 @@ record Plan(Path input, Path output, Config config) {} deskew, scale, pdfA, + false, force); return new Plan(input, output, config); } @@ -380,14 +388,32 @@ private static String defaultOutput(Path input) { private static void runOne(Path input, Path output, Config config, @Nullable Path progressFile) throws IOException { if (progressFile == null) { - runWith(input, output, config, ProgressSink.NO_OP); + runWith(input, output, config, withTimings(config, ProgressSink.NO_OP)); } else { try (JsonlFileProgressSink progress = new JsonlFileProgressSink(progressFile)) { - runWith(input, output, config, progress); + runWith(input, output, config, withTimings(config, progress)); } } } + /** + * Wraps {@code sink} with a fresh {@link StageTimingSink} when {@code --timings} is set, so + * each run (every book of a batch separately) prints its own per-stage breakdown to stderr. 
+ */ + private static ProgressSink withTimings(Config config, ProgressSink sink) { + if (!config.timings()) { + return sink; + } + StageTimingSink timings = new StageTimingSink(System.err); + if (sink == ProgressSink.NO_OP) { + return timings; + } + return event -> { + sink.emit(event); + timings.emit(event); + }; + } + // Resolves the progress sink first so the stages and sink report page-level PageProcessed // events into the same sink PipelineRunner reports stage boundaries into. With no // --progress-file the sink is NO_OP and every emit is a no-op. @@ -447,6 +473,7 @@ private static Config parseConfig(CommandLine cmd) throws ParseException { !cmd.hasOption("no-deskew"), !cmd.hasOption("no-scale"), cmd.hasOption("pdf-a"), + cmd.hasOption("timings"), cmd.hasOption("force")); } @@ -475,5 +502,6 @@ record Config( boolean deskew, boolean scale, boolean pdfA, + boolean timings, boolean force) {} } diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java new file mode 100644 index 0000000..496b0e4 --- /dev/null +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java @@ -0,0 +1,93 @@ +package io.github.p4suta.pipeline.cli; + +import io.github.p4suta.shared.progress.ProgressEvent; +import io.github.p4suta.shared.progress.ProgressSink; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.jspecify.annotations.Nullable; + +/** + * Measures each stage's wall clock from its {@link ProgressEvent.StageStarted}/{@link + * ProgressEvent.StageCompleted} boundaries and prints a per-stage breakdown when the run ends — the + * {@code --timings} flag's implementation. One line per stage in completion order, then the + * run-wide total: + * + *
<pre>{@code
+ * timing: extract = 4.21s (18.3%)
+ * timing: despeckle = 9.87s (42.9%)
+ * timing: total = 23.01s
+ * }</pre>
+ * + *
<p>
The {@code timing: = s} shape is a stable contract the {@code benchPipeline} + * harness parses; keep it machine-readable. A stage still open when the run fails is reported with + * its elapsed-so-far, so a failed run still shows where the time went. Thread-safe like every + * {@link ProgressSink}: events are handled under one lock. + */ +final class StageTimingSink implements ProgressSink { + + private final PrintStream out; + private final Object lock = new Object(); + private final List stages = new ArrayList<>(); + private final List stageNanos = new ArrayList<>(); + private @Nullable String openStage; + private long openedAtNanos; + private long runStartedAtNanos; + private boolean runStarted; + + StageTimingSink(PrintStream out) { + this.out = out; + } + + @Override + public void emit(ProgressEvent event) { + synchronized (lock) { + switch (event) { + case ProgressEvent.RunStarted ignored -> markRunStarted(); + case ProgressEvent.StageStarted s -> { + // Defensive: a sink wired mid-run still measures from the first boundary. + markRunStarted(); + openStage = s.stage(); + openedAtNanos = System.nanoTime(); + } + case ProgressEvent.StageCompleted ignored -> closeOpenStage(); + case ProgressEvent.PageProcessed ignored -> { + // Stage boundaries carry all the timing information. + } + case ProgressEvent.RunCompleted ignored -> report(); + case ProgressEvent.RunFailed ignored -> report(); + } + } + } + + private void markRunStarted() { + if (!runStarted) { + runStartedAtNanos = System.nanoTime(); + runStarted = true; + } + } + + private void closeOpenStage() { + @Nullable String stage = openStage; + if (stage != null) { + stages.add(stage); + stageNanos.add(System.nanoTime() - openedAtNanos); + openStage = null; + } + } + + private void report() { + closeOpenStage(); + long totalNanos = runStarted ? 
System.nanoTime() - runStartedAtNanos : 0; + for (int i = 0; i < stages.size(); i++) { + out.printf( + Locale.ROOT, + "timing: %s = %.2fs (%.1f%%)%n", + stages.get(i), + stageNanos.get(i) / 1e9, + totalNanos > 0 ? stageNanos.get(i) * 100.0 / totalNanos : 0.0); + } + out.printf(Locale.ROOT, "timing: total = %.2fs%n", totalNanos / 1e9); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java new file mode 100644 index 0000000..3fbb3f7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java @@ -0,0 +1,75 @@ +package io.github.p4suta.pipeline.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.progress.ProgressEvent; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import org.junit.jupiter.api.Test; + +/** + * Pins the {@code --timings} report: one machine-parseable {@code timing: = s} + * line per completed stage (in completion order, percentages attached) plus a {@code timing: total} + * line, printed only when the run ends — and on failure, the still-open stage is reported with its + * elapsed-so-far. The line shape is the contract the {@code benchPipeline} harness parses. 
+ */ +final class StageTimingSinkTest { + + private final ByteArrayOutputStream buf = new ByteArrayOutputStream(); + private final StageTimingSink sink = + new StageTimingSink(new PrintStream(buf, true, StandardCharsets.UTF_8)); + + private String output() { + return buf.toString(StandardCharsets.UTF_8); + } + + @Test + void completedRunReportsEachStageInOrderAndATotal() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.PageProcessed("extract", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("spread", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("spread")); + sink.emit(new ProgressEvent.RunCompleted()); + + assertThat(output().lines()) + .hasSize(3) + .satisfiesExactly( + extract -> + assertThat(extract) + .matches( + "timing: extract = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + spread -> + assertThat(spread) + .matches( + "timing: spread = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + total -> assertThat(total).matches("timing: total = \\d+\\.\\d{2}s")); + } + + @Test + void nothingIsPrintedBeforeTheRunEnds() { + sink.emit(new ProgressEvent.RunStarted(1)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 1)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + + assertThat(output()).isEmpty(); + } + + @Test + void failedRunReportsTheStillOpenStage() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("register", 1, 2)); + sink.emit(new ProgressEvent.RunFailed("INTERNAL", "boom")); + + assertThat(output()) + .contains("timing: extract = ") + .contains("timing: register = ") + .contains("timing: total = "); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java 
b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java new file mode 100644 index 0000000..1b59397 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java @@ -0,0 +1,498 @@ +package io.github.p4suta.pipeline.tools; + +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Stage-level runtime + memory benchmark for the installed {@code pdfbook} launcher — the pdfbook + * counterpart of tate's {@code RuntimeBenchmark}, with per-stage attribution as the addition. + * + *

<p>Drives {@code pdfbook <input> -o <tmp>/out.pdf --force --timings [-j N]} as a child process, + * measuring end-to-end wall around the process ({@link System#nanoTime()}), peak RSS by sampling + * the child's {@code /proc/<pid>/status} {@code VmHWM} (Linux-only; {@code n/a} elsewhere), and the + * per-stage wall by parsing the stable {@code timing: <stage> = <seconds>s} lines {@code + * StageTimingSink} prints. Writes a Markdown report. + * + *

<p>Test-sources tool (driven by the {@code benchPipeline} Gradle task): it never ships in the + * production launcher, and it expects the dev container's native toolchain (pdfimages, Leptonica, + * qpdf) on PATH — the installDist launcher, not the jpackage image, is what it measures. + * + *
<p>
Usage: {@code PipelineBenchmark + * ...} — {@code jobsCsv} is a comma-separated {@code -j} sweep ({@code auto} = omit + * {@code -j}, i.e. the launcher's CPU-count default). + */ +public final class PipelineBenchmark { + + private static final Pattern TIMING = + Pattern.compile("^timing: (\\S+) = ([0-9.]+)s", Pattern.MULTILINE); + private static final Pattern VM_HWM = Pattern.compile("VmHWM:\\s*([0-9]+)"); + private static final long POLL_MILLIS = 5; + private static final long PROCESS_TIMEOUT_NANOS = TimeUnit.MINUTES.toNanos(30); + private static final long MIB = 1024L * 1024L; + + private final Path launcher; + private final String qpdf; + private final Path outDoc; + private final int runs; + private final List jobsSweep; + + private PipelineBenchmark( + Path launcher, String qpdf, Path outDoc, int runs, List jobsSweep) { + this.launcher = launcher; + this.qpdf = qpdf; + this.outDoc = outDoc; + this.runs = runs; + this.jobsSweep = jobsSweep; + } + + public static void main(String[] args) throws IOException, InterruptedException { + if (args.length < 6) { + System.err.println( + "usage: PipelineBenchmark " + + " ..."); + System.exit(2); + return; + } + var benchmark = + new PipelineBenchmark( + Path.of(args[0]), + args[1], + Path.of(args[2]), + Integer.parseInt(args[3]), + Arrays.stream(args[4].split(",")).map(String::trim).toList()); + List inputs = Arrays.stream(args).skip(5).map(Path::of).toList(); + benchmark.run(inputs); + } + + // Result records + + /** One measured child run: wall seconds, peak RSS (KiB, -1 if unavailable), merged output. */ + private record Timed(double elapsedSeconds, long maxRssKib, String output) {} + + /** A finished input × jobs measurement, ready to render. 
*/ + private record Row( + String name, + String jobs, + int pages, + long inputBytes, + double wallMedian, + double coldWall, + Map stageMedians, + long rssMedianKib, + long outputBytes) {} + + // Orchestration + + private void run(List inputs) throws IOException, InterruptedException { + requireExecutable(launcher, "pdfbook launcher", "build it first: just pdfbook-install"); + + List rows = new ArrayList<>(); + for (Path input : inputs) { + if (!Files.isRegularFile(input)) { + System.err.println("skip (not found): " + input); + continue; + } + for (String jobs : jobsSweep) { + rows.add(measure(input, jobs)); + } + } + + String report = render(rows); + Files.createDirectories(requireParent(outDoc)); + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } + + private Row measure(Path input, String jobs) throws IOException, InterruptedException { + int pages = pageCount(input); + long inputBytes = Files.size(input); + System.err.printf( + Locale.ROOT, + "Measuring: %s (%dp, %s MiB, jobs=%s)…%n", + fileName(input), + pages, + mib(inputBytes), + jobs); + + Path work = Files.createTempDirectory("pdfbook-bench"); + try { + Path out = work.resolve("out.pdf"); + List convert = new ArrayList<>(); + convert.add(launcher.toString()); + convert.add(input.toString()); + convert.add("-o"); + convert.add(out.toString()); + convert.add("--force"); + convert.add("--timings"); + if (!"auto".equals(jobs)) { + convert.add("-j"); + convert.add(jobs); + } + + // Cold run (fresh page cache for the input is not guaranteed, but a fresh JVM is) — + // recorded separately from the warm median. 
+ Timed cold = timed(convert); + + double[] walls = new double[runs]; + long[] rsss = new long[runs]; + Map> stages = new LinkedHashMap<>(); + for (int r = 0; r < runs; r++) { + Timed t = timed(convert); + walls[r] = t.elapsedSeconds(); + rsss[r] = t.maxRssKib(); + parseTimings(t.output()) + .forEach( + (stage, seconds) -> + stages.computeIfAbsent(stage, ignored -> new ArrayList<>()) + .add(seconds)); + } + Map stageMedians = new LinkedHashMap<>(); + stages.forEach((stage, seconds) -> stageMedians.put(stage, median(seconds))); + + long outputBytes = Files.isRegularFile(out) ? Files.size(out) : -1; + return new Row( + fileName(input), + jobs, + pages, + inputBytes, + median(walls), + cold.elapsedSeconds(), + stageMedians, + medianLong(rsss), + outputBytes); + } finally { + deleteTree(work); + } + } + + /** + * The per-stage seconds of one run, keyed by stage label in print order ({@code total} + * included). Repeated labels (a batch run) sum, though the harness always converts one book. + */ + private static Map parseTimings(String output) { + Map timings = new LinkedHashMap<>(); + Matcher m = TIMING.matcher(output); + while (m.find()) { + timings.merge(m.group(1), Double.parseDouble(m.group(2)), Double::sum); + } + return timings; + } + + // Subprocess measurement + + /** + * Runs {@code command}, returning its wall time, peak RSS (sampled from {@code + * /proc//status} {@code VmHWM}; -1 where unavailable), and its merged stdout+stderr (so + * the {@code timing:} lines can be parsed). Output is drained on a separate thread so a chatty + * child cannot deadlock on a full pipe. 
+ */ + private static Timed timed(List command) throws IOException, InterruptedException { + long start = System.nanoTime(); + Process process = new ProcessBuilder(command).redirectErrorStream(true).start(); + + var captured = new AtomicReference<>(""); + Thread drainer = + Thread.ofVirtual() + .start( + () -> { + try (var in = process.getInputStream()) { + captured.set( + new String( + in.readAllBytes(), StandardCharsets.UTF_8)); + } catch (IOException ignored) { + // Process gone; whatever was read is lost — acceptable for + // a benchmark. + } + }); + + Path status = Path.of("/proc", Long.toString(process.pid()), "status"); + long peakRssKib = -1; + while (process.isAlive()) { + if (System.nanoTime() - start > PROCESS_TIMEOUT_NANOS) { + process.destroyForcibly(); + throw new IOException("timed command did not finish: " + command); + } + peakRssKib = Math.max(peakRssKib, readVmHwmKib(status)); + Thread.sleep(POLL_MILLIS); + } + double elapsed = (System.nanoTime() - start) / 1.0e9; + int exit = process.waitFor(); + drainer.join(); + if (exit != 0) { + throw new IOException( + "benchmark child failed with exit " + exit + ": " + captured.get()); + } + return new Timed(elapsed, peakRssKib, captured.get()); + } + + /** Peak RSS (KiB) from {@code /proc//status}, or -1 if unreadable / non-Linux. */ + private static long readVmHwmKib(Path status) { + try { + Matcher m = VM_HWM.matcher(Files.readString(status, StandardCharsets.UTF_8)); + return m.find() ? Long.parseLong(m.group(1)) : -1; + } catch (IOException | RuntimeException e) { + return -1; // process already exited, or /proc not present + } + } + + /** Page count via {@code qpdf --show-npages} (PATH or absolute), or -1 when unavailable. 
*/ + private int pageCount(Path pdf) throws InterruptedException { + try { + Process process = + new ProcessBuilder(qpdf, "--show-npages", pdf.toString()) + .redirectErrorStream(true) + .start(); + String output; + try (var in = process.getInputStream()) { + output = new String(in.readAllBytes(), StandardCharsets.UTF_8); + } + if (!process.waitFor(1, TimeUnit.MINUTES)) { + process.destroyForcibly(); + return -1; + } + // The count is the one digits-only line; qpdf may surround it with warning lines. + return output.lines() + .map(String::strip) + .filter(line -> line.matches("\\d+")) + .findFirst() + .map(Integer::parseInt) + .orElse(-1); + } catch (IOException e) { + return -1; // qpdf not installed — page count is cosmetic here + } + } + + // Numeric helpers + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + if (n == 0) { + return 0; + } + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + private static double median(List values) { + return median(values.stream().mapToDouble(Double::doubleValue).toArray()); + } + + private static long medianLong(long[] values) { + return Math.round(median(Arrays.stream(values).asDoubleStream().toArray())); + } + + private static String mib(long bytes) { + return bytes < 0 ? "n/a" : String.format(Locale.ROOT, "%.1f", bytes / (double) MIB); + } + + // Rendering + + private String render(List rows) { + // Stage columns: the union of stage labels across rows in first-appearance order, with + // "total" (the launcher's in-process conversion time) pulled out as its own column. 
+ Set stageNames = new LinkedHashSet<>(); + for (Row row : rows) { + stageNames.addAll(row.stageMedians().keySet()); + } + stageNames.remove("total"); + + var sb = new StringBuilder(); + sb.append("# pdfbook runtime baseline (stage-level)\n\n") + .append("Generated by `PipelineBenchmark`") + .append(" (`./gradlew :pipeline:app:benchPipeline`, in the dev container).\n") + .append( + "Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak" + + " memory**\n") + .append( + "of the installDist `pdfbook` launcher. Re-run after any change to the" + + " pipeline\n") + .append( + "and compare against the previous run before merging (acceptance: ≥5%" + + " median\n") + .append( + "total-wall improvement, or an explicit RSS/disk win, with output" + + " validated).\n\n"); + appendHostInfo(sb); + sb.append("\n## Stage breakdown (warm median of ").append(runs).append(" runs)\n\n"); + sb.append( + "`conv` is the launcher's in-process total (`timing: total`);" + + " `startup+init` = E2E wall − conv\n") + .append( + "(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's" + + " CPU-count default.\n\n"); + sb.append("| Input | Jobs | Pages | E2E wall | conv |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append(" startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) |\n"); + sb.append("|---|---|---:|---:|---:|"); + sb.append("---:|".repeat(stageNames.size())); + sb.append("---:|---:|---:|---:|\n"); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ") + .append(row.name()) + .append(" | ") + .append(row.jobs()) + .append(" | ") + .append(pages(row.pages())) + .append(" | ") + .append(secs(row.wallMedian())) + .append(" | ") + .append(conv > 0 ? secs(conv) : "n/a") + .append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ').append(seconds == null ? 
"n/a" : secs(seconds)).append(" |"); + } + sb.append(' ') + .append(conv > 0 ? secs(Math.max(0, row.wallMedian() - conv)) : "n/a") + .append(" | ") + .append(secs(row.coldWall())) + .append(" | ") + .append(rssMib(row.rssMedianKib())) + .append(" | ") + .append(mib(row.outputBytes())) + .append(" |\n"); + } + sb.append("\n## Stage shares (of conv, warm median)\n\n") + .append( + "The shares that decide where optimization effort goes: a stage that is" + + " ~5% of conv\n") + .append("cannot pay for a parallelization rewrite no matter how elegant.\n\n"); + sb.append("| Input | Jobs |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append('\n').append("|---|---|").append("---:|".repeat(stageNames.size())).append('\n'); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ").append(row.name()).append(" | ").append(row.jobs()).append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ') + .append( + seconds == null || conv <= 0 + ? "n/a" + : String.format( + Locale.ROOT, "%.1f%%", seconds * 100.0 / conv)) + .append(" |"); + } + sb.append('\n'); + } + return sb.toString(); + } + + private void appendHostInfo(StringBuilder sb) { + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + long totalRamBytes = totalPhysicalMemoryBytes(); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.version", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs, RAM ") + .append(totalRamBytes > 0 ? 
Math.round(totalRamBytes / 1.073741824e9) + "Gi" : "?") + .append('\n'); + sb.append("- Launcher: `").append(launcher).append("`\n"); + sb.append("- Samples per measurement: cold (1st run) + warm median of ") + .append(runs) + .append(".\n"); + sb.append( + "- The default input is the deterministic synthetic fixture" + + " (`createSampleScan`,\n") + .append( + " seeded, so identical across machines). Real books are pluggable via" + + " `-Pinputs=\"…\"`;\n") + .append(" only their page count and byte size are reported.\n"); + } + + private static long totalPhysicalMemoryBytes() { + if (ManagementFactory.getOperatingSystemMXBean() + instanceof com.sun.management.OperatingSystemMXBean os) { + return os.getTotalMemorySize(); + } + return -1; + } + + private static String secs(double seconds) { + return String.format(Locale.ROOT, "%.2fs", seconds); + } + + private static String pages(int pages) { + return pages < 0 ? "?" : Integer.toString(pages); + } + + private static String rssMib(long rssKib) { + return rssKib < 0 ? "n/a" : Long.toString(Math.round(rssKib / 1024.0)); + } + + // Small utilities + + private static String fileName(Path path) { + Path name = path.getFileName(); + return name != null ? 
name.toString() : path.toString(); + } + + private static void requireExecutable(Path path, String what, String hint) { + if (!Files.isExecutable(path)) { + System.err.println("error: " + what + " not found at " + path); + System.err.println(" " + hint); + System.exit(1); + } + } + + private static Path requireParent(Path path) { + Path parent = path.getParent(); + if (parent == null) { + throw new IllegalArgumentException("output path has no parent: " + path); + } + return parent; + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java new file mode 100644 index 0000000..8b111a7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java @@ -0,0 +1,135 @@ +package io.github.p4suta.pipeline.tools; + +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Random; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +/** + * Generates the synthetic, copyright-free bitonal "scan" book the {@code benchPipeline} harness + * converts: A5 pages at the requested dpi carrying vertical 
text-like columns with per-page + * position jitter, a small per-page skew of up to ±0.5° (so the register stage's deskew has real + * work) and salt-and-pepper specks (so despeckle has real work), embedded as CCITT G4 so {@code + * pdfimages} extracts them exactly like a real scan. A fixed seed keeps every generation + * byte-identical, so benchmark runs stay comparable across machines and branches. + * + *

This deliberately lives in test sources (driven by the {@code createSampleScan} Gradle task), + * mirroring register's {@code SamplePdfGenerator}: the dev tool never ships in the production + * launcher. An existing output is reused, so repeated benchmark runs skip the generation cost. + * + *

Usage: {@code SampleScanGenerator [pages] [dpi]} + */ +public final class SampleScanGenerator { + + private SampleScanGenerator() {} + + /** {@code SampleScanGenerator [pages] [dpi]} — writes the synthetic scan book. */ + public static void main(String[] args) throws IOException { + Path out = Path.of(args.length > 0 ? args[0] : "sample-scan.pdf"); + int pages = args.length > 1 ? Integer.parseInt(args[1]) : 200; + int dpi = args.length > 2 ? Integer.parseInt(args[2]) : 600; + if (Files.isRegularFile(out)) { + System.out.println("reusing existing " + out + " (delete it to regenerate)"); + return; + } + long start = System.nanoTime(); + write(out, pages, dpi); + System.out.printf( + Locale.ROOT, + "wrote %s: %d page(s) at %d dpi, %.1f MiB in %.1fs%n", + out, + pages, + dpi, + Files.size(out) / (1024.0 * 1024.0), + (System.nanoTime() - start) / 1e9); + } + + /** Writes a {@code pages}-page synthetic bitonal scan book to {@code out} (A5 geometry). */ + public static void write(Path out, int pages, int dpi) throws IOException { + int width = Math.round(148f * dpi / 25.4f); // A5 portrait: 148 mm × 210 mm + int height = Math.round(210f * dpi / 25.4f); + Random random = new Random(42); + Path parent = out.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + try (PDDocument doc = new PDDocument()) { + float widthPt = width * 72f / dpi; + float heightPt = height * 72f / dpi; + for (int i = 0; i < pages; i++) { + PDImageXObject image = + CCITTFactory.createFromImage(doc, page(width, height, random)); + PDPage page = new PDPage(new PDRectangle(widthPt, heightPt)); + doc.addPage(page); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, widthPt, heightPt); + } + } + doc.save(out.toFile()); + } + } + + /** One page: slightly skewed text-like columns plus unrotated scanner-dust specks. 
*/ + private static BufferedImage page(int width, int height, Random random) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + double skew = Math.toRadians(random.nextDouble() - 0.5); // ±0.5° + g.rotate(skew, width / 2.0, height / 2.0); + drawColumns(g, width, height, random); + g.rotate(-skew, width / 2.0, height / 2.0); + drawSpecks(g, width, height, random); + } finally { + g.dispose(); + } + return img; + } + + /** + * Vertical "text" columns right-to-left (Japanese book layout): stacked glyph-sized blocks with + * per-page jitter so registration has a real column position to detect and correct, and random + * early line breaks so the texture resembles prose rather than a solid block. + */ + private static void drawColumns(Graphics2D g, int width, int height, Random random) { + int margin = width / 10; + int glyph = Math.max(4, width / 60); + int leading = glyph / 2; + int jitterX = random.nextInt(glyph + 1) - glyph / 2; + int top = height / 12 + random.nextInt(glyph + 1); + int bottom = height - height / 12; + for (int x = width - margin - glyph + jitterX; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + // ~8% of glyph slots end the "sentence" early, leaving prose-like white runs. + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + g.fillRect(x, y, glyph - 2, glyph - 2); + y += glyph; + } + } + } + + /** Salt-and-pepper dust: ~1 speck of 1–3 px per 25k pixels, what despeckle exists to remove. 
*/ + private static void drawSpecks(Graphics2D g, int width, int height, Random random) { + int specks = width * height / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + g.fillRect(random.nextInt(width - size), random.nextInt(height - size), size, size); + } + } +} diff --git a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java index 0810505..8ae99b8 100644 --- a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java +++ b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java @@ -70,7 +70,11 @@ public void run( progress.emit(new ProgressEvent.StageStarted(source.name(), position, total)); Corpus corpus = source.open(stageDir(work, 0, source.name())); - log.info("source: {} page(s) at {} dpi", corpus.pageCount(), corpus.dpi()); + log.info( + "source: {} page(s) at {} dpi, {}", + corpus.pageCount(), + corpus.dpi(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(source.name())); position++; @@ -78,7 +82,12 @@ public void run( for (Stage stage : stages) { progress.emit(new ProgressEvent.StageStarted(stage.name(), position, total)); corpus = stage.apply(corpus, stageDir(work, dirIndex, stage.name())); - log.info("stage {} ({}): {} page(s)", dirIndex, stage.name(), corpus.pageCount()); + log.info( + "stage {} ({}): {} page(s), {}", + dirIndex, + stage.name(), + corpus.pageCount(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(stage.name())); position++; dirIndex++; @@ -111,6 +120,28 @@ private static Path stageDir(Path work, int index, String name) throws IOExcepti work.resolve(String.format(Locale.ROOT, "%02d-%s", index, name))); } + /** + * The stage directory's total file bytes rendered as MiB — visibility into how much + * intermediate I/O each stage 
produces (best-effort: {@code ?} when the walk fails). + */ + private static String intermediatesSize(Path dir) { + try (Stream files = Files.walk(dir)) { + long bytes = + files.filter(Files::isRegularFile).mapToLong(PipelineRunner::sizeQuietly).sum(); + return String.format(Locale.ROOT, "%.1f MiB", bytes / (1024.0 * 1024.0)); + } catch (IOException e) { + return "? MiB"; + } + } + + private static long sizeQuietly(Path file) { + try { + return Files.size(file); + } catch (IOException e) { + return 0L; + } + } + private static void deleteRecursively(Path dir) { try (Stream walk = Files.walk(dir)) { walk.sorted(Comparator.reverseOrder()).forEach(PipelineRunner::deleteQuietly); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md new file mode 100644 index 0000000..9727575 --- /dev/null +++ b/pipeline/docs/perf-baseline.md @@ -0,0 +1,35 @@ +# pdfbook runtime baseline (stage-level) + +Generated by `PipelineBenchmark` (`./gradlew :pipeline:app:benchPipeline`, in the dev container). +Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak memory** +of the installDist `pdfbook` launcher. Re-run after any change to the pipeline +and compare against the previous run before merging (acceptance: ≥5% median +total-wall improvement, or an explicit RSS/disk win, with output validated). + +- Date (UTC): 2026-06-10 04:46:07 +- Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi +- Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` +- Samples per measurement: cold (1st run) + warm median of 3. +- The default input is the deterministic synthetic fixture (`createSampleScan`, + seeded, so identical across machines). Real books are pluggable via `-Pinputs="…"`; + only their page count and byte size are reported. + +## Stage breakdown (warm median of 3 runs) + +`conv` is the launcher's in-process total (`timing: total`); `startup+init` = E2E wall − conv +(JVM boot + first-touch PDFBox/AWT init). 
`jobs=auto` is the launcher's CPU-count default. + +| Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | + +## Stage shares (of conv, warm median) + +The shares that decide where optimization effort goes: a stage that is ~5% of conv +cannot pay for a parallelization rewrite no matter how elegant. + +| Input | Jobs | extract | despeckle | register | spread | +|---|---|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | From 62c37bf10271edca106bb2e61bf997ecb0eb7fb8 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:12:25 +0900 Subject: [PATCH 2/4] perf(extract): remux all-CCITT sources instead of decoding, finer chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pdfimages -tiff decodes every embedded G4 image into an uncompressed TIFF (~2.2 MB per 600-dpi page; ~434 MB of transient intermediates for a 200-page book) even though the typical self-scanned source is CCITT G4 end to end. (The originally planned `-tiffcompression g4` flag does not exist on pdfimages — it is a pdftoppm option.) The extractor now picks its mode from one pdfimages -list pass: when every embedded image is 1-bpp CCITT, each chunk dumps the raw G4 streams (-ccitt) and CcittTiffs wraps them verbatim into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates drop ~60x, and the image's true ppi is stamped instead of pdfimages' default 72 dpi. 
Because PDF's EncodedByteAlign never reaches the dumped .params file, every wrapped page is decoded back once through Leptonica as verification; a chunk that deviates in any way (params shape, count, or a wrap that fails to decode) is re-extracted decoded, which is also the whole-run mode for any non-CCITT source. The photometric mapping (-B -> WhiteIsZero, -W -> BlackIsZero) is pinned empirically by a pixel-identical round trip test. Extraction chunks also shrink from total/jobs to ~12 pages (capped at 4*jobs): fast finishers free their pool slot early, and a future streaming source can consume pages chunk by chunk. Benchmark (200-page fixture, warm median of 3, vs the PR #28 baseline): extract 1.15s -> 0.46s at -j8 (4.57s -> 0.88s at -j1), conv 49.85s -> 45.98s (-7.8%) at -j1, intermediates ~434 MB -> ~7 MB. Output validated with qpdf --check (100 spreads, linearized, no errors). Co-Authored-By: Claude Fable 5 --- .../p4suta/pipeline/cli/PipelineCommand.java | 8 +- pipeline/docs/perf-baseline.md | 10 +- .../infrastructure/G4EncodeStage.java | 13 +- .../github/p4suta/shared/pdf/CcittTiffs.java | 159 +++++++++++++++ .../shared/pdf/PdfImagesCliExtractor.java | 190 ++++++++++++++++-- .../p4suta/shared/pdf/PdfListingParser.java | 68 +++++-- .../p4suta/shared/pdf/CcittTiffsTest.java | 160 +++++++++++++++ .../shared/pdf/PdfImagesCliExtractorTest.java | 63 ++++++ .../shared/pdf/PdfListingParserTest.java | 23 +++ 9 files changed, 650 insertions(+), 44 deletions(-) create mode 100644 shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java create mode 100644 shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index f29449d..25f8125 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ 
b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -427,9 +427,11 @@ private static void runWith(Path input, Path output, Config config, ProgressSink stages.add(new RegisterStage(config.jobs(), config.deskew(), config.scale(), progress)); } if (stages.isEmpty()) { - // --no-despeckle --no-register: the raw pdfimages TIFFs are not CCITT G4, which the - // spread sink's pass-through embedding requires; despeckle/register each re-encode G4 - // themselves, so only the no-stage path needs this normalization. + // --no-despeckle --no-register: a non-CCITT source extracts as decoded TIFFs that are + // not the single-strip CCITT G4 the spread sink's pass-through embedding requires; + // despeckle/register each re-encode G4 themselves, so only the no-stage path needs + // this normalization (an all-CCITT source arrives already G4 — then this is a cheap + // lossless re-encode that keeps the path uniform). stages.add(new G4EncodeStage(config.jobs(), progress)); } Source source = new PdfExtractSource(input, config.jobs()); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md index 9727575..06ea504 100644 --- a/pipeline/docs/perf-baseline.md +++ b/pipeline/docs/perf-baseline.md @@ -6,7 +6,7 @@ of the installDist `pdfbook` launcher. Re-run after any change to the pipeline and compare against the previous run before merging (acceptance: ≥5% median total-wall improvement, or an explicit RSS/disk win, with output validated). -- Date (UTC): 2026-06-10 04:46:07 +- Date (UTC): 2026-06-10 05:11:24 - Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi - Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` - Samples per measurement: cold (1st run) + warm median of 3. @@ -21,8 +21,8 @@ total-wall improvement, or an explicit RSS/disk win, with output validated). 
| Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | |---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | -| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | +| sample-scan-200p.pdf | 1 | 200 | 46.18s | 45.98s | 0.88s | 31.95s | 12.94s | 0.20s | 0.20s | 47.85s | 173 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.43s | 14.23s | 0.46s | 10.19s | 3.28s | 0.20s | 0.20s | 14.16s | 322 | 6.4 | ## Stage shares (of conv, warm median) @@ -31,5 +31,5 @@ cannot pay for a parallelization rewrite no matter how elegant. | Input | Jobs | extract | despeckle | register | spread | |---|---|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | -| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | +| sample-scan-200p.pdf | 1 | 1.9% | 69.5% | 28.1% | 0.4% | +| sample-scan-200p.pdf | 8 | 3.2% | 71.6% | 23.0% | 1.4% | diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java index bcd1008..bb3ad78 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java @@ -18,11 +18,14 @@ /** * The G4-normalization {@link Stage}: re-encodes each extracted page as single-strip CCITT G4 TIFF - * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. 
{@code - * pdfimages -tiff} writes poppler's default (non-G4) TIFF compression, so the raw extract output - * cannot be embedded directly; despeckle and register each re-encode their output as G4 themselves, - * so the composition root inserts this stage only when neither of them runs. The corpus dpi is - * stamped on every page, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. The + * extractor's decoded mode ({@code pdfimages -tiff}, used for any source that is not all-CCITT) + * writes poppler's default (non-G4) TIFF compression at a default 72 dpi, so that output cannot be + * embedded directly; despeckle and register each re-encode their output as G4 themselves, so the + * composition root inserts this stage only when neither of them runs. The corpus dpi is stamped on + * every page. (For an all-CCITT source the extractor's remux already produces stamped single-strip + * G4 — this stage then re-encodes losslessly, a small constant cost that keeps the no-stage path + * uniform.) */ public final class G4EncodeStage implements Stage { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java new file mode 100644 index 0000000..1bfd7c3 --- /dev/null +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java @@ -0,0 +1,159 @@ +package io.github.p4suta.shared.pdf; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import org.jspecify.annotations.Nullable; + +/** + * Wraps the raw CCITT G4 stream {@code pdfimages -ccitt} dumps into a single-strip TIFF — the + * pass-through half of the extractor's remux mode: the scan's embedded G4 bytes become a readable + * TIFF without ever being decoded and re-encoded. + * + *

poppler writes a fax2tiff-style {@code .params} file beside each dump ({@code -4} G4 / {@code + * -1 -2} G3, {@code -A} EOL markers / {@code -P} none, {@code -X }, {@code -W} BlackIs1 / + * {@code -B} not, {@code -M} MSB-first). Only the plain shape TIFF's T.6 compression can represent + * verbatim is {@linkplain #supported supported}: G4, no EOL markers, MSB-first. Crucially, PDF's + * {@code EncodedByteAlign} never reaches the params file, so a wrapped stream is only trusted after + * the caller decodes it back successfully (see the extractor's read-back verification). + */ +final class CcittTiffs { + + private static final short TYPE_SHORT = 3; + private static final short TYPE_LONG = 4; + private static final short TYPE_RATIONAL = 5; + + private CcittTiffs() {} + + /** + * The decode parameters poppler records beside a {@code .ccitt} dump. + * + * @param kind the coding scheme flag: {@code -4} (G4), {@code -2} (G3 2D) or {@code -1} (G3 1D) + * @param endOfLine whether rows are prefixed with EOL markers ({@code -A}) + * @param columns the row width in pixels ({@code -X}) + * @param blackIs1 whether decoded 1-bits are black ({@code -W}) or 0-bits are ({@code -B}) + */ + record Params(String kind, boolean endOfLine, int columns, boolean blackIs1) {} + + /** Parse a {@code .params} file's text, or {@code null} when any token is unrecognized. 
*/ + static @Nullable Params parseParams(String text) { + @Nullable String kind = null; + @Nullable Boolean endOfLine = null; + @Nullable Integer columns = null; + @Nullable Boolean blackIs1 = null; + boolean msbFirst = false; + String[] tokens = text.trim().split("\\s+", -1); + for (int i = 0; i < tokens.length; i++) { + switch (tokens[i]) { + case "-4", "-2", "-1" -> kind = tokens[i]; + case "-A" -> endOfLine = true; + case "-P" -> endOfLine = false; + case "-W" -> blackIs1 = true; + case "-B" -> blackIs1 = false; + case "-M" -> msbFirst = true; + case "-X" -> { + i++; + if (i >= tokens.length) { + return null; + } + try { + columns = Integer.parseInt(tokens[i]); + } catch (NumberFormatException e) { + return null; + } + } + default -> { + return null; + } + } + } + if (kind == null || endOfLine == null || columns == null || blackIs1 == null || !msbFirst) { + return null; + } + return new Params(kind, endOfLine, columns, blackIs1); + } + + /** + * Whether {@code params} describes a stream TIFF T.6 represents verbatim: Group 4, no EOL + * markers, and a width agreeing with the listing row the dump corresponds to. + */ + static boolean supported(Params params, int expectedWidth) { + return "-4".equals(params.kind()) + && !params.endOfLine() + && params.columns() == expectedWidth; + } + + /** + * Write {@code g4} as a little-endian, single-strip CCITT-G4 TIFF — header, the verbatim stream + * as the one strip, then the IFD. + * + * @param out the TIFF to write + * @param g4 the raw G4 (T.6) stream, verbatim + * @param width the row width in pixels + * @param height the row count + * @param blackIs1 the params' photometric hint: decoded 1-bits are black ({@code -W}) + * @param dpi the resolution to stamp, or {@code <= 0} to omit the resolution tags + */ + static void writeSingleStripG4( + Path out, byte[] g4, int width, int height, boolean blackIs1, int dpi) + throws IOException { + boolean withResolution = dpi > 0; + int entryCount = withResolution ? 
14 : 11; + int stripOffset = 8; + int padding = g4.length % 2; // IFD offsets must be word-aligned + int ifdOffset = stripOffset + g4.length + padding; + int rationalOffset = ifdOffset + 2 + entryCount * 12 + 4; + ByteBuffer buf = + ByteBuffer.allocate(rationalOffset + (withResolution ? 16 : 0)) + .order(ByteOrder.LITTLE_ENDIAN); + + buf.put((byte) 'I').put((byte) 'I').putShort((short) 42).putInt(ifdOffset); + buf.put(g4); + if (padding == 1) { + buf.put((byte) 0); + } + + buf.putShort((short) entryCount); // entries below stay sorted by tag id + entry(buf, 256, TYPE_LONG, width); // ImageWidth + entry(buf, 257, TYPE_LONG, height); // ImageLength + entryShort(buf, 258, 1); // BitsPerSample + entryShort(buf, 259, 4); // Compression: CCITT T.6 (Group 4) + // The G4 stream encodes white/black runs; this tag tells readers which sense to + // materialize them in. The PDF default (-B, BlackIs1=false) is the standard fax sense — + // TIFF WhiteIsZero (0); -W (BlackIs1=true) is the inverted sense, BlackIsZero (1). + // Pinned empirically by CcittTiffsTest's pixel-identical round trip. + entryShort(buf, 262, blackIs1 ? 1 : 0); // PhotometricInterpretation + entryShort(buf, 266, 1); // FillOrder: MSB first (params -M) + entry(buf, 273, TYPE_LONG, stripOffset); // StripOffsets + entryShort(buf, 277, 1); // SamplesPerPixel + entry(buf, 278, TYPE_LONG, height); // RowsPerStrip: the single strip + entry(buf, 279, TYPE_LONG, g4.length); // StripByteCounts + if (withResolution) { + entry(buf, 282, TYPE_RATIONAL, rationalOffset); // XResolution + entry(buf, 283, TYPE_RATIONAL, rationalOffset + 8); // YResolution + } + entry(buf, 293, TYPE_LONG, 0); // T6Options: none + if (withResolution) { + entryShort(buf, 296, 2); // ResolutionUnit: inch + } + buf.putInt(0); // no next IFD + + if (withResolution) { + buf.putInt(dpi).putInt(1).putInt(dpi).putInt(1); + } + Files.write(out, buf.array()); + } + + /** One IFD entry holding an inline LONG (or a RATIONAL's value offset). 
*/ + private static void entry(ByteBuffer buf, int tag, short type, int value) { + buf.putShort((short) tag).putShort(type).putInt(1).putInt(value); + } + + /** One IFD entry holding an inline SHORT (left-justified in the 4-byte value field). */ + private static void entryShort(ByteBuffer buf, int tag, int value) { + buf.putShort((short) tag).putShort(TYPE_SHORT).putInt(1); + buf.putShort((short) value).putShort((short) 0); + } +} diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 35c8b77..2c4bfac 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -1,24 +1,32 @@ package io.github.p4suta.shared.pdf; +import io.github.p4suta.shared.imaging.Pix; import io.github.p4suta.shared.process.ProcessRunner; import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeoutException; +import java.util.stream.Stream; +import org.jspecify.annotations.Nullable; /** * Extracts a PDF's embedded bitonal images as TIFFs by driving {@code pdfimages}. The page range is * split across the worker pool (one {@code pdfimages -f/-l} per chunk) with distinct zero-padded - * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. 
The - * dominant scan DPI is read from {@code pdfimages -list} and passed to the clean step as an - * explicit DPI, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. An + * all-CCITT source is remuxed — the raw embedded G4 streams pass through into single-strip TIFFs + * with their true ppi stamped (see {@link #extract}); any other source is extracted decoded, where + * {@code pdfimages} tags the TIFFs at a default 72 dpi, so the dominant scan DPI from {@code + * pdfimages -list} is passed downstream explicitly either way. * *

The textual {@code pdfinfo}/{@code pdfimages -list} reports are parsed by the pure {@link * PdfListingParser}; this adapter only drives the external processes via {@link @@ -93,13 +101,33 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks of about {@link #CHUNK_PAGES} pages on {@code pool} (at most {@code 4 * jobs} chunks). + * + *

One {@code pdfimages -list} pass picks the mode: when every embedded image is 1-bpp CCITT + * (the usual self-scanned book), each chunk dumps the raw G4 streams ({@code -ccitt}) and wraps + * them into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates tens + * of KB per page instead of the decoded megabytes, and the image's true ppi stamped instead of + * {@code pdfimages}' default 72 dpi. Every wrapped page is decoded back once as verification + * (PDF's {@code EncodedByteAlign} never reaches the dumped params, so trust requires a decode); + * a chunk whose dump or wrap deviates in any way is re-extracted decoded ({@code -tiff}), which + * is also the whole-run mode for any other source. */ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { int total = pageCount(pdf); - int chunks = Math.max(1, Math.min(jobs, total)); - int per = (total + chunks - 1) / chunks; String pdfimages = resolve("pdfimages", pdfimagesPropertyKey); + List rows = + PdfListingParser.parseImageRows( + capture(List.of(pdfimages, "-list", pdf.toString()), INFO_TIMEOUT)); + boolean rawCcitt = + !rows.isEmpty() + && rows.stream().allMatch(r -> r.bpc() == 1 && "ccitt".equals(r.enc())); + + // Chunks of ~CHUNK_PAGES rather than total/jobs: fast finishers free their pool slot early + // (the straggler tail shrinks from total/jobs to ~CHUNK_PAGES pages), and a streaming + // consumer can take pages chunk by chunk. Capped so a small book is not all process spawns. 
+ int chunkCap = (int) Math.min(4L * jobs, total); + int chunks = Math.clamp(Math.ceilDiv(total, CHUNK_PAGES), 1, Math.max(1, chunkCap)); + int per = Math.ceilDiv(total, chunks); List> tasks = new ArrayList<>(); int chunk = 0; for (int first = 1; first <= total; first += per) { @@ -108,18 +136,11 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw outDir.resolve(String.format(Locale.ROOT, "page-c%03d-", chunk)).toString(); int from = first; int to = last; + List chunkRows = + rawCcitt ? rowsInRange(rows, from, to) : List.of(); tasks.add( () -> { - runDiscarding( - List.of( - pdfimages, - "-tiff", - "-f", - Integer.toString(from), - "-l", - Integer.toString(to), - pdf.toString(), - prefix)); + extractChunk(pdfimages, pdf, from, to, prefix, chunkRows); return null; }); chunk++; @@ -127,6 +148,143 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw Tasks.awaitAll(pool, tasks, "pdfimages extract interrupted", "pdfimages extract failed"); } + /** Pages per extraction chunk; see {@link #extract}. */ + private static final int CHUNK_PAGES = 12; + + /** The listing rows for pages {@code from..to}, in listing (= dump) order. */ + private static List rowsInRange( + List rows, int from, int to) { + return rows.stream().filter(r -> r.page() >= from && r.page() <= to).toList(); + } + + /** + * Extract one page-range chunk: raw-CCITT remux when {@code ccittRows} describes it, decoded + * {@code -tiff} otherwise — and the {@code -tiff} rerun as the fallback when the dump deviates + * from the listing in any way (count, params shape, or a wrap that does not decode back). 
+ */ + private void extractChunk( + String pdfimages, + Path pdf, + int from, + int to, + String prefix, + List ccittRows) + throws IOException { + if (ccittRows.isEmpty()) { + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + return; + } + runDiscarding(extractCommand(pdfimages, "-ccitt", from, to, pdf, prefix)); + if (!wrapChunk(prefix, ccittRows)) { + deleteByPrefix(prefix); + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + } + } + + private static List extractCommand( + String pdfimages, String format, int from, int to, Path pdf, String prefix) { + return List.of( + pdfimages, + format, + "-f", + Integer.toString(from), + "-l", + Integer.toString(to), + pdf.toString(), + prefix); + } + + /** + * Wrap every {@code .ccitt} dump under {@code prefix} into a single-strip G4 TIFF, verifying + * each by decoding it back. Returns {@code false} (without cleaning up) on any deviation; the + * caller then discards the chunk's artifacts and falls back to a decoded extract. 
+ */ + private static boolean wrapChunk(String prefix, List rows) + throws IOException { + List dumps = filesByPrefix(prefix, ".ccitt"); + if (dumps.size() != rows.size()) { + return false; + } + for (int i = 0; i < dumps.size(); i++) { + Path ccitt = dumps.get(i); + PdfListingParser.ImageRow row = rows.get(i); + Path paramsFile = withExtension(ccitt, ".params"); + if (!Files.isRegularFile(paramsFile)) { + return false; + } + CcittTiffs.@Nullable Params params = + CcittTiffs.parseParams(Files.readString(paramsFile, StandardCharsets.UTF_8)); + if (params == null || !CcittTiffs.supported(params, row.width())) { + return false; + } + Path out = withExtension(ccitt, ".tif"); + CcittTiffs.writeSingleStripG4( + out, + Files.readAllBytes(ccitt), + row.width(), + row.height(), + params.blackIs1(), + Math.max(row.xPpi(), 0)); + if (!decodesBack(out, row)) { + return false; + } + Files.delete(ccitt); + Files.delete(paramsFile); + } + return true; + } + + /** + * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that + * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the + * params file, fails or mis-sizes here). + */ + private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { + try (Pix pix = Pix.read(tif)) { + return pix.width() == row.width() && pix.height() == row.height(); + } catch (IllegalStateException e) { + return false; + } + } + + /** The files starting with {@code prefix}'s file name and ending in {@code suffix}, sorted. 
*/ + private static List filesByPrefix(String prefix, String suffix) throws IOException { + Path prefixPath = Path.of(prefix); + Path dir = prefixPath.getParent(); + String name = String.valueOf(prefixPath.getFileName()); + if (dir == null) { + throw new IOException("extract prefix has no parent directory: " + prefix); + } + try (Stream entries = Files.list(dir)) { + return entries.filter( + p -> { + String fileName = String.valueOf(p.getFileName()); + return fileName.startsWith(name) && fileName.endsWith(suffix); + }) + .sorted(Comparator.comparing(p -> String.valueOf(p.getFileName()))) + .toList(); + } + } + + /** + * Delete every artifact of one chunk ({@code .ccitt}, {@code .params}, partial {@code .tif}). + */ + private static void deleteByPrefix(String prefix) throws IOException { + for (String suffix : List.of(".ccitt", ".params", ".tif")) { + for (Path file : filesByPrefix(prefix, suffix)) { + Files.deleteIfExists(file); + } + } + } + + /** A sibling of {@code file} with its extension replaced by {@code extension}. */ + private static Path withExtension(Path file, String extension) { + String name = String.valueOf(file.getFileName()); + int dot = name.lastIndexOf('.'); + String base = dot < 0 ? name : name.substring(0, dot); + return file.resolveSibling(base + extension); + } + /** Run an extraction command, discarding its (file-producing) output. 
*/ private static void runDiscarding(List command) throws IOException { try { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java index efe12aa..0a2bb1b 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java @@ -1,6 +1,8 @@ package io.github.p4suta.shared.pdf; +import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; /** @@ -36,27 +38,17 @@ public static int parsePageCount(String pdfinfoOutput) { } /** - * The most common rounded x-ppi (column 13, 0-based 12) across the {@code image} rows of a - * {@code pdfimages -list} report, skipping the two header rows. Ties resolve to the first value - * seen and a non-positive winner falls back to {@link #DEFAULT_DPI}. + * The most common rounded x-ppi across the {@code image} rows of a {@code pdfimages -list} + * report. Ties resolve to the first value seen and a non-positive winner falls back to {@link + * #DEFAULT_DPI}. * * @param listOutput the full text {@code pdfimages -list} printed * @return the dominant rounded x-ppi, or {@link #DEFAULT_DPI} when none is usable */ public static int parseDominantDpi(String listOutput) { - String[] lines = listOutput.split("\n", -1); Map counts = new LinkedHashMap<>(); - for (int i = 2; i < lines.length; i++) { - String[] fields = lines[i].trim().split("\\s+", -1); - if (fields.length < 13 || !"image".equals(fields[2])) { - continue; - } - try { - int ppi = (int) Math.round(Double.parseDouble(fields[12])); - counts.merge(ppi, 1, Integer::sum); - } catch (NumberFormatException ignored) { - // Non-numeric x-ppi cell: skip this row. 
- } + for (ImageRow row : parseImageRows(listOutput)) { + counts.merge(row.xPpi(), 1, Integer::sum); } if (counts.isEmpty()) { return DEFAULT_DPI; @@ -71,4 +63,50 @@ public static int parseDominantDpi(String listOutput) { } return best > 0 ? best : DEFAULT_DPI; } + + /** + * One {@code image} row of a {@code pdfimages -list} report — the columns the extractor needs + * to pick its mode and to wrap raw CCITT dumps. + * + * @param page the 1-based page the image sits on + * @param width the image width in pixels + * @param height the image height in pixels + * @param bpc bits per component ({@code 1} for bitonal) + * @param enc the embedded encoding token ({@code ccitt}, {@code jbig2}, {@code jpeg}, {@code + * image}, …) + * @param xPpi the rounded x-ppi the image is placed at (a non-numeric cell skips the row) + */ + public record ImageRow(int page, int width, int height, int bpc, String enc, int xPpi) {} + + /** + * Parse the {@code image} rows of a {@code pdfimages -list} report, in listing order (the same + * order {@code pdfimages} dumps the images in), skipping the two header rows and any row with + * unparsable numeric cells. + * + * @param listOutput the full text {@code pdfimages -list} printed + * @return the parsed rows, possibly empty + */ + public static List parseImageRows(String listOutput) { + String[] lines = listOutput.split("\n", -1); + List rows = new ArrayList<>(); + for (int i = 2; i < lines.length; i++) { + String[] fields = lines[i].trim().split("\\s+", -1); + if (fields.length < 13 || !"image".equals(fields[2])) { + continue; + } + try { + rows.add( + new ImageRow( + Integer.parseInt(fields[0]), + Integer.parseInt(fields[3]), + Integer.parseInt(fields[4]), + Integer.parseInt(fields[7]), + fields[8], + (int) Math.round(Double.parseDouble(fields[12])))); + } catch (NumberFormatException ignored) { + // A non-numeric cell: skip this row. 
+ } + } + return rows; + } } diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java new file mode 100644 index 0000000..63e93d6 --- /dev/null +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java @@ -0,0 +1,160 @@ +package io.github.p4suta.shared.pdf; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.imaging.Pix; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import javax.imageio.ImageIO; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * The CCITT remux building blocks: the fax2tiff-style params parser (pure) and the single-strip G4 + * TIFF writer, verified by wrapping a real G4 stream (PDFBox's CCITT encoder, the same encoding a + * scanner PDF embeds) and decoding it back through Leptonica pixel-for-pixel. + */ +final class CcittTiffsTest { + + // ---- params parsing ---- + + @Test + void parsesTheUsualScannerShape() { + CcittTiffs.Params params = parsed("-4 -P -X 3496 -B -M\n"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", false, 3496, false)); + assertThat(CcittTiffs.supported(params, 3496)).isTrue(); + } + + @Test + void eolMarkersAreUnsupported() { + CcittTiffs.Params params = parsed("-4 -A -X 100 -W -M"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", true, 100, true)); + // EOL markers are not representable in TIFF T.6. 
+ assertThat(CcittTiffs.supported(params, 100)).isFalse(); + } + + @Test + void group3IsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-2 -P -X 100 -B -M"), 100)).isFalse(); + } + + @Test + void widthMismatchIsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-4 -P -X 100 -B -M"), 200)).isFalse(); + } + + /** Parse params the test asserts are well-formed, made non-null for NullAway. */ + private static CcittTiffs.Params parsed(String text) { + return java.util.Objects.requireNonNull(CcittTiffs.parseParams(text)); + } + + @Test + void unknownTokensAndMissingFlagsAreUnparsable() { + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B -M -Z")).isNull(); // unknown flag + assertThat(CcittTiffs.parseParams("-4 -P -B -M")).isNull(); // no -X + assertThat(CcittTiffs.parseParams("-4 -P -X nope -B -M")).isNull(); // bad width + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B")).isNull(); // no -M + assertThat(CcittTiffs.parseParams("")).isNull(); + } + + // ---- TIFF wrapping ---- + + /** + * Round trip: draw a known bitonal pattern, encode it to a raw G4 stream with PDFBox's CCITT + * encoder (the very encoding a scanner PDF embeds and {@code pdfimages -ccitt} dumps), wrap the + * stream with {@link CcittTiffs#writeSingleStripG4}, and assert it decodes back through + * Leptonica pixel-identical to the original, with the stamped resolution. 
+ */ + @Test + void wrappedStreamDecodesBackPixelIdentical(@TempDir Path tmp) throws Exception { + int width = 200; + int height = 150; + BufferedImage img = pattern(width, height); + G4Stream g4 = encodeG4(img); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 450); + + Path referencePng = tmp.resolve("reference.png"); + ImageIO.write(img, "png", referencePng.toFile()); + try (Pix expected = Pix.read(referencePng); + Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.height()).isEqualTo(height); + assertThat(actual.resolution()).isEqualTo(450); + assertThat(actual.blackPixels()).isPositive(); + assertThat(actual.pixelsEqual(expected)).isTrue(); + } + } + + @Test + void omitsResolutionTagsWhenDpiUnknown(@TempDir Path tmp) throws Exception { + int width = 64; + int height = 48; + G4Stream g4 = encodeG4(pattern(width, height)); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 0); + + try (Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.resolution()).isZero(); + } + } + + /** A raw G4 (T.6) stream and the {@code BlackIs1} convention its encoder declared. */ + private static final class G4Stream { + final byte[] bytes; + final boolean blackIs1; + + G4Stream(byte[] bytes, boolean blackIs1) { + this.bytes = bytes; + this.blackIs1 = blackIs1; + } + } + + /** + * The raw CCITT G4 stream PDFBox's {@link CCITTFactory} encodes {@code img} to (lifted verbatim + * from the image XObject, exactly the bytes {@code pdfimages -ccitt} would dump), along with + * the {@code BlackIs1} decode parameter it declared. 
+ */ + private static G4Stream encodeG4(BufferedImage img) throws IOException { + try (PDDocument doc = new PDDocument()) { + PDImageXObject image = CCITTFactory.createFromImage(doc, img); + COSDictionary decodeParms = + (COSDictionary) image.getCOSObject().getDictionaryObject(COSName.DECODE_PARMS); + boolean blackIs1 = + decodeParms != null && decodeParms.getBoolean(COSName.BLACK_IS_1, false); + try (InputStream in = image.getCOSObject().createRawInputStream()) { + return new G4Stream(in.readAllBytes(), blackIs1); + } + } + } + + /** A deterministic bitonal pattern with structure (bars + a block). */ + private static BufferedImage pattern(int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + for (int x = 4; x < width - 8; x += 12) { + g.fillRect(x, 8, 6, height - 16); + } + g.fillRect(width / 3, height / 3, width / 3, height / 3); + } finally { + g.dispose(); + } + return img; + } +} diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java index c9032d2..9b7d1c5 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java @@ -94,6 +94,69 @@ void dominantDpiReturnsAPositiveResolution(@TempDir Path tmp) throws Exception { assertThat(dpi).isPositive(); } + /** Build a multi-page PDF embedding CCITT-G4 bitonal images, one per page, at ~200 ppi. 
*/ + private static void writeCcittPdf(Path pdf, int pages, int imgW, int imgH) throws IOException { + try (PDDocument doc = new PDDocument()) { + for (int i = 0; i < pages; i++) { + BufferedImage bitonal = + new BufferedImage(imgW, imgH, BufferedImage.TYPE_BYTE_BINARY); + java.awt.Graphics2D g = bitonal.createGraphics(); + try { + g.setColor(java.awt.Color.WHITE); + g.fillRect(0, 0, imgW, imgH); + g.setColor(java.awt.Color.BLACK); + g.fillRect(10 + i, 10, imgW / 3, imgH / 2); + } finally { + g.dispose(); + } + float wPt = imgW * 72f / 200; + float hPt = imgH * 72f / 200; + PDPage page = new PDPage(new PDRectangle(wPt, hPt)); + doc.addPage(page); + PDImageXObject image = + org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory.createFromImage( + doc, bitonal); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, wPt, hPt); + } + } + doc.save(pdf.toFile()); + } + } + + @Test + @EnabledIf("io.github.p4suta.shared.pdf.PdfImagesCliExtractorTest#toolsOnPath") + void remuxesAnAllCcittSourceIntoStampedG4Tiffs(@TempDir Path tmp) throws Exception { + Path pdf = tmp.resolve("scan.pdf"); + writeCcittPdf(pdf, 3, 240, 180); + Path outDir = Files.createDirectory(tmp.resolve("out")); + + ExecutorService pool = Executors.newFixedThreadPool(2); + try { + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); + } finally { + pool.shutdownNow(); + } + + try (Stream entries = Files.list(outDir)) { + List files = entries.sorted().toList(); + // The remux leaves exactly one .tif per page — no .ccitt/.params residue. 
+ assertThat(files).hasSize(3); + assertThat(files).allSatisfy(p -> assertThat(p.toString()).endsWith(".tif")); + for (Path tif : files) { + try (io.github.p4suta.shared.imaging.Pix pix = + io.github.p4suta.shared.imaging.Pix.read(tif)) { + assertThat(pix.width()).isEqualTo(240); + assertThat(pix.height()).isEqualTo(180); + // The remux stamps the image's true ppi instead of pdfimages' default 72. + assertThat(pix.resolution()).isEqualTo(200); + // Black ink, not inverted: the drawn block is ~1/6 of the page. + assertThat(pix.blackPixels()).isGreaterThan(0).isLessThan(240L * 180 / 2); + } + } + } + } + @Test void missingToolFailsWithAClearMessage(@TempDir Path tmp) throws Exception { Path pdf = tmp.resolve("doc.pdf"); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java index f228032..2bba8d0 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java @@ -120,4 +120,27 @@ void parseDominantDpiSkipsNonImageRowsWithTooFewFields() { PdfListingParser.DEFAULT_DPI, PdfListingParser.parseDominantDpi("hdr\n----\n 1 0 smask\n")); } + + @Test + void parseImageRowsReadsTheColumnsTheExtractorNeeds() { + var rows = PdfListingParser.parseImageRows(LIST); + assertEquals(3, rows.size()); + assertEquals(new PdfListingParser.ImageRow(1, 2480, 3508, 1, "ccitt", 300), rows.get(0)); + assertEquals(new PdfListingParser.ImageRow(3, 1240, 1754, 1, "ccitt", 150), rows.get(2)); + } + + @Test + void parseImageRowsSkipsMalformedAndNonImageRows() { + String mixed = + """ + page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio + -------------------------------------------------------------------------------------------- + 1 0 smask 2480 3508 gray 1 1 ccitt no 7 0 300 300 101K 1.2% + 2 1 image bad 3508 gray 1 1 ccitt no 11 0 300 
300 99K 1.1% + 3 2 image 2480 3508 rgb 3 8 jpeg no 14 0 150 150 40K 1.0% + """; + var rows = PdfListingParser.parseImageRows(mixed); + assertEquals(1, rows.size()); + assertEquals(new PdfListingParser.ImageRow(3, 2480, 3508, 8, "jpeg", 150), rows.get(0)); + } } From 97b3db2045fda868854f03331985d1e738324f13 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:06:57 +0900 Subject: [PATCH 3/4] chore(lint): teach typos the PDF spec's DecodeParms, reword "mis-sizes" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI spell check flags PDFBox's COSName.DECODE_PARMS (the PDF spec's own key name, which the remux test must name verbatim) and a hyphenated coinage in the extractor's javadoc. Allowlist the spec identifier — the same precedent as the veraPDF en-GB names — and use plain words. Co-Authored-By: Claude Fable 5 --- _typos.toml | 4 ++++ .../io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index df533d6..b9800fd 100644 --- a/_typos.toml +++ b/_typos.toml @@ -45,3 +45,7 @@ Pagent = "Pagent" Flavour = "Flavour" flavours = "flavours" initialise = "initialise" +# The PDF spec's own key name `DecodeParms` (and PDFBox's COSName.DECODE_PARMS +# constant mirroring it) — third-party/spec identifiers we must name verbatim. 
+Parms = "Parms" +PARMS = "PARMS" diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 2c4bfac..77ed395 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -237,7 +237,7 @@ private static boolean wrapChunk(String prefix, List /** * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the - * params file, fails or mis-sizes here). + * params file, fails to decode or comes back the wrong size here). */ private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { try (Pix pix = Pix.read(tif)) { From 4c1a950bca9f56710cc15b699f688f4b3f492380 Mon Sep 17 00:00:00 2001 From: Yasunobu <42543015+P4suta@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:16:29 +0900 Subject: [PATCH 4/4] docs: align the generated baseline docs with main Squash merges orphan the stack's ancestry, so the benchmark documents (regenerated on every bench run) collide as add/add between this branch and main. Align them to main's version; the round's closing PR commits the final regenerated baselines, so no information is lost from the final state. The measured numbers this PR contributed remain in its commit message and PR description. Co-Authored-By: Claude Fable 5 --- pipeline/docs/perf-baseline.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md index 06ea504..9727575 100644 --- a/pipeline/docs/perf-baseline.md +++ b/pipeline/docs/perf-baseline.md @@ -6,7 +6,7 @@ of the installDist `pdfbook` launcher. 
Re-run after any change to the pipeline and compare against the previous run before merging (acceptance: ≥5% median total-wall improvement, or an explicit RSS/disk win, with output validated). -- Date (UTC): 2026-06-10 05:11:24 +- Date (UTC): 2026-06-10 04:46:07 - Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi - Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` - Samples per measurement: cold (1st run) + warm median of 3. @@ -21,8 +21,8 @@ total-wall improvement, or an explicit RSS/disk win, with output validated). | Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | |---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 200 | 46.18s | 45.98s | 0.88s | 31.95s | 12.94s | 0.20s | 0.20s | 47.85s | 173 | 6.4 | -| sample-scan-200p.pdf | 8 | 200 | 14.43s | 14.23s | 0.46s | 10.19s | 3.28s | 0.20s | 0.20s | 14.16s | 322 | 6.4 | +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | ## Stage shares (of conv, warm median) @@ -31,5 +31,5 @@ cannot pay for a parallelization rewrite no matter how elegant. | Input | Jobs | extract | despeckle | register | spread | |---|---|---:|---:|---:|---:| -| sample-scan-200p.pdf | 1 | 1.9% | 69.5% | 28.1% | 0.4% | -| sample-scan-200p.pdf | 8 | 3.2% | 71.6% | 23.0% | 1.4% | +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% |