diff --git a/_typos.toml b/_typos.toml index df533d6..b9800fd 100644 --- a/_typos.toml +++ b/_typos.toml @@ -45,3 +45,7 @@ Pagent = "Pagent" Flavour = "Flavour" flavours = "flavours" initialise = "initialise" +# The PDF spec's own key name `DecodeParms` (and PDFBox's COSName.DECODE_PARMS +# constant mirroring it) — third-party/spec identifiers we must name verbatim. +Parms = "Parms" +PARMS = "PARMS" diff --git a/pipeline/README.md b/pipeline/README.md index b8d47ef..12d15ef 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -33,6 +33,7 @@ never stops the rest; existing outputs are skipped unless `--force`). | `--pdf-a` | off | emit PDF/A-2b conformance | | `--force` | off | overwrite an existing output (batch: regenerate, don't skip) | | `--progress-file ` | — | write machine-readable JSONL progress events (single input only) | +| `--timings` | off | print a per-stage wall-clock breakdown to stderr when each run ends | | `-i, --interactive` | off | guided mode: prompt for the input, options and output | | `-h, --help` | — | show help and exit | | `-V, --version` | — | print version and exit | diff --git a/pipeline/app/build.gradle.kts b/pipeline/app/build.gradle.kts index 9764cfe..5c2572d 100644 --- a/pipeline/app/build.gradle.kts +++ b/pipeline/app/build.gradle.kts @@ -34,6 +34,10 @@ dependencies { implementation(libs.commons.cli) implementation(libs.slf4j.api) runtimeOnly(libs.slf4j.simple) + + // The benchmark fixture generator (test sources, never shipped — mirroring register's + // createSamplePdf) draws synthetic scan pages with PDFBox directly. + testImplementation(libs.pdfbox) } // The one place native access is granted to the launched app; run, test and JavaExec inherit it. @@ -85,3 +89,46 @@ selfContainedApp { // jbig2 (its register stage writes TIFF-G4; the spread pack embeds CCITT G4). 
bundleQpdf(this, libs.versions.qpdf.get()) } + +// ---- Stage-level benchmark (see pipeline/docs/perf-baseline.md) --------------------------------- + +// Deterministic synthetic scan book for the benchmark: an existing output is reused, so the +// generation cost (a minute at 200 pages × 600 dpi) is paid once. Knob: -Ppages=N (default 200). +tasks.register("createSampleScan") { + group = "verification" + description = "Generate the synthetic bitonal scan book the benchmark converts (cached)" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.SampleScanGenerator" + val pages = providers.gradleProperty("pages").getOrElse("200") + args = listOf("build/test-data/sample-scan-${pages}p.pdf", pages, "600") +} + +// Stage-level runtime + memory benchmark (the pdfbook counterpart of tate's benchRuntime): runs the +// installDist launcher in-container with --timings, parses the per-stage breakdown, samples peak +// RSS from /proc, and writes pipeline/docs/perf-baseline.md. Knobs: -Pruns=N (warm runs, default +// 3), -Pjobs=1,8 (comma-separated -j sweep; default auto = the launcher's CPU-count default), +// -Ppages=N (fixture size, default 200), -Pinputs="a.pdf b.pdf" (real books instead of the +// fixture; resolved against the repo root). 
+tasks.register("benchPipeline") { + group = "verification" + description = "Benchmark pdfbook stage timings + peak memory; writes pipeline/docs/perf-baseline.md" + dependsOn(tasks.named("installDist"), tasks.named("createSampleScan")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.PipelineBenchmark" + workingDir = rootDir + val runs = providers.gradleProperty("runs").getOrElse("3") + val jobs = providers.gradleProperty("jobs").getOrElse("auto") + val pages = providers.gradleProperty("pages").getOrElse("200") + val extraInputs = + providers + .gradleProperty("inputs") + .orNull + ?.split(Regex("\\s+")) + ?.filter { it.isNotBlank() } + ?: emptyList() + val launcher = "pipeline/app/build/install/pdfbook/bin/pdfbook" + val inputs = + extraInputs.ifEmpty { listOf("pipeline/app/build/test-data/sample-scan-${pages}p.pdf") } + args = listOf(launcher, "qpdf", "pipeline/docs/perf-baseline.md", runs, jobs) + inputs +} diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index 99e453c..25f8125 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -153,6 +153,13 @@ private static Options buildOptions() { "Write machine-readable JSONL progress events to this file (single" + " input only); used by front ends to report progress.") .get()); + options.addOption( + Option.builder() + .longOpt("timings") + .desc( + "Print a per-stage wall-clock breakdown to stderr when each run" + + " ends.") + .get()); CliDocs.options(options); return options; } @@ -351,6 +358,7 @@ record Plan(Path input, Path output, Config config) {} deskew, scale, pdfA, + false, force); return new Plan(input, output, config); } @@ -380,14 +388,32 @@ private static String defaultOutput(Path input) { private static void 
runOne(Path input, Path output, Config config, @Nullable Path progressFile) throws IOException { if (progressFile == null) { - runWith(input, output, config, ProgressSink.NO_OP); + runWith(input, output, config, withTimings(config, ProgressSink.NO_OP)); } else { try (JsonlFileProgressSink progress = new JsonlFileProgressSink(progressFile)) { - runWith(input, output, config, progress); + runWith(input, output, config, withTimings(config, progress)); } } } + /** + * Wraps {@code sink} with a fresh {@link StageTimingSink} when {@code --timings} is set, so + * each run (every book of a batch separately) prints its own per-stage breakdown to stderr. + */ + private static ProgressSink withTimings(Config config, ProgressSink sink) { + if (!config.timings()) { + return sink; + } + StageTimingSink timings = new StageTimingSink(System.err); + if (sink == ProgressSink.NO_OP) { + return timings; + } + return event -> { + sink.emit(event); + timings.emit(event); + }; + } + // Resolves the progress sink first so the stages and sink report page-level PageProcessed // events into the same sink PipelineRunner reports stage boundaries into. With no // --progress-file the sink is NO_OP and every emit is a no-op. @@ -401,9 +427,11 @@ private static void runWith(Path input, Path output, Config config, ProgressSink stages.add(new RegisterStage(config.jobs(), config.deskew(), config.scale(), progress)); } if (stages.isEmpty()) { - // --no-despeckle --no-register: the raw pdfimages TIFFs are not CCITT G4, which the - // spread sink's pass-through embedding requires; despeckle/register each re-encode G4 - // themselves, so only the no-stage path needs this normalization. 
+ // --no-despeckle --no-register: a non-CCITT source extracts as decoded TIFFs that are + // not the single-strip CCITT G4 the spread sink's pass-through embedding requires; + // despeckle/register each re-encode G4 themselves, so only the no-stage path needs + // this normalization (an all-CCITT source arrives already G4 — then this is a cheap + // lossless re-encode that keeps the path uniform). stages.add(new G4EncodeStage(config.jobs(), progress)); } Source source = new PdfExtractSource(input, config.jobs()); @@ -447,6 +475,7 @@ private static Config parseConfig(CommandLine cmd) throws ParseException { !cmd.hasOption("no-deskew"), !cmd.hasOption("no-scale"), cmd.hasOption("pdf-a"), + cmd.hasOption("timings"), cmd.hasOption("force")); } @@ -475,5 +504,6 @@ record Config( boolean deskew, boolean scale, boolean pdfA, + boolean timings, boolean force) {} } diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java new file mode 100644 index 0000000..496b0e4 --- /dev/null +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java @@ -0,0 +1,93 @@ +package io.github.p4suta.pipeline.cli; + +import io.github.p4suta.shared.progress.ProgressEvent; +import io.github.p4suta.shared.progress.ProgressSink; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.jspecify.annotations.Nullable; + +/** + * Measures each stage's wall clock from its {@link ProgressEvent.StageStarted}/{@link + * ProgressEvent.StageCompleted} boundaries and prints a per-stage breakdown when the run ends — the + * {@code --timings} flag's implementation. One line per stage in completion order, then the + * run-wide total: + * + *
{@code
+ * timing: extract = 4.21s (18.3%)
+ * timing: despeckle = 9.87s (42.9%)
+ * timing: total = 23.01s
+ * }
+ * + *

The {@code timing: <stage> = <seconds>s} shape is a stable contract the {@code benchPipeline} + * harness parses; keep it machine-readable. A stage still open when the run fails is reported with + * its elapsed-so-far, so a failed run still shows where the time went. Thread-safe like every + * {@link ProgressSink}: events are handled under one lock. + */ +final class StageTimingSink implements ProgressSink { + + private final PrintStream out; + private final Object lock = new Object(); + private final List stages = new ArrayList<>(); + private final List stageNanos = new ArrayList<>(); + private @Nullable String openStage; + private long openedAtNanos; + private long runStartedAtNanos; + private boolean runStarted; + + StageTimingSink(PrintStream out) { + this.out = out; + } + + @Override + public void emit(ProgressEvent event) { + synchronized (lock) { + switch (event) { + case ProgressEvent.RunStarted ignored -> markRunStarted(); + case ProgressEvent.StageStarted s -> { + // Defensive: a sink wired mid-run still measures from the first boundary. + markRunStarted(); + openStage = s.stage(); + openedAtNanos = System.nanoTime(); + } + case ProgressEvent.StageCompleted ignored -> closeOpenStage(); + case ProgressEvent.PageProcessed ignored -> { + // Stage boundaries carry all the timing information. + } + case ProgressEvent.RunCompleted ignored -> report(); + case ProgressEvent.RunFailed ignored -> report(); + } + } + } + + private void markRunStarted() { + if (!runStarted) { + runStartedAtNanos = System.nanoTime(); + runStarted = true; + } + } + + private void closeOpenStage() { + @Nullable String stage = openStage; + if (stage != null) { + stages.add(stage); + stageNanos.add(System.nanoTime() - openedAtNanos); + openStage = null; + } + } + + private void report() { + closeOpenStage(); + long totalNanos = runStarted ? 
System.nanoTime() - runStartedAtNanos : 0; + for (int i = 0; i < stages.size(); i++) { + out.printf( + Locale.ROOT, + "timing: %s = %.2fs (%.1f%%)%n", + stages.get(i), + stageNanos.get(i) / 1e9, + totalNanos > 0 ? stageNanos.get(i) * 100.0 / totalNanos : 0.0); + } + out.printf(Locale.ROOT, "timing: total = %.2fs%n", totalNanos / 1e9); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java new file mode 100644 index 0000000..3fbb3f7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java @@ -0,0 +1,75 @@ +package io.github.p4suta.pipeline.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.progress.ProgressEvent; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import org.junit.jupiter.api.Test; + +/** + * Pins the {@code --timings} report: one machine-parseable {@code timing: <stage> = <seconds>s} + * line per completed stage (in completion order, percentages attached) plus a {@code timing: total} + * line, printed only when the run ends — and on failure, the still-open stage is reported with its + * elapsed-so-far. The line shape is the contract the {@code benchPipeline} harness parses. 
+ */ +final class StageTimingSinkTest { + + private final ByteArrayOutputStream buf = new ByteArrayOutputStream(); + private final StageTimingSink sink = + new StageTimingSink(new PrintStream(buf, true, StandardCharsets.UTF_8)); + + private String output() { + return buf.toString(StandardCharsets.UTF_8); + } + + @Test + void completedRunReportsEachStageInOrderAndATotal() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.PageProcessed("extract", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("spread", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("spread")); + sink.emit(new ProgressEvent.RunCompleted()); + + assertThat(output().lines()) + .hasSize(3) + .satisfiesExactly( + extract -> + assertThat(extract) + .matches( + "timing: extract = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + spread -> + assertThat(spread) + .matches( + "timing: spread = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + total -> assertThat(total).matches("timing: total = \\d+\\.\\d{2}s")); + } + + @Test + void nothingIsPrintedBeforeTheRunEnds() { + sink.emit(new ProgressEvent.RunStarted(1)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 1)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + + assertThat(output()).isEmpty(); + } + + @Test + void failedRunReportsTheStillOpenStage() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("register", 1, 2)); + sink.emit(new ProgressEvent.RunFailed("INTERNAL", "boom")); + + assertThat(output()) + .contains("timing: extract = ") + .contains("timing: register = ") + .contains("timing: total = "); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java 
b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java new file mode 100644 index 0000000..1b59397 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java @@ -0,0 +1,498 @@ +package io.github.p4suta.pipeline.tools; + +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Stage-level runtime + memory benchmark for the installed {@code pdfbook} launcher — the pdfbook + * counterpart of tate's {@code RuntimeBenchmark}, with per-stage attribution as the addition. + * + *

Drives {@code pdfbook <input> -o <work>/out.pdf --force --timings [-j N]} as a child process, + * measuring end-to-end wall around the process ({@link System#nanoTime()}), peak RSS by sampling + * the child's {@code /proc/<pid>/status} {@code VmHWM} (Linux-only; {@code n/a} elsewhere), and the + * per-stage wall by parsing the stable {@code timing: <stage> = <seconds>s} lines {@code + * StageTimingSink} prints. Writes a Markdown report. + * + *

Test-sources tool (driven by the {@code benchPipeline} Gradle task): it never ships in the + * production launcher, and it expects the dev container's native toolchain (pdfimages, Leptonica, + * qpdf) on PATH — the installDist launcher, not the jpackage image, is what it measures. + * + *

Usage: {@code PipelineBenchmark <launcher> <qpdf> <outDoc> <runs> <jobsCsv> + * <input.pdf>...} — {@code jobsCsv} is a comma-separated {@code -j} sweep ({@code auto} = omit + * {@code -j}, i.e. the launcher's CPU-count default). + */ +public final class PipelineBenchmark { + + private static final Pattern TIMING = + Pattern.compile("^timing: (\\S+) = ([0-9.]+)s", Pattern.MULTILINE); + private static final Pattern VM_HWM = Pattern.compile("VmHWM:\\s*([0-9]+)"); + private static final long POLL_MILLIS = 5; + private static final long PROCESS_TIMEOUT_NANOS = TimeUnit.MINUTES.toNanos(30); + private static final long MIB = 1024L * 1024L; + + private final Path launcher; + private final String qpdf; + private final Path outDoc; + private final int runs; + private final List jobsSweep; + + private PipelineBenchmark( + Path launcher, String qpdf, Path outDoc, int runs, List jobsSweep) { + this.launcher = launcher; + this.qpdf = qpdf; + this.outDoc = outDoc; + this.runs = runs; + this.jobsSweep = jobsSweep; + } + + public static void main(String[] args) throws IOException, InterruptedException { + if (args.length < 6) { + System.err.println( + "usage: PipelineBenchmark " + + " ..."); + System.exit(2); + return; + } + var benchmark = + new PipelineBenchmark( + Path.of(args[0]), + args[1], + Path.of(args[2]), + Integer.parseInt(args[3]), + Arrays.stream(args[4].split(",")).map(String::trim).toList()); + List inputs = Arrays.stream(args).skip(5).map(Path::of).toList(); + benchmark.run(inputs); + } + + // Result records + + /** One measured child run: wall seconds, peak RSS (KiB, -1 if unavailable), merged output. */ + private record Timed(double elapsedSeconds, long maxRssKib, String output) {} + + /** A finished input × jobs measurement, ready to render. 
*/ + private record Row( + String name, + String jobs, + int pages, + long inputBytes, + double wallMedian, + double coldWall, + Map stageMedians, + long rssMedianKib, + long outputBytes) {} + + // Orchestration + + private void run(List inputs) throws IOException, InterruptedException { + requireExecutable(launcher, "pdfbook launcher", "build it first: just pdfbook-install"); + + List rows = new ArrayList<>(); + for (Path input : inputs) { + if (!Files.isRegularFile(input)) { + System.err.println("skip (not found): " + input); + continue; + } + for (String jobs : jobsSweep) { + rows.add(measure(input, jobs)); + } + } + + String report = render(rows); + Files.createDirectories(requireParent(outDoc)); + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } + + private Row measure(Path input, String jobs) throws IOException, InterruptedException { + int pages = pageCount(input); + long inputBytes = Files.size(input); + System.err.printf( + Locale.ROOT, + "Measuring: %s (%dp, %s MiB, jobs=%s)…%n", + fileName(input), + pages, + mib(inputBytes), + jobs); + + Path work = Files.createTempDirectory("pdfbook-bench"); + try { + Path out = work.resolve("out.pdf"); + List convert = new ArrayList<>(); + convert.add(launcher.toString()); + convert.add(input.toString()); + convert.add("-o"); + convert.add(out.toString()); + convert.add("--force"); + convert.add("--timings"); + if (!"auto".equals(jobs)) { + convert.add("-j"); + convert.add(jobs); + } + + // Cold run (fresh page cache for the input is not guaranteed, but a fresh JVM is) — + // recorded separately from the warm median. 
+ Timed cold = timed(convert); + + double[] walls = new double[runs]; + long[] rsss = new long[runs]; + Map> stages = new LinkedHashMap<>(); + for (int r = 0; r < runs; r++) { + Timed t = timed(convert); + walls[r] = t.elapsedSeconds(); + rsss[r] = t.maxRssKib(); + parseTimings(t.output()) + .forEach( + (stage, seconds) -> + stages.computeIfAbsent(stage, ignored -> new ArrayList<>()) + .add(seconds)); + } + Map stageMedians = new LinkedHashMap<>(); + stages.forEach((stage, seconds) -> stageMedians.put(stage, median(seconds))); + + long outputBytes = Files.isRegularFile(out) ? Files.size(out) : -1; + return new Row( + fileName(input), + jobs, + pages, + inputBytes, + median(walls), + cold.elapsedSeconds(), + stageMedians, + medianLong(rsss), + outputBytes); + } finally { + deleteTree(work); + } + } + + /** + * The per-stage seconds of one run, keyed by stage label in print order ({@code total} + * included). Repeated labels (a batch run) sum, though the harness always converts one book. + */ + private static Map parseTimings(String output) { + Map timings = new LinkedHashMap<>(); + Matcher m = TIMING.matcher(output); + while (m.find()) { + timings.merge(m.group(1), Double.parseDouble(m.group(2)), Double::sum); + } + return timings; + } + + // Subprocess measurement + + /** + * Runs {@code command}, returning its wall time, peak RSS (sampled from {@code + * /proc//status} {@code VmHWM}; -1 where unavailable), and its merged stdout+stderr (so + * the {@code timing:} lines can be parsed). Output is drained on a separate thread so a chatty + * child cannot deadlock on a full pipe. 
+ */ + private static Timed timed(List command) throws IOException, InterruptedException { + long start = System.nanoTime(); + Process process = new ProcessBuilder(command).redirectErrorStream(true).start(); + + var captured = new AtomicReference<>(""); + Thread drainer = + Thread.ofVirtual() + .start( + () -> { + try (var in = process.getInputStream()) { + captured.set( + new String( + in.readAllBytes(), StandardCharsets.UTF_8)); + } catch (IOException ignored) { + // Process gone; whatever was read is lost — acceptable for + // a benchmark. + } + }); + + Path status = Path.of("/proc", Long.toString(process.pid()), "status"); + long peakRssKib = -1; + while (process.isAlive()) { + if (System.nanoTime() - start > PROCESS_TIMEOUT_NANOS) { + process.destroyForcibly(); + throw new IOException("timed command did not finish: " + command); + } + peakRssKib = Math.max(peakRssKib, readVmHwmKib(status)); + Thread.sleep(POLL_MILLIS); + } + double elapsed = (System.nanoTime() - start) / 1.0e9; + int exit = process.waitFor(); + drainer.join(); + if (exit != 0) { + throw new IOException( + "benchmark child failed with exit " + exit + ": " + captured.get()); + } + return new Timed(elapsed, peakRssKib, captured.get()); + } + + /** Peak RSS (KiB) from {@code /proc//status}, or -1 if unreadable / non-Linux. */ + private static long readVmHwmKib(Path status) { + try { + Matcher m = VM_HWM.matcher(Files.readString(status, StandardCharsets.UTF_8)); + return m.find() ? Long.parseLong(m.group(1)) : -1; + } catch (IOException | RuntimeException e) { + return -1; // process already exited, or /proc not present + } + } + + /** Page count via {@code qpdf --show-npages} (PATH or absolute), or -1 when unavailable. 
*/ + private int pageCount(Path pdf) throws InterruptedException { + try { + Process process = + new ProcessBuilder(qpdf, "--show-npages", pdf.toString()) + .redirectErrorStream(true) + .start(); + String output; + try (var in = process.getInputStream()) { + output = new String(in.readAllBytes(), StandardCharsets.UTF_8); + } + if (!process.waitFor(1, TimeUnit.MINUTES)) { + process.destroyForcibly(); + return -1; + } + // The count is the one digits-only line; qpdf may surround it with warning lines. + return output.lines() + .map(String::strip) + .filter(line -> line.matches("\\d+")) + .findFirst() + .map(Integer::parseInt) + .orElse(-1); + } catch (IOException e) { + return -1; // qpdf not installed — page count is cosmetic here + } + } + + // Numeric helpers + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + if (n == 0) { + return 0; + } + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + private static double median(List values) { + return median(values.stream().mapToDouble(Double::doubleValue).toArray()); + } + + private static long medianLong(long[] values) { + return Math.round(median(Arrays.stream(values).asDoubleStream().toArray())); + } + + private static String mib(long bytes) { + return bytes < 0 ? "n/a" : String.format(Locale.ROOT, "%.1f", bytes / (double) MIB); + } + + // Rendering + + private String render(List rows) { + // Stage columns: the union of stage labels across rows in first-appearance order, with + // "total" (the launcher's in-process conversion time) pulled out as its own column. 
+ Set stageNames = new LinkedHashSet<>(); + for (Row row : rows) { + stageNames.addAll(row.stageMedians().keySet()); + } + stageNames.remove("total"); + + var sb = new StringBuilder(); + sb.append("# pdfbook runtime baseline (stage-level)\n\n") + .append("Generated by `PipelineBenchmark`") + .append(" (`./gradlew :pipeline:app:benchPipeline`, in the dev container).\n") + .append( + "Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak" + + " memory**\n") + .append( + "of the installDist `pdfbook` launcher. Re-run after any change to the" + + " pipeline\n") + .append( + "and compare against the previous run before merging (acceptance: ≥5%" + + " median\n") + .append( + "total-wall improvement, or an explicit RSS/disk win, with output" + + " validated).\n\n"); + appendHostInfo(sb); + sb.append("\n## Stage breakdown (warm median of ").append(runs).append(" runs)\n\n"); + sb.append( + "`conv` is the launcher's in-process total (`timing: total`);" + + " `startup+init` = E2E wall − conv\n") + .append( + "(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's" + + " CPU-count default.\n\n"); + sb.append("| Input | Jobs | Pages | E2E wall | conv |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append(" startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) |\n"); + sb.append("|---|---|---:|---:|---:|"); + sb.append("---:|".repeat(stageNames.size())); + sb.append("---:|---:|---:|---:|\n"); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ") + .append(row.name()) + .append(" | ") + .append(row.jobs()) + .append(" | ") + .append(pages(row.pages())) + .append(" | ") + .append(secs(row.wallMedian())) + .append(" | ") + .append(conv > 0 ? secs(conv) : "n/a") + .append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ').append(seconds == null ? 
"n/a" : secs(seconds)).append(" |"); + } + sb.append(' ') + .append(conv > 0 ? secs(Math.max(0, row.wallMedian() - conv)) : "n/a") + .append(" | ") + .append(secs(row.coldWall())) + .append(" | ") + .append(rssMib(row.rssMedianKib())) + .append(" | ") + .append(mib(row.outputBytes())) + .append(" |\n"); + } + sb.append("\n## Stage shares (of conv, warm median)\n\n") + .append( + "The shares that decide where optimization effort goes: a stage that is" + + " ~5% of conv\n") + .append("cannot pay for a parallelization rewrite no matter how elegant.\n\n"); + sb.append("| Input | Jobs |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append('\n').append("|---|---|").append("---:|".repeat(stageNames.size())).append('\n'); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ").append(row.name()).append(" | ").append(row.jobs()).append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ') + .append( + seconds == null || conv <= 0 + ? "n/a" + : String.format( + Locale.ROOT, "%.1f%%", seconds * 100.0 / conv)) + .append(" |"); + } + sb.append('\n'); + } + return sb.toString(); + } + + private void appendHostInfo(StringBuilder sb) { + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + long totalRamBytes = totalPhysicalMemoryBytes(); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.version", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs, RAM ") + .append(totalRamBytes > 0 ? 
Math.round(totalRamBytes / 1.073741824e9) + "Gi" : "?") + .append('\n'); + sb.append("- Launcher: `").append(launcher).append("`\n"); + sb.append("- Samples per measurement: cold (1st run) + warm median of ") + .append(runs) + .append(".\n"); + sb.append( + "- The default input is the deterministic synthetic fixture" + + " (`createSampleScan`,\n") + .append( + " seeded, so identical across machines). Real books are pluggable via" + + " `-Pinputs=\"…\"`;\n") + .append(" only their page count and byte size are reported.\n"); + } + + private static long totalPhysicalMemoryBytes() { + if (ManagementFactory.getOperatingSystemMXBean() + instanceof com.sun.management.OperatingSystemMXBean os) { + return os.getTotalMemorySize(); + } + return -1; + } + + private static String secs(double seconds) { + return String.format(Locale.ROOT, "%.2fs", seconds); + } + + private static String pages(int pages) { + return pages < 0 ? "?" : Integer.toString(pages); + } + + private static String rssMib(long rssKib) { + return rssKib < 0 ? "n/a" : Long.toString(Math.round(rssKib / 1024.0)); + } + + // Small utilities + + private static String fileName(Path path) { + Path name = path.getFileName(); + return name != null ? 
name.toString() : path.toString(); + } + + private static void requireExecutable(Path path, String what, String hint) { + if (!Files.isExecutable(path)) { + System.err.println("error: " + what + " not found at " + path); + System.err.println(" " + hint); + System.exit(1); + } + } + + private static Path requireParent(Path path) { + Path parent = path.getParent(); + if (parent == null) { + throw new IllegalArgumentException("output path has no parent: " + path); + } + return parent; + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java new file mode 100644 index 0000000..8b111a7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java @@ -0,0 +1,135 @@ +package io.github.p4suta.pipeline.tools; + +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Random; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +/** + * Generates the synthetic, copyright-free bitonal "scan" book the {@code benchPipeline} harness + * converts: A5 pages at the requested dpi carrying vertical 
text-like columns with per-page + * position jitter, a small per-page skew of up to ±0.5° (so the register stage's deskew has real + * work) and salt-and-pepper specks (so despeckle has real work), embedded as CCITT G4 so {@code + * pdfimages} extracts them exactly like a real scan. A fixed seed keeps every generation + * byte-identical, so benchmark runs stay comparable across machines and branches. + * + *

This deliberately lives in test sources (driven by the {@code createSampleScan} Gradle task), + * mirroring register's {@code SamplePdfGenerator}: the dev tool never ships in the production + * launcher. An existing output is reused, so repeated benchmark runs skip the generation cost. + * + *

Usage: {@code SampleScanGenerator [pages] [dpi]} + */ +public final class SampleScanGenerator { + + private SampleScanGenerator() {} + + /** {@code SampleScanGenerator [pages] [dpi]} — writes the synthetic scan book. */ + public static void main(String[] args) throws IOException { + Path out = Path.of(args.length > 0 ? args[0] : "sample-scan.pdf"); + int pages = args.length > 1 ? Integer.parseInt(args[1]) : 200; + int dpi = args.length > 2 ? Integer.parseInt(args[2]) : 600; + if (Files.isRegularFile(out)) { + System.out.println("reusing existing " + out + " (delete it to regenerate)"); + return; + } + long start = System.nanoTime(); + write(out, pages, dpi); + System.out.printf( + Locale.ROOT, + "wrote %s: %d page(s) at %d dpi, %.1f MiB in %.1fs%n", + out, + pages, + dpi, + Files.size(out) / (1024.0 * 1024.0), + (System.nanoTime() - start) / 1e9); + } + + /** Writes a {@code pages}-page synthetic bitonal scan book to {@code out} (A5 geometry). */ + public static void write(Path out, int pages, int dpi) throws IOException { + int width = Math.round(148f * dpi / 25.4f); // A5 portrait: 148 mm × 210 mm + int height = Math.round(210f * dpi / 25.4f); + Random random = new Random(42); + Path parent = out.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + try (PDDocument doc = new PDDocument()) { + float widthPt = width * 72f / dpi; + float heightPt = height * 72f / dpi; + for (int i = 0; i < pages; i++) { + PDImageXObject image = + CCITTFactory.createFromImage(doc, page(width, height, random)); + PDPage page = new PDPage(new PDRectangle(widthPt, heightPt)); + doc.addPage(page); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, widthPt, heightPt); + } + } + doc.save(out.toFile()); + } + } + + /** One page: slightly skewed text-like columns plus unrotated scanner-dust specks. 
*/ + private static BufferedImage page(int width, int height, Random random) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + double skew = Math.toRadians(random.nextDouble() - 0.5); // ±0.5° + g.rotate(skew, width / 2.0, height / 2.0); + drawColumns(g, width, height, random); + g.rotate(-skew, width / 2.0, height / 2.0); + drawSpecks(g, width, height, random); + } finally { + g.dispose(); + } + return img; + } + + /** + * Vertical "text" columns right-to-left (Japanese book layout): stacked glyph-sized blocks with + * per-page jitter so registration has a real column position to detect and correct, and random + * early line breaks so the texture resembles prose rather than a solid block. + */ + private static void drawColumns(Graphics2D g, int width, int height, Random random) { + int margin = width / 10; + int glyph = Math.max(4, width / 60); + int leading = glyph / 2; + int jitterX = random.nextInt(glyph + 1) - glyph / 2; + int top = height / 12 + random.nextInt(glyph + 1); + int bottom = height - height / 12; + for (int x = width - margin - glyph + jitterX; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + // ~8% of glyph slots end the "sentence" early, leaving prose-like white runs. + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + g.fillRect(x, y, glyph - 2, glyph - 2); + y += glyph; + } + } + } + + /** Salt-and-pepper dust: ~1 speck of 1–3 px per 25k pixels, what despeckle exists to remove. 
*/ + private static void drawSpecks(Graphics2D g, int width, int height, Random random) { + int specks = width * height / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + g.fillRect(random.nextInt(width - size), random.nextInt(height - size), size, size); + } + } +} diff --git a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java index 0810505..8ae99b8 100644 --- a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java +++ b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java @@ -70,7 +70,11 @@ public void run( progress.emit(new ProgressEvent.StageStarted(source.name(), position, total)); Corpus corpus = source.open(stageDir(work, 0, source.name())); - log.info("source: {} page(s) at {} dpi", corpus.pageCount(), corpus.dpi()); + log.info( + "source: {} page(s) at {} dpi, {}", + corpus.pageCount(), + corpus.dpi(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(source.name())); position++; @@ -78,7 +82,12 @@ public void run( for (Stage stage : stages) { progress.emit(new ProgressEvent.StageStarted(stage.name(), position, total)); corpus = stage.apply(corpus, stageDir(work, dirIndex, stage.name())); - log.info("stage {} ({}): {} page(s)", dirIndex, stage.name(), corpus.pageCount()); + log.info( + "stage {} ({}): {} page(s), {}", + dirIndex, + stage.name(), + corpus.pageCount(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(stage.name())); position++; dirIndex++; @@ -111,6 +120,28 @@ private static Path stageDir(Path work, int index, String name) throws IOExcepti work.resolve(String.format(Locale.ROOT, "%02d-%s", index, name))); } + /** + * The stage directory's total file bytes rendered as MiB — visibility into how much + * intermediate I/O each stage 
produces (best-effort: {@code ?} when the walk fails). + */ + private static String intermediatesSize(Path dir) { + try (Stream<Path> files = Files.walk(dir)) { + long bytes = + files.filter(Files::isRegularFile).mapToLong(PipelineRunner::sizeQuietly).sum(); + return String.format(Locale.ROOT, "%.1f MiB", bytes / (1024.0 * 1024.0)); + } catch (IOException | java.io.UncheckedIOException e) { // Files.walk also fails lazily + return "? MiB"; + } + } + + private static long sizeQuietly(Path file) { + try { + return Files.size(file); + } catch (IOException e) { + return 0L; + } + } + private static void deleteRecursively(Path dir) { try (Stream<Path> walk = Files.walk(dir)) { walk.sorted(Comparator.reverseOrder()).forEach(PipelineRunner::deleteQuietly); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md new file mode 100644 index 0000000..9727575 --- /dev/null +++ b/pipeline/docs/perf-baseline.md @@ -0,0 +1,35 @@ +# pdfbook runtime baseline (stage-level) + +Generated by `PipelineBenchmark` (`./gradlew :pipeline:app:benchPipeline`, in the dev container). +Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak memory** +of the installDist `pdfbook` launcher. Re-run after any change to the pipeline +and compare against the previous run before merging (acceptance: ≥5% median +total-wall improvement, or an explicit RSS/disk win, with output validated). + +- Date (UTC): 2026-06-10 04:46:07 +- Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi +- Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` +- Samples per measurement: cold (1st run) + warm median of 3. +- The default input is the deterministic synthetic fixture (`createSampleScan`, + seeded, so identical across machines). Real books are pluggable via `-Pinputs="…"`; + only their page count and byte size are reported. + +## Stage breakdown (warm median of 3 runs) + +`conv` is the launcher's in-process total (`timing: total`); `startup+init` = E2E wall − conv +(JVM boot + first-touch PDFBox/AWT init). 
`jobs=auto` is the launcher's CPU-count default. + +| Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | + +## Stage shares (of conv, warm median) + +The shares that decide where optimization effort goes: a stage that is ~5% of conv +cannot pay for a parallelization rewrite no matter how elegant. + +| Input | Jobs | extract | despeckle | register | spread | +|---|---|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java index bcd1008..bb3ad78 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java @@ -18,11 +18,14 @@ /** * The G4-normalization {@link Stage}: re-encodes each extracted page as single-strip CCITT G4 TIFF - * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. {@code - * pdfimages -tiff} writes poppler's default (non-G4) TIFF compression, so the raw extract output - * cannot be embedded directly; despeckle and register each re-encode their output as G4 themselves, - * so the composition root inserts this stage only when neither of them runs. The corpus dpi is - * stamped on every page, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. 
+ * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. The + * extractor's decoded mode ({@code pdfimages -tiff}, used for any source that is not all-CCITT) + * writes poppler's default (non-G4) TIFF compression at a default 72 dpi, so that output cannot be + * embedded directly; despeckle and register each re-encode their output as G4 themselves, so the + * composition root inserts this stage only when neither of them runs. The corpus dpi is stamped on + * every page. (For an all-CCITT source the extractor's remux already produces stamped single-strip + * G4 — this stage then re-encodes losslessly, a small constant cost that keeps the no-stage path + * uniform.) */ public final class G4EncodeStage implements Stage { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java new file mode 100644 index 0000000..1bfd7c3 --- /dev/null +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java @@ -0,0 +1,159 @@ +package io.github.p4suta.shared.pdf; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import org.jspecify.annotations.Nullable; + +/** + * Wraps the raw CCITT G4 stream {@code pdfimages -ccitt} dumps into a single-strip TIFF — the + * pass-through half of the extractor's remux mode: the scan's embedded G4 bytes become a readable + * TIFF without ever being decoded and re-encoded. + * + *

poppler writes a fax2tiff-style {@code .params} file beside each dump ({@code -4} G4 / {@code + * -1 -2} G3, {@code -A} EOL markers / {@code -P} none, {@code -X <width>}, {@code -W} BlackIs1 / + * {@code -B} not, {@code -M} MSB-first). Only the plain shape TIFF's T.6 compression can represent + * verbatim is {@linkplain #supported supported}: G4, no EOL markers, MSB-first. Crucially, PDF's + * {@code EncodedByteAlign} never reaches the params file, so a wrapped stream is only trusted after + * the caller decodes it back successfully (see the extractor's read-back verification). + */ +final class CcittTiffs { + + private static final short TYPE_SHORT = 3; + private static final short TYPE_LONG = 4; + private static final short TYPE_RATIONAL = 5; + + private CcittTiffs() {} + + /** + * The decode parameters poppler records beside a {@code .ccitt} dump. + * + * @param kind the coding scheme flag: {@code -4} (G4), {@code -2} (G3 2D) or {@code -1} (G3 1D) + * @param endOfLine whether rows are prefixed with EOL markers ({@code -A}) + * @param columns the row width in pixels ({@code -X}) + * @param blackIs1 whether decoded 1-bits are black ({@code -W}) or 0-bits are ({@code -B}) + */ + record Params(String kind, boolean endOfLine, int columns, boolean blackIs1) {} + + /** Parse a {@code .params} file's text, or {@code null} when it is malformed or incomplete. 
*/ + static @Nullable Params parseParams(String text) { + @Nullable String kind = null; + @Nullable Boolean endOfLine = null; + @Nullable Integer columns = null; + @Nullable Boolean blackIs1 = null; + boolean msbFirst = false; + String[] tokens = text.trim().split("\\s+", -1); + for (int i = 0; i < tokens.length; i++) { + switch (tokens[i]) { + case "-4", "-2", "-1" -> kind = tokens[i]; + case "-A" -> endOfLine = true; + case "-P" -> endOfLine = false; + case "-W" -> blackIs1 = true; + case "-B" -> blackIs1 = false; + case "-M" -> msbFirst = true; + case "-X" -> { + i++; + if (i >= tokens.length) { + return null; + } + try { + columns = Integer.parseInt(tokens[i]); + } catch (NumberFormatException e) { + return null; + } + } + default -> { + return null; + } + } + } + if (kind == null || endOfLine == null || columns == null || blackIs1 == null || !msbFirst) { + return null; + } + return new Params(kind, endOfLine, columns, blackIs1); + } + + /** + * Whether {@code params} describes a stream TIFF T.6 represents verbatim: Group 4, no EOL + * markers, and a width agreeing with the listing row the dump corresponds to. + */ + static boolean supported(Params params, int expectedWidth) { + return "-4".equals(params.kind()) + && !params.endOfLine() + && params.columns() == expectedWidth; + } + + /** + * Write {@code g4} as a little-endian, single-strip CCITT-G4 TIFF — header, the verbatim stream + * as the one strip, then the IFD. + * + * @param out the TIFF to write + * @param g4 the raw G4 (T.6) stream, verbatim + * @param width the row width in pixels + * @param height the row count + * @param blackIs1 the params' photometric hint: decoded 1-bits are black ({@code -W}) + * @param dpi the resolution to stamp, or {@code <= 0} to omit the resolution tags + */ + static void writeSingleStripG4( + Path out, byte[] g4, int width, int height, boolean blackIs1, int dpi) + throws IOException { + boolean withResolution = dpi > 0; + int entryCount = withResolution ? 
14 : 11; + int stripOffset = 8; + int padding = g4.length % 2; // IFD offsets must be word-aligned + int ifdOffset = stripOffset + g4.length + padding; + int rationalOffset = ifdOffset + 2 + entryCount * 12 + 4; + ByteBuffer buf = + ByteBuffer.allocate(rationalOffset + (withResolution ? 16 : 0)) + .order(ByteOrder.LITTLE_ENDIAN); + + buf.put((byte) 'I').put((byte) 'I').putShort((short) 42).putInt(ifdOffset); + buf.put(g4); + if (padding == 1) { + buf.put((byte) 0); + } + + buf.putShort((short) entryCount); // entries below stay sorted by tag id + entry(buf, 256, TYPE_LONG, width); // ImageWidth + entry(buf, 257, TYPE_LONG, height); // ImageLength + entryShort(buf, 258, 1); // BitsPerSample + entryShort(buf, 259, 4); // Compression: CCITT T.6 (Group 4) + // The G4 stream encodes white/black runs; this tag tells readers which sense to + // materialize them in. The PDF default (-B, BlackIs1=false) is the standard fax sense — + // TIFF WhiteIsZero (0); -W (BlackIs1=true) is the inverted sense, BlackIsZero (1). + // Pinned empirically by CcittTiffsTest's pixel-identical round trip. + entryShort(buf, 262, blackIs1 ? 1 : 0); // PhotometricInterpretation + entryShort(buf, 266, 1); // FillOrder: MSB first (params -M) + entry(buf, 273, TYPE_LONG, stripOffset); // StripOffsets + entryShort(buf, 277, 1); // SamplesPerPixel + entry(buf, 278, TYPE_LONG, height); // RowsPerStrip: the single strip + entry(buf, 279, TYPE_LONG, g4.length); // StripByteCounts + if (withResolution) { + entry(buf, 282, TYPE_RATIONAL, rationalOffset); // XResolution + entry(buf, 283, TYPE_RATIONAL, rationalOffset + 8); // YResolution + } + entry(buf, 293, TYPE_LONG, 0); // T6Options: none + if (withResolution) { + entryShort(buf, 296, 2); // ResolutionUnit: inch + } + buf.putInt(0); // no next IFD + + if (withResolution) { + buf.putInt(dpi).putInt(1).putInt(dpi).putInt(1); + } + Files.write(out, buf.array()); + } + + /** One IFD entry holding an inline LONG (or a RATIONAL's value offset). 
*/ + private static void entry(ByteBuffer buf, int tag, short type, int value) { + buf.putShort((short) tag).putShort(type).putInt(1).putInt(value); + } + + /** One IFD entry holding an inline SHORT (left-justified in the 4-byte value field). */ + private static void entryShort(ByteBuffer buf, int tag, int value) { + buf.putShort((short) tag).putShort(TYPE_SHORT).putInt(1); + buf.putShort((short) value).putShort((short) 0); + } +} diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 35c8b77..77ed395 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -1,24 +1,32 @@ package io.github.p4suta.shared.pdf; +import io.github.p4suta.shared.imaging.Pix; import io.github.p4suta.shared.process.ProcessRunner; import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeoutException; +import java.util.stream.Stream; +import org.jspecify.annotations.Nullable; /** * Extracts a PDF's embedded bitonal images as TIFFs by driving {@code pdfimages}. The page range is * split across the worker pool (one {@code pdfimages -f/-l} per chunk) with distinct zero-padded - * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. 
The - * dominant scan DPI is read from {@code pdfimages -list} and passed to the clean step as an - * explicit DPI, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. An + * all-CCITT source is remuxed — the raw embedded G4 streams pass through into single-strip TIFFs + * with their true ppi stamped (see {@link #extract}); any other source is extracted decoded, where + * {@code pdfimages} tags the TIFFs at a default 72 dpi, so the dominant scan DPI from {@code + * pdfimages -list} is passed downstream explicitly either way. * *

The textual {@code pdfinfo}/{@code pdfimages -list} reports are parsed by the pure {@link * PdfListingParser}; this adapter only drives the external processes via {@link @@ -93,13 +101,33 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks of about {@link #CHUNK_PAGES} pages on {@code pool} (at most {@code 4 * jobs} chunks). + * + *

One {@code pdfimages -list} pass picks the mode: when every embedded image is 1-bpp CCITT + * (the usual self-scanned book), each chunk dumps the raw G4 streams ({@code -ccitt}) and wraps + * them into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates tens + * of KB per page instead of the decoded megabytes, and the image's true ppi stamped instead of + * {@code pdfimages}' default 72 dpi. Every wrapped page is decoded back once as verification + * (PDF's {@code EncodedByteAlign} never reaches the dumped params, so trust requires a decode); + * a chunk whose dump or wrap deviates in any way is re-extracted decoded ({@code -tiff}), which + * is also the whole-run mode for any other source. */ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { int total = pageCount(pdf); - int chunks = Math.max(1, Math.min(jobs, total)); - int per = (total + chunks - 1) / chunks; String pdfimages = resolve("pdfimages", pdfimagesPropertyKey); + List rows = + PdfListingParser.parseImageRows( + capture(List.of(pdfimages, "-list", pdf.toString()), INFO_TIMEOUT)); + boolean rawCcitt = + !rows.isEmpty() + && rows.stream().allMatch(r -> r.bpc() == 1 && "ccitt".equals(r.enc())); + + // Chunks of ~CHUNK_PAGES rather than total/jobs: fast finishers free their pool slot early + // (the straggler tail shrinks from total/jobs to ~CHUNK_PAGES pages), and a streaming + // consumer can take pages chunk by chunk. Capped so a small book is not all process spawns. 
+ int chunkCap = (int) Math.min(4L * jobs, total); + int chunks = Math.clamp(Math.ceilDiv(total, CHUNK_PAGES), 1, Math.max(1, chunkCap)); + int per = Math.ceilDiv(total, chunks); List> tasks = new ArrayList<>(); int chunk = 0; for (int first = 1; first <= total; first += per) { @@ -108,18 +136,11 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw outDir.resolve(String.format(Locale.ROOT, "page-c%03d-", chunk)).toString(); int from = first; int to = last; + List chunkRows = + rawCcitt ? rowsInRange(rows, from, to) : List.of(); tasks.add( () -> { - runDiscarding( - List.of( - pdfimages, - "-tiff", - "-f", - Integer.toString(from), - "-l", - Integer.toString(to), - pdf.toString(), - prefix)); + extractChunk(pdfimages, pdf, from, to, prefix, chunkRows); return null; }); chunk++; @@ -127,6 +148,143 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw Tasks.awaitAll(pool, tasks, "pdfimages extract interrupted", "pdfimages extract failed"); } + /** Pages per extraction chunk; see {@link #extract}. */ + private static final int CHUNK_PAGES = 12; + + /** The listing rows for pages {@code from..to}, in listing (= dump) order. */ + private static List rowsInRange( + List rows, int from, int to) { + return rows.stream().filter(r -> r.page() >= from && r.page() <= to).toList(); + } + + /** + * Extract one page-range chunk: raw-CCITT remux when {@code ccittRows} describes it, decoded + * {@code -tiff} otherwise — and the {@code -tiff} rerun as the fallback when the dump deviates + * from the listing in any way (count, params shape, or a wrap that does not decode back). 
+ */ + private void extractChunk( + String pdfimages, + Path pdf, + int from, + int to, + String prefix, + List ccittRows) + throws IOException { + if (ccittRows.isEmpty()) { + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + return; + } + runDiscarding(extractCommand(pdfimages, "-ccitt", from, to, pdf, prefix)); + if (!wrapChunk(prefix, ccittRows)) { + deleteByPrefix(prefix); + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + } + } + + private static List extractCommand( + String pdfimages, String format, int from, int to, Path pdf, String prefix) { + return List.of( + pdfimages, + format, + "-f", + Integer.toString(from), + "-l", + Integer.toString(to), + pdf.toString(), + prefix); + } + + /** + * Wrap every {@code .ccitt} dump under {@code prefix} into a single-strip G4 TIFF, verifying + * each by decoding it back. Returns {@code false} (without cleaning up) on any deviation; the + * caller then discards the chunk's artifacts and falls back to a decoded extract. 
+ */ + private static boolean wrapChunk(String prefix, List rows) + throws IOException { + List dumps = filesByPrefix(prefix, ".ccitt"); + if (dumps.size() != rows.size()) { + return false; + } + for (int i = 0; i < dumps.size(); i++) { + Path ccitt = dumps.get(i); + PdfListingParser.ImageRow row = rows.get(i); + Path paramsFile = withExtension(ccitt, ".params"); + if (!Files.isRegularFile(paramsFile)) { + return false; + } + CcittTiffs.@Nullable Params params = + CcittTiffs.parseParams(Files.readString(paramsFile, StandardCharsets.UTF_8)); + if (params == null || !CcittTiffs.supported(params, row.width())) { + return false; + } + Path out = withExtension(ccitt, ".tif"); + CcittTiffs.writeSingleStripG4( + out, + Files.readAllBytes(ccitt), + row.width(), + row.height(), + params.blackIs1(), + Math.max(row.xPpi(), 0)); + if (!decodesBack(out, row)) { + return false; + } + Files.delete(ccitt); + Files.delete(paramsFile); + } + return true; + } + + /** + * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that + * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the + * params file, fails to decode or comes back the wrong size here). + */ + private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { + try (Pix pix = Pix.read(tif)) { + return pix.width() == row.width() && pix.height() == row.height(); + } catch (IllegalStateException e) { + return false; + } + } + + /** The files starting with {@code prefix}'s file name and ending in {@code suffix}, sorted. 
*/ + private static List filesByPrefix(String prefix, String suffix) throws IOException { + Path prefixPath = Path.of(prefix); + Path dir = prefixPath.getParent(); + String name = String.valueOf(prefixPath.getFileName()); + if (dir == null) { + throw new IOException("extract prefix has no parent directory: " + prefix); + } + try (Stream entries = Files.list(dir)) { + return entries.filter( + p -> { + String fileName = String.valueOf(p.getFileName()); + return fileName.startsWith(name) && fileName.endsWith(suffix); + }) + .sorted(Comparator.comparing(p -> String.valueOf(p.getFileName()))) + .toList(); + } + } + + /** + * Delete every artifact of one chunk ({@code .ccitt}, {@code .params}, partial {@code .tif}). + */ + private static void deleteByPrefix(String prefix) throws IOException { + for (String suffix : List.of(".ccitt", ".params", ".tif")) { + for (Path file : filesByPrefix(prefix, suffix)) { + Files.deleteIfExists(file); + } + } + } + + /** A sibling of {@code file} with its extension replaced by {@code extension}. */ + private static Path withExtension(Path file, String extension) { + String name = String.valueOf(file.getFileName()); + int dot = name.lastIndexOf('.'); + String base = dot < 0 ? name : name.substring(0, dot); + return file.resolveSibling(base + extension); + } + /** Run an extraction command, discarding its (file-producing) output. 
*/ private static void runDiscarding(List command) throws IOException { try { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java index efe12aa..0a2bb1b 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java @@ -1,6 +1,8 @@ package io.github.p4suta.shared.pdf; +import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; /** @@ -36,27 +38,17 @@ public static int parsePageCount(String pdfinfoOutput) { } /** - * The most common rounded x-ppi (column 13, 0-based 12) across the {@code image} rows of a - * {@code pdfimages -list} report, skipping the two header rows. Ties resolve to the first value - * seen and a non-positive winner falls back to {@link #DEFAULT_DPI}. + * The most common rounded x-ppi across the {@code image} rows of a {@code pdfimages -list} + * report. Ties resolve to the first value seen and a non-positive winner falls back to {@link + * #DEFAULT_DPI}. * * @param listOutput the full text {@code pdfimages -list} printed * @return the dominant rounded x-ppi, or {@link #DEFAULT_DPI} when none is usable */ public static int parseDominantDpi(String listOutput) { - String[] lines = listOutput.split("\n", -1); Map counts = new LinkedHashMap<>(); - for (int i = 2; i < lines.length; i++) { - String[] fields = lines[i].trim().split("\\s+", -1); - if (fields.length < 13 || !"image".equals(fields[2])) { - continue; - } - try { - int ppi = (int) Math.round(Double.parseDouble(fields[12])); - counts.merge(ppi, 1, Integer::sum); - } catch (NumberFormatException ignored) { - // Non-numeric x-ppi cell: skip this row. 
- } + for (ImageRow row : parseImageRows(listOutput)) { + counts.merge(row.xPpi(), 1, Integer::sum); } if (counts.isEmpty()) { return DEFAULT_DPI; @@ -71,4 +63,50 @@ public static int parseDominantDpi(String listOutput) { } return best > 0 ? best : DEFAULT_DPI; } + + /** + * One {@code image} row of a {@code pdfimages -list} report — the columns the extractor needs + * to pick its mode and to wrap raw CCITT dumps. + * + * @param page the 1-based page the image sits on + * @param width the image width in pixels + * @param height the image height in pixels + * @param bpc bits per component ({@code 1} for bitonal) + * @param enc the embedded encoding token ({@code ccitt}, {@code jbig2}, {@code jpeg}, {@code + * image}, …) + * @param xPpi the rounded x-ppi the image is placed at (0 when the cell is unusable) + */ + public record ImageRow(int page, int width, int height, int bpc, String enc, int xPpi) {} + + /** + * Parse the {@code image} rows of a {@code pdfimages -list} report, in listing order (the same + * order {@code pdfimages} dumps the images in), skipping the two header rows and any row with + * unparsable numeric cells. + * + * @param listOutput the full text {@code pdfimages -list} printed + * @return the parsed rows, possibly empty + */ + public static List parseImageRows(String listOutput) { + String[] lines = listOutput.split("\n", -1); + List rows = new ArrayList<>(); + for (int i = 2; i < lines.length; i++) { + String[] fields = lines[i].trim().split("\\s+", -1); + if (fields.length < 13 || !"image".equals(fields[2])) { + continue; + } + try { + rows.add( + new ImageRow( + Integer.parseInt(fields[0]), + Integer.parseInt(fields[3]), + Integer.parseInt(fields[4]), + Integer.parseInt(fields[7]), + fields[8], + (int) Math.round(Double.parseDouble(fields[12])))); + } catch (NumberFormatException ignored) { + // A non-numeric cell: skip this row. 
+ } + } + return rows; + } } diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java new file mode 100644 index 0000000..63e93d6 --- /dev/null +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java @@ -0,0 +1,160 @@ +package io.github.p4suta.shared.pdf; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.imaging.Pix; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import javax.imageio.ImageIO; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * The CCITT remux building blocks: the fax2tiff-style params parser (pure) and the single-strip G4 + * TIFF writer, verified by wrapping a real G4 stream (PDFBox's CCITT encoder, the same encoding a + * scanner PDF embeds) and decoding it back through Leptonica pixel-for-pixel. + */ +final class CcittTiffsTest { + + // ---- params parsing ---- + + @Test + void parsesTheUsualScannerShape() { + CcittTiffs.Params params = parsed("-4 -P -X 3496 -B -M\n"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", false, 3496, false)); + assertThat(CcittTiffs.supported(params, 3496)).isTrue(); + } + + @Test + void eolMarkersAreUnsupported() { + CcittTiffs.Params params = parsed("-4 -A -X 100 -W -M"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", true, 100, true)); + // EOL markers are not representable in TIFF T.6. 
+ assertThat(CcittTiffs.supported(params, 100)).isFalse(); + } + + @Test + void group3IsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-2 -P -X 100 -B -M"), 100)).isFalse(); + } + + @Test + void widthMismatchIsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-4 -P -X 100 -B -M"), 200)).isFalse(); + } + + /** Parse params the test asserts are well-formed, made non-null for NullAway. */ + private static CcittTiffs.Params parsed(String text) { + return java.util.Objects.requireNonNull(CcittTiffs.parseParams(text)); + } + + @Test + void unknownTokensAndMissingFlagsAreUnparsable() { + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B -M -Z")).isNull(); // unknown flag + assertThat(CcittTiffs.parseParams("-4 -P -B -M")).isNull(); // no -X + assertThat(CcittTiffs.parseParams("-4 -P -X nope -B -M")).isNull(); // bad width + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B")).isNull(); // no -M + assertThat(CcittTiffs.parseParams("")).isNull(); + } + + // ---- TIFF wrapping ---- + + /** + * Round trip: draw a known bitonal pattern, encode it to a raw G4 stream with PDFBox's CCITT + * encoder (the very encoding a scanner PDF embeds and {@code pdfimages -ccitt} dumps), wrap the + * stream with {@link CcittTiffs#writeSingleStripG4}, and assert it decodes back through + * Leptonica pixel-identical to the original, with the stamped resolution. 
+ */ + @Test + void wrappedStreamDecodesBackPixelIdentical(@TempDir Path tmp) throws Exception { + int width = 200; + int height = 150; + BufferedImage img = pattern(width, height); + G4Stream g4 = encodeG4(img); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 450); + + Path referencePng = tmp.resolve("reference.png"); + ImageIO.write(img, "png", referencePng.toFile()); + try (Pix expected = Pix.read(referencePng); + Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.height()).isEqualTo(height); + assertThat(actual.resolution()).isEqualTo(450); + assertThat(actual.blackPixels()).isPositive(); + assertThat(actual.pixelsEqual(expected)).isTrue(); + } + } + + @Test + void omitsResolutionTagsWhenDpiUnknown(@TempDir Path tmp) throws Exception { + int width = 64; + int height = 48; + G4Stream g4 = encodeG4(pattern(width, height)); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 0); + + try (Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.resolution()).isZero(); + } + } + + /** A raw G4 (T.6) stream and the {@code BlackIs1} convention its encoder declared. */ + private static final class G4Stream { + final byte[] bytes; + final boolean blackIs1; + + G4Stream(byte[] bytes, boolean blackIs1) { + this.bytes = bytes; + this.blackIs1 = blackIs1; + } + } + + /** + * The raw CCITT G4 stream PDFBox's {@link CCITTFactory} encodes {@code img} to (lifted verbatim + * from the image XObject, exactly the bytes {@code pdfimages -ccitt} would dump), along with + * the {@code BlackIs1} decode parameter it declared. 
+ */ + private static G4Stream encodeG4(BufferedImage img) throws IOException { + try (PDDocument doc = new PDDocument()) { + PDImageXObject image = CCITTFactory.createFromImage(doc, img); + COSDictionary decodeParms = + (COSDictionary) image.getCOSObject().getDictionaryObject(COSName.DECODE_PARMS); + boolean blackIs1 = + decodeParms != null && decodeParms.getBoolean(COSName.BLACK_IS_1, false); + try (InputStream in = image.getCOSObject().createRawInputStream()) { + return new G4Stream(in.readAllBytes(), blackIs1); + } + } + } + + /** A deterministic bitonal pattern with structure (bars + a block). */ + private static BufferedImage pattern(int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + for (int x = 4; x < width - 8; x += 12) { + g.fillRect(x, 8, 6, height - 16); + } + g.fillRect(width / 3, height / 3, width / 3, height / 3); + } finally { + g.dispose(); + } + return img; + } +} diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java index c9032d2..9b7d1c5 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java @@ -94,6 +94,69 @@ void dominantDpiReturnsAPositiveResolution(@TempDir Path tmp) throws Exception { assertThat(dpi).isPositive(); } + /** Build a multi-page PDF embedding CCITT-G4 bitonal images, one per page, at ~200 ppi. 
*/ + private static void writeCcittPdf(Path pdf, int pages, int imgW, int imgH) throws IOException { + try (PDDocument doc = new PDDocument()) { + for (int i = 0; i < pages; i++) { + BufferedImage bitonal = + new BufferedImage(imgW, imgH, BufferedImage.TYPE_BYTE_BINARY); + java.awt.Graphics2D g = bitonal.createGraphics(); + try { + g.setColor(java.awt.Color.WHITE); + g.fillRect(0, 0, imgW, imgH); + g.setColor(java.awt.Color.BLACK); + g.fillRect(10 + i, 10, imgW / 3, imgH / 2); + } finally { + g.dispose(); + } + float wPt = imgW * 72f / 200; + float hPt = imgH * 72f / 200; + PDPage page = new PDPage(new PDRectangle(wPt, hPt)); + doc.addPage(page); + PDImageXObject image = + org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory.createFromImage( + doc, bitonal); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, wPt, hPt); + } + } + doc.save(pdf.toFile()); + } + } + + @Test + @EnabledIf("io.github.p4suta.shared.pdf.PdfImagesCliExtractorTest#toolsOnPath") + void remuxesAnAllCcittSourceIntoStampedG4Tiffs(@TempDir Path tmp) throws Exception { + Path pdf = tmp.resolve("scan.pdf"); + writeCcittPdf(pdf, 3, 240, 180); + Path outDir = Files.createDirectory(tmp.resolve("out")); + + ExecutorService pool = Executors.newFixedThreadPool(2); + try { + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); + } finally { + pool.shutdownNow(); + } + + try (Stream<Path> entries = Files.list(outDir)) { + List<Path> files = entries.sorted().toList(); + // The remux leaves exactly one .tif per page — no .ccitt/.params residue. 
+ assertThat(files).hasSize(3); + assertThat(files).allSatisfy(p -> assertThat(p.toString()).endsWith(".tif")); + for (Path tif : files) { + try (io.github.p4suta.shared.imaging.Pix pix = + io.github.p4suta.shared.imaging.Pix.read(tif)) { + assertThat(pix.width()).isEqualTo(240); + assertThat(pix.height()).isEqualTo(180); + // The remux stamps the image's true ppi instead of pdfimages' default 72. + assertThat(pix.resolution()).isEqualTo(200); + // Black ink, not inverted: the drawn block is ~1/6 of the page. + assertThat(pix.blackPixels()).isGreaterThan(0).isLessThan(240L * 180 / 2); + } + } + } + } + @Test void missingToolFailsWithAClearMessage(@TempDir Path tmp) throws Exception { Path pdf = tmp.resolve("doc.pdf"); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java index f228032..2bba8d0 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java @@ -120,4 +120,27 @@ void parseDominantDpiSkipsNonImageRowsWithTooFewFields() { PdfListingParser.DEFAULT_DPI, PdfListingParser.parseDominantDpi("hdr\n----\n 1 0 smask\n")); } + + @Test + void parseImageRowsReadsTheColumnsTheExtractorNeeds() { + var rows = PdfListingParser.parseImageRows(LIST); + assertEquals(3, rows.size()); + assertEquals(new PdfListingParser.ImageRow(1, 2480, 3508, 1, "ccitt", 300), rows.get(0)); + assertEquals(new PdfListingParser.ImageRow(3, 1240, 1754, 1, "ccitt", 150), rows.get(2)); + } + + @Test + void parseImageRowsSkipsMalformedAndNonImageRows() { + String mixed = + """ + page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio + -------------------------------------------------------------------------------------------- + 1 0 smask 2480 3508 gray 1 1 ccitt no 7 0 300 300 101K 1.2% + 2 1 image bad 3508 gray 1 1 ccitt no 11 0 300 
300 99K 1.1% + 3 2 image 2480 3508 rgb 3 8 jpeg no 14 0 150 150 40K 1.0% + """; + var rows = PdfListingParser.parseImageRows(mixed); + assertEquals(1, rows.size()); + assertEquals(new PdfListingParser.ImageRow(3, 2480, 3508, 8, "jpeg", 150), rows.get(0)); + } }