diff --git a/_typos.toml b/_typos.toml index df533d6..b9800fd 100644 --- a/_typos.toml +++ b/_typos.toml @@ -45,3 +45,7 @@ Pagent = "Pagent" Flavour = "Flavour" flavours = "flavours" initialise = "initialise" +# The PDF spec's own key name `DecodeParms` (and PDFBox's COSName.DECODE_PARMS +# constant mirroring it) — third-party/spec identifiers we must name verbatim. +Parms = "Parms" +PARMS = "PARMS" diff --git a/despeckle/application/build.gradle.kts b/despeckle/application/build.gradle.kts index a6f1593..165103b 100644 --- a/despeckle/application/build.gradle.kts +++ b/despeckle/application/build.gradle.kts @@ -25,6 +25,10 @@ dependencies { // implementations are no longer duplicated here. Static calls returning java.base types only, so // implementation (not api) is the right scope. implementation(project(":shared:io")) + // Tasks.awaitAll(Workers...): the shared fail-fast page fan-out (batch-owned executor, + // sibling interruption, quiescence before the failure propagates) DespeckleService runs its + // per-page work on. + implementation(project(":shared:process")) implementation(libs.slf4j.api) } diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java index 062e974..4bd632b 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/DespeckleService.java @@ -9,18 +9,13 @@ import io.github.p4suta.shared.io.CorpusFiles; import io.github.p4suta.shared.io.OutputDirs; import io.github.p4suta.shared.kernel.PageProgressListener; +import io.github.p4suta.shared.process.Tasks; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,7 +70,8 @@ public record Config( * Aggregate outcome of a run. * * @param pages number of pages processed - * @param componentsRemoved total components removed across all pages + * @param componentsRemoved total components removed across all pages; {@code 0} when no report + * consumed component stats (counting is skipped for speed without a report) * @param overRemovalWarnings number of pages flagged for possible over-removal */ public record Summary(int pages, long componentsRemoved, int overRemovalWarnings) {} @@ -88,8 +84,8 @@ public Summary run(Config config) throws IOException { /** * Execute a run, reporting each finished page to {@code progress}. * - * @param progress called once per page as it completes (1-based count, total page count); may - * be invoked from the worker threads, so it must be thread-safe + * @param progress called once per page as it completes (1-based count, total page count), + * always on the calling thread and in order */ public Summary run(Config config, PageProgressListener progress) throws IOException { OutputDirs.prepare(config.outputDir(), config.force()); @@ -101,39 +97,45 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept } LOG.info("despeckling {} page(s) with {} thread(s)", files.size(), config.jobs()); + @Nullable Path reportDir = config.reportDir(); + boolean reporting = reportDir != null; Reporter report = - config.reportDir() == null - ? Reporter.noOp() - : reporterFactory.create(config.reportDir(), config.flipbook()); - - AtomicInteger done = new AtomicInteger(); - List outcomes; - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - try { - List> tasks = new ArrayList<>(files.size()); - for (Path src : files) { - tasks.add( - () -> { - PageOutcome outcome = processOne(src, config, report); - int n = done.incrementAndGet(); - progress.onPage(n, files.size()); - if (n % PROGRESS_EVERY == 0 || n == files.size()) { - LOG.info("{}/{}", n, files.size()); + reportDir != null + ? reporterFactory.create(reportDir, config.flipbook()) + : Reporter.noOp(); + // The report is the only consumer of per-page component counts, and counting is a full + // connected-component labeling twice per page — skip it when no report will be written. + ProcessOptions options = + reporting ? config.options() : config.options().withoutComponentStats(); + + List> tasks = new ArrayList<>(files.size()); + for (Path src : files) { + tasks.add(() -> processOne(src, config, options, report)); + } + // Platform workers: each page is CPU-bound Leptonica work (FFM downcalls pin virtual + // threads' carriers). The fan-out fails fast and quiesces before throwing, so a failed + // run never leaves workers writing into the output directory. Progress and the human + // log arrive on this thread, ordered. + List outcomes = + Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "despeckle", + (done, total) -> { + progress.onPage(done, total); + if (done % PROGRESS_EVERY == 0 || done == total) { + LOG.info("{}/{}", done, total); } - return outcome; }); - } - outcomes = invokeAll(pool, tasks); - } finally { - pool.shutdown(); - } report.finish(); long totalRemoved = 0; + long blackRemoved = 0; int warnings = 0; for (PageOutcome outcome : outcomes) { totalRemoved += outcome.result().componentsRemoved(); + blackRemoved += outcome.result().blackPixelsRemoved(); if (outcome.result().isOverRemoval()) { warnings++; LOG.warn( @@ -142,17 +144,28 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept Math.round(outcome.result().removedBlackPixelRatio() * 100)); } } - LOG.info( - "done: {} page(s), {} component(s) removed, {} over-removal warning(s)", - files.size(), - totalRemoved, - warnings); + if (reporting) { + LOG.info( + "done: {} page(s), {} component(s) removed, {} over-removal warning(s)", + files.size(), + totalRemoved, + warnings); + } else { + // Without a report nothing counted components (the counting passes are skipped for + // speed), so the summary speaks in the always-measured black-pixel terms. + LOG.info( + "done: {} page(s), {} black pixel(s) removed, {} over-removal warning(s)", + files.size(), + blackRemoved, + warnings); + } return new Summary(files.size(), totalRemoved, warnings); } private record PageOutcome(Path source, ProcessResult result) {} - private PageOutcome processOne(Path src, Config config, Reporter report) { + private PageOutcome processOne(Path src, Config config, ProcessOptions options, Reporter report) + throws IOException { Path dest = CorpusFiles.mirrorDestination( src, config.inputDir(), config.outputDir(), config.format().extension()); @@ -161,35 +174,14 @@ private PageOutcome processOne(Path src, Config config, Reporter report) { if (parent != null) { Files.createDirectories(parent); } - ProcessResult result = pageCleaner.clean(src, dest, config.format(), config.options()); + ProcessResult result = pageCleaner.clean(src, dest, config.format(), options); Path stem = config.inputDir().relativize(src); report.addPage(stem, src, dest, result); return new PageOutcome(src, result); } catch (IOException e) { - throw new UncheckedIOException("failed to process " + src, e); - } - } - - private static List invokeAll( - ExecutorService pool, List> tasks) throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("despeckle run interrupted", e); - } - List outcomes = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - outcomes.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("despeckle run interrupted", e); - } catch (ExecutionException e) { - throw new IOException("page processing failed", e.getCause()); - } + // Re-throw with the page named: Tasks surfaces a task's IOException unchanged, so + // this context reaches the user instead of being lost in a generic wrapper. + throw new IOException("failed to process " + src, e); } - return outcomes; } } diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java index c82f4cb..665badd 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/Jbig2PackService.java @@ -9,8 +9,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -77,17 +75,15 @@ public void run(Config config) throws IOException { } Path jb2Dir = createWorkDir(config.outPdf()); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); try { assembler.assemble( config.imageDir(), config.outPdf(), config.source(), config.dpi(), - pool, + config.jobs(), jb2Dir); } finally { - pool.shutdown(); deleteRecursively(jb2Dir); } linearizer.linearize(config.outPdf()); diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java index c42cf03..d1c5da2 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfBatchService.java @@ -148,14 +148,23 @@ public Summary run(Config config) throws IOException { if (config.reportParent() != null) { batchReporter.write(config.reportParent(), books); + LOG.info( + "done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed", + ok, + skipped, + failed, + totalPages, + totalComponentsRemoved); + } else { + // Without reports the runs skip component counting (an expensive labeling, twice per + // page), so a component total would always read 0 — leave it out of the line. + LOG.info( + "done: {} ok, {} skipped, {} failed, {} page(s)", + ok, + skipped, + failed, + totalPages); } - LOG.info( - "done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed", - ok, - skipped, - failed, - totalPages, - totalComponentsRemoved); return new Summary(ok, skipped, failed, totalPages, totalComponentsRemoved); } diff --git a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java index e90a508..16357c6 100644 --- a/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java +++ b/despeckle/application/src/main/java/io/github/p4suta/despeckle/application/PdfPipelineService.java @@ -12,8 +12,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -96,41 +94,37 @@ public DespeckleService.Summary run(Config config) throws IOException { Path extracted = Files.createDirectories(work.resolve("in")); Path cleaned = Files.createDirectories(work.resolve("clean")); Path jbig2Dir = Files.createDirectories(work.resolve("jb2")); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - DespeckleService.Summary summary; - try { - int dpi = - config.options().dpi().isPresent() - ? config.options().dpi().getAsInt() - : extractor.dominantDpi(config.inputPdf()); - LOG.info( - "pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); + int dpi = + config.options().dpi().isPresent() + ? config.options().dpi().getAsInt() + : extractor.dominantDpi(config.inputPdf()); + LOG.info("pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); - extractor.extract(config.inputPdf(), extracted, config.jobs(), pool); + // Each step fans out on its own batch-owned workers bounded by the same jobs budget, + // so the steps never hold idle threads for each other (the old shared outer pool sat + // idle through the whole despeckle step). + extractor.extract(config.inputPdf(), extracted, config.jobs()); - DespeckleService.Config clean = - new DespeckleService.Config( - extracted, - cleaned, - OutputFormat.TIFF, - "*.tif", - config.jobs(), - true, - config.options().withDpi(dpi), - config.reportDir(), - config.flipbook()); - summary = despeckleService.run(clean); + DespeckleService.Config clean = + new DespeckleService.Config( + extracted, + cleaned, + OutputFormat.TIFF, + "*.tif", + config.jobs(), + true, + config.options().withDpi(dpi), + config.reportDir(), + config.flipbook()); + DespeckleService.Summary summary = despeckleService.run(clean); - assembler.assemble( - cleaned, - config.outputPdf(), - config.inputPdf(), - OptionalInt.of(dpi), - pool, - jbig2Dir); - } finally { - pool.shutdown(); - } + assembler.assemble( + cleaned, + config.outputPdf(), + config.inputPdf(), + OptionalInt.of(dpi), + config.jobs(), + jbig2Dir); linearizer.linearize(config.outputPdf()); LOG.info("wrote {}", config.outputPdf()); return summary; diff --git a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java index 4bb8187..ba543df 100644 --- a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java +++ b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/DespeckleServiceTest.java @@ -1,6 +1,7 @@ package io.github.p4suta.despeckle.application; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -141,4 +142,30 @@ void drivesTheReporterWhenAReportDirIsGiven(@TempDir Path tmp) throws IOExceptio assertEquals(2, factory.reporter.pages.get(), "every page is reported"); assertEquals(1, factory.reporter.finished.get(), "the report is finalized once"); } + + @Test + void skipsComponentCountingWithoutAReport(@TempDir Path tmp) throws IOException { + Path in = tmp.resolve("in"); + writeInputs(in, 1); + FakePageCleaner cleaner = new FakePageCleaner(LIGHT); + + new DespeckleService(cleaner, new RecordingReporterFactory()) + .run(config(in, tmp.resolve("out"), false, null)); + + ProcessOptions seen = java.util.Objects.requireNonNull(cleaner.lastOptions); + assertFalse(seen.collectComponentStats(), "no report -> counting passes are skipped"); + } + + @Test + void keepsComponentCountingForTheReportPath(@TempDir Path tmp) throws IOException { + Path in = tmp.resolve("in"); + writeInputs(in, 1); + FakePageCleaner cleaner = new FakePageCleaner(LIGHT); + + new DespeckleService(cleaner, new RecordingReporterFactory()) + .run(config(in, tmp.resolve("out"), false, tmp.resolve("report"))); + + ProcessOptions seen = java.util.Objects.requireNonNull(cleaner.lastOptions); + assertTrue(seen.collectComponentStats(), "the report consumes the counts"); + } } diff --git a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java index a970181..a061822 100644 --- a/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java +++ b/despeckle/application/src/test/java/io/github/p4suta/despeckle/application/Fakes.java @@ -17,7 +17,6 @@ import java.nio.file.Path; import java.util.List; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; @@ -34,6 +33,7 @@ private Fakes() {} /** A {@link PageCleaner} that writes a stub output file and returns a fixed result. */ static final class FakePageCleaner implements PageCleaner { final AtomicInteger calls = new AtomicInteger(); + volatile @Nullable ProcessOptions lastOptions; private final ProcessResult result; FakePageCleaner(ProcessResult result) { @@ -49,6 +49,7 @@ public ProcessResult clean( throw new java.io.UncheckedIOException(e); } calls.incrementAndGet(); + lastOptions = options; return result; } } @@ -111,8 +112,7 @@ public int dominantDpi(Path pdf) { } @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) - throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { if (failOn != null && pdf.getFileName().toString().contains(failOn)) { throw new IOException("fake extract failed for " + pdf); } @@ -135,7 +135,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { Files.writeString(outPdf, "%PDF-fake", StandardCharsets.UTF_8); diff --git a/despeckle/docs/cleaner-baseline.md b/despeckle/docs/cleaner-baseline.md new file mode 100644 index 0000000..96c422a --- /dev/null +++ b/despeckle/docs/cleaner-baseline.md @@ -0,0 +1,31 @@ +# despeckle cleaner op-level baseline + +Generated by `CleanerBenchmark` (`./gradlew :despeckle:infrastructure:benchCleaner`, dev container). +Times each Leptonica primitive the page cleaner composes on a synthetic +600-dpi A5 page (3496x4961 px, fixed seed). Re-run after any change to the +cleaner or the imaging bindings and compare before merging. + +- Date (UTC): 2026-06-10 06:25:05 +- Host: Linux amd64, 8 CPUs +- Samples: median of 10 reps after 2 warmups; single-threaded. + +| op | median (ms) | min (ms) | calls/clean() | est. share of clean() | +|---|---:|---:|---:|---:| +| read TIFF-G4 | 2.59 | 2.52 | 1 | 1.5% | +| selectBySize k=6 (page) | 15.22 | 14.98 | 1 | 8.7% | +| selectBySize 15 (page) | 15.19 | 14.80 | 1 | 8.7% | +| selectBySize k=6 (inverted) | 22.16 | 21.89 | 2 | 25.3% | +| dilate 43x43 (text mask) | 37.52 | 37.01 | 1 | 21.5% | +| open 7x7 (page) | 12.18 | 12.04 | 1 | 7.0% | +| invert | 0.26 | 0.25 | 2 | 0.3% | +| subtract | 0.38 | 0.36 | 5 | 1.1% | +| and | 0.40 | 0.38 | 1 | 0.2% | +| or | 0.33 | 0.32 | 3 | 0.6% | +| countConnComp | 11.82 | 11.46 | 2 | 13.5% | +| countPixels | 0.41 | 0.41 | 2 | 0.5% | +| write TIFF-G4 | 6.44 | 6.29 | 1 | 3.7% | +| **Σ(median × calls)** | 161.74 | | | 92.5% | +| **clean() end-to-end** | 174.91 | 173.28 | 1 | 100% | + +The Σ row landing near 100% means the table accounts for clean()'s real cost; +a large gap points at untimed work (allocation churn, codec internals). diff --git a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java index a565b1f..2051160 100644 --- a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java +++ b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessOptions.java @@ -27,13 +27,16 @@ * @param removeIsolatedDust whether to run the isolated-medium-dust pass * @param isolatedDustSizePx an explicit max size for an isolated speck, or empty to derive it; a * present value also implies {@code removeIsolatedDust} + * @param collectComponentStats whether to count 8-connected components before and after — two of + * the most expensive full-page scans, so off when nothing consumes the counts (no report) */ public record ProcessOptions( OptionalInt dpi, OptionalInt speckSizePx, boolean fillHoles, boolean removeIsolatedDust, - OptionalInt isolatedDustSizePx) { + OptionalInt isolatedDustSizePx, + boolean collectComponentStats) { /** Resolution assumed for the speck filter when neither a flag nor the image supplies one. */ public static final int DEFAULT_DPI = 300; @@ -50,6 +53,16 @@ public record ProcessOptions( } } + /** The five-knob shape every direct caller uses; component counting defaults to on. */ + public ProcessOptions( + OptionalInt dpi, + OptionalInt speckSizePx, + boolean fillHoles, + boolean removeIsolatedDust, + OptionalInt isolatedDustSizePx) { + this(dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, true); + } + /** Options with the isolated-dust pass off — the common case. */ public static ProcessOptions of(OptionalInt dpi, OptionalInt speckSizePx, boolean fillHoles) { return new ProcessOptions(dpi, speckSizePx, fillHoles, false, OptionalInt.empty()); @@ -66,7 +79,17 @@ public ProcessOptions withDpi(int dpi) { speckSizePx, fillHoles, removeIsolatedDust, - isolatedDustSizePx); + isolatedDustSizePx, + collectComponentStats); + } + + /** + * A copy with component counting disabled — for runs where nothing consumes the counts (no + * report), saving two full connected-component labelings per page. + */ + public ProcessOptions withoutComponentStats() { + return new ProcessOptions( + dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, false); } /** diff --git a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java index f6efbf9..8e6fd25 100644 --- a/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java +++ b/despeckle/domain/src/main/java/io/github/p4suta/despeckle/domain/model/ProcessResult.java @@ -1,15 +1,26 @@ package io.github.p4suta.despeckle.domain.model; +import java.util.OptionalInt; + /** * The outcome of despeckling one page. * - * @param componentsBefore 8-connected component count of the input - * @param componentsAfter 8-connected component count of the output + *

The black-pixel counts are always measured — they feed the over-removal guard. The component + * counts are measured only when something consumes them (the HTML report): counting 8-connected + * components is a full connected-component labeling of the page, one of the most expensive scans in + * the whole clean, so a run with no report skips both counting passes and carries empty components + * here. + * + * @param componentsBefore 8-connected component count of the input, when counted + * @param componentsAfter 8-connected component count of the output, when counted * @param blackPixelsBefore foreground pixel count of the input * @param blackPixelsAfter foreground pixel count of the output */ public record ProcessResult( - int componentsBefore, int componentsAfter, long blackPixelsBefore, long blackPixelsAfter) { + OptionalInt componentsBefore, + OptionalInt componentsAfter, + long blackPixelsBefore, + long blackPixelsAfter) { /** * A black-pixel removal ratio above this flags a possibly over-cleaned page. The single domain @@ -17,12 +28,38 @@ public record ProcessResult( */ public static final double OVER_REMOVAL_WARN_RATIO = 0.03; + /** A counted result — the shape the report path consumes. */ + public ProcessResult( + int componentsBefore, + int componentsAfter, + long blackPixelsBefore, + long blackPixelsAfter) { + this( + OptionalInt.of(componentsBefore), + OptionalInt.of(componentsAfter), + blackPixelsBefore, + blackPixelsAfter); + } + + /** The result of a run that skipped component counting (nothing consumes the counts). */ + public static ProcessResult withoutComponentStats( + long blackPixelsBefore, long blackPixelsAfter) { + return new ProcessResult( + OptionalInt.empty(), OptionalInt.empty(), blackPixelsBefore, blackPixelsAfter); + } + + /** Whether component counting ran (true on the report path). */ + public boolean hasComponentStats() { + return componentsBefore.isPresent(); + } + /** * Net drop in 8-connected components — dust removed minus any holes filled back in. Summed into - * the run total and plotted per page in the report. + * the run total and plotted per page in the report. {@code 0} when counting was skipped (see + * {@link #hasComponentStats()}). */ public int componentsRemoved() { - return componentsBefore - componentsAfter; + return componentsBefore.orElse(0) - componentsAfter.orElse(0); } /** @@ -36,6 +73,11 @@ public double removedBlackPixelRatio() { return (double) (blackPixelsBefore - blackPixelsAfter) / blackPixelsBefore; } + /** Black pixels removed from the page — the cost-free removal measure every run carries. */ + public long blackPixelsRemoved() { + return blackPixelsBefore - blackPixelsAfter; + } + /** * Whether {@link #removedBlackPixelRatio()} exceeds {@link #OVER_REMOVAL_WARN_RATIO}, the * threshold the runner logs on and the report flags. diff --git a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java index 700035a..6d70e42 100644 --- a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java +++ b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessOptionsTest.java @@ -163,4 +163,21 @@ void isolatedDustSizeStaysAboveTheSpeckSizeAtVeryLowResolution() { assertEquals( 2, options.isolatedDustSize(noImg()), "isolated-dust size floors at speckSize + 1"); } + + @Test + void componentStatsDefaultOnAndWitherTurnsThemOff() { + ProcessOptions defaults = ProcessOptions.defaults(); + assertTrue(defaults.collectComponentStats(), "counting is on unless turned off"); + + ProcessOptions off = defaults.withoutComponentStats(); + assertFalse(off.collectComponentStats()); + // Every other knob survives the wither. + assertEquals(defaults.dpi(), off.dpi()); + assertEquals(defaults.speckSizePx(), off.speckSizePx()); + assertEquals(defaults.fillHoles(), off.fillHoles()); + assertEquals(defaults.removeIsolatedDust(), off.removeIsolatedDust()); + assertEquals(defaults.isolatedDustSizePx(), off.isolatedDustSizePx()); + // ...including through withDpi. + assertFalse(off.withDpi(600).collectComponentStats()); + } } diff --git a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java index fac55e0..7e2508b 100644 --- a/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java +++ b/despeckle/domain/src/test/java/io/github/p4suta/despeckle/domain/model/ProcessResultTest.java @@ -60,4 +60,21 @@ void isOverRemovalIsFalseBelowTheThreshold() { void isOverRemovalIsFalseForABlankInputPage() { assertFalse(new ProcessResult(0, 0, 0, 0).isOverRemoval()); } + + @Test + void withoutComponentStatsCarriesNoCountsButFullPixelMath() { + ProcessResult result = ProcessResult.withoutComponentStats(1000, 950); + assertFalse(result.hasComponentStats()); + assertEquals(0, result.componentsRemoved(), "absent counts read as zero, documented"); + assertEquals(50L, result.blackPixelsRemoved()); + assertTrue(result.isOverRemoval(), "the over-removal guard works without counts"); + } + + @Test + void countedConstructorHasComponentStats() { + ProcessResult result = new ProcessResult(120, 95, 1000, 990); + assertTrue(result.hasComponentStats()); + assertEquals(25, result.componentsRemoved()); + assertEquals(10L, result.blackPixelsRemoved()); + } } diff --git a/despeckle/infrastructure/build.gradle.kts b/despeckle/infrastructure/build.gradle.kts index 90f7db5..c957b23 100644 --- a/despeckle/infrastructure/build.gradle.kts +++ b/despeckle/infrastructure/build.gradle.kts @@ -78,3 +78,21 @@ tasks.named("jacocoTestCoverageVerification") { } tasks.named("check") { dependsOn("jacocoTestCoverageVerification") } + +// ---- Cleaner op-level micro-benchmark (see despeckle/docs/cleaner-baseline.md) ------------------ +// Times each Leptonica primitive the page cleaner composes, plus end-to-end clean(), on a +// deterministic synthetic 600-dpi A5 page — the measurement every cleaner/imaging optimization is +// judged against. Knob: -Preps=N (default 10). Dev-container only (needs the native Leptonica). +tasks.register("benchCleaner") { + group = "verification" + description = "Micro-benchmark cleaner ops + end-to-end clean(); writes despeckle/docs/cleaner-baseline.md" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.despeckle.infrastructure.tools.CleanerBenchmark" + workingDir = rootDir + args = + listOf( + "despeckle/docs/cleaner-baseline.md", + providers.gradleProperty("reps").getOrElse("10"), + ) +} diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java index 7f8434c..6fd7d77 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleaner.java @@ -52,7 +52,11 @@ public ProcessResult clean( int raw = source.resolution(); Optional img = raw > 0 ? Optional.of(Resolution.of(raw)) : Optional.empty(); int k = options.speckSize(img); - int componentsBefore = source.connectedComponents(); + // Component counting is a full connected-component labeling — one of the most + // expensive scans here — so it runs only when something consumes the counts (the + // report). The black-pixel counts always run: they feed the over-removal guard. + boolean countComponents = options.collectComponentStats(); + int componentsBefore = countComponents ? source.connectedComponents() : 0; long blackBefore = source.blackPixels(); int sourceFormat = source.inputFormat(); @@ -78,14 +82,18 @@ public ProcessResult clean( current = filled; } - int componentsAfter = current.connectedComponents(); long blackAfter = current.blackPixels(); // Stamp the honored resolution so a TIFF/PNG output carries an accurate tag. Only a // known resolution is written; an unknown one is left untouched. options.resolution(img).map(Resolution::dpi).ifPresent(current::setResolution); writeIn(current, output, format, sourceFormat); - return new ProcessResult( - componentsBefore, componentsAfter, blackBefore, blackAfter); + return countComponents + ? new ProcessResult( + componentsBefore, + current.connectedComponents(), + blackBefore, + blackAfter) + : ProcessResult.withoutComponentStats(blackBefore, blackAfter); } finally { current.close(); } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java index 907e663..a1a7a7b 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfBoxJbig2Assembler.java @@ -4,7 +4,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -34,7 +33,7 @@ public PdfBoxJbig2Assembler() {} * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page {@code jbig2} encodes run on + * @param jobs how many {@code jbig2} children may encode at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -45,9 +44,9 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { - delegate.assemble(imageDir, outPdf, source, forcedDpi, pool, scratchDir); + delegate.assemble(imageDir, outPdf, source, forcedDpi, jobs, scratchDir); } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java index 044a175..ab186b5 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/pdf/PdfImagesCliExtractor.java @@ -3,7 +3,6 @@ import io.github.p4suta.despeckle.port.PdfImageExtractor; import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Despeckle's {@link PdfImageExtractor} adapter — a wrapper over the shared {@code @@ -34,10 +33,10 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { - delegate.extract(pdf, outDir, jobs, pool); + public void extract(Path pdf, Path outDir, int jobs) throws IOException { + delegate.extract(pdf, outDir, jobs); } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java index 5812478..dfdf3c5 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/process/NativeTools.java @@ -6,12 +6,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; -import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; /** @@ -26,10 +21,10 @@ *

The launch helpers stay local rather than routing through {@code :shared:process}'s {@code * ProcessRunner}/{@code Tasks}: {@link #capture} returns the raw {@code byte[]} a JBIG2 stream * needs (the shared runner decodes to a UTF-8 {@code String}, which is lossy for binary output), - * and {@link #run}/{@link #awaitAll} raise/propagate the tagged {@link DespeckleException} kind - * (the shared {@code Tasks} flattens any non-{@code IOException} cause, losing the kind and its - * exit code). The {@code qpdf} call site, whose output is text and whose exit-3 tolerance the - * shared runner models directly, uses {@code ProcessRunner} instead. + * and {@link #run} raises/propagates the tagged {@link DespeckleException} kind (the shared {@code + * Tasks} flattens any non-{@code IOException} cause, losing the kind and its exit code). The {@code + * qpdf} call site, whose output is text and whose exit-3 tolerance the shared runner models + * directly, uses {@code ProcessRunner} instead. * *

Public so the sibling {@code infrastructure.pdf} adapters can call it; it never leaves the * {@code :infrastructure} module (the application and CLI layers depend only on the {@code :port} @@ -121,41 +116,4 @@ private static void awaitExit(Process process, List command, long timeou null); } } - - /** - * Run every task on {@code pool}, in submission order, surfacing the first failure as - * IOException. - */ - public static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("pipeline interrupted", e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - results.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("pipeline interrupted", e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - // Preserve a tagged domain failure (e.g. NATIVE_TOOL_FAILED / IMAGE_UNREADABLE - // raised inside a worker) so its kind — and exit code — survives the pool boundary - // instead of being flattened to a generic IOException. - if (cause instanceof DespeckleException de) { - throw de; - } - if (cause instanceof IOException io) { - throw io; - } - throw new IOException("pipeline task failed", cause); - } - } - return results; - } } diff --git a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java index 2024be8..32d243f 100644 --- a/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java +++ b/despeckle/infrastructure/src/main/java/io/github/p4suta/despeckle/infrastructure/report/HtmlReporter.java @@ -88,11 +88,13 @@ public void addPage(Path relativeStem, Path inputImage, Path outputImage, Proces } writeOverlayAndAccumulate(beforeImg, afterImg, overlayWebp); + // The report path always runs with component counting on (DespeckleService enables it + // whenever a reportDir is set), so the counts are present here; orElse(0) is type-driven. stats.add( new PageStat( stem, - result.componentsBefore(), - result.componentsAfter(), + result.componentsBefore().orElse(0), + result.componentsAfter().orElse(0), result.removedBlackPixelRatio())); } diff --git a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java index 4bd4523..15d3cef 100644 --- a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java +++ b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/leptonica/LeptonicaPageCleanerTest.java @@ -1,6 +1,7 @@ package io.github.p4suta.despeckle.infrastructure.leptonica; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import io.github.p4suta.despeckle.domain.model.OutputFormat; @@ -231,4 +232,37 @@ void fillHolesSparesTheGapInsideAThinWalledGlyph(@TempDir Path dir) throws Excep ring, cleaned.blackPixels(), "the thin-walled gap is preserved, not filled"); } } + + @Test + void skippedComponentCountingStillCleansAndGuardsOverRemoval(@TempDir Path dir) + throws Exception { + // The same speck-removal scenario as removesSpecksButPreservesGlyph, with counting off: + // the output is identical, the result carries no component stats, and the always-measured + // black-pixel math still drives the over-removal guard. + Path src = dir.resolve("page.pbm"); + Path out = dir.resolve("page-out.pbm"); + boolean[][] img = TestImages.blank(40, 40); + TestImages.fillRect(img, 8, 8, 19, 25); + TestImages.dot(img, 2, 2); + TestImages.dot(img, 35, 30); + TestImages.dot(img, 30, 4); + TestImages.writePbm(src, img); + + ProcessResult result = + cleaner.clean( + src, + out, + OutputFormat.PBM, + ProcessOptions.of(OptionalInt.of(300), OptionalInt.of(3), false) + .withoutComponentStats()); + + assertFalse(result.hasComponentStats()); + assertEquals(0, result.componentsRemoved(), "absent counts read as zero"); + assertEquals(3L, result.blackPixelsRemoved(), "three 1px specks gone"); + assertFalse(result.isOverRemoval(), "3 of 219 black pixels is under the 3% threshold"); + try (Pix cleaned = Pix.read(out)) { + assertEquals(1, cleaned.connectedComponents()); + assertEquals(12L * 18L, cleaned.blackPixels(), "the whole glyph survives intact"); + } + } } diff --git a/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java new file mode 100644 index 0000000..6811533 --- /dev/null +++ b/despeckle/infrastructure/src/test/java/io/github/p4suta/despeckle/infrastructure/tools/CleanerBenchmark.java @@ -0,0 +1,360 @@ +package io.github.p4suta.despeckle.infrastructure.tools; + +import io.github.p4suta.despeckle.domain.model.OutputFormat; +import io.github.p4suta.despeckle.domain.model.ProcessOptions; +import io.github.p4suta.despeckle.infrastructure.leptonica.LeptonicaPageCleaner; +import io.github.p4suta.despeckle.testsupport.TestImages; +import io.github.p4suta.shared.imaging.Pix; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Random; + +/** + * Op-level micro-benchmark for the despeckle page cleaner: times each Leptonica primitive {@link + * LeptonicaPageCleaner} composes, plus the end-to-end {@code clean()}, on a deterministic synthetic + * 600-dpi A5 page (3496×4961 px — the pdfbook pipeline's real page geometry). Writes a Markdown + * table to {@code despeckle/docs/cleaner-baseline.md} so every optimization claim is judged against + * a committed measurement, mirroring the pipeline's {@code benchPipeline} convention. + * + *

Each op row reports the median and minimum of N timed reps (after warmup), the number of times + * {@code clean()} invokes that op per page with default options, and the estimated share of the + * end-to-end time the op therefore accounts for. The Σ(median × calls) sanity row should land near + * the {@code clean()} median; a large gap means an untimed cost (allocation, G4 codec internals). + * + *

Test-sources tool (driven by the {@code benchCleaner} Gradle task): never ships, and needs the + * dev container's Leptonica. Usage: {@code CleanerBenchmark [reps]}. + */ +public final class CleanerBenchmark { + + private static final int DPI = 600; + private static final int WIDTH = Math.round(148f * DPI / 25.4f); // A5 portrait + private static final int HEIGHT = Math.round(210f * DPI / 25.4f); + private static final int WARMUP_REPS = 2; + + // The cleaner's literal Leptonica selection flags (pix.h values, see LeptonicaPageCleaner). + private static final int CONN_8 = 8; + private static final int IF_EITHER = 5; + private static final int IF_GT = 2; + + // Default-derived sizes at 600 dpi: speck k=6, isolated dust 15, proximity 21, stroke 3. + private static final int SPECK = 6; + private static final int DUST = 15; + private static final int PROXIMITY = 21; + private static final int STROKE = 3; + + private CleanerBenchmark() {} + + /** One measured op: its label, how often clean() runs it, and the timed reps. */ + private record Row(String op, int callsPerClean, double medianMs, double minMs) {} + + public static void main(String[] args) throws IOException { + Path outDoc = Path.of(args.length > 0 ? args[0] : "despeckle/docs/cleaner-baseline.md"); + int reps = args.length > 1 ? Integer.parseInt(args[1]) : 10; + + Path work = Files.createTempDirectory("cleaner-bench"); + try { + Path pbm = work.resolve("page.pbm"); + TestImages.writePbm(pbm, syntheticPage()); + Path g4 = work.resolve("page.tif"); + try (Pix page = Pix.read(pbm)) { + page.setResolution(DPI); + page.writeTiffG4(g4); + } + + List rows = new ArrayList<>(); + try (Pix page = Pix.read(g4); + Pix inverted = page.inverted(); + Pix text6 = page.selectBySize(SPECK, SPECK, CONN_8, IF_EITHER, IF_GT); + Pix text15 = page.selectBySize(DUST, DUST, CONN_8, IF_EITHER, IF_GT)) { + + rows.add(timePix("read TIFF-G4", 1, reps, () -> Pix.read(g4))); + rows.add( + timePix( + "selectBySize k=6 (page)", + 1, + reps, + () -> page.selectBySize(SPECK, SPECK, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "selectBySize 15 (page)", + 1, + reps, + () -> page.selectBySize(DUST, DUST, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "selectBySize k=6 (inverted)", + 2, + reps, + () -> + inverted.selectBySize( + SPECK, SPECK, CONN_8, IF_EITHER, IF_GT))); + rows.add( + timePix( + "dilate 43x43 (text mask)", + 1, + reps, + () -> text15.dilated(PROXIMITY))); + rows.add(timePix("open 7x7 (page)", 1, reps, () -> page.opened(STROKE))); + rows.add(timePix("invert", 2, reps, page::inverted)); + rows.add(timePix("subtract", 5, reps, () -> page.subtract(text6))); + rows.add(timePix("and", 1, reps, () -> page.and(text6))); + rows.add(timePix("or", 3, reps, () -> page.or(text6))); + rows.add(timeVoid("countConnComp", 2, reps, page::connectedComponents)); + rows.add(timeVoid("countPixels", 2, reps, page::blackPixels)); + Path scratchTif = work.resolve("write.tif"); + rows.add(timeVoid("write TIFF-G4", 1, reps, () -> page.writeTiffG4(scratchTif))); + } + + LeptonicaPageCleaner cleaner = new LeptonicaPageCleaner(); + Path cleaned = work.resolve("cleaned.tif"); + Row clean = + timeVoid( + "clean() end-to-end", + 1, + reps, + () -> cleaner.clean(g4, cleaned, OutputFormat.TIFF, options())); + // The pipeline's actual configuration (no report -> component counting skipped). + Row cleanNoStats = + timeVoid( + "clean() without component stats", + 1, + reps, + () -> + cleaner.clean( + g4, + cleaned, + OutputFormat.TIFF, + options().withoutComponentStats())); + + String report = render(rows, clean, cleanNoStats, reps); + Path parent = outDoc.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } finally { + deleteTree(work); + } + } + + private static ProcessOptions options() { + // The pipeline's exact configuration: defaults (fillHoles + isolatedDust on) at a known + // dpi, so the derived sizes match the op rows above. + return ProcessOptions.defaults().withDpi(DPI); + } + + // ---- timing ---- + + @FunctionalInterface + private interface PixOp { + Pix run() throws IOException; + } + + @FunctionalInterface + private interface VoidOp { + void run() throws IOException; + } + + /** Times an op returning a Pix; the result's close() happens outside the timed window. */ + private static Row timePix(String op, int calls, int reps, PixOp body) throws IOException { + double[] samples = new double[reps]; + for (int i = -WARMUP_REPS; i < reps; i++) { + long start = System.nanoTime(); + Pix result = body.run(); + long elapsed = System.nanoTime() - start; + result.close(); + if (i >= 0) { + samples[i] = elapsed / 1e6; + } + } + return new Row(op, calls, median(samples), Arrays.stream(samples).min().orElse(0)); + } + + private static Row timeVoid(String op, int calls, int reps, VoidOp body) throws IOException { + double[] samples = new double[reps]; + for (int i = -WARMUP_REPS; i < reps; i++) { + long start = System.nanoTime(); + body.run(); + long elapsed = System.nanoTime() - start; + if (i >= 0) { + samples[i] = elapsed / 1e6; + } + } + return new Row(op, calls, median(samples), Arrays.stream(samples).min().orElse(0)); + } + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + // ---- the synthetic page ---- + + /** + * A deterministic 600-dpi A5 "scan": right-to-left glyph columns (despeckle's protected text), + * 1–3 px dust everywhere (the speck filter's work), isolated 8–12 px blots in the margins (the + * isolated-dust pass's work), and pin-holes punched into glyph blocks (the fill-holes pass's + * work). Fixed seed, so the baseline is reproducible. + */ + private static boolean[][] syntheticPage() { + boolean[][] img = TestImages.blank(WIDTH, HEIGHT); + Random random = new Random(42); + + int margin = WIDTH / 10; + int glyph = Math.max(4, WIDTH / 60); + int leading = glyph / 2; + int top = HEIGHT / 12; + int bottom = HEIGHT - HEIGHT / 12; + int glyphIndex = 0; + for (int x = WIDTH - margin - glyph; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + TestImages.fillRect(img, x, y, x + glyph - 3, y + glyph - 3); + // Every ~40th glyph carries a pin-hole, so fillHoles has real work. + if (glyphIndex % 40 == 0) { + int hx = x + glyph / 2; + int hy = y + glyph / 2; + img[hy][hx] = false; + img[hy][hx + 1] = false; + img[hy + 1][hx] = false; + img[hy + 1][hx + 1] = false; + } + glyphIndex++; + y += glyph; + } + } + + // Salt-and-pepper dust (1–3 px), what the main speck filter removes. + int specks = WIDTH * HEIGHT / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + int x = random.nextInt(WIDTH - size); + int y = random.nextInt(HEIGHT - size); + TestImages.fillRect(img, x, y, x + size - 1, y + size - 1); + } + + // Isolated 8–12 px blots out in the margins, what the isolated-dust pass removes. + for (int i = 0; i < 20; i++) { + int size = 8 + random.nextInt(5); + int x = + random.nextBoolean() + ? random.nextInt(margin - size) + : WIDTH - margin + random.nextInt(margin - size); + int y = top + random.nextInt(bottom - top - size); + TestImages.fillRect(img, x, y, x + size - 1, y + size - 1); + } + return img; + } + + // ---- rendering ---- + + private static String render(List rows, Row clean, Row cleanNoStats, int reps) { + var sb = new StringBuilder(); + sb.append("# despeckle cleaner op-level baseline\n\n") + .append("Generated by `CleanerBenchmark`") + .append(" (`./gradlew :despeckle:infrastructure:benchCleaner`, dev container).\n") + .append("Times each Leptonica primitive the page cleaner composes on a synthetic\n") + .append("600-dpi A5 page (") + .append(WIDTH) + .append("x") + .append(HEIGHT) + .append(" px, fixed seed). Re-run after any change to the\n") + .append("cleaner or the imaging bindings and compare before merging.\n\n"); + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs\n"); + sb.append("- Samples: median of ") + .append(reps) + .append(" reps after ") + .append(WARMUP_REPS) + .append(" warmups; single-threaded.\n\n"); + + sb.append("| op | median (ms) | min (ms) | calls/clean() | est. share of clean() |\n"); + sb.append("|---|---:|---:|---:|---:|\n"); + double estimatedTotal = 0; + for (Row row : rows) { + double estimated = row.medianMs() * row.callsPerClean(); + estimatedTotal += estimated; + sb.append( + String.format( + Locale.ROOT, + "| %s | %.2f | %.2f | %d | %.1f%% |%n", + row.op(), + row.medianMs(), + row.minMs(), + row.callsPerClean(), + estimated * 100.0 / clean.medianMs())); + } + sb.append( + String.format( + Locale.ROOT, + "| **Σ(median × calls)** | %.2f | | | %.1f%% |%n", + estimatedTotal, + estimatedTotal * 100.0 / clean.medianMs())); + sb.append( + String.format( + Locale.ROOT, + "| **clean() end-to-end** | %.2f | %.2f | 1 | 100%% |%n", + clean.medianMs(), + clean.minMs())); + sb.append( + String.format( + Locale.ROOT, + "| **clean() without component stats** | %.2f | %.2f | 1 | %.1f%% |%n", + cleanNoStats.medianMs(), + cleanNoStats.minMs(), + cleanNoStats.medianMs() * 100.0 / clean.medianMs())); + sb.append( + "\nThe Σ row landing near 100% means the table accounts for clean()'s real" + + " cost;\n") + .append( + "a large gap points at untimed work (allocation churn, codec" + + " internals).\n"); + return sb.toString(); + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java index 7db4260..a7549e5 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/Jbig2Assembler.java @@ -3,13 +3,12 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** * Packs a directory of cleaned bitonal pages into a lossless-JBIG2 PDF. The abstraction over the * {@code jbig2} encoder + PDFBox container; the implementation ({@code - * infrastructure.pdf.PdfBoxJbig2Assembler}) runs the per-page encodes on the supplied pool. + * infrastructure.pdf.PdfBoxJbig2Assembler}) runs at most {@code jobs} per-page encodes at once. */ public interface Jbig2Assembler { @@ -20,7 +19,7 @@ public interface Jbig2Assembler { * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page encodes run on + * @param jobs how many per-page encodes may run at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -30,7 +29,7 @@ void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException; } diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java index 77a4969..f80348b 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/PdfImageExtractor.java @@ -2,13 +2,12 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Extracts a PDF's embedded bitonal images, and reports its dominant scan resolution. The * abstraction over the {@code pdfimages}/{@code pdfinfo} drivers; the implementation ({@code - * infrastructure.pdf.PdfImagesCliExtractor}) splits the page range across the supplied pool, one - * {@code pdfimages} invocation per chunk. + * infrastructure.pdf.PdfImagesCliExtractor}) splits the page range into bounded chunks, one {@code + * pdfimages} invocation per chunk. */ public interface PdfImageExtractor { @@ -17,7 +16,7 @@ public interface PdfImageExtractor { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ - void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException; + void extract(Path pdf, Path outDir, int jobs) throws IOException; } diff --git a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java index 4c8bdf4..23f5510 100644 --- a/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java +++ b/despeckle/port/src/main/java/io/github/p4suta/despeckle/port/package-info.java @@ -2,9 +2,9 @@ * The despeckle ports: the narrow interfaces the application layer drives and the infrastructure * layer implements. Every signature is expressed purely in terms of {@code domain.model} value * types and JDK types ({@link java.nio.file.Path}, primitives, {@link java.util.OptionalInt}, - * {@link java.util.List}, {@link java.util.concurrent.ExecutorService}, {@link - * java.io.IOException}); no Leptonica {@code Pix}, PDFBox or AWT type ever crosses a port boundary, - * so the dependency is kept unidirectional and the adapters stay swappable. + * {@link java.util.List}, {@link java.io.IOException}); no Leptonica {@code Pix}, PDFBox or AWT + * type ever crosses a port boundary, so the dependency is kept unidirectional and the adapters stay + * swappable. */ @NullMarked package io.github.p4suta.despeckle.port; diff --git a/pipeline/README.md b/pipeline/README.md index b8d47ef..12d15ef 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -33,6 +33,7 @@ never stops the rest; existing outputs are skipped unless `--force`). | `--pdf-a` | off | emit PDF/A-2b conformance | | `--force` | off | overwrite an existing output (batch: regenerate, don't skip) | | `--progress-file ` | — | write machine-readable JSONL progress events (single input only) | +| `--timings` | off | print a per-stage wall-clock breakdown to stderr when each run ends | | `-i, --interactive` | off | guided mode: prompt for the input, options and output | | `-h, --help` | — | show help and exit | | `-V, --version` | — | print version and exit | diff --git a/pipeline/app/build.gradle.kts b/pipeline/app/build.gradle.kts index 9764cfe..5c2572d 100644 --- a/pipeline/app/build.gradle.kts +++ b/pipeline/app/build.gradle.kts @@ -34,6 +34,10 @@ dependencies { implementation(libs.commons.cli) implementation(libs.slf4j.api) runtimeOnly(libs.slf4j.simple) + + // The benchmark fixture generator (test sources, never shipped — mirroring register's + // createSamplePdf) draws synthetic scan pages with PDFBox directly. + testImplementation(libs.pdfbox) } // The one place native access is granted to the launched app; run, test and JavaExec inherit it. @@ -85,3 +89,46 @@ selfContainedApp { // jbig2 (its register stage writes TIFF-G4; the spread pack embeds CCITT G4). bundleQpdf(this, libs.versions.qpdf.get()) } + +// ---- Stage-level benchmark (see pipeline/docs/perf-baseline.md) --------------------------------- + +// Deterministic synthetic scan book for the benchmark: an existing output is reused, so the +// generation cost (a minute at 200 pages × 600 dpi) is paid once. Knob: -Ppages=N (default 200). +tasks.register("createSampleScan") { + group = "verification" + description = "Generate the synthetic bitonal scan book the benchmark converts (cached)" + dependsOn(tasks.named("testClasses")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.SampleScanGenerator" + val pages = providers.gradleProperty("pages").getOrElse("200") + args = listOf("build/test-data/sample-scan-${pages}p.pdf", pages, "600") +} + +// Stage-level runtime + memory benchmark (the pdfbook counterpart of tate's benchRuntime): runs the +// installDist launcher in-container with --timings, parses the per-stage breakdown, samples peak +// RSS from /proc, and writes pipeline/docs/perf-baseline.md. Knobs: -Pruns=N (warm runs, default +// 3), -Pjobs=1,8 (comma-separated -j sweep; default auto = the launcher's CPU-count default), +// -Ppages=N (fixture size, default 200), -Pinputs="a.pdf b.pdf" (real books instead of the +// fixture; resolved against the repo root). +tasks.register("benchPipeline") { + group = "verification" + description = "Benchmark pdfbook stage timings + peak memory; writes pipeline/docs/perf-baseline.md" + dependsOn(tasks.named("installDist"), tasks.named("createSampleScan")) + classpath = sourceSets["test"].runtimeClasspath + mainClass = "io.github.p4suta.pipeline.tools.PipelineBenchmark" + workingDir = rootDir + val runs = providers.gradleProperty("runs").getOrElse("3") + val jobs = providers.gradleProperty("jobs").getOrElse("auto") + val pages = providers.gradleProperty("pages").getOrElse("200") + val extraInputs = + providers + .gradleProperty("inputs") + .orNull + ?.split(Regex("\\s+")) + ?.filter { it.isNotBlank() } + ?: emptyList() + val launcher = "pipeline/app/build/install/pdfbook/bin/pdfbook" + val inputs = + extraInputs.ifEmpty { listOf("pipeline/app/build/test-data/sample-scan-${pages}p.pdf") } + args = listOf(launcher, "qpdf", "pipeline/docs/perf-baseline.md", runs, jobs) + inputs +} diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java index 67b1db9..81ccc68 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/Main.java @@ -2,20 +2,62 @@ import io.github.p4suta.pipeline.cli.PipelineCommand; import io.github.p4suta.shared.observability.FatalUncaughtHandler; +import java.time.Duration; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; /** Process entry point for the unified pipeline tool ({@code pdfbook}). */ public final class Main { + /** + * How long the shutdown hook waits for the interrupted run to unwind. Generous: the page + * fan-outs quiesce in at most one page's worth of native work per worker, and the extractor + * kills its children on interrupt. + */ + private static final Duration SHUTDOWN_GRACE = Duration.ofSeconds(15); + private Main() {} /** - * Installs the fatal uncaught-exception handler, runs the CLI, and translates the result into a - * process exit code. All argument parsing and terminal output live in {@link PipelineCommand}. + * Installs the fatal uncaught-exception handler and the Ctrl-C cleanup hook, runs the CLI, and + * translates the result into a process exit code. All argument parsing and terminal output live + * in {@link PipelineCommand}. + * + *

On Ctrl-C (or any normal-termination signal) the shutdown hook interrupts the main thread + * and waits — bounded by {@link #SHUTDOWN_GRACE} — for the run to unwind: the page fan-outs + * stop their workers and quiesce, subprocesses are killed, and the run's {@code finally} blocks + * delete the temp work area. A Ctrl-C therefore no longer leaks a {@code p4suta-pipeline-} + * directory full of intermediates. On a normal exit the latch is already down, so the hook + * returns immediately. * * @param args command-line arguments */ public static void main(String[] args) { Thread.setDefaultUncaughtExceptionHandler(new FatalUncaughtHandler()); - System.exit(new PipelineCommand().run(args)); + CountDownLatch unwound = new CountDownLatch(1); + Thread main = Thread.currentThread(); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + main.interrupt(); + try { + boolean clean = + unwound.await( + SHUTDOWN_GRACE.toMillis(), + TimeUnit.MILLISECONDS); + if (!clean) { + System.err.println( + "pdfbook: shutdown grace expired; temp files" + + " may remain"); + } + } catch (InterruptedException ignored) { + // The JVM is exiting anyway; stop waiting. + } + }, + "pdfbook-shutdown")); + int code = new PipelineCommand().run(args); + unwound.countDown(); + System.exit(code); } } diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java index 99e453c..25f8125 100644 --- a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/PipelineCommand.java @@ -153,6 +153,13 @@ private static Options buildOptions() { "Write machine-readable JSONL progress events to this file (single" + " input only); used by front ends to report progress.") .get()); + options.addOption( + Option.builder() + .longOpt("timings") + .desc( + "Print a per-stage wall-clock breakdown to stderr when each run" + + " ends.") + .get()); CliDocs.options(options); return options; } @@ -351,6 +358,7 @@ record Plan(Path input, Path output, Config config) {} deskew, scale, pdfA, + false, force); return new Plan(input, output, config); } @@ -380,14 +388,32 @@ private static String defaultOutput(Path input) { private static void runOne(Path input, Path output, Config config, @Nullable Path progressFile) throws IOException { if (progressFile == null) { - runWith(input, output, config, ProgressSink.NO_OP); + runWith(input, output, config, withTimings(config, ProgressSink.NO_OP)); } else { try (JsonlFileProgressSink progress = new JsonlFileProgressSink(progressFile)) { - runWith(input, output, config, progress); + runWith(input, output, config, withTimings(config, progress)); } } } + /** + * Wraps {@code sink} with a fresh {@link StageTimingSink} when {@code --timings} is set, so + * each run (every book of a batch separately) prints its own per-stage breakdown to stderr. + */ + private static ProgressSink withTimings(Config config, ProgressSink sink) { + if (!config.timings()) { + return sink; + } + StageTimingSink timings = new StageTimingSink(System.err); + if (sink == ProgressSink.NO_OP) { + return timings; + } + return event -> { + sink.emit(event); + timings.emit(event); + }; + } + // Resolves the progress sink first so the stages and sink report page-level PageProcessed // events into the same sink PipelineRunner reports stage boundaries into. With no // --progress-file the sink is NO_OP and every emit is a no-op. @@ -401,9 +427,11 @@ private static void runWith(Path input, Path output, Config config, ProgressSink stages.add(new RegisterStage(config.jobs(), config.deskew(), config.scale(), progress)); } if (stages.isEmpty()) { - // --no-despeckle --no-register: the raw pdfimages TIFFs are not CCITT G4, which the - // spread sink's pass-through embedding requires; despeckle/register each re-encode G4 - // themselves, so only the no-stage path needs this normalization. + // --no-despeckle --no-register: a non-CCITT source extracts as decoded TIFFs that are + // not the single-strip CCITT G4 the spread sink's pass-through embedding requires; + // despeckle/register each re-encode G4 themselves, so only the no-stage path needs + // this normalization (an all-CCITT source arrives already G4 — then this is a cheap + // lossless re-encode that keeps the path uniform). stages.add(new G4EncodeStage(config.jobs(), progress)); } Source source = new PdfExtractSource(input, config.jobs()); @@ -447,6 +475,7 @@ private static Config parseConfig(CommandLine cmd) throws ParseException { !cmd.hasOption("no-deskew"), !cmd.hasOption("no-scale"), cmd.hasOption("pdf-a"), + cmd.hasOption("timings"), cmd.hasOption("force")); } @@ -475,5 +504,6 @@ record Config( boolean deskew, boolean scale, boolean pdfA, + boolean timings, boolean force) {} } diff --git a/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java new file mode 100644 index 0000000..496b0e4 --- /dev/null +++ b/pipeline/app/src/main/java/io/github/p4suta/pipeline/cli/StageTimingSink.java @@ -0,0 +1,93 @@ +package io.github.p4suta.pipeline.cli; + +import io.github.p4suta.shared.progress.ProgressEvent; +import io.github.p4suta.shared.progress.ProgressSink; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.jspecify.annotations.Nullable; + +/** + * Measures each stage's wall clock from its {@link ProgressEvent.StageStarted}/{@link + * ProgressEvent.StageCompleted} boundaries and prints a per-stage breakdown when the run ends — the + * {@code --timings} flag's implementation. One line per stage in completion order, then the + * run-wide total: + * + *

{@code
+ * timing: extract = 4.21s (18.3%)
+ * timing: despeckle = 9.87s (42.9%)
+ * timing: total = 23.01s
+ * }
+ * + *

The {@code timing: = s} shape is a stable contract the {@code benchPipeline} + * harness parses; keep it machine-readable. A stage still open when the run fails is reported with + * its elapsed-so-far, so a failed run still shows where the time went. Thread-safe like every + * {@link ProgressSink}: events are handled under one lock. + */ +final class StageTimingSink implements ProgressSink { + + private final PrintStream out; + private final Object lock = new Object(); + private final List stages = new ArrayList<>(); + private final List stageNanos = new ArrayList<>(); + private @Nullable String openStage; + private long openedAtNanos; + private long runStartedAtNanos; + private boolean runStarted; + + StageTimingSink(PrintStream out) { + this.out = out; + } + + @Override + public void emit(ProgressEvent event) { + synchronized (lock) { + switch (event) { + case ProgressEvent.RunStarted ignored -> markRunStarted(); + case ProgressEvent.StageStarted s -> { + // Defensive: a sink wired mid-run still measures from the first boundary. + markRunStarted(); + openStage = s.stage(); + openedAtNanos = System.nanoTime(); + } + case ProgressEvent.StageCompleted ignored -> closeOpenStage(); + case ProgressEvent.PageProcessed ignored -> { + // Stage boundaries carry all the timing information. + } + case ProgressEvent.RunCompleted ignored -> report(); + case ProgressEvent.RunFailed ignored -> report(); + } + } + } + + private void markRunStarted() { + if (!runStarted) { + runStartedAtNanos = System.nanoTime(); + runStarted = true; + } + } + + private void closeOpenStage() { + @Nullable String stage = openStage; + if (stage != null) { + stages.add(stage); + stageNanos.add(System.nanoTime() - openedAtNanos); + openStage = null; + } + } + + private void report() { + closeOpenStage(); + long totalNanos = runStarted ? System.nanoTime() - runStartedAtNanos : 0; + for (int i = 0; i < stages.size(); i++) { + out.printf( + Locale.ROOT, + "timing: %s = %.2fs (%.1f%%)%n", + stages.get(i), + stageNanos.get(i) / 1e9, + totalNanos > 0 ? stageNanos.get(i) * 100.0 / totalNanos : 0.0); + } + out.printf(Locale.ROOT, "timing: total = %.2fs%n", totalNanos / 1e9); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java new file mode 100644 index 0000000..3fbb3f7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/cli/StageTimingSinkTest.java @@ -0,0 +1,75 @@ +package io.github.p4suta.pipeline.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.progress.ProgressEvent; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import org.junit.jupiter.api.Test; + +/** + * Pins the {@code --timings} report: one machine-parseable {@code timing: = s} + * line per completed stage (in completion order, percentages attached) plus a {@code timing: total} + * line, printed only when the run ends — and on failure, the still-open stage is reported with its + * elapsed-so-far. The line shape is the contract the {@code benchPipeline} harness parses. + */ +final class StageTimingSinkTest { + + private final ByteArrayOutputStream buf = new ByteArrayOutputStream(); + private final StageTimingSink sink = + new StageTimingSink(new PrintStream(buf, true, StandardCharsets.UTF_8)); + + private String output() { + return buf.toString(StandardCharsets.UTF_8); + } + + @Test + void completedRunReportsEachStageInOrderAndATotal() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.PageProcessed("extract", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("spread", 1, 2)); + sink.emit(new ProgressEvent.StageCompleted("spread")); + sink.emit(new ProgressEvent.RunCompleted()); + + assertThat(output().lines()) + .hasSize(3) + .satisfiesExactly( + extract -> + assertThat(extract) + .matches( + "timing: extract = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + spread -> + assertThat(spread) + .matches( + "timing: spread = \\d+\\.\\d{2}s" + + " \\(\\d+\\.\\d%\\)"), + total -> assertThat(total).matches("timing: total = \\d+\\.\\d{2}s")); + } + + @Test + void nothingIsPrintedBeforeTheRunEnds() { + sink.emit(new ProgressEvent.RunStarted(1)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 1)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + + assertThat(output()).isEmpty(); + } + + @Test + void failedRunReportsTheStillOpenStage() { + sink.emit(new ProgressEvent.RunStarted(2)); + sink.emit(new ProgressEvent.StageStarted("extract", 0, 2)); + sink.emit(new ProgressEvent.StageCompleted("extract")); + sink.emit(new ProgressEvent.StageStarted("register", 1, 2)); + sink.emit(new ProgressEvent.RunFailed("INTERNAL", "boom")); + + assertThat(output()) + .contains("timing: extract = ") + .contains("timing: register = ") + .contains("timing: total = "); + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java new file mode 100644 index 0000000..1b59397 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/PipelineBenchmark.java @@ -0,0 +1,498 @@ +package io.github.p4suta.pipeline.tools; + +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Stage-level runtime + memory benchmark for the installed {@code pdfbook} launcher — the pdfbook + * counterpart of tate's {@code RuntimeBenchmark}, with per-stage attribution as the addition. + * + *

Drives {@code pdfbook -o /out.pdf --force --timings [-j N]} as a child process, + * measuring end-to-end wall around the process ({@link System#nanoTime()}), peak RSS by sampling + * the child's {@code /proc//status} {@code VmHWM} (Linux-only; {@code n/a} elsewhere), and the + * per-stage wall by parsing the stable {@code timing: = s} lines {@code + * StageTimingSink} prints. Writes a Markdown report. + * + *

Test-sources tool (driven by the {@code benchPipeline} Gradle task): it never ships in the + * production launcher, and it expects the dev container's native toolchain (pdfimages, Leptonica, + * qpdf) on PATH — the installDist launcher, not the jpackage image, is what it measures. + * + *

Usage: {@code PipelineBenchmark + * ...} — {@code jobsCsv} is a comma-separated {@code -j} sweep ({@code auto} = omit + * {@code -j}, i.e. the launcher's CPU-count default). + */ +public final class PipelineBenchmark { + + private static final Pattern TIMING = + Pattern.compile("^timing: (\\S+) = ([0-9.]+)s", Pattern.MULTILINE); + private static final Pattern VM_HWM = Pattern.compile("VmHWM:\\s*([0-9]+)"); + private static final long POLL_MILLIS = 5; + private static final long PROCESS_TIMEOUT_NANOS = TimeUnit.MINUTES.toNanos(30); + private static final long MIB = 1024L * 1024L; + + private final Path launcher; + private final String qpdf; + private final Path outDoc; + private final int runs; + private final List jobsSweep; + + private PipelineBenchmark( + Path launcher, String qpdf, Path outDoc, int runs, List jobsSweep) { + this.launcher = launcher; + this.qpdf = qpdf; + this.outDoc = outDoc; + this.runs = runs; + this.jobsSweep = jobsSweep; + } + + public static void main(String[] args) throws IOException, InterruptedException { + if (args.length < 6) { + System.err.println( + "usage: PipelineBenchmark " + + " ..."); + System.exit(2); + return; + } + var benchmark = + new PipelineBenchmark( + Path.of(args[0]), + args[1], + Path.of(args[2]), + Integer.parseInt(args[3]), + Arrays.stream(args[4].split(",")).map(String::trim).toList()); + List inputs = Arrays.stream(args).skip(5).map(Path::of).toList(); + benchmark.run(inputs); + } + + // Result records + + /** One measured child run: wall seconds, peak RSS (KiB, -1 if unavailable), merged output. */ + private record Timed(double elapsedSeconds, long maxRssKib, String output) {} + + /** A finished input × jobs measurement, ready to render. */ + private record Row( + String name, + String jobs, + int pages, + long inputBytes, + double wallMedian, + double coldWall, + Map stageMedians, + long rssMedianKib, + long outputBytes) {} + + // Orchestration + + private void run(List inputs) throws IOException, InterruptedException { + requireExecutable(launcher, "pdfbook launcher", "build it first: just pdfbook-install"); + + List rows = new ArrayList<>(); + for (Path input : inputs) { + if (!Files.isRegularFile(input)) { + System.err.println("skip (not found): " + input); + continue; + } + for (String jobs : jobsSweep) { + rows.add(measure(input, jobs)); + } + } + + String report = render(rows); + Files.createDirectories(requireParent(outDoc)); + Files.writeString(outDoc, report, StandardCharsets.UTF_8); + System.out.print(report); + System.err.println(); + System.err.println("→ wrote " + outDoc); + } + + private Row measure(Path input, String jobs) throws IOException, InterruptedException { + int pages = pageCount(input); + long inputBytes = Files.size(input); + System.err.printf( + Locale.ROOT, + "Measuring: %s (%dp, %s MiB, jobs=%s)…%n", + fileName(input), + pages, + mib(inputBytes), + jobs); + + Path work = Files.createTempDirectory("pdfbook-bench"); + try { + Path out = work.resolve("out.pdf"); + List convert = new ArrayList<>(); + convert.add(launcher.toString()); + convert.add(input.toString()); + convert.add("-o"); + convert.add(out.toString()); + convert.add("--force"); + convert.add("--timings"); + if (!"auto".equals(jobs)) { + convert.add("-j"); + convert.add(jobs); + } + + // Cold run (fresh page cache for the input is not guaranteed, but a fresh JVM is) — + // recorded separately from the warm median. + Timed cold = timed(convert); + + double[] walls = new double[runs]; + long[] rsss = new long[runs]; + Map> stages = new LinkedHashMap<>(); + for (int r = 0; r < runs; r++) { + Timed t = timed(convert); + walls[r] = t.elapsedSeconds(); + rsss[r] = t.maxRssKib(); + parseTimings(t.output()) + .forEach( + (stage, seconds) -> + stages.computeIfAbsent(stage, ignored -> new ArrayList<>()) + .add(seconds)); + } + Map stageMedians = new LinkedHashMap<>(); + stages.forEach((stage, seconds) -> stageMedians.put(stage, median(seconds))); + + long outputBytes = Files.isRegularFile(out) ? Files.size(out) : -1; + return new Row( + fileName(input), + jobs, + pages, + inputBytes, + median(walls), + cold.elapsedSeconds(), + stageMedians, + medianLong(rsss), + outputBytes); + } finally { + deleteTree(work); + } + } + + /** + * The per-stage seconds of one run, keyed by stage label in print order ({@code total} + * included). Repeated labels (a batch run) sum, though the harness always converts one book. + */ + private static Map parseTimings(String output) { + Map timings = new LinkedHashMap<>(); + Matcher m = TIMING.matcher(output); + while (m.find()) { + timings.merge(m.group(1), Double.parseDouble(m.group(2)), Double::sum); + } + return timings; + } + + // Subprocess measurement + + /** + * Runs {@code command}, returning its wall time, peak RSS (sampled from {@code + * /proc//status} {@code VmHWM}; -1 where unavailable), and its merged stdout+stderr (so + * the {@code timing:} lines can be parsed). Output is drained on a separate thread so a chatty + * child cannot deadlock on a full pipe. + */ + private static Timed timed(List command) throws IOException, InterruptedException { + long start = System.nanoTime(); + Process process = new ProcessBuilder(command).redirectErrorStream(true).start(); + + var captured = new AtomicReference<>(""); + Thread drainer = + Thread.ofVirtual() + .start( + () -> { + try (var in = process.getInputStream()) { + captured.set( + new String( + in.readAllBytes(), StandardCharsets.UTF_8)); + } catch (IOException ignored) { + // Process gone; whatever was read is lost — acceptable for + // a benchmark. + } + }); + + Path status = Path.of("/proc", Long.toString(process.pid()), "status"); + long peakRssKib = -1; + while (process.isAlive()) { + if (System.nanoTime() - start > PROCESS_TIMEOUT_NANOS) { + process.destroyForcibly(); + throw new IOException("timed command did not finish: " + command); + } + peakRssKib = Math.max(peakRssKib, readVmHwmKib(status)); + Thread.sleep(POLL_MILLIS); + } + double elapsed = (System.nanoTime() - start) / 1.0e9; + int exit = process.waitFor(); + drainer.join(); + if (exit != 0) { + throw new IOException( + "benchmark child failed with exit " + exit + ": " + captured.get()); + } + return new Timed(elapsed, peakRssKib, captured.get()); + } + + /** Peak RSS (KiB) from {@code /proc//status}, or -1 if unreadable / non-Linux. */ + private static long readVmHwmKib(Path status) { + try { + Matcher m = VM_HWM.matcher(Files.readString(status, StandardCharsets.UTF_8)); + return m.find() ? Long.parseLong(m.group(1)) : -1; + } catch (IOException | RuntimeException e) { + return -1; // process already exited, or /proc not present + } + } + + /** Page count via {@code qpdf --show-npages} (PATH or absolute), or -1 when unavailable. */ + private int pageCount(Path pdf) throws InterruptedException { + try { + Process process = + new ProcessBuilder(qpdf, "--show-npages", pdf.toString()) + .redirectErrorStream(true) + .start(); + String output; + try (var in = process.getInputStream()) { + output = new String(in.readAllBytes(), StandardCharsets.UTF_8); + } + if (!process.waitFor(1, TimeUnit.MINUTES)) { + process.destroyForcibly(); + return -1; + } + // The count is the one digits-only line; qpdf may surround it with warning lines. + return output.lines() + .map(String::strip) + .filter(line -> line.matches("\\d+")) + .findFirst() + .map(Integer::parseInt) + .orElse(-1); + } catch (IOException e) { + return -1; // qpdf not installed — page count is cosmetic here + } + } + + // Numeric helpers + + private static double median(double[] values) { + double[] sorted = values.clone(); + Arrays.sort(sorted); + int n = sorted.length; + if (n == 0) { + return 0; + } + return (n % 2 == 1) ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0; + } + + private static double median(List values) { + return median(values.stream().mapToDouble(Double::doubleValue).toArray()); + } + + private static long medianLong(long[] values) { + return Math.round(median(Arrays.stream(values).asDoubleStream().toArray())); + } + + private static String mib(long bytes) { + return bytes < 0 ? "n/a" : String.format(Locale.ROOT, "%.1f", bytes / (double) MIB); + } + + // Rendering + + private String render(List rows) { + // Stage columns: the union of stage labels across rows in first-appearance order, with + // "total" (the launcher's in-process conversion time) pulled out as its own column. + Set stageNames = new LinkedHashSet<>(); + for (Row row : rows) { + stageNames.addAll(row.stageMedians().keySet()); + } + stageNames.remove("total"); + + var sb = new StringBuilder(); + sb.append("# pdfbook runtime baseline (stage-level)\n\n") + .append("Generated by `PipelineBenchmark`") + .append(" (`./gradlew :pipeline:app:benchPipeline`, in the dev container).\n") + .append( + "Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak" + + " memory**\n") + .append( + "of the installDist `pdfbook` launcher. Re-run after any change to the" + + " pipeline\n") + .append( + "and compare against the previous run before merging (acceptance: ≥5%" + + " median\n") + .append( + "total-wall improvement, or an explicit RSS/disk win, with output" + + " validated).\n\n"); + appendHostInfo(sb); + sb.append("\n## Stage breakdown (warm median of ").append(runs).append(" runs)\n\n"); + sb.append( + "`conv` is the launcher's in-process total (`timing: total`);" + + " `startup+init` = E2E wall − conv\n") + .append( + "(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's" + + " CPU-count default.\n\n"); + sb.append("| Input | Jobs | Pages | E2E wall | conv |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append(" startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) |\n"); + sb.append("|---|---|---:|---:|---:|"); + sb.append("---:|".repeat(stageNames.size())); + sb.append("---:|---:|---:|---:|\n"); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ") + .append(row.name()) + .append(" | ") + .append(row.jobs()) + .append(" | ") + .append(pages(row.pages())) + .append(" | ") + .append(secs(row.wallMedian())) + .append(" | ") + .append(conv > 0 ? secs(conv) : "n/a") + .append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ').append(seconds == null ? "n/a" : secs(seconds)).append(" |"); + } + sb.append(' ') + .append(conv > 0 ? secs(Math.max(0, row.wallMedian() - conv)) : "n/a") + .append(" | ") + .append(secs(row.coldWall())) + .append(" | ") + .append(rssMib(row.rssMedianKib())) + .append(" | ") + .append(mib(row.outputBytes())) + .append(" |\n"); + } + sb.append("\n## Stage shares (of conv, warm median)\n\n") + .append( + "The shares that decide where optimization effort goes: a stage that is" + + " ~5% of conv\n") + .append("cannot pay for a parallelization rewrite no matter how elegant.\n\n"); + sb.append("| Input | Jobs |"); + for (String stage : stageNames) { + sb.append(' ').append(stage).append(" |"); + } + sb.append('\n').append("|---|---|").append("---:|".repeat(stageNames.size())).append('\n'); + for (Row row : rows) { + double conv = row.stageMedians().getOrDefault("total", 0.0); + sb.append("| ").append(row.name()).append(" | ").append(row.jobs()).append(" |"); + for (String stage : stageNames) { + Double seconds = row.stageMedians().get(stage); + sb.append(' ') + .append( + seconds == null || conv <= 0 + ? "n/a" + : String.format( + Locale.ROOT, "%.1f%%", seconds * 100.0 / conv)) + .append(" |"); + } + sb.append('\n'); + } + return sb.toString(); + } + + private void appendHostInfo(StringBuilder sb) { + String date = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ROOT) + .withZone(ZoneOffset.UTC) + .format(Instant.now()); + long totalRamBytes = totalPhysicalMemoryBytes(); + sb.append("- Date (UTC): ").append(date).append('\n'); + sb.append("- Host: ") + .append(System.getProperty("os.name", "?")) + .append(' ') + .append(System.getProperty("os.version", "?")) + .append(' ') + .append(System.getProperty("os.arch", "?")) + .append(", ") + .append(Runtime.getRuntime().availableProcessors()) + .append(" CPUs, RAM ") + .append(totalRamBytes > 0 ? Math.round(totalRamBytes / 1.073741824e9) + "Gi" : "?") + .append('\n'); + sb.append("- Launcher: `").append(launcher).append("`\n"); + sb.append("- Samples per measurement: cold (1st run) + warm median of ") + .append(runs) + .append(".\n"); + sb.append( + "- The default input is the deterministic synthetic fixture" + + " (`createSampleScan`,\n") + .append( + " seeded, so identical across machines). Real books are pluggable via" + + " `-Pinputs=\"…\"`;\n") + .append(" only their page count and byte size are reported.\n"); + } + + private static long totalPhysicalMemoryBytes() { + if (ManagementFactory.getOperatingSystemMXBean() + instanceof com.sun.management.OperatingSystemMXBean os) { + return os.getTotalMemorySize(); + } + return -1; + } + + private static String secs(double seconds) { + return String.format(Locale.ROOT, "%.2fs", seconds); + } + + private static String pages(int pages) { + return pages < 0 ? "?" : Integer.toString(pages); + } + + private static String rssMib(long rssKib) { + return rssKib < 0 ? "n/a" : Long.toString(Math.round(rssKib / 1024.0)); + } + + // Small utilities + + private static String fileName(Path path) { + Path name = path.getFileName(); + return name != null ? name.toString() : path.toString(); + } + + private static void requireExecutable(Path path, String what, String hint) { + if (!Files.isExecutable(path)) { + System.err.println("error: " + what + " not found at " + path); + System.err.println(" " + hint); + System.exit(1); + } + } + + private static Path requireParent(Path path) { + Path parent = path.getParent(); + if (parent == null) { + throw new IllegalArgumentException("output path has no parent: " + path); + } + return parent; + } + + private static void deleteTree(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (var paths = Files.walk(dir)) { + paths.sorted((a, b) -> b.getNameCount() - a.getNameCount()) + .forEach( + p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + System.err.println( + "warn: could not delete " + p + ": " + e.getMessage()); + } + }); + } + } +} diff --git a/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java new file mode 100644 index 0000000..8b111a7 --- /dev/null +++ b/pipeline/app/src/test/java/io/github/p4suta/pipeline/tools/SampleScanGenerator.java @@ -0,0 +1,135 @@ +package io.github.p4suta.pipeline.tools; + +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Random; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; + +/** + * Generates the synthetic, copyright-free bitonal "scan" book the {@code benchPipeline} harness + * converts: A5 pages at the requested dpi carrying vertical text-like columns with per-page + * position jitter, a small per-page skew of up to ±0.5° (so the register stage's deskew has real + * work) and salt-and-pepper specks (so despeckle has real work), embedded as CCITT G4 so {@code + * pdfimages} extracts them exactly like a real scan. A fixed seed keeps every generation + * byte-identical, so benchmark runs stay comparable across machines and branches. + * + *

This deliberately lives in test sources (driven by the {@code createSampleScan} Gradle task), + * mirroring register's {@code SamplePdfGenerator}: the dev tool never ships in the production + * launcher. An existing output is reused, so repeated benchmark runs skip the generation cost. + * + *

Usage: {@code SampleScanGenerator [pages] [dpi]} + */ +public final class SampleScanGenerator { + + private SampleScanGenerator() {} + + /** {@code SampleScanGenerator [pages] [dpi]} — writes the synthetic scan book. */ + public static void main(String[] args) throws IOException { + Path out = Path.of(args.length > 0 ? args[0] : "sample-scan.pdf"); + int pages = args.length > 1 ? Integer.parseInt(args[1]) : 200; + int dpi = args.length > 2 ? Integer.parseInt(args[2]) : 600; + if (Files.isRegularFile(out)) { + System.out.println("reusing existing " + out + " (delete it to regenerate)"); + return; + } + long start = System.nanoTime(); + write(out, pages, dpi); + System.out.printf( + Locale.ROOT, + "wrote %s: %d page(s) at %d dpi, %.1f MiB in %.1fs%n", + out, + pages, + dpi, + Files.size(out) / (1024.0 * 1024.0), + (System.nanoTime() - start) / 1e9); + } + + /** Writes a {@code pages}-page synthetic bitonal scan book to {@code out} (A5 geometry). */ + public static void write(Path out, int pages, int dpi) throws IOException { + int width = Math.round(148f * dpi / 25.4f); // A5 portrait: 148 mm × 210 mm + int height = Math.round(210f * dpi / 25.4f); + Random random = new Random(42); + Path parent = out.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + try (PDDocument doc = new PDDocument()) { + float widthPt = width * 72f / dpi; + float heightPt = height * 72f / dpi; + for (int i = 0; i < pages; i++) { + PDImageXObject image = + CCITTFactory.createFromImage(doc, page(width, height, random)); + PDPage page = new PDPage(new PDRectangle(widthPt, heightPt)); + doc.addPage(page); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, widthPt, heightPt); + } + } + doc.save(out.toFile()); + } + } + + /** One page: slightly skewed text-like columns plus unrotated scanner-dust specks. */ + private static BufferedImage page(int width, int height, Random random) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + double skew = Math.toRadians(random.nextDouble() - 0.5); // ±0.5° + g.rotate(skew, width / 2.0, height / 2.0); + drawColumns(g, width, height, random); + g.rotate(-skew, width / 2.0, height / 2.0); + drawSpecks(g, width, height, random); + } finally { + g.dispose(); + } + return img; + } + + /** + * Vertical "text" columns right-to-left (Japanese book layout): stacked glyph-sized blocks with + * per-page jitter so registration has a real column position to detect and correct, and random + * early line breaks so the texture resembles prose rather than a solid block. + */ + private static void drawColumns(Graphics2D g, int width, int height, Random random) { + int margin = width / 10; + int glyph = Math.max(4, width / 60); + int leading = glyph / 2; + int jitterX = random.nextInt(glyph + 1) - glyph / 2; + int top = height / 12 + random.nextInt(glyph + 1); + int bottom = height - height / 12; + for (int x = width - margin - glyph + jitterX; x >= margin; x -= glyph + leading) { + int y = top; + while (y + glyph <= bottom) { + // ~8% of glyph slots end the "sentence" early, leaving prose-like white runs. + if (random.nextInt(100) < 8) { + y += glyph * (2 + random.nextInt(4)); + continue; + } + g.fillRect(x, y, glyph - 2, glyph - 2); + y += glyph; + } + } + } + + /** Salt-and-pepper dust: ~1 speck of 1–3 px per 25k pixels, what despeckle exists to remove. */ + private static void drawSpecks(Graphics2D g, int width, int height, Random random) { + int specks = width * height / 25_000; + for (int i = 0; i < specks; i++) { + int size = 1 + random.nextInt(3); + g.fillRect(random.nextInt(width - size), random.nextInt(height - size), size, size); + } + } +} diff --git a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java index 0810505..8ae99b8 100644 --- a/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java +++ b/pipeline/application/src/main/java/io/github/p4suta/pipeline/application/PipelineRunner.java @@ -70,7 +70,11 @@ public void run( progress.emit(new ProgressEvent.StageStarted(source.name(), position, total)); Corpus corpus = source.open(stageDir(work, 0, source.name())); - log.info("source: {} page(s) at {} dpi", corpus.pageCount(), corpus.dpi()); + log.info( + "source: {} page(s) at {} dpi, {}", + corpus.pageCount(), + corpus.dpi(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(source.name())); position++; @@ -78,7 +82,12 @@ public void run( for (Stage stage : stages) { progress.emit(new ProgressEvent.StageStarted(stage.name(), position, total)); corpus = stage.apply(corpus, stageDir(work, dirIndex, stage.name())); - log.info("stage {} ({}): {} page(s)", dirIndex, stage.name(), corpus.pageCount()); + log.info( + "stage {} ({}): {} page(s), {}", + dirIndex, + stage.name(), + corpus.pageCount(), + intermediatesSize(corpus.dir())); progress.emit(new ProgressEvent.StageCompleted(stage.name())); position++; dirIndex++; @@ -111,6 +120,28 @@ private static Path stageDir(Path work, int index, String name) throws IOExcepti work.resolve(String.format(Locale.ROOT, "%02d-%s", index, name))); } + /** + * The stage directory's total file bytes rendered as MiB — visibility into how much + * intermediate I/O each stage produces (best-effort: {@code ?} when the walk fails). + */ + private static String intermediatesSize(Path dir) { + try (Stream files = Files.walk(dir)) { + long bytes = + files.filter(Files::isRegularFile).mapToLong(PipelineRunner::sizeQuietly).sum(); + return String.format(Locale.ROOT, "%.1f MiB", bytes / (1024.0 * 1024.0)); + } catch (IOException e) { + return "? MiB"; + } + } + + private static long sizeQuietly(Path file) { + try { + return Files.size(file); + } catch (IOException e) { + return 0L; + } + } + private static void deleteRecursively(Path dir) { try (Stream walk = Files.walk(dir)) { walk.sorted(Comparator.reverseOrder()).forEach(PipelineRunner::deleteQuietly); diff --git a/pipeline/docs/perf-baseline.md b/pipeline/docs/perf-baseline.md new file mode 100644 index 0000000..9727575 --- /dev/null +++ b/pipeline/docs/perf-baseline.md @@ -0,0 +1,35 @@ +# pdfbook runtime baseline (stage-level) + +Generated by `PipelineBenchmark` (`./gradlew :pipeline:app:benchPipeline`, in the dev container). +Tracks the **per-stage wall-clock breakdown, end-to-end runtime and peak memory** +of the installDist `pdfbook` launcher. Re-run after any change to the pipeline +and compare against the previous run before merging (acceptance: ≥5% median +total-wall improvement, or an explicit RSS/disk win, with output validated). + +- Date (UTC): 2026-06-10 04:46:07 +- Host: Linux 6.8.0-124-generic amd64, 8 CPUs, RAM 16Gi +- Launcher: `pipeline/app/build/install/pdfbook/bin/pdfbook` +- Samples per measurement: cold (1st run) + warm median of 3. +- The default input is the deterministic synthetic fixture (`createSampleScan`, + seeded, so identical across machines). Real books are pluggable via `-Pinputs="…"`; + only their page count and byte size are reported. + +## Stage breakdown (warm median of 3 runs) + +`conv` is the launcher's in-process total (`timing: total`); `startup+init` = E2E wall − conv +(JVM boot + first-touch PDFBox/AWT init). `jobs=auto` is the launcher's CPU-count default. + +| Input | Jobs | Pages | E2E wall | conv | extract | despeckle | register | spread | startup+init | Cold wall | Peak RSS (MiB) | Output (MiB) | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 200 | 50.10s | 49.85s | 4.57s | 32.16s | 12.94s | 0.20s | 0.25s | 51.42s | 156 | 6.4 | +| sample-scan-200p.pdf | 8 | 200 | 14.77s | 14.48s | 1.15s | 9.85s | 3.27s | 0.21s | 0.29s | 15.04s | 328 | 6.4 | + +## Stage shares (of conv, warm median) + +The shares that decide where optimization effort goes: a stage that is ~5% of conv +cannot pay for a parallelization rewrite no matter how elegant. + +| Input | Jobs | extract | despeckle | register | spread | +|---|---|---:|---:|---:|---:| +| sample-scan-200p.pdf | 1 | 9.2% | 64.5% | 26.0% | 0.4% | +| sample-scan-200p.pdf | 8 | 7.9% | 68.0% | 22.6% | 1.5% | diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java index bcd1008..4fea0dc 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStage.java @@ -12,17 +12,17 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; /** * The G4-normalization {@link Stage}: re-encodes each extracted page as single-strip CCITT G4 TIFF - * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. {@code - * pdfimages -tiff} writes poppler's default (non-G4) TIFF compression, so the raw extract output - * cannot be embedded directly; despeckle and register each re-encode their output as G4 themselves, - * so the composition root inserts this stage only when neither of them runs. The corpus dpi is - * stamped on every page, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * via Leptonica, which {@link SpreadPackSink}'s pass-through CCITT embedding requires. The + * extractor's decoded mode ({@code pdfimages -tiff}, used for any source that is not all-CCITT) + * writes poppler's default (non-G4) TIFF compression at a default 72 dpi, so that output cannot be + * embedded directly; despeckle and register each re-encode their output as G4 themselves, so the + * composition root inserts this stage only when neither of them runs. The corpus dpi is stamped on + * every page. (For an all-CCITT source the extractor's remux already produces stamped single-strip + * G4 — this stage then re-encodes losslessly, a small constant cost that keeps the no-stage path + * uniform.) */ public final class G4EncodeStage implements Stage { @@ -54,29 +54,26 @@ public String name() { @Override public Corpus apply(Corpus input, Path workDir) throws IOException { List pages = CorpusFiles.collect(input.dir(), input.glob()); - AtomicInteger done = new AtomicInteger(); - ExecutorService pool = Executors.newFixedThreadPool(jobs); - try { - List> tasks = new ArrayList<>(pages.size()); - for (Path src : pages) { - tasks.add( - () -> { - Path out = - CorpusFiles.mirrorDestination(src, input.dir(), workDir, "tif"); - try (Pix page = Pix.read(src)) { - page.setResolution(input.dpi()); - page.writeTiffG4(out); - } - progress.emit( - new ProgressEvent.PageProcessed( - name(), done.incrementAndGet(), pages.size())); - return null; - }); - } - Tasks.awaitAll(pool, tasks, "G4 encode interrupted", "G4 encode failed"); - } finally { - pool.shutdown(); + List> tasks = new ArrayList<>(pages.size()); + for (Path src : pages) { + tasks.add( + () -> { + Path out = CorpusFiles.mirrorDestination(src, input.dir(), workDir, "tif"); + try (Pix page = Pix.read(src)) { + page.setResolution(input.dpi()); + page.writeTiffG4(out); + } + return null; + }); } + // Platform workers: each task is a Leptonica decode + G4 encode (CPU-bound FFM downcalls, + // which would pin virtual threads' carriers). Progress arrives on this thread, ordered. + Tasks.awaitAll( + Tasks.Workers.platform(jobs), + tasks, + "G4 encode", + (done, total) -> + progress.emit(new ProgressEvent.PageProcessed(name(), done, total))); return input.movedTo(workDir, OUTPUT_GLOB); } } diff --git a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java index 1a11456..c40694d 100644 --- a/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java +++ b/pipeline/infrastructure/src/main/java/io/github/p4suta/pipeline/infrastructure/PdfExtractSource.java @@ -7,8 +7,6 @@ import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; /** * The pipeline {@link Source}: extracts a scan PDF's bitonal pages once, via the shared {@code @@ -45,12 +43,7 @@ public String name() { @Override public Corpus open(Path workDir) throws IOException { int dpi = extractor.dominantDpi(sourcePdf); - ExecutorService pool = Executors.newFixedThreadPool(jobs); - try { - extractor.extract(sourcePdf, workDir, jobs, pool); - } finally { - pool.shutdown(); - } + extractor.extract(sourcePdf, workDir, jobs); return new Corpus(workDir, EXTRACT_GLOB, dpi, count(workDir, EXTRACT_GLOB)); } diff --git a/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java b/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java index 4ee3222..16a149c 100644 --- a/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java +++ b/pipeline/infrastructure/src/test/java/io/github/p4suta/pipeline/infrastructure/G4EncodeStageTest.java @@ -1,7 +1,7 @@ package io.github.p4suta.pipeline.infrastructure; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatIOException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import io.github.p4suta.pipeline.domain.Corpus; import io.github.p4suta.shared.imaging.Pix; @@ -79,15 +79,17 @@ void reEncodesUncompressedTiffsAsCcittG4() throws Exception { } @Test - void surfacesAnUnreadablePageAsIoException() throws Exception { + void surfacesAnUnreadablePageWithItsOriginalException() throws Exception { Path inputDir = Files.createDirectories(tmp.resolve("in")); Path workDir = Files.createDirectories(tmp.resolve("out")); Files.writeString(inputDir.resolve("page-000.tif"), "not a tiff"); Corpus input = new Corpus(inputDir, "*.tif", DPI, 1); - assertThatIOException() - .isThrownBy(() -> new G4EncodeStage(2).apply(input, workDir)) - .withMessage("G4 encode failed"); + // The fan-out preserves the failure's identity (no generic IOException wrapper), so the + // shared exception mapper still sees the original type and the message names the page. + assertThatThrownBy(() -> new G4EncodeStage(2).apply(input, workDir)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("could not read image"); } /** diff --git a/register/application/build.gradle.kts b/register/application/build.gradle.kts index 631d833..11a5a77 100644 --- a/register/application/build.gradle.kts +++ b/register/application/build.gradle.kts @@ -25,6 +25,10 @@ dependencies { // caller-supplied extension). Register passes its own glob/OutputFormat.extension() so this layer // imports neither the matcher nor the OutputFormat enum. implementation(project(":shared:io")) + // Tasks.awaitAll(Workers...): the shared fail-fast page fan-out (batch-owned executor, + // sibling interruption, quiescence before the failure propagates) both registration passes + // run their per-page work on. + implementation(project(":shared:process")) implementation(libs.slf4j.api) } diff --git a/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java b/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java index 0a826b4..6672142 100644 --- a/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java +++ b/register/application/src/main/java/io/github/p4suta/register/application/PdfPipelineService.java @@ -11,8 +11,6 @@ import java.nio.file.Path; import java.util.Comparator; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -80,40 +78,37 @@ public void run(Config config) throws IOException { Path extracted = Files.createDirectories(work.resolve("in")); Path registered = Files.createDirectories(work.resolve("reg")); Path jbig2Dir = Files.createDirectories(work.resolve("jb2")); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); - try { - int dpi = - config.options().dpi().isPresent() - ? config.options().dpi().getAsInt() - : extractor.dominantDpi(config.inputPdf()); - LOG.info( - "pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); + int dpi = + config.options().dpi().isPresent() + ? config.options().dpi().getAsInt() + : extractor.dominantDpi(config.inputPdf()); + LOG.info("pipeline: {} -> {} at {} dpi", config.inputPdf(), config.outputPdf(), dpi); - extractor.extract(config.inputPdf(), extracted, config.jobs(), pool); + // Each step fans out on its own batch-owned workers bounded by the same jobs budget, + // so the steps never hold idle threads for each other (the old shared outer pool sat + // idle through the whole registration step). + extractor.extract(config.inputPdf(), extracted, config.jobs()); - RegistrationService.Config registration = - new RegistrationService.Config( - extracted, - registered, - OutputFormat.TIFF, - "*.tif", - config.jobs(), - true, - withDpi(config.options(), dpi), - null, - false); - registrationService.run(registration); + RegistrationService.Config registration = + new RegistrationService.Config( + extracted, + registered, + OutputFormat.TIFF, + "*.tif", + config.jobs(), + true, + withDpi(config.options(), dpi), + null, + false); + registrationService.run(registration); - assembler.assemble( - registered, - config.outputPdf(), - config.inputPdf(), - OptionalInt.of(dpi), - pool, - jbig2Dir); - } finally { - pool.shutdown(); - } + assembler.assemble( + registered, + config.outputPdf(), + config.inputPdf(), + OptionalInt.of(dpi), + config.jobs(), + jbig2Dir); LOG.info("wrote {}", config.outputPdf()); } finally { deleteRecursively(work); diff --git a/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java b/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java index a6210c3..378a6a1 100644 --- a/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java +++ b/register/application/src/main/java/io/github/p4suta/register/application/RegistrationService.java @@ -17,6 +17,7 @@ import io.github.p4suta.shared.io.OutputDirs; import io.github.p4suta.shared.kernel.Medians; import io.github.p4suta.shared.kernel.PageProgressListener; +import io.github.p4suta.shared.process.Tasks; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -26,11 +27,6 @@ import java.util.Locale; import java.util.OptionalInt; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -109,8 +105,8 @@ public Summary run(Config requested) throws IOException { * * @param requested run configuration (its empty {@code --dpi} is resolved from the inputs * before rendering) - * @param progress called once per finished page in each pass (1-based count over {@code 2N}); - * invoked from the worker threads, so it must be thread-safe + * @param progress called once per finished page in each pass (1-based count over {@code 2N}), + * always on the calling thread and in order * @throws IOException on filesystem failure */ public Summary run(Config requested, PageProgressListener progress) throws IOException { @@ -130,7 +126,6 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc // Per-page progress spans both passes (analyze 1..N, render N+1..2N), so each PageProcessed // the composing stage emits carries this 2N denominator. int progressTotal = files.size() * 2; - AtomicInteger pageProgress = new AtomicInteger(); Path diagDir = config.diagDir(); boolean recordDiagnostics = diagDir != null; @@ -140,21 +135,15 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc : Reporter.noOp(); // The analysis pass writes each deskewed page here; the render pass reads it back, so the - // costly deskew and detection run once per page, not again in pass two. Removed at the end. + // costly deskew and detection run once per page, not again in pass two. Removed at the + // end — safely: each pass's fan-out quiesces before returning or throwing, so no worker + // is still writing into the scratch when the finally-delete runs. Path scratchDir = createBeside(config.outputDir(), ".register-deskewed-"); - ExecutorService pool = Executors.newFixedThreadPool(config.jobs()); try { // Pass 1: deskew once, detect, cache the deskewed page for pass two List pages = analyzePass( - files, - scratchDir, - config, - recordDiagnostics, - pool, - progress, - pageProgress, - progressTotal); + files, scratchDir, config, recordDiagnostics, progress, progressTotal); List observations = toObservations(pages); int analyzed = observations.size(); @@ -189,9 +178,7 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc config, recordDiagnostics, reporter, - pool, progress, - pageProgress, progressTotal); if (recordDiagnostics) { @@ -221,7 +208,6 @@ public Summary run(Config requested, PageProgressListener progress) throws IOExc analyzed); return new Summary(files.size(), analyzed); } finally { - pool.shutdown(); deleteRecursively(scratchDir); } } @@ -235,9 +221,7 @@ private List analyzePass( Path scratchDir, Config config, boolean recordDiagnostics, - ExecutorService pool, PageProgressListener progress, - AtomicInteger pageProgress, int progressTotal) throws IOException { List> tasks = new ArrayList<>(files.size()); @@ -246,18 +230,23 @@ private List analyzePass( Path src = files.get(i); Path scratch = scratchDir.resolve(String.format(Locale.ROOT, "%06d.tif", index)); tasks.add( - () -> { - PageAnalysis analysis = - pageRegistrar.analyze( - src, scratch, config.options(), recordDiagnostics); - AnalyzedPage analyzed = - new AnalyzedPage(index, Parity.of(index), src, scratch, analysis); - // The analyze pass occupies the first N of the 2N progress units. - progress.onPage(pageProgress.incrementAndGet(), progressTotal); - return analyzed; - }); + () -> + new AnalyzedPage( + index, + Parity.of(index), + src, + scratch, + pageRegistrar.analyze( + src, scratch, config.options(), recordDiagnostics))); } - return awaitAll(pool, tasks); + // Platform workers: deskew + detection are CPU-bound Leptonica work (FFM downcalls pin + // virtual threads' carriers). The analyze pass occupies the first N of the 2N progress + // units; the callback arrives on this thread, ordered. + return Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "register analyze", + (done, total) -> progress.onPage(done, progressTotal)); } /** The detected main-column boxes from pass 1, one per page that had a detectable column. */ @@ -286,36 +275,27 @@ private List renderPass( Config config, boolean recordDiagnostics, Reporter reporter, - ExecutorService pool, PageProgressListener progress, - AtomicInteger pageProgress, int progressTotal) throws IOException { - AtomicInteger done = new AtomicInteger(); - int total = pages.size(); - List> tasks = new ArrayList<>(total); + List> tasks = new ArrayList<>(pages.size()); for (AnalyzedPage page : pages) { tasks.add( - () -> { - Path dest = - renderOne( - page, - reference, - canvas, - config, - recordDiagnostics, - reporter); - int n = done.incrementAndGet(); - // The render pass occupies the second N of the 2N progress units; the local - // count drives the human log at its own N denominator. - progress.onPage(pageProgress.incrementAndGet(), progressTotal); - if (n % PROGRESS_EVERY == 0 || n == total) { - LOG.info("{}/{}", n, total); - } - return dest; - }); + () -> renderOne(page, reference, canvas, config, recordDiagnostics, reporter)); } - return awaitAll(pool, tasks); + // Platform workers: placement is CPU-bound Leptonica work. The render pass occupies the + // second N of the 2N progress units; the local count drives the human log at its own N + // denominator. Both arrive on this thread, ordered. + return Tasks.awaitAll( + Tasks.Workers.platform(config.jobs()), + tasks, + "register render", + (done, total) -> { + progress.onPage(pages.size() + done, progressTotal); + if (done % PROGRESS_EVERY == 0 || done == total) { + LOG.info("{}/{}", done, total); + } + }); } /** @@ -431,38 +411,6 @@ private Path renderOne( return dest; } - /** - * Run every task on {@code pool} in submission order, surfacing the first failure: a task's own - * {@link IOException} is re-thrown unchanged, any other failure is wrapped, and an interruption - * restores the interrupt flag. - */ - private static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("register run interrupted", e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { - try { - results.add(future.get()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("register run interrupted", e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - if (cause instanceof IOException io) { - throw io; - } - throw new IOException("page processing failed", cause); - } - } - return results; - } - /** * Create a temporary directory beside {@code sibling} (under its parent), so the scratch space * shares a filesystem with the output it serves. Falls back to the working directory when diff --git a/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java b/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java index 28d7ad4..08f2a4e 100644 --- a/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java +++ b/register/application/src/test/java/io/github/p4suta/register/application/Fakes.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.Optional; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import org.jspecify.annotations.Nullable; @@ -194,8 +193,7 @@ public int dominantDpi(Path pdf) { } @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) - throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { extractCalls.incrementAndGet(); if (failOn != null && pdf.getFileName().toString().contains(failOn)) { throw new IOException("fake extract failed for " + pdf); @@ -220,7 +218,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { lastForcedDpi = forcedDpi; diff --git a/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java b/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java index cb028db..5b16165 100644 --- a/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java +++ b/register/application/src/test/java/io/github/p4suta/register/application/RegistrationServiceTest.java @@ -181,11 +181,12 @@ void writesDiagnosticsWhenADiagDirIsGiven(@TempDir Path tmp) throws IOException } @Test - void propagatesAWorkerFailureAsAnIoException(@TempDir Path tmp) throws IOException { + void propagatesAWorkerFailureWithItsIdentity(@TempDir Path tmp) throws IOException { Path in = Files.createDirectories(tmp.resolve("in")); Path out = tmp.resolve("out"); writePages(in, 2); - // A registrar whose analyze pass throws: the service must surface it, not swallow it. + // A registrar whose analyze pass throws: the service must surface the original exception + // unchanged (the fan-out preserves identity, so domain kinds reach the exception mapper). PageRegistrar failing = new FakePageRegistrar(true, 600, 1000, 1500) { @Override @@ -201,7 +202,7 @@ public PageAnalysis analyze( new RegistrationService(failing, new RecordingReporterFactory()); assertThrows( - IOException.class, + IllegalStateException.class, () -> service.run( config( diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java index a5a1411..85224b2 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfBoxJbig2Assembler.java @@ -4,7 +4,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -40,9 +39,9 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { - delegate.assemble(imageDir, outPdf, source, forcedDpi, pool, scratchDir); + delegate.assemble(imageDir, outPdf, source, forcedDpi, jobs, scratchDir); } } diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java index 231229a..07c55e5 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/pdf/PdfImagesCliExtractor.java @@ -3,7 +3,6 @@ import io.github.p4suta.register.port.PdfImageExtractor; import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Register's {@link PdfImageExtractor} adapter: a thin binding onto the cross-app {@link @@ -43,10 +42,10 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks. at most {@code jobs} chunks run at once. */ @Override - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { - delegate.extract(pdf, outDir, jobs, pool); + public void extract(Path pdf, Path outDir, int jobs) throws IOException { + delegate.extract(pdf, outDir, jobs); } } diff --git a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java index c254061..347a21b 100644 --- a/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java +++ b/register/infrastructure/src/main/java/io/github/p4suta/register/infrastructure/process/NativeTools.java @@ -3,7 +3,6 @@ import io.github.p4suta.register.domain.exception.RegisterErrorKind; import io.github.p4suta.register.domain.exception.RegisterException; import io.github.p4suta.shared.process.ProcessRunner; -import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; import java.io.InputStream; @@ -11,8 +10,6 @@ import java.time.Duration; import java.util.List; import java.util.Objects; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -22,12 +19,11 @@ * io.github.p4suta.shared.process} plumbing: discovery is delegated to {@link * io.github.p4suta.shared.process.ToolPath} (passing register's per-tool {@code * -Dregister..path} override key — this is how the packaged app-image points at its bundled - * binaries, else the tool is looked up on {@code PATH}), the discard-output run is delegated to - * {@link io.github.p4suta.shared.process.ProcessRunner}, and the worker-pool fan-out to {@link - * io.github.p4suta.shared.process.Tasks}. Unlike the optional flip-book, a missing pipeline tool is - * fatal — the pipeline cannot proceed — so resolution throws, and the shared layer's neutral {@code - * IOException}/{@code TimeoutException} failures are re-mapped to register's {@code - * RegisterException} (NATIVE_TOOL_FAILED) so the pipeline keeps its own error wording. + * binaries, else the tool is looked up on {@code PATH}), and the discard-output run is delegated to + * {@link io.github.p4suta.shared.process.ProcessRunner}. Unlike the optional flip-book, a missing + * pipeline tool is fatal — the pipeline cannot proceed — so resolution throws, and the shared + * layer's neutral {@code IOException}/{@code TimeoutException} failures are re-mapped to register's + * {@code RegisterException} (NATIVE_TOOL_FAILED) so the pipeline keeps its own error wording. */ public final class NativeTools { @@ -128,15 +124,4 @@ private static void awaitExit(Process process, List command, long timeou null); } } - - /** - * Run every task on {@code pool}, in submission order, surfacing the first failure as an {@link - * IOException}. Thin wrapper over the shared {@link - * io.github.p4suta.shared.process.Tasks#awaitAll} that supplies the pipeline's messages, so the - * await semantics live in one place while the pipeline keeps its own error wording. - */ - public static List awaitAll(ExecutorService pool, List> tasks) - throws IOException { - return Tasks.awaitAll(pool, tasks, "pipeline interrupted", "pipeline task failed"); - } } diff --git a/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java b/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java index 466e95a..95902d0 100644 --- a/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java +++ b/register/port/src/main/java/io/github/p4suta/register/port/Jbig2Assembler.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; import org.jspecify.annotations.Nullable; /** @@ -28,7 +27,7 @@ void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException; } diff --git a/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java b/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java index 7114285..23089b4 100644 --- a/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java +++ b/register/port/src/main/java/io/github/p4suta/register/port/PdfImageExtractor.java @@ -2,7 +2,6 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.concurrent.ExecutorService; /** * Extracts a scanned PDF's embedded bitonal images as TIFFs. The abstraction over {@code pdfimages} @@ -20,10 +19,10 @@ public interface PdfImageExtractor { /** * Extract every page of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks across the worker pool. + * chunks, at most {@code jobs} at once. * - * @param jobs the worker thread count (bounds the chunk count and pool slots used) + * @param jobs how many chunks may extract at once * @throws IOException if the external tool fails */ - void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException; + void extract(Path pdf, Path outDir, int jobs) throws IOException; } diff --git a/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Leptonica.java b/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Leptonica.java index 9cf9945..1ca223d 100644 --- a/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Leptonica.java +++ b/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Leptonica.java @@ -296,6 +296,16 @@ private static MethodHandle handle(String name, FunctionDescriptor descriptor) { handle( "pixOpenBrick", FunctionDescriptor.of(ADDRESS, ADDRESS, ADDRESS, JAVA_INT, JAVA_INT)); + private static final MethodHandle PIX_DILATE_BRICK_DWA = + handle( + "pixDilateBrickDwa", + FunctionDescriptor.of(ADDRESS, ADDRESS, ADDRESS, JAVA_INT, JAVA_INT)); + private static final MethodHandle PIX_OPEN_BRICK_DWA = + handle( + "pixOpenBrickDwa", + FunctionDescriptor.of(ADDRESS, ADDRESS, ADDRESS, JAVA_INT, JAVA_INT)); + private static final MethodHandle MAKE_PIXEL_SUM_TAB8 = + handle("makePixelSumTab8", FunctionDescriptor.of(ADDRESS)); private static final MethodHandle PIX_AND = handle("pixAnd", FunctionDescriptor.of(ADDRESS, ADDRESS, ADDRESS, ADDRESS)); private static final MethodHandle PIX_OR = @@ -316,16 +326,38 @@ private static MethodHandle handle(String name, FunctionDescriptor descriptor) { private static final MethodHandle SET_MSG_SEVERITY = handle("setMsgSeverity", FunctionDescriptor.of(JAVA_INT, JAVA_INT)); + /** + * Largest brick side routed to a single DWA kernel call. Leptonica generates DWA sels only for + * a fixed set of linear sizes; measured against this build's 1.82, {@code pixDilateBrickDwa} + * silently diverges from the generic brick for the missing sizes (every prime above 15, e.g. + * 17, 43), while sizes up to 15 are complete. Larger dilations are composed from safe passes in + * {@code Pix}; the equality sweep in {@code PixTest} pins all of this empirically. + */ + static final int DWA_SAFE_BRICK = 15; + + /** + * The process-lifetime 8-bit popcount table {@code pixCountPixels} consumes (~1 KiB native, + * deliberately never freed): without it Leptonica mallocs, fills and frees the same table on + * every call. + */ + private static final MemorySegment PIXEL_SUM_TAB8; + static { // Suppress Leptonica's stderr diagnostics once, at class load. The returned previous // severity is discarded. try { SET_MSG_SEVERITY.invoke(L_SEVERITY_NONE); + PIXEL_SUM_TAB8 = (MemorySegment) MAKE_PIXEL_SUM_TAB8.invoke(); } catch (Throwable t) { - throw sneaky("setMsgSeverity", t); + throw sneaky("leptonica static init", t); } } + /** {@return the shared popcount table for {@code pixCountPixels}} */ + static MemorySegment pixelSumTab8() { + return PIXEL_SUM_TAB8; + } + // pix lifecycle / metadata /** Read an image file, returning the raw {@code PIX *} (0 on failure). */ @@ -613,6 +645,34 @@ static MemorySegment pixOpenBrick(MemorySegment src, int hsize, int vsize) { } } + /** + * Dilate {@code src} by a {@code hsize x vsize} brick via the word-accelerated DWA kernels into + * a fresh {@code PIX}. Only exact for sizes up to {@link #DWA_SAFE_BRICK} — see that constant; + * {@code Pix} owns the larger-size composition. + */ + static MemorySegment pixDilateBrickDwa(MemorySegment src, int hsize, int vsize) { + try { + return (MemorySegment) + PIX_DILATE_BRICK_DWA.invoke(MemorySegment.NULL, src, hsize, vsize); + } catch (Throwable t) { + throw sneaky("pixDilateBrickDwa", t); + } + } + + /** + * Open (erode then dilate) {@code src} by a {@code hsize x vsize} brick via the + * word-accelerated DWA kernels into a fresh {@code PIX}. Routed only for sizes up to {@link + * #DWA_SAFE_BRICK}; pixel-identical there to {@link #pixOpenBrick} (pinned by {@code PixTest}'s + * sweep) and several times faster. + */ + static MemorySegment pixOpenBrickDwa(MemorySegment src, int hsize, int vsize) { + try { + return (MemorySegment) PIX_OPEN_BRICK_DWA.invoke(MemorySegment.NULL, src, hsize, vsize); + } catch (Throwable t) { + throw sneaky("pixOpenBrickDwa", t); + } + } + /** {@code s1 AND s2} into a fresh {@code PIX} (the {@code pixd == NULL} path). */ static MemorySegment pixAnd(MemorySegment s1, MemorySegment s2) { try { diff --git a/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Pix.java b/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Pix.java index 036887b..659c1cb 100644 --- a/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Pix.java +++ b/shared/imaging/src/main/java/io/github/p4suta/shared/imaging/Pix.java @@ -312,7 +312,8 @@ public long blackPixels() { MemorySegment h = requireHandle(); try (Arena arena = Arena.ofConfined()) { MemorySegment count = arena.allocate(JAVA_INT); - Leptonica.pixCountPixels(h, count, MemorySegment.NULL); + // The shared popcount table: without it Leptonica rebuilds (and frees) one per call. + Leptonica.pixCountPixels(h, count, Leptonica.pixelSumTab8()); return Integer.toUnsignedLong(count.get(JAVA_INT, 0)); } } @@ -352,8 +353,41 @@ public Pix subtract(Pix other) { /** * Return a new {@code Pix} grown by {@code radius} pixels in every direction (dilation by a * {@code (2*radius+1)} square). A {@code radius} of 0 is the identity. + * + *

Runs on Leptonica's word-accelerated DWA kernels for every size: bricks up to {@link + * Leptonica#DWA_SAFE_BRICK} directly, larger ones as a chain of safe-size passes — exact, + * because dilating by {@code brick(a)} then {@code brick(b)} equals dilating by {@code + * brick(a+b-1)} (Minkowski sum; within the image rectangle the L∞ paths between in-bounds + * points stay in bounds, so per-pass clipping changes nothing). Pixel-identity against the + * generic rasterop path is pinned by {@code PixTest}'s full sweep. */ public Pix dilated(int radius) { + int size = 2 * radius + 1; + if (size <= Leptonica.DWA_SAFE_BRICK) { + return wrap( + Leptonica.pixDilateBrickDwa(requireHandle(), size, size), "pixDilateBrickDwa"); + } + int covered = Leptonica.DWA_SAFE_BRICK; + Pix current = + wrap( + Leptonica.pixDilateBrickDwa(requireHandle(), covered, covered), + "pixDilateBrickDwa"); + while (covered < size) { + // size and covered stay odd, so the step is odd and within the safe sel set. + int step = Math.min(Leptonica.DWA_SAFE_BRICK, size - covered + 1); + Pix next = + wrap( + Leptonica.pixDilateBrickDwa(current.requireHandle(), step, step), + "pixDilateBrickDwa"); + current.close(); + current = next; + covered += step - 1; + } + return current; + } + + /** The generic rasterop dilation — kept as the DWA equality oracle for {@code PixTest}. */ + Pix dilatedGeneric(int radius) { int size = 2 * radius + 1; return wrap(Leptonica.pixDilateBrick(requireHandle(), size, size), "pixDilateBrick"); } @@ -361,8 +395,22 @@ public Pix dilated(int radius) { /** * Return a new {@code Pix} opened (eroded then dilated) by a {@code (2*radius+1)} square — i.e. * foreground thinner than the brick in either axis is erased, leaving only the solid parts. + * + *

Bricks up to {@link Leptonica#DWA_SAFE_BRICK} run on Leptonica's word-accelerated DWA + * kernels — pixel-identical to the generic rasterop path (pinned by {@code PixTest}'s sweep) + * and several times faster; larger bricks fall back to the generic path (an opening, unlike a + * dilation, does not compose from smaller passes). */ public Pix opened(int radius) { + int size = 2 * radius + 1; + if (size <= Leptonica.DWA_SAFE_BRICK) { + return wrap(Leptonica.pixOpenBrickDwa(requireHandle(), size, size), "pixOpenBrickDwa"); + } + return openedGeneric(radius); + } + + /** The generic rasterop opening — the large-brick fallback and the DWA equality oracle. */ + Pix openedGeneric(int radius) { int size = 2 * radius + 1; return wrap(Leptonica.pixOpenBrick(requireHandle(), size, size), "pixOpenBrick"); } diff --git a/shared/imaging/src/test/java/io/github/p4suta/shared/imaging/PixTest.java b/shared/imaging/src/test/java/io/github/p4suta/shared/imaging/PixTest.java index 3baa473..09cb681 100644 --- a/shared/imaging/src/test/java/io/github/p4suta/shared/imaging/PixTest.java +++ b/shared/imaging/src/test/java/io/github/p4suta/shared/imaging/PixTest.java @@ -295,4 +295,83 @@ void useAfterCloseThrows(@TempDir Path dir) throws Exception { pix.close(); assertThrows(IllegalStateException.class, pix::width); } + + // DWA vs generic morphology: the empirical gate for the fast path + + /** + * The load-bearing equality sweep: {@code dilated} runs on the DWA fast path for every size + * (single safe kernel up to 15, composed safe passes beyond — Leptonica's generated DWA sels + * are incomplete above 15 and silently diverge there, so the composition is the only exact + * route), and {@code opened} up to 15. Both must be pixel-identical to the generic rasterop + * morphology — including at the image borders, where DWA's internal bordering is the classic + * divergence trap. The fixture has ink touching all four borders, interior glyphs and isolated + * dots; the radii sweep covers the production sizes (7x7 open, 43x43 dilate) and the 63x63 + * ceiling. A failure here means the fast path may not ship. + */ + @Test + void dwaMorphologyMatchesGenericBrickIncludingBorders(@TempDir Path dir) throws Exception { + Path pbm = dir.resolve("border-ink.pbm"); + boolean[][] img = TestImages.blank(200, 150); + // Ink on all four borders: full-width top/bottom lines, full-height left/right lines. + TestImages.fillRect(img, 0, 0, 199, 1); + TestImages.fillRect(img, 0, 148, 199, 149); + TestImages.fillRect(img, 0, 0, 1, 149); + TestImages.fillRect(img, 198, 0, 199, 149); + // Corner blocks, interior glyphs, isolated dots. + TestImages.fillRect(img, 0, 0, 8, 8); + TestImages.fillRect(img, 190, 140, 199, 149); + TestImages.fillRect(img, 40, 30, 80, 90); + TestImages.fillRect(img, 120, 50, 150, 60); + TestImages.fillRect(img, 100, 110, 100, 110); + TestImages.fillRect(img, 20, 130, 20, 130); + TestImages.writePbm(pbm, img); + + int[] radii = {0, 1, 3, 7, 10, 15, 21, 31}; + try (Pix page = Pix.read(pbm)) { + for (int radius : radii) { + try (Pix dwa = page.dilated(radius); + Pix generic = page.dilatedGeneric(radius)) { + assertTrue( + dwa.pixelsEqual(generic), + "dilate radius " + radius + " must be pixel-identical"); + } + try (Pix dwa = page.opened(radius); + Pix generic = page.openedGeneric(radius)) { + assertTrue( + dwa.pixelsEqual(generic), + "open radius " + radius + " must be pixel-identical"); + } + } + } + } + + /** The sweep on degenerate pages: smaller than DWA's border, all-black, and all-white. */ + @Test + void dwaMorphologyMatchesGenericOnDegeneratePages(@TempDir Path dir) throws Exception { + boolean[][] tiny = TestImages.blank(20, 20); + TestImages.fillRect(tiny, 0, 0, 19, 2); + TestImages.fillRect(tiny, 17, 0, 19, 19); + TestImages.fillRect(tiny, 9, 9, 9, 9); + boolean[][] black = TestImages.blank(50, 40); + TestImages.fillRect(black, 0, 0, 49, 39); + boolean[][] white = TestImages.blank(50, 40); + + int page = 0; + for (boolean[][] img : java.util.List.of(tiny, black, white)) { + Path pbm = dir.resolve("degenerate-" + page++ + ".pbm"); + TestImages.writePbm(pbm, img); + try (Pix pix = Pix.read(pbm)) { + for (int radius : new int[] {1, 3, 21}) { + try (Pix dwa = pix.dilated(radius); + Pix generic = pix.dilatedGeneric(radius)) { + assertTrue(dwa.pixelsEqual(generic), "dilate r=" + radius); + } + try (Pix dwa = pix.opened(radius); + Pix generic = pix.openedGeneric(radius)) { + assertTrue(dwa.pixelsEqual(generic), "open r=" + radius); + } + } + } + } + } } diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java new file mode 100644 index 0000000..1bfd7c3 --- /dev/null +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/CcittTiffs.java @@ -0,0 +1,159 @@ +package io.github.p4suta.shared.pdf; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import org.jspecify.annotations.Nullable; + +/** + * Wraps the raw CCITT G4 stream {@code pdfimages -ccitt} dumps into a single-strip TIFF — the + * pass-through half of the extractor's remux mode: the scan's embedded G4 bytes become a readable + * TIFF without ever being decoded and re-encoded. + * + *

poppler writes a fax2tiff-style {@code .params} file beside each dump ({@code -4} G4 / {@code + * -1 -2} G3, {@code -A} EOL markers / {@code -P} none, {@code -X }, {@code -W} BlackIs1 / + * {@code -B} not, {@code -M} MSB-first). Only the plain shape TIFF's T.6 compression can represent + * verbatim is {@linkplain #supported supported}: G4, no EOL markers, MSB-first. Crucially, PDF's + * {@code EncodedByteAlign} never reaches the params file, so a wrapped stream is only trusted after + * the caller decodes it back successfully (see the extractor's read-back verification). + */ +final class CcittTiffs { + + private static final short TYPE_SHORT = 3; + private static final short TYPE_LONG = 4; + private static final short TYPE_RATIONAL = 5; + + private CcittTiffs() {} + + /** + * The decode parameters poppler records beside a {@code .ccitt} dump. + * + * @param kind the coding scheme flag: {@code -4} (G4), {@code -2} (G3 2D) or {@code -1} (G3 1D) + * @param endOfLine whether rows are prefixed with EOL markers ({@code -A}) + * @param columns the row width in pixels ({@code -X}) + * @param blackIs1 whether decoded 1-bits are black ({@code -W}) or 0-bits are ({@code -B}) + */ + record Params(String kind, boolean endOfLine, int columns, boolean blackIs1) {} + + /** Parse a {@code .params} file's text, or {@code null} when any token is unrecognized. */ + static @Nullable Params parseParams(String text) { + @Nullable String kind = null; + @Nullable Boolean endOfLine = null; + @Nullable Integer columns = null; + @Nullable Boolean blackIs1 = null; + boolean msbFirst = false; + String[] tokens = text.trim().split("\\s+", -1); + for (int i = 0; i < tokens.length; i++) { + switch (tokens[i]) { + case "-4", "-2", "-1" -> kind = tokens[i]; + case "-A" -> endOfLine = true; + case "-P" -> endOfLine = false; + case "-W" -> blackIs1 = true; + case "-B" -> blackIs1 = false; + case "-M" -> msbFirst = true; + case "-X" -> { + i++; + if (i >= tokens.length) { + return null; + } + try { + columns = Integer.parseInt(tokens[i]); + } catch (NumberFormatException e) { + return null; + } + } + default -> { + return null; + } + } + } + if (kind == null || endOfLine == null || columns == null || blackIs1 == null || !msbFirst) { + return null; + } + return new Params(kind, endOfLine, columns, blackIs1); + } + + /** + * Whether {@code params} describes a stream TIFF T.6 represents verbatim: Group 4, no EOL + * markers, and a width agreeing with the listing row the dump corresponds to. + */ + static boolean supported(Params params, int expectedWidth) { + return "-4".equals(params.kind()) + && !params.endOfLine() + && params.columns() == expectedWidth; + } + + /** + * Write {@code g4} as a little-endian, single-strip CCITT-G4 TIFF — header, the verbatim stream + * as the one strip, then the IFD. + * + * @param out the TIFF to write + * @param g4 the raw G4 (T.6) stream, verbatim + * @param width the row width in pixels + * @param height the row count + * @param blackIs1 the params' photometric hint: decoded 1-bits are black ({@code -W}) + * @param dpi the resolution to stamp, or {@code <= 0} to omit the resolution tags + */ + static void writeSingleStripG4( + Path out, byte[] g4, int width, int height, boolean blackIs1, int dpi) + throws IOException { + boolean withResolution = dpi > 0; + int entryCount = withResolution ? 14 : 11; + int stripOffset = 8; + int padding = g4.length % 2; // IFD offsets must be word-aligned + int ifdOffset = stripOffset + g4.length + padding; + int rationalOffset = ifdOffset + 2 + entryCount * 12 + 4; + ByteBuffer buf = + ByteBuffer.allocate(rationalOffset + (withResolution ? 16 : 0)) + .order(ByteOrder.LITTLE_ENDIAN); + + buf.put((byte) 'I').put((byte) 'I').putShort((short) 42).putInt(ifdOffset); + buf.put(g4); + if (padding == 1) { + buf.put((byte) 0); + } + + buf.putShort((short) entryCount); // entries below stay sorted by tag id + entry(buf, 256, TYPE_LONG, width); // ImageWidth + entry(buf, 257, TYPE_LONG, height); // ImageLength + entryShort(buf, 258, 1); // BitsPerSample + entryShort(buf, 259, 4); // Compression: CCITT T.6 (Group 4) + // The G4 stream encodes white/black runs; this tag tells readers which sense to + // materialize them in. The PDF default (-B, BlackIs1=false) is the standard fax sense — + // TIFF WhiteIsZero (0); -W (BlackIs1=true) is the inverted sense, BlackIsZero (1). + // Pinned empirically by CcittTiffsTest's pixel-identical round trip. + entryShort(buf, 262, blackIs1 ? 1 : 0); // PhotometricInterpretation + entryShort(buf, 266, 1); // FillOrder: MSB first (params -M) + entry(buf, 273, TYPE_LONG, stripOffset); // StripOffsets + entryShort(buf, 277, 1); // SamplesPerPixel + entry(buf, 278, TYPE_LONG, height); // RowsPerStrip: the single strip + entry(buf, 279, TYPE_LONG, g4.length); // StripByteCounts + if (withResolution) { + entry(buf, 282, TYPE_RATIONAL, rationalOffset); // XResolution + entry(buf, 283, TYPE_RATIONAL, rationalOffset + 8); // YResolution + } + entry(buf, 293, TYPE_LONG, 0); // T6Options: none + if (withResolution) { + entryShort(buf, 296, 2); // ResolutionUnit: inch + } + buf.putInt(0); // no next IFD + + if (withResolution) { + buf.putInt(dpi).putInt(1).putInt(dpi).putInt(1); + } + Files.write(out, buf.array()); + } + + /** One IFD entry holding an inline LONG (or a RATIONAL's value offset). */ + private static void entry(ByteBuffer buf, int tag, short type, int value) { + buf.putShort((short) tag).putShort(type).putInt(1).putInt(value); + } + + /** One IFD entry holding an inline SHORT (left-justified in the 4-byte value field). */ + private static void entryShort(ByteBuffer buf, int tag, int value) { + buf.putShort((short) tag).putShort(TYPE_SHORT).putInt(1); + buf.putShort((short) value).putShort((short) 0); + } +} diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java index 8fc2306..3113420 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfBoxJbig2Assembler.java @@ -14,7 +14,6 @@ import java.util.Locale; import java.util.OptionalInt; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; import org.apache.pdfbox.Loader; @@ -85,7 +84,7 @@ private record ImageInfo(int width, int height, int resolution) {} * @param outPdf the lossless-JBIG2 PDF to write * @param source a PDF whose Info dict, XMP and version are inherited, or {@code null} for none * @param forcedDpi a single DPI to size every page with, or empty to read each image's own - * @param pool the worker pool the per-page {@code jbig2} encodes run on + * @param jobs how many {@code jbig2} children may encode at once * @param scratchDir scratch directory for the intermediate per-page JBIG2 streams * (caller-owned) * @throws IOException if the directory is empty, a tool fails, or the write fails @@ -95,7 +94,7 @@ public void assemble( Path outPdf, @Nullable Path source, OptionalInt forcedDpi, - ExecutorService pool, + int jobs, Path scratchDir) throws IOException { List images = sortedImages(imageDir); @@ -109,8 +108,9 @@ public void assemble( int index = i; tasks.add(() -> encode(jbig2, image, scratchDir, index, forcedDpi)); } - List pages = - Tasks.awaitAll(pool, tasks, "jbig2 encode interrupted", "jbig2 encode failed"); + // Virtual workers: each task is one subprocess wait plus a brief Pix header read; the + // semaphore-bounded admission keeps at most `jobs` jbig2 children alive at once. + List pages = Tasks.awaitAll(Tasks.Workers.virtual(jobs), tasks, "jbig2 encode"); try (PDDocument doc = new PDDocument()) { for (Page page : pages) { diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java index 35c8b77..4ec4eeb 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractor.java @@ -1,24 +1,31 @@ package io.github.p4suta.shared.pdf; +import io.github.p4suta.shared.imaging.Pix; import io.github.p4suta.shared.process.ProcessRunner; import io.github.p4suta.shared.process.Tasks; import io.github.p4suta.shared.process.ToolPath; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeoutException; +import java.util.stream.Stream; +import org.jspecify.annotations.Nullable; /** * Extracts a PDF's embedded bitonal images as TIFFs by driving {@code pdfimages}. The page range is * split across the worker pool (one {@code pdfimages -f/-l} per chunk) with distinct zero-padded - * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. The - * dominant scan DPI is read from {@code pdfimages -list} and passed to the clean step as an - * explicit DPI, since {@code pdfimages} tags the extracted TIFFs at a default 72 dpi. + * {@code page-cNN-} prefixes, so a name sort yields reading order and chunks never collide. An + * all-CCITT source is remuxed — the raw embedded G4 streams pass through into single-strip TIFFs + * with their true ppi stamped (see {@link #extract}); any other source is extracted decoded, where + * {@code pdfimages} tags the TIFFs at a default 72 dpi, so the dominant scan DPI from {@code + * pdfimages -list} is passed downstream explicitly either way. * *

The textual {@code pdfinfo}/{@code pdfimages -list} reports are parsed by the pure {@link * PdfListingParser}; this adapter only drives the external processes via {@link @@ -93,13 +100,34 @@ public int dominantDpi(Path pdf) throws IOException { /** * Extract all pages of {@code pdf} into {@code outDir} as TIFFs, parallelized over page-range - * chunks. {@code jobs} bounds both the chunk count and the pool slots used. + * chunks of about {@link #CHUNK_PAGES} pages, at most {@code jobs} running at once (and at most + * {@code 4 * jobs} chunks). + * + *

One {@code pdfimages -list} pass picks the mode: when every embedded image is 1-bpp CCITT + * (the usual self-scanned book), each chunk dumps the raw G4 streams ({@code -ccitt}) and wraps + * them into single-strip CCITT-G4 TIFFs — a pure remux: no decode/re-encode, intermediates tens + * of KB per page instead of the decoded megabytes, and the image's true ppi stamped instead of + * {@code pdfimages}' default 72 dpi. Every wrapped page is decoded back once as verification + * (PDF's {@code EncodedByteAlign} never reaches the dumped params, so trust requires a decode); + * a chunk whose dump or wrap deviates in any way is re-extracted decoded ({@code -tiff}), which + * is also the whole-run mode for any other source. */ - public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throws IOException { + public void extract(Path pdf, Path outDir, int jobs) throws IOException { int total = pageCount(pdf); - int chunks = Math.max(1, Math.min(jobs, total)); - int per = (total + chunks - 1) / chunks; String pdfimages = resolve("pdfimages", pdfimagesPropertyKey); + List rows = + PdfListingParser.parseImageRows( + capture(List.of(pdfimages, "-list", pdf.toString()), INFO_TIMEOUT)); + boolean rawCcitt = + !rows.isEmpty() + && rows.stream().allMatch(r -> r.bpc() == 1 && "ccitt".equals(r.enc())); + + // Chunks of ~CHUNK_PAGES rather than total/jobs: fast finishers free their pool slot early + // (the straggler tail shrinks from total/jobs to ~CHUNK_PAGES pages), and a streaming + // consumer can take pages chunk by chunk. Capped so a small book is not all process spawns. + int chunkCap = (int) Math.min(4L * jobs, total); + int chunks = Math.clamp(Math.ceilDiv(total, CHUNK_PAGES), 1, Math.max(1, chunkCap)); + int per = Math.ceilDiv(total, chunks); List> tasks = new ArrayList<>(); int chunk = 0; for (int first = 1; first <= total; first += per) { @@ -108,23 +136,155 @@ public void extract(Path pdf, Path outDir, int jobs, ExecutorService pool) throw outDir.resolve(String.format(Locale.ROOT, "page-c%03d-", chunk)).toString(); int from = first; int to = last; + List chunkRows = + rawCcitt ? rowsInRange(rows, from, to) : List.of(); tasks.add( () -> { - runDiscarding( - List.of( - pdfimages, - "-tiff", - "-f", - Integer.toString(from), - "-l", - Integer.toString(to), - pdf.toString(), - prefix)); + extractChunk(pdfimages, pdf, from, to, prefix, chunkRows); return null; }); chunk++; } - Tasks.awaitAll(pool, tasks, "pdfimages extract interrupted", "pdfimages extract failed"); + // Virtual workers: a chunk is one subprocess wait plus (in remux mode) a brief wrap; the + // semaphore-bounded admission keeps at most `jobs` pdfimages children alive at once. + Tasks.awaitAll(Tasks.Workers.virtual(jobs), tasks, "pdfimages extract"); + } + + /** Pages per extraction chunk; see {@link #extract}. */ + private static final int CHUNK_PAGES = 12; + + /** The listing rows for pages {@code from..to}, in listing (= dump) order. */ + private static List rowsInRange( + List rows, int from, int to) { + return rows.stream().filter(r -> r.page() >= from && r.page() <= to).toList(); + } + + /** + * Extract one page-range chunk: raw-CCITT remux when {@code ccittRows} describes it, decoded + * {@code -tiff} otherwise — and the {@code -tiff} rerun as the fallback when the dump deviates + * from the listing in any way (count, params shape, or a wrap that does not decode back). + */ + private void extractChunk( + String pdfimages, + Path pdf, + int from, + int to, + String prefix, + List ccittRows) + throws IOException { + if (ccittRows.isEmpty()) { + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + return; + } + runDiscarding(extractCommand(pdfimages, "-ccitt", from, to, pdf, prefix)); + if (!wrapChunk(prefix, ccittRows)) { + deleteByPrefix(prefix); + runDiscarding(extractCommand(pdfimages, "-tiff", from, to, pdf, prefix)); + } + } + + private static List extractCommand( + String pdfimages, String format, int from, int to, Path pdf, String prefix) { + return List.of( + pdfimages, + format, + "-f", + Integer.toString(from), + "-l", + Integer.toString(to), + pdf.toString(), + prefix); + } + + /** + * Wrap every {@code .ccitt} dump under {@code prefix} into a single-strip G4 TIFF, verifying + * each by decoding it back. Returns {@code false} (without cleaning up) on any deviation; the + * caller then discards the chunk's artifacts and falls back to a decoded extract. + */ + private static boolean wrapChunk(String prefix, List rows) + throws IOException { + List dumps = filesByPrefix(prefix, ".ccitt"); + if (dumps.size() != rows.size()) { + return false; + } + for (int i = 0; i < dumps.size(); i++) { + Path ccitt = dumps.get(i); + PdfListingParser.ImageRow row = rows.get(i); + Path paramsFile = withExtension(ccitt, ".params"); + if (!Files.isRegularFile(paramsFile)) { + return false; + } + CcittTiffs.@Nullable Params params = + CcittTiffs.parseParams(Files.readString(paramsFile, StandardCharsets.UTF_8)); + if (params == null || !CcittTiffs.supported(params, row.width())) { + return false; + } + Path out = withExtension(ccitt, ".tif"); + CcittTiffs.writeSingleStripG4( + out, + Files.readAllBytes(ccitt), + row.width(), + row.height(), + params.blackIs1(), + Math.max(row.xPpi(), 0)); + if (!decodesBack(out, row)) { + return false; + } + Files.delete(ccitt); + Files.delete(paramsFile); + } + return true; + } + + /** + * Whether the wrapped TIFF decodes to the listing row's dimensions — the read-back proof that + * the stream really was plain T.6 (an {@code EncodedByteAlign} stream, undetectable from the + * params file, fails to decode or comes back the wrong size here). + */ + private static boolean decodesBack(Path tif, PdfListingParser.ImageRow row) { + try (Pix pix = Pix.read(tif)) { + return pix.width() == row.width() && pix.height() == row.height(); + } catch (IllegalStateException e) { + return false; + } + } + + /** The files starting with {@code prefix}'s file name and ending in {@code suffix}, sorted. */ + private static List filesByPrefix(String prefix, String suffix) throws IOException { + Path prefixPath = Path.of(prefix); + Path dir = prefixPath.getParent(); + String name = String.valueOf(prefixPath.getFileName()); + if (dir == null) { + throw new IOException("extract prefix has no parent directory: " + prefix); + } + try (Stream entries = Files.list(dir)) { + return entries.filter( + p -> { + String fileName = String.valueOf(p.getFileName()); + return fileName.startsWith(name) && fileName.endsWith(suffix); + }) + .sorted(Comparator.comparing(p -> String.valueOf(p.getFileName()))) + .toList(); + } + } + + /** + * Delete every artifact of one chunk ({@code .ccitt}, {@code .params}, partial {@code .tif}). + */ + private static void deleteByPrefix(String prefix) throws IOException { + for (String suffix : List.of(".ccitt", ".params", ".tif")) { + for (Path file : filesByPrefix(prefix, suffix)) { + Files.deleteIfExists(file); + } + } + } + + /** A sibling of {@code file} with its extension replaced by {@code extension}. */ + private static Path withExtension(Path file, String extension) { + String name = String.valueOf(file.getFileName()); + int dot = name.lastIndexOf('.'); + String base = dot < 0 ? name : name.substring(0, dot); + return file.resolveSibling(base + extension); } /** Run an extraction command, discarding its (file-producing) output. */ diff --git a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java index efe12aa..0a2bb1b 100644 --- a/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java +++ b/shared/pdf/src/main/java/io/github/p4suta/shared/pdf/PdfListingParser.java @@ -1,6 +1,8 @@ package io.github.p4suta.shared.pdf; +import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; /** @@ -36,27 +38,17 @@ public static int parsePageCount(String pdfinfoOutput) { } /** - * The most common rounded x-ppi (column 13, 0-based 12) across the {@code image} rows of a - * {@code pdfimages -list} report, skipping the two header rows. Ties resolve to the first value - * seen and a non-positive winner falls back to {@link #DEFAULT_DPI}. + * The most common rounded x-ppi across the {@code image} rows of a {@code pdfimages -list} + * report. Ties resolve to the first value seen and a non-positive winner falls back to {@link + * #DEFAULT_DPI}. * * @param listOutput the full text {@code pdfimages -list} printed * @return the dominant rounded x-ppi, or {@link #DEFAULT_DPI} when none is usable */ public static int parseDominantDpi(String listOutput) { - String[] lines = listOutput.split("\n", -1); Map counts = new LinkedHashMap<>(); - for (int i = 2; i < lines.length; i++) { - String[] fields = lines[i].trim().split("\\s+", -1); - if (fields.length < 13 || !"image".equals(fields[2])) { - continue; - } - try { - int ppi = (int) Math.round(Double.parseDouble(fields[12])); - counts.merge(ppi, 1, Integer::sum); - } catch (NumberFormatException ignored) { - // Non-numeric x-ppi cell: skip this row. - } + for (ImageRow row : parseImageRows(listOutput)) { + counts.merge(row.xPpi(), 1, Integer::sum); } if (counts.isEmpty()) { return DEFAULT_DPI; @@ -71,4 +63,50 @@ public static int parseDominantDpi(String listOutput) { } return best > 0 ? best : DEFAULT_DPI; } + + /** + * One {@code image} row of a {@code pdfimages -list} report — the columns the extractor needs + * to pick its mode and to wrap raw CCITT dumps. + * + * @param page the 1-based page the image sits on + * @param width the image width in pixels + * @param height the image height in pixels + * @param bpc bits per component ({@code 1} for bitonal) + * @param enc the embedded encoding token ({@code ccitt}, {@code jbig2}, {@code jpeg}, {@code + * image}, …) + * @param xPpi the rounded x-ppi the image is placed at (0 when the cell is unusable) + */ + public record ImageRow(int page, int width, int height, int bpc, String enc, int xPpi) {} + + /** + * Parse the {@code image} rows of a {@code pdfimages -list} report, in listing order (the same + * order {@code pdfimages} dumps the images in), skipping the two header rows and any row with + * unparsable numeric cells. + * + * @param listOutput the full text {@code pdfimages -list} printed + * @return the parsed rows, possibly empty + */ + public static List parseImageRows(String listOutput) { + String[] lines = listOutput.split("\n", -1); + List rows = new ArrayList<>(); + for (int i = 2; i < lines.length; i++) { + String[] fields = lines[i].trim().split("\\s+", -1); + if (fields.length < 13 || !"image".equals(fields[2])) { + continue; + } + try { + rows.add( + new ImageRow( + Integer.parseInt(fields[0]), + Integer.parseInt(fields[3]), + Integer.parseInt(fields[4]), + Integer.parseInt(fields[7]), + fields[8], + (int) Math.round(Double.parseDouble(fields[12])))); + } catch (NumberFormatException ignored) { + // A non-numeric cell: skip this row. + } + } + return rows; + } } diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java new file mode 100644 index 0000000..63e93d6 --- /dev/null +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/CcittTiffsTest.java @@ -0,0 +1,160 @@ +package io.github.p4suta.shared.pdf; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.github.p4suta.shared.imaging.Pix; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import javax.imageio.ImageIO; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * The CCITT remux building blocks: the fax2tiff-style params parser (pure) and the single-strip G4 + * TIFF writer, verified by wrapping a real G4 stream (PDFBox's CCITT encoder, the same encoding a + * scanner PDF embeds) and decoding it back through Leptonica pixel-for-pixel. + */ +final class CcittTiffsTest { + + // ---- params parsing ---- + + @Test + void parsesTheUsualScannerShape() { + CcittTiffs.Params params = parsed("-4 -P -X 3496 -B -M\n"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", false, 3496, false)); + assertThat(CcittTiffs.supported(params, 3496)).isTrue(); + } + + @Test + void eolMarkersAreUnsupported() { + CcittTiffs.Params params = parsed("-4 -A -X 100 -W -M"); + assertThat(params).isEqualTo(new CcittTiffs.Params("-4", true, 100, true)); + // EOL markers are not representable in TIFF T.6. + assertThat(CcittTiffs.supported(params, 100)).isFalse(); + } + + @Test + void group3IsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-2 -P -X 100 -B -M"), 100)).isFalse(); + } + + @Test + void widthMismatchIsUnsupported() { + assertThat(CcittTiffs.supported(parsed("-4 -P -X 100 -B -M"), 200)).isFalse(); + } + + /** Parse params the test asserts are well-formed, made non-null for NullAway. */ + private static CcittTiffs.Params parsed(String text) { + return java.util.Objects.requireNonNull(CcittTiffs.parseParams(text)); + } + + @Test + void unknownTokensAndMissingFlagsAreUnparsable() { + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B -M -Z")).isNull(); // unknown flag + assertThat(CcittTiffs.parseParams("-4 -P -B -M")).isNull(); // no -X + assertThat(CcittTiffs.parseParams("-4 -P -X nope -B -M")).isNull(); // bad width + assertThat(CcittTiffs.parseParams("-4 -P -X 100 -B")).isNull(); // no -M + assertThat(CcittTiffs.parseParams("")).isNull(); + } + + // ---- TIFF wrapping ---- + + /** + * Round trip: draw a known bitonal pattern, encode it to a raw G4 stream with PDFBox's CCITT + * encoder (the very encoding a scanner PDF embeds and {@code pdfimages -ccitt} dumps), wrap the + * stream with {@link CcittTiffs#writeSingleStripG4}, and assert it decodes back through + * Leptonica pixel-identical to the original, with the stamped resolution. + */ + @Test + void wrappedStreamDecodesBackPixelIdentical(@TempDir Path tmp) throws Exception { + int width = 200; + int height = 150; + BufferedImage img = pattern(width, height); + G4Stream g4 = encodeG4(img); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 450); + + Path referencePng = tmp.resolve("reference.png"); + ImageIO.write(img, "png", referencePng.toFile()); + try (Pix expected = Pix.read(referencePng); + Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.height()).isEqualTo(height); + assertThat(actual.resolution()).isEqualTo(450); + assertThat(actual.blackPixels()).isPositive(); + assertThat(actual.pixelsEqual(expected)).isTrue(); + } + } + + @Test + void omitsResolutionTagsWhenDpiUnknown(@TempDir Path tmp) throws Exception { + int width = 64; + int height = 48; + G4Stream g4 = encodeG4(pattern(width, height)); + + Path wrapped = tmp.resolve("wrapped.tif"); + CcittTiffs.writeSingleStripG4(wrapped, g4.bytes, width, height, g4.blackIs1, 0); + + try (Pix actual = Pix.read(wrapped)) { + assertThat(actual.width()).isEqualTo(width); + assertThat(actual.resolution()).isZero(); + } + } + + /** A raw G4 (T.6) stream and the {@code BlackIs1} convention its encoder declared. */ + private static final class G4Stream { + final byte[] bytes; + final boolean blackIs1; + + G4Stream(byte[] bytes, boolean blackIs1) { + this.bytes = bytes; + this.blackIs1 = blackIs1; + } + } + + /** + * The raw CCITT G4 stream PDFBox's {@link CCITTFactory} encodes {@code img} to (lifted verbatim + * from the image XObject, exactly the bytes {@code pdfimages -ccitt} would dump), along with + * the {@code BlackIs1} decode parameter it declared. + */ + private static G4Stream encodeG4(BufferedImage img) throws IOException { + try (PDDocument doc = new PDDocument()) { + PDImageXObject image = CCITTFactory.createFromImage(doc, img); + COSDictionary decodeParms = + (COSDictionary) image.getCOSObject().getDictionaryObject(COSName.DECODE_PARMS); + boolean blackIs1 = + decodeParms != null && decodeParms.getBoolean(COSName.BLACK_IS_1, false); + try (InputStream in = image.getCOSObject().createRawInputStream()) { + return new G4Stream(in.readAllBytes(), blackIs1); + } + } + } + + /** A deterministic bitonal pattern with structure (bars + a block). */ + private static BufferedImage pattern(int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + Graphics2D g = img.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, width, height); + g.setColor(Color.BLACK); + for (int x = 4; x < width - 8; x += 12) { + g.fillRect(x, 8, 6, height - 16); + } + g.fillRect(width / 3, height / 3, width / 3, height / 3); + } finally { + g.dispose(); + } + return img; + } +} diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java index 3bd7edf..2bae0ec 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfBoxJbig2AssemblerTest.java @@ -8,8 +8,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.OptionalInt; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import javax.imageio.ImageIO; import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; @@ -57,13 +55,10 @@ void assemblesABitonalDirectoryIntoAJbig2Pdf(@TempDir Path tmp) throws Exception writeBitonalPage(imageDir.resolve("page-000.png"), 64, 48); writeBitonalPage(imageDir.resolve("page-001.png"), 80, 60); Path outPdf = tmp.resolve("out.pdf"); - - ExecutorService pool = Executors.newFixedThreadPool(2); try { new PdfBoxJbig2Assembler(JBIG2_KEY) - .assemble(imageDir, outPdf, null, OptionalInt.of(300), pool, scratch); + .assemble(imageDir, outPdf, null, OptionalInt.of(300), 2, scratch); } finally { - pool.shutdownNow(); } assertThat(Files.exists(outPdf)).isTrue(); @@ -103,13 +98,10 @@ void inheritsInfoAndVersionFromTheSourcePdf(@TempDir Path tmp) throws Exception Path scratch = Files.createDirectory(tmp.resolve("scratch")); writeBitonalPage(imageDir.resolve("page-000.png"), 64, 48); Path outPdf = tmp.resolve("out.pdf"); - - ExecutorService pool = Executors.newSingleThreadExecutor(); try { new PdfBoxJbig2Assembler(JBIG2_KEY) - .assemble(imageDir, outPdf, source, OptionalInt.empty(), pool, scratch); + .assemble(imageDir, outPdf, source, OptionalInt.empty(), 2, scratch); } finally { - pool.shutdownNow(); } try (PDDocument doc = Loader.loadPDF(outPdf.toFile())) { @@ -125,7 +117,6 @@ void emptyDirectoryFailsBeforeTouchingTheToolchain(@TempDir Path tmp) throws Exc // runs before tool resolution). Works on any host. Path imageDir = Files.createDirectory(tmp.resolve("empty")); Path scratch = Files.createDirectory(tmp.resolve("scratch")); - ExecutorService pool = Executors.newSingleThreadExecutor(); try { PdfBoxJbig2Assembler assembler = new PdfBoxJbig2Assembler(JBIG2_KEY); assertThatThrownBy( @@ -135,12 +126,11 @@ void emptyDirectoryFailsBeforeTouchingTheToolchain(@TempDir Path tmp) throws Exc tmp.resolve("out.pdf"), null, OptionalInt.empty(), - pool, + 1, scratch)) .isInstanceOf(IOException.class) .hasMessageContaining("no cleaned images"); } finally { - pool.shutdownNow(); } } @@ -156,7 +146,6 @@ void missingJbig2BinaryFailsWithAClearMessage(@TempDir Path tmp) throws Exceptio String bogusKey = "shared.pdf.test.bogus.jbig2.path"; System.setProperty(bogusKey, tmp.resolve("definitely-not-jbig2").toString()); try { - ExecutorService pool = Executors.newSingleThreadExecutor(); try { PdfBoxJbig2Assembler assembler = new PdfBoxJbig2Assembler(bogusKey); assertThatThrownBy( @@ -166,11 +155,10 @@ void missingJbig2BinaryFailsWithAClearMessage(@TempDir Path tmp) throws Exceptio tmp.resolve("out.pdf"), null, OptionalInt.of(300), - pool, + 1, scratch)) .isInstanceOf(IOException.class); } finally { - pool.shutdownNow(); } } finally { System.clearProperty(bogusKey); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java index c9032d2..4d17e4d 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfImagesCliExtractorTest.java @@ -10,8 +10,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Stream; import javax.imageio.ImageIO; import org.apache.pdfbox.pdmodel.PDDocument; @@ -68,12 +66,7 @@ void extractsEmbeddedImagesAsTiffs(@TempDir Path tmp) throws Exception { writePdfWithImage(pdf, 120, 90); Path outDir = Files.createDirectory(tmp.resolve("out")); - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2, pool); - } finally { - pool.shutdownNow(); - } + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2); try (Stream entries = Files.list(outDir)) { List tiffs = @@ -94,6 +87,64 @@ void dominantDpiReturnsAPositiveResolution(@TempDir Path tmp) throws Exception { assertThat(dpi).isPositive(); } + /** Build a multi-page PDF embedding CCITT-G4 bitonal images, one per page, at ~200 ppi. */ + private static void writeCcittPdf(Path pdf, int pages, int imgW, int imgH) throws IOException { + try (PDDocument doc = new PDDocument()) { + for (int i = 0; i < pages; i++) { + BufferedImage bitonal = + new BufferedImage(imgW, imgH, BufferedImage.TYPE_BYTE_BINARY); + java.awt.Graphics2D g = bitonal.createGraphics(); + try { + g.setColor(java.awt.Color.WHITE); + g.fillRect(0, 0, imgW, imgH); + g.setColor(java.awt.Color.BLACK); + g.fillRect(10 + i, 10, imgW / 3, imgH / 2); + } finally { + g.dispose(); + } + float wPt = imgW * 72f / 200; + float hPt = imgH * 72f / 200; + PDPage page = new PDPage(new PDRectangle(wPt, hPt)); + doc.addPage(page); + PDImageXObject image = + org.apache.pdfbox.pdmodel.graphics.image.CCITTFactory.createFromImage( + doc, bitonal); + try (PDPageContentStream content = new PDPageContentStream(doc, page)) { + content.drawImage(image, 0, 0, wPt, hPt); + } + } + doc.save(pdf.toFile()); + } + } + + @Test + @EnabledIf("io.github.p4suta.shared.pdf.PdfImagesCliExtractorTest#toolsOnPath") + void remuxesAnAllCcittSourceIntoStampedG4Tiffs(@TempDir Path tmp) throws Exception { + Path pdf = tmp.resolve("scan.pdf"); + writeCcittPdf(pdf, 3, 240, 180); + Path outDir = Files.createDirectory(tmp.resolve("out")); + + new PdfImagesCliExtractor(PDFIMAGES_KEY, PDFINFO_KEY).extract(pdf, outDir, 2); + + try (Stream entries = Files.list(outDir)) { + List files = entries.sorted().toList(); + // The remux leaves exactly one .tif per page — no .ccitt/.params residue. + assertThat(files).hasSize(3); + assertThat(files).allSatisfy(p -> assertThat(p.toString()).endsWith(".tif")); + for (Path tif : files) { + try (io.github.p4suta.shared.imaging.Pix pix = + io.github.p4suta.shared.imaging.Pix.read(tif)) { + assertThat(pix.width()).isEqualTo(240); + assertThat(pix.height()).isEqualTo(180); + // The remux stamps the image's true ppi instead of pdfimages' default 72. + assertThat(pix.resolution()).isEqualTo(200); + // Black ink, not inverted: the drawn block is ~1/6 of the page. + assertThat(pix.blackPixels()).isGreaterThan(0).isLessThan(240L * 180 / 2); + } + } + } + } + @Test void missingToolFailsWithAClearMessage(@TempDir Path tmp) throws Exception { Path pdf = tmp.resolve("doc.pdf"); diff --git a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java index f228032..2bba8d0 100644 --- a/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java +++ b/shared/pdf/src/test/java/io/github/p4suta/shared/pdf/PdfListingParserTest.java @@ -120,4 +120,27 @@ void parseDominantDpiSkipsNonImageRowsWithTooFewFields() { PdfListingParser.DEFAULT_DPI, PdfListingParser.parseDominantDpi("hdr\n----\n 1 0 smask\n")); } + + @Test + void parseImageRowsReadsTheColumnsTheExtractorNeeds() { + var rows = PdfListingParser.parseImageRows(LIST); + assertEquals(3, rows.size()); + assertEquals(new PdfListingParser.ImageRow(1, 2480, 3508, 1, "ccitt", 300), rows.get(0)); + assertEquals(new PdfListingParser.ImageRow(3, 1240, 1754, 1, "ccitt", 150), rows.get(2)); + } + + @Test + void parseImageRowsSkipsMalformedAndNonImageRows() { + String mixed = + """ + page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio + -------------------------------------------------------------------------------------------- + 1 0 smask 2480 3508 gray 1 1 ccitt no 7 0 300 300 101K 1.2% + 2 1 image bad 3508 gray 1 1 ccitt no 11 0 300 300 99K 1.1% + 3 2 image 2480 3508 rgb 3 8 jpeg no 14 0 150 150 40K 1.0% + """; + var rows = PdfListingParser.parseImageRows(mixed); + assertEquals(1, rows.size()); + assertEquals(new PdfListingParser.ImageRow(3, 2480, 3508, 8, "jpeg", 150), rows.get(0)); + } } diff --git a/shared/process/build.gradle.kts b/shared/process/build.gradle.kts index 5ca5418..f34fc9a 100644 --- a/shared/process/build.gradle.kts +++ b/shared/process/build.gradle.kts @@ -20,8 +20,11 @@ plugins { // (exitCode + stdout + stderr + elapsed Duration; tate's typed Result is the shape donor) and // throwing TimeoutException on timeout. The caller passes the set of acceptable non-zero exits // (generalizing despeckle's hardcoded qpdf-exit-3 tolerance) — a non-acceptable exit throws. -// * Tasks — awaitAll: a parameterized parallel fan-out over a caller-owned pool (register's), -// collecting results in submission order and aggregating the first failure as an IOException. +// * Tasks — awaitAll: a parallel fan-out over a batch-owned executor (Workers.platform for +// CPU-bound work, Workers.virtual for subprocess waits), collecting results in submission +// order with fail-fast sibling interruption, quiescence before the failure propagates, and +// exception identity preserved (domain RuntimeExceptions keep their error kind). Mirrors +// StructuredTaskScope.join() semantics on final-Java features. // // No domain exceptions are reachable here (that is the point of generalizing): a launch failure or // an unacceptable exit surfaces as a plain IOException, a timeout as a TimeoutException — so the @@ -37,13 +40,11 @@ dependencies { // Coverage floor: the same realistic infra-like 0.75 line / 0.60 branch the despeckle // :infrastructure and :shared:imaging modules use — this is process/exec plumbing, not branch-rich -// domain logic, so the domain-grade 0.95/0.90 the kernel/observability use would be dishonest. The -// only branches a unit test cannot drive are the InterruptedException catches (you cannot -// deterministically interrupt the waiting thread mid-waitFor) and a stray launch-failure path; at -// CLASS granularity the imaging-style exclusion would throw away ProcessRunner's genuinely covered -// happy/timeout/exit paths too, so NO class is excluded — the lenient floor absorbs the few -// untestable catch arms. The same self-contained block the other shared modules carry, since the -// floor is not applied by any convention plugin. +// domain logic, so the domain-grade 0.95/0.90 the kernel/observability use would be dishonest. +// (The interrupt arms are now deterministically covered — TasksTest and ProcessRunnerTest drive +// them with latch-coordinated interrupts — but a stray launch-failure path remains; the lenient +// floor absorbs it.) The same self-contained block the other shared modules carry, since the floor +// is not applied by any convention plugin. tasks.named("jacocoTestCoverageVerification") { dependsOn(tasks.named("test")) violationRules { diff --git a/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java b/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java index f1349fa..8ecfefe 100644 --- a/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java +++ b/shared/process/src/main/java/io/github/p4suta/shared/process/ProcessRunner.java @@ -81,9 +81,17 @@ public static Result run( try (ExecutorService drainers = Executors.newVirtualThreadPerTaskExecutor()) { Future out = drainers.submit(() -> process.getInputStream().readAllBytes()); Future err = drainers.submit(() -> process.getErrorStream().readAllBytes()); - if (!process.waitFor(timeout.toMillis(), TimeUnit.MILLISECONDS)) { + try { + if (!process.waitFor(timeout.toMillis(), TimeUnit.MILLISECONDS)) { + process.destroyForcibly(); + throw new TimeoutException(command.get(0) + " timed out after " + timeout); + } + } catch (InterruptedException e) { + // Kill the child before propagating, exactly like the timeout path: the drainers + // then reach EOF, so the try-with-resources close() returns instead of waiting on + // a live child's pipes — and no orphaned process outlives the interrupted caller. process.destroyForcibly(); - throw new TimeoutException(command.get(0) + " timed out after " + timeout); + throw e; } // Decode leniently (replace malformed bytes), matching the long-standing behavior. String stdout = new String(await(out), StandardCharsets.UTF_8); diff --git a/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java b/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java index 725d3aa..7212886 100644 --- a/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java +++ b/shared/process/src/main/java/io/github/p4suta/shared/process/Tasks.java @@ -1,65 +1,236 @@ package io.github.p4suta.shared.process; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.Semaphore; +import java.util.concurrent.atomic.AtomicBoolean; +import org.jspecify.annotations.Nullable; /** - * Runs a batch of {@link Callable}s on a caller-owned {@link ExecutorService} and collects their - * results in submission order, turning the executor's checked machinery into a single {@link - * IOException}. + * Runs a batch of {@link Callable}s on a batch-owned executor with fail-fast semantics: the first + * failure interrupts the remaining workers, the batch waits until every worker has actually stopped + * (quiescence), and only then does the failure propagate — so a caller's {@code finally} can safely + * delete the directories the workers were writing into. + * + *

The propagated failure keeps its identity: a task's own {@link IOException} or {@link + * RuntimeException} (the apps' domain exceptions are runtime exceptions) is re-thrown unchanged, so + * the shared exception mapper still sees the original kind and maps the right exit code and {@code + * RunFailed} token; an {@link UncheckedIOException} is unwrapped to its cause. This deliberately + * mirrors {@code StructuredTaskScope.join()}'s contract (fail-fast, sibling interruption, + * quiescence before propagation) on final-Java features, so a future swap to structured concurrency + * stays internal to this class. */ public final class Tasks { private Tasks() {} /** - * Submit every task, wait for all of them, and return their results in the order {@code tasks} - * was given. The first failure stops collection and surfaces as an {@link IOException}: a - * task's own {@link IOException} is re-thrown unchanged (so callers see the real cause), any - * other failure is wrapped with {@code failureMessage}, and an interruption restores the - * thread's interrupt flag and is reported with {@code interruptedMessage}. + * How one batch schedules its workers. * - * @param pool the executor to run the tasks on (the caller owns its lifecycle) + *

{@link #platform(int)} is for CPU-bound work (Leptonica/FFM pixel ops, TIFF G4 encoding): + * a fixed pool of platform threads, because a long native downcall pins a virtual thread's + * carrier, which would silently cap concurrency at the carrier count. {@link #virtual(int)} is + * for subprocess-bound work (per-page {@code jbig2}, {@code pdfimages} chunks): one virtual + * thread per task, with at most {@code maxInFlight} admitted past an internal semaphore so the + * spawned child processes stay bounded. + */ + public static final class Workers { + + private final int limit; + private final boolean virtualThreads; + + private Workers(int limit, boolean virtualThreads) { + if (limit < 1) { + throw new IllegalArgumentException("worker limit must be >= 1, got " + limit); + } + this.limit = limit; + this.virtualThreads = virtualThreads; + } + + /** {@code jobs} platform threads — CPU-bound work. */ + public static Workers platform(int jobs) { + return new Workers(jobs, false); + } + + /** + * A virtual thread per task, at most {@code maxInFlight} running at once — work that mostly + * waits on a subprocess. + */ + public static Workers virtual(int maxInFlight) { + return new Workers(maxInFlight, true); + } + + private ExecutorService newExecutor() { + return virtualThreads + ? Executors.newVirtualThreadPerTaskExecutor() + : Executors.newFixedThreadPool(limit); + } + + private @Nullable Semaphore newPermits() { + return virtualThreads ? new Semaphore(limit) : null; + } + } + + /** + * Per-item completion callback, invoked on the orchestrating thread in completion order: {@code + * done} runs {@code 1..total}, strictly increasing, successes only — so progress events built + * on it are ordered without any thread-safety burden on the listener. + */ + @FunctionalInterface + public interface ItemProgress { + + /** The no-op listener. */ + ItemProgress NONE = (done, total) -> {}; + + /** + * One more item finished. + * + * @param done how many items have finished so far (1-based, strictly increasing) + * @param total how many items the batch runs + */ + void onItem(int done, int total); + } + + /** {@link #awaitAll(Workers, List, String, ItemProgress)} without progress reporting. */ + public static List awaitAll(Workers workers, List> tasks, String label) + throws IOException { + return awaitAll(workers, tasks, label, ItemProgress.NONE); + } + + /** + * Run every task on a batch-owned executor and return their results in submission order. + * + *

On the first failure the remaining workers are interrupted ({@code shutdownNow}) and the + * batch waits for full quiescence before the failure propagates with its identity intact: + * {@link IOException} and {@link RuntimeException} unchanged, {@link UncheckedIOException} + * unwrapped to its cause, anything else wrapped as {@code IOException(label + " failed")}. + * Caller interruption likewise stops and joins the workers, restores the interrupt flag, and + * surfaces as {@code IOException(label + " interrupted")}. + * + * @param workers the scheduling mode (see {@link Workers}) * @param tasks the work to run, one result per task - * @param interruptedMessage the message for an {@link IOException} raised on interruption - * @param failureMessage the message wrapping a non-{@code IOException} task failure + * @param label the batch's name in failure/interruption messages (e.g. {@code "G4 encode"}) + * @param progress called on this thread after each successful item * @param the task result type * @return the task results, in submission order * @throws IOException if any task fails or the wait is interrupted */ public static List awaitAll( - ExecutorService pool, - List> tasks, - String interruptedMessage, - String failureMessage) + Workers workers, List> tasks, String label, ItemProgress progress) throws IOException { - List> futures; - try { - futures = pool.invokeAll(tasks); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException(interruptedMessage, e); - } - List results = new ArrayList<>(futures.size()); - for (Future future : futures) { + int total = tasks.size(); + List<@Nullable T> results = new ArrayList<>(Collections.nCopies(total, null)); + try (ExecutorService pool = workers.newExecutor()) { + ExecutorCompletionService> completion = + new ExecutorCompletionService<>(pool); + @Nullable Semaphore permits = workers.newPermits(); + // The fail-fast gate: set by the first worker whose task throws, checked by every + // worker before it starts a task. shutdownNow() alone cannot stop a queued task a + // newly-freed worker dequeues in the instant before the orchestrator reacts; the gate + // closes that window (a gated task completes with the BatchCancelled marker instead + // of running user code). + AtomicBoolean failed = new AtomicBoolean(); + for (int i = 0; i < total; i++) { + int index = i; + Callable task = tasks.get(i); + completion.submit( + () -> { + if (failed.get()) { + throw new BatchCancelled(); + } + // Acquired inside the worker so submission never blocks and an + // interrupt during the wait cancels cleanly. + if (permits != null) { + permits.acquire(); + } + try { + return new IndexedResult<>(index, task.call()); + } catch (Throwable t) { + failed.set(true); + throw t; + } finally { + if (permits != null) { + permits.release(); + } + } + }); + } try { - results.add(future.get()); + int done = 0; + while (done < total) { + Future> future = completion.take(); + try { + IndexedResult result = future.get(); + results.set(result.index(), result.value()); + done++; + progress.onItem(done, total); + } catch (ExecutionException e) { + if (e.getCause() instanceof BatchCancelled) { + // A gated sibling, not the root cause — keep draining: the failure + // that set the gate already ran, so its completion is on its way. + continue; + } + pool.shutdownNow(); + throw failure(e.getCause(), label); + } + } } catch (InterruptedException e) { + pool.shutdownNow(); Thread.currentThread().interrupt(); - throw new IOException(interruptedMessage, e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - if (cause instanceof IOException io) { - throw io; - } - throw new IOException(failureMessage, cause); + throw new IOException(label + " interrupted", e); } + // The try-with-resources close() awaits full termination on every path, so by the + // time a result or an exception reaches the caller, no worker is still running (or + // still writing into directories the caller is about to delete). } - return results; + return castNonNull(results); + } + + /** One task's result tagged with its submission index, for submission-order assembly. */ + private record IndexedResult(int index, @Nullable T value) {} + + /** The internal marker a gated (skipped-after-failure) task completes with; never user code. */ + private static final class BatchCancelled extends RuntimeException { + private static final long serialVersionUID = 1L; + + BatchCancelled() { + super("cancelled by an earlier failure in the batch", null, false, false); + } + } + + /** The failure to propagate for a task that threw {@code cause}; see {@link #awaitAll}. */ + private static IOException failure(@Nullable Throwable cause, String label) { + switch (cause) { + case IOException io -> { + return io; + } + case UncheckedIOException unchecked -> { + // UncheckedIOException's constructor requires a non-null cause. + return java.util.Objects.requireNonNull(unchecked.getCause()); + } + case RuntimeException runtime -> throw runtime; + case Error error -> throw error; + case null, default -> { + return new IOException(label + " failed", cause); + } + } + } + + /** + * {@return the results list with every slot filled} Every slot was set exactly once (one + * completion per submitted task), so the nullable build-up list is safe to expose as non-null. + */ + @SuppressWarnings({"unchecked", "NullAway"}) // each index 0..total-1 was set by its task + private static List castNonNull(List<@Nullable T> results) { + return (List) (List) results; } } diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java index 5514e24..c826d9d 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/ProcessRunnerTest.java @@ -111,4 +111,38 @@ void missingBinaryFailsWithIoException() { Duration.ofSeconds(10))) .isInstanceOf(IOException.class); } + + @Test + @org.junit.jupiter.api.Timeout(15) + void interruptedWaiterKillsTheChildAndUnwindsPromptly() throws Exception { + // Regression: an interrupt during waitFor used to propagate WITHOUT killing the child, so + // the drainer close() hung on the live child's pipes (here: up to the sleep's 30s) and the + // child leaked. Now the child is killed first, so the call unwinds promptly — the @Timeout + // and the bounded join are the proof. + var thrown = new java.util.concurrent.atomic.AtomicReference(); + var flagRestoredPathTaken = new java.util.concurrent.atomic.AtomicBoolean(); + var started = new java.util.concurrent.CountDownLatch(1); + Thread caller = + new Thread( + () -> { + try { + started.countDown(); + ProcessRunner.run(List.of("sleep", "30"), Duration.ofMinutes(5)); + } catch (InterruptedException e) { + thrown.set(e); + flagRestoredPathTaken.set(true); + } catch (Exception e) { + thrown.set(e); + } + }); + caller.start(); + assertThat(started.await(10, java.util.concurrent.TimeUnit.SECONDS)).isTrue(); + // Give the child a beat to actually spawn before interrupting the waiter. + Thread.sleep(300); + caller.interrupt(); + caller.join(10_000); + assertThat(caller.isAlive()).as("the interrupted run must unwind promptly").isFalse(); + assertThat(thrown.get()).isInstanceOf(InterruptedException.class); + assertThat(flagRestoredPathTaken).isTrue(); + } } diff --git a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java index 0b689d1..3356f83 100644 --- a/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java +++ b/shared/process/src/test/java/io/github/p4suta/shared/process/TasksTest.java @@ -1,74 +1,344 @@ package io.github.p4suta.shared.process; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +/** + * The batch contract of {@link Tasks#awaitAll(Tasks.Workers, List, String, Tasks.ItemProgress)}: + * submission-order results, fail-fast with sibling interruption, quiescence before the failure + * propagates, exception identity, orchestrator-thread progress, and the virtual mode's in-flight + * bound. All coordination is latch-based — no sleeps as logic. + */ +@Timeout(30) class TasksTest { + // ---- happy path ---- + @Test void returnsResultsInSubmissionOrder() throws IOException { - ExecutorService pool = Executors.newFixedThreadPool(4); - try { - List> tasks = new ArrayList<>(); - for (int i = 0; i < 16; i++) { - int value = i; - tasks.add(() -> value); - } - List results = Tasks.awaitAll(pool, tasks, "interrupted", "failed"); - for (int i = 0; i < 16; i++) { - assertEquals(i, results.get(i)); - } - } finally { - pool.shutdown(); + List> tasks = new ArrayList<>(); + for (int i = 0; i < 16; i++) { + int value = i; + tasks.add(() -> value); + } + List results = Tasks.awaitAll(Tasks.Workers.platform(4), tasks, "batch"); + for (int i = 0; i < 16; i++) { + assertEquals(i, results.get(i)); } } @Test - void surfacesATasksIOExceptionUnchanged() { - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - IOException boom = new IOException("boom"); - Callable task = - () -> { - throw boom; - }; - IOException thrown = - assertThrows( - IOException.class, - () -> Tasks.awaitAll(pool, List.of(task), "interrupted", "failed")); - // The task's own IOException is re-thrown as-is, not wrapped with the failure message. - assertSame(boom, thrown); - } finally { - pool.shutdown(); + void progressRunsOnTheCallingThreadInCompletionOrder() throws IOException { + Thread caller = Thread.currentThread(); + List seen = new ArrayList<>(); // safe: appended on the calling thread only + AtomicBoolean wrongThread = new AtomicBoolean(); + List> tasks = new ArrayList<>(); + for (int i = 0; i < 8; i++) { + int value = i; + tasks.add(() -> value); } + Tasks.awaitAll( + Tasks.Workers.platform(4), + tasks, + "batch", + (done, total) -> { + if (Thread.currentThread() != caller) { + wrongThread.set(true); + } + seen.add(new int[] {done, total}); + }); + assertFalse(wrongThread.get(), "progress must run on the orchestrating thread"); + assertEquals(8, seen.size()); + for (int i = 0; i < 8; i++) { + assertEquals(i + 1, seen.get(i)[0], "done is strictly increasing from 1"); + assertEquals(8, seen.get(i)[1]); + } + } + + // ---- failure semantics ---- + + @Test + void surfacesATasksIOExceptionByIdentity() { + IOException boom = new IOException("boom"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(boom)), + "batch")); + assertSame(boom, thrown); + } + + @Test + void surfacesARuntimeExceptionByIdentitySoErrorKindsSurvive() { + IllegalStateException boom = new IllegalStateException("domain kind carrier"); + IllegalStateException thrown = + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(boom)), + "batch")); + assertSame(boom, thrown); + } + + @Test + void unwrapsAnUncheckedIOExceptionToItsCause() { + IOException cause = new IOException("disk"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(new UncheckedIOException(cause))), + "batch")); + assertSame(cause, thrown); + } + + @Test + void wrapsAnUnknownCheckedFailureWithTheLabel() { + Exception checked = new Exception("odd"); + IOException thrown = + assertThrows( + IOException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(failing(checked)), + "G4 encode")); + assertEquals("G4 encode failed", thrown.getMessage()); + assertSame(checked, thrown.getCause()); + } + + // ---- fail-fast, sibling interruption, quiescence ---- + + @Test + void firstFailureInterruptsSiblingsAndSkipsQueuedWork() throws Exception { + CountDownLatch blockerStarted = new CountDownLatch(1); + AtomicBoolean blockerInterrupted = new AtomicBoolean(); + AtomicBoolean queuedRan = new AtomicBoolean(); + IllegalStateException boom = new IllegalStateException("first failure"); + + Callable blocker = + () -> { + blockerStarted.countDown(); + try { + new CountDownLatch(1).await(); // blocks until interrupted + } catch (InterruptedException e) { + blockerInterrupted.set(true); + } + return null; + }; + Callable failing = + () -> { + blockerStarted.await(); // fail only once the sibling is provably running + throw boom; + }; + Callable queued = + () -> { + queuedRan.set(true); + return null; + }; + + IllegalStateException thrown = + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), + List.of(blocker, failing, queued), + "batch")); + assertSame(boom, thrown); + assertTrue(blockerInterrupted.get(), "the running sibling must be interrupted"); + assertFalse(queuedRan.get(), "queued work must be discarded on failure"); } @Test - void wrapsANonIoFailureWithTheFailureMessage() { - ExecutorService pool = Executors.newFixedThreadPool(2); - try { - Callable task = + void quiescesBeforeTheFailurePropagates() { + AtomicInteger inFlight = new AtomicInteger(); + CountDownLatch blockerStarted = new CountDownLatch(1); + + Callable blocker = + () -> { + inFlight.incrementAndGet(); + try { + blockerStarted.countDown(); + new CountDownLatch(1).await(); // until interrupted + return null; + } catch (InterruptedException e) { + return null; + } finally { + inFlight.decrementAndGet(); + } + }; + Callable failing = + () -> { + blockerStarted.await(); + throw new IllegalStateException("boom"); + }; + + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(2), List.of(blocker, failing), "batch")); + // The contract callers' finally-blocks rely on: when awaitAll throws, no worker is still + // running (still writing into directories about to be deleted). + assertEquals(0, inFlight.get()); + } + + @Test + void progressCountsOnlySuccessesContiguously() { + // Released from the PROGRESS callback (the orchestrating thread), not from the ok tasks: + // a task-side latch would leave a window where the failing task's completion overtakes an + // ok task's still-being-enqueued completion, making the recorded sequence racy. + CountDownLatch twoConsumed = new CountDownLatch(2); + List dones = new ArrayList<>(); // appended on the calling thread only + Callable ok = () -> null; + Callable failing = + () -> { + twoConsumed.await(); // both successes are consumed first, provably + throw new IllegalStateException("boom"); + }; + assertThrows( + IllegalStateException.class, + () -> + Tasks.awaitAll( + Tasks.Workers.platform(3), + List.of(ok, ok, failing), + "batch", + (done, total) -> { + dones.add(done); + twoConsumed.countDown(); + })); + assertEquals(List.of(1, 2), dones); + } + + // ---- caller interruption ---- + + @Test + void callerInterruptionStopsWorkersAndRestoresTheFlag() throws Exception { + CountDownLatch workerStarted = new CountDownLatch(1); + AtomicBoolean workerInterrupted = new AtomicBoolean(); + AtomicInteger inFlight = new AtomicInteger(); + AtomicReference thrown = new AtomicReference<>(); + AtomicBoolean flagRestored = new AtomicBoolean(); + + Callable blocker = + () -> { + inFlight.incrementAndGet(); + try { + workerStarted.countDown(); + new CountDownLatch(1).await(); + return null; + } catch (InterruptedException e) { + workerInterrupted.set(true); + return null; + } finally { + inFlight.decrementAndGet(); + } + }; + + Thread caller = + new Thread( + () -> { + try { + Tasks.awaitAll( + Tasks.Workers.platform(1), List.of(blocker), "register"); + } catch (IOException e) { + thrown.set(e); + flagRestored.set(Thread.currentThread().isInterrupted()); + } + }); + caller.start(); + assertTrue(workerStarted.await(10, TimeUnit.SECONDS)); + caller.interrupt(); + caller.join(10_000); + assertFalse(caller.isAlive(), "the interrupted batch must unwind promptly"); + + assertNotNull(thrown.get()); + assertInstanceOf(IOException.class, thrown.get()); + assertEquals("register interrupted", thrown.get().getMessage()); + assertTrue(flagRestored.get(), "the interrupt flag must be restored"); + assertTrue(workerInterrupted.get(), "the worker must be stopped"); + assertEquals(0, inFlight.get(), "quiescent before the exception reached the caller"); + } + + // ---- virtual mode ---- + + @Test + void virtualModeBoundsInFlightWork() throws Exception { + int bound = 3; + AtomicInteger inFlight = new AtomicInteger(); + AtomicInteger highWater = new AtomicInteger(); + CountDownLatch gate = new CountDownLatch(1); + CountDownLatch boundRunning = new CountDownLatch(bound); + + List> tasks = new ArrayList<>(); + for (int i = 0; i < 12; i++) { + tasks.add( () -> { - throw new IllegalStateException("oops"); - }; - IOException thrown = - assertThrows( - IOException.class, - () -> Tasks.awaitAll(pool, List.of(task), "interrupted", "failed")); - assertEquals("failed", thrown.getMessage()); - assertInstanceOf(IllegalStateException.class, thrown.getCause()); - } finally { - pool.shutdown(); + int now = inFlight.incrementAndGet(); + highWater.accumulateAndGet(now, Math::max); + boundRunning.countDown(); + try { + gate.await(); + return null; + } finally { + inFlight.decrementAndGet(); + } + }); } + Thread opener = + new Thread( + () -> { + try { + // Open the gate only once `bound` workers are provably in flight, + // so the high-water assertion is meaningful, then let all pass. + boundRunning.await(); + gate.countDown(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + }); + opener.start(); + Tasks.awaitAll(Tasks.Workers.virtual(bound), tasks, "jbig2 encode"); + opener.join(10_000); + assertEquals(bound, highWater.get(), "no more than the bound may run at once"); + } + + @Test + void rejectsANonPositiveWorkerLimit() { + assertThrows(IllegalArgumentException.class, () -> Tasks.Workers.platform(0)); + assertThrows(IllegalArgumentException.class, () -> Tasks.Workers.virtual(-1)); + } + + /** A task that throws {@code failure} once called. */ + private static Callable failing(Exception failure) { + return () -> { + throw failure; + }; } }