Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ public record Config(
* Aggregate outcome of a run.
*
* @param pages number of pages processed
* @param componentsRemoved total components removed across all pages
* @param componentsRemoved total components removed across all pages; {@code 0} when no report
* consumed component stats (counting is skipped for speed without a report)
* @param overRemovalWarnings number of pages flagged for possible over-removal
*/
public record Summary(int pages, long componentsRemoved, int overRemovalWarnings) {}
Expand All @@ -96,14 +97,20 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept
}
LOG.info("despeckling {} page(s) with {} thread(s)", files.size(), config.jobs());

@Nullable Path reportDir = config.reportDir();
boolean reporting = reportDir != null;
Reporter report =
config.reportDir() == null
? Reporter.noOp()
: reporterFactory.create(config.reportDir(), config.flipbook());
reportDir != null
? reporterFactory.create(reportDir, config.flipbook())
: Reporter.noOp();
// The report is the only consumer of per-page component counts, and counting is a full
// connected-component labeling twice per page — skip it when no report will be written.
ProcessOptions options =
reporting ? config.options() : config.options().withoutComponentStats();

List<Callable<PageOutcome>> tasks = new ArrayList<>(files.size());
for (Path src : files) {
tasks.add(() -> processOne(src, config, report));
tasks.add(() -> processOne(src, config, options, report));
}
// Platform workers: each page is CPU-bound Leptonica work (FFM downcalls pin virtual
// threads' carriers). The fan-out fails fast and quiesces before throwing, so a failed
Expand All @@ -124,9 +131,11 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept
report.finish();

long totalRemoved = 0;
long blackRemoved = 0;
int warnings = 0;
for (PageOutcome outcome : outcomes) {
totalRemoved += outcome.result().componentsRemoved();
blackRemoved += outcome.result().blackPixelsRemoved();
if (outcome.result().isOverRemoval()) {
warnings++;
LOG.warn(
Expand All @@ -135,17 +144,28 @@ public Summary run(Config config, PageProgressListener progress) throws IOExcept
Math.round(outcome.result().removedBlackPixelRatio() * 100));
}
}
LOG.info(
"done: {} page(s), {} component(s) removed, {} over-removal warning(s)",
files.size(),
totalRemoved,
warnings);
if (reporting) {
LOG.info(
"done: {} page(s), {} component(s) removed, {} over-removal warning(s)",
files.size(),
totalRemoved,
warnings);
} else {
// Without a report nothing counted components (the counting passes are skipped for
// speed), so the summary speaks in the always-measured black-pixel terms.
LOG.info(
"done: {} page(s), {} black pixel(s) removed, {} over-removal warning(s)",
files.size(),
blackRemoved,
warnings);
}
return new Summary(files.size(), totalRemoved, warnings);
}

private record PageOutcome(Path source, ProcessResult result) {}

private PageOutcome processOne(Path src, Config config, Reporter report) throws IOException {
private PageOutcome processOne(Path src, Config config, ProcessOptions options, Reporter report)
throws IOException {
Path dest =
CorpusFiles.mirrorDestination(
src, config.inputDir(), config.outputDir(), config.format().extension());
Expand All @@ -154,7 +174,7 @@ private PageOutcome processOne(Path src, Config config, Reporter report) throws
if (parent != null) {
Files.createDirectories(parent);
}
ProcessResult result = pageCleaner.clean(src, dest, config.format(), config.options());
ProcessResult result = pageCleaner.clean(src, dest, config.format(), options);
Path stem = config.inputDir().relativize(src);
report.addPage(stem, src, dest, result);
return new PageOutcome(src, result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,14 +148,23 @@ public Summary run(Config config) throws IOException {

if (config.reportParent() != null) {
batchReporter.write(config.reportParent(), books);
LOG.info(
"done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed",
ok,
skipped,
failed,
totalPages,
totalComponentsRemoved);
} else {
// Without reports the runs skip component counting (an expensive labeling, twice per
// page), so a component total would always read 0 — leave it out of the line.
LOG.info(
"done: {} ok, {} skipped, {} failed, {} page(s)",
ok,
skipped,
failed,
totalPages);
}
LOG.info(
"done: {} ok, {} skipped, {} failed, {} page(s), {} component(s) removed",
ok,
skipped,
failed,
totalPages,
totalComponentsRemoved);
return new Summary(ok, skipped, failed, totalPages, totalComponentsRemoved);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package io.github.p4suta.despeckle.application;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

Expand Down Expand Up @@ -141,4 +142,30 @@ void drivesTheReporterWhenAReportDirIsGiven(@TempDir Path tmp) throws IOExceptio
assertEquals(2, factory.reporter.pages.get(), "every page is reported");
assertEquals(1, factory.reporter.finished.get(), "the report is finalized once");
}

/**
 * With no report directory configured, the options that reach the page cleaner must have
 * component counting switched off.
 */
@Test
void skipsComponentCountingWithoutAReport(@TempDir Path tmp) throws IOException {
  Path inputDir = tmp.resolve("in");
  Path outputDir = tmp.resolve("out");
  writeInputs(inputDir, 1);
  FakePageCleaner recordingCleaner = new FakePageCleaner(LIGHT);
  DespeckleService service =
      new DespeckleService(recordingCleaner, new RecordingReporterFactory());

  // The trailing null is the "no report" case this test is named for.
  service.run(config(inputDir, outputDir, false, null));

  // The fake cleaner remembers the options of its most recent clean() call.
  ProcessOptions observed = java.util.Objects.requireNonNull(recordingCleaner.lastOptions);
  assertFalse(observed.collectComponentStats(), "no report -> counting passes are skipped");
}

/**
 * With a report directory configured, the options that reach the page cleaner must keep
 * component counting enabled.
 */
@Test
void keepsComponentCountingForTheReportPath(@TempDir Path tmp) throws IOException {
  Path inputDir = tmp.resolve("in");
  Path outputDir = tmp.resolve("out");
  Path reportDir = tmp.resolve("report");
  writeInputs(inputDir, 1);
  FakePageCleaner recordingCleaner = new FakePageCleaner(LIGHT);
  DespeckleService service =
      new DespeckleService(recordingCleaner, new RecordingReporterFactory());

  service.run(config(inputDir, outputDir, false, reportDir));

  // The fake cleaner remembers the options of its most recent clean() call.
  ProcessOptions observed = java.util.Objects.requireNonNull(recordingCleaner.lastOptions);
  assertTrue(observed.collectComponentStats(), "the report consumes the counts");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ private Fakes() {}
/** A {@link PageCleaner} that writes a stub output file and returns a fixed result. */
static final class FakePageCleaner implements PageCleaner {
final AtomicInteger calls = new AtomicInteger();
volatile @Nullable ProcessOptions lastOptions;
private final ProcessResult result;

FakePageCleaner(ProcessResult result) {
Expand All @@ -48,6 +49,7 @@ public ProcessResult clean(
throw new java.io.UncheckedIOException(e);
}
calls.incrementAndGet();
lastOptions = options;
return result;
}
}
Expand Down
31 changes: 16 additions & 15 deletions despeckle/docs/cleaner-baseline.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,28 @@ Times each Leptonica primitive the page cleaner composes on a synthetic
600-dpi A5 page (3496x4961 px, fixed seed). Re-run after any change to the
cleaner or the imaging bindings and compare before merging.

- Date (UTC): 2026-06-10 06:25:05
- Date (UTC): 2026-06-10 06:36:38
- Host: Linux amd64, 8 CPUs
- Samples: median of 10 reps after 2 warmups; single-threaded.

| op | median (ms) | min (ms) | calls/clean() | est. share of clean() |
|---|---:|---:|---:|---:|
| read TIFF-G4 | 2.59 | 2.52 | 1 | 1.5% |
| selectBySize k=6 (page) | 15.22 | 14.98 | 1 | 8.7% |
| selectBySize 15 (page) | 15.19 | 14.80 | 1 | 8.7% |
| selectBySize k=6 (inverted) | 22.16 | 21.89 | 2 | 25.3% |
| dilate 43x43 (text mask) | 37.52 | 37.01 | 1 | 21.5% |
| open 7x7 (page) | 12.18 | 12.04 | 1 | 7.0% |
| invert | 0.26 | 0.25 | 2 | 0.3% |
| subtract | 0.38 | 0.36 | 5 | 1.1% |
| and | 0.40 | 0.38 | 1 | 0.2% |
| or | 0.33 | 0.32 | 3 | 0.6% |
| countConnComp | 11.82 | 11.46 | 2 | 13.5% |
| read TIFF-G4 | 2.53 | 2.52 | 1 | 1.6% |
| selectBySize k=6 (page) | 16.04 | 14.90 | 1 | 9.9% |
| selectBySize 15 (page) | 15.02 | 14.84 | 1 | 9.3% |
| selectBySize k=6 (inverted) | 22.14 | 21.84 | 2 | 27.4% |
| dilate 43x43 (text mask) | 38.37 | 36.97 | 1 | 23.7% |
| open 7x7 (page) | 12.15 | 12.01 | 1 | 7.5% |
| invert | 0.25 | 0.25 | 2 | 0.3% |
| subtract | 0.40 | 0.38 | 5 | 1.2% |
| and | 0.41 | 0.40 | 1 | 0.3% |
| or | 0.39 | 0.35 | 3 | 0.7% |
| countConnComp | 11.53 | 11.46 | 2 | 14.3% |
| countPixels | 0.41 | 0.41 | 2 | 0.5% |
| write TIFF-G4 | 6.44 | 6.29 | 1 | 3.7% |
| **Σ(median × calls)** | 161.74 | | | 92.5% |
| **clean() end-to-end** | 174.91 | 173.28 | 1 | 100% |
| write TIFF-G4 | 6.34 | 6.26 | 1 | 3.9% |
| **Σ(median × calls)** | 162.69 | | | 100.6% |
| **clean() end-to-end** | 161.78 | 159.91 | 1 | 100% |
| **clean() without component stats** | 139.46 | 137.07 | 1 | 86.2% |

The Σ row landing near 100% means the table accounts for clean()'s real cost;
a large gap points at untimed work (allocation churn, codec internals).
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,16 @@
* @param removeIsolatedDust whether to run the isolated-medium-dust pass
* @param isolatedDustSizePx an explicit max size for an isolated speck, or empty to derive it; a
* present value also implies {@code removeIsolatedDust}
* @param collectComponentStats whether to count 8-connected components before and after — two of
* the most expensive full-page scans, so off when nothing consumes the counts (no report)
*/
public record ProcessOptions(
OptionalInt dpi,
OptionalInt speckSizePx,
boolean fillHoles,
boolean removeIsolatedDust,
OptionalInt isolatedDustSizePx) {
OptionalInt isolatedDustSizePx,
boolean collectComponentStats) {

/** Resolution assumed for the speck filter when neither a flag nor the image supplies one. */
public static final int DEFAULT_DPI = 300;
Expand All @@ -50,6 +53,16 @@ public record ProcessOptions(
}
}

/**
* The five-knob shape every direct caller uses; component counting defaults to on.
*
* <p>Forwards all five arguments unchanged to the canonical six-component constructor and
* supplies {@code true} for {@code collectComponentStats}.
*
* @param dpi forwarded as the record's {@code dpi} component
* @param speckSizePx forwarded as the record's {@code speckSizePx} component
* @param fillHoles forwarded as the record's {@code fillHoles} component
* @param removeIsolatedDust whether to run the isolated-medium-dust pass
* @param isolatedDustSizePx an explicit max size for an isolated speck, or empty to derive it
*/
public ProcessOptions(
OptionalInt dpi,
OptionalInt speckSizePx,
boolean fillHoles,
boolean removeIsolatedDust,
OptionalInt isolatedDustSizePx) {
// The trailing `true` is collectComponentStats: counting stays on unless a caller opts out.
this(dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, true);
}

/** Options with the isolated-dust pass off — the common case. */
public static ProcessOptions of(OptionalInt dpi, OptionalInt speckSizePx, boolean fillHoles) {
return new ProcessOptions(dpi, speckSizePx, fillHoles, false, OptionalInt.empty());
Expand All @@ -66,7 +79,17 @@ public ProcessOptions withDpi(int dpi) {
speckSizePx,
fillHoles,
removeIsolatedDust,
isolatedDustSizePx);
isolatedDustSizePx,
collectComponentStats);
}

/**
 * Returns a copy of these options with {@code collectComponentStats} set to {@code false} and
 * every other component carried over unchanged.
 *
 * <p>Intended for runs where nothing consumes the counts (no report), which saves two full
 * connected-component labelings per page.
 *
 * @return a new {@code ProcessOptions} with component counting disabled
 */
public ProcessOptions withoutComponentStats() {
  final boolean collectStats = false;
  return new ProcessOptions(
      dpi, speckSizePx, fillHoles, removeIsolatedDust, isolatedDustSizePx, collectStats);
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,65 @@
package io.github.p4suta.despeckle.domain.model;

import java.util.OptionalInt;

/**
* The outcome of despeckling one page.
*
* @param componentsBefore 8-connected component count of the input
* @param componentsAfter 8-connected component count of the output
* <p>The black-pixel counts are always measured — they feed the over-removal guard. The component
* counts are measured only when something consumes them (the HTML report): counting 8-connected
* components is a full connected-component labeling of the page, one of the most expensive scans in
* the whole clean, so a run with no report skips both counting passes and carries empty components
* here.
*
* @param componentsBefore 8-connected component count of the input, when counted
* @param componentsAfter 8-connected component count of the output, when counted
* @param blackPixelsBefore foreground pixel count of the input
* @param blackPixelsAfter foreground pixel count of the output
*/
public record ProcessResult(
int componentsBefore, int componentsAfter, long blackPixelsBefore, long blackPixelsAfter) {
OptionalInt componentsBefore,
OptionalInt componentsAfter,
long blackPixelsBefore,
long blackPixelsAfter) {

/**
* A black-pixel removal ratio above this flags a possibly over-cleaned page. The single domain
* home for the over-removal threshold the runner warns on and the report highlights.
*/
public static final double OVER_REMOVAL_WARN_RATIO = 0.03;

/**
* A counted result — the shape the report path consumes.
*
* <p>Wraps both component counts in {@link OptionalInt#of(int)} and delegates to the canonical
* constructor, so a result built this way always carries both counts.
*
* @param componentsBefore 8-connected component count of the input
* @param componentsAfter 8-connected component count of the output
* @param blackPixelsBefore foreground pixel count of the input
* @param blackPixelsAfter foreground pixel count of the output
*/
public ProcessResult(
int componentsBefore,
int componentsAfter,
long blackPixelsBefore,
long blackPixelsAfter) {
this(
OptionalInt.of(componentsBefore),
OptionalInt.of(componentsAfter),
blackPixelsBefore,
blackPixelsAfter);
}

/**
 * Builds the result of a run that skipped component counting: both component counts are empty
 * and only the black-pixel counts are carried.
 *
 * @param blackPixelsBefore foreground pixel count of the input
 * @param blackPixelsAfter foreground pixel count of the output
 * @return a result with empty {@code componentsBefore} and {@code componentsAfter}
 */
public static ProcessResult withoutComponentStats(
    long blackPixelsBefore, long blackPixelsAfter) {
  final OptionalInt notCounted = OptionalInt.empty();
  return new ProcessResult(notCounted, notCounted, blackPixelsBefore, blackPixelsAfter);
}

/**
 * Reports whether this result carries component counts.
 *
 * <p>Only {@code componentsBefore} is inspected.
 *
 * @return {@code true} when {@code componentsBefore} holds a value
 */
public boolean hasComponentStats() {
  return !componentsBefore.isEmpty();
}

/**
* Net drop in 8-connected components — dust removed minus any holes filled back in. Summed into
* the run total and plotted per page in the report.
* the run total and plotted per page in the report. {@code 0} when counting was skipped (see
* {@link #hasComponentStats()}).
*/
public int componentsRemoved() {
return componentsBefore - componentsAfter;
return componentsBefore.orElse(0) - componentsAfter.orElse(0);
}

/**
Expand All @@ -36,6 +73,11 @@ public double removedBlackPixelRatio() {
return (double) (blackPixelsBefore - blackPixelsAfter) / blackPixelsBefore;
}

/**
 * Black pixels removed from the page, computed as {@code blackPixelsBefore - blackPixelsAfter}.
 *
 * @return the difference between the input and output foreground pixel counts
 */
public long blackPixelsRemoved() {
  final long removed = blackPixelsBefore - blackPixelsAfter;
  return removed;
}

/**
* Whether {@link #removedBlackPixelRatio()} exceeds {@link #OVER_REMOVAL_WARN_RATIO}, the
* threshold the runner logs on and the report flags.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,21 @@ void isolatedDustSizeStaysAboveTheSpeckSizeAtVeryLowResolution() {
assertEquals(
2, options.isolatedDustSize(noImg()), "isolated-dust size floors at speckSize + 1");
}

/**
 * Component counting is on in the default options; {@code withoutComponentStats()} turns it off
 * without touching any other knob, and the off state survives {@code withDpi}.
 */
@Test
void componentStatsDefaultOnAndWitherTurnsThemOff() {
  ProcessOptions baseline = ProcessOptions.defaults();
  assertTrue(baseline.collectComponentStats(), "counting is on unless turned off");

  ProcessOptions statsOff = baseline.withoutComponentStats();
  assertFalse(statsOff.collectComponentStats());

  // The wither must leave the five remaining components exactly as they were.
  assertEquals(baseline.dpi(), statsOff.dpi());
  assertEquals(baseline.speckSizePx(), statsOff.speckSizePx());
  assertEquals(baseline.fillHoles(), statsOff.fillHoles());
  assertEquals(baseline.removeIsolatedDust(), statsOff.removeIsolatedDust());
  assertEquals(baseline.isolatedDustSizePx(), statsOff.isolatedDustSizePx());

  // A later withDpi copy must not silently re-enable counting.
  ProcessOptions rescaled = statsOff.withDpi(600);
  assertFalse(rescaled.collectComponentStats());
}
}
Loading
Loading