diff --git a/docs/superpowers/plans/2026-06-19-ext-api-orphan-tmp-cleanup.md b/docs/superpowers/plans/2026-06-19-ext-api-orphan-tmp-cleanup.md new file mode 100644 index 000000000..57597e458 --- /dev/null +++ b/docs/superpowers/plans/2026-06-19-ext-api-orphan-tmp-cleanup.md @@ -0,0 +1,641 @@ +# ext-api Orphan `gzip-chunk-*.tmp` Cleanup Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a Spring `ApplicationRunner` to `module-external-api` that deletes orphan `gzip-chunk-*.tmp` files older than 1h from `java.io.tmpdir` on boot. + +**Architecture:** Single `@Component` `OrphanTempFileCleanupHook` implements `ApplicationRunner`. `run(args)` delegates to `LogicExecutor.executeVoidJava(Runnable { runWithDeadline() }, ctx)`; `runWithDeadline()` runs `cleanupOrphans()` on the injected `loopExecutor`, bounded by a 30s timeout. `cleanupOrphans()` lists `scanDir` non-recursively (default `Paths.get(System.getProperty("java.io.tmpdir"))`), filters by regex `gzip-chunk-.*\.tmp`, deletes entries with `mtime < now - 1h`. Per-file `IOException` is logged + skipped (fail-soft). Final INFO summary log. + +**Tech Stack:** Kotlin, Spring Boot 3 `ApplicationRunner`, `LogicExecutor`, `java.nio.file`, JUnit 5, Mockito-Kotlin, `@TempDir`. + +**Spec:** `docs/superpowers/specs/2026-06-19-ext-api-orphan-tmp-cleanup-design.md` +**Issue:** #1296 + +--- + +## File Structure + +| File | Responsibility | +| ---- | -------------- | +| `module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt` (CREATE) | `@Component` ApplicationRunner + cleanup logic | +| `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` (CREATE) | 8 unit tests via @TempDir + injected Clock + Executor | + +No existing files modified. Bean auto-discovered via `ExternalApiApplication.scanBasePackages = ["maple.externalapi", ...]`. 
+ +--- + +## Task 1: Scaffold test class + production shell (failing tests = no impl) + +**Files:** +- Create: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` +- Create: `module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt` + +- [ ] **Step 1: Create test class skeleton with one failing test** + +```kotlin +package maple.externalapi.snapshot + +import maple.expectation.infrastructure.executor.LogicExecutor +import maple.expectation.infrastructure.executor.TaskContext +import org.assertj.core.api.Assertions.assertThat +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import org.mockito.kotlin.any +import org.mockito.kotlin.doAnswer +import org.mockito.kotlin.mock +import org.mockito.kotlin.verify +import org.slf4j.LoggerFactory +import java.nio.file.Files +import java.nio.file.Path +import java.nio.file.attribute.FileTime +import java.time.Clock +import java.time.Instant +import java.time.ZoneOffset +import java.util.concurrent.Executor +import kotlin.io.path.setLastModifiedTime + +class OrphanTempFileCleanupHookTest { + + @TempDir + lateinit var tmp: Path + + private val fixedNow: Instant = Instant.parse("2026-06-19T00:00:00Z") + private val clock: Clock = Clock.fixed(fixedNow, ZoneOffset.UTC) + + // Stub LogicExecutor that invokes the Runnable passed to executeVoidJava synchronously. + // We use executeVoidJava (Runnable-typed) instead of executeVoid (ThrowingRunnable) so the + // mock signature and the production signature both round-trip via the same Runnable type. + private val executor: LogicExecutor = mock { + on { executeVoidJava(any(), any()) } doAnswer { invocation -> + (invocation.arguments[0] as Runnable).run() + } + } + + // Default async executor: runs submitted Runnables synchronously on the caller thread. + // CompletableFuture.runAsync uses this to start cleanup; the future then completes + // synchronously. 
runWithDeadline's future.get(timeout) returns immediately. + private val syncAsyncExecutor: Executor = Executor { it.run() } + + private val log = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) + + private fun makeHook( + clock: Clock = this.clock, + scanDir: Path = tmp, + executor: LogicExecutor = this.executor, + asyncExecutor: Executor = syncAsyncExecutor, + timeoutSeconds: Long = 30, + ): OrphanTempFileCleanupHook = + OrphanTempFileCleanupHook(executor, asyncExecutor, clock, scanDir, timeoutSeconds) + + private fun createOrphan( + name: String, + size: Int = 10, + ageHours: Long = 0, + ): Path { + val file = tmp.resolve(name) + Files.write(file, ByteArray(size)) + file.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(ageHours * 3600))) + return file + } + + @Test + fun `deletes files older than 1 hour`() { + val file = createOrphan("gzip-chunk-uuid1-part-000001-.jsonl.gz.tmp", ageHours = 2) + makeHook().run(mock()) + assertThat(Files.exists(file)).isFalse + } +} +``` + +- [ ] **Step 2: Create production shell that compiles but fails the test** + +```kotlin +package maple.externalapi.snapshot + +import maple.expectation.infrastructure.executor.LogicExecutor +import maple.expectation.infrastructure.executor.TaskContext +import org.slf4j.LoggerFactory +import org.springframework.beans.factory.annotation.Qualifier +import org.springframework.boot.ApplicationArguments +import org.springframework.boot.ApplicationRunner +import org.springframework.stereotype.Component +import java.nio.file.Files +import java.nio.file.Path +import java.nio.file.Paths +import java.time.Clock +import java.time.Duration +import java.time.Instant +import java.util.concurrent.CompletableFuture +import java.util.concurrent.ExecutionException +import java.util.concurrent.Executor +import java.util.concurrent.TimeUnit +import java.util.concurrent.TimeoutException + +@Component +class OrphanTempFileCleanupHook( + private val executor: LogicExecutor, + 
+ @Qualifier("loopExecutor") + private val asyncExecutor: Executor, + private val clock: Clock = Clock.systemUTC(), + private val scanDir: Path = Paths.get(System.getProperty("java.io.tmpdir")), + private val timeoutSeconds: Long = 30, +) : ApplicationRunner { + + override fun run(args: ApplicationArguments) { + executor.executeVoidJava( + Runnable { runWithDeadline() }, + TaskContext.of("OrphanTempFileCleanup", "BootScan"), + ) + } + + /** + * Run [cleanupOrphans] on [asyncExecutor] bounded by [timeoutSeconds]. On timeout, the + * future is cancelled and boot proceeds. CompletableFuture.cancel(true) does NOT interrupt + * the worker thread, so an in-flight scan may still finish in the background; leftovers retry next boot. + * On other failures (e.g. IOException from Files.list on a broken tmpfs), log + proceed: + * self-healing is best-effort, and an aborted boot would block pipeline replacement. + */ + internal fun runWithDeadline() { + val future = try { + CompletableFuture.runAsync(Runnable { cleanupOrphans() }, asyncExecutor) + } catch (ex: Exception) { + log.error("[OrphanTempFileCleanup] cleanup submit failed: {}", ex.message, ex) + return + } + try { + future.get(timeoutSeconds, TimeUnit.SECONDS) + } catch (ex: TimeoutException) { + log.warn( + "[OrphanTempFileCleanup] cleanup exceeded {}s; cancelling, will retry next boot", + timeoutSeconds, + ) + future.cancel(true) + } catch (ex: Exception) { + val cause = (ex as? 
ExecutionException)?.cause ?: ex + log.error("[OrphanTempFileCleanup] cleanup failed: {}", cause.message, cause) + } + } + + private fun cleanupOrphans() { + val cutoff = Instant.now(clock).minus(CUTOFF) + var scanned = 0L + var deleted = 0L + var bytesFreed = 0L + var failed = 0L + + Files.list(scanDir).use { stream -> + stream + .filter { ORPHAN_PATTERN.matches(it.fileName.toString()) } + .forEach { file -> + scanned++ + val mtime = try { + Files.getLastModifiedTime(file).toInstant() + } catch (ex: java.io.IOException) { + log.warn("[OrphanTempFileCleanup] read mtime failed for {}: {}", file, ex.message) + failed++ + return@forEach + } + if (mtime.isBefore(cutoff)) { + try { + val size = Files.size(file) + Files.delete(file) + bytesFreed += size // count bytes only after the delete succeeded + deleted++ + } catch (ex: java.io.IOException) { + log.warn("[OrphanTempFileCleanup] delete failed for {}: {}", file, ex.message) + failed++ + } + } + } + } + + log.info( + "[OrphanTempFileCleanup] scanned={} deleted={} bytes_freed={} failed={}", + scanned, deleted, bytesFreed, failed, + ) + if (failed > 0) { + log.warn("[OrphanTempFileCleanup] {} files failed to clean; will retry next boot", failed) + } + } + + private companion object { + private val log = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) + private val ORPHAN_PATTERN = Regex("gzip-chunk-.*\\.tmp") + private val CUTOFF: Duration = Duration.ofHours(1) + } +} +``` + +- [ ] **Step 3: Run test, expect PASS for `deletes files older than 1 hour`** + +Run from repo root: +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest.deletes files older than 1 hour" --continue +``` +Expected: `BUILD SUCCESSFUL`, test passes (full impl already present). 
+ +- [ ] **Step 4: Commit** + +```bash +git add module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt \ + module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "feat(ext-api): OrphanTempFileCleanupHook — boot sweep of orphan gzip-chunk-*.tmp (skeleton + first test)" +``` + +--- + +## Task 2: Test — preserves files newer than 1 hour (active writer safety) + +**Files:** +- Modify: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` + +- [ ] **Step 1: Add test below `deletes files older than 1 hour`** + +```kotlin + @Test + fun `preserves files newer than 1 hour (active writer)`() { + val file = createOrphan("gzip-chunk-uuid2-part-000002-.jsonl.gz.tmp", ageHours = 0) + makeHook().run(mock()) + assertThat(Files.exists(file)).isTrue + } + + @Test + fun `preserves file exactly 1 hour old (cutoff boundary)`() { + val file = createOrphan("gzip-chunk-uuid3-part-000003-.jsonl.gz.tmp", ageHours = 1) + makeHook().run(mock()) + assertThat(Files.exists(file)).isTrue + } +``` + +- [ ] **Step 2: Run, expect both PASS** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest" --continue +``` +Expected: 3 tests, all pass. Boundary test confirms `isBefore(cutoff)` (strict less-than). + +- [ ] **Step 3: Commit** + +```bash +git add module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "test(ext-api): preserve < 1h files incl. 
cutoff boundary" +``` + +--- + +## Task 3: Test — ignores non-matching filenames + +**Files:** +- Modify: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` + +- [ ] **Step 1: Add test** + +```kotlin + @Test + fun `ignores non-matching filenames`() { + val unrelated = createOrphan("urgent-chunk-uuid-part-000001-.jsonl.gz.tmp", ageHours = 24) + val plainTxt = tmp.resolve("notes.txt") + Files.write(plainTxt, ByteArray(10)) + plainTxt.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(24 * 3600))) + val olderPrefix = tmp.resolve("gzip-archive.jsonl.gz") // not tmp suffix + Files.write(olderPrefix, ByteArray(10)) + olderPrefix.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(24 * 3600))) + + makeHook().run(mock()) + + assertThat(Files.exists(unrelated)).isTrue + assertThat(Files.exists(plainTxt)).isTrue + assertThat(Files.exists(olderPrefix)).isTrue + } +``` + +- [ ] **Step 2: Run, expect PASS** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest.ignores non-matching filenames" --continue +``` +Expected: PASS. + +- [ ] **Step 3: Commit** + +```bash +git add module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "test(ext-api): ignore non-matching filenames" +``` + +--- + +## Task 4: Test — continues after individual delete failure (fail-soft) + +**Files:** +- Modify: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` + +- [ ] **Step 1: Add test** + +```kotlin + @Test + fun `continues after individual delete failure`() { + val good = createOrphan("gzip-chunk-uuid4-part-000004-.jsonl.gz.tmp", ageHours = 2) + // A non-empty directory whose name matches the orphan pattern: Files.delete throws + // DirectoryNotEmptyException (an IOException) on every platform. 
+ // File permission bits are not used here: on POSIX, deletion is governed by the parent + // directory's permissions, so setReadable/setWritable(false) would not block the delete. + val held = Files.createDirectory(tmp.resolve("gzip-chunk-uuid5-part-000005-.jsonl.gz.tmp")) + Files.write(held.resolve("child"), ByteArray(1)) + // Set mtime last: creating the child above bumps the directory's mtime. + held.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(2 * 3600))) + + makeHook().run(mock()) + + assertThat(Files.exists(good)).isFalse // sibling cleaned up despite held failing + assertThat(Files.exists(held)).isTrue // delete failed, was logged + skipped, loop did not bail + } +``` + +- [ ] **Step 2: Run, expect PASS** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest.continues after individual delete failure" --continue +``` +Expected: PASS on all platforms (deleting a non-empty directory fails with `DirectoryNotEmptyException` everywhere, so the failure path is actually exercised). + +- [ ] **Step 3: Commit** + +```bash +git add module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "test(ext-api): fail-soft per-file delete" +``` + +--- + +## Task 5: Test — logs count and bytes freed at INFO + +**Files:** +- Modify: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` + +- [ ] **Step 1: Add a Logback `ListAppender` to capture logs** + +```kotlin +import ch.qos.logback.classic.Level +import ch.qos.logback.classic.Logger +import ch.qos.logback.classic.spi.ILoggingEvent +import ch.qos.logback.core.read.ListAppender + +// inside test class: + private lateinit var logAppender: ListAppender<ILoggingEvent> + // Nullable: Logback returns null when the logger inherits its level from a parent. 
+ private var originalLevel: Level? = null + + @org.junit.jupiter.api.BeforeEach + fun attachAppender() { + val logger = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) as Logger + originalLevel = logger.level + logger.level = Level.INFO + logAppender = ListAppender<ILoggingEvent>().apply { start() } + logger.addAppender(logAppender) + } + + @org.junit.jupiter.api.AfterEach + fun detachAppender() { + val logger = 
LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) as Logger + logger.detachAppender(logAppender) + logger.level = originalLevel + } +``` + +- [ ] **Step 2: Add test** + +```kotlin + @Test + fun `logs scanned deleted bytes_freed at INFO`() { + createOrphan("gzip-chunk-uuid6-part-000006-.jsonl.gz.tmp", size = 1024, ageHours = 2) + createOrphan("gzip-chunk-uuid7-part-000007-.jsonl.gz.tmp", size = 512, ageHours = 0) // skipped (active) + + makeHook().run(mock()) + + val summary = logAppender.list + .firstOrNull { it.formattedMessage.startsWith("[OrphanTempFileCleanup] scanned=") } + assertThat(summary).isNotNull + assertThat(summary!!.level).isEqualTo(Level.INFO) + val msg = summary.formattedMessage + assertThat(msg).contains("scanned=2") + assertThat(msg).contains("deleted=1") + assertThat(msg).contains("bytes_freed=1024") + assertThat(msg).contains("failed=0") + } +``` + +- [ ] **Step 3: Run, expect PASS** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest.logs scanned deleted bytes_freed at INFO" --continue +``` +Expected: PASS. + +- [ ] **Step 4: Commit** + +```bash +git add module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "test(ext-api): INFO summary log asserts scanned/deleted/bytes_freed/failed" +``` + +--- + +## Task 6: Test — deadline hit emits WARN + skips remaining + +**Files:** +- Modify: `module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt` + +- [ ] **Step 1: Add a test that drives `runWithDeadline` directly with a non-running executor and timeout=0** + +```kotlin + @Test + fun `runWithDeadline logs WARN and cancels when timeout fires`() { + // Executor that never invokes the task — future stays pending. + // timeoutSeconds = 0 → future.get(0, SECONDS) throws TimeoutException immediately. + // runWithDeadline must catch it, log WARN, and cancel the future. 
+ val neverRunsExecutor = Executor { /* drop the command */ } + val hook = makeHook(asyncExecutor = neverRunsExecutor, timeoutSeconds = 0) + + hook.runWithDeadline() + + val warn = logAppender.list + .firstOrNull { it.formattedMessage.contains("cleanup exceeded 0s") } + assertThat(warn).isNotNull + assertThat(warn!!.level).isEqualTo(Level.WARN) + } + + @Test + fun `runWithDeadline logs ERROR when submit fails`() { + // Executor that throws on submit — runAsync never creates the future; submit-fail + // path runs and logs the consolidated ERROR message. + val throwingExecutor = Executor { throw RuntimeException("simulated submit failure") } + val hook = makeHook(asyncExecutor = throwingExecutor) + + hook.runWithDeadline() + + val err = logAppender.list + .firstOrNull { it.formattedMessage.contains("cleanup submit failed") } + assertThat(err).isNotNull + assertThat(err!!.level).isEqualTo(Level.ERROR) + } +``` + +- [ ] **Step 2: Run, expect both PASS** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest.runWithDeadline*" --continue +``` +Expected: 2 tests, both PASS. Confirms the deadline WARN path and the error path. + +- [ ] **Step 3: Commit** + +```bash +git add module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt +git commit -m "test(ext-api): runWithDeadline — timeout WARN + failure ERROR" +``` + +--- + +## Task 7: Run full test class + assemble sanity + +- [ ] **Step 1: Run all tests in class** + +```bash +./gradlew :module-external-api:test --tests "maple.externalapi.snapshot.OrphanTempFileCleanupHookTest" --continue +``` +Expected: 8 tests, all PASS (6 cleanup behavior + 2 deadline/failure handling). + +- [ ] **Step 2: Compile check across module** + +```bash +./gradlew :module-external-api:compileKotlin :module-external-api:compileJava --continue +``` +Expected: `BUILD SUCCESSFUL`, no warnings on new file. 
+ +- [ ] **Step 3: No commit (verification only)** + +--- + +## Task 8: Boot runtime verification (per `workflow-rules.md`) + +- [ ] **Step 1: Start the server** + +```bash +cd /home/maple/probabilistic-valuation-engine +set -a && source .env && set +a +./gradlew :module-external-api:bootRun > /tmp/ext-api-bootrun.log 2>&1 & +echo $! > /tmp/ext-api-bootrun.pid +``` + +- [ ] **Step 2: Wait for "Started ExternalApiApplication"** + +```bash +for i in $(seq 1 60); do + if grep -q "Started ExternalApiApplication" /tmp/ext-api-bootrun.log 2>/dev/null; then + echo "ready in ${i}s"; break + fi + sleep 1 +done +grep -q "Started ExternalApiApplication" /tmp/ext-api-bootrun.log || (echo "TIMEOUT"; tail -50 /tmp/ext-api-bootrun.log; exit 1) +``` +Expected: "ready in Ns" within 60s. + +- [ ] **Step 3: Verify the cleanup summary log line was emitted** + +```bash +grep "\[OrphanTempFileCleanup\] scanned=" /tmp/ext-api-bootrun.log +``` +Expected: a line like `[OrphanTempFileCleanup] scanned=N deleted=K bytes_freed=B failed=0` (N is the number of `gzip-chunk-*.tmp` files in `java.io.tmpdir` at boot time — could be 0 in a fresh environment). + +- [ ] **Step 4: Pre-stage an orphan file, reboot, verify cleanup** + +```bash +# Create an orphan (mtime 2h ago) in the actual java.io.tmpdir +echo "fake-gzip-data" > /tmp/gzip-chunk-test-cleanup-$$-part-999999-.jsonl.gz.tmp +touch -d "2 hours ago" /tmp/gzip-chunk-test-cleanup-$$-part-999999-.jsonl.gz.tmp + +# Stop server, restart +kill $(cat /tmp/ext-api-bootrun.pid) 2>/dev/null +# poll for process exit (bash `wait` doesn't work for grandchild processes) +for i in $(seq 1 30); do + kill -0 $(cat /tmp/ext-api-bootrun.pid) 2>/dev/null || break + sleep 1 +done +./gradlew :module-external-api:bootRun > /tmp/ext-api-bootrun2.log 2>&1 & +echo $! 
> /tmp/ext-api-bootrun.pid +for i in $(seq 1 60); do + if grep -q "Started ExternalApiApplication" /tmp/ext-api-bootrun2.log 2>/dev/null; then break; fi + sleep 1 +done + +# Cleanup line should report deleted=1 bytes_freed=15 (14-char payload + echo's newline) if no other orphans exist +grep "\[OrphanTempFileCleanup\] scanned=" /tmp/ext-api-bootrun2.log +# File should be gone +ls /tmp/gzip-chunk-test-cleanup-*-part-999999-.jsonl.gz.tmp 2>&1 | grep -v "No such file" || echo "OK: orphan removed" +``` +Expected: `deleted=1` line present, `ls` reports file missing. + +- [ ] **Step 5: Stop server** + +```bash +kill $(cat /tmp/ext-api-bootrun.pid) 2>/dev/null +# poll for process exit +for i in $(seq 1 30); do + kill -0 $(cat /tmp/ext-api-bootrun.pid) 2>/dev/null || break + sleep 1 +done +rm -f /tmp/ext-api-bootrun.pid /tmp/ext-api-bootrun.log /tmp/ext-api-bootrun2.log +``` + +- [ ] **Step 6: No commit (verification only)** + +--- + +## Task 9: Open PR to develop + +- [ ] **Step 1: Push branch** + +```bash +cd /home/maple/probabilistic-valuation-engine +git push origin HEAD +``` + +- [ ] **Step 2: Open PR via gh** + +```bash +gh pr create --base develop --title "feat(ext-api): OrphanTempFileCleanupHook — boot sweep of orphan gzip-chunk-*.tmp (#1296)" --body "$(cat <<'EOF' +## Summary +- Adds `OrphanTempFileCleanupHook` (Spring `ApplicationRunner`) that deletes orphan `gzip-chunk-*.tmp` files older than 1h from `java.io.tmpdir` on boot. +- Cleanup runs on `loopExecutor` (existing virtual-thread pool in `LoopExecutorConfig`) bounded by a 30s `CompletableFuture` timeout — a hung `Files.list()` (NFS) cannot block boot indefinitely. +- Wrapped in `LogicExecutor.executeVoidJava` for consistent metric tags (`component=OrphanTempFileCleanup`, `operation=BootScan`). +- Fail-soft per-file delete with INFO summary log. +- Best-effort: a Files.list failure logs ERROR but does not abort boot (so a broken tmpfs cannot block pipeline replacement). 
+- 8 unit tests via `@TempDir` + injected `Clock` + `Executor`. 
+ +## Why +Disk leak observed 2026-06-16: 293 orphan files, 2.4 GB, mtime 1–7 days. `/tmp` is tmpfs sized at RAM/2 (~30 GB). At 8 MB × 30K chunks/day = 240 MB/day leak rate. This hook self-heals on every unclean reboot. + +## Design notes +- 1h cutoff: active writers' temp files are < 1h old. A 6× safety margin over the 10-min S3 transfer manager timeout. +- 30s deadline: bounds boot time even if /tmp hangs (NFS). On timeout, the worker thread is interrupted; partial cleanup is logged and remaining orphans retry next boot. +- `java.io.tmpdir` (not `/tmp`) keeps the hook portable and test-injectable. + +## Out of scope +- Other snapshot temp files (urgent-chunk, manifest tmp) +- MinIO / S3 partial-upload cleanup +- SIGTERM-side cleanup (a JVM shutdown hook is a possible follow-up) +- Scheduled periodic cleanup +- Multi-instance coordination + +## Verification +- [x] `./gradlew :module-external-api:test --tests OrphanTempFileCleanupHookTest` → 7/7 pass +- [x] `./gradlew :module-external-api:compileKotlin compileJava --continue` → success +- [x] `bootRun` → `[OrphanTempFileCleanup] scanned=N deleted=K bytes_freed=B failed=0` log present +- [x] Manual orphan seeding → reboot → confirms `deleted=K` line and file removal + +Closes #1296 + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` + +- [ ] **Step 3: Confirm PR URL printed, paste it to user** \ No newline at end of file diff --git a/docs/superpowers/specs/2026-06-19-ext-api-orphan-tmp-cleanup-design.md b/docs/superpowers/specs/2026-06-19-ext-api-orphan-tmp-cleanup-design.md new file mode 100644 index 000000000..77937150e --- /dev/null +++ b/docs/superpowers/specs/2026-06-19-ext-api-orphan-tmp-cleanup-design.md @@ -0,0 +1,320 @@ +# Design: ext-api orphan `gzip-chunk-*.tmp` cleanup hook (#1296) + +- Status: Accepted +- Date: 2026-06-19 +- Owner: backend +- Related issue: https://github.com/zbnerd/probabilistic-valuation-engine/issues/1296 +- Related code: 
`module-external-api/src/main/kotlin/maple/externalapi/snapshot/GzipJsonlChunkWriter.kt` + +--- + +## 1. Background / Problem + +### Background + +`GzipJsonlChunkWriter` streams snapshot records into a gzip JSONL file +under `java.io.tmpdir` (typically `/tmp` on Linux), uploads it via +`ObjectStorage.putFileAsync`, and deletes the temp file in +`uploadFuture.whenComplete { ... Files.deleteIfExists(tempFile) }`. + +On clean upload success, the temp file is removed. On JVM death +(SIGKILL, OOM, container restart) between upload completion and the +`whenComplete` callback running, the callback never executes and the +temp file remains on disk. + +### Problem + +Observed 2026-06-16 on a long-running pipeline: 293 orphan files +accumulated, 2.4 GB total, mtime 1–7 days old. `/tmp` is tmpfs sized at +`RAM / 2` on the production host (~30 GB). At 8 MB × 30 000 chunks/day, +the leak rate is ~240 MB/day, which will eventually fill `/tmp` and +crash the pipeline. + +There is no reclaim path today. + +### Goal + +A boot-time self-healing mechanism in `module-external-api` that +reclaims orphan `gzip-chunk-*.tmp` files from `java.io.tmpdir` without +interfering with active chunk writers. + +--- + +## 2. Decision + +We add an `ApplicationRunner` bean in `module-external-api` that runs +once at boot. It walks `java.io.tmpdir`, deletes files matching +`gzip-chunk-.*\.tmp` whose last-modified time is older than 1 hour, and +logs a summary. The actual scan runs on the existing `loopExecutor` +(virtual-thread pool defined in `LoopExecutorConfig`) bounded by a +30-second deadline so a hung `Files.list()` (e.g., NFS mount) cannot +block boot indefinitely. 
+ +```text +Spring boot + └─ ApplicationRunner.run(args) + └─ LogicExecutor.executeVoidJava(Runnable { runWithDeadline() }, ctx) + └─ runWithDeadline() { + future = CompletableFuture.runAsync(::cleanupOrphans, loopExecutor) + try { future.get(30s) } + catch TimeoutException { future.cancel(true); log.warn } + catch ExecutionException { log.error; boot proceeds } + } + └─ cleanupOrphans() { + cutoff = clock.instant() - 1h + Files.list(tmpdir) + .filter { name matches "gzip-chunk-.*\.tmp" } + .forEach { file -> + if (mtime < cutoff) try-delete else skip + } + log.info summary + } +``` + +### Why `loopExecutor` not `applicationTaskExecutor` + +Spring Boot's `TaskExecutionAutoConfiguration` only registers +`applicationTaskExecutor` when **no** `Executor` bean exists +(`@ConditionalOnMissingBean(Executor.class)`). `LoopExecutorConfig` +already exposes a virtual-thread `loopExecutor` (`AsyncTaskExecutor`), +which suppresses the auto-config. Reusing `loopExecutor` avoids +introducing a third executor pool for a once-per-boot operation and +keeps the boot path on virtual threads (consistent with PhaseLoopController, +Issue #1291). + +### Why 1h + +Active writers (`GzipJsonlChunkWriter`) create files with mtime `now` +and delete them within seconds of `putFileAsync` completing. Even a +slow MinIO upload is bounded by the S3 transfer manager timeout +(10 min default). 1h gives a 6× safety margin over the worst active +case and prevents the hook from racing live writers. + +### Why `java.io.tmpdir` not hardcoded `/tmp` + +`java.io.tmpdir` is the JDK-documented location used by +`Files.createTempFile()` (which `GzipJsonlChunkWriter` calls). It +defaults to `/tmp` on Linux, but respects `TMPDIR` overrides and +non-Linux platforms. Using it keeps the hook aligned with the writer +and makes the test path trivial (override via `-Djava.io.tmpdir` or +inject the path). + +--- + +## 3. 
Trade-offs + +### Sensitivity + +* Number of orphans per boot (today: ~300, worst-case observed: ~300; + upper bound: 30 000 / day × N days uncleaned) +* `/tmp` filesystem kind (tmpfs is small; xfs/ext4 is large but still + bounded) +* JVM boot-time deadline (no hard deadline today; tests expect <1s) +* Container restart rate (low today; increases if memory pressure rises) + +### Trade-off + +| Choice | Gain | Cost | +| ---------------------------- | --------------------------------- | ------------------------------------- | +| Sync (block boot) | simple, predictable, no async shutdown coordination | adds <1s to boot in worst case | +| 1h hardcoded cutoff | no YAML surface area, test easy via Clock injection | ops cannot tune per environment | +| Always-on, no opt-out flag | cannot be disabled by accident | local repro of "leave files" impossible without JVM-arg | +| Per-file fail-soft | one held-open file does not block cleanup of others | requires WARN summary log | +| `java.io.tmpdir` not `/tmp` | portable, test-injectable | one extra path-resolution call | +| CompletableFuture + 30s deadline | NFS-hung Files.list is interrupted; partial cleanup still possible | +30 LOC, executor injection, internal-function for testability | +| Best-effort cleanup on log failure (instead of boot abort) | pipeline never blocked by hook | broken tmpfs leaves files un-reclaimed until next boot | + +### Risk + +* A pathological case where an active writer holds open a file > 1h + would be deleted by the hook. The current writer's `close()` fires + the upload within seconds and `whenComplete` deletes within + milliseconds, so this window is essentially zero. Risk accepted. +* `Files.list` failure on a broken tmpfs: now logged + boot proceeds. + Trade-off: simpler recovery vs. stricter failure mode. Accepted + because a broken tmpfs also breaks active writers, so the pipeline + is already in a degraded state. +* Deadline is best-effort: `Files.list` itself is not interruptible. 
+ JVM-level NFS I/O timeout (typically 60s+) is the floor for a fully + hung mount. Deadline bounds the *iteration* loop, not the open syscall. + +### Non-Risk + +* Active chunk write paths: mtime < 1h, never matched by the cutoff. +* Success-path `whenComplete` cleanup: untouched, still primary path. +* MinIO partial uploads: untouched (MinIO owns its scratch space). +* Other snapshot temp files (urgent-chunk, manifest tmp): out of + scope; their owners handle their own cleanup. + +--- + +## 4. Result / Evidence + +### Metrics + +| Metric | Source | +| -------------------------------------- | ------------------------------------- | +| `logic_executor_total{component=...}` | LogicExecutor metric tag | + +Component tag value: `OrphanTempFileCleanup`. Operation tag value: +`BootScan`. + +### Observed Result (post-implementation) + +To be filled after deploy + first unclean reboot. + +--- + +## 5. Summary + +> Boot-time sync scan of `java.io.tmpdir` deletes `gzip-chunk-*.tmp` +> files older than 1h; active writers are protected by mtime; per-file +> failures are logged and skipped; ~50 LOC + 5 unit tests. 
+
+---
+
+## Appendix A — Implementation sketch
+
+```kotlin
+@Component
+class OrphanTempFileCleanupHook(
+    private val executor: LogicExecutor,
+    @Qualifier("loopExecutor")
+    private val asyncExecutor: Executor,
+    private val clock: Clock = Clock.systemUTC(),
+    private val scanDir: Path = Paths.get(System.getProperty("java.io.tmpdir")),
+    private val timeoutSeconds: Long = 30,
+) : ApplicationRunner {
+
+    override fun run(args: ApplicationArguments) {
+        executor.executeVoidJava(
+            Runnable { runWithDeadline() },
+            TaskContext.of("OrphanTempFileCleanup", "BootScan"),
+        )
+    }
+
+    internal fun runWithDeadline() {
+        val future = CompletableFuture.runAsync(Runnable { cleanupOrphans() }, asyncExecutor)
+        try {
+            future.get(timeoutSeconds, TimeUnit.SECONDS)
+        } catch (ex: TimeoutException) {
+            log.warn(
+                "[OrphanTempFileCleanup] cleanup exceeded {}s; cancelling, will retry next boot",
+                timeoutSeconds,
+            )
+            future.cancel(true)
+        } catch (ex: ExecutionException) {
+            val cause = ex.cause ?: ex
+            log.error("[OrphanTempFileCleanup] cleanup failed: {}", cause.message, cause)
+        }
+    }
+
+    private fun cleanupOrphans() {
+        val cutoff = Instant.now(clock).minus(CUTOFF)
+        var scanned = 0L
+        var deleted = 0L
+        var bytesFreed = 0L
+        var failed = 0L
+
+        Files.list(scanDir).use { stream ->
+            stream
+                .filter { ORPHAN_PATTERN.matches(it.fileName.toString()) }
+                .forEach { file ->
+                    scanned++
+                    val mtime = try {
+                        Files.getLastModifiedTime(file).toInstant()
+                    } catch (ex: IOException) {
+                        log.warn("[OrphanTempFileCleanup] read mtime failed for {}: {}", file, ex.message)
+                        failed++
+                        return@forEach
+                    }
+                    if (mtime.isBefore(cutoff)) {
+                        try {
+                            bytesFreed += Files.size(file)
+                            Files.delete(file)
+                            deleted++
+                        } catch (ex: IOException) {
+                            log.warn("[OrphanTempFileCleanup] delete failed for {}: {}", file, ex.message)
+                            failed++
+                        }
+                    }
+                }
+        }
+
+        log.info(
+            "[OrphanTempFileCleanup] scanned={} deleted={} bytes_freed={} failed={}",
+            scanned, deleted, bytesFreed, failed,
+        )
+        if (failed > 0) {
+            log.warn("[OrphanTempFileCleanup] {} files failed to clean; will retry next boot", failed)
+        }
+    }
+
+    private companion object {
+        private val log = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java)
+        private val ORPHAN_PATTERN = Regex("gzip-chunk-.*\\.tmp")
+        private val CUTOFF: Duration = Duration.ofHours(1)
+    }
+}
+```
+
+## Appendix B — Test sketch
+
+```kotlin
+class OrphanTempFileCleanupHookTest {
+    @TempDir lateinit var tmp: Path
+
+    // Stub LogicExecutor that invokes the passed task synchronously.
+    // Contract: the stubbed call invokes the task and returns its result. NOTE: Appendix A calls executeVoidJava(task, ctx), so that is the method to stub.
+    private val executor: LogicExecutor = mock {
+        on { execute(any(), any()) } doAnswer { invocation ->
+            (invocation.arguments[0] as Runnable).run(); null
+        }
+    }
+
+    private fun makeHook(clock: Clock = Clock.fixed(NOW, ZoneOffset.UTC), scanDir: Path = tmp) =
+        OrphanTempFileCleanupHook(executor, clock, scanDir)
+
+    @Test fun `deletes files older than 1 hour`() {
+        val file = createOrphan("gzip-chunk-uuid1-part-000001-.jsonl.gz.tmp", ageHours = 2)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isFalse
+    }
+
+    @Test fun `preserves files newer than 1 hour (active writer)`() {
+        val file = createOrphan("gzip-chunk-uuid2-part-000002-.jsonl.gz.tmp", ageHours = 0)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isTrue
+    }
+
+    @Test fun `ignores non-matching filenames`() {
+        val file = createOther("unrelated.txt", ageHours = 24)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isTrue
+    }
+
+    @Test fun `continues after individual delete failure`() {
+        val good = createOrphan("gzip-chunk-uuid3-part-000003-.jsonl.gz.tmp", ageHours = 2)
+        val held = createOrphan("gzip-chunk-uuid4-part-000004-.jsonl.gz.tmp", ageHours = 2)
+        held.toFile().setReadable(false) // NOTE: does NOT block Files.delete on POSIX (the parent directory's permissions decide); see Test notes
+        makeHook().run(mock())
+        assertThat(Files.exists(good)).isFalse // sibling cleaned up
+        assertThat(Files.exists(held)).isTrue // held one skipped
+    }
+
+    @Test fun `logs count and bytes freed at INFO`() {
+        createOrphan("gzip-chunk-uuid5-part-000005-.jsonl.gz.tmp", size = 1024, ageHours = 2)
+        makeHook().run(mock())
+        verify(log).info(
+            argThat { msg: String -> msg.contains("scanned=1") && msg.contains("deleted=1") && msg.contains("bytes_freed=1024") },
+            any(), any(), any(), any(),
+        )
+    }
+}
+```
+
+Test notes:
+- `@TempDir` provides the injected `scanDir`. No `/tmp` access in tests.
+- `LogicExecutor` is mocked because the hook's actual executor logs metrics we don't want in unit tests. The stub preserves the contract: the stubbed call runs the task synchronously. Appendix A invokes `executeVoidJava(task, ctx)` and takes an `asyncExecutor` as its second constructor parameter, so the stub must target `executeVoidJava` and `makeHook` must pass an `asyncExecutor`.
+- `setReadable(false)` is **not** a reliable way to provoke an `IOException` on `Files.delete`: on POSIX file systems deletion is governed by the parent directory's permissions, not by the file's own mode bits. A portable alternative that still avoids monkey-patching the filesystem is to make the "held" entry a non-empty directory whose name matches the pattern; `Files.delete` then throws `DirectoryNotEmptyException`.
+- No integration test (issue #207 policy). Runtime verification: `./gradlew :module-external-api:bootRun` + grep log for the summary line.
\ No newline at end of file
diff --git a/module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt b/module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt
new file mode 100644
index 000000000..177103024
--- /dev/null
+++ b/module-external-api/src/main/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHook.kt
@@ -0,0 +1,113 @@
+package maple.externalapi.snapshot
+
+import maple.expectation.infrastructure.executor.LogicExecutor
+import maple.expectation.infrastructure.executor.TaskContext
+import org.slf4j.LoggerFactory
+import org.springframework.beans.factory.annotation.Qualifier
+import org.springframework.boot.ApplicationArguments
+import org.springframework.boot.ApplicationRunner
+import org.springframework.stereotype.Component
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.Paths
+import java.time.Clock
+import java.time.Duration
+import java.time.Instant
+import java.util.concurrent.CompletableFuture
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Executor
+import 
java.util.concurrent.TimeUnit
+import java.util.concurrent.TimeoutException
+
+/**
+ * Boot-time hook that deletes orphaned `gzip-chunk-*.tmp` files from [scanDir].
+ *
+ * An entry is removed when its file name matches `gzip-chunk-.*\.tmp` and its last-modified time
+ * is strictly before "now minus one hour" according to [clock]. An `IOException` on an individual
+ * file is logged and counted as a failure; the scan then continues with the next entry.
+ */
+@Component
+class OrphanTempFileCleanupHook(
+    private val executor: LogicExecutor,
+    @Qualifier("loopExecutor")
+    private val asyncExecutor: Executor,
+    private val clock: Clock = Clock.systemUTC(),
+    private val scanDir: Path = Paths.get(System.getProperty("java.io.tmpdir")),
+    private val timeoutSeconds: Long = 30,
+) : ApplicationRunner {
+
+    /** Runs the deadline-bounded cleanup through [executor] under the `OrphanTempFileCleanup`/`BootScan` task context. */
+    override fun run(args: ApplicationArguments) {
+        executor.executeVoidJava(
+            Runnable { runWithDeadline() },
+            TaskContext.of("OrphanTempFileCleanup", "BootScan"),
+        )
+    }
+
+    /**
+     * Runs [cleanupOrphans] on [asyncExecutor] and waits at most [timeoutSeconds] for it.
+     *
+     * On timeout (or if the waiting thread is interrupted) the future is cancelled and a stop flag
+     * is raised. `CompletableFuture.cancel(true)` does not interrupt the worker thread, so the flag
+     * is what makes an already-running scan skip its remaining entries; whatever was cleaned up to
+     * that point is still reported by the summary log and the rest is retried on the next boot.
+     *
+     * Every failure (submit rejection, exception from the scan such as an IOException from
+     * Files.list) is logged and swallowed: self-healing is best-effort and must not abort boot.
+     */
+    internal fun runWithDeadline() {
+        // Shared with the worker so it can stop touching files once we give up waiting.
+        val stopRequested = java.util.concurrent.atomic.AtomicBoolean(false)
+        val future = try {
+            CompletableFuture.runAsync(Runnable { cleanupOrphans(stopRequested) }, asyncExecutor)
+        } catch (ex: Exception) {
+            log.error("[OrphanTempFileCleanup] cleanup submit failed: {}", ex.message, ex)
+            return
+        }
+        try {
+            future.get(timeoutSeconds, TimeUnit.SECONDS)
+        } catch (ex: TimeoutException) {
+            stopRequested.set(true)
+            log.warn(
+                "[OrphanTempFileCleanup] cleanup exceeded {}s; cancelling, will retry next boot",
+                timeoutSeconds,
+            )
+            future.cancel(true)
+        } catch (ex: InterruptedException) {
+            stopRequested.set(true)
+            future.cancel(true)
+            // Restore the flag so the caller can still observe the interruption.
+            Thread.currentThread().interrupt()
+            log.warn("[OrphanTempFileCleanup] cleanup wait interrupted; cancelling, will retry next boot")
+        } catch (ex: Exception) {
+            val cause = (ex as? ExecutionException)?.cause ?: ex
+            log.error("[OrphanTempFileCleanup] cleanup failed: {}", cause.message, cause)
+        }
+    }
+
+    /**
+     * Scans [scanDir] (non-recursively) and deletes matching entries older than the cutoff.
+     *
+     * @param stopRequested once `true`, remaining entries are skipped without being counted.
+     */
+    private fun cleanupOrphans(stopRequested: java.util.concurrent.atomic.AtomicBoolean) {
+        val cutoff = Instant.now(clock).minus(CUTOFF)
+        var scanned = 0L
+        var deleted = 0L
+        var bytesFreed = 0L
+        var failed = 0L
+
+        Files.list(scanDir).use { stream ->
+            stream
+                .filter { ORPHAN_PATTERN.matches(it.fileName.toString()) }
+                .forEach { file ->
+                    if (stopRequested.get()) return@forEach
+                    scanned++
+                    val mtime = try {
+                        Files.getLastModifiedTime(file).toInstant()
+                    } catch (ex: java.io.IOException) {
+                        log.warn("[OrphanTempFileCleanup] read mtime failed for {}: {}", file, ex.message)
+                        failed++
+                        return@forEach
+                    }
+                    if (mtime.isBefore(cutoff)) {
+                        try {
+                            val size = Files.size(file)
+                            Files.delete(file)
+                            // Only count the entry once the delete has actually succeeded, so a
+                            // failed delete never inflates deleted/bytes_freed.
+                            deleted++
+                            bytesFreed += size
+                        } catch (ex: java.io.IOException) {
+                            log.warn("[OrphanTempFileCleanup] delete failed for {}: {}", file, ex.message)
+                            failed++
+                        }
+                    }
+                }
+        }
+
+        log.info(
+            "[OrphanTempFileCleanup] scanned={} deleted={} bytes_freed={} failed={}",
+            scanned, deleted, bytesFreed, failed,
+        )
+        if (failed > 0) {
+            log.warn("[OrphanTempFileCleanup] {} files failed to clean; will retry next boot", failed)
+        }
+    }
+
+    private companion object {
+        private val log = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java)
+        private val ORPHAN_PATTERN = Regex("gzip-chunk-.*\\.tmp")
+        private val CUTOFF: Duration = Duration.ofHours(1)
+    }
+}
\ No newline at end of file
diff --git a/module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt b/module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt
new file mode 100644
index 000000000..77ab314c3
--- /dev/null
+++ b/module-external-api/src/test/kotlin/maple/externalapi/snapshot/OrphanTempFileCleanupHookTest.kt
@@ -0,0 +1,192 @@
+package maple.externalapi.snapshot
+
+import maple.expectation.infrastructure.executor.LogicExecutor
+import maple.expectation.infrastructure.executor.TaskContext
+import 
org.assertj.core.api.Assertions.assertThat
+import org.junit.jupiter.api.AfterEach
+import org.junit.jupiter.api.BeforeEach
+import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.io.TempDir
+import org.mockito.kotlin.any
+import org.mockito.kotlin.mock
+import org.slf4j.LoggerFactory
+import ch.qos.logback.classic.Level
+import ch.qos.logback.classic.Logger
+import ch.qos.logback.classic.spi.ILoggingEvent
+import ch.qos.logback.core.read.ListAppender
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.attribute.FileTime
+import java.time.Clock
+import java.time.Instant
+import java.time.ZoneOffset
+import java.util.concurrent.Executor
+import kotlin.io.path.setLastModifiedTime
+
+class OrphanTempFileCleanupHookTest {
+
+    @TempDir
+    lateinit var tmp: Path
+
+    private val fixedNow: Instant = Instant.parse("2026-06-19T00:00:00Z")
+    private val clock: Clock = Clock.fixed(fixedNow, ZoneOffset.UTC)
+
+    // Stub LogicExecutor that invokes the Runnable passed to executeVoidJava synchronously.
+    // We use executeVoidJava (Runnable-typed) instead of executeVoid (ThrowingRunnable) so the
+    // mock signature and the production signature both round-trip via the same Runnable type.
+    private val executor: LogicExecutor = mock<LogicExecutor>().also { m ->
+        org.mockito.Mockito.`when`(m.executeVoidJava(any(), any()))
+            .thenAnswer { invocation ->
+                (invocation.arguments[0] as Runnable).run()
+                null
+            }
+    }
+
+    // Default async executor: runs submitted Runnables synchronously on the caller thread.
+    // CompletableFuture.runAsync uses this to start cleanup; the future then completes
+    // synchronously. runWithDeadline's future.get(timeout) returns immediately.
+    private val syncAsyncExecutor: Executor = Executor { it.run() }
+
+    private lateinit var logAppender: ListAppender<ILoggingEvent>
+    private var originalLevel: Level? = null
+
+    @BeforeEach
+    fun attachAppender() {
+        val logger = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) as Logger
+        originalLevel = logger.level
+        logger.level = Level.INFO
+        logAppender = ListAppender<ILoggingEvent>().apply { start() }
+        logger.addAppender(logAppender)
+    }
+
+    @AfterEach
+    fun detachAppender() {
+        val logger = LoggerFactory.getLogger(OrphanTempFileCleanupHook::class.java) as Logger
+        logger.detachAppender(logAppender)
+        logger.level = originalLevel
+    }
+
+    private fun makeHook(
+        clock: Clock = this.clock,
+        scanDir: Path = tmp,
+        executor: LogicExecutor = this.executor,
+        asyncExecutor: Executor = syncAsyncExecutor,
+        timeoutSeconds: Long = 30,
+    ): OrphanTempFileCleanupHook =
+        OrphanTempFileCleanupHook(executor, asyncExecutor, clock, scanDir, timeoutSeconds)
+
+    private fun createOrphan(
+        name: String,
+        size: Int = 10,
+        ageHours: Long = 0,
+    ): Path {
+        val file = tmp.resolve(name)
+        Files.write(file, ByteArray(size))
+        file.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(ageHours * 3600)))
+        return file
+    }
+
+    @Test
+    fun `deletes files older than 1 hour`() {
+        val file = createOrphan("gzip-chunk-uuid1-part-000001-.jsonl.gz.tmp", ageHours = 2)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isFalse
+    }
+
+    @Test
+    fun `preserves files newer than 1 hour (active writer)`() {
+        val file = createOrphan("gzip-chunk-uuid2-part-000002-.jsonl.gz.tmp", ageHours = 0)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isTrue
+    }
+
+    @Test
+    fun `preserves file exactly 1 hour old (cutoff boundary)`() {
+        val file = createOrphan("gzip-chunk-uuid3-part-000003-.jsonl.gz.tmp", ageHours = 1)
+        makeHook().run(mock())
+        assertThat(Files.exists(file)).isTrue
+    }
+
+    @Test
+    fun `ignores non-matching filenames`() {
+        val unrelated = createOrphan("urgent-chunk-uuid-part-000001-.jsonl.gz.tmp", ageHours = 24)
+        val plainTxt = tmp.resolve("notes.txt")
+        Files.write(plainTxt, ByteArray(10))
+        plainTxt.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(24 * 3600)))
+        val olderPrefix = tmp.resolve("gzip-archive.jsonl.gz") // not tmp suffix
+        Files.write(olderPrefix, ByteArray(10))
+        olderPrefix.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(24 * 3600)))
+
+        makeHook().run(mock())
+
+        assertThat(Files.exists(unrelated)).isTrue
+        assertThat(Files.exists(plainTxt)).isTrue
+        assertThat(Files.exists(olderPrefix)).isTrue
+    }
+
+    @Test
+    fun `continues after individual delete failure`() {
+        val good = createOrphan("gzip-chunk-uuid4-part-000004-.jsonl.gz.tmp", ageHours = 2)
+        // A non-empty directory with a matching name: Files.delete throws
+        // DirectoryNotEmptyException (an IOException) on every platform, unlike chmod tricks.
+        val held = Files.createDirectory(tmp.resolve("gzip-chunk-uuid5-part-000005-.jsonl.gz.tmp"))
+        Files.write(held.resolve("child"), ByteArray(1))
+        held.setLastModifiedTime(FileTime.from(fixedNow.minusSeconds(2 * 3600))) // after child write
+
+        makeHook().run(mock())
+
+        assertThat(Files.exists(good)).isFalse // sibling cleaned up despite held failing
+        assertThat(Files.exists(held)).isTrue // failed entry is left for the next boot
+        val summary = logAppender.list.first { it.formattedMessage.contains("scanned=") }
+        assertThat(summary.formattedMessage).contains("deleted=1").contains("failed=1")
+    }
+
+    @Test
+    fun `logs scanned deleted bytes_freed at INFO`() {
+        createOrphan("gzip-chunk-uuid6-part-000006-.jsonl.gz.tmp", size = 1024, ageHours = 2)
+        createOrphan("gzip-chunk-uuid7-part-000007-.jsonl.gz.tmp", size = 512, ageHours = 0) // skipped (active)
+
+        makeHook().run(mock())
+
+        val summary = logAppender.list
+            .firstOrNull { it.formattedMessage.startsWith("[OrphanTempFileCleanup] scanned=") }
+        assertThat(summary).isNotNull
+        assertThat(summary!!.level).isEqualTo(Level.INFO)
+        val msg = summary.formattedMessage
+        assertThat(msg).contains("scanned=2")
+        assertThat(msg).contains("deleted=1")
+        assertThat(msg).contains("bytes_freed=1024")
+        assertThat(msg).contains("failed=0")
+    }
+
+    @Test
+    fun `runWithDeadline logs WARN and cancels when timeout fires`() {
+        // Executor that never invokes the task — future stays pending.
+        // timeoutSeconds = 0 → future.get(0, SECONDS) throws TimeoutException immediately.
+        // runWithDeadline must catch it, log WARN, and cancel the future.
+        val neverRunsExecutor = Executor { /* drop the command */ }
+        val hook = makeHook(asyncExecutor = neverRunsExecutor, timeoutSeconds = 0)
+
+        hook.runWithDeadline()
+
+        val warn = logAppender.list
+            .firstOrNull { it.formattedMessage.contains("cleanup exceeded 0s") }
+        assertThat(warn).isNotNull
+        assertThat(warn!!.level).isEqualTo(Level.WARN)
+    }
+
+    @Test
+    fun `runWithDeadline logs ERROR when submit fails`() {
+        // Executor that throws on submit — runAsync never creates the future; submit-fail
+        // path runs and logs the consolidated ERROR message.
+        val throwingExecutor = Executor { throw RuntimeException("simulated submit failure") }
+        val hook = makeHook(asyncExecutor = throwingExecutor)
+
+        hook.runWithDeadline()
+
+        val err = logAppender.list
+            .firstOrNull { it.formattedMessage.contains("cleanup submit failed") }
+        assertThat(err).isNotNull
+        assertThat(err!!.level).isEqualTo(Level.ERROR)
+    }
+}
\ No newline at end of file