From 25a0257ce33a1bc033c5c4c586612901ca4ad1a1 Mon Sep 17 00:00:00 2001
From: Shannon Holland <github@shannonholland.com>
Date: Thu, 7 May 2026 13:39:21 -0700
Subject: [PATCH 1/3] remove Layer 3b CPU fallback (0% recovery rate,
 1,038/1,038 fail)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Production evidence (2026-05-07 SafariUnfucker run: 5,570 successes,
1,038 cpu_fallback_failed, 0 recovered_iosurface_exhaustion) shows the
CPU fallback never recovers any inference. The proactive reload (Layer 2)
is the only mitigation that works.

- Remove `cpuPredictorFactory` stored property and all init wiring
- Remove Layer 3b branch from `predictWindow`; the Layer 3a failure catch now
  logs via `logNativeException` and rethrows `nativeError` instead of falling
  through to the CPU fallback
- Delete `logRecoveredIOSurface`, `logCPUFallbackFailed`, `extractCPUErrorFields`
- Retire `cpu_fallback_failed` JSONL category
- Delete three CPU fallback stress tests (Scenarios A/B/C)
- Add `testANERetrySucceedsAfterReload` and `testANERetryFailsLogsErrorRow`
  for Layer 3a stub-based coverage
- ADR 021 second addendum: production evidence, removal rationale, simplified
  two-state JSONL category scheme, ADR 010 §(g) citation now moot
- docs/Plan.md: check off #93

Closes #93

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../SwitchcraftCoreML/T5CoreMLEmbedder.swift  | 117 ++----------
 .../T5CoreMLEmbedderStressTests.swift         | 173 +++++-------------
 ...ne-iosurface-pool-exhaustion-mitigation.md |  60 ++++++
 docs/Plan.md                                  |   1 +
 4 files changed, 114 insertions(+), 237 deletions(-)

diff --git a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
index 351cdeb..87b9534 100644
--- a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
+++ b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
@@ -47,12 +47,9 @@ private let coreMLLogger = Logger(subsystem: "com.switchcraft.coreml", category:
 ///    ANE resources before the pool is exhausted. Each reload takes 1–3 s on
 ///    ANE-capable hardware (CPU recompile is faster); tune `reloadInterval`
 ///    to balance stall frequency against pool pressure for your workload.
-/// 3. On IOSurface failure: force-reload + ANE retry first; if that also fails,
-///    retry the window on `.cpuOnly`. Logged in `failureLogURL` (when set)
-///    with three distinct `category` values:
-///    - `"warning"` (`"recovered_iosurface_exhaustion"`): CPU fallback succeeded.
-///    - `"cpu_fallback_failed"`: CPU fallback was attempted but also failed.
-///    - `"error"`: IOSurface was not the cause (no recovery attempted).
+/// 3. On IOSurface failure: force-reload + ANE retry. If the ANE retry also
+///    fails, the original error is logged in `failureLogURL` (when set) with
+///    `category: "error"` and rethrown.
 ///
 /// See ADR 021 for the full rationale.
 ///
@@ -106,9 +103,6 @@ public actor T5CoreMLEmbedder: Embedder {
     /// Recreates the main predictor on demand; used by proactive reload to
     /// flush accumulated ANE IOSurface resources.
     private let predictorFactory: @Sendable () throws -> any MLPredictor
-    /// Produces a `.cpuOnly` predictor for reactive IOSurface fallback.
-    /// `nil` in test inits that use a static stub (no real `compiledURL` available).
-    private let cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)?
     private let failureLogURL: URL?
     private var callCount: Int = 0
     /// Number of `encode` calls between proactive model reloads.
@@ -206,14 +200,8 @@ public actor T5CoreMLEmbedder: Embedder {
             config.computeUnits = capturedComputeUnits
             return try MLModel(contentsOf: capturedCompiledURL, configuration: config)
         }
-        let cpuFactory: @Sendable () throws -> any MLPredictor = {
-            let config = MLModelConfiguration()
-            config.computeUnits = .cpuOnly
-            return try MLModel(contentsOf: capturedCompiledURL, configuration: config)
-        }
 
         self.predictorFactory = factory
-        self.cpuPredictorFactory = cpuFactory
         self.predictor = try MLModel(contentsOf: compiledURL, configuration: configuration)
         self.tokenizer = tokenizer
         self.dims = dims
@@ -294,7 +282,6 @@ public actor T5CoreMLEmbedder: Embedder {
                      "maxInputTokens must be >= windowSize")
         let capturedPredictor = predictor
         self.predictorFactory = { capturedPredictor }
-        self.cpuPredictorFactory = nil
         self.predictor = predictor
         self.tokenizer = tokenizer
         self.dims = dims
@@ -311,13 +298,12 @@ public actor T5CoreMLEmbedder: Embedder {
     /// Test-only init: inject a factory for predictor lifecycle testing.
     ///
     /// Use this variant when the test must verify model reload behaviour or
-    /// the IOSurface CPU-fallback path. The factory is called once during init
-    /// and again on each proactive reload.
+    /// the Layer 3a reactive reload + ANE retry path. The factory is called
+    /// once during init and again on each proactive or reactive reload.
     ///
     /// `internal` — access from test targets via `@testable import SwitchcraftCoreML`.
     internal init(
         predictorFactory: @escaping @Sendable () throws -> any MLPredictor,
-        cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)? = nil,
         tokenizer: Tokenizer,
         dims: Int = 128,
         windowSize: Int = 512,
@@ -338,7 +324,6 @@ public actor T5CoreMLEmbedder: Embedder {
         precondition(resolvedMaxInputTokens >= windowSize,
                      "maxInputTokens must be >= windowSize")
         self.predictorFactory = predictorFactory
-        self.cpuPredictorFactory = cpuPredictorFactory
         self.predictor = try predictorFactory()
         self.tokenizer = tokenizer
         self.dims = dims
@@ -359,8 +344,8 @@ public actor T5CoreMLEmbedder: Embedder {
     ///
     /// ObjC exceptions from CoreML are converted to `CoreMLNativeError`
     /// and thrown rather than crashing the host process. IOSurface allocation
-    /// failures are silently retried on CPU (see class doc-comment); callers
-    /// only receive an error if the retry also fails.
+    /// failures trigger a reactive model reload + ANE retry (see class doc-comment);
+    /// callers only receive an error if the ANE retry also fails.
     ///
     /// - Throws: `EmbedderError.inputTooLarge(actual:max:)` when the token
     ///   count exceeds `maxInputTokens` and `overflowPolicy` is `.reject`;
@@ -493,8 +478,7 @@ public actor T5CoreMLEmbedder: Embedder {
 
     // MARK: - Private helpers
 
-    /// Run one window prediction with autoreleasepool drainage, reactive reload,
-    /// ANE retry, and IOSurface CPU fallback.
+    /// Run one window prediction with autoreleasepool drainage and reactive reload + ANE retry.
     private func predictWindow(
         provider: MLDictionaryFeatureProvider,
         inputLength: Int,
@@ -511,7 +495,7 @@ public actor T5CoreMLEmbedder: Embedder {
             )
             return result
         } catch let nativeError as CoreMLNativeError {
-            guard isIOSurfaceExhaustion(nativeError), let cpuFactory = cpuPredictorFactory else {
+            guard isIOSurfaceExhaustion(nativeError) else {
                 if let logURL = failureLogURL {
                     logNativeException(nativeError, inputLength: inputLength, to: logURL)
                 }
@@ -519,7 +503,7 @@ public actor T5CoreMLEmbedder: Embedder {
             }
 
             // Layer 3a — Reactive reload + ANE retry: force-reload the predictor
-            // and retry on ANE before falling back to CPU.
+            // and retry on ANE.
             do {
                 self.predictor = try predictorFactory()
                 let retryResult = try autoreleasepool {
@@ -531,27 +515,8 @@ public actor T5CoreMLEmbedder: Embedder {
                 )
                 return retryResult
             } catch {
-                coreMLLogger.warning(
-                    "T5CoreMLEmbedder: reactive reload/ANE retry failed, falling back to CPU: \(error, privacy: .public)"
-                )
-            }
-
-            // Layer 3b — CPU fallback: retry this window on .cpuOnly.
-            do {
-                let cpuPredictor = try cpuFactory()
-                let result = try autoreleasepool {
-                    try catchingNSException { try cpuPredictor.predict(input: provider) }
-                }
                 if let logURL = failureLogURL {
-                    logRecoveredIOSurface(nativeError, inputLength: inputLength, to: logURL)
-                }
-                return result
-            } catch let cpuError {
-                // CPU fallback also failed — log with distinct category so the
-                // two states (no fallback vs. fallback-attempted-and-failed) are
-                // distinguishable in the JSONL log.
-                if let logURL = failureLogURL {
-                    logCPUFallbackFailed(nativeError, cpuError: cpuError, inputLength: inputLength, to: logURL)
+                    logNativeException(nativeError, inputLength: inputLength, to: logURL)
                 }
                 throw nativeError
             }
@@ -589,66 +554,6 @@ public actor T5CoreMLEmbedder: Embedder {
         )
     }
 
-    private func logRecoveredIOSurface(
-        _ error: CoreMLNativeError,
-        inputLength: Int,
-        to url: URL
-    ) {
-        guard case .nativeException(_, let reason, let callStack) = error else { return }
-
-        coreMLLogger.warning(
-            "🟡 [COREML-RECOVERY] recovered IOSurface exhaustion on CPU fallback input_len=\(inputLength, privacy: .public)"
-        )
-
-        appendJSONLRow(
-            name: "recovered_iosurface_exhaustion",
-            reason: reason,
-            inputLength: inputLength,
-            callStack: Array(callStack.prefix(5)),
-            category: "warning",
-            to: url
-        )
-    }
-
-    private func logCPUFallbackFailed(
-        _ aneError: CoreMLNativeError,
-        cpuError: Error,
-        inputLength: Int,
-        to url: URL
-    ) {
-        guard case .nativeException(let aneName, let aneReason, let aneCallStack) = aneError else { return }
-        let (cpuName, cpuReason, cpuCallStack) = extractCPUErrorFields(cpuError)
-
-        coreMLLogger.error(
-            "🔴 [COREML-CPU-FAILED] ane=\(aneName, privacy: .public) cpu=\(cpuName, privacy: .public) cpu_reason=\(cpuReason, privacy: .public) input_len=\(inputLength, privacy: .public)"
-        )
-        for frame in cpuCallStack.prefix(5) {
-            coreMLLogger.error("  cpu: \(frame, privacy: .public)")
-        }
-
-        appendJSONLRow(
-            name: aneName,
-            reason: aneReason,
-            inputLength: inputLength,
-            callStack: Array(aneCallStack.prefix(5)),
-            category: "cpu_fallback_failed",
-            cpuErrorName: cpuName,
-            cpuErrorReason: cpuReason,
-            cpuCallStack: Array(cpuCallStack.prefix(5)),
-            to: url
-        )
-    }
-
-    /// Extract name, reason, and call stack from a CPU-side error regardless of
-    /// its concrete type (`CoreMLNativeError` from predict or `NSError` from model load).
-    private func extractCPUErrorFields(_ error: Error) -> (name: String, reason: String, callStack: [String]) {
-        if case .nativeException(let name, let reason, let frames) = error as? CoreMLNativeError {
-            return (name, reason, frames)
-        }
-        let nsErr = error as NSError
-        return (nsErr.domain, nsErr.localizedDescription, [])
-    }
-
     private func appendJSONLRow(
         name: String,
         reason: String,
diff --git a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
index d7a6613..a70ec0a 100644
--- a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
+++ b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
@@ -9,7 +9,7 @@ import CoreML
 
 /// Stress and lifecycle tests for `T5CoreMLEmbedder`'s ANE IOSurface pool
 /// exhaustion mitigation: autoreleasepool discipline, proactive model reload,
-/// and reactive CPU fallback.
+/// and reactive reload + ANE retry (Layer 3a).
 ///
 /// No CoreML model asset is required — `CountingStubPredictor` is injected
 /// via the factory-based internal init.
@@ -112,93 +112,31 @@ struct T5CoreMLEmbedderStressTests {
         #expect(counter.count >= 4, "expected ≥4 factory calls (1 init + 3 reloads), got \(counter.count)")
     }
 
-    // MARK: - IOSurface fallback test
+    // MARK: - Layer 3a tests
 
-    /// When every prediction raises an IOSurface-like exception, the CPU fallback
-    /// must succeed and log a `"recovered_iosurface_exhaustion"` JSONL row with
-    /// `"category": "warning"` for each encode call.
-    @Test("IOSurface exhaustion triggers CPU fallback and logs recovery row")
-    func testIOSurfaceFallbackLogsRecovery() async throws {
+    /// When the initial predictor raises an IOSurface-like exception, Layer 3a
+    /// must force-reload the predictor via the factory and retry on ANE. When
+    /// the reloaded predictor succeeds, encode must not throw and no JSONL row
+    /// must be written.
+    @Test("Layer 3a ANE retry succeeds after reactive reload, no JSONL row written")
+    func testANERetrySucceedsAfterReload() async throws {
         let tokenizer = try Self.makeTokenizer()
 
         let logURL = FileManager.default.temporaryDirectory
-            .appendingPathComponent("switchcraft-stress-recovery-\(UUID().uuidString).jsonl")
+            .appendingPathComponent("switchcraft-ane-retry-success-\(UUID().uuidString).jsonl")
         defer { try? FileManager.default.removeItem(at: logURL) }
 
-        // Main predictor: always raises IOSurface-like exception.
-        // CPU fallback predictor: always succeeds.
         let dims = 16
+        let counter = FactoryCallCounter()
+        // Factory call 1 (at init): returns a predictor that fails every predict.
+        // Factory call 2 (Layer 3a reactive reload): returns a succeeding predictor.
         let embedder = try T5CoreMLEmbedder(
-            predictorFactory: { CountingStubPredictor(failInterval: 1, dims: dims) },
-            cpuPredictorFactory: { CountingStubPredictor(failInterval: nil, dims: dims) },
-            tokenizer: tokenizer,
-            dims: dims,
-            windowSize: 64,
-            stride: 32,
-            minNorm: 1.0,
-            failureLogURL: logURL,
-            reloadInterval: 500
-        )
-
-        // "test input" (10 chars, 1 window) → each encode = 1 IOSurface hit → 1 recovery row.
-        let inputText = "test input"
-        let encodeCount = 10
-
-        for _ in 0..<encodeCount {
-            // Must NOT throw — the CPU fallback should recover.
-            let result = try await embedder.encode(inputText)
-            #expect(!result.isEmpty, "CPU-fallback encode should return non-empty embeddings")
-        }
-
-        // Verify recovery rows were written.
-        let logData = try Data(contentsOf: logURL)
-        let rawText = try #require(String(data: logData, encoding: .utf8), "log not UTF-8")
-        let lines = rawText.split(separator: "\n", omittingEmptySubsequences: true)
-        #expect(lines.count == encodeCount,
-                "expected \(encodeCount) recovery rows, got \(lines.count)")
-
-        for (idx, line) in lines.enumerated() {
-            let rowData = Data(line.utf8)
-            let json = try #require(
-                try JSONSerialization.jsonObject(with: rowData) as? [String: Any],
-                "row \(idx) is not a JSON object"
-            )
-            #expect((json["name"]     as? String) == "recovered_iosurface_exhaustion",
-                    "row \(idx): unexpected name")
-            #expect((json["category"] as? String) == "warning",
-                    "row \(idx): category must be \"warning\"")
-            // Scenario C regression: CPU-error fields must be absent on successful recovery.
-            #expect((json["cpuErrorName"]   as? String) == nil,
-                    "row \(idx): cpuErrorName must be nil on recovery row")
-            #expect((json["cpuErrorReason"] as? String) == nil,
-                    "row \(idx): cpuErrorReason must be nil on recovery row")
-        }
-    }
-
-    // MARK: - R6 Scenario A: CPU factory throws on construction
-
-    /// When the CPU fallback factory itself throws (model load failure), the JSONL
-    /// row must record the CPU-side error with `category: "cpu_fallback_failed"` —
-    /// not silently masquerade as an unrecovered ANE failure (`category: "error"`).
-    @Test("CPU factory throws: JSONL row records cpu_fallback_failed with CPU error fields")
-    func testCPUFactoryThrowsLogsDistinctError() async throws {
-        let tokenizer = try Self.makeTokenizer()
-
-        let logURL = FileManager.default.temporaryDirectory
-            .appendingPathComponent("switchcraft-cpu-factory-fail-\(UUID().uuidString).jsonl")
-        defer { try? FileManager.default.removeItem(at: logURL) }
-
-        let dims = 16
-        // Main predictor always raises IOSurface-like exception.
-        // CPU factory always throws a plain NSError (model load failure).
-        let cpuInitError = NSError(
-            domain: "test.cpu",
-            code: 1,
-            userInfo: [NSLocalizedDescriptionKey: "cpu init failed"]
-        )
-        let embedder = try T5CoreMLEmbedder(
-            predictorFactory: { CountingStubPredictor(failInterval: 1, dims: dims) },
-            cpuPredictorFactory: { throw cpuInitError },
+            predictorFactory: {
+                counter.increment()
+                return counter.count == 1
+                    ? CountingStubPredictor(failInterval: 1, dims: dims)
+                    : CountingStubPredictor(failInterval: nil, dims: dims)
+            },
             tokenizer: tokenizer,
             dims: dims,
             windowSize: 64,
@@ -208,58 +146,30 @@ struct T5CoreMLEmbedderStressTests {
             reloadInterval: 500
         )
 
-        let inputText = "test input"
-        // encode must throw because CPU fallback also failed.
-        do {
-            _ = try await embedder.encode(inputText)
-            Issue.record("Expected CoreMLNativeError to be thrown but encode returned normally")
-            return
-        } catch is CoreMLNativeError {
-            // Expected — original IOSurface error is rethrown.
-        }
-
-        // Verify the JSONL row has category "cpu_fallback_failed" with CPU error fields.
-        let logData = try Data(contentsOf: logURL)
-        let rawText = try #require(String(data: logData, encoding: .utf8), "log not UTF-8")
-        let lines = rawText.split(separator: "\n", omittingEmptySubsequences: true)
-        // Exactly 1 row (the cpu_fallback_failed row).
-        #expect(lines.count == 1, "expected 1 JSONL row, got \(lines.count)")
-
-        let rowData = Data(lines[0].utf8)
-        let json = try #require(
-            try JSONSerialization.jsonObject(with: rowData) as? [String: Any],
-            "JSONL row is not a JSON object"
-        )
-        #expect((json["category"] as? String) == "cpu_fallback_failed",
-                "category must be cpu_fallback_failed, got \(json["category"] ?? "nil")")
-        #expect((json["cpuErrorName"]   as? String) != nil,
-                "cpuErrorName must be non-nil when CPU factory threw")
-        #expect((json["cpuErrorReason"] as? String) != nil,
-                "cpuErrorReason must be non-nil when CPU factory threw")
-        // Confirm the row does NOT masquerade as a plain unrecovered error.
-        #expect((json["category"] as? String) != "error",
-                "row must not have category \"error\" — that would be indistinguishable from no-fallback-attempted")
+        // encode must not throw — Layer 3a reactive reload + ANE retry should recover.
+        let result = try await embedder.encode("test input")
+        #expect(!result.isEmpty, "Layer 3a ANE retry should return non-empty embeddings")
+        // No JSONL row when the retry succeeds.
+        #expect(!FileManager.default.fileExists(atPath: logURL.path),
+                "No JSONL row expected when ANE retry succeeds")
     }
 
-    // MARK: - R6 Scenario B: CPU predict throws after successful factory construction
-
-    /// When the CPU fallback factory succeeds but `cpuPredictor.predict` throws,
-    /// the JSONL row must record the CPU-side error name and reason distinctly
-    /// from the original ANE error.
-    @Test("CPU predict throws: JSONL row records cpu_fallback_failed with CPU error name and reason")
-    func testCPUPredictThrowsLogsDistinctError() async throws {
+    /// When both the initial predictor and the Layer 3a ANE retry raise an
+    /// IOSurface-like exception, encode must throw `CoreMLNativeError` and
+    /// write exactly one JSONL row with `category: "error"`. The retired
+    /// `cpu_fallback_failed` category must not appear.
+    @Test("Layer 3a ANE retry failure logs error row and rethrows")
+    func testANERetryFailsLogsErrorRow() async throws {
         let tokenizer = try Self.makeTokenizer()
 
         let logURL = FileManager.default.temporaryDirectory
-            .appendingPathComponent("switchcraft-cpu-predict-fail-\(UUID().uuidString).jsonl")
+            .appendingPathComponent("switchcraft-ane-retry-fail-\(UUID().uuidString).jsonl")
         defer { try? FileManager.default.removeItem(at: logURL) }
 
         let dims = 16
-        // Main predictor always raises IOSurface-like exception.
-        // CPU factory returns ThrowingStubPredictor whose predict() always raises "TestCrash".
+        // Always-failing predictor: every predict raises an IOSurface-like exception.
         let embedder = try T5CoreMLEmbedder(
             predictorFactory: { CountingStubPredictor(failInterval: 1, dims: dims) },
-            cpuPredictorFactory: { ThrowingStubPredictor() },
             tokenizer: tokenizer,
             dims: dims,
             windowSize: 64,
@@ -269,15 +179,16 @@ struct T5CoreMLEmbedderStressTests {
             reloadInterval: 500
         )
 
-        let inputText = "test input"
+        // encode must throw — both the initial ANE call and the Layer 3a retry fail.
         do {
-            _ = try await embedder.encode(inputText)
+            _ = try await embedder.encode("test input")
             Issue.record("Expected CoreMLNativeError to be thrown but encode returned normally")
             return
         } catch is CoreMLNativeError {
             // Expected — original IOSurface error is rethrown.
         }
 
+        // Verify a single error row was written.
         let logData = try Data(contentsOf: logURL)
         let rawText = try #require(String(data: logData, encoding: .utf8), "log not UTF-8")
         let lines = rawText.split(separator: "\n", omittingEmptySubsequences: true)
@@ -288,13 +199,13 @@ struct T5CoreMLEmbedderStressTests {
             try JSONSerialization.jsonObject(with: rowData) as? [String: Any],
             "JSONL row is not a JSON object"
         )
-        #expect((json["category"] as? String) == "cpu_fallback_failed",
-                "category must be cpu_fallback_failed, got \(json["category"] ?? "nil")")
-        // ThrowingStubPredictor raises "TestCrash" / "deliberate test exception".
-        #expect((json["cpuErrorName"]   as? String) == ThrowingStubPredictor.exceptionName,
-                "cpuErrorName must match ThrowingStubPredictor.exceptionName")
-        #expect((json["cpuErrorReason"] as? String) == ThrowingStubPredictor.exceptionReason,
-                "cpuErrorReason must match ThrowingStubPredictor.exceptionReason")
+        #expect((json["category"] as? String) == "error",
+                "category must be \"error\", got \(json["category"] ?? "nil")")
+        // The retired cpu_fallback_failed category must not appear.
+        #expect((json["category"] as? String) != "cpu_fallback_failed",
+                "retired cpu_fallback_failed category must not appear")
+        #expect((json["cpuErrorName"] as? String) == nil,
+                "cpuErrorName field must be absent after Layer 3b removal")
     }
 }
 
diff --git a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
index 5c34225..c7ab045 100644
--- a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
+++ b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
@@ -198,3 +198,63 @@ is out of scope for issue #90 and should be filed as a separate issue.
 
 - Issue #90 — production failure report and bug fixes.
 - Issue #89 — parallel input-size guard (structural prevention of size-driven trigger).
+
+## 2026-05-07 Addendum (Issue #93)
+
+### Production evidence: Layer 3b CPU fallback has 0% recovery rate
+
+A SafariUnfucker bulk-index run (2026-05-07) processed 5,570 successful inferences
+and 1,038 failures:
+
+```
+1038  cpu_fallback_failed  (every single failure)
+1038  "Failed to allocate E5 buffer object. E5RT: Failed to allocate memory IOSurface object. (3)"
+0     recovered_iosurface_exhaustion  (none)
+```
+
+Every IOSurface exception that reached Layer 3b resulted in `cpu_fallback_failed`
+— zero recoveries. The ANE pool DID self-recover three times during the run, with
+recovery periods of 5–25 minutes between failure bursts. Recovery timing is
+consistent with the proactive reload (Layer 2) firing at 500-call boundaries, not
+with the CPU fallback.
+
+### Decision: Remove Layer 3b CPU fallback
+
+Layer 3b is removed. The production evidence is determinative: 0/1,038 recovery
+rate across three distinct failure bursts, with per-failure overhead of one extra
+`MLModel` load and `predict` call that contributed nothing.
+
+After this change, the three-layer model becomes a two-layer model:
+
+1. **Layer 1** — `autoreleasepool` per window (unchanged).
+2. **Layer 2** — Proactive model reload every `reloadInterval` encodes (unchanged).
+3. **Layer 3a** — Reactive reload + ANE retry on IOSurface failure. If the ANE
+   retry also fails, the original error is logged with `category: "error"` and
+   rethrown. No CPU fallback is attempted.
+
+### Simplified JSONL category scheme
+
+The three-state scheme from the 2026-05-06 addendum is reduced to two active
+categories:
+
+| Category | Meaning |
+|----------|---------|
+| `"error"` | Inference failed. Includes both non-IOSurface failures and IOSurface failures where the Layer 3a ANE retry also failed. |
+| `"warning"` | (Reserved; not currently produced.) |
+
+The `"cpu_fallback_failed"` category is retired. It will no longer appear in
+`failureLogURL` JSONL output. The `cpuErrorName`, `cpuErrorReason`, and
+`cpuCallStack` fields remain in `CoreMLFailureLogEntry` (out of scope to remove)
+but will always be absent (`nil`) from JSON output going forward.
+
+### ADR 010 §(g) no longer applies to the Layer 3 path
+
+ADR 010 §(g) sanctions `.cpuOnly` compute-unit override for constrained
+environments. This sanction was cited in the original ADR 021 to justify the CPU
+fallback's use of `.cpuOnly`. With Layer 3b removed, ADR 010 §(g) no longer
+applies to the Layer 3 path. The citation is noted here so future readers know
+it was intentionally vacated, not overlooked.
+
+### References
+
+- Issue #93 — production evidence (0/1,038 CPU recovery rate) and Layer 3b removal.
diff --git a/docs/Plan.md b/docs/Plan.md
index c32ccb2..adad4c7 100644
--- a/docs/Plan.md
+++ b/docs/Plan.md
@@ -320,6 +320,7 @@ Track progress by checking off items as they land. Effort estimates and notes fo
 - [x] **T5CoreMLEmbedder ANE IOSurface fix** (#87) — `autoreleasepool` per window, proactive model reload every `reloadInterval` encodes (default 500, tunable), reactive CPU-fallback with JSONL recovery telemetry (`"recovered_iosurface_exhaustion"`); stub stress test (5k iterations, always-on CI) + real-asset stress test (10k iterations, asset-gated); ADR 021
 - [x] **Embedder overflow guard** (#89) — `maxInputTokens: Int` added to `Embedder` protocol; `EmbedderOverflowPolicy` (`.truncate` / `.reject`) + `EmbedderError.inputTooLarge` in `SwitchcraftCore`; overflow guard in `T5CoreMLEmbedder.encode(_:)` and `T5MetalEmbedder.encode(_:)` between tokenization and `SlidingWindow.plan`; default `8 * windowSize` (4,096 tokens → ~15 windows); prevents ANE pool poisoning from oversized inputs; ADR 022
 - [x] **T5CoreMLEmbedder ANE IOSurface mitigation hardening** (#90) — Fix three bugs in post-#88 production code: (1) silent CPU fallback gap: new `"cpu_fallback_failed"` JSONL category with `cpuErrorName`/`cpuErrorReason`/`cpuCallStack` fields; (2) `reloadInterval` default lowered 500→150 (below observed 388-call production failure point); (3) reactive reload + ANE retry added to Layer 3 before CPU fallback; per-window timing via `os.Logger`; Scenario A/B/C mock tests; ADR 021 amended
+- [x] **Remove Layer 3b CPU fallback (dead code, 0% recovery rate)** (#93) — Production evidence (0/1,038 CPU recovery rate across 2026-05-07 SafariUnfucker run) showed the CPU fallback never recovered any inference; `cpuPredictorFactory` property and all wiring removed; Layer 3b branch deleted from `predictWindow`; Layer 3a failure path now explicitly logs + rethrows; `logRecoveredIOSurface`, `logCPUFallbackFailed`, `extractCPUErrorFields` methods deleted; `cpu_fallback_failed` JSONL category retired; two new Layer 3a stub tests added (`testANERetrySucceedsAfterReload`, `testANERetryFailsLogsErrorRow`); ADR 021 second addendum
 - [x] **K-means clustering** (1 week) — Standard algorithm, use Accelerate
 - [x] **4-bit residual codec** (1 week) — ~200 lines, bit-level packing; round-trip property tests
 - [x] **LSM-tree index structure** (1 week) — Cascading merge logic

From d35954c3a96b575da5b48bb7aee3c547c0fcf3dd Mon Sep 17 00:00:00 2001
From: Shannon Holland <github@shannonholland.com>
Date: Thu, 7 May 2026 13:49:59 -0700
Subject: [PATCH 2/3] =?UTF-8?q?fix(review):=20correct=20ADR=20021=20layer?=
 =?UTF-8?q?=20count=20and=20status=20date;=20rename=20Layer=203a=E2=86=92L?=
 =?UTF-8?q?ayer=203?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot finding [1]: ADR status line still read "Amended (2026-05-06)" despite a
new 2026-05-07 addendum having been added. Updated to list both dates.

Copilot finding [2]: The addendum claimed "three-layer model becomes a two-layer
model" but then enumerated Layer 1, Layer 2, and Layer 3a — three layers. The
original design had three layers (1, 2, 3); Layer 3 had two sub-paths (3a and
3b). With 3b removed, Layer 3 is simplified to a single path; the model is still
three layers. Rewrote the paragraph and dropped the "3a" sub-label from Layer 3
throughout the ADR, production source, and test files for consistency.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../SwitchcraftCoreML/T5CoreMLEmbedder.swift  |  4 ++--
 .../T5CoreMLEmbedderStressTests.swift         | 20 +++++++++----------
 ...ne-iosurface-pool-exhaustion-mitigation.md |  8 +++++---
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
index 87b9534..8fa683f 100644
--- a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
+++ b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
@@ -298,7 +298,7 @@ public actor T5CoreMLEmbedder: Embedder {
     /// Test-only init: inject a factory for predictor lifecycle testing.
     ///
     /// Use this variant when the test must verify model reload behaviour or
-    /// the Layer 3a reactive reload + ANE retry path. The factory is called
+    /// the Layer 3 reactive reload + ANE retry path. The factory is called
     /// once during init and again on each proactive or reactive reload.
     ///
     /// `internal` — access from test targets via `@testable import SwitchcraftCoreML`.
@@ -502,7 +502,7 @@ public actor T5CoreMLEmbedder: Embedder {
                 throw nativeError
             }
 
-            // Layer 3a — Reactive reload + ANE retry: force-reload the predictor
+            // Layer 3 — Reactive reload + ANE retry: force-reload the predictor
             // and retry on ANE.
             do {
                 self.predictor = try predictorFactory()
diff --git a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
index a70ec0a..00384e5 100644
--- a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
+++ b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift
@@ -9,7 +9,7 @@ import CoreML
 
 /// Stress and lifecycle tests for `T5CoreMLEmbedder`'s ANE IOSurface pool
 /// exhaustion mitigation: autoreleasepool discipline, proactive model reload,
-/// and reactive reload + ANE retry (Layer 3a).
+/// and reactive reload + ANE retry (Layer 3).
 ///
 /// No CoreML model asset is required — `CountingStubPredictor` is injected
 /// via the factory-based internal init.
@@ -112,13 +112,13 @@ struct T5CoreMLEmbedderStressTests {
         #expect(counter.count >= 4, "expected ≥4 factory calls (1 init + 3 reloads), got \(counter.count)")
     }
 
-    // MARK: - Layer 3a tests
+    // MARK: - Layer 3 tests
 
-    /// When the initial predictor raises an IOSurface-like exception, Layer 3a
+    /// When the initial predictor raises an IOSurface-like exception, Layer 3
     /// must force-reload the predictor via the factory and retry on ANE. When
     /// the reloaded predictor succeeds, encode must not throw and no JSONL row
     /// must be written.
-    @Test("Layer 3a ANE retry succeeds after reactive reload, no JSONL row written")
+    @Test("Layer 3 ANE retry succeeds after reactive reload, no JSONL row written")
     func testANERetrySucceedsAfterReload() async throws {
         let tokenizer = try Self.makeTokenizer()
 
@@ -129,7 +129,7 @@ struct T5CoreMLEmbedderStressTests {
         let dims = 16
         let counter = FactoryCallCounter()
         // Factory call 1 (at init): returns a predictor that fails every predict.
-        // Factory call 2 (Layer 3a reactive reload): returns a succeeding predictor.
+        // Factory call 2 (Layer 3 reactive reload): returns a succeeding predictor.
         let embedder = try T5CoreMLEmbedder(
             predictorFactory: {
                 counter.increment()
@@ -146,19 +146,19 @@ struct T5CoreMLEmbedderStressTests {
             reloadInterval: 500
         )
 
-        // encode must not throw — Layer 3a reactive reload + ANE retry should recover.
+        // encode must not throw — Layer 3 reactive reload + ANE retry should recover.
         let result = try await embedder.encode("test input")
-        #expect(!result.isEmpty, "Layer 3a ANE retry should return non-empty embeddings")
+        #expect(!result.isEmpty, "Layer 3 ANE retry should return non-empty embeddings")
         // No JSONL row when the retry succeeds.
         #expect(!FileManager.default.fileExists(atPath: logURL.path),
                 "No JSONL row expected when ANE retry succeeds")
     }
 
-    /// When both the initial predictor and the Layer 3a ANE retry raise an
+    /// When both the initial predictor and the Layer 3 ANE retry raise an
     /// IOSurface-like exception, encode must throw `CoreMLNativeError` and
     /// write exactly one JSONL row with `category: "error"`. The retired
     /// `cpu_fallback_failed` category must not appear.
-    @Test("Layer 3a ANE retry failure logs error row and rethrows")
+    @Test("Layer 3 ANE retry failure logs error row and rethrows")
     func testANERetryFailsLogsErrorRow() async throws {
         let tokenizer = try Self.makeTokenizer()
 
@@ -179,7 +179,7 @@ struct T5CoreMLEmbedderStressTests {
             reloadInterval: 500
         )
 
-        // encode must throw — both the initial ANE call and the Layer 3a retry fail.
+        // encode must throw — both the initial ANE call and the Layer 3 retry fail.
         do {
             _ = try await embedder.encode("test input")
             Issue.record("Expected CoreMLNativeError to be thrown but encode returned normally")
diff --git a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
index c7ab045..c29fa3a 100644
--- a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
+++ b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
@@ -2,7 +2,7 @@
 
 ## Status
 
-Amended (2026-05-06 — see addendum below)
+Amended (2026-05-06, 2026-05-07 — see addenda below)
 
 ## Context
 
@@ -224,11 +224,13 @@ Layer 3b is removed. The production evidence is determinative: 0/1,038 recovery
 rate across three distinct failure bursts, with per-failure overhead of one extra
 `MLModel` load and `predict` call that contributed nothing.
 
-After this change, the three-layer model becomes a two-layer model:
+After this change, the three-layer model is retained but Layer 3 is simplified.
+Layer 3b (CPU fallback) is removed; Layer 3a (reactive reload + ANE retry) is the
+sole Layer 3 path and is no longer referred to with the "3a" sub-label:
 
 1. **Layer 1** — `autoreleasepool` per window (unchanged).
 2. **Layer 2** — Proactive model reload every `reloadInterval` encodes (unchanged).
-3. **Layer 3a** — Reactive reload + ANE retry on IOSurface failure. If the ANE
+3. **Layer 3** — Reactive reload + ANE retry on IOSurface failure. If the ANE
    retry also fails, the original error is logged with `category: "error"` and
    rethrown. No CPU fallback is attempted.
 

From f2046ae4143039d2fd2b028762fd38156e2ab0e4 Mon Sep 17 00:00:00 2001
From: Shannon Holland <github@shannonholland.com>
Date: Thu, 7 May 2026 13:50:37 -0700
Subject: [PATCH 3/3] fix(review): update ADR 021 JSONL table to say 'Layer 3'
 not 'Layer 3a'

The category table in the 2026-05-07 addendum described the error case using
the old 'Layer 3a' sub-label, inconsistent with the addendum's own paragraph
above it that drops the sub-label in favour of plain 'Layer 3'.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 adrs/021-ane-iosurface-pool-exhaustion-mitigation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
index c29fa3a..bb12eb2 100644
--- a/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
+++ b/adrs/021-ane-iosurface-pool-exhaustion-mitigation.md
@@ -241,7 +241,7 @@ categories:
 
 | Category | Meaning |
 |----------|---------|
-| `"error"` | Inference failed. Includes both non-IOSurface failures and IOSurface failures where the Layer 3a ANE retry also failed. |
+| `"error"` | Inference failed. Includes both non-IOSurface failures and IOSurface failures where the Layer 3 ANE retry also failed. |
 | `"warning"` | (Reserved; not currently produced.) |
 
 The `"cpu_fallback_failed"` category is retired. It will no longer appear in