diff --git a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift index 351cdeb..8fa683f 100644 --- a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift +++ b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift @@ -47,12 +47,9 @@ private let coreMLLogger = Logger(subsystem: "com.switchcraft.coreml", category: /// ANE resources before the pool is exhausted. Each reload takes 1–3 s on /// ANE-capable hardware (CPU recompile is faster); tune `reloadInterval` /// to balance stall frequency against pool pressure for your workload. -/// 3. On IOSurface failure: force-reload + ANE retry first; if that also fails, -/// retry the window on `.cpuOnly`. Logged in `failureLogURL` (when set) -/// with three distinct `category` values: -/// - `"warning"` (`"recovered_iosurface_exhaustion"`): CPU fallback succeeded. -/// - `"cpu_fallback_failed"`: CPU fallback was attempted but also failed. -/// - `"error"`: IOSurface was not the cause (no recovery attempted). +/// 3. On IOSurface failure: force-reload + ANE retry. If the ANE retry also +/// fails, the original error is logged in `failureLogURL` (when set) with +/// `category: "error"` and rethrown. /// /// See ADR 021 for the full rationale. /// @@ -106,9 +103,6 @@ public actor T5CoreMLEmbedder: Embedder { /// Recreates the main predictor on demand; used by proactive reload to /// flush accumulated ANE IOSurface resources. private let predictorFactory: @Sendable () throws -> any MLPredictor - /// Produces a `.cpuOnly` predictor for reactive IOSurface fallback. - /// `nil` in test inits that use a static stub (no real `compiledURL` available). - private let cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)? private let failureLogURL: URL? private var callCount: Int = 0 /// Number of `encode` calls between proactive model reloads. @@ -206,14 +200,8 @@ public actor T5CoreMLEmbedder: Embedder { config.computeUnits = capturedComputeUnits return try MLModel(contentsOf: capturedCompiledURL, configuration: config) } - let cpuFactory: @Sendable () throws -> any MLPredictor = { - let config = MLModelConfiguration() - config.computeUnits = .cpuOnly - return try MLModel(contentsOf: capturedCompiledURL, configuration: config) - } self.predictorFactory = factory - self.cpuPredictorFactory = cpuFactory self.predictor = try MLModel(contentsOf: compiledURL, configuration: configuration) self.tokenizer = tokenizer self.dims = dims @@ -294,7 +282,6 @@ public actor T5CoreMLEmbedder: Embedder { "maxInputTokens must be >= windowSize") let capturedPredictor = predictor self.predictorFactory = { capturedPredictor } - self.cpuPredictorFactory = nil self.predictor = predictor self.tokenizer = tokenizer self.dims = dims @@ -311,13 +298,12 @@ public actor T5CoreMLEmbedder: Embedder { /// Test-only init: inject a factory for predictor lifecycle testing. /// /// Use this variant when the test must verify model reload behaviour or - /// the IOSurface CPU-fallback path. The factory is called once during init - /// and again on each proactive reload. + /// the Layer 3 reactive reload + ANE retry path. The factory is called + /// once during init and again on each proactive or reactive reload. /// /// `internal` — access from test targets via `@testable import SwitchcraftCoreML`. internal init( predictorFactory: @escaping @Sendable () throws -> any MLPredictor, - cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)? = nil, tokenizer: Tokenizer, dims: Int = 128, windowSize: Int = 512, @@ -338,7 +324,6 @@ public actor T5CoreMLEmbedder: Embedder { precondition(resolvedMaxInputTokens >= windowSize, "maxInputTokens must be >= windowSize") self.predictorFactory = predictorFactory - self.cpuPredictorFactory = cpuPredictorFactory self.predictor = try predictorFactory() self.tokenizer = tokenizer self.dims = dims @@ -359,8 +344,8 @@ public actor T5CoreMLEmbedder: Embedder { /// /// ObjC exceptions from CoreML are converted to `CoreMLNativeError` /// and thrown rather than crashing the host process. IOSurface allocation - /// failures are silently retried on CPU (see class doc-comment); callers - /// only receive an error if the retry also fails. + /// failures trigger a reactive model reload + ANE retry (see class doc-comment); + /// callers only receive an error if the ANE retry also fails. /// /// - Throws: `EmbedderError.inputTooLarge(actual:max:)` when the token /// count exceeds `maxInputTokens` and `overflowPolicy` is `.reject`; @@ -493,8 +478,7 @@ public actor T5CoreMLEmbedder: Embedder { // MARK: - Private helpers - /// Run one window prediction with autoreleasepool drainage, reactive reload, - /// ANE retry, and IOSurface CPU fallback. + /// Run one window prediction with autoreleasepool drainage and reactive reload + ANE retry. private func predictWindow( provider: MLDictionaryFeatureProvider, inputLength: Int, @@ -511,15 +495,15 @@ public actor T5CoreMLEmbedder: Embedder { ) return result } catch let nativeError as CoreMLNativeError { - guard isIOSurfaceExhaustion(nativeError), let cpuFactory = cpuPredictorFactory else { + guard isIOSurfaceExhaustion(nativeError) else { if let logURL = failureLogURL { logNativeException(nativeError, inputLength: inputLength, to: logURL) } throw nativeError } - // Layer 3a — Reactive reload + ANE retry: force-reload the predictor - // and retry on ANE before falling back to CPU. + // Layer 3 — Reactive reload + ANE retry: force-reload the predictor + // and retry on ANE. do { self.predictor = try predictorFactory() let retryResult = try autoreleasepool { @@ -531,27 +515,8 @@ public actor T5CoreMLEmbedder: Embedder { ) return retryResult } catch { - coreMLLogger.warning( - "T5CoreMLEmbedder: reactive reload/ANE retry failed, falling back to CPU: \(error, privacy: .public)" - ) - } - - // Layer 3b — CPU fallback: retry this window on .cpuOnly. - do { - let cpuPredictor = try cpuFactory() - let result = try autoreleasepool { - try catchingNSException { try cpuPredictor.predict(input: provider) } - } if let logURL = failureLogURL { - logRecoveredIOSurface(nativeError, inputLength: inputLength, to: logURL) - } - return result - } catch let cpuError { - // CPU fallback also failed — log with distinct category so the - // two states (no fallback vs. fallback-attempted-and-failed) are - // distinguishable in the JSONL log. - if let logURL = failureLogURL { - logCPUFallbackFailed(nativeError, cpuError: cpuError, inputLength: inputLength, to: logURL) + logNativeException(nativeError, inputLength: inputLength, to: logURL) } throw nativeError } @@ -589,66 +554,6 @@ public actor T5CoreMLEmbedder: Embedder { ) } - private func logRecoveredIOSurface( - _ error: CoreMLNativeError, - inputLength: Int, - to url: URL - ) { - guard case .nativeException(_, let reason, let callStack) = error else { return } - - coreMLLogger.warning( - "🟡 [COREML-RECOVERY] recovered IOSurface exhaustion on CPU fallback input_len=\(inputLength, privacy: .public)" - ) - - appendJSONLRow( - name: "recovered_iosurface_exhaustion", - reason: reason, - inputLength: inputLength, - callStack: Array(callStack.prefix(5)), - category: "warning", - to: url - ) - } - - private func logCPUFallbackFailed( - _ aneError: CoreMLNativeError, - cpuError: Error, - inputLength: Int, - to url: URL - ) { - guard case .nativeException(let aneName, let aneReason, let aneCallStack) = aneError else { return } - let (cpuName, cpuReason, cpuCallStack) = extractCPUErrorFields(cpuError) - - coreMLLogger.error( - "🔴 [COREML-CPU-FAILED] ane=\(aneName, privacy: .public) cpu=\(cpuName, privacy: .public) cpu_reason=\(cpuReason, privacy: .public) input_len=\(inputLength, privacy: .public)" - ) - for frame in cpuCallStack.prefix(5) { - coreMLLogger.error(" cpu: \(frame, privacy: .public)") - } - - appendJSONLRow( - name: aneName, - reason: aneReason, - inputLength: inputLength, - callStack: Array(aneCallStack.prefix(5)), - category: "cpu_fallback_failed", - cpuErrorName: cpuName, - cpuErrorReason: cpuReason, - cpuCallStack: Array(cpuCallStack.prefix(5)), - to: url - ) - } - - /// Extract name, reason, and call stack from a CPU-side error regardless of - /// its concrete type (`CoreMLNativeError` from predict or `NSError` from model load). - private func extractCPUErrorFields(_ error: Error) -> (name: String, reason: String, callStack: [String]) { - if case .nativeException(let name, let reason, let frames) = error as? CoreMLNativeError { - return (name, reason, frames) - } - let nsErr = error as NSError - return (nsErr.domain, nsErr.localizedDescription, []) - } - private func appendJSONLRow( name: String, reason: String, diff --git a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift index d7a6613..00384e5 100644 --- a/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift +++ b/Tests/SwitchcraftTests/T5CoreMLEmbedderStressTests.swift @@ -9,7 +9,7 @@ import CoreML /// Stress and lifecycle tests for `T5CoreMLEmbedder`'s ANE IOSurface pool /// exhaustion mitigation: autoreleasepool discipline, proactive model reload, -/// and reactive CPU fallback. +/// and reactive reload + ANE retry (Layer 3). /// /// No CoreML model asset is required — `CountingStubPredictor` is injected /// via the factory-based internal init. @@ -112,93 +112,31 @@ struct T5CoreMLEmbedderStressTests { #expect(counter.count >= 4, "expected ≥4 factory calls (1 init + 3 reloads), got \(counter.count)") } - // MARK: - IOSurface fallback test + // MARK: - Layer 3 tests - /// When every prediction raises an IOSurface-like exception, the CPU fallback - /// must succeed and log a `"recovered_iosurface_exhaustion"` JSONL row with - /// `"category": "warning"` for each encode call. - @Test("IOSurface exhaustion triggers CPU fallback and logs recovery row") - func testIOSurfaceFallbackLogsRecovery() async throws { + /// When the initial predictor raises an IOSurface-like exception, Layer 3 + /// must force-reload the predictor via the factory and retry on ANE. When + /// the reloaded predictor succeeds, encode must not throw and no JSONL row + /// must be written. + @Test("Layer 3 ANE retry succeeds after reactive reload, no JSONL row written") + func testANERetrySucceedsAfterReload() async throws { let tokenizer = try Self.makeTokenizer() let logURL = FileManager.default.temporaryDirectory - .appendingPathComponent("switchcraft-stress-recovery-\(UUID().uuidString).jsonl") + .appendingPathComponent("switchcraft-ane-retry-success-\(UUID().uuidString).jsonl") defer { try? FileManager.default.removeItem(at: logURL) } - // Main predictor: always raises IOSurface-like exception. - // CPU fallback predictor: always succeeds. let dims = 16 + let counter = FactoryCallCounter() + // Factory call 1 (at init): returns a predictor that fails every predict. + // Factory call 2 (Layer 3 reactive reload): returns a succeeding predictor. let embedder = try T5CoreMLEmbedder( - predictorFactory: { CountingStubPredictor(failInterval: 1, dims: dims) }, - cpuPredictorFactory: { CountingStubPredictor(failInterval: nil, dims: dims) }, - tokenizer: tokenizer, - dims: dims, - windowSize: 64, - stride: 32, - minNorm: 1.0, - failureLogURL: logURL, - reloadInterval: 500 - ) - - // "test input" (10 chars, 1 window) → each encode = 1 IOSurface hit → 1 recovery row. - let inputText = "test input" - let encodeCount = 10 - - for _ in 0..