Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 12 additions & 107 deletions Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,9 @@ private let coreMLLogger = Logger(subsystem: "com.switchcraft.coreml", category:
/// ANE resources before the pool is exhausted. Each reload takes 1–3 s on
/// ANE-capable hardware (CPU recompile is faster); tune `reloadInterval`
/// to balance stall frequency against pool pressure for your workload.
/// 3. On IOSurface failure: force-reload + ANE retry first; if that also fails,
/// retry the window on `.cpuOnly`. Logged in `failureLogURL` (when set)
/// with three distinct `category` values:
/// - `"warning"` (`"recovered_iosurface_exhaustion"`): CPU fallback succeeded.
/// - `"cpu_fallback_failed"`: CPU fallback was attempted but also failed.
/// - `"error"`: IOSurface was not the cause (no recovery attempted).
/// 3. On IOSurface failure: force-reload + ANE retry. If the ANE retry also
/// fails, the original error is logged in `failureLogURL` (when set) with
/// `category: "error"` and rethrown.
///
/// See ADR 021 for the full rationale.
///
Expand Down Expand Up @@ -106,9 +103,6 @@ public actor T5CoreMLEmbedder: Embedder {
/// Recreates the main predictor on demand; used by proactive reload to
/// flush accumulated ANE IOSurface resources.
private let predictorFactory: @Sendable () throws -> any MLPredictor
/// Produces a `.cpuOnly` predictor for reactive IOSurface fallback.
/// `nil` in test inits that use a static stub (no real `compiledURL` available).
private let cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)?
private let failureLogURL: URL?
private var callCount: Int = 0
/// Number of `encode` calls between proactive model reloads.
Expand Down Expand Up @@ -206,14 +200,8 @@ public actor T5CoreMLEmbedder: Embedder {
config.computeUnits = capturedComputeUnits
return try MLModel(contentsOf: capturedCompiledURL, configuration: config)
}
let cpuFactory: @Sendable () throws -> any MLPredictor = {
let config = MLModelConfiguration()
config.computeUnits = .cpuOnly
return try MLModel(contentsOf: capturedCompiledURL, configuration: config)
}

self.predictorFactory = factory
self.cpuPredictorFactory = cpuFactory
self.predictor = try MLModel(contentsOf: compiledURL, configuration: configuration)
self.tokenizer = tokenizer
self.dims = dims
Expand Down Expand Up @@ -294,7 +282,6 @@ public actor T5CoreMLEmbedder: Embedder {
"maxInputTokens must be >= windowSize")
let capturedPredictor = predictor
self.predictorFactory = { capturedPredictor }
self.cpuPredictorFactory = nil
self.predictor = predictor
self.tokenizer = tokenizer
self.dims = dims
Expand All @@ -311,13 +298,12 @@ public actor T5CoreMLEmbedder: Embedder {
/// Test-only init: inject a factory for predictor lifecycle testing.
///
/// Use this variant when the test must verify model reload behaviour or
/// the IOSurface CPU-fallback path. The factory is called once during init
/// and again on each proactive reload.
/// the Layer 3 reactive reload + ANE retry path. The factory is called
/// once during init and again on each proactive or reactive reload.
///
/// `internal` — access from test targets via `@testable import SwitchcraftCoreML`.
internal init(
predictorFactory: @escaping @Sendable () throws -> any MLPredictor,
cpuPredictorFactory: (@Sendable () throws -> any MLPredictor)? = nil,
tokenizer: Tokenizer,
dims: Int = 128,
windowSize: Int = 512,
Expand All @@ -338,7 +324,6 @@ public actor T5CoreMLEmbedder: Embedder {
precondition(resolvedMaxInputTokens >= windowSize,
"maxInputTokens must be >= windowSize")
self.predictorFactory = predictorFactory
self.cpuPredictorFactory = cpuPredictorFactory
self.predictor = try predictorFactory()
self.tokenizer = tokenizer
self.dims = dims
Expand All @@ -359,8 +344,8 @@ public actor T5CoreMLEmbedder: Embedder {
///
/// ObjC exceptions from CoreML are converted to `CoreMLNativeError`
/// and thrown rather than crashing the host process. IOSurface allocation
/// failures are silently retried on CPU (see class doc-comment); callers
/// only receive an error if the retry also fails.
/// failures trigger a reactive model reload + ANE retry (see class doc-comment);
/// callers only receive an error if the ANE retry also fails.
///
/// - Throws: `EmbedderError.inputTooLarge(actual:max:)` when the token
/// count exceeds `maxInputTokens` and `overflowPolicy` is `.reject`;
Expand Down Expand Up @@ -493,8 +478,7 @@ public actor T5CoreMLEmbedder: Embedder {

// MARK: - Private helpers

/// Run one window prediction with autoreleasepool drainage, reactive reload,
/// ANE retry, and IOSurface CPU fallback.
/// Run one window prediction with autoreleasepool drainage and reactive reload + ANE retry.
private func predictWindow(
provider: MLDictionaryFeatureProvider,
inputLength: Int,
Expand All @@ -511,15 +495,15 @@ public actor T5CoreMLEmbedder: Embedder {
)
return result
} catch let nativeError as CoreMLNativeError {
guard isIOSurfaceExhaustion(nativeError), let cpuFactory = cpuPredictorFactory else {
guard isIOSurfaceExhaustion(nativeError) else {
if let logURL = failureLogURL {
logNativeException(nativeError, inputLength: inputLength, to: logURL)
}
throw nativeError
}

// Layer 3a — Reactive reload + ANE retry: force-reload the predictor
// and retry on ANE before falling back to CPU.
// Layer 3 — Reactive reload + ANE retry: force-reload the predictor
// and retry on ANE.
do {
self.predictor = try predictorFactory()
let retryResult = try autoreleasepool {
Expand All @@ -531,27 +515,8 @@ public actor T5CoreMLEmbedder: Embedder {
)
return retryResult
} catch {
coreMLLogger.warning(
"T5CoreMLEmbedder: reactive reload/ANE retry failed, falling back to CPU: \(error, privacy: .public)"
)
}

// Layer 3b — CPU fallback: retry this window on .cpuOnly.
do {
let cpuPredictor = try cpuFactory()
let result = try autoreleasepool {
try catchingNSException { try cpuPredictor.predict(input: provider) }
}
if let logURL = failureLogURL {
logRecoveredIOSurface(nativeError, inputLength: inputLength, to: logURL)
}
return result
} catch let cpuError {
// CPU fallback also failed — log with distinct category so the
// two states (no fallback vs. fallback-attempted-and-failed) are
// distinguishable in the JSONL log.
if let logURL = failureLogURL {
logCPUFallbackFailed(nativeError, cpuError: cpuError, inputLength: inputLength, to: logURL)
logNativeException(nativeError, inputLength: inputLength, to: logURL)
}
throw nativeError
}
Expand Down Expand Up @@ -589,66 +554,6 @@ public actor T5CoreMLEmbedder: Embedder {
)
}

private func logRecoveredIOSurface(
_ error: CoreMLNativeError,
inputLength: Int,
to url: URL
) {
guard case .nativeException(_, let reason, let callStack) = error else { return }

coreMLLogger.warning(
"🟡 [COREML-RECOVERY] recovered IOSurface exhaustion on CPU fallback input_len=\(inputLength, privacy: .public)"
)

appendJSONLRow(
name: "recovered_iosurface_exhaustion",
reason: reason,
inputLength: inputLength,
callStack: Array(callStack.prefix(5)),
category: "warning",
to: url
)
}

private func logCPUFallbackFailed(
_ aneError: CoreMLNativeError,
cpuError: Error,
inputLength: Int,
to url: URL
) {
guard case .nativeException(let aneName, let aneReason, let aneCallStack) = aneError else { return }
let (cpuName, cpuReason, cpuCallStack) = extractCPUErrorFields(cpuError)

coreMLLogger.error(
"🔴 [COREML-CPU-FAILED] ane=\(aneName, privacy: .public) cpu=\(cpuName, privacy: .public) cpu_reason=\(cpuReason, privacy: .public) input_len=\(inputLength, privacy: .public)"
)
for frame in cpuCallStack.prefix(5) {
coreMLLogger.error(" cpu: \(frame, privacy: .public)")
}

appendJSONLRow(
name: aneName,
reason: aneReason,
inputLength: inputLength,
callStack: Array(aneCallStack.prefix(5)),
category: "cpu_fallback_failed",
cpuErrorName: cpuName,
cpuErrorReason: cpuReason,
cpuCallStack: Array(cpuCallStack.prefix(5)),
to: url
)
}

/// Extract name, reason, and call stack from a CPU-side error regardless of
/// its concrete type (`CoreMLNativeError` from predict or `NSError` from model load).
private func extractCPUErrorFields(_ error: Error) -> (name: String, reason: String, callStack: [String]) {
if case .nativeException(let name, let reason, let frames) = error as? CoreMLNativeError {
return (name, reason, frames)
}
let nsErr = error as NSError
return (nsErr.domain, nsErr.localizedDescription, [])
}

private func appendJSONLRow(
name: String,
reason: String,
Expand Down
Loading
Loading