totalslacker · totalslacker · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/Sources/SwitchcraftCore/Embedding/Embedder.swift b/Sources/SwitchcraftCore/Embedding/Embedder.swift
@@ -27,7 +27,18 @@ public protocol Embedder: Sendable {
     /// Stable identifier for the model. Recorded on `ChunkRecord.model`.
     var modelIdentifier: String { get }
 
+    /// Maximum number of tokens the embedder will process in a single `encode`
+    /// call. Inputs that tokenise to more tokens than this limit are handled
+    /// according to the conformer's configured `EmbedderOverflowPolicy`.
+    ///
+    /// Actor conformers should expose this as a `nonisolated let` stored property
+    /// so callers can read the limit without awaiting the actor.
+    var maxInputTokens: Int { get }
+
     /// Encode `text` into a flat row-major `n × dims` per-token embedding
     /// matrix. Returns an empty array for empty / whitespace-only text.
+    ///
+    /// - Throws: `EmbedderError.inputTooLarge(actual:max:)` when the token
+    ///   count exceeds `maxInputTokens` and the overflow policy is `.reject`.
     func encode(_ text: String) async throws -> [Float]
 }
diff --git a/Sources/SwitchcraftCore/Embedding/EmbedderError.swift b/Sources/SwitchcraftCore/Embedding/EmbedderError.swift
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: Apache-2.0
+import Foundation
+
+/// Errors thrown by `Embedder` implementations.
+public enum EmbedderError: Error, Sendable, Equatable {
+    /// The tokenised input length exceeded the embedder's `maxInputTokens` limit
+    /// and the configured overflow policy is `.reject`.
+    ///
+    /// - Parameters:
+    ///   - actual: The number of tokens produced by the tokenizer for the input.
+    ///   - max: The embedder's `maxInputTokens` limit.
+    case inputTooLarge(actual: Int, max: Int)
+}
diff --git a/Sources/SwitchcraftCore/Embedding/EmbedderOverflowPolicy.swift b/Sources/SwitchcraftCore/Embedding/EmbedderOverflowPolicy.swift
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0
+import Foundation
+
+/// Controls how an `Embedder` handles inputs whose token count exceeds `maxInputTokens`.
+///
+/// The two policies match the primary consumer trade-offs for retrieval workloads:
+///
+/// - `.truncate` (the default in the embedder initialisers): Silently clips the
+///   token sequence to the first `maxInputTokens` tokens and encodes the prefix.
+///   This follows the established convention (Hugging Face `truncation=True,
+///   max_length=...`) and is appropriate for search, classification, and bulk-index
+///   pipelines where prefix content is informative and silent data loss is acceptable.
+///
+/// - `.reject`: Throws `EmbedderError.inputTooLarge(actual:max:)` so the caller can
+///   decide whether to skip, summarise, or split the input. Use this when silent
+///   truncation would violate the application's correctness guarantees.
+public enum EmbedderOverflowPolicy: Sendable, Equatable, Hashable {
+    /// Silently truncate the token sequence to `maxInputTokens` elements and encode
+    /// the prefix. No error is thrown; embeddings are returned for the truncated input.
+    case truncate
+
+    /// Throw `EmbedderError.inputTooLarge(actual:max:)` without calling the underlying
+    /// model. The caller is responsible for splitting, summarising, or skipping the input.
+    case reject
+}
diff --git a/Sources/SwitchcraftCoreML/MLPredictor.swift b/Sources/SwitchcraftCoreML/MLPredictor.swift
@@ -16,6 +16,11 @@ internal protocol MLPredictor: Sendable {
     func predict(input: any MLFeatureProvider) throws -> any MLFeatureProvider
 }
 
+// @unchecked @retroactive: retroactive conformances on CoreML types; safe
+// because all access is gated through T5CoreMLEmbedder's actor isolation.
+extension MLModel: @unchecked @retroactive Sendable {}
+extension MLDictionaryFeatureProvider: @unchecked @retroactive Sendable {}
+
 extension MLModel: MLPredictor {
     internal func predict(input: any MLFeatureProvider) throws -> any MLFeatureProvider {
         try self.prediction(from: input)

diff --git a/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift b/Sources/SwitchcraftCoreML/T5CoreMLEmbedder.swift
@@ -87,6 +87,20 @@ public actor T5CoreMLEmbedder: Embedder {
     /// `windowSize`. `256` matches Witchcraft's sliding-window stride.
     public nonisolated let stride: Int
 
+    /// Maximum number of input tokens, counted before the input is split into
+    /// sliding windows, that the embedder will accept in a single `encode` call.
+    /// Inputs that tokenise to more tokens are handled according to `overflowPolicy`.
+    ///
+    /// Default `8 * windowSize` (4,096 for the standard 512-token window),
+    /// yielding ~15 windows at stride 256 — well below the ~577-window burst
+    /// that exhausted the ANE IOSurface pool. Must be ≥ `windowSize`.
+    public nonisolated let maxInputTokens: Int
+
+    /// Controls behaviour when a tokenised input exceeds `maxInputTokens`.
+    /// `.truncate` (default) silently clips to the prefix; `.reject` throws
+    /// `EmbedderError.inputTooLarge(actual:max:)`.
+    public nonisolated let overflowPolicy: EmbedderOverflowPolicy
+
     private let tokenizer: Tokenizer
     private var predictor: any MLPredictor
     /// Recreates the main predictor on demand; used by proactive reload to
@@ -136,6 +150,12 @@ public actor T5CoreMLEmbedder: Embedder {
     ///     accumulated ANE IOSurface resources. Default `150` — see the stored
     ///     property doc-comment for tuning guidance. Existing callers that omit
     ///     this parameter are unaffected.
+    ///   - maxInputTokens: maximum total token count accepted per `encode` call.
+    ///     Inputs exceeding this limit are handled by `overflowPolicy`. Pass `nil`
+    ///     (the default) to use `8 * windowSize`. Must be ≥ `windowSize`.
+    ///   - overflowPolicy: `.truncate` (default) clips oversized inputs to the
+    ///     first `maxInputTokens` tokens; `.reject` throws
+    ///     `EmbedderError.inputTooLarge(actual:max:)`.
     /// - Throws: any error from `MLModel.compileModel(at:)` or `MLModel(contentsOf:)`.
     public init(
         modelURL: URL,
@@ -147,13 +167,18 @@ public actor T5CoreMLEmbedder: Embedder {
         stride: Int = 256,
         minNorm: Float = 1.0,
         failureLogURL: URL? = nil,
-        reloadInterval: Int = 150
+        reloadInterval: Int = 150,
+        maxInputTokens: Int? = nil,
+        overflowPolicy: EmbedderOverflowPolicy = .truncate
     ) async throws {
         precondition(dims > 0 && dims % 2 == 0,
                      "dims must be positive and even (Q4 codec packs two nibbles per byte)")
         precondition(windowSize > 0)
         precondition(stride > 0 && stride <= windowSize)
         precondition(reloadInterval > 0, "reloadInterval must be positive (used as modulo divisor)")
+        let resolvedMaxInputTokens = maxInputTokens ?? 8 * windowSize
+        precondition(resolvedMaxInputTokens >= windowSize,
+                     "maxInputTokens must be >= windowSize (got \(resolvedMaxInputTokens) < \(windowSize))")
 
         let configuration = MLModelConfiguration()
         configuration.computeUnits = computeUnits
@@ -198,6 +223,8 @@ public actor T5CoreMLEmbedder: Embedder {
         self.stride = stride
         self.failureLogURL = failureLogURL
         self.reloadInterval = reloadInterval
+        self.maxInputTokens = resolvedMaxInputTokens
+        self.overflowPolicy = overflowPolicy
     }
 
     /// Convenience init that resolves the model URL from a `Bundle`.
@@ -213,7 +240,9 @@ public actor T5CoreMLEmbedder: Embedder {
         stride: Int = 256,
         minNorm: Float = 1.0,
         failureLogURL: URL? = nil,
-        reloadInterval: Int = 150
+        reloadInterval: Int = 150,
+        maxInputTokens: Int? = nil,
+        overflowPolicy: EmbedderOverflowPolicy = .truncate
     ) async throws {
         guard let url = bundle.url(
             forResource: resourceName,
@@ -235,7 +264,9 @@ public actor T5CoreMLEmbedder: Embedder {
             stride: stride,
             minNorm: minNorm,
             failureLogURL: failureLogURL,
-            reloadInterval: reloadInterval
+            reloadInterval: reloadInterval,
+            maxInputTokens: maxInputTokens,
+            overflowPolicy: overflowPolicy
         )
     }
 
@@ -250,12 +281,17 @@ public actor T5CoreMLEmbedder: Embedder {
         stride: Int = 256,
         minNorm: Float = 1.0,
         modelIdentifier: String = "stub@v0",
-        failureLogURL: URL? = nil
+        failureLogURL: URL? = nil,
+        maxInputTokens: Int? = nil,
+        overflowPolicy: EmbedderOverflowPolicy = .truncate
     ) {
         precondition(dims > 0 && dims % 2 == 0,
                      "dims must be positive and even")
         precondition(windowSize > 0)
         precondition(stride > 0 && stride <= windowSize)
+        let resolvedMaxInputTokens = maxInputTokens ?? 8 * windowSize
+        precondition(resolvedMaxInputTokens >= windowSize,
+                     "maxInputTokens must be >= windowSize")
         let capturedPredictor = predictor
         self.predictorFactory = { capturedPredictor }
         self.cpuPredictorFactory = nil
@@ -268,6 +304,8 @@ public actor T5CoreMLEmbedder: Embedder {
         self.modelIdentifier = modelIdentifier
         self.failureLogURL = failureLogURL
         self.reloadInterval = 150
+        self.maxInputTokens = resolvedMaxInputTokens
+        self.overflowPolicy = overflowPolicy
     }
 
     /// Test-only init: inject a factory for predictor lifecycle testing.
@@ -287,13 +325,18 @@ public actor T5CoreMLEmbedder: Embedder {
         minNorm: Float = 1.0,
         modelIdentifier: String = "stub@v0",
         failureLogURL: URL? = nil,
-        reloadInterval: Int = 150
+        reloadInterval: Int = 150,
+        maxInputTokens: Int? = nil,
+        overflowPolicy: EmbedderOverflowPolicy = .truncate
     ) throws {
         precondition(dims > 0 && dims % 2 == 0,
                      "dims must be positive and even")
         precondition(windowSize > 0)
         precondition(stride > 0 && stride <= windowSize)
         precondition(reloadInterval > 0, "reloadInterval must be positive (used as modulo divisor)")
+        let resolvedMaxInputTokens = maxInputTokens ?? 8 * windowSize
+        precondition(resolvedMaxInputTokens >= windowSize,
+                     "maxInputTokens must be >= windowSize")
         self.predictorFactory = predictorFactory
         self.cpuPredictorFactory = cpuPredictorFactory
         self.predictor = try predictorFactory()
@@ -305,6 +348,8 @@ public actor T5CoreMLEmbedder: Embedder {
         self.modelIdentifier = modelIdentifier
         self.failureLogURL = failureLogURL
         self.reloadInterval = reloadInterval
+        self.maxInputTokens = resolvedMaxInputTokens
+        self.overflowPolicy = overflowPolicy
     }
 
     // MARK: - Embedder
@@ -317,8 +362,10 @@ public actor T5CoreMLEmbedder: Embedder {
     /// failures are silently retried on CPU (see class doc-comment); callers
     /// only receive an error if the retry also fails.
     ///
-    /// - Throws: `T5CoreMLEmbedderError.missingOutput` if the CoreML
-    ///   model does not produce the expected feature dictionary;
+    /// - Throws: `EmbedderError.inputTooLarge(actual:max:)` when the token
+    ///   count exceeds `maxInputTokens` and `overflowPolicy` is `.reject`;
+    ///   `T5CoreMLEmbedderError.missingOutput` if the CoreML model does not
+    ///   produce the expected feature dictionary;
     ///   `CoreMLNativeError.nativeException` if CoreML raises an internal
     ///   ObjC exception that the embedder cannot recover from; any
     ///   tokenizer-originated error.
@@ -343,9 +390,20 @@ public actor T5CoreMLEmbedder: Embedder {
         let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
         if trimmed.isEmpty { return [] }
 
-        let tokens = try tokenizer.encode(text, addSpecialTokens: true)
+        var tokens = try tokenizer.encode(text, addSpecialTokens: true)
         if tokens.isEmpty { return [] }
 
+        // Overflow guard: prevent oversized inputs from generating hundreds of
+        // sliding windows and exhausting the ANE IOSurface buffer pool (ADR 022).
+        if tokens.count > maxInputTokens {
+            switch overflowPolicy {
+            case .truncate:
+                tokens = Array(tokens.prefix(maxInputTokens))
+            case .reject:
+                throw EmbedderError.inputTooLarge(actual: tokens.count, max: maxInputTokens)
+            }
+        }
+
         // Proactive model reload: recreate the predictor every reloadInterval
         // encodes to flush accumulated ANE IOSurface resources.
         // Counter increments only for real inference calls (whitespace-only inputs
@@ -438,7 +496,7 @@ public actor T5CoreMLEmbedder: Embedder {
     /// Run one window prediction with autoreleasepool drainage, reactive reload,
     /// ANE retry, and IOSurface CPU fallback.
     private func predictWindow(
-        provider: any MLFeatureProvider,
+        provider: MLDictionaryFeatureProvider,
         inputLength: Int,
         windowTokenCount: Int
     ) throws -> any MLFeatureProvider {

diff --git a/Sources/SwitchcraftMetal/T5MetalEmbedder.swift b/Sources/SwitchcraftMetal/T5MetalEmbedder.swift
@@ -65,6 +65,8 @@ public actor T5MetalEmbedder: Embedder {
     public nonisolated let minNorm: Float
     public nonisolated let windowSize: Int
     public nonisolated let stride: Int
+    public nonisolated let maxInputTokens: Int
+    public nonisolated let overflowPolicy: EmbedderOverflowPolicy
 
     // MARK: - Architecture constants
     //
@@ -149,12 +151,17 @@ public actor T5MetalEmbedder: Embedder {
         windowSize windowSizeParam: Int = 512,
         stride strideParam: Int = 256,
         minNorm minNormParam: Float = 1.0,
-        modelIdentifier modelIdentifierParam: String = "google/xtr-base-en@v1+gguf"
+        modelIdentifier modelIdentifierParam: String = "google/xtr-base-en@v1+gguf",
+        maxInputTokens maxInputTokensParam: Int? = nil,
+        overflowPolicy overflowPolicyParam: EmbedderOverflowPolicy = .truncate
     ) async throws {
         precondition(dimsParam > 0 && dimsParam % 2 == 0,
                      "dims must be positive and even (Q4 codec packs two nibbles per byte)")
         precondition(windowSizeParam > 0)
         precondition(strideParam > 0 && strideParam <= windowSizeParam)
+        let resolvedMaxInputTokens = maxInputTokensParam ?? 8 * windowSizeParam
+        precondition(resolvedMaxInputTokens >= windowSizeParam,
+                     "maxInputTokens must be >= windowSize")
 
         guard let context = MetalContext.shared else {
             throw T5MetalEmbedderError.metalUnavailable
@@ -414,6 +421,8 @@ public actor T5MetalEmbedder: Embedder {
         self.minNorm = minNormParam
         self.windowSize = windowSizeParam
         self.stride = strideParam
+        self.maxInputTokens = resolvedMaxInputTokens
+        self.overflowPolicy = overflowPolicyParam
         self.tokenizer = tokenizer
         self.context = context
         self.layers = layerWeights
@@ -456,9 +465,20 @@ public actor T5MetalEmbedder: Embedder {
         let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
         if trimmed.isEmpty { return [] }
 
-        let tokens = try tokenizer.encode(text, addSpecialTokens: true)
+        var tokens = try tokenizer.encode(text, addSpecialTokens: true)
         if tokens.isEmpty { return [] }
 
+        // Overflow guard: prevent oversized inputs from generating hundreds of
+        // Metal command buffers and exhausting device memory (ADR 022).
+        if tokens.count > maxInputTokens {
+            switch overflowPolicy {
+            case .truncate:
+                tokens = Array(tokens.prefix(maxInputTokens))
+            case .reject:
+                throw EmbedderError.inputTooLarge(actual: tokens.count, max: maxInputTokens)
+            }
+        }
+
         let starts = SlidingWindow.plan(
             tokenCount: tokens.count,
             windowSize: windowSize,

diff --git a/Tests/SwitchcraftTests/SearchTimeoutTests.swift b/Tests/SwitchcraftTests/SearchTimeoutTests.swift
@@ -55,6 +55,7 @@ struct SearchTimeoutTests {
 
         var dims: Int { inner.dims }
         var modelIdentifier: String { inner.modelIdentifier }
+        var maxInputTokens: Int { inner.maxInputTokens }
 
         func encode(_ text: String) async throws -> [Float] {
             // Task.sleep respects task cancellation: it throws

diff --git a/Tests/SwitchcraftTests/Support/MockEmbedder.swift b/Tests/SwitchcraftTests/Support/MockEmbedder.swift
@@ -15,6 +15,7 @@ import SwitchcraftCore
 struct MockEmbedder: Embedder, Sendable {
     let dims: Int
     let modelIdentifier: String
+    let maxInputTokens: Int = Int.max
 
     init(dims: Int = 128, modelIdentifier: String? = nil) {
         precondition(dims > 0 && dims % 2 == 0,

diff --git a/Tests/SwitchcraftTests/SwitchcraftStoreTests.swift b/Tests/SwitchcraftTests/SwitchcraftStoreTests.swift
@@ -339,6 +339,7 @@ struct SwitchcraftStoreTests {
         struct OddDimsEmbedder: Embedder {
             let dims = 33
             let modelIdentifier = "odd"
+            let maxInputTokens: Int = Int.max
             func encode(_ text: String) async throws -> [Float] { [] }
         }