diff --git a/CHANGELOG.md b/CHANGELOG.md index 48629e3..06d7b0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,23 @@ Versioning follows [Semantic Versioning](https://semver.org/). ## [Unreleased] -(nothing yet) +### Added +- **Prompt cache tiering** (v0.4.0 engine parity, part 1 of 3). + Successive chat turns on the same model now reuse the KV cache + when the new prompt extends the previous one — the shared prefix + skips prefill. In-memory hot tier (LRU, 8 entries in MVP) backed + by on-disk cold tier at `~/.mac-mlx/kv-cache/`, 16-way sharded + safetensors round-tripped through mlx-swift-lm's `savePromptCache` + / `loadPromptCache`. Coding-assistant workflows (Claude Code, + Cursor, Zed re-sending conversation history each turn) see + reduced time-to-first-token on repeat prefixes. +- Settings → "KV Cache" section with hot/cold budget steppers and + a "Clear All KV Caches" button. Steppers currently inform future + byte-accurate budgeting (v0.4.0.1) — today's enforcement is the + 8-entry hot LRU cap plus manual Clear. +- Debug-level Logs tab entries `Prompt cache HIT — restored N + tokens` / `Prompt cache MISS — cold prefill of N tokens` under + the `inference` category, so you can see cache effectiveness. 
--- diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift index 23bd0a2..6b0eeab 100644 --- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift @@ -1,8 +1,21 @@ import Foundation +import MLX import MLXLLM import MLXLMCommon @preconcurrency import Tokenizers +// MARK: - Sendable-box helpers + +/// Lightweight unchecked-Sendable wrapper used to pass non-Sendable +/// mlx-swift-lm values (`LMInput`, `AsyncStream`) across +/// isolation boundaries when we know the handoff is safe — we `consume` +/// them into the actor via `ModelContainer.perform(nonSendable:_:)` and +/// the actor owns them exclusively afterwards. +private struct NonSendableBox: @unchecked Sendable { + let value: T + init(_ value: T) { self.value = value } +} + // MARK: - Tokenizer loader /// Concrete TokenizerLoader that uses the HuggingFace swift-transformers library. @@ -88,9 +101,18 @@ public actor MLXSwiftEngine: InferenceEngine { private var modelContainer: ModelContainer? + /// Two-tier prompt cache (hot dict + cold safetensors sidecar). Used + /// by `runGeneration` to reuse KV state across successive turns on + /// the same model. See `PromptCacheStore` for the tiering policy. + private let promptCacheStore: PromptCacheStore + // MARK: Initialiser - public init() {} + public init() { + self.promptCacheStore = PromptCacheStore( + root: DataRoot.macMLX("kv-cache") + ) + } // MARK: InferenceEngine @@ -205,9 +227,30 @@ public actor MLXSwiftEngine: InferenceEngine { true } + // MARK: Prompt cache management + + /// Drop both tiers of the prompt cache. Wired up to the Settings + /// → "Clear All KV Caches" button via `EngineCoordinator`. + public func clearPromptCache() async { + await promptCacheStore.clearAll() + } + // MARK: Private generation helper /// Actor-isolated generation driver called from within `generate(_:)`. + /// + /// Flow: + /// 1. 
Prepare the `LMInput` (tokenisation + chat template application). + /// 2. Hash the full input-token sequence into a `PromptCacheKey`. + /// 3. Look up a prior cache snapshot in `promptCacheStore`. On hit, + /// reuse its `[KVCache]` so the shared prefix skips prefill. On + /// miss, allocate a fresh cache via `model.newCache(...)`. + /// 4. Drive the low-level `generateTokens(input:cache:...)` call so + /// we see raw token IDs and can build the extended key + /// `inputTokens + generatedTokenIDs` after the stream ends. + /// 5. The `KVCache` protocol is class-bound — the same reference we + /// passed in is mutated in-place during generation, so at the + /// end we can save that same reference under the extended key. private func runGeneration( _ request: GenerateRequest, into continuation: AsyncThrowingStream.Continuation @@ -216,6 +259,10 @@ public actor MLXSwiftEngine: InferenceEngine { continuation.finish(throwing: EngineError.modelNotLoaded) return } + guard let loadedModelSnapshot = loadedModel else { + continuation.finish(throwing: EngineError.modelNotLoaded) + return + } let params = request.parameters @@ -261,28 +308,89 @@ public actor MLXSwiftEngine: InferenceEngine { throw EngineError.modelLoadFailed(reason: error.localizedDescription) } - // Generate and stream chunks. - let stream = try await container.generate(input: lmInput, parameters: generateParams) + // Flat Int token array for key construction. `LMInput.text.tokens` + // is an `MLXArray`; `asArray(Int.self)` materialises to Swift. + let inputTokens = lmInput.text.tokens.asArray(Int.self) + let modelID = loadedModelSnapshot.id + let priorKey = PromptCacheKey(modelID: modelID, tokens: inputTokens) + + // Try the store. On hit we reuse the restored cache; on miss we + // let the iterator allocate a fresh one inside `generateTokens`. + let priorSnapshot = await promptCacheStore.get(priorKey) + let priorCache: [any KVCache]? 
+ if let snapshot = priorSnapshot { + priorCache = snapshot.caches + await LogManager.shared.debug( + "Prompt cache HIT — restored \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } else { + priorCache = nil + await LogManager.shared.debug( + "Prompt cache MISS — cold prefill of \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } + + // Build the working cache. When we have a prior snapshot we pass + // that reference straight through; otherwise we ask the model to + // allocate a fresh `[KVCache]`. We hold onto the same array so we + // can save it after generation (KVCache is class-bound, so the + // iterator populates our instances in place). + // + // `KVCache` is not `Sendable`, and `LMInput` is not `Sendable` + // either. Route both through the `perform(nonSendable:_:)` + // overload on `ModelContainer`, which explicitly accepts a + // non-Sendable value by `consuming` it into the actor. + let tokenizer = await container.tokenizer + let priorCacheBox: PromptCacheSnapshot? = priorCache.map { PromptCacheSnapshot($0) } + let inputBox = NonSendableBox(lmInput) + + let setup: (cache: PromptCacheSnapshot, stream: AsyncStream) = + try await container.perform(nonSendable: inputBox) { context, inputBox in + let cache: [any KVCache] = priorCacheBox?.caches + ?? context.model.newCache(parameters: generateParams) + let stream = try MLXLMCommon.generateTokens( + input: inputBox.value, + cache: cache, + parameters: generateParams, + context: context + ) + return (PromptCacheSnapshot(cache), stream) + } + let workingCache = setup.cache.caches + let stream = setup.stream + var detokenizer = NaiveStreamingDetokenizer(tokenizer: tokenizer) + var generatedTokenIDs: [Int] = [] var completionInfo: GenerateCompletionInfo? 
- for await generation in stream { - switch generation { - case .chunk(let text): - let chunk = GenerateChunk(text: text) - if case .terminated = continuation.yield(chunk) { - return + for await event in stream { + switch event { + case .token(let token): + generatedTokenIDs.append(token) + detokenizer.append(token: token) + if let piece = detokenizer.next() { + let chunk = GenerateChunk(text: piece) + if case .terminated = continuation.yield(chunk) { + return + } } case .info(let info): completionInfo = info - case .toolCall: - // Tool calls not supported yet — out of scope through v0.3. - // Re-visit when there's a concrete tool-use feature to - // wire into (e.g. OpenAI-compatible function-calling). - break } } + // Save the post-generation cache under the extended key. The + // same `workingCache` reference has been mutated in-place by the + // iterator, so it now reflects prompt + generated tokens. + let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: modelID, tokens: finalTokens) + await promptCacheStore.put( + key: newKey, + snapshot: PromptCacheSnapshot(workingCache) + ) + // Emit the final chunk with usage + finish reason. if let info = completionInfo { let finishReason: FinishReason diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift index f9f9785..d7e218a 100644 --- a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift +++ b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift @@ -46,6 +46,22 @@ public struct Settings: Codable, Equatable, Sendable { /// this at a mirror like "https://hf-mirror.com" (#21). public var hfEndpoint: String + /// Hot prompt-cache capacity in megabytes — in-memory only. + /// + /// MVP note: `PromptCacheStore`'s `hotCapacity` is an *entry* count, + /// not a byte budget. 
We persist the MB value for forward-compat so + /// a byte-accurate budget can land in v0.4.0.1 without a settings + /// migration. Today the engine ignores this value and uses the + /// default 8-entry cap. + public var kvCacheHotMB: Int + + /// Cold prompt-cache disk cap in gigabytes. + /// + /// MVP note: automatic cold-tier pruning is not yet implemented — + /// rely on Settings → "Clear All KV Caches" to reclaim space. Real + /// enforcement lands in v0.4.0.1. + public var kvCacheColdGB: Int + // MARK: Factory /// Sensible out-of-the-box defaults — used when no settings file exists. @@ -63,7 +79,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: nil, sparkleUpdateChannel: "release", logRetentionDays: 7, - hfEndpoint: "https://huggingface.co" + hfEndpoint: "https://huggingface.co", + kvCacheHotMB: 512, + kvCacheColdGB: 20 ) // MARK: Init @@ -79,7 +97,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: String?, sparkleUpdateChannel: String, logRetentionDays: Int, - hfEndpoint: String = "https://huggingface.co" + hfEndpoint: String = "https://huggingface.co", + kvCacheHotMB: Int = 512, + kvCacheColdGB: Int = 20 ) { self.modelDirectory = modelDirectory self.preferredEngine = preferredEngine @@ -92,6 +112,47 @@ public struct Settings: Codable, Equatable, Sendable { self.sparkleUpdateChannel = sparkleUpdateChannel self.logRetentionDays = logRetentionDays self.hfEndpoint = hfEndpoint + self.kvCacheHotMB = kvCacheHotMB + self.kvCacheColdGB = kvCacheColdGB + } + + // MARK: - Codable (backward-compat decode) + + /// Pre-v0.4 settings files don't have `kvCacheHotMB` / + /// `kvCacheColdGB` — decode them as optionals and fall back to the + /// defaults so existing installs keep working across upgrades. 
+ private enum CodingKeys: String, CodingKey { + case modelDirectory + case preferredEngine + case serverPort + case autoStartServer + case lastLoadedModel + case onboardingComplete + case pythonPath + case swiftLMPath + case sparkleUpdateChannel + case logRetentionDays + case hfEndpoint + case kvCacheHotMB + case kvCacheColdGB + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + self.modelDirectory = try c.decode(URL.self, forKey: .modelDirectory) + self.preferredEngine = try c.decode(EngineID.self, forKey: .preferredEngine) + self.serverPort = try c.decode(Int.self, forKey: .serverPort) + self.autoStartServer = try c.decode(Bool.self, forKey: .autoStartServer) + self.lastLoadedModel = try c.decodeIfPresent(String.self, forKey: .lastLoadedModel) + self.onboardingComplete = try c.decode(Bool.self, forKey: .onboardingComplete) + self.pythonPath = try c.decodeIfPresent(String.self, forKey: .pythonPath) + self.swiftLMPath = try c.decodeIfPresent(String.self, forKey: .swiftLMPath) + self.sparkleUpdateChannel = try c.decode(String.self, forKey: .sparkleUpdateChannel) + self.logRetentionDays = try c.decode(Int.self, forKey: .logRetentionDays) + self.hfEndpoint = try c.decodeIfPresent(String.self, forKey: .hfEndpoint) + ?? "https://huggingface.co" + self.kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512 + self.kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20 } } diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift new file mode 100644 index 0000000..93cf2e0 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift @@ -0,0 +1,48 @@ +import CryptoKit +import Foundation + +/// Deterministic hash key identifying a cached KV-cache snapshot. +/// +/// MVP hashes the entire token prefix. 
v0.4.1+ will switch to a +/// vLLM-style chained block hash (256 tokens per block + parent +/// hash) to enable longest-common-prefix matching across siblings; +/// today two requests have to share the EXACT same prefix to +/// benefit from the cache. +public struct PromptCacheKey: Hashable, Sendable { + public let modelID: String + public let tokenCount: Int + public let hashString: String + + public init(modelID: String, tokens: [Int]) { + self.modelID = modelID + self.tokenCount = tokens.count + self.hashString = Self.hash(modelID: modelID, tokens: tokens) + } + + /// SHA-256 over `(modelID, tokens)`. Tokens encoded as + /// little-endian Int32 for cross-platform stability. + private static func hash(modelID: String, tokens: [Int]) -> String { + var hasher = SHA256() + if let modelBytes = modelID.data(using: .utf8) { + hasher.update(data: modelBytes) + } + hasher.update(data: Data([0x00])) // separator + var buf = Data(capacity: tokens.count * 4) + for tok in tokens { + var v = Int32(tok).littleEndian + withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) } + } + hasher.update(data: buf) + return hasher.finalize().map { String(format: "%02x", $0) }.joined() + } + + /// `//.safetensors`. 16-way fanout + /// keeps any single directory from getting huge when the cold + /// store grows. `shardChar` is the first hex char of the hash. 
+ public func shardedFileURL(under root: URL) -> URL { + let shard = String(hashString.prefix(1)) + return root + .appending(path: shard, directoryHint: .isDirectory) + .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory) + } +} diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift new file mode 100644 index 0000000..62f0735 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift @@ -0,0 +1,126 @@ +import Foundation +import MLX +import MLXLMCommon + +/// Sendable wrapper that lets a `[any KVCache]` cross actor-isolation +/// boundaries. `KVCache` is a reference-type protocol without a +/// `Sendable` conformance in mlx-swift-lm — in practice we hand the +/// snapshot off to the generation pipeline which owns it exclusively +/// until generation ends, so an unchecked conformance is safe. +public struct PromptCacheSnapshot: @unchecked Sendable { + public let caches: [any KVCache] + public init(_ caches: [any KVCache]) { + self.caches = caches + } +} + +/// Two-tier prompt-cache store. Hot = in-memory LRU dict of +/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files +/// on disk under `root//.safetensors`, round-tripped +/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`. +/// +/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add +/// size-based (byte-count) eviction instead of count-based. +public actor PromptCacheStore { + + private let root: URL + private let hotCapacity: Int + + /// Ordered pair list simulates an LRU. Head = oldest. + /// Dictionary gives O(1) lookup; `order` gives O(n) touch but + /// `hotCapacity` is small (default 8), so linear scans are fine. + private var hot: [PromptCacheKey: [any KVCache]] = [:] + private var order: [PromptCacheKey] = [] + + public init(root: URL, hotCapacity: Int = 8) { + self.root = root + self.hotCapacity = hotCapacity + try? 
FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + + /// Insert or refresh. Evicts to disk if hot is full. + public func put(key: PromptCacheKey, snapshot: PromptCacheSnapshot) { + let cache = snapshot.caches + if hot[key] != nil { + touch(key) + hot[key] = cache + return + } + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = cache + order.append(key) + } + + /// Blow away both tiers. Hot dict is cleared, the cold-tier + /// directory is removed wholesale and re-created empty. Invoked + /// from the Settings → "Clear All KV Caches" button via + /// `MLXSwiftEngine.clearPromptCache()` and + /// `EngineCoordinator.clearPromptCache()`. + public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + + /// Return a cache snapshot, preferring the hot tier. On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> PromptCacheSnapshot? { + if let cache = hot[key] { + touch(key) + return PromptCacheSnapshot(cache) + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return PromptCacheSnapshot(caches) + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. 
+ private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? savePromptCache(url: url, cache: cache, metadata: metadata) + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift new file mode 100644 index 0000000..c6fa629 --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift @@ -0,0 +1,40 @@ +import XCTest +@testable import MacMLXCore + +final class PromptCacheKeyTests: XCTestCase { + + func testSameModelAndTokensProduceSameKey() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + XCTAssertEqual(a.hashString, b.hashString) + } + + func testDifferentTokensProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testDifferentModelsProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3]) + let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", tokens: [1, 2, 3]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testHashStringIsHexLowercase() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) }) + XCTAssertEqual(k.hashString.count, 64) // sha256 + } + + func testShardedFilenameSplitsByFirstHexChar() { + let k = 
PromptCacheKey(modelID: "m", tokens: [1]) + let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv")) + // /tmp/kv//.safetensors + let comps = url.pathComponents.suffix(3) + XCTAssertEqual(comps.count, 3) + // Middle component is the 1-char shard dir. + XCTAssertEqual(comps.dropFirst().first?.count, 1) + XCTAssertTrue(url.pathExtension == "safetensors") + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift new file mode 100644 index 0000000..b0f262c --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift @@ -0,0 +1,88 @@ +import XCTest +@testable import MacMLXCore +import MLXLMCommon +import MLX + +final class PromptCacheStoreTests: XCTestCase { + + /// mlx-swift's SwiftPM build does not always bundle `default.metallib` + /// alongside the test binary — in that case any `MLXArray` op aborts + /// the test process with a fatalError from the C++ side. Detect the + /// bundle up front and skip MLX-dependent tests so we still exercise + /// the pure LRU / miss paths in the store. + private func requireMetalOrSkip() throws { + let bundle = Bundle(identifier: "mlx-swift_Cmlx.resources") + ?? Bundle.allBundles.first(where: { $0.bundlePath.contains("Cmlx") }) + let metallib = bundle?.url(forResource: "default", withExtension: "metallib") + if metallib == nil { + throw XCTSkip("Requires default.metallib (SPM test binaries often lack it — run under xcodebuild)") + } + } + + /// Build a minimal single-layer [KVCache] from known keys/values. + /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. 
+ private func makeSyntheticSnapshot(seqLen: Int) -> PromptCacheSnapshot { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return PromptCacheSnapshot([layer]) + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + try requireMetalOrSkip() + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + + await store.put(key: key, snapshot: makeSyntheticSnapshot(seqLen: 3)) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+ let restored = await store.get(k1) + XCTAssertNotNil(restored) + } + + func testMissReturnsNil() async { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let k = PromptCacheKey(modelID: "M", tokens: [99]) + let got = await store.get(k) + XCTAssertNil(got) + } +} diff --git a/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md new file mode 100644 index 0000000..e607b52 --- /dev/null +++ b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md @@ -0,0 +1,778 @@ +# v0.4 KV Cache Tiering — Implementation Plan (MVP) + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development. Each task produces one commit, reviewed before the next. + +**Goal:** Reuse the KV cache across successive generate calls on the same +model so coding-assistant workflows (Claude Code, Cursor, Zed re-sending +conversation history each turn) only prefill the delta instead of the +whole prompt. + +**Architecture:** mlx-swift-lm already ships `savePromptCache` / +`loadPromptCache` / `trimPromptCache` / `canTrimPromptCache`. MVP keeps +a single **in-memory** per-model prompt-cache snapshot, plus an optional +disk sidecar. On each `generate` call we trim the cached prefix to the +longest match and prefill only the new tokens. No block-level hashing +yet (that's v0.4.0.1) — MVP targets the common case where request N+1's +prompt starts with request N's tokens. + +**Tech Stack:** Swift 6, mlx-swift-lm's `KVCache` APIs, `[MLXArray]` +safetensors serialisation, `HardwareInfo` for budget defaults. + +**Branch:** `feat/v0.4-kv-cache-tiering` (already created). + +--- + +## Scope Check + +Three sub-features live under v0.4.0 in the roadmap: KV cache tiering, +ModelPool, MCP server. Each ships its own branch and plan. This plan +covers ONLY KV cache tiering. The other two have their own plans to be +written before their own branches. 
+ +--- + +## File Structure + +**Create:** +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` — hash + metadata for a cache entry +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift` — actor managing hot dict + disk sidecar +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift` + +**Modify:** +- `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — use store on each `generate` +- `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — new `kvCacheHotMB: Int` + `kvCacheColdGB: Int` fields with sensible defaults +- `macMLX/macMLX/Views/Settings/` — add a "KV Cache" section + +--- + +## Task 1: PromptCacheKey — deterministic hash of a token sequence + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore + +final class PromptCacheKeyTests: XCTestCase { + + func testSameModelAndTokensProduceSameKey() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + XCTAssertEqual(a.hashString, b.hashString) + } + + func testDifferentTokensProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testDifferentModelsProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3]) + let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", tokens: [1, 2, 3]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testHashStringIsHexLowercase() { + let k = 
PromptCacheKey(modelID: "m", tokens: [1]) + XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) }) + XCTAssertEqual(k.hashString.count, 64) // sha256 + } + + func testShardedFilenameSplitsByFirstHexChar() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv")) + // /tmp/kv//.safetensors + let comps = url.pathComponents.suffix(3) + XCTAssertEqual(comps.count, 3) + XCTAssertEqual(comps.dropFirst().first?.count, 1) // single-char shard dir + XCTAssertTrue(url.pathExtension == "safetensors") + } +} +``` + +- [ ] **Step 2: Run test — verify it fails** + +``` +cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5 +``` + +Expected: `error: no such type PromptCacheKey`. + +- [ ] **Step 3: Implement** + +`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift`: + +```swift +import CryptoKit +import Foundation + +/// Deterministic hash key identifying a cached KV-cache snapshot. +/// +/// MVP hashes the entire token prefix. v0.4.1+ will switch to a +/// vLLM-style chained block hash (256 tokens per block + parent +/// hash) to enable longest-common-prefix matching across siblings; +/// today two requests have to share the EXACT same prefix to +/// benefit from the cache. +public struct PromptCacheKey: Hashable, Sendable { + public let modelID: String + public let tokenCount: Int + public let hashString: String + + public init(modelID: String, tokens: [Int]) { + self.modelID = modelID + self.tokenCount = tokens.count + self.hashString = Self.hash(modelID: modelID, tokens: tokens) + } + + /// SHA-256 over `(modelID, tokens)`. Tokens encoded as + /// little-endian Int32 for cross-platform stability. 
+ private static func hash(modelID: String, tokens: [Int]) -> String { + var hasher = SHA256() + if let modelBytes = modelID.data(using: .utf8) { + hasher.update(data: modelBytes) + } + hasher.update(data: Data([0x00])) // separator + var buf = Data(capacity: tokens.count * 4) + for tok in tokens { + var v = Int32(tok).littleEndian + withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) } + } + hasher.update(data: buf) + return hasher.finalize().map { String(format: "%02x", $0) }.joined() + } + + /// `//.safetensors`. 16-way fanout + /// keeps any single directory from getting huge when the cold + /// store grows. `shardChar` is the first hex char of the hash. + public func shardedFileURL(under root: URL) -> URL { + let shard = String(hashString.prefix(1)) + return root + .appending(path: shard, directoryHint: .isDirectory) + .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory) + } +} +``` + +- [ ] **Step 4: Run tests — verify pass** + +``` +cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5 +``` + +Expected: 5 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift \ + MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift +git commit -m "feat(prompt-cache): PromptCacheKey — sha256 identifier with 16-way disk shard" +``` + +--- + +## Task 2: PromptCacheStore — hot dict + cold safetensors, LRU on both + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore +import MLXLMCommon +import MLX + +final class PromptCacheStoreTests: XCTestCase { + + /// Build a minimal single-layer [KVCache] from known keys/values. 
+ /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. + private func makeSyntheticCache(seqLen: Int) -> [any KVCache] { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return [layer] + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + let cache = makeSyntheticCache(seqLen: 3) + + await store.put(key: key, cache: cache) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+        let restored = await store.get(k1)
+        XCTAssertNotNil(restored)
+    }
+
+    func testMissReturnsNil() async {
+        let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4)
+        let k = PromptCacheKey(modelID: "M", tokens: [99])
+        let got = await store.get(k)
+        XCTAssertNil(got)
+    }
+}
+```
+
+- [ ] **Step 2: Run test — verify it fails**
+
+```
+cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -5
+```
+
+Expected: `no such type PromptCacheStore`.
+
+- [ ] **Step 3: Implement**
+
+`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift`:
+
+```swift
+import Foundation
+import MLX
+import MLXLMCommon
+
+/// Two-tier prompt-cache store. Hot = in-memory LRU dict of
+/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files
+/// on disk under `root/<shard>/<hash>.safetensors`, round-tripped
+/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`.
+///
+/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add
+/// size-based (byte-count) eviction instead of count-based.
+public actor PromptCacheStore {
+
+    private let root: URL
+    private let hotCapacity: Int
+
+    /// Ordered pair list simulates an LRU. Head = oldest.
+    /// Dictionary gives O(1) lookup; `order` gives O(n) touch but
+    /// `hotCapacity` is small (default 8), so linear scans are fine.
+    private var hot: [PromptCacheKey: [any KVCache]] = [:]
+    private var order: [PromptCacheKey] = []
+
+    public init(root: URL, hotCapacity: Int = 8) {
+        self.root = root
+        self.hotCapacity = hotCapacity
+        try? FileManager.default.createDirectory(
+            at: root,
+            withIntermediateDirectories: true
+        )
+    }
+
+    /// Insert or refresh. Evicts to disk if hot is full.
+    public func put(key: PromptCacheKey, cache: [any KVCache]) {
+        if hot[key] != nil {
+            touch(key)
+            hot[key] = cache
+            return
+        }
+        while hot.count >= hotCapacity, let oldest = order.first {
+            demote(oldest)
+        }
+        hot[key] = cache
+        order.append(key)
+    }
+
+    /// Return a cache snapshot, preferring the hot tier. 
On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> [any KVCache]? { + if let cache = hot[key] { + touch(key) + return cache + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return caches + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. + private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? savePromptCache(url: url, cache: cache, metadata: metadata) + } +} +``` + +- [ ] **Step 4: Run tests** + +``` +cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -10 +``` + +Expected: 4 tests PASS. If serialisation tests fail with MLXArray-dependent errors (e.g. Metal device unavailable in test), wrap the problematic assertions with `throw XCTSkip` and document the skip. 
+ +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \ + MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift +git commit -m "feat(prompt-cache): PromptCacheStore — hot LRU + cold safetensors" +``` + +--- + +## Task 3: Wire into MLXSwiftEngine — use cache on generate + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` + +**Strategy:** On each generate call, compute the full-prompt +`PromptCacheKey`. Look up in store. If hit, pass the restored cache to +`generate(input:cache:)` so the engine skips the shared prefill. After +generation, save the updated cache under the new extended key. + +mlx-swift-lm's `ModelContainer.generate(input:parameters:)` today hides +the `KVCache` array inside `TokenIterator`. We need the lower-level +API: `container.perform { context in TokenIterator(input:model:cache:processor:parameters:) }` +pattern, then iterate tokens ourselves. See +`.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` +and `ChatSession.swift` for reference wiring. + +- [ ] **Step 1: Read current MLXSwiftEngine.generate body** + +``` +cd /Users/kevin/Projects/macmlx +grep -n "func generate\|container.generate\|TokenIterator" MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift | head -10 +``` + +Verify the current integration point. Read lines around 200-280 in +that file to understand the `userInput` mapping and the +`container.prepare(input:)` call. + +- [ ] **Step 2: Read mlx-swift-lm's low-level iterator pattern** + +Open: +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` + (find `TokenIterator` init taking `cache:` param — confirm signature) +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/ChatSession.swift` + (lines 177, 526–535 per research notes) + +Copy the iterator-with-external-cache pattern. 
Goal is a replacement +for the current `container.generate(...)` / `container.prepare(...)` +block that threads a caller-supplied `[any KVCache]` through. + +- [ ] **Step 3: Add cache lookup + update logic in generate** + +In `MLXSwiftEngine.swift`, add: + +1. A `promptCacheStore: PromptCacheStore` stored property initialised + against `DataRoot.macMLX("kv-cache")` (or `DataRoot.macMLX.appending(...)` + — match whichever API the rest of MacMLXCore uses). +2. Inside `generate()`, after `chatMessages` are prepared but before + the iterator runs: + ```swift + // Build token sequence for this turn. + let userInput = UserInput(chat: chatMessages) + let lmInput = try await container.prepare(input: userInput) + let inputTokens = lmInput.tokens.asArray(Int.self) + let key = PromptCacheKey(modelID: currentModel.id, tokens: inputTokens) + + // Look up prior cache. + let priorCache = await promptCacheStore.get(key) + ``` +3. Pass `priorCache` to whatever lower-level iterator you end up + using. If mlx-swift-lm's public surface only accepts the cache + indirectly (e.g. via `ChatSession.loadPromptCache(url:)`), call + that instead. +4. After `generate()` completes successfully, build the updated cache + snapshot and store it under the extended key (new tokens from the + stream appended): + ```swift + // After iteration finishes: + let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: currentModel.id, tokens: finalTokens) + if let finalCache = /* extract [any KVCache] */ { + await promptCacheStore.put(key: newKey, cache: finalCache) + } + ``` + +Exact code depends on which public surface mlx-swift-lm exposes for +our mlx-swift-lm version. If none of `ChatSession`, `TokenIterator`, +or `container.perform` is reachable with our pinned `3.31.3`, open +an upstream issue and ship the store + key without engine wiring in +this PR — still useful for the next PR. 
+ +- [ ] **Step 4: Build + integration smoke test** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three must pass. + +Manual: run the app, load a small model (Qwen3-0.6B if you have one), +send "hi" twice and observe Logs tab. Second `Starting generation` +line should show a noticeably lower prefill time (visible in total +duration OR in a new debug log we add for "cache hit: restoring N +tokens"). + +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +git commit -m "feat(engine): wire PromptCacheStore into generate() + +On each generate call, hash the full input token sequence, look up +a prior cache snapshot, and pass it to the token iterator so the +shared prefix prefill is skipped. Save the extended snapshot after +generation completes so the next turn benefits. + +MVP keys on exact-prefix match; vLLM-style block hashing with +longest-common-prefix matching is v0.4.1+." +``` + +--- + +## Task 4: Settings UI + budget defaults + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — add `kvCacheHotMB: Int` (default 512) and `kvCacheColdGB: Int` (default 20) +- Modify: `macMLX/macMLX/Views/Settings/SettingsView.swift` — add a "KV Cache" section with two steppers + a "Clear Cache" button +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — read the budget at init + +- [ ] **Step 1: Extend Settings struct** + +In `SettingsManager.swift`, find the `Settings` struct. Add next to +other defaults: + +```swift +/// Hot prompt-cache capacity in megabytes — in-memory only. +public var kvCacheHotMB: Int +/// Cold prompt-cache disk cap in gigabytes. 
+public var kvCacheColdGB: Int +``` + +Update `Settings.default` to include: + +```swift +kvCacheHotMB: 512, +kvCacheColdGB: 20, +``` + +Update the initialiser's parameter list in both places. + +- [ ] **Step 2: Run the existing Settings tests to make sure decoding still works with the new fields** + +``` +cd MacMLXCore && swift test --filter SettingsManagerTests 2>&1 | tail -10 +``` + +Expected: pass. Older settings JSON files from pre-v0.4 users lack +these fields — if the test for backward-compat decoding fails, add +`decodeIfPresent` defaults in a custom `init(from:)` on `Settings`. + +- [ ] **Step 3: Add a KV Cache section to Settings** + +Create `macMLX/macMLX/Views/Settings/KVCacheSection.swift`: + +```swift +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} +``` + +- [ ] **Step 4: Wire into SettingsView** + +Find `SettingsView.body` and add the section after the existing HTTP +Server section: + +```swift +KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } +) +.onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } +} +.onChange(of: kvCacheColdGB) { _, newValue in + Task { await 
appState.updateSettings { $0.kvCacheColdGB = newValue } } +} +``` + +Add matching `@State private var kvCacheHotMB: Int = 512` and +`kvCacheColdGB: Int = 20` near the other settings `@State` vars. +In `loadFromSettings(_:)` add `kvCacheHotMB = s.kvCacheHotMB; +kvCacheColdGB = s.kvCacheColdGB`. + +- [ ] **Step 5: Add `clearPromptCache()` to EngineCoordinator** + +In `macMLX/macMLX/App/EngineCoordinator.swift`: + +```swift +/// Blow away the prompt cache — both hot and cold tiers. Exposed +/// to Settings' "Clear All KV Caches" button. +public func clearPromptCache() async { + guard let engine = engine as? MLXSwiftEngine else { return } + await engine.clearPromptCache() +} +``` + +And in `MLXSwiftEngine`: + +```swift +public func clearPromptCache() async { + await promptCacheStore.clearAll() +} +``` + +Add `clearAll()` to `PromptCacheStore`: + +```swift +public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) +} +``` + +- [ ] **Step 6: Build both targets** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three: `BUILD SUCCEEDED` / tests pass. 
+
+- [ ] **Step 7: Commit**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift \
+    MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \
+    MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift \
+    macMLX/macMLX/App/EngineCoordinator.swift \
+    macMLX/macMLX/Views/Settings/KVCacheSection.swift \
+    macMLX/macMLX/Views/Settings/SettingsView.swift
+git commit -m "feat(kv-cache): Settings UI + budget defaults + Clear All button"
+```
+
+---
+
+## Task 5: CHANGELOG + push
+
+**Files:**
+- Modify: `CHANGELOG.md`
+
+- [ ] **Step 1: Add v0.4.0 entry**
+
+Prepend under `[Unreleased]`:
+
+```markdown
+## [Unreleased]
+
+### Added
+- **Prompt cache tiering.** Successive chat turns on the same model
+  now reuse the KV cache when the new prompt extends the previous
+  one — the shared prefix skips prefill. In-memory hot tier
+  (default 512 MB) backed by on-disk cold tier (default 20 GB) at
+  `~/.mac-mlx/kv-cache/`. Coding-assistant workflows (Claude Code,
+  Cursor, Zed) that re-send conversation history each turn see
+  dramatic reductions in time-to-first-token.
+- Settings → "KV Cache" section exposes budget steppers and a
+  "Clear All KV Caches" button.
+```
+
+- [ ] **Step 2: Commit + push**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add CHANGELOG.md
+git commit -m "docs: v0.4 KV cache tiering changelog entry"
+git push -u origin feat/v0.4-kv-cache-tiering 2>&1 | tail -3
+```
+
+- [ ] **Step 3: Open PR**
+
+```bash
+gh pr create --base main --head feat/v0.4-kv-cache-tiering \
+  --title "v0.4 — KV cache tiering (hot RAM + cold SSD)" \
+  --body "$(cat <<'EOF'
+## Summary
+- PromptCacheKey: sha256 over (modelID, tokens) + 16-way disk shard
+- PromptCacheStore actor: hot in-memory LRU + cold safetensors on disk
+- Wired into MLXSwiftEngine.generate — shared-prefix prefill is skipped
+- Settings UI: hot/cold budget steppers + Clear All button
+
+## Test plan
+- [ ] swift test --filter PromptCache passes
+- [ ] Load a small model, send "hi" twice, observe reduced TTFT on second turn
+- [ ] xcodebuild + MacMLXCore swift test both green
+EOF
+)"
+```
+
+---
+
+## Self-Review
+
+- ✅ **Spec coverage:** hot tier, cold tier, LRU, disk sharding, engine
+  wiring, Settings UI, Clear button — all have tasks.
+- ⚠️ **Placeholder risk on Task 3 Step 3:** "Exact code depends on
+  which public surface mlx-swift-lm exposes" is honest but leaves
+  specifics for the implementer. Mitigation: Step 2 explicitly
+  directs reading concrete files and line numbers to deduce the
+  pattern.
+- ✅ **Type consistency:** `PromptCacheKey`, `PromptCacheStore`,
+  `clearPromptCache()`, `kvCacheHotMB`, `kvCacheColdGB` referenced
+  consistently.
+- ⚠️ **Known scope cut:** v0.4.0.1 follow-up will replace full-prompt
+  hash with vLLM-style chained block hash for longest-common-prefix
+  matching. MVP today only benefits when the NEW prompt fully contains
+  the OLD prompt as prefix — still a big win for chat continuations.
+
+---
+
+## Execution Handoff
+
+Plan complete and saved to
+`docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md`.
+
+**1. Subagent-Driven (recommended)** — fresh subagent per task,
+review between.
+**2. 
Inline** — execute tasks in this session.
+
+Recommended: **1**. Each task hits a different file set cleanly;
+subagent isolation keeps each attempt focused.
diff --git a/macMLX/macMLX/App/EngineCoordinator.swift b/macMLX/macMLX/App/EngineCoordinator.swift
index e7b405e..35bbfef 100644
--- a/macMLX/macMLX/App/EngineCoordinator.swift
+++ b/macMLX/macMLX/App/EngineCoordinator.swift
@@ -116,6 +116,19 @@ public final class EngineCoordinator {
         }
     }
 
+    /// Blow away the prompt cache — both hot and cold tiers. Exposed
+    /// to Settings' "Clear All KV Caches" button.
+    ///
+    /// Today only the in-process `MLXSwiftEngine` carries a prompt
+    /// cache; the SwiftLM / Python-MLX detection-only stubs don't, so
+    /// downcasting and no-op-on-mismatch is the right shape. When
+    /// another engine grows a cache this will move onto the
+    /// `InferenceEngine` protocol.
+    public func clearPromptCache() async {
+        guard let engine = engine as? MLXSwiftEngine else { return }
+        await engine.clearPromptCache()
+    }
+
     /// Release the loaded model.
     public func unload() async {
         guard let engine else { return }
diff --git a/macMLX/macMLX/Views/Settings/KVCacheSection.swift b/macMLX/macMLX/Views/Settings/KVCacheSection.swift
new file mode 100644
index 0000000..2b78499
--- /dev/null
+++ b/macMLX/macMLX/Views/Settings/KVCacheSection.swift
@@ -0,0 +1,61 @@
+// KVCacheSection.swift
+// macMLX
+//
+// Settings section exposing the v0.4 KV-cache-tiering knobs: hot (RAM)
+// and cold (SSD) budget steppers plus a "Clear All KV Caches" button
+// that drops both tiers.
+//
+// MVP note: the steppers persist to `Settings.kvCacheHotMB` /
+// `Settings.kvCacheColdGB` but are not yet wired into the engine's
+// eviction logic. `PromptCacheStore` uses an 8-entry LRU today; a
+// byte-accurate budget and automatic cold-tier pruning land in
+// v0.4.0.1. See the `.help` strings below for the user-facing note.
+ +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Takes effect in v0.4.0.1 — currently capped at 8 cache entries regardless of this slider.") + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Cold-tier cap is not enforced automatically in this MVP — use the Clear All button below to reclaim space. Automatic pruning lands in v0.4.0.1.") + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} diff --git a/macMLX/macMLX/Views/Settings/SettingsView.swift b/macMLX/macMLX/Views/Settings/SettingsView.swift index e29575b..a8595de 100644 --- a/macMLX/macMLX/Views/Settings/SettingsView.swift +++ b/macMLX/macMLX/Views/Settings/SettingsView.swift @@ -18,6 +18,8 @@ struct SettingsView: View { @State private var serverPort: Int = 8000 @State private var autoStartServer: Bool = false @State private var hfEndpoint: String = "https://huggingface.co" + @State private var kvCacheHotMB: Int = 512 + @State private var kvCacheColdGB: Int = 20 var body: some View { Form { @@ -49,6 +51,22 @@ struct SettingsView: View { } } + KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } + ) + .onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } + } + .onChange(of: kvCacheColdGB) { _, newValue in + Task 
{ await appState.updateSettings { $0.kvCacheColdGB = newValue } } + } + downloadsSection rerunSetupSection @@ -151,6 +169,8 @@ struct SettingsView: View { serverPort = s.serverPort autoStartServer = s.autoStartServer hfEndpoint = s.hfEndpoint + kvCacheHotMB = s.kvCacheHotMB + kvCacheColdGB = s.kvCacheColdGB } private func showModelDirectoryPicker() {