From 6ee4f050ac925290f8155a8b6f2b808e67846dfb Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:34:59 +0700 Subject: [PATCH 1/6] docs: v0.4 KV cache tiering MVP plan 5 tasks: PromptCacheKey, PromptCacheStore (hot+cold LRU), engine wiring, Settings UI, CHANGELOG. MVP uses full-prompt hash; block- level longest-common-prefix matching deferred to v0.4.0.1. --- .../plans/2026-04-18-v0.4-kv-cache-tiering.md | 778 ++++++++++++++++++ 1 file changed, 778 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md diff --git a/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md new file mode 100644 index 0000000..e607b52 --- /dev/null +++ b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md @@ -0,0 +1,778 @@ +# v0.4 KV Cache Tiering — Implementation Plan (MVP) + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development. Each task produces one commit, reviewed before the next. + +**Goal:** Reuse the KV cache across successive generate calls on the same +model so coding-assistant workflows (Claude Code, Cursor, Zed re-sending +conversation history each turn) only prefill the delta instead of the +whole prompt. + +**Architecture:** mlx-swift-lm already ships `savePromptCache` / +`loadPromptCache` / `trimPromptCache` / `canTrimPromptCache`. MVP keeps +a single **in-memory** per-model prompt-cache snapshot, plus an optional +disk sidecar. On each `generate` call we trim the cached prefix to the +longest match and prefill only the new tokens. No block-level hashing +yet (that's v0.4.0.1) — MVP targets the common case where request N+1's +prompt starts with request N's tokens. + +**Tech Stack:** Swift 6, mlx-swift-lm's `KVCache` APIs, `[MLXArray]` +safetensors serialisation, `HardwareInfo` for budget defaults. + +**Branch:** `feat/v0.4-kv-cache-tiering` (already created). 
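+
+The per-turn flow the tasks below implement, as a sketch — the
+prefix-matching logic here is illustrative (not an existing API);
+`trimPromptCache` is the real mlx-swift-lm helper named above:
+
+```swift
+// Turn N+1 re-sends turn N's prompt plus a delta. Find how much of
+// the cached token sequence the new prompt shares, trim the KV cache
+// to that length (via trimPromptCache), and prefill only the rest.
+func prefillPlan(cachedTokens: [Int], newTokens: [Int]) -> (reuse: Int, delta: ArraySlice<Int>) {
+    var n = 0
+    while n < min(cachedTokens.count, newTokens.count),
+          cachedTokens[n] == newTokens[n] {
+        n += 1
+    }
+    return (reuse: n, delta: newTokens[n...])
+}
+```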
+ +--- + +## Scope Check + +Three sub-features live under v0.4.0 in the roadmap: KV cache tiering, +ModelPool, MCP server. Each ships its own branch and plan. This plan +covers ONLY KV cache tiering. The other two have their own plans to be +written before their own branches. + +--- + +## File Structure + +**Create:** +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` — hash + metadata for a cache entry +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift` — actor managing hot dict + disk sidecar +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift` + +**Modify:** +- `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — use store on each `generate` +- `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — new `kvCacheHotMB: Int` + `kvCacheColdGB: Int` fields with sensible defaults +- `macMLX/macMLX/Views/Settings/` — add a "KV Cache" section + +--- + +## Task 1: PromptCacheKey — deterministic hash of a token sequence + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore + +final class PromptCacheKeyTests: XCTestCase { + + func testSameModelAndTokensProduceSameKey() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + XCTAssertEqual(a.hashString, b.hashString) + } + + func testDifferentTokensProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func 
testDifferentModelsProduceDifferentKeys() {
+        let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3])
+        let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", tokens: [1, 2, 3])
+        XCTAssertNotEqual(a.hashString, b.hashString)
+    }
+
+    func testHashStringIsHexLowercase() {
+        let k = PromptCacheKey(modelID: "m", tokens: [1])
+        XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) })
+        XCTAssertEqual(k.hashString.count, 64) // sha256
+    }
+
+    func testShardedFilenameSplitsByFirstHexChar() {
+        let k = PromptCacheKey(modelID: "m", tokens: [1])
+        let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv"))
+        // /tmp/kv/<shard>/<hash>.safetensors
+        let comps = url.pathComponents.suffix(3)
+        XCTAssertEqual(comps.count, 3)
+        // Middle component is the 1-char shard dir; `comps` starts
+        // at "kv", so skip that first.
+        XCTAssertEqual(comps.dropFirst().first?.count, 1)
+        XCTAssertTrue(url.pathExtension == "safetensors")
+    }
+}
+```
+
+- [ ] **Step 2: Run test — verify it fails**
+
+```
+cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5
+```
+
+Expected: `error: no such type PromptCacheKey`.
+
+- [ ] **Step 3: Implement**
+
+`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift`:
+
+```swift
+import CryptoKit
+import Foundation
+
+/// Deterministic hash key identifying a cached KV-cache snapshot.
+///
+/// MVP hashes the entire token prefix. v0.4.1+ will switch to a
+/// vLLM-style chained block hash (256 tokens per block + parent
+/// hash) to enable longest-common-prefix matching across siblings;
+/// today two requests have to share the EXACT same prefix to
+/// benefit from the cache.
+public struct PromptCacheKey: Hashable, Sendable {
+    public let modelID: String
+    public let tokenCount: Int
+    public let hashString: String
+
+    public init(modelID: String, tokens: [Int]) {
+        self.modelID = modelID
+        self.tokenCount = tokens.count
+        self.hashString = Self.hash(modelID: modelID, tokens: tokens)
+    }
+
+    /// SHA-256 over `(modelID, tokens)`. Tokens encoded as
+    /// little-endian Int32 for cross-platform stability. 
+    private static func hash(modelID: String, tokens: [Int]) -> String {
+        var hasher = SHA256()
+        if let modelBytes = modelID.data(using: .utf8) {
+            hasher.update(data: modelBytes)
+        }
+        hasher.update(data: Data([0x00])) // separator
+        var buf = Data(capacity: tokens.count * 4)
+        for tok in tokens {
+            var v = Int32(tok).littleEndian
+            withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) }
+        }
+        hasher.update(data: buf)
+        return hasher.finalize().map { String(format: "%02x", $0) }.joined()
+    }
+
+    /// `<shard>/<hash>.safetensors`. 16-way fanout
+    /// keeps any single directory from getting huge when the cold
+    /// store grows. The shard dir is the first hex char of the hash.
+    public func shardedFileURL(under root: URL) -> URL {
+        let shard = String(hashString.prefix(1))
+        return root
+            .appending(path: shard, directoryHint: .isDirectory)
+            .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory)
+    }
+}
+```
+
+- [ ] **Step 4: Run tests — verify pass**
+
+```
+cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5
+```
+
+Expected: 5 tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift \
+    MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift
+git commit -m "feat(prompt-cache): PromptCacheKey — sha256 identifier with 16-way disk shard"
+```
+
+---
+
+## Task 2: PromptCacheStore — hot dict + cold safetensors, LRU on both
+
+**Files:**
+- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift`
+- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift`
+
+- [ ] **Step 1: Write the failing test**
+
+`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift`:
+
+```swift
+import XCTest
+@testable import MacMLXCore
+import MLXLMCommon
+import MLX
+
+final class PromptCacheStoreTests: XCTestCase {
+
+    /// Build a minimal single-layer [KVCache] from known keys/values. 
+ /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. + private func makeSyntheticCache(seqLen: Int) -> [any KVCache] { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return [layer] + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + let cache = makeSyntheticCache(seqLen: 3) + + await store.put(key: key, cache: cache) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+        let restored = await store.get(k1)
+        XCTAssertNotNil(restored)
+    }
+
+    func testMissReturnsNil() async {
+        let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4)
+        let k = PromptCacheKey(modelID: "M", tokens: [99])
+        let got = await store.get(k)
+        XCTAssertNil(got)
+    }
+}
+```
+
+- [ ] **Step 2: Run test — verify it fails**
+
+```
+cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -5
+```
+
+Expected: `no such type PromptCacheStore`.
+
+- [ ] **Step 3: Implement**
+
+`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift`:
+
+```swift
+import Foundation
+import MLX
+import MLXLMCommon
+
+/// Two-tier prompt-cache store. Hot = in-memory LRU dict of
+/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files
+/// on disk under `root/<shard>/<hash>.safetensors`, round-tripped
+/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`.
+///
+/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add
+/// size-based (byte-count) eviction instead of count-based.
+public actor PromptCacheStore {
+
+    private let root: URL
+    private let hotCapacity: Int
+
+    /// Ordered key list implements the LRU order. Head = oldest.
+    /// Dictionary gives O(1) lookup; `order` gives O(n) touch but
+    /// `hotCapacity` is small (default 8), so linear scans are fine.
+    private var hot: [PromptCacheKey: [any KVCache]] = [:]
+    private var order: [PromptCacheKey] = []
+
+    public init(root: URL, hotCapacity: Int = 8) {
+        self.root = root
+        self.hotCapacity = hotCapacity
+        try? FileManager.default.createDirectory(
+            at: root,
+            withIntermediateDirectories: true
+        )
+    }
+
+    /// Insert or refresh. Evicts to disk if hot is full.
+    public func put(key: PromptCacheKey, cache: [any KVCache]) {
+        if hot[key] != nil {
+            touch(key)
+            hot[key] = cache
+            return
+        }
+        while hot.count >= hotCapacity, let oldest = order.first {
+            demote(oldest)
+        }
+        hot[key] = cache
+        order.append(key)
+    }
+
+    /// Return a cache snapshot, preferring the hot tier. 
On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> [any KVCache]? { + if let cache = hot[key] { + touch(key) + return cache + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return caches + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. + private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? savePromptCache(url: url, cache: cache, metadata: metadata) + } +} +``` + +- [ ] **Step 4: Run tests** + +``` +cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -10 +``` + +Expected: 4 tests PASS. If serialisation tests fail with MLXArray-dependent errors (e.g. Metal device unavailable in test), wrap the problematic assertions with `throw XCTSkip` and document the skip. 
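+
+One shape for that skip guard — mirroring the `requireMetalOrSkip()`
+helper the eventual commit later in this series settles on (the
+bundle-probing details are that helper's, not a guaranteed-stable
+mlx-swift layout):
+
+```swift
+/// Skip MLX-dependent tests when default.metallib is missing from the
+/// test bundle — any MLXArray op would otherwise fatalError in C++.
+private func requireMetalOrSkip() throws {
+    let bundle = Bundle.allBundles.first { $0.bundlePath.contains("Cmlx") }
+    if bundle?.url(forResource: "default", withExtension: "metallib") == nil {
+        throw XCTSkip("Requires default.metallib — run under xcodebuild")
+    }
+}
+```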
+ +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \ + MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift +git commit -m "feat(prompt-cache): PromptCacheStore — hot LRU + cold safetensors" +``` + +--- + +## Task 3: Wire into MLXSwiftEngine — use cache on generate + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` + +**Strategy:** On each generate call, compute the full-prompt +`PromptCacheKey`. Look up in store. If hit, pass the restored cache to +`generate(input:cache:)` so the engine skips the shared prefill. After +generation, save the updated cache under the new extended key. + +mlx-swift-lm's `ModelContainer.generate(input:parameters:)` today hides +the `KVCache` array inside `TokenIterator`. We need the lower-level +API: `container.perform { context in TokenIterator(input:model:cache:processor:parameters:) }` +pattern, then iterate tokens ourselves. See +`.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` +and `ChatSession.swift` for reference wiring. + +- [ ] **Step 1: Read current MLXSwiftEngine.generate body** + +``` +cd /Users/kevin/Projects/macmlx +grep -n "func generate\|container.generate\|TokenIterator" MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift | head -10 +``` + +Verify the current integration point. Read lines around 200-280 in +that file to understand the `userInput` mapping and the +`container.prepare(input:)` call. + +- [ ] **Step 2: Read mlx-swift-lm's low-level iterator pattern** + +Open: +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` + (find `TokenIterator` init taking `cache:` param — confirm signature) +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/ChatSession.swift` + (lines 177, 526–535 per research notes) + +Copy the iterator-with-external-cache pattern. 
Goal is a replacement +for the current `container.generate(...)` / `container.prepare(...)` +block that threads a caller-supplied `[any KVCache]` through. + +- [ ] **Step 3: Add cache lookup + update logic in generate** + +In `MLXSwiftEngine.swift`, add: + +1. A `promptCacheStore: PromptCacheStore` stored property initialised + against `DataRoot.macMLX("kv-cache")` (or `DataRoot.macMLX.appending(...)` + — match whichever API the rest of MacMLXCore uses). +2. Inside `generate()`, after `chatMessages` are prepared but before + the iterator runs: + ```swift + // Build token sequence for this turn. + let userInput = UserInput(chat: chatMessages) + let lmInput = try await container.prepare(input: userInput) + let inputTokens = lmInput.tokens.asArray(Int.self) + let key = PromptCacheKey(modelID: currentModel.id, tokens: inputTokens) + + // Look up prior cache. + let priorCache = await promptCacheStore.get(key) + ``` +3. Pass `priorCache` to whatever lower-level iterator you end up + using. If mlx-swift-lm's public surface only accepts the cache + indirectly (e.g. via `ChatSession.loadPromptCache(url:)`), call + that instead. +4. After `generate()` completes successfully, build the updated cache + snapshot and store it under the extended key (new tokens from the + stream appended): + ```swift + // After iteration finishes: + let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: currentModel.id, tokens: finalTokens) + if let finalCache = /* extract [any KVCache] */ { + await promptCacheStore.put(key: newKey, cache: finalCache) + } + ``` + +Exact code depends on which public surface mlx-swift-lm exposes for +our mlx-swift-lm version. If none of `ChatSession`, `TokenIterator`, +or `container.perform` is reachable with our pinned `3.31.3`, open +an upstream issue and ship the store + key without engine wiring in +this PR — still useful for the next PR. 
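+
+Pulling the four points together, the intended shape of the wiring —
+`runIterator` is a hypothetical placeholder for whatever low-level API
+Step 2 turns up, and `generatedTokenIDs` / `finalCache` name its
+assumed outputs:
+
+```swift
+// Before iteration: hash the prompt, look up a prior snapshot.
+let lmInput = try await container.prepare(input: UserInput(chat: chatMessages))
+let inputTokens = lmInput.tokens.asArray(Int.self)
+let key = PromptCacheKey(modelID: currentModel.id, tokens: inputTokens)
+let prior = await promptCacheStore.get(key)  // nil on a cache miss
+
+// Iterate with the restored cache (placeholder API).
+let (generatedTokenIDs, finalCache) = try await runIterator(lmInput, cache: prior)
+
+// After iteration: store under the EXTENDED key — prompt plus reply —
+// which is the prefix the next turn's prompt will start with.
+let newKey = PromptCacheKey(modelID: currentModel.id,
+                            tokens: inputTokens + generatedTokenIDs)
+await promptCacheStore.put(key: newKey, cache: finalCache)
+```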
+ +- [ ] **Step 4: Build + integration smoke test** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three must pass. + +Manual: run the app, load a small model (Qwen3-0.6B if you have one), +send "hi" twice and observe Logs tab. Second `Starting generation` +line should show a noticeably lower prefill time (visible in total +duration OR in a new debug log we add for "cache hit: restoring N +tokens"). + +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +git commit -m "feat(engine): wire PromptCacheStore into generate() + +On each generate call, hash the full input token sequence, look up +a prior cache snapshot, and pass it to the token iterator so the +shared prefix prefill is skipped. Save the extended snapshot after +generation completes so the next turn benefits. + +MVP keys on exact-prefix match; vLLM-style block hashing with +longest-common-prefix matching is v0.4.1+." +``` + +--- + +## Task 4: Settings UI + budget defaults + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — add `kvCacheHotMB: Int` (default 512) and `kvCacheColdGB: Int` (default 20) +- Modify: `macMLX/macMLX/Views/Settings/SettingsView.swift` — add a "KV Cache" section with two steppers + a "Clear Cache" button +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — read the budget at init + +- [ ] **Step 1: Extend Settings struct** + +In `SettingsManager.swift`, find the `Settings` struct. Add next to +other defaults: + +```swift +/// Hot prompt-cache capacity in megabytes — in-memory only. +public var kvCacheHotMB: Int +/// Cold prompt-cache disk cap in gigabytes. 
+public var kvCacheColdGB: Int +``` + +Update `Settings.default` to include: + +```swift +kvCacheHotMB: 512, +kvCacheColdGB: 20, +``` + +Update the initialiser's parameter list in both places. + +- [ ] **Step 2: Run the existing Settings tests to make sure decoding still works with the new fields** + +``` +cd MacMLXCore && swift test --filter SettingsManagerTests 2>&1 | tail -10 +``` + +Expected: pass. Older settings JSON files from pre-v0.4 users lack +these fields — if the test for backward-compat decoding fails, add +`decodeIfPresent` defaults in a custom `init(from:)` on `Settings`. + +- [ ] **Step 3: Add a KV Cache section to Settings** + +Create `macMLX/macMLX/Views/Settings/KVCacheSection.swift`: + +```swift +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} +``` + +- [ ] **Step 4: Wire into SettingsView** + +Find `SettingsView.body` and add the section after the existing HTTP +Server section: + +```swift +KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } +) +.onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } +} +.onChange(of: kvCacheColdGB) { _, newValue in + Task { await 
appState.updateSettings { $0.kvCacheColdGB = newValue } } +} +``` + +Add matching `@State private var kvCacheHotMB: Int = 512` and +`kvCacheColdGB: Int = 20` near the other settings `@State` vars. +In `loadFromSettings(_:)` add `kvCacheHotMB = s.kvCacheHotMB; +kvCacheColdGB = s.kvCacheColdGB`. + +- [ ] **Step 5: Add `clearPromptCache()` to EngineCoordinator** + +In `macMLX/macMLX/App/EngineCoordinator.swift`: + +```swift +/// Blow away the prompt cache — both hot and cold tiers. Exposed +/// to Settings' "Clear All KV Caches" button. +public func clearPromptCache() async { + guard let engine = engine as? MLXSwiftEngine else { return } + await engine.clearPromptCache() +} +``` + +And in `MLXSwiftEngine`: + +```swift +public func clearPromptCache() async { + await promptCacheStore.clearAll() +} +``` + +Add `clearAll()` to `PromptCacheStore`: + +```swift +public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) +} +``` + +- [ ] **Step 6: Build both targets** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three: `BUILD SUCCEEDED` / tests pass. 
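+
+The `decodeIfPresent` fallback mentioned in Task 4 Step 2, shown on a
+standalone struct (the real `Settings` carries many more fields; the
+defaults are the ones chosen above):
+
+```swift
+struct KVCacheSettingsSlice: Codable {
+    var kvCacheHotMB: Int
+    var kvCacheColdGB: Int
+
+    init(from decoder: Decoder) throws {
+        let c = try decoder.container(keyedBy: CodingKeys.self)
+        // Pre-v0.4 settings.json files lack these keys — default them
+        // instead of failing the whole decode.
+        kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512
+        kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20
+    }
+}
+```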
+
+- [ ] **Step 7: Commit**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift \
+    MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \
+    MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift \
+    macMLX/macMLX/App/EngineCoordinator.swift \
+    macMLX/macMLX/Views/Settings/KVCacheSection.swift \
+    macMLX/macMLX/Views/Settings/SettingsView.swift
+git commit -m "feat(kv-cache): Settings UI + budget defaults + Clear All button"
+```
+
+---
+
+## Task 5: CHANGELOG + push
+
+**Files:**
+- Modify: `CHANGELOG.md`
+
+- [ ] **Step 1: Add v0.4.0 entry**
+
+Prepend under `[Unreleased]`:
+
+```markdown
+## [Unreleased]
+
+### Added
+- **Prompt cache tiering.** Successive chat turns on the same model
+  now reuse the KV cache when the new prompt extends the previous
+  one — the shared prefix skips prefill. In-memory hot tier
+  (default 512 MB) backed by on-disk cold tier (default 20 GB) at
+  `~/.mac-mlx/kv-cache/`. Coding-assistant workflows (Claude Code,
+  Cursor, Zed) that re-send conversation history each turn see
+  dramatic reductions in time-to-first-token.
+- Settings → "KV Cache" section exposes budget steppers and a
+  "Clear All KV Caches" button. 
+```
+
+- [ ] **Step 2: Commit + push**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add CHANGELOG.md
+git commit -m "docs: v0.4 KV cache tiering changelog entry"
+git push -u origin feat/v0.4-kv-cache-tiering 2>&1 | tail -3
+```
+
+- [ ] **Step 3: Open PR**
+
+```bash
+gh pr create --base main --head feat/v0.4-kv-cache-tiering \
+  --title "v0.4 — KV cache tiering (hot RAM + cold SSD)" \
+  --body "$(cat <<'EOF'
+## Summary
+- PromptCacheKey: sha256 over (modelID, tokens) + 16-way disk shard
+- PromptCacheStore actor: hot in-memory LRU + cold safetensors on disk
+- Wired into MLXSwiftEngine.generate — shared-prefix prefill is skipped
+- Settings UI: hot/cold budget steppers + Clear All button
+
+## Test plan
+- [ ] swift test --filter PromptCache passes
+- [ ] Load a small model, send "hi" twice, observe reduced TTFT on second turn
+- [ ] xcodebuild + MacMLXCore swift test both green
+EOF
+)"
+```
+
+---
+
+## Self-Review
+
+- ✅ **Spec coverage:** hot tier, cold tier, LRU, disk sharding, engine
+  wiring, Settings UI, Clear button — all have tasks.
+- ⚠️ **Placeholder risk on Task 3 Step 3:** "Exact code depends on
+  which public surface mlx-swift-lm exposes" is honest but leaves
+  specifics for the implementer. Mitigation: Step 2 explicitly
+  directs reading concrete files and line numbers to deduce the
+  pattern.
+- ✅ **Type consistency:** `PromptCacheKey`, `PromptCacheStore`,
+  `clearPromptCache()`, `kvCacheHotMB`, `kvCacheColdGB` referenced
+  consistently.
+- ⚠️ **Known scope cut:** v0.4.0.1 follow-up will replace full-prompt
+  hash with vLLM-style chained block hash for longest-common-prefix
+  matching. MVP today only benefits when the NEW prompt fully contains
+  the OLD prompt as prefix — still a big win for chat continuations.
+
+---
+
+## Execution Handoff
+
+Plan complete and saved to
+`docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md`.
+
+**1. Subagent-Driven (recommended)** — fresh subagent per task,
+review between.
+**2. 
Inline** — execute tasks in this session. + +Recommended: **1**. Each task hits a different file set cleanly; +subagent isolation keeps each attempt focused. From be3f1418f08ead08cd9d9569a304fb624715bb12 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:37:38 +0700 Subject: [PATCH 2/6] =?UTF-8?q?feat(prompt-cache):=20PromptCacheKey=20?= =?UTF-8?q?=E2=80=94=20sha256=20identifier=20with=2016-way=20disk=20shard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../PromptCache/PromptCacheKey.swift | 48 +++++++++++++++++++ .../PromptCache/PromptCacheKeyTests.swift | 40 ++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift create mode 100644 MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift new file mode 100644 index 0000000..93cf2e0 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift @@ -0,0 +1,48 @@ +import CryptoKit +import Foundation + +/// Deterministic hash key identifying a cached KV-cache snapshot. +/// +/// MVP hashes the entire token prefix. v0.4.1+ will switch to a +/// vLLM-style chained block hash (256 tokens per block + parent +/// hash) to enable longest-common-prefix matching across siblings; +/// today two requests have to share the EXACT same prefix to +/// benefit from the cache. +public struct PromptCacheKey: Hashable, Sendable { + public let modelID: String + public let tokenCount: Int + public let hashString: String + + public init(modelID: String, tokens: [Int]) { + self.modelID = modelID + self.tokenCount = tokens.count + self.hashString = Self.hash(modelID: modelID, tokens: tokens) + } + + /// SHA-256 over `(modelID, tokens)`. Tokens encoded as + /// little-endian Int32 for cross-platform stability. 
+    private static func hash(modelID: String, tokens: [Int]) -> String {
+        var hasher = SHA256()
+        if let modelBytes = modelID.data(using: .utf8) {
+            hasher.update(data: modelBytes)
+        }
+        hasher.update(data: Data([0x00])) // separator
+        var buf = Data(capacity: tokens.count * 4)
+        for tok in tokens {
+            var v = Int32(tok).littleEndian
+            withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) }
+        }
+        hasher.update(data: buf)
+        return hasher.finalize().map { String(format: "%02x", $0) }.joined()
+    }
+
+    /// `<shard>/<hash>.safetensors`. 16-way fanout
+    /// keeps any single directory from getting huge when the cold
+    /// store grows. The shard dir is the first hex char of the hash.
+    public func shardedFileURL(under root: URL) -> URL {
+        let shard = String(hashString.prefix(1))
+        return root
+            .appending(path: shard, directoryHint: .isDirectory)
+            .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory)
+    }
+}
diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift
new file mode 100644
index 0000000..c6fa629
--- /dev/null
+++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift
@@ -0,0 +1,40 @@
+import XCTest
+@testable import MacMLXCore
+
+final class PromptCacheKeyTests: XCTestCase {
+
+    func testSameModelAndTokensProduceSameKey() {
+        let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4])
+        let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4])
+        XCTAssertEqual(a.hashString, b.hashString)
+    }
+
+    func testDifferentTokensProduceDifferentKeys() {
+        let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4])
+        let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5])
+        XCTAssertNotEqual(a.hashString, b.hashString)
+    }
+
+    func testDifferentModelsProduceDifferentKeys() {
+        let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3])
+        let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", 
tokens: [1, 2, 3]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testHashStringIsHexLowercase() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) }) + XCTAssertEqual(k.hashString.count, 64) // sha256 + } + + func testShardedFilenameSplitsByFirstHexChar() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv")) + // /tmp/kv//.safetensors + let comps = url.pathComponents.suffix(3) + XCTAssertEqual(comps.count, 3) + // Middle component is the 1-char shard dir. + XCTAssertEqual(comps.dropFirst().first?.count, 1) + XCTAssertTrue(url.pathExtension == "safetensors") + } +} From 408023a8752f7ff4b95ed84a2b20d4148765829c Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:43:39 +0700 Subject: [PATCH 3/6] =?UTF-8?q?feat(prompt-cache):=20PromptCacheStore=20?= =?UTF-8?q?=E2=80=94=20hot=20LRU=20+=20cold=20safetensors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actor-based two-tier prompt cache. Hot tier is an in-memory LRU dict keyed by PromptCacheKey. Cold tier is safetensors files under `root//.safetensors` round-tripped via mlx-swift-lm's savePromptCache / loadPromptCache. Eviction from hot persists to cold; cold hits promote back into hot. Introduces PromptCacheSnapshot — an @unchecked Sendable wrapper for [any KVCache] so snapshots can cross the actor isolation boundary (mlx-swift-lm's KVCache has no Sendable conformance). Tests cover put/get hot hit, hot->cold eviction, cold->hot restore, and miss-returns-nil. The three MLX-dependent tests skip when default.metallib is not in the test bundle (standard SPM test binaries often lack it); the miss-path test runs unconditionally. 
---
 .../PromptCache/PromptCacheStore.swift        | 110 ++++++++++++++++++
 .../PromptCache/PromptCacheStoreTests.swift   |  88 ++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift
 create mode 100644 MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift

diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift
new file mode 100644
index 0000000..171385c
--- /dev/null
+++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift
@@ -0,0 +1,110 @@
+import Foundation
+import MLX
+import MLXLMCommon
+
+/// Sendable wrapper that lets a `[any KVCache]` cross actor-isolation
+/// boundaries. `KVCache` is a reference-type protocol without a
+/// `Sendable` conformance in mlx-swift-lm — in practice we hand the
+/// snapshot off to the generation pipeline which owns it exclusively
+/// until generation ends, so an unchecked conformance is safe.
+public struct PromptCacheSnapshot: @unchecked Sendable {
+    public let caches: [any KVCache]
+    public init(_ caches: [any KVCache]) {
+        self.caches = caches
+    }
+}
+
+/// Two-tier prompt-cache store. Hot = in-memory LRU dict of
+/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files
+/// on disk under `root/<shard>/<hash>.safetensors`, round-tripped
+/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`.
+///
+/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add
+/// size-based (byte-count) eviction instead of count-based.
+public actor PromptCacheStore {
+
+    private let root: URL
+    private let hotCapacity: Int
+
+    /// Ordered key list implements the LRU order. Head = oldest.
+    /// Dictionary gives O(1) lookup; `order` gives O(n) touch but
+    /// `hotCapacity` is small (default 8), so linear scans are fine. 
+ private var hot: [PromptCacheKey: [any KVCache]] = [:] + private var order: [PromptCacheKey] = [] + + public init(root: URL, hotCapacity: Int = 8) { + self.root = root + self.hotCapacity = hotCapacity + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + + /// Insert or refresh. Evicts to disk if hot is full. + public func put(key: PromptCacheKey, snapshot: PromptCacheSnapshot) { + let cache = snapshot.caches + if hot[key] != nil { + touch(key) + hot[key] = cache + return + } + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = cache + order.append(key) + } + + /// Return a cache snapshot, preferring the hot tier. On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> PromptCacheSnapshot? { + if let cache = hot[key] { + touch(key) + return PromptCacheSnapshot(cache) + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return PromptCacheSnapshot(caches) + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. + private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? 
savePromptCache(url: url, cache: cache, metadata: metadata) + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift new file mode 100644 index 0000000..b0f262c --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift @@ -0,0 +1,88 @@ +import XCTest +@testable import MacMLXCore +import MLXLMCommon +import MLX + +final class PromptCacheStoreTests: XCTestCase { + + /// mlx-swift's SwiftPM build does not always bundle `default.metallib` + /// alongside the test binary — in that case any `MLXArray` op aborts + /// the test process with a fatalError from the C++ side. Detect the + /// bundle up front and skip MLX-dependent tests so we still exercise + /// the pure LRU / miss paths in the store. + private func requireMetalOrSkip() throws { + let bundle = Bundle(identifier: "mlx-swift_Cmlx.resources") + ?? Bundle.allBundles.first(where: { $0.bundlePath.contains("Cmlx") }) + let metallib = bundle?.url(forResource: "default", withExtension: "metallib") + if metallib == nil { + throw XCTSkip("Requires default.metallib (SPM test binaries often lack it — run under xcodebuild)") + } + } + + /// Build a minimal single-layer [KVCache] from known keys/values. + /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. + private func makeSyntheticSnapshot(seqLen: Int) -> PromptCacheSnapshot { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return PromptCacheSnapshot([layer]) + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? 
FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + try requireMetalOrSkip() + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + + await store.put(key: key, snapshot: makeSyntheticSnapshot(seqLen: 3)) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+ let restored = await store.get(k1) + XCTAssertNotNil(restored) + } + + func testMissReturnsNil() async { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let k = PromptCacheKey(modelID: "M", tokens: [99]) + let got = await store.get(k) + XCTAssertNil(got) + } +} From 58846c4df2d4d2f0b7962b896a1fa93c4e80c7b8 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:49:14 +0700 Subject: [PATCH 4/6] feat(engine): wire PromptCacheStore into generate() On each generate call, hash the full input token sequence, look up a prior cache snapshot, and pass it to the token iterator so the shared prefix prefill is skipped. Save the extended snapshot after generation completes so the next turn benefits. MVP keys on an exact full-prompt match; vLLM-style block hashing with longest-common-prefix matching is v0.4.0.1. --- .../MacMLXCore/Engine/MLXSwiftEngine.swift | 128 ++++++++++++++++-- 1 file changed, 114 insertions(+), 14 deletions(-) diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift index 23bd0a2..4a21ea0 100644 --- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift @@ -1,8 +1,21 @@ import Foundation +import MLX import MLXLLM import MLXLMCommon @preconcurrency import Tokenizers +// MARK: - Sendable-box helpers + +/// Lightweight unchecked-Sendable wrapper used to pass non-Sendable +/// mlx-swift-lm values (`LMInput`, `AsyncStream`) across +/// isolation boundaries when we know the handoff is safe — we `consume` +/// them into the actor via `ModelContainer.perform(nonSendable:_:)` and +/// the actor owns them exclusively afterwards. +private struct NonSendableBox<T>: @unchecked Sendable { + let value: T + init(_ value: T) { self.value = value } +} + // MARK: - Tokenizer loader /// Concrete TokenizerLoader that uses the HuggingFace swift-transformers library.
@@ -88,9 +101,18 @@ public actor MLXSwiftEngine: InferenceEngine { private var modelContainer: ModelContainer? + /// Two-tier prompt cache (hot dict + cold safetensors sidecar). Used + /// by `runGeneration` to reuse KV state across successive turns on + /// the same model. See `PromptCacheStore` for the tiering policy. + private let promptCacheStore: PromptCacheStore + // MARK: Initialiser - public init() {} + public init() { + self.promptCacheStore = PromptCacheStore( + root: DataRoot.macMLX("kv-cache") + ) + } // MARK: InferenceEngine @@ -208,6 +230,19 @@ public actor MLXSwiftEngine: InferenceEngine { // MARK: Private generation helper /// Actor-isolated generation driver called from within `generate(_:)`. + /// + /// Flow: + /// 1. Prepare the `LMInput` (tokenisation + chat template application). + /// 2. Hash the full input-token sequence into a `PromptCacheKey`. + /// 3. Look up a prior cache snapshot in `promptCacheStore`. On hit, + /// reuse its `[KVCache]` so the shared prefix skips prefill. On + /// miss, allocate a fresh cache via `model.newCache(...)`. + /// 4. Drive the low-level `generateTokens(input:cache:...)` call so + /// we see raw token IDs and can build the extended key + /// `inputTokens + generatedTokenIDs` after the stream ends. + /// 5. The `KVCache` protocol is class-bound — the same reference we + /// passed in is mutated in-place during generation, so at the + /// end we can save that same reference under the extended key. 
private func runGeneration( _ request: GenerateRequest, into continuation: AsyncThrowingStream<GenerateChunk, Error>.Continuation @@ -216,6 +251,10 @@ public actor MLXSwiftEngine: InferenceEngine { continuation.finish(throwing: EngineError.modelNotLoaded) return } + guard let loadedModelSnapshot = loadedModel else { + continuation.finish(throwing: EngineError.modelNotLoaded) + return + } let params = request.parameters @@ -261,28 +300,89 @@ public actor MLXSwiftEngine: InferenceEngine { throw EngineError.modelLoadFailed(reason: error.localizedDescription) } - // Generate and stream chunks. - let stream = try await container.generate(input: lmInput, parameters: generateParams) + // Flat Int token array for key construction. `LMInput.text.tokens` + // is an `MLXArray`; `asArray(Int.self)` materialises to Swift. + let inputTokens = lmInput.text.tokens.asArray(Int.self) + let modelID = loadedModelSnapshot.id + let priorKey = PromptCacheKey(modelID: modelID, tokens: inputTokens) + + // Try the store. On hit we reuse the restored cache; on miss we + // let the iterator allocate a fresh one inside `generateTokens`. + let priorSnapshot = await promptCacheStore.get(priorKey) + let priorCache: [any KVCache]? + if let snapshot = priorSnapshot { + priorCache = snapshot.caches + await LogManager.shared.debug( + "Prompt cache HIT — restored \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } else { + priorCache = nil + await LogManager.shared.debug( + "Prompt cache MISS — cold prefill of \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } + // Build the working cache. When we have a prior snapshot we pass + // that reference straight through; otherwise we ask the model to + // allocate a fresh `[KVCache]`. We hold onto the same array so we + // can save it after generation (KVCache is class-bound, so the + // iterator populates our instances in place). + // + // `KVCache` is not `Sendable`, and `LMInput` is not `Sendable` + // either.
Route both through the `perform(nonSendable:_:)` + // overload on `ModelContainer`, which explicitly accepts a + // non-Sendable value by `consuming` it into the actor. + let tokenizer = await container.tokenizer + let priorCacheBox: PromptCacheSnapshot? = priorCache.map { PromptCacheSnapshot($0) } + let inputBox = NonSendableBox(lmInput) + + let setup = + try await container.perform(nonSendable: inputBox) { context, inputBox in + let cache: [any KVCache] = priorCacheBox?.caches + ?? context.model.newCache(parameters: generateParams) + let stream = try MLXLMCommon.generateTokens( + input: inputBox.value, + cache: cache, + parameters: generateParams, + context: context + ) + return (cache: PromptCacheSnapshot(cache), stream: stream) + } + let workingCache = setup.cache.caches + let stream = setup.stream + + var detokenizer = NaiveStreamingDetokenizer(tokenizer: tokenizer) + var generatedTokenIDs: [Int] = [] var completionInfo: GenerateCompletionInfo? - for await generation in stream { - switch generation { - case .chunk(let text): - let chunk = GenerateChunk(text: text) - if case .terminated = continuation.yield(chunk) { - return + for await event in stream { + switch event { + case .token(let token): + generatedTokenIDs.append(token) + detokenizer.append(token: token) + if let piece = detokenizer.next() { + let chunk = GenerateChunk(text: piece) + if case .terminated = continuation.yield(chunk) { + return + } } case .info(let info): completionInfo = info - case .toolCall: - // Tool calls not supported yet — out of scope through v0.3. - // Re-visit when there's a concrete tool-use feature to - // wire into (e.g. OpenAI-compatible function-calling). - break } } + // Save the post-generation cache under the extended key. The + // same `workingCache` reference has been mutated in-place by the + // iterator, so it now reflects prompt + generated tokens.
+ let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: modelID, tokens: finalTokens) + await promptCacheStore.put( + key: newKey, + snapshot: PromptCacheSnapshot(workingCache) + ) + // Emit the final chunk with usage + finish reason. if let info = completionInfo { let finishReason: FinishReason From 94a02dcb1f59803997b82b4d76ae28ffd0324c77 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:53:38 +0700 Subject: [PATCH 5/6] feat(kv-cache): Settings UI + budget defaults + Clear All button --- .../MacMLXCore/Engine/MLXSwiftEngine.swift | 8 +++ .../MacMLXCore/Managers/SettingsManager.swift | 65 ++++++++++++++++++- .../PromptCache/PromptCacheStore.swift | 16 +++++ macMLX/macMLX/App/EngineCoordinator.swift | 13 ++++ .../Views/Settings/KVCacheSection.swift | 61 +++++++++++++++++ .../macMLX/Views/Settings/SettingsView.swift | 20 ++++++ 6 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 macMLX/macMLX/Views/Settings/KVCacheSection.swift diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift index 4a21ea0..6b0eeab 100644 --- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift @@ -227,6 +227,14 @@ public actor MLXSwiftEngine: InferenceEngine { true } + // MARK: Prompt cache management + + /// Drop both tiers of the prompt cache. Wired up to the Settings + /// → "Clear All KV Caches" button via `EngineCoordinator`. + public func clearPromptCache() async { + await promptCacheStore.clearAll() + } + // MARK: Private generation helper /// Actor-isolated generation driver called from within `generate(_:)`. 
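The per-turn flow the engine diff wires up — hash the input tokens, look up, generate, then save under input + generated — can be seen end to end with plain values. A sketch assuming nothing from mlx-swift-lm: FNV-1a stands in for the real SHA-256 `PromptCacheKey`, and a dictionary stands in for `PromptCacheStore`; the hash choice is irrelevant to the flow shown:

```swift
import Foundation

// Illustrative stand-in key: FNV-1a over (modelID, tokens).
func cacheKey(modelID: String, tokens: [Int]) -> UInt64 {
    var h: UInt64 = 0xcbf2_9ce4_8422_2325
    func mix(_ byte: UInt8) { h = (h ^ UInt64(byte)) &* 0x1_0000_0000_01b3 }
    for b in modelID.utf8 { mix(b) }
    for t in tokens {
        withUnsafeBytes(of: UInt64(bitPattern: Int64(t)).littleEndian) {
            $0.forEach(mix)
        }
    }
    return h
}

// Turn 1: miss on the input tokens → full prefill, then save the
// mutated cache under the *extended* key (input + generated tokens).
var store: [UInt64: String] = [:]   // key → "KV snapshot" placeholder
let inputTokens = [1, 2, 3]
let generated = [4, 5]
let priorKey = cacheKey(modelID: "M", tokens: inputTokens)
let missed = (store[priorKey] == nil)   // cold-prefill path
store[cacheKey(modelID: "M", tokens: inputTokens + generated)] = "kv@5"

// Turn 2: a prompt that is exactly the previous prompt + generation
// hashes to the saved key and reuses the snapshot, skipping prefill.
let hit = store[cacheKey(modelID: "M", tokens: inputTokens + generated)]
```

Keying the model ID into the hash keeps snapshots from different models from colliding; the full-prompt hash is what makes this an exact-match MVP rather than block-level prefix matching.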
diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift index f9f9785..d7e218a 100644 --- a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift +++ b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift @@ -46,6 +46,22 @@ public struct Settings: Codable, Equatable, Sendable { /// this at a mirror like "https://hf-mirror.com" (#21). public var hfEndpoint: String + /// Hot prompt-cache capacity in megabytes — in-memory only. + /// + /// MVP note: `PromptCacheStore`'s `hotCapacity` is an *entry* count, + /// not a byte budget. We persist the MB value for forward-compat so + /// a byte-accurate budget can land in v0.4.0.1 without a settings + /// migration. Today the engine ignores this value and uses the + /// default 8-entry cap. + public var kvCacheHotMB: Int + + /// Cold prompt-cache disk cap in gigabytes. + /// + /// MVP note: automatic cold-tier pruning is not yet implemented — + /// rely on Settings → "Clear All KV Caches" to reclaim space. Real + /// enforcement lands in v0.4.0.1. + public var kvCacheColdGB: Int + // MARK: Factory /// Sensible out-of-the-box defaults — used when no settings file exists. 
@@ -63,7 +79,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: nil, sparkleUpdateChannel: "release", logRetentionDays: 7, - hfEndpoint: "https://huggingface.co" + hfEndpoint: "https://huggingface.co", + kvCacheHotMB: 512, + kvCacheColdGB: 20 ) // MARK: Init @@ -79,7 +97,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: String?, sparkleUpdateChannel: String, logRetentionDays: Int, - hfEndpoint: String = "https://huggingface.co" + hfEndpoint: String = "https://huggingface.co", + kvCacheHotMB: Int = 512, + kvCacheColdGB: Int = 20 ) { self.modelDirectory = modelDirectory self.preferredEngine = preferredEngine @@ -92,6 +112,47 @@ public struct Settings: Codable, Equatable, Sendable { self.sparkleUpdateChannel = sparkleUpdateChannel self.logRetentionDays = logRetentionDays self.hfEndpoint = hfEndpoint + self.kvCacheHotMB = kvCacheHotMB + self.kvCacheColdGB = kvCacheColdGB + } + + // MARK: - Codable (backward-compat decode) + + /// Pre-v0.4 settings files don't have `kvCacheHotMB` / + /// `kvCacheColdGB` — decode them as optionals and fall back to the + /// defaults so existing installs keep working across upgrades. 
+ private enum CodingKeys: String, CodingKey { + case modelDirectory + case preferredEngine + case serverPort + case autoStartServer + case lastLoadedModel + case onboardingComplete + case pythonPath + case swiftLMPath + case sparkleUpdateChannel + case logRetentionDays + case hfEndpoint + case kvCacheHotMB + case kvCacheColdGB + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + self.modelDirectory = try c.decode(URL.self, forKey: .modelDirectory) + self.preferredEngine = try c.decode(EngineID.self, forKey: .preferredEngine) + self.serverPort = try c.decode(Int.self, forKey: .serverPort) + self.autoStartServer = try c.decode(Bool.self, forKey: .autoStartServer) + self.lastLoadedModel = try c.decodeIfPresent(String.self, forKey: .lastLoadedModel) + self.onboardingComplete = try c.decode(Bool.self, forKey: .onboardingComplete) + self.pythonPath = try c.decodeIfPresent(String.self, forKey: .pythonPath) + self.swiftLMPath = try c.decodeIfPresent(String.self, forKey: .swiftLMPath) + self.sparkleUpdateChannel = try c.decode(String.self, forKey: .sparkleUpdateChannel) + self.logRetentionDays = try c.decode(Int.self, forKey: .logRetentionDays) + self.hfEndpoint = try c.decodeIfPresent(String.self, forKey: .hfEndpoint) + ?? "https://huggingface.co" + self.kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512 + self.kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20 } } diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift index 171385c..62f0735 100644 --- a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift @@ -56,6 +56,22 @@ public actor PromptCacheStore { order.append(key) } + /// Blow away both tiers. 
Hot dict is cleared, the cold-tier + /// directory is removed wholesale and re-created empty. Invoked + /// from the Settings → "Clear All KV Caches" button via + /// `MLXSwiftEngine.clearPromptCache()` and + /// `EngineCoordinator.clearPromptCache()`. + public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + /// Return a cache snapshot, preferring the hot tier. On cold-hit, /// promote into hot (possibly evicting another entry). public func get(_ key: PromptCacheKey) -> PromptCacheSnapshot? { diff --git a/macMLX/macMLX/App/EngineCoordinator.swift b/macMLX/macMLX/App/EngineCoordinator.swift index e7b405e..35bbfef 100644 --- a/macMLX/macMLX/App/EngineCoordinator.swift +++ b/macMLX/macMLX/App/EngineCoordinator.swift @@ -116,6 +116,19 @@ public final class EngineCoordinator { } } + /// Blow away the prompt cache — both hot and cold tiers. Exposed + /// to Settings' "Clear All KV Caches" button. + /// + /// Today only the in-process `MLXSwiftEngine` carries a prompt + /// cache; the SwiftLM / Python-MLX detection-only stubs don't, so + /// downcasting and no-op-on-mismatch is the right shape. When + /// another engine grows a cache this will move onto the + /// `InferenceEngine` protocol. + public func clearPromptCache() async { + guard let engine = engine as? MLXSwiftEngine else { return } + await engine.clearPromptCache() + } + /// Release the loaded model. 
public func unload() async { guard let engine else { return } diff --git a/macMLX/macMLX/Views/Settings/KVCacheSection.swift b/macMLX/macMLX/Views/Settings/KVCacheSection.swift new file mode 100644 index 0000000..2b78499 --- /dev/null +++ b/macMLX/macMLX/Views/Settings/KVCacheSection.swift @@ -0,0 +1,61 @@ +// KVCacheSection.swift +// macMLX +// +// Settings section exposing the v0.4 KV-cache-tiering knobs: hot (RAM) +// and cold (SSD) budget sliders plus a "Clear All KV Caches" button +// that drops both tiers. +// +// MVP note: the sliders persist to `Settings.kvCacheHotMB` / +// `Settings.kvCacheColdGB` but are not yet wired into the engine's +// eviction logic. `PromptCacheStore` uses an 8-entry LRU today; a +// byte-accurate budget and automatic cold-tier pruning land in +// v0.4.0.1. See the `.help` strings below for the user-facing note. + +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Takes effect in v0.4.0.1 — currently capped at 8 cache entries regardless of this slider.") + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Cold-tier cap is not enforced automatically in this MVP — use the Clear All button below to reclaim space. 
Automatic pruning lands in v0.4.0.1.") + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} diff --git a/macMLX/macMLX/Views/Settings/SettingsView.swift b/macMLX/macMLX/Views/Settings/SettingsView.swift index e29575b..a8595de 100644 --- a/macMLX/macMLX/Views/Settings/SettingsView.swift +++ b/macMLX/macMLX/Views/Settings/SettingsView.swift @@ -18,6 +18,8 @@ struct SettingsView: View { @State private var serverPort: Int = 8000 @State private var autoStartServer: Bool = false @State private var hfEndpoint: String = "https://huggingface.co" + @State private var kvCacheHotMB: Int = 512 + @State private var kvCacheColdGB: Int = 20 var body: some View { Form { @@ -49,6 +51,22 @@ struct SettingsView: View { } } + KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } + ) + .onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } + } + .onChange(of: kvCacheColdGB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheColdGB = newValue } } + } + downloadsSection rerunSetupSection @@ -151,6 +169,8 @@ struct SettingsView: View { serverPort = s.serverPort autoStartServer = s.autoStartServer hfEndpoint = s.hfEndpoint + kvCacheHotMB = s.kvCacheHotMB + kvCacheColdGB = s.kvCacheColdGB } private func showModelDirectoryPicker() { From 2b91d05373a96a731af4923ec7656498d5e5e206 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 21:54:20 +0700 Subject: [PATCH 6/6] docs: v0.4 KV cache tiering changelog entry --- CHANGELOG.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48629e3..06d7b0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,23 @@ Versioning follows [Semantic Versioning](https://semver.org/). 
## [Unreleased] -(nothing yet) +### Added +- **Prompt cache tiering** (v0.4.0 engine parity, part 1 of 3). + Successive chat turns on the same model now reuse the KV cache + when the new prompt extends the previous one — the shared prefix + skips prefill. In-memory hot tier (LRU, 8 entries in MVP) backed + by on-disk cold tier at `~/.mac-mlx/kv-cache/`, 16-way sharded + safetensors round-tripped through mlx-swift-lm's `savePromptCache` + / `loadPromptCache`. Coding-assistant workflows (Claude Code, + Cursor, Zed re-sending conversation history each turn) see + reduced time-to-first-token on repeat prefixes. +- Settings → "KV Cache" section with hot/cold budget steppers and + a "Clear All KV Caches" button. Steppers currently inform future + byte-accurate budgeting (v0.4.0.1) — today's enforcement is the + 8-entry hot LRU cap plus manual Clear. +- Debug-level Logs tab entries `Prompt cache HIT — restored N + tokens` / `Prompt cache MISS — cold prefill of N tokens` under + the `inference` category, so you can see cache effectiveness. ---
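For reference, the "16-way sharded" cold-tier layout mentioned in the changelog — one directory per leading hex character of the hash — amounts to a few lines of URL plumbing. A standalone approximation (the real implementation lives in `PromptCacheKey.shardedFileURL(under:)`; this sketch is not a copy of it):

```swift
import Foundation

// Sharded cold-tier path: root/<first-hex-char-of-hash>/<hash>.safetensors.
// With lowercase-hex digests this yields at most 16 shard directories,
// keeping any single directory from accumulating thousands of files.
func shardedFileURL(hashString: String, under root: URL) -> URL {
    let shard = String(hashString.prefix(1))
    return root
        .appendingPathComponent(shard, isDirectory: true)
        .appendingPathComponent(hashString)
        .appendingPathExtension("safetensors")
}

let url = shardedFileURL(
    hashString: "ab12cd34",
    under: URL(fileURLWithPath: "/tmp/kv")
)
// → /tmp/kv/a/ab12cd34.safetensors
```

This matches what `testShardedFilenameSplitsByFirstHexChar` asserts: a three-component suffix of root dir, one-character shard dir, and `.safetensors` file.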