diff --git a/CHANGELOG.md b/CHANGELOG.md index 48629e3..06d7b0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,23 @@ Versioning follows [Semantic Versioning](https://semver.org/). ## [Unreleased] -(nothing yet) +### Added +- **Prompt cache tiering** (v0.4.0 engine parity, part 1 of 3). + Successive chat turns on the same model now reuse the KV cache + when the new prompt extends the previous one — the shared prefix + skips prefill. In-memory hot tier (LRU, 8 entries in MVP) backed + by on-disk cold tier at `~/.mac-mlx/kv-cache/`, 16-way sharded + safetensors round-tripped through mlx-swift-lm's `savePromptCache` + / `loadPromptCache`. Coding-assistant workflows (Claude Code, + Cursor, Zed re-sending conversation history each turn) see + reduced time-to-first-token on repeat prefixes. +- Settings → "KV Cache" section with hot/cold budget steppers and + a "Clear All KV Caches" button. Steppers currently inform future + byte-accurate budgeting (v0.4.0.1) — today's enforcement is the + 8-entry hot LRU cap plus manual Clear. +- Debug-level Logs tab entries `Prompt cache HIT — restored N + tokens` / `Prompt cache MISS — cold prefill of N tokens` under + the `inference` category, so you can see cache effectiveness. 
--- diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift index 23bd0a2..6b0eeab 100644 --- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift @@ -1,8 +1,21 @@ import Foundation +import MLX import MLXLLM import MLXLMCommon @preconcurrency import Tokenizers +// MARK: - Sendable-box helpers + +/// Lightweight unchecked-Sendable wrapper used to pass non-Sendable +/// mlx-swift-lm values (`LMInput`, `AsyncStream`) across +/// isolation boundaries when we know the handoff is safe — we `consume` +/// them into the actor via `ModelContainer.perform(nonSendable:_:)` and +/// the actor owns them exclusively afterwards. +private struct NonSendableBox: @unchecked Sendable { + let value: T + init(_ value: T) { self.value = value } +} + // MARK: - Tokenizer loader /// Concrete TokenizerLoader that uses the HuggingFace swift-transformers library. @@ -88,9 +101,18 @@ public actor MLXSwiftEngine: InferenceEngine { private var modelContainer: ModelContainer? + /// Two-tier prompt cache (hot dict + cold safetensors sidecar). Used + /// by `runGeneration` to reuse KV state across successive turns on + /// the same model. See `PromptCacheStore` for the tiering policy. + private let promptCacheStore: PromptCacheStore + // MARK: Initialiser - public init() {} + public init() { + self.promptCacheStore = PromptCacheStore( + root: DataRoot.macMLX("kv-cache") + ) + } // MARK: InferenceEngine @@ -205,9 +227,30 @@ public actor MLXSwiftEngine: InferenceEngine { true } + // MARK: Prompt cache management + + /// Drop both tiers of the prompt cache. Wired up to the Settings + /// → "Clear All KV Caches" button via `EngineCoordinator`. + public func clearPromptCache() async { + await promptCacheStore.clearAll() + } + // MARK: Private generation helper /// Actor-isolated generation driver called from within `generate(_:)`. + /// + /// Flow: + /// 1. 
Prepare the `LMInput` (tokenisation + chat template application). + /// 2. Hash the full input-token sequence into a `PromptCacheKey`. + /// 3. Look up a prior cache snapshot in `promptCacheStore`. On hit, + /// reuse its `[KVCache]` so the shared prefix skips prefill. On + /// miss, allocate a fresh cache via `model.newCache(...)`. + /// 4. Drive the low-level `generateTokens(input:cache:...)` call so + /// we see raw token IDs and can build the extended key + /// `inputTokens + generatedTokenIDs` after the stream ends. + /// 5. The `KVCache` protocol is class-bound — the same reference we + /// passed in is mutated in-place during generation, so at the + /// end we can save that same reference under the extended key. private func runGeneration( _ request: GenerateRequest, into continuation: AsyncThrowingStream.Continuation @@ -216,6 +259,10 @@ public actor MLXSwiftEngine: InferenceEngine { continuation.finish(throwing: EngineError.modelNotLoaded) return } + guard let loadedModelSnapshot = loadedModel else { + continuation.finish(throwing: EngineError.modelNotLoaded) + return + } let params = request.parameters @@ -261,28 +308,89 @@ public actor MLXSwiftEngine: InferenceEngine { throw EngineError.modelLoadFailed(reason: error.localizedDescription) } - // Generate and stream chunks. - let stream = try await container.generate(input: lmInput, parameters: generateParams) + // Flat Int token array for key construction. `LMInput.text.tokens` + // is an `MLXArray`; `asArray(Int.self)` materialises to Swift. + let inputTokens = lmInput.text.tokens.asArray(Int.self) + let modelID = loadedModelSnapshot.id + let priorKey = PromptCacheKey(modelID: modelID, tokens: inputTokens) + + // Try the store. On hit we reuse the restored cache; on miss we + // let the iterator allocate a fresh one inside `generateTokens`. + let priorSnapshot = await promptCacheStore.get(priorKey) + let priorCache: [any KVCache]? 
+ if let snapshot = priorSnapshot { + priorCache = snapshot.caches + await LogManager.shared.debug( + "Prompt cache HIT — restored \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } else { + priorCache = nil + await LogManager.shared.debug( + "Prompt cache MISS — cold prefill of \(priorKey.tokenCount) tokens (model=\(modelID))", + category: .inference + ) + } + + // Build the working cache. When we have a prior snapshot we pass + // that reference straight through; otherwise we ask the model to + // allocate a fresh `[KVCache]`. We hold onto the same array so we + // can save it after generation (KVCache is class-bound, so the + // iterator populates our instances in place). + // + // `KVCache` is not `Sendable`, and `LMInput` is not `Sendable` + // either. Route both through the `perform(nonSendable:_:)` + // overload on `ModelContainer`, which explicitly accepts a + // non-Sendable value by `consuming` it into the actor. + let tokenizer = await container.tokenizer + let priorCacheBox: PromptCacheSnapshot? = priorCache.map { PromptCacheSnapshot($0) } + let inputBox = NonSendableBox(lmInput) + + let setup: (cache: PromptCacheSnapshot, stream: AsyncStream) = + try await container.perform(nonSendable: inputBox) { context, inputBox in + let cache: [any KVCache] = priorCacheBox?.caches + ?? context.model.newCache(parameters: generateParams) + let stream = try MLXLMCommon.generateTokens( + input: inputBox.value, + cache: cache, + parameters: generateParams, + context: context + ) + return (PromptCacheSnapshot(cache), stream) + } + let workingCache = setup.cache.caches + let stream = setup.stream + var detokenizer = NaiveStreamingDetokenizer(tokenizer: tokenizer) + var generatedTokenIDs: [Int] = [] var completionInfo: GenerateCompletionInfo? 
- for await generation in stream { - switch generation { - case .chunk(let text): - let chunk = GenerateChunk(text: text) - if case .terminated = continuation.yield(chunk) { - return + for await event in stream { + switch event { + case .token(let token): + generatedTokenIDs.append(token) + detokenizer.append(token: token) + if let piece = detokenizer.next() { + let chunk = GenerateChunk(text: piece) + if case .terminated = continuation.yield(chunk) { + return + } } case .info(let info): completionInfo = info - case .toolCall: - // Tool calls not supported yet — out of scope through v0.3. - // Re-visit when there's a concrete tool-use feature to - // wire into (e.g. OpenAI-compatible function-calling). - break } } + // Save the post-generation cache under the extended key. The + // same `workingCache` reference has been mutated in-place by the + // iterator, so it now reflects prompt + generated tokens. + let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: modelID, tokens: finalTokens) + await promptCacheStore.put( + key: newKey, + snapshot: PromptCacheSnapshot(workingCache) + ) + // Emit the final chunk with usage + finish reason. if let info = completionInfo { let finishReason: FinishReason diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift index f9f9785..d7e218a 100644 --- a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift +++ b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift @@ -46,6 +46,22 @@ public struct Settings: Codable, Equatable, Sendable { /// this at a mirror like "https://hf-mirror.com" (#21). public var hfEndpoint: String + /// Hot prompt-cache capacity in megabytes — in-memory only. + /// + /// MVP note: `PromptCacheStore`'s `hotCapacity` is an *entry* count, + /// not a byte budget. 
We persist the MB value for forward-compat so + /// a byte-accurate budget can land in v0.4.0.1 without a settings + /// migration. Today the engine ignores this value and uses the + /// default 8-entry cap. + public var kvCacheHotMB: Int + + /// Cold prompt-cache disk cap in gigabytes. + /// + /// MVP note: automatic cold-tier pruning is not yet implemented — + /// rely on Settings → "Clear All KV Caches" to reclaim space. Real + /// enforcement lands in v0.4.0.1. + public var kvCacheColdGB: Int + // MARK: Factory /// Sensible out-of-the-box defaults — used when no settings file exists. @@ -63,7 +79,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: nil, sparkleUpdateChannel: "release", logRetentionDays: 7, - hfEndpoint: "https://huggingface.co" + hfEndpoint: "https://huggingface.co", + kvCacheHotMB: 512, + kvCacheColdGB: 20 ) // MARK: Init @@ -79,7 +97,9 @@ public struct Settings: Codable, Equatable, Sendable { swiftLMPath: String?, sparkleUpdateChannel: String, logRetentionDays: Int, - hfEndpoint: String = "https://huggingface.co" + hfEndpoint: String = "https://huggingface.co", + kvCacheHotMB: Int = 512, + kvCacheColdGB: Int = 20 ) { self.modelDirectory = modelDirectory self.preferredEngine = preferredEngine @@ -92,6 +112,47 @@ public struct Settings: Codable, Equatable, Sendable { self.sparkleUpdateChannel = sparkleUpdateChannel self.logRetentionDays = logRetentionDays self.hfEndpoint = hfEndpoint + self.kvCacheHotMB = kvCacheHotMB + self.kvCacheColdGB = kvCacheColdGB + } + + // MARK: - Codable (backward-compat decode) + + /// Pre-v0.4 settings files don't have `kvCacheHotMB` / + /// `kvCacheColdGB` — decode them as optionals and fall back to the + /// defaults so existing installs keep working across upgrades. 
+ private enum CodingKeys: String, CodingKey { + case modelDirectory + case preferredEngine + case serverPort + case autoStartServer + case lastLoadedModel + case onboardingComplete + case pythonPath + case swiftLMPath + case sparkleUpdateChannel + case logRetentionDays + case hfEndpoint + case kvCacheHotMB + case kvCacheColdGB + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + self.modelDirectory = try c.decode(URL.self, forKey: .modelDirectory) + self.preferredEngine = try c.decode(EngineID.self, forKey: .preferredEngine) + self.serverPort = try c.decode(Int.self, forKey: .serverPort) + self.autoStartServer = try c.decode(Bool.self, forKey: .autoStartServer) + self.lastLoadedModel = try c.decodeIfPresent(String.self, forKey: .lastLoadedModel) + self.onboardingComplete = try c.decode(Bool.self, forKey: .onboardingComplete) + self.pythonPath = try c.decodeIfPresent(String.self, forKey: .pythonPath) + self.swiftLMPath = try c.decodeIfPresent(String.self, forKey: .swiftLMPath) + self.sparkleUpdateChannel = try c.decode(String.self, forKey: .sparkleUpdateChannel) + self.logRetentionDays = try c.decode(Int.self, forKey: .logRetentionDays) + self.hfEndpoint = try c.decodeIfPresent(String.self, forKey: .hfEndpoint) + ?? "https://huggingface.co" + self.kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512 + self.kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20 } } diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift new file mode 100644 index 0000000..93cf2e0 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift @@ -0,0 +1,48 @@ +import CryptoKit +import Foundation + +/// Deterministic hash key identifying a cached KV-cache snapshot. +/// +/// MVP hashes the entire token prefix. 
v0.4.1+ will switch to a +/// vLLM-style chained block hash (256 tokens per block + parent +/// hash) to enable longest-common-prefix matching across siblings; +/// today two requests have to share the EXACT same prefix to +/// benefit from the cache. +public struct PromptCacheKey: Hashable, Sendable { + public let modelID: String + public let tokenCount: Int + public let hashString: String + + public init(modelID: String, tokens: [Int]) { + self.modelID = modelID + self.tokenCount = tokens.count + self.hashString = Self.hash(modelID: modelID, tokens: tokens) + } + + /// SHA-256 over `(modelID, tokens)`. Tokens encoded as + /// little-endian Int32 for cross-platform stability. + private static func hash(modelID: String, tokens: [Int]) -> String { + var hasher = SHA256() + if let modelBytes = modelID.data(using: .utf8) { + hasher.update(data: modelBytes) + } + hasher.update(data: Data([0x00])) // separator + var buf = Data(capacity: tokens.count * 4) + for tok in tokens { + var v = Int32(tok).littleEndian + withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) } + } + hasher.update(data: buf) + return hasher.finalize().map { String(format: "%02x", $0) }.joined() + } + + /// `//.safetensors`. 16-way fanout + /// keeps any single directory from getting huge when the cold + /// store grows. `shardChar` is the first hex char of the hash. 
+ public func shardedFileURL(under root: URL) -> URL { + let shard = String(hashString.prefix(1)) + return root + .appending(path: shard, directoryHint: .isDirectory) + .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory) + } +} diff --git a/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift new file mode 100644 index 0000000..62f0735 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift @@ -0,0 +1,126 @@ +import Foundation +import MLX +import MLXLMCommon + +/// Sendable wrapper that lets a `[any KVCache]` cross actor-isolation +/// boundaries. `KVCache` is a reference-type protocol without a +/// `Sendable` conformance in mlx-swift-lm — in practice we hand the +/// snapshot off to the generation pipeline which owns it exclusively +/// until generation ends, so an unchecked conformance is safe. +public struct PromptCacheSnapshot: @unchecked Sendable { + public let caches: [any KVCache] + public init(_ caches: [any KVCache]) { + self.caches = caches + } +} + +/// Two-tier prompt-cache store. Hot = in-memory LRU dict of +/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files +/// on disk under `root//.safetensors`, round-tripped +/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`. +/// +/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add +/// size-based (byte-count) eviction instead of count-based. +public actor PromptCacheStore { + + private let root: URL + private let hotCapacity: Int + + /// Ordered pair list simulates an LRU. Head = oldest. + /// Dictionary gives O(1) lookup; `order` gives O(n) touch but + /// `hotCapacity` is small (default 8), so linear scans are fine. + private var hot: [PromptCacheKey: [any KVCache]] = [:] + private var order: [PromptCacheKey] = [] + + public init(root: URL, hotCapacity: Int = 8) { + self.root = root + self.hotCapacity = hotCapacity + try? 
FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + + /// Insert or refresh. Evicts to disk if hot is full. + public func put(key: PromptCacheKey, snapshot: PromptCacheSnapshot) { + let cache = snapshot.caches + if hot[key] != nil { + touch(key) + hot[key] = cache + return + } + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = cache + order.append(key) + } + + /// Blow away both tiers. Hot dict is cleared, the cold-tier + /// directory is removed wholesale and re-created empty. Invoked + /// from the Settings → "Clear All KV Caches" button via + /// `MLXSwiftEngine.clearPromptCache()` and + /// `EngineCoordinator.clearPromptCache()`. + public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) + } + + /// Return a cache snapshot, preferring the hot tier. On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> PromptCacheSnapshot? { + if let cache = hot[key] { + touch(key) + return PromptCacheSnapshot(cache) + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return PromptCacheSnapshot(caches) + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. 
+ private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? savePromptCache(url: url, cache: cache, metadata: metadata) + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift new file mode 100644 index 0000000..c6fa629 --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift @@ -0,0 +1,40 @@ +import XCTest +@testable import MacMLXCore + +final class PromptCacheKeyTests: XCTestCase { + + func testSameModelAndTokensProduceSameKey() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + XCTAssertEqual(a.hashString, b.hashString) + } + + func testDifferentTokensProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testDifferentModelsProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3]) + let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", tokens: [1, 2, 3]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testHashStringIsHexLowercase() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) }) + XCTAssertEqual(k.hashString.count, 64) // sha256 + } + + func testShardedFilenameSplitsByFirstHexChar() { + let k = 
PromptCacheKey(modelID: "m", tokens: [1]) + let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv")) + // /tmp/kv//.safetensors + let comps = url.pathComponents.suffix(3) + XCTAssertEqual(comps.count, 3) + // Middle component is the 1-char shard dir. + XCTAssertEqual(comps.dropFirst().first?.count, 1) + XCTAssertTrue(url.pathExtension == "safetensors") + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift new file mode 100644 index 0000000..b0f262c --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift @@ -0,0 +1,88 @@ +import XCTest +@testable import MacMLXCore +import MLXLMCommon +import MLX + +final class PromptCacheStoreTests: XCTestCase { + + /// mlx-swift's SwiftPM build does not always bundle `default.metallib` + /// alongside the test binary — in that case any `MLXArray` op aborts + /// the test process with a fatalError from the C++ side. Detect the + /// bundle up front and skip MLX-dependent tests so we still exercise + /// the pure LRU / miss paths in the store. + private func requireMetalOrSkip() throws { + let bundle = Bundle(identifier: "mlx-swift_Cmlx.resources") + ?? Bundle.allBundles.first(where: { $0.bundlePath.contains("Cmlx") }) + let metallib = bundle?.url(forResource: "default", withExtension: "metallib") + if metallib == nil { + throw XCTSkip("Requires default.metallib (SPM test binaries often lack it — run under xcodebuild)") + } + } + + /// Build a minimal single-layer [KVCache] from known keys/values. + /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. 
+ private func makeSyntheticSnapshot(seqLen: Int) -> PromptCacheSnapshot { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return PromptCacheSnapshot([layer]) + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + try requireMetalOrSkip() + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + + await store.put(key: key, snapshot: makeSyntheticSnapshot(seqLen: 3)) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + try requireMetalOrSkip() + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, snapshot: makeSyntheticSnapshot(seqLen: 1)) + await store.put(key: k2, snapshot: makeSyntheticSnapshot(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+ let restored = await store.get(k1) + XCTAssertNotNil(restored) + } + + func testMissReturnsNil() async { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let k = PromptCacheKey(modelID: "M", tokens: [99]) + let got = await store.get(k) + XCTAssertNil(got) + } +} diff --git a/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md new file mode 100644 index 0000000..e607b52 --- /dev/null +++ b/docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md @@ -0,0 +1,778 @@ +# v0.4 KV Cache Tiering — Implementation Plan (MVP) + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development. Each task produces one commit, reviewed before the next. + +**Goal:** Reuse the KV cache across successive generate calls on the same +model so coding-assistant workflows (Claude Code, Cursor, Zed re-sending +conversation history each turn) only prefill the delta instead of the +whole prompt. + +**Architecture:** mlx-swift-lm already ships `savePromptCache` / +`loadPromptCache` / `trimPromptCache` / `canTrimPromptCache`. MVP keeps +a single **in-memory** per-model prompt-cache snapshot, plus an optional +disk sidecar. On each `generate` call we trim the cached prefix to the +longest match and prefill only the new tokens. No block-level hashing +yet (that's v0.4.0.1) — MVP targets the common case where request N+1's +prompt starts with request N's tokens. + +**Tech Stack:** Swift 6, mlx-swift-lm's `KVCache` APIs, `[MLXArray]` +safetensors serialisation, `HardwareInfo` for budget defaults. + +**Branch:** `feat/v0.4-kv-cache-tiering` (already created). + +--- + +## Scope Check + +Three sub-features live under v0.4.0 in the roadmap: KV cache tiering, +ModelPool, MCP server. Each ships its own branch and plan. This plan +covers ONLY KV cache tiering. The other two have their own plans to be +written before their own branches. 
+ +--- + +## File Structure + +**Create:** +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` — hash + metadata for a cache entry +- `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift` — actor managing hot dict + disk sidecar +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` +- `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift` + +**Modify:** +- `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — use store on each `generate` +- `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — new `kvCacheHotMB: Int` + `kvCacheColdGB: Int` fields with sensible defaults +- `macMLX/macMLX/Views/Settings/` — add a "KV Cache" section + +--- + +## Task 1: PromptCacheKey — deterministic hash of a token sequence + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore + +final class PromptCacheKeyTests: XCTestCase { + + func testSameModelAndTokensProduceSameKey() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + XCTAssertEqual(a.hashString, b.hashString) + } + + func testDifferentTokensProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 4]) + let b = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3, 5]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testDifferentModelsProduceDifferentKeys() { + let a = PromptCacheKey(modelID: "Qwen3-8B-4bit", tokens: [1, 2, 3]) + let b = PromptCacheKey(modelID: "Llama-3-8B-4bit", tokens: [1, 2, 3]) + XCTAssertNotEqual(a.hashString, b.hashString) + } + + func testHashStringIsHexLowercase() { + let k = 
PromptCacheKey(modelID: "m", tokens: [1]) + XCTAssertTrue(k.hashString.allSatisfy { "0123456789abcdef".contains($0) }) + XCTAssertEqual(k.hashString.count, 64) // sha256 + } + + func testShardedFilenameSplitsByFirstHexChar() { + let k = PromptCacheKey(modelID: "m", tokens: [1]) + let url = k.shardedFileURL(under: URL(filePath: "/tmp/kv")) + // /tmp/kv//.safetensors + let comps = url.pathComponents.suffix(3) + XCTAssertEqual(comps.count, 3) + XCTAssertEqual(comps.dropFirst().first?.count, 1) // single-char shard dir + XCTAssertTrue(url.pathExtension == "safetensors") + } +} +``` + +- [ ] **Step 2: Run test — verify it fails** + +``` +cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5 +``` + +Expected: `error: no such type PromptCacheKey`. + +- [ ] **Step 3: Implement** + +`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift`: + +```swift +import CryptoKit +import Foundation + +/// Deterministic hash key identifying a cached KV-cache snapshot. +/// +/// MVP hashes the entire token prefix. v0.4.1+ will switch to a +/// vLLM-style chained block hash (256 tokens per block + parent +/// hash) to enable longest-common-prefix matching across siblings; +/// today two requests have to share the EXACT same prefix to +/// benefit from the cache. +public struct PromptCacheKey: Hashable, Sendable { + public let modelID: String + public let tokenCount: Int + public let hashString: String + + public init(modelID: String, tokens: [Int]) { + self.modelID = modelID + self.tokenCount = tokens.count + self.hashString = Self.hash(modelID: modelID, tokens: tokens) + } + + /// SHA-256 over `(modelID, tokens)`. Tokens encoded as + /// little-endian Int32 for cross-platform stability. 
+ private static func hash(modelID: String, tokens: [Int]) -> String { + var hasher = SHA256() + if let modelBytes = modelID.data(using: .utf8) { + hasher.update(data: modelBytes) + } + hasher.update(data: Data([0x00])) // separator + var buf = Data(capacity: tokens.count * 4) + for tok in tokens { + var v = Int32(tok).littleEndian + withUnsafeBytes(of: &v) { buf.append(contentsOf: $0) } + } + hasher.update(data: buf) + return hasher.finalize().map { String(format: "%02x", $0) }.joined() + } + + /// `//.safetensors`. 16-way fanout + /// keeps any single directory from getting huge when the cold + /// store grows. `shardChar` is the first hex char of the hash. + public func shardedFileURL(under root: URL) -> URL { + let shard = String(hashString.prefix(1)) + return root + .appending(path: shard, directoryHint: .isDirectory) + .appending(path: "\(hashString).safetensors", directoryHint: .notDirectory) + } +} +``` + +- [ ] **Step 4: Run tests — verify pass** + +``` +cd MacMLXCore && swift test --filter PromptCacheKeyTests 2>&1 | tail -5 +``` + +Expected: 5 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheKey.swift \ + MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheKeyTests.swift +git commit -m "feat(prompt-cache): PromptCacheKey — sha256 identifier with 16-way disk shard" +``` + +--- + +## Task 2: PromptCacheStore — hot dict + cold safetensors, LRU on both + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore +import MLXLMCommon +import MLX + +final class PromptCacheStoreTests: XCTestCase { + + /// Build a minimal single-layer [KVCache] from known keys/values. 
+ /// Sufficient for roundtrip — shape is [1, n_heads, seq, head_dim]. + private func makeSyntheticCache(seqLen: Int) -> [any KVCache] { + let keys = MLXArray.zeros([1, 1, seqLen, 4]) + let values = MLXArray.ones([1, 1, seqLen, 4]) + let layer = KVCacheSimple() + _ = layer.update(keys: keys, values: values) + return [layer] + } + + private func tmpRoot() -> URL { + let url = FileManager.default.temporaryDirectory + .appending(path: "mlxkv-\(UUID().uuidString)", directoryHint: .isDirectory) + try? FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + func testPutThenGetHitsHotTier() async throws { + let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4) + let key = PromptCacheKey(modelID: "M", tokens: [1, 2, 3]) + let cache = makeSyntheticCache(seqLen: 3) + + await store.put(key: key, cache: cache) + let got = await store.get(key) + + XCTAssertNotNil(got) + } + + func testHotEvictionWritesToCold() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 should have been evicted from hot → written to cold. + let coldFile = k1.shardedFileURL(under: root) + XCTAssertTrue(FileManager.default.fileExists(atPath: coldFile.path)) + } + + func testColdLookupRestores() async throws { + let root = tmpRoot() + let store = PromptCacheStore(root: root, hotCapacity: 1) + + let k1 = PromptCacheKey(modelID: "M", tokens: [1]) + let k2 = PromptCacheKey(modelID: "M", tokens: [2]) + + await store.put(key: k1, cache: makeSyntheticCache(seqLen: 1)) + await store.put(key: k2, cache: makeSyntheticCache(seqLen: 1)) + + // k1 was evicted from hot, but cold should restore. 
+        let restored = await store.get(k1)
+        XCTAssertNotNil(restored)
+    }
+
+    func testMissReturnsNil() async {
+        let store = PromptCacheStore(root: tmpRoot(), hotCapacity: 4)
+        let k = PromptCacheKey(modelID: "M", tokens: [99])
+        let got = await store.get(k)
+        XCTAssertNil(got)
+    }
+}
+```
+
+- [ ] **Step 2: Run test — verify it fails**
+
+```
+cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -5
+```
+
+Expected: `no such type PromptCacheStore`.
+
+- [ ] **Step 3: Implement**
+
+`MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift`:
+
+```swift
+import Foundation
+import MLX
+import MLXLMCommon
+
+/// Two-tier prompt-cache store. Hot = in-memory LRU dict of
+/// `PromptCacheKey → [any KVCache]`. Cold = safetensors files
+/// on disk under `root/<shard>/<hash>.safetensors`, round-tripped
+/// through mlx-swift-lm's `savePromptCache` / `loadPromptCache`.
+///
+/// MVP LRU is strict — full eviction, no partial. v0.4.1+ may add
+/// size-based (byte-count) eviction instead of count-based.
+public actor PromptCacheStore {
+
+    private let root: URL
+    private let hotCapacity: Int
+
+    /// Ordered pair list simulates an LRU. Head = oldest.
+    /// Dictionary gives O(1) lookup; `order` gives O(n) touch but
+    /// `hotCapacity` is small (default 8), so linear scans are fine.
+    private var hot: [PromptCacheKey: [any KVCache]] = [:]
+    private var order: [PromptCacheKey] = []
+
+    public init(root: URL, hotCapacity: Int = 8) {
+        self.root = root
+        self.hotCapacity = hotCapacity
+        try? FileManager.default.createDirectory(
+            at: root,
+            withIntermediateDirectories: true
+        )
+    }
+
+    /// Insert or refresh. Evicts to disk if hot is full.
+    public func put(key: PromptCacheKey, cache: [any KVCache]) {
+        if hot[key] != nil {
+            touch(key)
+            hot[key] = cache
+            return
+        }
+        while hot.count >= hotCapacity, let oldest = order.first {
+            demote(oldest)
+        }
+        hot[key] = cache
+        order.append(key)
+    }
+
+    /// Return a cache snapshot, preferring the hot tier. 
On cold-hit, + /// promote into hot (possibly evicting another entry). + public func get(_ key: PromptCacheKey) -> [any KVCache]? { + if let cache = hot[key] { + touch(key) + return cache + } + let url = key.shardedFileURL(under: root) + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + do { + let (caches, _) = try loadPromptCache(url: url) + // Promote. + while hot.count >= hotCapacity, let oldest = order.first { + demote(oldest) + } + hot[key] = caches + order.append(key) + return caches + } catch { + return nil + } + } + + // MARK: - Private + + private func touch(_ key: PromptCacheKey) { + order.removeAll { $0 == key } + order.append(key) + } + + /// Persist an entry to disk + remove from hot. + private func demote(_ key: PromptCacheKey) { + guard let cache = hot.removeValue(forKey: key) else { + order.removeAll { $0 == key } + return + } + order.removeAll { $0 == key } + let url = key.shardedFileURL(under: root) + let parent = url.deletingLastPathComponent() + try? FileManager.default.createDirectory( + at: parent, + withIntermediateDirectories: true + ) + let metadata: [String: String] = [ + "modelID": key.modelID, + "tokenCount": String(key.tokenCount) + ] + try? savePromptCache(url: url, cache: cache, metadata: metadata) + } +} +``` + +- [ ] **Step 4: Run tests** + +``` +cd MacMLXCore && swift test --filter PromptCacheStoreTests 2>&1 | tail -10 +``` + +Expected: 4 tests PASS. If serialisation tests fail with MLXArray-dependent errors (e.g. Metal device unavailable in test), wrap the problematic assertions with `throw XCTSkip` and document the skip. 
+ +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \ + MacMLXCore/Tests/MacMLXCoreTests/PromptCache/PromptCacheStoreTests.swift +git commit -m "feat(prompt-cache): PromptCacheStore — hot LRU + cold safetensors" +``` + +--- + +## Task 3: Wire into MLXSwiftEngine — use cache on generate + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` + +**Strategy:** On each generate call, compute the full-prompt +`PromptCacheKey`. Look up in store. If hit, pass the restored cache to +`generate(input:cache:)` so the engine skips the shared prefill. After +generation, save the updated cache under the new extended key. + +mlx-swift-lm's `ModelContainer.generate(input:parameters:)` today hides +the `KVCache` array inside `TokenIterator`. We need the lower-level +API: `container.perform { context in TokenIterator(input:model:cache:processor:parameters:) }` +pattern, then iterate tokens ourselves. See +`.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` +and `ChatSession.swift` for reference wiring. + +- [ ] **Step 1: Read current MLXSwiftEngine.generate body** + +``` +cd /Users/kevin/Projects/macmlx +grep -n "func generate\|container.generate\|TokenIterator" MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift | head -10 +``` + +Verify the current integration point. Read lines around 200-280 in +that file to understand the `userInput` mapping and the +`container.prepare(input:)` call. + +- [ ] **Step 2: Read mlx-swift-lm's low-level iterator pattern** + +Open: +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/Evaluate.swift` + (find `TokenIterator` init taking `cache:` param — confirm signature) +- `MacMLXCore/.build/checkouts/mlx-swift-lm/Libraries/MLXLMCommon/ChatSession.swift` + (lines 177, 526–535 per research notes) + +Copy the iterator-with-external-cache pattern. 
Goal is a replacement +for the current `container.generate(...)` / `container.prepare(...)` +block that threads a caller-supplied `[any KVCache]` through. + +- [ ] **Step 3: Add cache lookup + update logic in generate** + +In `MLXSwiftEngine.swift`, add: + +1. A `promptCacheStore: PromptCacheStore` stored property initialised + against `DataRoot.macMLX("kv-cache")` (or `DataRoot.macMLX.appending(...)` + — match whichever API the rest of MacMLXCore uses). +2. Inside `generate()`, after `chatMessages` are prepared but before + the iterator runs: + ```swift + // Build token sequence for this turn. + let userInput = UserInput(chat: chatMessages) + let lmInput = try await container.prepare(input: userInput) + let inputTokens = lmInput.tokens.asArray(Int.self) + let key = PromptCacheKey(modelID: currentModel.id, tokens: inputTokens) + + // Look up prior cache. + let priorCache = await promptCacheStore.get(key) + ``` +3. Pass `priorCache` to whatever lower-level iterator you end up + using. If mlx-swift-lm's public surface only accepts the cache + indirectly (e.g. via `ChatSession.loadPromptCache(url:)`), call + that instead. +4. After `generate()` completes successfully, build the updated cache + snapshot and store it under the extended key (new tokens from the + stream appended): + ```swift + // After iteration finishes: + let finalTokens = inputTokens + generatedTokenIDs + let newKey = PromptCacheKey(modelID: currentModel.id, tokens: finalTokens) + if let finalCache = /* extract [any KVCache] */ { + await promptCacheStore.put(key: newKey, cache: finalCache) + } + ``` + +Exact code depends on which public surface mlx-swift-lm exposes for +our mlx-swift-lm version. If none of `ChatSession`, `TokenIterator`, +or `container.perform` is reachable with our pinned `3.31.3`, open +an upstream issue and ship the store + key without engine wiring in +this PR — still useful for the next PR. 
+ +- [ ] **Step 4: Build + integration smoke test** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three must pass. + +Manual: run the app, load a small model (Qwen3-0.6B if you have one), +send "hi" twice and observe Logs tab. Second `Starting generation` +line should show a noticeably lower prefill time (visible in total +duration OR in a new debug log we add for "cache hit: restoring N +tokens"). + +- [ ] **Step 5: Commit** + +```bash +cd /Users/kevin/Projects/macmlx +git add MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +git commit -m "feat(engine): wire PromptCacheStore into generate() + +On each generate call, hash the full input token sequence, look up +a prior cache snapshot, and pass it to the token iterator so the +shared prefix prefill is skipped. Save the extended snapshot after +generation completes so the next turn benefits. + +MVP keys on exact-prefix match; vLLM-style block hashing with +longest-common-prefix matching is v0.4.1+." +``` + +--- + +## Task 4: Settings UI + budget defaults + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — add `kvCacheHotMB: Int` (default 512) and `kvCacheColdGB: Int` (default 20) +- Modify: `macMLX/macMLX/Views/Settings/SettingsView.swift` — add a "KV Cache" section with two steppers + a "Clear Cache" button +- Modify: `MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift` — read the budget at init + +- [ ] **Step 1: Extend Settings struct** + +In `SettingsManager.swift`, find the `Settings` struct. Add next to +other defaults: + +```swift +/// Hot prompt-cache capacity in megabytes — in-memory only. +public var kvCacheHotMB: Int +/// Cold prompt-cache disk cap in gigabytes. 
+public var kvCacheColdGB: Int +``` + +Update `Settings.default` to include: + +```swift +kvCacheHotMB: 512, +kvCacheColdGB: 20, +``` + +Update the initialiser's parameter list in both places. + +- [ ] **Step 2: Run the existing Settings tests to make sure decoding still works with the new fields** + +``` +cd MacMLXCore && swift test --filter SettingsManagerTests 2>&1 | tail -10 +``` + +Expected: pass. Older settings JSON files from pre-v0.4 users lack +these fields — if the test for backward-compat decoding fails, add +`decodeIfPresent` defaults in a custom `init(from:)` on `Settings`. + +- [ ] **Step 3: Add a KV Cache section to Settings** + +Create `macMLX/macMLX/Views/Settings/KVCacheSection.swift`: + +```swift +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} +``` + +- [ ] **Step 4: Wire into SettingsView** + +Find `SettingsView.body` and add the section after the existing HTTP +Server section: + +```swift +KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } +) +.onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } +} +.onChange(of: kvCacheColdGB) { _, newValue in + Task { await 
appState.updateSettings { $0.kvCacheColdGB = newValue } } +} +``` + +Add matching `@State private var kvCacheHotMB: Int = 512` and +`kvCacheColdGB: Int = 20` near the other settings `@State` vars. +In `loadFromSettings(_:)` add `kvCacheHotMB = s.kvCacheHotMB; +kvCacheColdGB = s.kvCacheColdGB`. + +- [ ] **Step 5: Add `clearPromptCache()` to EngineCoordinator** + +In `macMLX/macMLX/App/EngineCoordinator.swift`: + +```swift +/// Blow away the prompt cache — both hot and cold tiers. Exposed +/// to Settings' "Clear All KV Caches" button. +public func clearPromptCache() async { + guard let engine = engine as? MLXSwiftEngine else { return } + await engine.clearPromptCache() +} +``` + +And in `MLXSwiftEngine`: + +```swift +public func clearPromptCache() async { + await promptCacheStore.clearAll() +} +``` + +Add `clearAll()` to `PromptCacheStore`: + +```swift +public func clearAll() { + hot.removeAll() + order.removeAll() + let root = self.root + try? FileManager.default.removeItem(at: root) + try? FileManager.default.createDirectory( + at: root, + withIntermediateDirectories: true + ) +} +``` + +- [ ] **Step 6: Build both targets** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three: `BUILD SUCCEEDED` / tests pass. 
+
+- [ ] **Step 7: Commit**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift \
+    MacMLXCore/Sources/MacMLXCore/PromptCache/PromptCacheStore.swift \
+    MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift \
+    macMLX/macMLX/App/EngineCoordinator.swift \
+    macMLX/macMLX/Views/Settings/KVCacheSection.swift \
+    macMLX/macMLX/Views/Settings/SettingsView.swift
+git commit -m "feat(kv-cache): Settings UI + budget defaults + Clear All button"
+```
+
+---
+
+## Task 5: CHANGELOG + push
+
+**Files:**
+- Modify: `CHANGELOG.md`
+
+- [ ] **Step 1: Add v0.4.0 entry**
+
+Prepend under `[Unreleased]`:
+
+```markdown
+## [Unreleased]
+
+### Added
+- **Prompt cache tiering.** Successive chat turns on the same model
+  now reuse the KV cache when the new prompt extends the previous
+  one — the shared prefix skips prefill. In-memory hot tier
+  (default 512 MB) backed by on-disk cold tier (default 20 GB) at
+  `~/.mac-mlx/kv-cache/`. Coding-assistant workflows (Claude Code,
+  Cursor, Zed) that re-send conversation history each turn see
+  dramatic reductions in time-to-first-token.
+- Settings → "KV Cache" section exposes budget steppers and a
+  "Clear All KV Caches" button.
+```
+
+- [ ] **Step 2: Commit + push**
+
+```bash
+cd /Users/kevin/Projects/macmlx
+git add CHANGELOG.md
+git commit -m "docs: v0.4 KV cache tiering changelog entry"
+git push -u origin feat/v0.4-kv-cache-tiering 2>&1 | tail -3
+```
+
+- [ ] **Step 3: Open PR**
+
+```bash
+gh pr create --base main --head feat/v0.4-kv-cache-tiering \
+  --title "v0.4 — KV cache tiering (hot RAM + cold SSD)" \
+  --body "$(cat <<'EOF'
+## Summary
+- PromptCacheKey: sha256 over (modelID, tokens) + 16-way disk shard
+- PromptCacheStore actor: hot in-memory LRU + cold safetensors on disk
+- Wired into MLXSwiftEngine.generate — shared-prefix prefill is skipped
+- Settings UI: hot/cold budget steppers + Clear All button
+
+## Test plan
+- [ ] swift test --filter PromptCache passes
+- [ ] Load a small model, send "hi" twice, observe reduced TTFT on second turn
+- [ ] xcodebuild + MacMLXCore swift test both green
+EOF
+)"
+```
+
+---
+
+## Self-Review
+
+- ✅ **Spec coverage:** hot tier, cold tier, LRU, disk sharding, engine
+  wiring, Settings UI, Clear button — all have tasks.
+- ⚠️ **Placeholder risk on Task 3 Step 3:** "Exact code depends on
+  which public surface mlx-swift-lm exposes" is honest but leaves
+  specifics for the implementer. Mitigation: Step 2 explicitly
+  directs reading concrete files and line numbers to deduce the
+  pattern.
+- ✅ **Type consistency:** `PromptCacheKey`, `PromptCacheStore`,
+  `clearPromptCache()`, `kvCacheHotMB`, `kvCacheColdGB` referenced
+  consistently.
+- ⚠️ **Known scope cut:** v0.4.0.1 follow-up will replace full-prompt
+  hash with vLLM-style chained block hash for longest-common-prefix
+  matching. MVP today only benefits when the NEW prompt fully contains
+  the OLD prompt as prefix — still a big win for chat continuations.
+
+---
+
+## Execution Handoff
+
+Plan complete and saved to
+`docs/superpowers/plans/2026-04-18-v0.4-kv-cache-tiering.md`.
+
+**1. Subagent-Driven (recommended)** — fresh subagent per task,
+review between.
+**2. 
Inline** — execute tasks in this session.
+
+Recommended: **1**. Each task hits a different file set cleanly;
+subagent isolation keeps each attempt focused.
diff --git a/macMLX/macMLX/App/EngineCoordinator.swift b/macMLX/macMLX/App/EngineCoordinator.swift
index e7b405e..35bbfef 100644
--- a/macMLX/macMLX/App/EngineCoordinator.swift
+++ b/macMLX/macMLX/App/EngineCoordinator.swift
@@ -116,6 +116,19 @@ public final class EngineCoordinator {
         }
     }
 
+    /// Blow away the prompt cache — both hot and cold tiers. Exposed
+    /// to Settings' "Clear All KV Caches" button.
+    ///
+    /// Today only the in-process `MLXSwiftEngine` carries a prompt
+    /// cache; the SwiftLM / Python-MLX detection-only stubs don't, so
+    /// downcasting and no-op-on-mismatch is the right shape. When
+    /// another engine grows a cache this will move onto the
+    /// `InferenceEngine` protocol.
+    public func clearPromptCache() async {
+        guard let engine = engine as? MLXSwiftEngine else { return }
+        await engine.clearPromptCache()
+    }
+
     /// Release the loaded model.
     public func unload() async {
         guard let engine else { return }
diff --git a/macMLX/macMLX/Views/Settings/KVCacheSection.swift b/macMLX/macMLX/Views/Settings/KVCacheSection.swift
new file mode 100644
index 0000000..2b78499
--- /dev/null
+++ b/macMLX/macMLX/Views/Settings/KVCacheSection.swift
@@ -0,0 +1,61 @@
+// KVCacheSection.swift
+// macMLX
+//
+// Settings section exposing the v0.4 KV-cache-tiering knobs: hot (RAM)
+// and cold (SSD) budget steppers plus a "Clear All KV Caches" button
+// that drops both tiers.
+//
+// MVP note: the steppers persist to `Settings.kvCacheHotMB` /
+// `Settings.kvCacheColdGB` but are not yet wired into the engine's
+// eviction logic. `PromptCacheStore` uses an 8-entry LRU today; a
+// byte-accurate budget and automatic cold-tier pruning land in
+// v0.4.0.1. See the `.help` strings below for the user-facing note.
+ +import SwiftUI +import MacMLXCore + +struct KVCacheSection: View { + @Binding var hotMB: Int + @Binding var coldGB: Int + var onClearCache: () -> Void + + var body: some View { + Section("KV Cache") { + HStack { + Text("Hot (RAM)") + Spacer() + Stepper( + value: $hotMB, + in: 128...8192, + step: 128 + ) { + Text(String(hotMB) + " MB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Takes effect in v0.4.0.1 — currently capped at 8 cache entries regardless of this slider.") + } + + HStack { + Text("Cold (SSD)") + Spacer() + Stepper( + value: $coldGB, + in: 1...500, + step: 1 + ) { + Text(String(coldGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + .help("Cold-tier cap is not enforced automatically in this MVP — use the Clear All button below to reclaim space. Automatic pruning lands in v0.4.0.1.") + } + + HStack { + Spacer() + Button("Clear All KV Caches", action: onClearCache) + .foregroundStyle(.red) + } + } + } +} diff --git a/macMLX/macMLX/Views/Settings/SettingsView.swift b/macMLX/macMLX/Views/Settings/SettingsView.swift index e29575b..a8595de 100644 --- a/macMLX/macMLX/Views/Settings/SettingsView.swift +++ b/macMLX/macMLX/Views/Settings/SettingsView.swift @@ -18,6 +18,8 @@ struct SettingsView: View { @State private var serverPort: Int = 8000 @State private var autoStartServer: Bool = false @State private var hfEndpoint: String = "https://huggingface.co" + @State private var kvCacheHotMB: Int = 512 + @State private var kvCacheColdGB: Int = 20 var body: some View { Form { @@ -49,6 +51,22 @@ struct SettingsView: View { } } + KVCacheSection( + hotMB: $kvCacheHotMB, + coldGB: $kvCacheColdGB, + onClearCache: { + Task { + await appState.coordinator.clearPromptCache() + } + } + ) + .onChange(of: kvCacheHotMB) { _, newValue in + Task { await appState.updateSettings { $0.kvCacheHotMB = newValue } } + } + .onChange(of: kvCacheColdGB) { _, newValue in + Task 
{ await appState.updateSettings { $0.kvCacheColdGB = newValue } } + } + downloadsSection rerunSetupSection @@ -151,6 +169,8 @@ struct SettingsView: View { serverPort = s.serverPort autoStartServer = s.autoStartServer hfEndpoint = s.hfEndpoint + kvCacheHotMB = s.kvCacheHotMB + kvCacheColdGB = s.kvCacheColdGB } private func showModelDirectoryPicker() {