From dd12c2f7171d852eed58340a0b97abd42dd405c2 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:01:50 +0700 Subject: [PATCH 1/6] =?UTF-8?q?docs:=20v0.4=20ModelPool=20plan=20=E2=80=94?= =?UTF-8?q?=205=20tasks,=20MVP=20with=20LRU=20+=20pin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../plans/2026-04-18-v0.4-model-pool.md | 772 ++++++++++++++++++ 1 file changed, 772 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-18-v0.4-model-pool.md diff --git a/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md b/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md new file mode 100644 index 0000000..53058e7 --- /dev/null +++ b/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md @@ -0,0 +1,772 @@ +# v0.4 ModelPool — Implementation Plan (MVP) + +> REQUIRED SUB-SKILL: superpowers:subagent-driven-development + +**Goal:** Multiple MLX models co-resident in memory, auto-evicted LRU when over budget, so chat / HTTP API / cold-swap can switch between pinned models without disk re-read. + +**Architecture:** `actor ModelPool` holds `[String: PooledEngine]`. Load is serialized under an internal inflight-task map. Budget is a user-settable GB cap (default = 50% of total RAM). LRU eviction on load when total estimated bytes exceed budget. Pinned models never evict. `EngineCoordinator` routes `load(_:)` through the pool and exposes `currentEngine` = most-recently-touched non-detection engine. + +**Tech Stack:** Swift 6 actor, `Memory.cacheLimit` / `MLX.GPU.clearCache()`, safetensors pre-scan for size estimate. + +**Branch:** `feat/v0.4-model-pool` (created). + +--- + +## Out of scope for this MVP + +- Live `DispatchSource` memory-pressure watcher — defer to v0.4.0.1. +- Pinned-set persistence to disk — defer to next PR. +- Per-model live RSS display — coarse "loaded models count" only. 
+ +--- + +## File Structure + +**Create:** +- `MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift` — the actor +- `MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift` — value type bundling engine + metadata +- `MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift` + +**Modify:** +- `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — add `maxResidentMemoryGB: Int` (default: `HardwareInfo.totalMemoryGB() / 2`) +- `macMLX/macMLX/App/EngineCoordinator.swift` — use ModelPool under the hood; track `currentModel` as most-recent-load +- `macMLX/macMLX/Views/Settings/` — add "Model Pool" section (budget slider + auto-evict toggle) +- `macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift` — pin/unpin toggle button + "loaded" indicator + +--- + +## Task 1: PooledEngineEntry value type + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift` + +- [ ] **Step 1: Implement** + +```swift +import Foundation + +/// Bookkeeping struct held by `ModelPool` per resident model. +/// The engine itself is not stored here (it's an actor in the pool's +/// dict); this is the value-type metadata. +public struct PooledEngineEntry: Sendable, Equatable { + /// Model identifier (matches `LocalModel.id`). + public let modelID: String + /// Estimated memory cost — sum of safetensors file sizes in + /// the model directory. Rough but stable for budget math; + /// actual MLX allocator usage can exceed this by 10–30%. + public let estimatedBytes: Int64 + /// Wall-clock time of last `engine(for:)` or `load(_:)` access. + public var lastAccess: Date + /// Pinned entries are never evicted by the LRU sweeper. 
+ public var isPinned: Bool + + public init( + modelID: String, + estimatedBytes: Int64, + lastAccess: Date = Date(), + isPinned: Bool = false + ) { + self.modelID = modelID + self.estimatedBytes = estimatedBytes + self.lastAccess = lastAccess + self.isPinned = isPinned + } +} + +/// Sum of `.safetensors` files under `directory`. Rough proxy for +/// how much memory the model needs when loaded. Returns 0 on any +/// filesystem error. +public func estimateModelSize(at directory: URL) -> Int64 { + guard let files = try? FileManager.default.contentsOfDirectory( + at: directory, + includingPropertiesForKeys: [.fileSizeKey] + ) else { + return 0 + } + return files + .filter { $0.pathExtension.lowercased() == "safetensors" } + .compactMap { url -> Int64? in + guard let values = try? url.resourceValues(forKeys: [.fileSizeKey]), + let size = values.fileSize else { return nil } + return Int64(size) + } + .reduce(0, +) +} +``` + +- [ ] **Step 2: No dedicated test — pure value type, exercised via ModelPoolTests in Task 2.** + +- [ ] **Step 3: Commit** + +```bash +git add MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift +git commit -m "feat(pool): PooledEngineEntry + estimateModelSize helper" +``` + +--- + +## Task 2: ModelPool actor with LRU + budget + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore + +/// Stub engine for pool tests — no Metal/MLX required. Implements +/// the minimum InferenceEngine surface the pool touches: load, +/// unload, engineID. Generate throws since it shouldn't be called. +private actor StubEngine: InferenceEngine { + let engineID: EngineID = .mlxSwift + public var version: String = "stub" + public var loadedModel: LocalModel? 
+ nonisolated public var activeEngineID: EngineID { .mlxSwift } + + func load(_ model: LocalModel) async throws { + loadedModel = model + } + func unload() async throws { + loadedModel = nil + } + func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { cont in + cont.finish(throwing: EngineError.notLoaded) + } + } +} + +final class ModelPoolTests: XCTestCase { + + private func mkModel(_ id: String, size: Int64 = 1_000_000_000) -> LocalModel { + LocalModel( + id: id, + displayName: id, + directory: FileManager.default.temporaryDirectory, + sizeBytes: size, + format: .mlx, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + } + + func testLoadAddsToPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + _ = try await pool.load(m) + let residents = await pool.residentModelIDs() + XCTAssertEqual(residents, ["A"]) + } + + func testLoadReuseExistingInstance() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + let e1 = try await pool.load(m) as AnyObject + let e2 = try await pool.load(m) as AnyObject + XCTAssertTrue(e1 === e2) + } + + func testOverBudgetEvictsLRU() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + // Budget has 2.5 GB, A+B = 2 GB fits. + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A+B+C = 3 GB — over. Oldest (A) evicted. 
+ let residents = await pool.residentModelIDs() + XCTAssertFalse(residents.contains("A")) + XCTAssertTrue(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testPinnedNotEvicted() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + await pool.setPinned("A", true) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A is pinned → B (next-oldest) evicted instead. + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.contains("A")) + XCTAssertFalse(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testUnloadRemovesFromPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A")) + await pool.unload("A") + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.isEmpty) + } + + func testEngineForReturnsNilWhenNotLoaded() async { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let e = await pool.engine(for: "A") + XCTAssertNil(e) + } +} +``` + +- [ ] **Step 2: Run — verify fail** + +``` +cd MacMLXCore && swift test --filter ModelPoolTests 2>&1 | tail -5 +``` + +Expected: `no such type ModelPool`. + +- [ ] **Step 3: Implement** + +`MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift`: + +```swift +import Foundation + +/// Actor managing multiple resident `InferenceEngine` instances with +/// LRU + explicit pinning + byte-budget auto-evict. 
Use cases:
+///
+/// - Swap between chat models without re-reading weights from disk
+/// - External API cold-swap without unloading the GUI's current model
+/// - Keep a small always-ready model pinned alongside a big one that
+///   auto-evicts on memory pressure
+///
+/// Load path is serialised under `loadTasks` to avoid two concurrent
+/// requests double-loading the same weights — the second caller awaits
+/// the first's completion.
+public actor ModelPool {
+
+    public typealias EngineFactory = @Sendable (LocalModel) -> any InferenceEngine
+
+    // MARK: - State
+
+    /// Currently resident engines, keyed by model ID.
+    private var engines: [String: any InferenceEngine] = [:]
+    /// Bookkeeping keyed by model ID.
+    private var entries: [String: PooledEngineEntry] = [:]
+    /// In-flight loads so concurrent callers deduplicate.
+    private var loadTasks: [String: Task<any InferenceEngine, Error>] = [:]
+
+    private let engineFactory: EngineFactory
+
+    // MARK: - Budget
+
+    /// Maximum total estimated bytes that may be resident. Exceeding
+    /// this triggers LRU eviction (pinned entries are spared).
+    public var maxBytes: Int64
+
+    public init(
+        maxBytes: Int64,
+        engineFactory: @escaping EngineFactory
+    ) {
+        self.maxBytes = maxBytes
+        self.engineFactory = engineFactory
+    }
+
+    public func setMaxBytes(_ bytes: Int64) {
+        self.maxBytes = bytes
+    }
+
+    // MARK: - Public
+
+    public func residentModelIDs() -> [String] {
+        Array(engines.keys).sorted()
+    }
+
+    public func engine(for modelID: String) -> (any InferenceEngine)? {
+        guard let e = engines[modelID] else { return nil }
+        // Touch LRU timestamp.
+        if var entry = entries[modelID] {
+            entry.lastAccess = Date()
+            entries[modelID] = entry
+        }
+        return e
+    }
+
+    public func setPinned(_ modelID: String, _ pinned: Bool) {
+        guard var entry = entries[modelID] else { return }
+        entry.isPinned = pinned
+        entries[modelID] = entry
+    }
+
+    public func isPinned(_ modelID: String) -> Bool {
+        entries[modelID]?.isPinned ?? 
false + } + + public func unload(_ modelID: String) async { + if let e = engines.removeValue(forKey: modelID) { + try? await e.unload() + } + entries.removeValue(forKey: modelID) + } + + /// Return an engine with `model.id` loaded. Reuses an existing + /// entry when possible. Evicts LRU entries as needed to stay + /// within `maxBytes`. Concurrent loads of the same ID share. + @discardableResult + public func load(_ model: LocalModel) async throws -> any InferenceEngine { + // Already loaded? Touch and return. + if let e = engines[model.id] { + if var entry = entries[model.id] { + entry.lastAccess = Date() + entries[model.id] = entry + } + return e + } + // In-flight load by another caller? Join it. + if let pending = loadTasks[model.id] { + return try await pending.value + } + + // Evict to fit before starting the load, using the model's + // sizeBytes (or our estimate) as the cost. + let cost = model.sizeBytes > 0 ? model.sizeBytes : estimateModelSize(at: model.directory) + evict(toFit: cost) + + let factory = engineFactory + let task = Task { () throws -> any InferenceEngine in + let engine = factory(model) + try await engine.load(model) + return engine + } + loadTasks[model.id] = task + do { + let engine = try await task.value + loadTasks.removeValue(forKey: model.id) + engines[model.id] = engine + entries[model.id] = PooledEngineEntry( + modelID: model.id, + estimatedBytes: cost + ) + return engine + } catch { + loadTasks.removeValue(forKey: model.id) + throw error + } + } + + // MARK: - Eviction + + private func currentResidentBytes() -> Int64 { + entries.values.map(\.estimatedBytes).reduce(0, +) + } + + /// Evict LRU non-pinned entries until (currentBytes + incoming) fits. + private func evict(toFit incoming: Int64) { + var target = maxBytes - incoming + if target < 0 { target = 0 } + + // Candidates: non-pinned, oldest first. 
+ let candidates = entries.values + .filter { !$0.isPinned } + .sorted { $0.lastAccess < $1.lastAccess } + + var current = currentResidentBytes() + var iterator = candidates.makeIterator() + while current > target, let victim = iterator.next() { + if let e = engines.removeValue(forKey: victim.modelID) { + Task { try? await e.unload() } + } + entries.removeValue(forKey: victim.modelID) + current -= victim.estimatedBytes + } + } +} +``` + +- [ ] **Step 4: Run tests — verify pass** + +``` +cd MacMLXCore && swift test --filter ModelPoolTests 2>&1 | tail -10 +``` + +Expected: 6/6 PASS. + +- [ ] **Step 5: Commit** + +```bash +git add MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift \ + MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift +git commit -m "feat(pool): ModelPool actor — LRU + pinning + byte-budget eviction" +``` + +--- + +## Task 3: Route EngineCoordinator through ModelPool + +**Files:** +- Modify: `macMLX/macMLX/App/EngineCoordinator.swift` + +**Strategy:** keep `EngineCoordinator.load(_:)` / `unload()` / `generate(_:)` public API intact so no callers break. Internally, replace the single `engine` property with a `pool: ModelPool` + `currentModelID: String?`. `load(model)` delegates to `pool.load(model)`. `unload()` unloads currently-active from pool (leaves others resident). `generate()` fetches via `pool.engine(for: currentModelID)`. + +- [ ] **Step 1: Read current EngineCoordinator** + +``` +cd /Users/kevin/Projects/macmlx +grep -n "class EngineCoordinator\|public private(set) var\|private var engine\|func load\|func unload\|func generate\|switchTo" macMLX/macMLX/App/EngineCoordinator.swift | head -20 +``` + +Understand existing members: `engineID`, `engineVersion`, `currentModel`, `loadedModel`, `status`, `tokensGeneratedTotal`, `onModelLoaded`, plus methods `switchTo(_:)`, `load(_:)`, `unload()`, `generate(_:)`, `clearPromptCache()`. 
+ +- [ ] **Step 2: Refactor — minimal surgical change** + +Replace `private var engine: (any InferenceEngine)?` with: + +```swift +/// Pool of resident engines. Single-model-at-a-time behaviour (today) +/// is preserved by only calling `pool.load` when `currentModel` needs +/// to change; ModelPool lets us lazily keep previously-loaded ones +/// resident without explicit unload, for v0.4.0+ multi-model workflows. +private let pool: ModelPool + +/// Kept as an optional — nil when no model is active. +/// `activeEngine` (nonisolated accessor used by HummingbirdServer) +/// reads this and does a synchronous dict lookup against the pool. +public private(set) var currentModelID: String? +``` + +Update `activeEngine` computed: + +```swift +public var activeEngine: (any InferenceEngine)? { + get async { + guard let id = currentModelID else { return nil } + return await pool.engine(for: id) + } +} +``` + +Note: changing `activeEngine` from sync to async breaks HummingbirdServer's call. Inspect `startServer` in AppState.swift — it does `coordinator.activeEngine` synchronously. Change that site to `await coordinator.activeEngine` and propagate. 
+ +In `init`, construct the pool: + +```swift +let totalGB = HardwareInfo.totalMemoryGB() +let budgetBytes = Int64(Double(totalGB) * 0.5 * 1_073_741_824) +self.pool = ModelPool(maxBytes: budgetBytes) { _ in + MLXSwiftEngine() +} +``` + +Rewrite `load(_:)`: + +```swift +public func load(_ model: LocalModel) async -> Result { + status = .loading(model: model.id) + await logs.log("Loading model: \(model.id)", level: .info, category: .engine) + do { + _ = try await pool.load(model) + currentModel = model + loadedModel = model + currentModelID = model.id + status = .ready(model: model.id) + await refreshEngineVersion() + await onModelLoaded?(model) + await logs.log("Model loaded: \(model.id)", level: .info, category: .engine) + return .success(()) + } catch { + status = .error(error.localizedDescription) + return .failure(error) + } +} +``` + +Rewrite `unload()`: + +```swift +public func unload() async { + guard let id = currentModelID else { return } + await pool.unload(id) + currentModelID = nil + currentModel = nil + loadedModel = nil + status = .idle + await logs.log("Engine unloaded", level: .info, category: .engine) +} +``` + +Rewrite `generate(_:)`: + +```swift +public func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { continuation in + Task { + guard let id = currentModelID, + let engine = await pool.engine(for: id) else { + continuation.finish(throwing: EngineError.notLoaded) + return + } + do { + for try await chunk in engine.generate(request) { + continuation.yield(chunk) + tokensGeneratedTotal += 1 + } + continuation.finish() + } catch { + continuation.finish(throwing: error) + } + } + } +} +``` + +Add pin pass-throughs: + +```swift +public func setPinned(_ modelID: String, _ pinned: Bool) async { + await pool.setPinned(modelID, pinned) +} +public func residentModelIDs() async -> [String] { + await pool.residentModelIDs() +} +``` + +Update `clearPromptCache()`: the old implementation downcast `engine` to 
`MLXSwiftEngine`. Now iterate pool: + +```swift +public func clearPromptCache() async { + for id in await pool.residentModelIDs() { + if let mlx = await pool.engine(for: id) as? MLXSwiftEngine { + await mlx.clearPromptCache() + } + } +} +``` + +- [ ] **Step 3: Fix call sites that relied on sync `activeEngine`** + +``` +grep -rn "coordinator.activeEngine\|coordinator.engine" macMLX/macMLX/ MacMLXCore/Sources/MacMLXCore/ | grep -v ".build" +``` + +Update each `coordinator.activeEngine` to `await coordinator.activeEngine`. In SwiftUI views that aren't async contexts, hop into a `Task`. + +- [ ] **Step 4: Build + test** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three must succeed. If any call site refuses to go async (e.g. a `var` binding in a non-async computed property), wrap with `Task { ... }` or rearchitect that specific call — do not make a sync bridge that blocks the main thread. + +- [ ] **Step 5: Commit** + +```bash +git add macMLX/macMLX/App/EngineCoordinator.swift macMLX/macMLX/App/AppState.swift +# plus any call-site fixups +git commit -m "feat(pool): route EngineCoordinator load/unload/generate through ModelPool + +Single active model is still the UX default — currentModelID is the +one shown in the toolbar + menu bar. But previously-loaded models +now stay resident in the pool until LRU evicts, so the next +switch-back skips disk read. Engines spawn via an injected factory +so tests can stub." 
+``` + +--- + +## Task 4: Settings UI (budget slider) + Models-tab pin toggle + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — `maxResidentMemoryGB: Int` +- Modify: `macMLX/macMLX/Views/Settings/` — new `ModelPoolSection.swift` + wire into `SettingsView` +- Modify: `macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift` — pin toggle +- Modify: `macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift` — pass pin state + callback + +### Step 1: Extend Settings + +`SettingsManager.swift`: + +```swift +public var maxResidentMemoryGB: Int +``` + +In `Settings.default`: + +```swift +maxResidentMemoryGB: max(4, Int(HardwareInfo.totalMemoryGB()) / 2), +``` + +Add `decodeIfPresent` default to the custom `init(from:)` so pre-v0.4 settings JSONs decode with a sensible fallback: + +```swift +self.maxResidentMemoryGB = (try c.decodeIfPresent(Int.self, forKey: .maxResidentMemoryGB)) + ?? max(4, Int(HardwareInfo.totalMemoryGB()) / 2) +``` + +### Step 2: ModelPoolSection.swift + +Create `macMLX/macMLX/Views/Settings/ModelPoolSection.swift`: + +```swift +import SwiftUI +import MacMLXCore + +struct ModelPoolSection: View { + @Binding var maxResidentGB: Int + + var body: some View { + Section("Model Pool") { + HStack { + Text("Max resident memory") + Spacer() + Stepper( + value: $maxResidentGB, + in: 2...256, + step: 1 + ) { + Text(String(maxResidentGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + .help("When multiple loaded models exceed this, the least-recently-used non-pinned one is unloaded.") + } + } +} +``` + +### Step 3: Wire into SettingsView + +Add `@State private var maxResidentMemoryGB: Int = 8` + load from settings. Render the section near the KV Cache section. 
`.onChange(of: maxResidentMemoryGB)` → `appState.updateSettings { $0.maxResidentMemoryGB = newValue }` + propagate to coordinator: + +```swift +.onChange(of: maxResidentMemoryGB) { _, newValue in + Task { + await appState.updateSettings { $0.maxResidentMemoryGB = newValue } + await appState.coordinator.setPoolBudget( + bytes: Int64(newValue) * 1_073_741_824 + ) + } +} +``` + +Add `setPoolBudget(bytes:)` on EngineCoordinator: + +```swift +public func setPoolBudget(bytes: Int64) async { + await pool.setMaxBytes(bytes) +} +``` + +### Step 4: LocalModelRow pin toggle + +Add a new param: + +```swift +let isPinned: Bool +let onTogglePin: () -> Void +``` + +Render a pin icon button in the row's trailing metadata area: + +```swift +Button { + onTogglePin() +} label: { + Image(systemName: isPinned ? "pin.fill" : "pin") + .foregroundStyle(isPinned ? .orange : .secondary) +} +.buttonStyle(.plain) +.help(isPinned ? "Pinned — won't auto-evict" : "Pin to keep resident") +``` + +### Step 5: Wire pin callback from ModelLibraryView + +In `ModelLibraryView.localTab`, the `LocalModelRow(...)` call gains: + +```swift +isPinned: viewModel.pinnedModelIDs.contains(model.id), +onTogglePin: { Task { await viewModel.togglePin(model) } } +``` + +Add to `ModelLibraryViewModel`: + +```swift +var pinnedModelIDs: Set = [] + +func togglePin(_ model: LocalModel) async { + let nowPinned = !pinnedModelIDs.contains(model.id) + await appState.coordinator.setPinned(model.id, nowPinned) + if nowPinned { + pinnedModelIDs.insert(model.id) + } else { + pinnedModelIDs.remove(model.id) + } +} +``` + +### Step 6: Build + test + commit + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +```bash +git add MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift \ + macMLX/macMLX/App/EngineCoordinator.swift \ + 
macMLX/macMLX/Views/Settings/ModelPoolSection.swift \ + macMLX/macMLX/Views/Settings/SettingsView.swift \ + macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift \ + macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift \ + macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift +git commit -m "feat(pool): Settings max-resident slider + Models-tab pin toggle" +``` + +--- + +## Task 5: CHANGELOG + PR + +- [ ] Prepend under `[Unreleased]` in CHANGELOG.md: + +```markdown +### Added +- **Multi-model pool** (v0.4.0 engine parity, part 2 of 3). Load + multiple models at once — previously the engine unloaded one to + load another. Pool is bounded by a user-configurable resident + memory cap (Settings → Model Pool; default 50% of total RAM). + Least-recently-used non-pinned models auto-evict when the cap is + exceeded. Pin a model from its row in the Models tab to keep it + resident regardless of LRU order. +``` + +- [ ] **Push + open PR** + +```bash +git add CHANGELOG.md +git commit -m "docs: v0.4 ModelPool changelog entry" +git push -u origin feat/v0.4-model-pool 2>&1 | tail -3 +gh pr create --base main --head feat/v0.4-model-pool \ + --title "v0.4 — ModelPool (multi-model + LRU + pin)" \ + --body "Second of three v0.4.0 engine-parity sub-features. ModelPool actor holds multiple InferenceEngine instances, LRU-evicts under a byte budget, respects pinned entries. EngineCoordinator now delegates load/unload/generate to the pool. Settings slider + Models-tab pin toggle expose the user-visible controls. 6 new ModelPoolTests passing." +``` + +--- + +## Self-Review + +- ✅ Spec coverage: budget, LRU, pin, UI both-sides, test. +- ✅ No placeholders — every step has concrete code or commands. +- ⚠️ Task 3 Step 3 blast radius — switching `activeEngine` from sync + to async computed touches HummingbirdServer wiring in AppState. + Budget extra time for this. 
+- ⚠️ The engine factory `{ _ in MLXSwiftEngine() }` construction + ignores the `LocalModel` — current `MLXSwiftEngine()` takes no + args and `load(model)` is separate. Verify the current MLXSwiftEngine + init signature before committing Task 3. + +--- + +## Execution Handoff + +Subagent-driven execution. Same pattern as the KV cache PR. From 71e98d8e8d31d60f63ab11992a3653dd58b9c6c4 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:02:58 +0700 Subject: [PATCH 2/6] feat(pool): PooledEngineEntry + estimateModelSize helper Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ModelPool/PooledEngineEntry.swift | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift diff --git a/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift b/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift new file mode 100644 index 0000000..aa77465 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift @@ -0,0 +1,49 @@ +import Foundation + +/// Bookkeeping struct held by `ModelPool` per resident model. +/// The engine itself is not stored here (it's an actor in the pool's +/// dict); this is the value-type metadata. +public struct PooledEngineEntry: Sendable, Equatable { + /// Model identifier (matches `LocalModel.id`). + public let modelID: String + /// Estimated memory cost — sum of safetensors file sizes in + /// the model directory. Rough but stable for budget math; + /// actual MLX allocator usage can exceed this by 10–30%. + public let estimatedBytes: Int64 + /// Wall-clock time of last `engine(for:)` or `load(_:)` access. + public var lastAccess: Date + /// Pinned entries are never evicted by the LRU sweeper. 
+ public var isPinned: Bool + + public init( + modelID: String, + estimatedBytes: Int64, + lastAccess: Date = Date(), + isPinned: Bool = false + ) { + self.modelID = modelID + self.estimatedBytes = estimatedBytes + self.lastAccess = lastAccess + self.isPinned = isPinned + } +} + +/// Sum of `.safetensors` files under `directory`. Rough proxy for +/// how much memory the model needs when loaded. Returns 0 on any +/// filesystem error. +public func estimateModelSize(at directory: URL) -> Int64 { + guard let files = try? FileManager.default.contentsOfDirectory( + at: directory, + includingPropertiesForKeys: [.fileSizeKey] + ) else { + return 0 + } + return files + .filter { $0.pathExtension.lowercased() == "safetensors" } + .compactMap { url -> Int64? in + guard let values = try? url.resourceValues(forKeys: [.fileSizeKey]), + let size = values.fileSize else { return nil } + return Int64(size) + } + .reduce(0, +) +} From 6e47fe4d60bb640cc7b9d9768c88c1f8eba6bfea Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:04:14 +0700 Subject: [PATCH 3/6] =?UTF-8?q?feat(pool):=20ModelPool=20actor=20=E2=80=94?= =?UTF-8?q?=20LRU=20+=20pinning=20+=20byte-budget=20eviction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../MacMLXCore/ModelPool/ModelPool.swift | 151 ++++++++++++++++++ .../ModelPool/ModelPoolTests.swift | 102 ++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift create mode 100644 MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift diff --git a/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift b/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift new file mode 100644 index 0000000..6c0c726 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift @@ -0,0 +1,151 @@ +import Foundation + +/// Actor managing multiple resident `InferenceEngine` instances 
with
+/// LRU + explicit pinning + byte-budget auto-evict. Use cases:
+///
+/// - Swap between chat models without re-reading weights from disk
+/// - External API cold-swap without unloading the GUI's current model
+/// - Keep a small always-ready model pinned alongside a big one that
+///   auto-evicts on memory pressure
+///
+/// Load path is serialised under `loadTasks` to avoid two concurrent
+/// requests double-loading the same weights — the second caller awaits
+/// the first's completion.
+public actor ModelPool {
+
+    public typealias EngineFactory = @Sendable (LocalModel) -> any InferenceEngine
+
+    // MARK: - State
+
+    /// Currently resident engines, keyed by model ID.
+    private var engines: [String: any InferenceEngine] = [:]
+    /// Bookkeeping keyed by model ID.
+    private var entries: [String: PooledEngineEntry] = [:]
+    /// In-flight loads so concurrent callers deduplicate.
+    private var loadTasks: [String: Task<any InferenceEngine, Error>] = [:]
+
+    private let engineFactory: EngineFactory
+
+    // MARK: - Budget
+
+    /// Maximum total estimated bytes that may be resident. Exceeding
+    /// this triggers LRU eviction (pinned entries are spared).
+    public var maxBytes: Int64
+
+    public init(
+        maxBytes: Int64,
+        engineFactory: @escaping EngineFactory
+    ) {
+        self.maxBytes = maxBytes
+        self.engineFactory = engineFactory
+    }
+
+    public func setMaxBytes(_ bytes: Int64) {
+        self.maxBytes = bytes
+    }
+
+    // MARK: - Public
+
+    public func residentModelIDs() -> [String] {
+        Array(engines.keys).sorted()
+    }
+
+    public func engine(for modelID: String) -> (any InferenceEngine)? {
+        guard let e = engines[modelID] else { return nil }
+        // Touch LRU timestamp. 
+ if var entry = entries[modelID] { + entry.lastAccess = Date() + entries[modelID] = entry + } + return e + } + + public func setPinned(_ modelID: String, _ pinned: Bool) { + guard var entry = entries[modelID] else { return } + entry.isPinned = pinned + entries[modelID] = entry + } + + public func isPinned(_ modelID: String) -> Bool { + entries[modelID]?.isPinned ?? false + } + + public func unload(_ modelID: String) async { + if let e = engines.removeValue(forKey: modelID) { + try? await e.unload() + } + entries.removeValue(forKey: modelID) + } + + /// Return an engine with `model.id` loaded. Reuses an existing + /// entry when possible. Evicts LRU entries as needed to stay + /// within `maxBytes`. Concurrent loads of the same ID share. + @discardableResult + public func load(_ model: LocalModel) async throws -> any InferenceEngine { + // Already loaded? Touch and return. + if let e = engines[model.id] { + if var entry = entries[model.id] { + entry.lastAccess = Date() + entries[model.id] = entry + } + return e + } + // In-flight load by another caller? Join it. + if let pending = loadTasks[model.id] { + return try await pending.value + } + + // Evict to fit before starting the load, using the model's + // sizeBytes (or our estimate) as the cost. + let cost = model.sizeBytes > 0 ? 
model.sizeBytes : estimateModelSize(at: model.directory) + evict(toFit: cost) + + let factory = engineFactory + let task = Task { () throws -> any InferenceEngine in + let engine = factory(model) + try await engine.load(model) + return engine + } + loadTasks[model.id] = task + do { + let engine = try await task.value + loadTasks.removeValue(forKey: model.id) + engines[model.id] = engine + entries[model.id] = PooledEngineEntry( + modelID: model.id, + estimatedBytes: cost + ) + return engine + } catch { + loadTasks.removeValue(forKey: model.id) + throw error + } + } + + // MARK: - Eviction + + private func currentResidentBytes() -> Int64 { + entries.values.map(\.estimatedBytes).reduce(0, +) + } + + /// Evict LRU non-pinned entries until (currentBytes + incoming) fits. + private func evict(toFit incoming: Int64) { + var target = maxBytes - incoming + if target < 0 { target = 0 } + + // Candidates: non-pinned, oldest first. + let candidates = entries.values + .filter { !$0.isPinned } + .sorted { $0.lastAccess < $1.lastAccess } + + var current = currentResidentBytes() + var iterator = candidates.makeIterator() + while current > target, let victim = iterator.next() { + if let e = engines.removeValue(forKey: victim.modelID) { + Task { try? await e.unload() } + } + entries.removeValue(forKey: victim.modelID) + current -= victim.estimatedBytes + } + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift new file mode 100644 index 0000000..364ee0f --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift @@ -0,0 +1,102 @@ +import XCTest +@testable import MacMLXCore + +/// Stub engine for pool tests — no Metal/MLX required. Implements +/// the minimum InferenceEngine surface the pool touches: load, +/// unload, engineID. Generate throws since it shouldn't be called. 
+private actor StubEngine: InferenceEngine { + nonisolated let engineID: EngineID = .mlxSwift + var status: EngineStatus = .idle + var loadedModel: LocalModel? + var version: String = "stub" + + func load(_ model: LocalModel) async throws { + loadedModel = model + status = .ready(model: model.id) + } + + func unload() async throws { + loadedModel = nil + status = .idle + } + + nonisolated func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { cont in + cont.finish(throwing: EngineError.modelNotLoaded) + } + } + + func healthCheck() async -> Bool { true } +} + +final class ModelPoolTests: XCTestCase { + + private func mkModel(_ id: String, size: Int64 = 1_000_000_000) -> LocalModel { + LocalModel( + id: id, + displayName: id, + directory: FileManager.default.temporaryDirectory, + sizeBytes: size, + format: .mlx, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + } + + func testLoadAddsToPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + _ = try await pool.load(m) + let residents = await pool.residentModelIDs() + XCTAssertEqual(residents, ["A"]) + } + + func testLoadReuseExistingInstance() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + let e1 = try await pool.load(m) as AnyObject + let e2 = try await pool.load(m) as AnyObject + XCTAssertTrue(e1 === e2) + } + + func testOverBudgetEvictsLRU() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + // Budget has 2.5 GB, A+B = 2 GB fits. + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A+B+C = 3 GB — over. Oldest (A) evicted. 
+ let residents = await pool.residentModelIDs() + XCTAssertFalse(residents.contains("A")) + XCTAssertTrue(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testPinnedNotEvicted() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + await pool.setPinned("A", true) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A is pinned → B (next-oldest) evicted instead. + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.contains("A")) + XCTAssertFalse(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testUnloadRemovesFromPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A")) + await pool.unload("A") + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.isEmpty) + } + + func testEngineForReturnsNilWhenNotLoaded() async { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let e = await pool.engine(for: "A") + XCTAssertNil(e) + } +} From 978bbf58ab0aaf7cc961ae73faa9665d66ee584d Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:07:42 +0700 Subject: [PATCH 4/6] feat(pool): route EngineCoordinator load/unload/generate through ModelPool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single active model is still the UX default — currentModelID is the one shown in the toolbar + menu bar. But previously-loaded models now stay resident in the pool until LRU evicts, so the next switch-back skips disk read. Engines spawn via an injected factory so tests can stub. 
--- macMLX/macMLX/App/AppState.swift | 2 +- macMLX/macMLX/App/EngineCoordinator.swift | 143 ++++++++++++++++------ 2 files changed, 106 insertions(+), 39 deletions(-) diff --git a/macMLX/macMLX/App/AppState.swift b/macMLX/macMLX/App/AppState.swift index 90b59bf..e67a6c2 100644 --- a/macMLX/macMLX/App/AppState.swift +++ b/macMLX/macMLX/App/AppState.swift @@ -175,7 +175,7 @@ public final class AppState { /// to `LogManager`. public func startServer() async { guard server == nil, !isServerToggling else { return } - guard let engine = coordinator.activeEngine else { + guard let engine = await coordinator.activeEngine else { await logs.log( "Cannot start server: no engine loaded", level: .warning, diff --git a/macMLX/macMLX/App/EngineCoordinator.swift b/macMLX/macMLX/App/EngineCoordinator.swift index 35bbfef..671d4c9 100644 --- a/macMLX/macMLX/App/EngineCoordinator.swift +++ b/macMLX/macMLX/App/EngineCoordinator.swift @@ -5,6 +5,12 @@ // `status` and `currentModel` for live updates; load/unload/generate go // through the coordinator so the rest of the app never holds a direct // reference to a specific engine implementation. +// +// v0.4+: the coordinator delegates model lifecycle to a `ModelPool` actor +// so previously-loaded (non-pinned) models stay resident until the pool's +// byte budget forces LRU eviction. Single-active-model UX is preserved — +// `currentModel` / `activeEngine` point at the most-recently-loaded +// entry and drive the toolbar + menu bar. import Foundation import MacMLXCore @@ -22,8 +28,8 @@ public final class EngineCoordinator { /// Active engine's `version` string (e.g. `"mlx-swift-lm 3.31.3"`), /// cached synchronously on the @MainActor so SwiftUI views and the /// benchmark feature can read it without awaiting the engine actor. - /// Refreshed on every `switchTo(_:)`; empty string if no engine is - /// wired (detection-only engines). 
+    /// Refreshed on every `switchTo(_:)` / `load(_:)`; empty string if
+    /// no engine is wired (detection-only engines).
     public private(set) var engineVersion: String = ""
 
     /// Tokens generated by the current process — used for the menu bar
@@ -40,12 +46,26 @@ public final class EngineCoordinator {
 
     // MARK: - Private state
 
-    private var engine: (any InferenceEngine)?
+    /// Pool of resident engines. Single-model-at-a-time UX is preserved
+    /// because only the most recent `load(_:)` target is `currentModel`;
+    /// ModelPool keeps previously-loaded engines resident (subject
+    /// to its byte budget) so the next switch-back skips disk read.
+    private let pool: ModelPool
+
+    /// ID of the model the GUI currently treats as active. `nil` when
+    /// no model has been loaded (or when the last load was for a
+    /// detection-only engine).
+    private var currentModelID: String?
 
     /// Current active engine, exposed to lifecycle consumers like the
     /// HummingbirdServer that need an `any InferenceEngine` reference.
-    /// Nonisolated-read-only snapshot — callers should not mutate.
-    public var activeEngine: (any InferenceEngine)? { engine }
+    /// Async because the pool is an actor — callers must `await`.
+    public var activeEngine: (any InferenceEngine)? {
+        get async {
+            guard let id = currentModelID else { return nil }
+            return await pool.engine(for: id)
+        }
+    }
 
     private let logs: LogManager
 
@@ -53,40 +73,53 @@
     public init(logs: LogManager) {
         self.logs = logs
-        self.engine = MLXSwiftEngine()
         self.engineID = .mlxSwift
-        Task { [weak self] in await self?.refreshEngineVersion() }
+        // Default budget: 50% of total RAM (GB = 10^9 bytes — Apple's
+        // decimal convention). Task 4 will persist this in
+        // SettingsManager and push updates through `setPoolBudget(bytes:)`.
+ let totalGB = MemoryProbe.totalMemoryGB() + let budgetGB = max(4.0, totalGB * 0.5) + let budgetBytes = Int64(budgetGB * 1_000_000_000) + self.pool = ModelPool(maxBytes: budgetBytes) { _ in + MLXSwiftEngine() + } } // MARK: - Engine selection /// Swap to a different engine implementation. v0.1 only supports /// `.mlxSwift` (in-process); the other two are detection-only stubs. + /// Changing engineID alone doesn't affect the pool — the pool's + /// factory still produces `MLXSwiftEngine` instances because that's + /// the only real engine in v0.4. Detection-only engines cause + /// `load(_:)` to fail fast at status check time. public func switchTo(_ id: EngineID) { engineID = id - switch id { - case .mlxSwift: - engine = MLXSwiftEngine() - case .swiftLM, .pythonMLX: - // v0.1 detection only — leave engine nil so load() fails fast. - engine = nil - } Task { [weak self] in await self?.refreshEngineVersion() } } /// Pull the active engine's `version` string onto the coordinator's - /// synchronous state. Called on init + after every `switchTo(_:)`. + /// synchronous state. Called on init + after every `switchTo(_:)` + /// / successful `load(_:)`. private func refreshEngineVersion() async { - guard let engine else { engineVersion = ""; return } + guard let id = currentModelID, + let engine = await pool.engine(for: id) else { + engineVersion = "" + return + } engineVersion = await engine.version } // MARK: - Lifecycle - /// Load a local model into the active engine. + /// Load a local model through the pool. The previously-loaded + /// model stays resident (subject to LRU + byte budget); only the + /// newly-loaded model becomes the `currentModel`. Detection-only + /// engines (`.swiftLM`, `.pythonMLX`) fail fast since the pool's + /// factory always produces `MLXSwiftEngine`. 
@discardableResult public func load(_ model: LocalModel) async -> Result { - guard let engine else { + guard engineID == .mlxSwift else { let err = EngineError.unsupportedOperation( "Engine \(engineID.rawValue) is detection-only in v0.1" ) @@ -96,9 +129,11 @@ public final class EngineCoordinator { status = .loading(model: model.id) await logs.log("Loading model: \(model.id)", level: .info, category: .engine) do { - try await engine.load(model) + _ = try await pool.load(model) currentModel = model + currentModelID = model.id status = .ready(model: model.id) + await refreshEngineVersion() await logs.log("Model loaded: \(model.id)", level: .info, category: .engine) // Fire the post-load hook so AppState can rehydrate per-model // state (e.g. Parameters Inspector overrides) even if the @@ -119,40 +154,48 @@ public final class EngineCoordinator { /// Blow away the prompt cache — both hot and cold tiers. Exposed /// to Settings' "Clear All KV Caches" button. /// - /// Today only the in-process `MLXSwiftEngine` carries a prompt - /// cache; the SwiftLM / Python-MLX detection-only stubs don't, so - /// downcasting and no-op-on-mismatch is the right shape. When - /// another engine grows a cache this will move onto the - /// `InferenceEngine` protocol. + /// v0.4: iterates every resident engine in the pool, so caches + /// belonging to non-current models also get flushed. Non-MLX + /// engines (future/detection-only) are skipped via `as?`. public func clearPromptCache() async { - guard let engine = engine as? MLXSwiftEngine else { return } - await engine.clearPromptCache() + for id in await pool.residentModelIDs() { + if let mlx = await pool.engine(for: id) as? MLXSwiftEngine { + await mlx.clearPromptCache() + } + } } - /// Release the loaded model. + /// Release the currently-active model from the pool. + /// + /// v0.4 semantics: this unloads **only** `currentModel`; other + /// previously-loaded models stay resident in the pool. 
That's the + /// main user-visible v0.4 improvement — cold-switching back to a + /// recent model no longer re-reads weights from disk. public func unload() async { - guard let engine else { return } - try? await engine.unload() + guard let id = currentModelID else { return } + await pool.unload(id) + currentModelID = nil currentModel = nil status = .idle + engineVersion = "" await logs.log("Engine unloaded", level: .info, category: .engine) } /// Stream tokens for `request` against the active engine. Returns an - /// empty stream that finishes immediately if no engine is wired. + /// empty stream that finishes immediately if no model is loaded. public func generate(_ request: GenerateRequest) -> AsyncThrowingStream { - guard let engine else { - return AsyncThrowingStream { continuation in - continuation.finish(throwing: EngineError.modelNotLoaded) - } - } - let stream = engine.generate(request) - // Wrap to bump the token counter as chunks arrive. + let pool = self.pool + let currentID = self.currentModelID return AsyncThrowingStream { continuation in Task { @MainActor in + guard let id = currentID, + let engine = await pool.engine(for: id) else { + continuation.finish(throwing: EngineError.modelNotLoaded) + return + } self.status = .generating do { - for try await chunk in stream { + for try await chunk in engine.generate(request) { if let usage = chunk.usage { self.tokensGeneratedTotal += usage.completionTokens } @@ -171,4 +214,28 @@ public final class EngineCoordinator { } } } + + // MARK: - Pool pass-throughs (v0.4) + + /// Pin / unpin a resident model so the LRU sweeper won't evict it + /// when the pool's byte budget is exceeded. No-op for models that + /// aren't currently resident. + public func setPinned(_ modelID: String, _ pinned: Bool) async { + await pool.setPinned(modelID, pinned) + } + + /// IDs of every currently-resident model in the pool (sorted). 
+    /// Used by the Models tab to render the "loaded" indicator and
+    /// by Settings' pool-stats panel (Task 4).
+    public func residentModelIDs() async -> [String] {
+        await pool.residentModelIDs()
+    }
+
+    /// Update the pool's byte budget. Called by the Settings
+    /// "Max resident memory" stepper (Task 4). Shrinking the budget
+    /// below current usage doesn't synchronously evict — eviction
+    /// happens on the next `load(_:)`.
+    public func setPoolBudget(bytes: Int64) async {
+        await pool.setMaxBytes(bytes)
+    }
+}

From 22e8b6ad4a177b56cf69430d76b400f9fe4a0d7e Mon Sep 17 00:00:00 2001
From: Kefeng Zhou
Date: Sat, 18 Apr 2026 22:12:22 +0700
Subject: [PATCH 5/6] feat(pool): Settings max-resident slider + Models-tab pin toggle

---
 .../MacMLXCore/Managers/SettingsManager.swift | 18 ++++++++--
 .../Views/ModelLibrary/LocalModelRow.swift    | 13 +++++++
 .../Views/ModelLibrary/ModelLibraryView.swift |  4 +++
 .../ModelLibrary/ModelLibraryViewModel.swift  | 18 ++++++++++
 .../Views/Settings/ModelPoolSection.swift     | 35 +++++++++++++++++++
 .../macMLX/Views/Settings/SettingsView.swift  | 12 +++++++
 6 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 macMLX/macMLX/Views/Settings/ModelPoolSection.swift

diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
index d7e218a..287d47a 100644
--- a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
+++ b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
@@ -62,6 +62,13 @@
     /// enforcement lands in v0.4.0.1.
     public var kvCacheColdGB: Int
+
+    /// ModelPool byte budget, expressed in gigabytes (Apple's decimal
+    /// convention: 1 GB = 10^9 bytes). When resident models' summed
+    /// estimated footprint exceeds this, the pool LRU-evicts non-pinned
+    /// entries. Default is 50% of the machine's physical RAM, clamped
+    /// to a 4 GB floor for small-memory Macs.
+ public var maxResidentMemoryGB: Int + // MARK: Factory /// Sensible out-of-the-box defaults — used when no settings file exists. @@ -81,7 +88,8 @@ public struct Settings: Codable, Equatable, Sendable { logRetentionDays: 7, hfEndpoint: "https://huggingface.co", kvCacheHotMB: 512, - kvCacheColdGB: 20 + kvCacheColdGB: 20, + maxResidentMemoryGB: max(4, Int(MemoryProbe.totalMemoryGB()) / 2) ) // MARK: Init @@ -99,7 +107,8 @@ public struct Settings: Codable, Equatable, Sendable { logRetentionDays: Int, hfEndpoint: String = "https://huggingface.co", kvCacheHotMB: Int = 512, - kvCacheColdGB: Int = 20 + kvCacheColdGB: Int = 20, + maxResidentMemoryGB: Int = max(4, Int(MemoryProbe.totalMemoryGB()) / 2) ) { self.modelDirectory = modelDirectory self.preferredEngine = preferredEngine @@ -114,6 +123,7 @@ public struct Settings: Codable, Equatable, Sendable { self.hfEndpoint = hfEndpoint self.kvCacheHotMB = kvCacheHotMB self.kvCacheColdGB = kvCacheColdGB + self.maxResidentMemoryGB = maxResidentMemoryGB } // MARK: - Codable (backward-compat decode) @@ -135,6 +145,7 @@ public struct Settings: Codable, Equatable, Sendable { case hfEndpoint case kvCacheHotMB case kvCacheColdGB + case maxResidentMemoryGB } public init(from decoder: Decoder) throws { @@ -153,6 +164,9 @@ public struct Settings: Codable, Equatable, Sendable { ?? "https://huggingface.co" self.kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512 self.kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20 + self.maxResidentMemoryGB = + (try c.decodeIfPresent(Int.self, forKey: .maxResidentMemoryGB)) + ?? 
max(4, Int(MemoryProbe.totalMemoryGB()) / 2) } } diff --git a/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift b/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift index bf94d7a..3d74780 100644 --- a/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift +++ b/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift @@ -9,9 +9,11 @@ struct LocalModelRow: View { let model: LocalModel let isLoaded: Bool let isLoading: Bool + let isPinned: Bool let hasUpdateAvailable: Bool let onLoad: () -> Void let onUnload: () -> Void + let onTogglePin: () -> Void let onDelete: () -> Void var body: some View { @@ -75,6 +77,13 @@ struct LocalModelRow: View { .controlSize(.small) } + Button(action: onTogglePin) { + Image(systemName: isPinned ? "pin.fill" : "pin") + .foregroundStyle(isPinned ? .orange : .secondary) + } + .buttonStyle(.plain) + .help(isPinned ? "Pinned — won't auto-evict" : "Pin to keep resident") + Button(role: .destructive, action: onDelete) { Image(systemName: "trash") } @@ -102,18 +111,22 @@ struct LocalModelRow: View { model: model, isLoaded: true, isLoading: false, + isPinned: true, hasUpdateAvailable: false, onLoad: {}, onUnload: {}, + onTogglePin: {}, onDelete: {} ) LocalModelRow( model: model, isLoaded: false, isLoading: false, + isPinned: false, hasUpdateAvailable: true, onLoad: {}, onUnload: {}, + onTogglePin: {}, onDelete: {} ) } diff --git a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift index 0dd094a..b204090 100644 --- a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift +++ b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift @@ -160,6 +160,7 @@ private struct ModelLibraryContent: View { model: model, isLoaded: viewModel.loadedModelID == model.id, isLoading: viewModel.loadingModelID == model.id, + isPinned: viewModel.pinnedModelIDs.contains(model.id), hasUpdateAvailable: viewModel.modelsWithUpdate.contains(model.id), onLoad: { Task { await viewModel.loadModel(model) } @@ -167,6 
+168,9 @@ private struct ModelLibraryContent: View {
                     onUnload: {
                         Task { await viewModel.unloadModel() }
                     },
+                    onTogglePin: {
+                        Task { await viewModel.togglePin(model) }
+                    },
                     onDelete: {
                         viewModel.deleteModel(model)
                     }
diff --git a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
index 02082e1..288187d 100644
--- a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
+++ b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
@@ -48,6 +48,11 @@
     /// Model IDs for which an update is available on HF.
     var modelsWithUpdate: Set<String> = []
 
+    /// Model IDs the user has pinned — ModelPool won't LRU-evict these.
+    /// v0.4 MVP: in-memory only, reset on app launch. Disk persistence
+    /// is deferred per the plan's "Out of scope" section.
+    var pinnedModelIDs: Set<String> = []
+
     private var lastUpdateCheck: Date?
     private let updateCheckInterval: TimeInterval = 24 * 60 * 60 // 1 day
 
@@ -178,6 +183,19 @@
         await coordinator.unload()
     }
 
+    /// Flip pin state for one model: update the pool, then the
+    /// observable `pinnedModelIDs`. NOTE(review): `setPinned` no-ops for
+    /// non-resident models and `load` creates entries unpinned — confirm.
+ func togglePin(_ model: LocalModel) async { + let nowPinned = !pinnedModelIDs.contains(model.id) + await coordinator.setPinned(model.id, nowPinned) + if nowPinned { + pinnedModelIDs.insert(model.id) + } else { + pinnedModelIDs.remove(model.id) + } + } + func deleteModel(_ model: LocalModel) { do { try FileManager.default.removeItem(at: model.directory) diff --git a/macMLX/macMLX/Views/Settings/ModelPoolSection.swift b/macMLX/macMLX/Views/Settings/ModelPoolSection.swift new file mode 100644 index 0000000..2bcd940 --- /dev/null +++ b/macMLX/macMLX/Views/Settings/ModelPoolSection.swift @@ -0,0 +1,35 @@ +// ModelPoolSection.swift +// macMLX +// +// Settings section exposing the v0.4 ModelPool "max resident memory" +// budget. Changing the stepper value both persists to +// `Settings.maxResidentMemoryGB` and pushes the new byte budget into +// `EngineCoordinator.setPoolBudget(bytes:)` so the live pool picks it +// up without a restart. Pin/unpin controls live on the Models tab's +// `LocalModelRow` instead — this section is only about the byte cap. 
+ +import SwiftUI +import MacMLXCore + +struct ModelPoolSection: View { + @Binding var maxResidentGB: Int + + var body: some View { + Section("Model Pool") { + HStack { + Text("Max resident memory") + Spacer() + Stepper( + value: $maxResidentGB, + in: 2...256, + step: 1 + ) { + Text(String(maxResidentGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + .help("When multiple loaded models exceed this, the least-recently-used non-pinned one is unloaded.") + } + } +} diff --git a/macMLX/macMLX/Views/Settings/SettingsView.swift b/macMLX/macMLX/Views/Settings/SettingsView.swift index a8595de..52ad485 100644 --- a/macMLX/macMLX/Views/Settings/SettingsView.swift +++ b/macMLX/macMLX/Views/Settings/SettingsView.swift @@ -20,6 +20,7 @@ struct SettingsView: View { @State private var hfEndpoint: String = "https://huggingface.co" @State private var kvCacheHotMB: Int = 512 @State private var kvCacheColdGB: Int = 20 + @State private var maxResidentMemoryGB: Int = 8 var body: some View { Form { @@ -67,6 +68,16 @@ struct SettingsView: View { Task { await appState.updateSettings { $0.kvCacheColdGB = newValue } } } + ModelPoolSection(maxResidentGB: $maxResidentMemoryGB) + .onChange(of: maxResidentMemoryGB) { _, newValue in + Task { + await appState.updateSettings { $0.maxResidentMemoryGB = newValue } + await appState.coordinator.setPoolBudget( + bytes: Int64(newValue) * 1_000_000_000 + ) + } + } + downloadsSection rerunSetupSection @@ -171,6 +182,7 @@ struct SettingsView: View { hfEndpoint = s.hfEndpoint kvCacheHotMB = s.kvCacheHotMB kvCacheColdGB = s.kvCacheColdGB + maxResidentMemoryGB = s.maxResidentMemoryGB } private func showModelDirectoryPicker() { From eb4fa3cf19375fe237bddd3111cd6ff4aac8af09 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:12:58 +0700 Subject: [PATCH 6/6] docs: v0.4 ModelPool changelog entry --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 06d7b0a..3a65b33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,16 @@ Versioning follows [Semantic Versioning](https://semver.org/). - Debug-level Logs tab entries `Prompt cache HIT — restored N tokens` / `Prompt cache MISS — cold prefill of N tokens` under the `engine` category, so you can see cache effectiveness. +- **Multi-model pool** (v0.4.0 engine parity, part 2 of 3). Load + multiple models at once — previously the engine had to unload + the old model before loading a new one, which meant every API + cold-swap paid the full weight-read cost. Pool is bounded by a + user-configurable resident memory cap (Settings → Model Pool; + default 50% of total RAM). Least-recently-used non-pinned + models auto-evict when the cap is exceeded. Pin a model from + its row in the Models tab (pin icon) to keep it resident + regardless of LRU order. Pinned state is in-memory for this + release; persistence across restarts is a follow-up. ---