From dd12c2f7171d852eed58340a0b97abd42dd405c2 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:01:50 +0700 Subject: [PATCH 1/6] =?UTF-8?q?docs:=20v0.4=20ModelPool=20plan=20=E2=80=94?= =?UTF-8?q?=205=20tasks,=20MVP=20with=20LRU=20+=20pin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../plans/2026-04-18-v0.4-model-pool.md | 772 ++++++++++++++++++ 1 file changed, 772 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-18-v0.4-model-pool.md diff --git a/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md b/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md new file mode 100644 index 0000000..53058e7 --- /dev/null +++ b/docs/superpowers/plans/2026-04-18-v0.4-model-pool.md @@ -0,0 +1,772 @@ +# v0.4 ModelPool — Implementation Plan (MVP) + +> REQUIRED SUB-SKILL: superpowers:subagent-driven-development + +**Goal:** Multiple MLX models co-resident in memory, auto-evicted LRU when over budget, so chat / HTTP API / cold-swap can switch between pinned models without disk re-read. + +**Architecture:** `actor ModelPool` holds `[String: PooledEngine]`. Load is serialized under an internal inflight-task map. Budget is a user-settable GB cap (default = 50% of total RAM). LRU eviction on load when total estimated bytes exceed budget. Pinned models never evict. `EngineCoordinator` routes `load(_:)` through the pool and exposes `currentEngine` = most-recently-touched non-detection engine. + +**Tech Stack:** Swift 6 actor, `Memory.cacheLimit` / `MLX.GPU.clearCache()`, safetensors pre-scan for size estimate. + +**Branch:** `feat/v0.4-model-pool` (created). + +--- + +## Out of scope for this MVP + +- Live `DispatchSource` memory-pressure watcher — defer to v0.4.0.1. +- Pinned-set persistence to disk — defer to next PR. +- Per-model live RSS display — coarse "loaded models count" only. 
+ +--- + +## File Structure + +**Create:** +- `MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift` — the actor +- `MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift` — value type bundling engine + metadata +- `MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift` + +**Modify:** +- `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — add `maxResidentMemoryGB: Int` (default: `HardwareInfo.totalMemoryGB() / 2`) +- `macMLX/macMLX/App/EngineCoordinator.swift` — use ModelPool under the hood; track `currentModel` as most-recent-load +- `macMLX/macMLX/Views/Settings/` — add "Model Pool" section (budget slider + auto-evict toggle) +- `macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift` — pin/unpin toggle button + "loaded" indicator + +--- + +## Task 1: PooledEngineEntry value type + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift` + +- [ ] **Step 1: Implement** + +```swift +import Foundation + +/// Bookkeeping struct held by `ModelPool` per resident model. +/// The engine itself is not stored here (it's an actor in the pool's +/// dict); this is the value-type metadata. +public struct PooledEngineEntry: Sendable, Equatable { + /// Model identifier (matches `LocalModel.id`). + public let modelID: String + /// Estimated memory cost — sum of safetensors file sizes in + /// the model directory. Rough but stable for budget math; + /// actual MLX allocator usage can exceed this by 10–30%. + public let estimatedBytes: Int64 + /// Wall-clock time of last `engine(for:)` or `load(_:)` access. + public var lastAccess: Date + /// Pinned entries are never evicted by the LRU sweeper. 
+ public var isPinned: Bool + + public init( + modelID: String, + estimatedBytes: Int64, + lastAccess: Date = Date(), + isPinned: Bool = false + ) { + self.modelID = modelID + self.estimatedBytes = estimatedBytes + self.lastAccess = lastAccess + self.isPinned = isPinned + } +} + +/// Sum of `.safetensors` files under `directory`. Rough proxy for +/// how much memory the model needs when loaded. Returns 0 on any +/// filesystem error. +public func estimateModelSize(at directory: URL) -> Int64 { + guard let files = try? FileManager.default.contentsOfDirectory( + at: directory, + includingPropertiesForKeys: [.fileSizeKey] + ) else { + return 0 + } + return files + .filter { $0.pathExtension.lowercased() == "safetensors" } + .compactMap { url -> Int64? in + guard let values = try? url.resourceValues(forKeys: [.fileSizeKey]), + let size = values.fileSize else { return nil } + return Int64(size) + } + .reduce(0, +) +} +``` + +- [ ] **Step 2: No dedicated test — pure value type, exercised via ModelPoolTests in Task 2.** + +- [ ] **Step 3: Commit** + +```bash +git add MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift +git commit -m "feat(pool): PooledEngineEntry + estimateModelSize helper" +``` + +--- + +## Task 2: ModelPool actor with LRU + budget + +**Files:** +- Create: `MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift` +- Create: `MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift` + +- [ ] **Step 1: Write the failing test** + +`MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift`: + +```swift +import XCTest +@testable import MacMLXCore + +/// Stub engine for pool tests — no Metal/MLX required. Implements +/// the minimum InferenceEngine surface the pool touches: load, +/// unload, engineID. Generate throws since it shouldn't be called. +private actor StubEngine: InferenceEngine { + let engineID: EngineID = .mlxSwift + public var version: String = "stub" + public var loadedModel: LocalModel? 
+ nonisolated public var activeEngineID: EngineID { .mlxSwift } + + func load(_ model: LocalModel) async throws { + loadedModel = model + } + func unload() async throws { + loadedModel = nil + } + func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { cont in + cont.finish(throwing: EngineError.notLoaded) + } + } +} + +final class ModelPoolTests: XCTestCase { + + private func mkModel(_ id: String, size: Int64 = 1_000_000_000) -> LocalModel { + LocalModel( + id: id, + displayName: id, + directory: FileManager.default.temporaryDirectory, + sizeBytes: size, + format: .mlx, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + } + + func testLoadAddsToPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + _ = try await pool.load(m) + let residents = await pool.residentModelIDs() + XCTAssertEqual(residents, ["A"]) + } + + func testLoadReuseExistingInstance() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + let e1 = try await pool.load(m) as AnyObject + let e2 = try await pool.load(m) as AnyObject + XCTAssertTrue(e1 === e2) + } + + func testOverBudgetEvictsLRU() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + // Budget has 2.5 GB, A+B = 2 GB fits. + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A+B+C = 3 GB — over. Oldest (A) evicted. 
+ let residents = await pool.residentModelIDs() + XCTAssertFalse(residents.contains("A")) + XCTAssertTrue(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testPinnedNotEvicted() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + await pool.setPinned("A", true) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A is pinned → B (next-oldest) evicted instead. + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.contains("A")) + XCTAssertFalse(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testUnloadRemovesFromPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A")) + await pool.unload("A") + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.isEmpty) + } + + func testEngineForReturnsNilWhenNotLoaded() async { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let e = await pool.engine(for: "A") + XCTAssertNil(e) + } +} +``` + +- [ ] **Step 2: Run — verify fail** + +``` +cd MacMLXCore && swift test --filter ModelPoolTests 2>&1 | tail -5 +``` + +Expected: `no such type ModelPool`. + +- [ ] **Step 3: Implement** + +`MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift`: + +```swift +import Foundation + +/// Actor managing multiple resident `InferenceEngine` instances with +/// LRU + explicit pinning + byte-budget auto-evict. 
Use cases:
+///
+/// - Swap between chat models without re-reading weights from disk
+/// - External API cold-swap without unloading the GUI's current model
+/// - Keep a small always-ready model pinned alongside a big one that
+///   auto-evicts on memory pressure
+///
+/// Load path is serialised under `loadTasks` to avoid two concurrent
+/// requests double-loading the same weights — the second caller awaits
+/// the first's completion.
+public actor ModelPool {
+
+    public typealias EngineFactory = @Sendable (LocalModel) -> any InferenceEngine
+
+    // MARK: - State
+
+    /// Currently resident engines, keyed by model ID.
+    private var engines: [String: any InferenceEngine] = [:]
+    /// Bookkeeping keyed by model ID.
+    private var entries: [String: PooledEngineEntry] = [:]
+    /// In-flight loads so concurrent callers deduplicate.
+    private var loadTasks: [String: Task<any InferenceEngine, Error>] = [:]
+
+    private let engineFactory: EngineFactory
+
+    // MARK: - Budget
+
+    /// Maximum total estimated bytes that may be resident. Exceeding
+    /// this triggers LRU eviction (pinned entries are spared).
+    public var maxBytes: Int64
+
+    public init(
+        maxBytes: Int64,
+        engineFactory: @escaping EngineFactory
+    ) {
+        self.maxBytes = maxBytes
+        self.engineFactory = engineFactory
+    }
+
+    public func setMaxBytes(_ bytes: Int64) {
+        self.maxBytes = bytes
+    }
+
+    // MARK: - Public
+
+    public func residentModelIDs() -> [String] {
+        Array(engines.keys).sorted()
+    }
+
+    public func engine(for modelID: String) -> (any InferenceEngine)? {
+        guard let e = engines[modelID] else { return nil }
+        // Touch LRU timestamp.
+        if var entry = entries[modelID] {
+            entry.lastAccess = Date()
+            entries[modelID] = entry
+        }
+        return e
+    }
+
+    public func setPinned(_ modelID: String, _ pinned: Bool) {
+        guard var entry = entries[modelID] else { return }
+        entry.isPinned = pinned
+        entries[modelID] = entry
+    }
+
+    public func isPinned(_ modelID: String) -> Bool {
+        entries[modelID]?.isPinned ?? 
false + } + + public func unload(_ modelID: String) async { + if let e = engines.removeValue(forKey: modelID) { + try? await e.unload() + } + entries.removeValue(forKey: modelID) + } + + /// Return an engine with `model.id` loaded. Reuses an existing + /// entry when possible. Evicts LRU entries as needed to stay + /// within `maxBytes`. Concurrent loads of the same ID share. + @discardableResult + public func load(_ model: LocalModel) async throws -> any InferenceEngine { + // Already loaded? Touch and return. + if let e = engines[model.id] { + if var entry = entries[model.id] { + entry.lastAccess = Date() + entries[model.id] = entry + } + return e + } + // In-flight load by another caller? Join it. + if let pending = loadTasks[model.id] { + return try await pending.value + } + + // Evict to fit before starting the load, using the model's + // sizeBytes (or our estimate) as the cost. + let cost = model.sizeBytes > 0 ? model.sizeBytes : estimateModelSize(at: model.directory) + evict(toFit: cost) + + let factory = engineFactory + let task = Task { () throws -> any InferenceEngine in + let engine = factory(model) + try await engine.load(model) + return engine + } + loadTasks[model.id] = task + do { + let engine = try await task.value + loadTasks.removeValue(forKey: model.id) + engines[model.id] = engine + entries[model.id] = PooledEngineEntry( + modelID: model.id, + estimatedBytes: cost + ) + return engine + } catch { + loadTasks.removeValue(forKey: model.id) + throw error + } + } + + // MARK: - Eviction + + private func currentResidentBytes() -> Int64 { + entries.values.map(\.estimatedBytes).reduce(0, +) + } + + /// Evict LRU non-pinned entries until (currentBytes + incoming) fits. + private func evict(toFit incoming: Int64) { + var target = maxBytes - incoming + if target < 0 { target = 0 } + + // Candidates: non-pinned, oldest first. 
+ let candidates = entries.values + .filter { !$0.isPinned } + .sorted { $0.lastAccess < $1.lastAccess } + + var current = currentResidentBytes() + var iterator = candidates.makeIterator() + while current > target, let victim = iterator.next() { + if let e = engines.removeValue(forKey: victim.modelID) { + Task { try? await e.unload() } + } + entries.removeValue(forKey: victim.modelID) + current -= victim.estimatedBytes + } + } +} +``` + +- [ ] **Step 4: Run tests — verify pass** + +``` +cd MacMLXCore && swift test --filter ModelPoolTests 2>&1 | tail -10 +``` + +Expected: 6/6 PASS. + +- [ ] **Step 5: Commit** + +```bash +git add MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift \ + MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift +git commit -m "feat(pool): ModelPool actor — LRU + pinning + byte-budget eviction" +``` + +--- + +## Task 3: Route EngineCoordinator through ModelPool + +**Files:** +- Modify: `macMLX/macMLX/App/EngineCoordinator.swift` + +**Strategy:** keep `EngineCoordinator.load(_:)` / `unload()` / `generate(_:)` public API intact so no callers break. Internally, replace the single `engine` property with a `pool: ModelPool` + `currentModelID: String?`. `load(model)` delegates to `pool.load(model)`. `unload()` unloads currently-active from pool (leaves others resident). `generate()` fetches via `pool.engine(for: currentModelID)`. + +- [ ] **Step 1: Read current EngineCoordinator** + +``` +cd /Users/kevin/Projects/macmlx +grep -n "class EngineCoordinator\|public private(set) var\|private var engine\|func load\|func unload\|func generate\|switchTo" macMLX/macMLX/App/EngineCoordinator.swift | head -20 +``` + +Understand existing members: `engineID`, `engineVersion`, `currentModel`, `loadedModel`, `status`, `tokensGeneratedTotal`, `onModelLoaded`, plus methods `switchTo(_:)`, `load(_:)`, `unload()`, `generate(_:)`, `clearPromptCache()`. 
+ +- [ ] **Step 2: Refactor — minimal surgical change** + +Replace `private var engine: (any InferenceEngine)?` with: + +```swift +/// Pool of resident engines. Single-model-at-a-time behaviour (today) +/// is preserved by only calling `pool.load` when `currentModel` needs +/// to change; ModelPool lets us lazily keep previously-loaded ones +/// resident without explicit unload, for v0.4.0+ multi-model workflows. +private let pool: ModelPool + +/// Kept as an optional — nil when no model is active. +/// `activeEngine` (nonisolated accessor used by HummingbirdServer) +/// reads this and does a synchronous dict lookup against the pool. +public private(set) var currentModelID: String? +``` + +Update `activeEngine` computed: + +```swift +public var activeEngine: (any InferenceEngine)? { + get async { + guard let id = currentModelID else { return nil } + return await pool.engine(for: id) + } +} +``` + +Note: changing `activeEngine` from sync to async breaks HummingbirdServer's call. Inspect `startServer` in AppState.swift — it does `coordinator.activeEngine` synchronously. Change that site to `await coordinator.activeEngine` and propagate. 
+ +In `init`, construct the pool: + +```swift +let totalGB = HardwareInfo.totalMemoryGB() +let budgetBytes = Int64(Double(totalGB) * 0.5 * 1_073_741_824) +self.pool = ModelPool(maxBytes: budgetBytes) { _ in + MLXSwiftEngine() +} +``` + +Rewrite `load(_:)`: + +```swift +public func load(_ model: LocalModel) async -> Result { + status = .loading(model: model.id) + await logs.log("Loading model: \(model.id)", level: .info, category: .engine) + do { + _ = try await pool.load(model) + currentModel = model + loadedModel = model + currentModelID = model.id + status = .ready(model: model.id) + await refreshEngineVersion() + await onModelLoaded?(model) + await logs.log("Model loaded: \(model.id)", level: .info, category: .engine) + return .success(()) + } catch { + status = .error(error.localizedDescription) + return .failure(error) + } +} +``` + +Rewrite `unload()`: + +```swift +public func unload() async { + guard let id = currentModelID else { return } + await pool.unload(id) + currentModelID = nil + currentModel = nil + loadedModel = nil + status = .idle + await logs.log("Engine unloaded", level: .info, category: .engine) +} +``` + +Rewrite `generate(_:)`: + +```swift +public func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { continuation in + Task { + guard let id = currentModelID, + let engine = await pool.engine(for: id) else { + continuation.finish(throwing: EngineError.notLoaded) + return + } + do { + for try await chunk in engine.generate(request) { + continuation.yield(chunk) + tokensGeneratedTotal += 1 + } + continuation.finish() + } catch { + continuation.finish(throwing: error) + } + } + } +} +``` + +Add pin pass-throughs: + +```swift +public func setPinned(_ modelID: String, _ pinned: Bool) async { + await pool.setPinned(modelID, pinned) +} +public func residentModelIDs() async -> [String] { + await pool.residentModelIDs() +} +``` + +Update `clearPromptCache()`: the old implementation downcast `engine` to 
`MLXSwiftEngine`. Now iterate pool: + +```swift +public func clearPromptCache() async { + for id in await pool.residentModelIDs() { + if let mlx = await pool.engine(for: id) as? MLXSwiftEngine { + await mlx.clearPromptCache() + } + } +} +``` + +- [ ] **Step 3: Fix call sites that relied on sync `activeEngine`** + +``` +grep -rn "coordinator.activeEngine\|coordinator.engine" macMLX/macMLX/ MacMLXCore/Sources/MacMLXCore/ | grep -v ".build" +``` + +Update each `coordinator.activeEngine` to `await coordinator.activeEngine`. In SwiftUI views that aren't async contexts, hop into a `Task`. + +- [ ] **Step 4: Build + test** + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +All three must succeed. If any call site refuses to go async (e.g. a `var` binding in a non-async computed property), wrap with `Task { ... }` or rearchitect that specific call — do not make a sync bridge that blocks the main thread. + +- [ ] **Step 5: Commit** + +```bash +git add macMLX/macMLX/App/EngineCoordinator.swift macMLX/macMLX/App/AppState.swift +# plus any call-site fixups +git commit -m "feat(pool): route EngineCoordinator load/unload/generate through ModelPool + +Single active model is still the UX default — currentModelID is the +one shown in the toolbar + menu bar. But previously-loaded models +now stay resident in the pool until LRU evicts, so the next +switch-back skips disk read. Engines spawn via an injected factory +so tests can stub." 
+``` + +--- + +## Task 4: Settings UI (budget slider) + Models-tab pin toggle + +**Files:** +- Modify: `MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift` — `maxResidentMemoryGB: Int` +- Modify: `macMLX/macMLX/Views/Settings/` — new `ModelPoolSection.swift` + wire into `SettingsView` +- Modify: `macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift` — pin toggle +- Modify: `macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift` — pass pin state + callback + +### Step 1: Extend Settings + +`SettingsManager.swift`: + +```swift +public var maxResidentMemoryGB: Int +``` + +In `Settings.default`: + +```swift +maxResidentMemoryGB: max(4, Int(HardwareInfo.totalMemoryGB()) / 2), +``` + +Add `decodeIfPresent` default to the custom `init(from:)` so pre-v0.4 settings JSONs decode with a sensible fallback: + +```swift +self.maxResidentMemoryGB = (try c.decodeIfPresent(Int.self, forKey: .maxResidentMemoryGB)) + ?? max(4, Int(HardwareInfo.totalMemoryGB()) / 2) +``` + +### Step 2: ModelPoolSection.swift + +Create `macMLX/macMLX/Views/Settings/ModelPoolSection.swift`: + +```swift +import SwiftUI +import MacMLXCore + +struct ModelPoolSection: View { + @Binding var maxResidentGB: Int + + var body: some View { + Section("Model Pool") { + HStack { + Text("Max resident memory") + Spacer() + Stepper( + value: $maxResidentGB, + in: 2...256, + step: 1 + ) { + Text(String(maxResidentGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + .help("When multiple loaded models exceed this, the least-recently-used non-pinned one is unloaded.") + } + } +} +``` + +### Step 3: Wire into SettingsView + +Add `@State private var maxResidentMemoryGB: Int = 8` + load from settings. Render the section near the KV Cache section. 
`.onChange(of: maxResidentMemoryGB)` → `appState.updateSettings { $0.maxResidentMemoryGB = newValue }` + propagate to coordinator: + +```swift +.onChange(of: maxResidentMemoryGB) { _, newValue in + Task { + await appState.updateSettings { $0.maxResidentMemoryGB = newValue } + await appState.coordinator.setPoolBudget( + bytes: Int64(newValue) * 1_073_741_824 + ) + } +} +``` + +Add `setPoolBudget(bytes:)` on EngineCoordinator: + +```swift +public func setPoolBudget(bytes: Int64) async { + await pool.setMaxBytes(bytes) +} +``` + +### Step 4: LocalModelRow pin toggle + +Add a new param: + +```swift +let isPinned: Bool +let onTogglePin: () -> Void +``` + +Render a pin icon button in the row's trailing metadata area: + +```swift +Button { + onTogglePin() +} label: { + Image(systemName: isPinned ? "pin.fill" : "pin") + .foregroundStyle(isPinned ? .orange : .secondary) +} +.buttonStyle(.plain) +.help(isPinned ? "Pinned — won't auto-evict" : "Pin to keep resident") +``` + +### Step 5: Wire pin callback from ModelLibraryView + +In `ModelLibraryView.localTab`, the `LocalModelRow(...)` call gains: + +```swift +isPinned: viewModel.pinnedModelIDs.contains(model.id), +onTogglePin: { Task { await viewModel.togglePin(model) } } +``` + +Add to `ModelLibraryViewModel`: + +```swift +var pinnedModelIDs: Set = [] + +func togglePin(_ model: LocalModel) async { + let nowPinned = !pinnedModelIDs.contains(model.id) + await appState.coordinator.setPinned(model.id, nowPinned) + if nowPinned { + pinnedModelIDs.insert(model.id) + } else { + pinnedModelIDs.remove(model.id) + } +} +``` + +### Step 6: Build + test + commit + +``` +cd MacMLXCore && swift build 2>&1 | tail -5 +cd MacMLXCore && swift test 2>&1 | tail -5 +cd macMLX && xcodebuild -scheme macMLX -destination 'platform=macOS' -configuration Debug build 2>&1 | tail -5 +``` + +```bash +git add MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift \ + macMLX/macMLX/App/EngineCoordinator.swift \ + 
macMLX/macMLX/Views/Settings/ModelPoolSection.swift \ + macMLX/macMLX/Views/Settings/SettingsView.swift \ + macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift \ + macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift \ + macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift +git commit -m "feat(pool): Settings max-resident slider + Models-tab pin toggle" +``` + +--- + +## Task 5: CHANGELOG + PR + +- [ ] Prepend under `[Unreleased]` in CHANGELOG.md: + +```markdown +### Added +- **Multi-model pool** (v0.4.0 engine parity, part 2 of 3). Load + multiple models at once — previously the engine unloaded one to + load another. Pool is bounded by a user-configurable resident + memory cap (Settings → Model Pool; default 50% of total RAM). + Least-recently-used non-pinned models auto-evict when the cap is + exceeded. Pin a model from its row in the Models tab to keep it + resident regardless of LRU order. +``` + +- [ ] **Push + open PR** + +```bash +git add CHANGELOG.md +git commit -m "docs: v0.4 ModelPool changelog entry" +git push -u origin feat/v0.4-model-pool 2>&1 | tail -3 +gh pr create --base main --head feat/v0.4-model-pool \ + --title "v0.4 — ModelPool (multi-model + LRU + pin)" \ + --body "Second of three v0.4.0 engine-parity sub-features. ModelPool actor holds multiple InferenceEngine instances, LRU-evicts under a byte budget, respects pinned entries. EngineCoordinator now delegates load/unload/generate to the pool. Settings slider + Models-tab pin toggle expose the user-visible controls. 6 new ModelPoolTests passing." +``` + +--- + +## Self-Review + +- ✅ Spec coverage: budget, LRU, pin, UI both-sides, test. +- ✅ No placeholders — every step has concrete code or commands. +- ⚠️ Task 3 Step 3 blast radius — switching `activeEngine` from sync + to async computed touches HummingbirdServer wiring in AppState. + Budget extra time for this. 
+- ⚠️ The engine factory `{ _ in MLXSwiftEngine() }` construction + ignores the `LocalModel` — current `MLXSwiftEngine()` takes no + args and `load(model)` is separate. Verify the current MLXSwiftEngine + init signature before committing Task 3. + +--- + +## Execution Handoff + +Subagent-driven execution. Same pattern as the KV cache PR. From 71e98d8e8d31d60f63ab11992a3653dd58b9c6c4 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:02:58 +0700 Subject: [PATCH 2/6] feat(pool): PooledEngineEntry + estimateModelSize helper Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ModelPool/PooledEngineEntry.swift | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift diff --git a/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift b/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift new file mode 100644 index 0000000..aa77465 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/ModelPool/PooledEngineEntry.swift @@ -0,0 +1,49 @@ +import Foundation + +/// Bookkeeping struct held by `ModelPool` per resident model. +/// The engine itself is not stored here (it's an actor in the pool's +/// dict); this is the value-type metadata. +public struct PooledEngineEntry: Sendable, Equatable { + /// Model identifier (matches `LocalModel.id`). + public let modelID: String + /// Estimated memory cost — sum of safetensors file sizes in + /// the model directory. Rough but stable for budget math; + /// actual MLX allocator usage can exceed this by 10–30%. + public let estimatedBytes: Int64 + /// Wall-clock time of last `engine(for:)` or `load(_:)` access. + public var lastAccess: Date + /// Pinned entries are never evicted by the LRU sweeper. 
+ public var isPinned: Bool + + public init( + modelID: String, + estimatedBytes: Int64, + lastAccess: Date = Date(), + isPinned: Bool = false + ) { + self.modelID = modelID + self.estimatedBytes = estimatedBytes + self.lastAccess = lastAccess + self.isPinned = isPinned + } +} + +/// Sum of `.safetensors` files under `directory`. Rough proxy for +/// how much memory the model needs when loaded. Returns 0 on any +/// filesystem error. +public func estimateModelSize(at directory: URL) -> Int64 { + guard let files = try? FileManager.default.contentsOfDirectory( + at: directory, + includingPropertiesForKeys: [.fileSizeKey] + ) else { + return 0 + } + return files + .filter { $0.pathExtension.lowercased() == "safetensors" } + .compactMap { url -> Int64? in + guard let values = try? url.resourceValues(forKeys: [.fileSizeKey]), + let size = values.fileSize else { return nil } + return Int64(size) + } + .reduce(0, +) +} From 6e47fe4d60bb640cc7b9d9768c88c1f8eba6bfea Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:04:14 +0700 Subject: [PATCH 3/6] =?UTF-8?q?feat(pool):=20ModelPool=20actor=20=E2=80=94?= =?UTF-8?q?=20LRU=20+=20pinning=20+=20byte-budget=20eviction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../MacMLXCore/ModelPool/ModelPool.swift | 151 ++++++++++++++++++ .../ModelPool/ModelPoolTests.swift | 102 ++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift create mode 100644 MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift diff --git a/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift b/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift new file mode 100644 index 0000000..6c0c726 --- /dev/null +++ b/MacMLXCore/Sources/MacMLXCore/ModelPool/ModelPool.swift @@ -0,0 +1,151 @@ +import Foundation + +/// Actor managing multiple resident `InferenceEngine` instances 
with
+/// LRU + explicit pinning + byte-budget auto-evict. Use cases:
+///
+/// - Swap between chat models without re-reading weights from disk
+/// - External API cold-swap without unloading the GUI's current model
+/// - Keep a small always-ready model pinned alongside a big one that
+///   auto-evicts on memory pressure
+///
+/// Load path is serialised under `loadTasks` to avoid two concurrent
+/// requests double-loading the same weights — the second caller awaits
+/// the first's completion.
+public actor ModelPool {
+
+    public typealias EngineFactory = @Sendable (LocalModel) -> any InferenceEngine
+
+    // MARK: - State
+
+    /// Currently resident engines, keyed by model ID.
+    private var engines: [String: any InferenceEngine] = [:]
+    /// Bookkeeping keyed by model ID.
+    private var entries: [String: PooledEngineEntry] = [:]
+    /// In-flight loads so concurrent callers deduplicate.
+    private var loadTasks: [String: Task<any InferenceEngine, Error>] = [:]
+
+    private let engineFactory: EngineFactory
+
+    // MARK: - Budget
+
+    /// Maximum total estimated bytes that may be resident. Exceeding
+    /// this triggers LRU eviction (pinned entries are spared).
+    public var maxBytes: Int64
+
+    public init(
+        maxBytes: Int64,
+        engineFactory: @escaping EngineFactory
+    ) {
+        self.maxBytes = maxBytes
+        self.engineFactory = engineFactory
+    }
+
+    public func setMaxBytes(_ bytes: Int64) {
+        self.maxBytes = bytes
+    }
+
+    // MARK: - Public
+
+    public func residentModelIDs() -> [String] {
+        Array(engines.keys).sorted()
+    }
+
+    public func engine(for modelID: String) -> (any InferenceEngine)? {
+        guard let e = engines[modelID] else { return nil }
+        // Touch LRU timestamp. 
+ if var entry = entries[modelID] { + entry.lastAccess = Date() + entries[modelID] = entry + } + return e + } + + public func setPinned(_ modelID: String, _ pinned: Bool) { + guard var entry = entries[modelID] else { return } + entry.isPinned = pinned + entries[modelID] = entry + } + + public func isPinned(_ modelID: String) -> Bool { + entries[modelID]?.isPinned ?? false + } + + public func unload(_ modelID: String) async { + if let e = engines.removeValue(forKey: modelID) { + try? await e.unload() + } + entries.removeValue(forKey: modelID) + } + + /// Return an engine with `model.id` loaded. Reuses an existing + /// entry when possible. Evicts LRU entries as needed to stay + /// within `maxBytes`. Concurrent loads of the same ID share. + @discardableResult + public func load(_ model: LocalModel) async throws -> any InferenceEngine { + // Already loaded? Touch and return. + if let e = engines[model.id] { + if var entry = entries[model.id] { + entry.lastAccess = Date() + entries[model.id] = entry + } + return e + } + // In-flight load by another caller? Join it. + if let pending = loadTasks[model.id] { + return try await pending.value + } + + // Evict to fit before starting the load, using the model's + // sizeBytes (or our estimate) as the cost. + let cost = model.sizeBytes > 0 ? 
model.sizeBytes : estimateModelSize(at: model.directory) + evict(toFit: cost) + + let factory = engineFactory + let task = Task { () throws -> any InferenceEngine in + let engine = factory(model) + try await engine.load(model) + return engine + } + loadTasks[model.id] = task + do { + let engine = try await task.value + loadTasks.removeValue(forKey: model.id) + engines[model.id] = engine + entries[model.id] = PooledEngineEntry( + modelID: model.id, + estimatedBytes: cost + ) + return engine + } catch { + loadTasks.removeValue(forKey: model.id) + throw error + } + } + + // MARK: - Eviction + + private func currentResidentBytes() -> Int64 { + entries.values.map(\.estimatedBytes).reduce(0, +) + } + + /// Evict LRU non-pinned entries until (currentBytes + incoming) fits. + private func evict(toFit incoming: Int64) { + var target = maxBytes - incoming + if target < 0 { target = 0 } + + // Candidates: non-pinned, oldest first. + let candidates = entries.values + .filter { !$0.isPinned } + .sorted { $0.lastAccess < $1.lastAccess } + + var current = currentResidentBytes() + var iterator = candidates.makeIterator() + while current > target, let victim = iterator.next() { + if let e = engines.removeValue(forKey: victim.modelID) { + Task { try? await e.unload() } + } + entries.removeValue(forKey: victim.modelID) + current -= victim.estimatedBytes + } + } +} diff --git a/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift new file mode 100644 index 0000000..364ee0f --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/ModelPool/ModelPoolTests.swift @@ -0,0 +1,102 @@ +import XCTest +@testable import MacMLXCore + +/// Stub engine for pool tests — no Metal/MLX required. Implements +/// the minimum InferenceEngine surface the pool touches: load, +/// unload, engineID. Generate throws since it shouldn't be called. 
+private actor StubEngine: InferenceEngine { + nonisolated let engineID: EngineID = .mlxSwift + var status: EngineStatus = .idle + var loadedModel: LocalModel? + var version: String = "stub" + + func load(_ model: LocalModel) async throws { + loadedModel = model + status = .ready(model: model.id) + } + + func unload() async throws { + loadedModel = nil + status = .idle + } + + nonisolated func generate(_ request: GenerateRequest) -> AsyncThrowingStream { + AsyncThrowingStream { cont in + cont.finish(throwing: EngineError.modelNotLoaded) + } + } + + func healthCheck() async -> Bool { true } +} + +final class ModelPoolTests: XCTestCase { + + private func mkModel(_ id: String, size: Int64 = 1_000_000_000) -> LocalModel { + LocalModel( + id: id, + displayName: id, + directory: FileManager.default.temporaryDirectory, + sizeBytes: size, + format: .mlx, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + } + + func testLoadAddsToPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + _ = try await pool.load(m) + let residents = await pool.residentModelIDs() + XCTAssertEqual(residents, ["A"]) + } + + func testLoadReuseExistingInstance() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let m = mkModel("A", size: 1_000_000_000) + let e1 = try await pool.load(m) as AnyObject + let e2 = try await pool.load(m) as AnyObject + XCTAssertTrue(e1 === e2) + } + + func testOverBudgetEvictsLRU() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + // Budget has 2.5 GB, A+B = 2 GB fits. + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A+B+C = 3 GB — over. Oldest (A) evicted. 
+ let residents = await pool.residentModelIDs() + XCTAssertFalse(residents.contains("A")) + XCTAssertTrue(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testPinnedNotEvicted() async throws { + let pool = ModelPool(maxBytes: 2_500_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A", size: 1_000_000_000)) + await pool.setPinned("A", true) + _ = try await pool.load(mkModel("B", size: 1_000_000_000)) + _ = try await pool.load(mkModel("C", size: 1_000_000_000)) + // A is pinned → B (next-oldest) evicted instead. + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.contains("A")) + XCTAssertFalse(residents.contains("B")) + XCTAssertTrue(residents.contains("C")) + } + + func testUnloadRemovesFromPool() async throws { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + _ = try await pool.load(mkModel("A")) + await pool.unload("A") + let residents = await pool.residentModelIDs() + XCTAssertTrue(residents.isEmpty) + } + + func testEngineForReturnsNilWhenNotLoaded() async { + let pool = ModelPool(maxBytes: 4_000_000_000, engineFactory: { _ in StubEngine() }) + let e = await pool.engine(for: "A") + XCTAssertNil(e) + } +} From 978bbf58ab0aaf7cc961ae73faa9665d66ee584d Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:07:42 +0700 Subject: [PATCH 4/6] feat(pool): route EngineCoordinator load/unload/generate through ModelPool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single active model is still the UX default — currentModelID is the one shown in the toolbar + menu bar. But previously-loaded models now stay resident in the pool until LRU evicts, so the next switch-back skips disk read. Engines spawn via an injected factory so tests can stub. 
--- macMLX/macMLX/App/AppState.swift | 2 +- macMLX/macMLX/App/EngineCoordinator.swift | 143 ++++++++++++++++------ 2 files changed, 106 insertions(+), 39 deletions(-) diff --git a/macMLX/macMLX/App/AppState.swift b/macMLX/macMLX/App/AppState.swift index 90b59bf..e67a6c2 100644 --- a/macMLX/macMLX/App/AppState.swift +++ b/macMLX/macMLX/App/AppState.swift @@ -175,7 +175,7 @@ public final class AppState { /// to `LogManager`. public func startServer() async { guard server == nil, !isServerToggling else { return } - guard let engine = coordinator.activeEngine else { + guard let engine = await coordinator.activeEngine else { await logs.log( "Cannot start server: no engine loaded", level: .warning, diff --git a/macMLX/macMLX/App/EngineCoordinator.swift b/macMLX/macMLX/App/EngineCoordinator.swift index 35bbfef..671d4c9 100644 --- a/macMLX/macMLX/App/EngineCoordinator.swift +++ b/macMLX/macMLX/App/EngineCoordinator.swift @@ -5,6 +5,12 @@ // `status` and `currentModel` for live updates; load/unload/generate go // through the coordinator so the rest of the app never holds a direct // reference to a specific engine implementation. +// +// v0.4+: the coordinator delegates model lifecycle to a `ModelPool` actor +// so previously-loaded (non-pinned) models stay resident until the pool's +// byte budget forces LRU eviction. Single-active-model UX is preserved — +// `currentModel` / `activeEngine` point at the most-recently-loaded +// entry and drive the toolbar + menu bar. import Foundation import MacMLXCore @@ -22,8 +28,8 @@ public final class EngineCoordinator { /// Active engine's `version` string (e.g. `"mlx-swift-lm 3.31.3"`), /// cached synchronously on the @MainActor so SwiftUI views and the /// benchmark feature can read it without awaiting the engine actor. - /// Refreshed on every `switchTo(_:)`; empty string if no engine is - /// wired (detection-only engines). 
+    /// Refreshed on every `switchTo(_:)` / `load(_:)`; empty string if
+    /// no engine is wired (detection-only engines).
     public private(set) var engineVersion: String = ""
 
     /// Tokens generated by the current process — used for the menu bar
@@ -40,12 +46,26 @@ public final class EngineCoordinator {
 
     // MARK: - Private state
 
-    private var engine: (any InferenceEngine)?
+    /// Pool of resident engines. Single-model-at-a-time UX is preserved
+    /// because only the most recent `load(_:)` target is `currentModel`;
+    /// ModelPool keeps previously-loaded engines resident (subject
+    /// to its byte budget) so the next switch-back skips disk read.
+    private let pool: ModelPool
+
+    /// ID of the model the GUI currently treats as active. `nil` when
+    /// no model has been loaded (or when the last load was for a
+    /// detection-only engine).
+    private var currentModelID: String?
 
     /// Current active engine, exposed to lifecycle consumers like the
     /// HummingbirdServer that need an `any InferenceEngine` reference.
-    /// Nonisolated-read-only snapshot — callers should not mutate.
-    public var activeEngine: (any InferenceEngine)? { engine }
+    /// Async because the pool is an actor — callers must `await`.
+    public var activeEngine: (any InferenceEngine)? {
+        get async {
+            guard let id = currentModelID else { return nil }
+            return await pool.engine(for: id)
+        }
+    }
 
     private let logs: LogManager
 
@@ -53,40 +73,53 @@
     public init(logs: LogManager) {
         self.logs = logs
-        self.engine = MLXSwiftEngine()
         self.engineID = .mlxSwift
-        Task { [weak self] in await self?.refreshEngineVersion() }
+        // Default budget: 50% of total RAM (GB = 10^9 bytes — Apple's
+        // decimal convention). Task 4 will persist this in
+        // SettingsManager and push updates through `setPoolBudget(bytes:)`.
+ let totalGB = MemoryProbe.totalMemoryGB() + let budgetGB = max(4.0, totalGB * 0.5) + let budgetBytes = Int64(budgetGB * 1_000_000_000) + self.pool = ModelPool(maxBytes: budgetBytes) { _ in + MLXSwiftEngine() + } } // MARK: - Engine selection /// Swap to a different engine implementation. v0.1 only supports /// `.mlxSwift` (in-process); the other two are detection-only stubs. + /// Changing engineID alone doesn't affect the pool — the pool's + /// factory still produces `MLXSwiftEngine` instances because that's + /// the only real engine in v0.4. Detection-only engines cause + /// `load(_:)` to fail fast at status check time. public func switchTo(_ id: EngineID) { engineID = id - switch id { - case .mlxSwift: - engine = MLXSwiftEngine() - case .swiftLM, .pythonMLX: - // v0.1 detection only — leave engine nil so load() fails fast. - engine = nil - } Task { [weak self] in await self?.refreshEngineVersion() } } /// Pull the active engine's `version` string onto the coordinator's - /// synchronous state. Called on init + after every `switchTo(_:)`. + /// synchronous state. Called on init + after every `switchTo(_:)` + /// / successful `load(_:)`. private func refreshEngineVersion() async { - guard let engine else { engineVersion = ""; return } + guard let id = currentModelID, + let engine = await pool.engine(for: id) else { + engineVersion = "" + return + } engineVersion = await engine.version } // MARK: - Lifecycle - /// Load a local model into the active engine. + /// Load a local model through the pool. The previously-loaded + /// model stays resident (subject to LRU + byte budget); only the + /// newly-loaded model becomes the `currentModel`. Detection-only + /// engines (`.swiftLM`, `.pythonMLX`) fail fast since the pool's + /// factory always produces `MLXSwiftEngine`. 
@discardableResult public func load(_ model: LocalModel) async -> Result { - guard let engine else { + guard engineID == .mlxSwift else { let err = EngineError.unsupportedOperation( "Engine \(engineID.rawValue) is detection-only in v0.1" ) @@ -96,9 +129,11 @@ public final class EngineCoordinator { status = .loading(model: model.id) await logs.log("Loading model: \(model.id)", level: .info, category: .engine) do { - try await engine.load(model) + _ = try await pool.load(model) currentModel = model + currentModelID = model.id status = .ready(model: model.id) + await refreshEngineVersion() await logs.log("Model loaded: \(model.id)", level: .info, category: .engine) // Fire the post-load hook so AppState can rehydrate per-model // state (e.g. Parameters Inspector overrides) even if the @@ -119,40 +154,48 @@ public final class EngineCoordinator { /// Blow away the prompt cache — both hot and cold tiers. Exposed /// to Settings' "Clear All KV Caches" button. /// - /// Today only the in-process `MLXSwiftEngine` carries a prompt - /// cache; the SwiftLM / Python-MLX detection-only stubs don't, so - /// downcasting and no-op-on-mismatch is the right shape. When - /// another engine grows a cache this will move onto the - /// `InferenceEngine` protocol. + /// v0.4: iterates every resident engine in the pool, so caches + /// belonging to non-current models also get flushed. Non-MLX + /// engines (future/detection-only) are skipped via `as?`. public func clearPromptCache() async { - guard let engine = engine as? MLXSwiftEngine else { return } - await engine.clearPromptCache() + for id in await pool.residentModelIDs() { + if let mlx = await pool.engine(for: id) as? MLXSwiftEngine { + await mlx.clearPromptCache() + } + } } - /// Release the loaded model. + /// Release the currently-active model from the pool. + /// + /// v0.4 semantics: this unloads **only** `currentModel`; other + /// previously-loaded models stay resident in the pool. 
That's the + /// main user-visible v0.4 improvement — cold-switching back to a + /// recent model no longer re-reads weights from disk. public func unload() async { - guard let engine else { return } - try? await engine.unload() + guard let id = currentModelID else { return } + await pool.unload(id) + currentModelID = nil currentModel = nil status = .idle + engineVersion = "" await logs.log("Engine unloaded", level: .info, category: .engine) } /// Stream tokens for `request` against the active engine. Returns an - /// empty stream that finishes immediately if no engine is wired. + /// empty stream that finishes immediately if no model is loaded. public func generate(_ request: GenerateRequest) -> AsyncThrowingStream { - guard let engine else { - return AsyncThrowingStream { continuation in - continuation.finish(throwing: EngineError.modelNotLoaded) - } - } - let stream = engine.generate(request) - // Wrap to bump the token counter as chunks arrive. + let pool = self.pool + let currentID = self.currentModelID return AsyncThrowingStream { continuation in Task { @MainActor in + guard let id = currentID, + let engine = await pool.engine(for: id) else { + continuation.finish(throwing: EngineError.modelNotLoaded) + return + } self.status = .generating do { - for try await chunk in stream { + for try await chunk in engine.generate(request) { if let usage = chunk.usage { self.tokensGeneratedTotal += usage.completionTokens } @@ -171,4 +214,28 @@ public final class EngineCoordinator { } } } + + // MARK: - Pool pass-throughs (v0.4) + + /// Pin / unpin a resident model so the LRU sweeper won't evict it + /// when the pool's byte budget is exceeded. No-op for models that + /// aren't currently resident. + public func setPinned(_ modelID: String, _ pinned: Bool) async { + await pool.setPinned(modelID, pinned) + } + + /// IDs of every currently-resident model in the pool (sorted). 
+    /// Used by the Models tab to render the "loaded" indicator and
+    /// by Settings' pool-stats panel (Task 4).
+    public func residentModelIDs() async -> [String] {
+        await pool.residentModelIDs()
+    }
+
+    /// Update the pool's byte budget. Called by the Settings
+    /// "Max resident memory" stepper (Task 4). Shrinking the budget
+    /// below current usage doesn't synchronously evict — eviction
+    /// happens on the next `load(_:)`.
+    public func setPoolBudget(bytes: Int64) async {
+        await pool.setMaxBytes(bytes)
+    }
+}

From 22e8b6ad4a177b56cf69430d76b400f9fe4a0d7e Mon Sep 17 00:00:00 2001
From: Kefeng Zhou
Date: Sat, 18 Apr 2026 22:12:22 +0700
Subject: [PATCH 5/6] feat(pool): Settings max-resident slider + Models-tab pin toggle

---
 .../MacMLXCore/Managers/SettingsManager.swift | 18 ++++++++--
 .../Views/ModelLibrary/LocalModelRow.swift    | 13 +++++++
 .../Views/ModelLibrary/ModelLibraryView.swift |  4 +++
 .../ModelLibrary/ModelLibraryViewModel.swift  | 18 ++++++++++
 .../Views/Settings/ModelPoolSection.swift     | 35 +++++++++++++++++++
 .../macMLX/Views/Settings/SettingsView.swift  | 12 +++++++
 6 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 macMLX/macMLX/Views/Settings/ModelPoolSection.swift

diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
index d7e218a..287d47a 100644
--- a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
+++ b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
@@ -62,6 +62,13 @@
     /// enforcement lands in v0.4.0.1.
     public var kvCacheColdGB: Int
+
+    /// ModelPool byte budget, expressed in gigabytes (Apple's decimal
+    /// convention: 1 GB = 10^9 bytes). When resident models' summed
+    /// estimated footprint exceeds this, the pool LRU-evicts non-pinned
+    /// entries. Default is 50% of the machine's physical RAM, clamped
+    /// to a 4 GB floor for small-memory Macs.
+ public var maxResidentMemoryGB: Int + // MARK: Factory /// Sensible out-of-the-box defaults — used when no settings file exists. @@ -81,7 +88,8 @@ public struct Settings: Codable, Equatable, Sendable { logRetentionDays: 7, hfEndpoint: "https://huggingface.co", kvCacheHotMB: 512, - kvCacheColdGB: 20 + kvCacheColdGB: 20, + maxResidentMemoryGB: max(4, Int(MemoryProbe.totalMemoryGB()) / 2) ) // MARK: Init @@ -99,7 +107,8 @@ public struct Settings: Codable, Equatable, Sendable { logRetentionDays: Int, hfEndpoint: String = "https://huggingface.co", kvCacheHotMB: Int = 512, - kvCacheColdGB: Int = 20 + kvCacheColdGB: Int = 20, + maxResidentMemoryGB: Int = max(4, Int(MemoryProbe.totalMemoryGB()) / 2) ) { self.modelDirectory = modelDirectory self.preferredEngine = preferredEngine @@ -114,6 +123,7 @@ public struct Settings: Codable, Equatable, Sendable { self.hfEndpoint = hfEndpoint self.kvCacheHotMB = kvCacheHotMB self.kvCacheColdGB = kvCacheColdGB + self.maxResidentMemoryGB = maxResidentMemoryGB } // MARK: - Codable (backward-compat decode) @@ -135,6 +145,7 @@ public struct Settings: Codable, Equatable, Sendable { case hfEndpoint case kvCacheHotMB case kvCacheColdGB + case maxResidentMemoryGB } public init(from decoder: Decoder) throws { @@ -153,6 +164,9 @@ public struct Settings: Codable, Equatable, Sendable { ?? "https://huggingface.co" self.kvCacheHotMB = try c.decodeIfPresent(Int.self, forKey: .kvCacheHotMB) ?? 512 self.kvCacheColdGB = try c.decodeIfPresent(Int.self, forKey: .kvCacheColdGB) ?? 20 + self.maxResidentMemoryGB = + (try c.decodeIfPresent(Int.self, forKey: .maxResidentMemoryGB)) + ?? 
max(4, Int(MemoryProbe.totalMemoryGB()) / 2) } } diff --git a/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift b/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift index bf94d7a..3d74780 100644 --- a/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift +++ b/macMLX/macMLX/Views/ModelLibrary/LocalModelRow.swift @@ -9,9 +9,11 @@ struct LocalModelRow: View { let model: LocalModel let isLoaded: Bool let isLoading: Bool + let isPinned: Bool let hasUpdateAvailable: Bool let onLoad: () -> Void let onUnload: () -> Void + let onTogglePin: () -> Void let onDelete: () -> Void var body: some View { @@ -75,6 +77,13 @@ struct LocalModelRow: View { .controlSize(.small) } + Button(action: onTogglePin) { + Image(systemName: isPinned ? "pin.fill" : "pin") + .foregroundStyle(isPinned ? .orange : .secondary) + } + .buttonStyle(.plain) + .help(isPinned ? "Pinned — won't auto-evict" : "Pin to keep resident") + Button(role: .destructive, action: onDelete) { Image(systemName: "trash") } @@ -102,18 +111,22 @@ struct LocalModelRow: View { model: model, isLoaded: true, isLoading: false, + isPinned: true, hasUpdateAvailable: false, onLoad: {}, onUnload: {}, + onTogglePin: {}, onDelete: {} ) LocalModelRow( model: model, isLoaded: false, isLoading: false, + isPinned: false, hasUpdateAvailable: true, onLoad: {}, onUnload: {}, + onTogglePin: {}, onDelete: {} ) } diff --git a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift index 0dd094a..b204090 100644 --- a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift +++ b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryView.swift @@ -160,6 +160,7 @@ private struct ModelLibraryContent: View { model: model, isLoaded: viewModel.loadedModelID == model.id, isLoading: viewModel.loadingModelID == model.id, + isPinned: viewModel.pinnedModelIDs.contains(model.id), hasUpdateAvailable: viewModel.modelsWithUpdate.contains(model.id), onLoad: { Task { await viewModel.loadModel(model) } @@ -167,6 
+168,9 @@ private struct ModelLibraryContent: View {
                     onUnload: {
                         Task { await viewModel.unloadModel() }
                     },
+                    onTogglePin: {
+                        Task { await viewModel.togglePin(model) }
+                    },
                     onDelete: {
                         viewModel.deleteModel(model)
                     }
diff --git a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
index 02082e1..288187d 100644
--- a/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
+++ b/macMLX/macMLX/Views/ModelLibrary/ModelLibraryViewModel.swift
@@ -48,6 +48,11 @@
     /// Model IDs for which an update is available on HF.
     var modelsWithUpdate: Set<String> = []
 
+    /// Model IDs the user has pinned — ModelPool won't LRU-evict these.
+    /// v0.4 MVP: in-memory only, reset on app launch. Disk persistence
+    /// is deferred per the plan's "Out of scope" section.
+    var pinnedModelIDs: Set<String> = []
+
     private var lastUpdateCheck: Date?
     private let updateCheckInterval: TimeInterval = 24 * 60 * 60 // 1 day
 
@@ -178,6 +183,19 @@
         await coordinator.unload()
     }
 
+    /// Flip pin state for one model: update the pool, then the
+    /// observable `pinnedModelIDs`. NOTE(review): `setPinned` no-ops for
+    /// non-resident models and `load` creates entries unpinned — confirm.
+ func togglePin(_ model: LocalModel) async { + let nowPinned = !pinnedModelIDs.contains(model.id) + await coordinator.setPinned(model.id, nowPinned) + if nowPinned { + pinnedModelIDs.insert(model.id) + } else { + pinnedModelIDs.remove(model.id) + } + } + func deleteModel(_ model: LocalModel) { do { try FileManager.default.removeItem(at: model.directory) diff --git a/macMLX/macMLX/Views/Settings/ModelPoolSection.swift b/macMLX/macMLX/Views/Settings/ModelPoolSection.swift new file mode 100644 index 0000000..2bcd940 --- /dev/null +++ b/macMLX/macMLX/Views/Settings/ModelPoolSection.swift @@ -0,0 +1,35 @@ +// ModelPoolSection.swift +// macMLX +// +// Settings section exposing the v0.4 ModelPool "max resident memory" +// budget. Changing the stepper value both persists to +// `Settings.maxResidentMemoryGB` and pushes the new byte budget into +// `EngineCoordinator.setPoolBudget(bytes:)` so the live pool picks it +// up without a restart. Pin/unpin controls live on the Models tab's +// `LocalModelRow` instead — this section is only about the byte cap. 
+ +import SwiftUI +import MacMLXCore + +struct ModelPoolSection: View { + @Binding var maxResidentGB: Int + + var body: some View { + Section("Model Pool") { + HStack { + Text("Max resident memory") + Spacer() + Stepper( + value: $maxResidentGB, + in: 2...256, + step: 1 + ) { + Text(String(maxResidentGB) + " GB") + .font(.system(.body, design: .monospaced)) + .frame(minWidth: 80, alignment: .trailing) + } + } + .help("When multiple loaded models exceed this, the least-recently-used non-pinned one is unloaded.") + } + } +} diff --git a/macMLX/macMLX/Views/Settings/SettingsView.swift b/macMLX/macMLX/Views/Settings/SettingsView.swift index a8595de..52ad485 100644 --- a/macMLX/macMLX/Views/Settings/SettingsView.swift +++ b/macMLX/macMLX/Views/Settings/SettingsView.swift @@ -20,6 +20,7 @@ struct SettingsView: View { @State private var hfEndpoint: String = "https://huggingface.co" @State private var kvCacheHotMB: Int = 512 @State private var kvCacheColdGB: Int = 20 + @State private var maxResidentMemoryGB: Int = 8 var body: some View { Form { @@ -67,6 +68,16 @@ struct SettingsView: View { Task { await appState.updateSettings { $0.kvCacheColdGB = newValue } } } + ModelPoolSection(maxResidentGB: $maxResidentMemoryGB) + .onChange(of: maxResidentMemoryGB) { _, newValue in + Task { + await appState.updateSettings { $0.maxResidentMemoryGB = newValue } + await appState.coordinator.setPoolBudget( + bytes: Int64(newValue) * 1_000_000_000 + ) + } + } + downloadsSection rerunSetupSection @@ -171,6 +182,7 @@ struct SettingsView: View { hfEndpoint = s.hfEndpoint kvCacheHotMB = s.kvCacheHotMB kvCacheColdGB = s.kvCacheColdGB + maxResidentMemoryGB = s.maxResidentMemoryGB } private func showModelDirectoryPicker() { From eb4fa3cf19375fe237bddd3111cd6ff4aac8af09 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sat, 18 Apr 2026 22:12:58 +0700 Subject: [PATCH 6/6] docs: v0.4 ModelPool changelog entry --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 06d7b0a..3a65b33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,16 @@ Versioning follows [Semantic Versioning](https://semver.org/). - Debug-level Logs tab entries `Prompt cache HIT — restored N tokens` / `Prompt cache MISS — cold prefill of N tokens` under the `engine` category, so you can see cache effectiveness. +- **Multi-model pool** (v0.4.0 engine parity, part 2 of 3). Load + multiple models at once — previously the engine had to unload + the old model before loading a new one, which meant every API + cold-swap paid the full weight-read cost. Pool is bounded by a + user-configurable resident memory cap (Settings → Model Pool; + default 50% of total RAM). Least-recently-used non-pinned + models auto-evict when the cap is exceeded. Pin a model from + its row in the Models tab (pin icon) to keep it resident + regardless of LRU order. Pinned state is in-memory for this + release; persistence across restarts is a follow-up. ---