Pushkinist · Pushkinist · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/crates/rmlx-cli/src/commands/serve.rs b/crates/rmlx-cli/src/commands/serve.rs
@@ -1132,15 +1132,29 @@ pub(crate) fn run_serve(
             tts_model: Arc::new(parking_lot::RwLock::new(None)),
         };
 
-        // Eager model preload — load every registry entry before
-        // serving requests so cold TTFT does not include model-load overhead.
+        // Eager model preload — warm the resident set before serving requests
+        // so cold TTFT does not include model-load overhead. Bounded to AT MOST
+        // `cap` entries (`max_loaded_models`, clamped to >= 1): the `cap`
+        // alphabetically-first ids, since `registry.list()` iterates a BTreeMap
+        // sorted by id — not JSON order. Every load past the cap would only
+        // evict an already-warmed model (see `AppState::ensure_loaded` LRU
+        // swap), so it is pure load-cost + transient memory pressure with no
+        // larger resident set. The rest stay lazy — the documented
+        // load-on-demand / idle-unload path handles them on first request.
         // `ensure_loaded` is synchronous (CPU-bound disk + dequant); run it in
         // the blocking-thread pool so we do not stall the async runtime.
         // Best-effort: a load failure logs a warning but does not abort startup
         // (the first real request will attempt the load again via the normal
         // on-demand path and surface a 503 if it still fails).
         {
-            let ids: Vec<String> = state.registry.list().iter().map(|e| e.id.clone()).collect();
+            let cap = max_loaded_models.max(1);
+            let ids: Vec<String> = state
+                .registry
+                .list()
+                .iter()
+                .take(cap)
+                .map(|e| e.id.clone())
+                .collect();
             let state_ref = state.clone();
             tokio::task::spawn_blocking(move || {
                 for id in &ids {

diff --git a/crates/rmlx-cli/tests/e2e/manifest.toml b/crates/rmlx-cli/tests/e2e/manifest.toml
@@ -715,8 +715,9 @@ tags = ["phase2"]
 
 # Multi-model lifecycle: the runner owns a registry-mode serve (Bonsai + a 2nd
 # model resolved from GEMMA4_E2B) under single-MLX discipline. Proves:
-#   (a) load A → loaded; (c) cap=1 eager preload of [A,B] → B resident, A LRU-
-#   evicted (status flips); (d) explicit unload B → loaded:false, 2nd unload →
+#   (a) load A → loaded; (c) cap=1 warms the alphabetically-first id (one entry,
+#   bounded to cap); explicit load B forces the LRU swap → B resident, A evicted
+#   (status flips); (d) explicit unload B → loaded:false, 2nd unload →
 #   404; (e) claim enforcement — a 2nd `rmlx serve` on the HELD port is rejected
 #   (exit 11, no competing Metal context). When GEMMA4_E2B is absent the runner
 #   runs the single-model subset (leg a + claim leg) and marks the 2-model legs

diff --git a/crates/rmlx-cli/tests/e2e/runner.rs b/crates/rmlx-cli/tests/e2e/runner.rs
@@ -1957,9 +1957,11 @@ fn assert_cache_hit_equivalence(
 /// registry path (multiple model entries) instead of a single `--model`, and
 /// leaves `RUST_LOG=warn` (the lifecycle proof reads the HTTP API, not logs).
 ///
-/// Registry mode eagerly pre-loads every entry at startup, bounded by the slot
-/// LRU at `cap` — so on a green `/health` the resident set is already the
-/// `cap`-survivor of the eager preload (see serve.rs "Eager model preload").
+/// Registry mode eagerly pre-loads AT MOST `cap` entries at startup (the
+/// alphabetically-first `min(cap, N)` model ids, since registry entries are
+/// BTreeMap-sorted by id; see serve.rs "Eager model preload") — so on a green
+/// `/health` those ids are already resident and the rest stay lazy until their
+/// first request.
 fn spawn_serve_registry(
     registry_json: &std::path::Path,
     port: u16,
@@ -2131,9 +2133,11 @@ fn assert_model_lifecycle(
     }
 
     // ── Legs (a)+(c)+(d): cap=1 registry with A (+B when present). ───────────
-    // With cap=1 + eager preload, the LAST registry entry survives the preload.
-    // Order [A, B] → B survives → A evicted. That proves leg (c) LRU eviction
-    // directly out of the eager preload. With only A, A is resident (leg a).
+    // With cap=1, eager preload warms whichever id sorts alphabetically first
+    // (registry entries are BTreeMap-sorted by id, not by JSON order). The
+    // defensive load-B below pins the end state to B resident, A not resident.
+    // NOTE(review): that is a real LRU swap (leg c) only if A was preloaded —
+    // if B sorts first, A was never loaded. With only A, A is resident (leg a).
     let entries_1: Vec<(&str, &std::path::Path)> = match (&model_b, &id_b) {
         (Some(pb), Some(idb)) => vec![(id_a, model_a), (idb.as_str(), pb.as_path())],
         _ => vec![(id_a, model_a)],
@@ -2158,7 +2162,9 @@ fn assert_model_lifecycle(
     // Two-model legs (a)/(c)/(d) when B is present; single-model leg (a) only
     // otherwise.
     if let Some(idb) = id_b.clone() {
-        // Eager preload of [A,B] at cap=1 → B resident, A evicted: leg (c).
+        // Eager preload at cap=1 warms the alphabetically-first id (one entry);
+        // the other stays lazy. The defensive load-B below leaves B resident
+        // either way — an LRU swap (leg c) whenever A was the id preloaded.
         let a_loaded = match model_loaded(port, id_a) {
             Ok(v) => v,
             Err(e) => return fail_lc(&cap1_guard, &lc_home, mk, format!("status A (cap1): {e}")),

diff --git a/docs/CLI.md b/docs/CLI.md
@@ -58,7 +58,7 @@ mutually exclusive.
 | Flag | Type | Default | Description |
 |---|---|---|---|
 | `--model` | path | — | Path to a model snapshot directory. Mutually exclusive with `--registry`. |
-| `--registry` | path | — | Path to a JSON registry file. Format: `{"models":[{"id":"name","path":"/abs/path"},…]}`. Mutually exclusive with `--model`. |
+| `--registry` | path | — | Path to a JSON registry file. Format: `{"models":[{"id":"name","path":"/abs/path"},…]}`. Mutually exclusive with `--model`. At startup the server eagerly warms **at most `--max-loaded-models`** entries (the alphabetically-first `cap` model ids, since the registry iterates entries sorted by id — not JSON array order); the rest stay lazy and load on first request (load-on-demand + idle-unload). A large registry therefore does not pull every model through GPU memory at boot. |
 | `--profile` | string | — | Named launch profile from `<RMLX_HOME>/profiles.toml`. CLI flags override profile values. See `rmlx profile list`. |
 | `--port` | u16 | 8080 | TCP port to listen on. |
 | `--host` | string | `127.0.0.1` | Host or IP to bind. |
@@ -83,7 +83,7 @@ mutually exclusive.
 | `--turbo-flash-lock` | bool flag | off | Enable TurboFlash lock variant. Has no effect unless `--turbo-flash` or `RMLX_TURBO_FLASH=1` is also active. |
 | `--planar-flash-decode` | `on` \| `off` \| `auto` | `auto` | PlanarK single-pass flash-decode MSL kernel. `auto` (default): resolves OFF on every host — validation confirmed the kernel is bit-for-bit identical to the fused chain (`update_and_sdpa_planar_k_fused` dispatch_delta>0 with output matching OFF byte-for-byte on Bonsai) but did not deliver a measurable decode-TPS gain (-0.19% mean at 4k canary; well below the ≥10% Auto-flip gate). A pre-existing PlanarK-on-Bonsai long-prompt chunked-prefill bug (`docs/KV_QUANT.md` §"Correctness gap") also prevented the NIAH correctness anchor from passing on the only reachable arch. `on` forces `RMLX_PLANAR_FLASH_DECODE=1` (opt-in ablation). `off` **hard-overrides** — removes any pre-existing `RMLX_PLANAR_FLASH_DECODE` from the env so a stale `=1` cannot latch the OnceLock. |
 | `--require-smoke-probe` | bool flag | off | Run 8-token smoke probe on every model load; reject `BrokenPunctLoop` / `BrokenNan` results with HTTP 503. |
-| `--max-loaded-models` | usize | 1 | Maximum models held resident in GPU memory. LRU eviction when exceeded. |
+| `--max-loaded-models` | usize | 1 | Maximum models held resident in GPU memory. LRU eviction when exceeded. Also bounds registry eager-preload: only the alphabetically-first `min(cap, N)` model ids are warmed at boot, where `cap` is this value (treated as at least 1) and `N` is the number of registry entries (each load past the cap would only evict an already-warmed model, so preloading more is pure waste). |
 | `--max-queue-depth` | usize | 64 | FIFO admission queue depth. Requests beyond this limit receive HTTP 429. `0` = unlimited. |
 | `--adaptive-admission` | bool flag | off | Enable the in-process adaptive admission controller. When set, the controller adjusts `max_queue_depth` dynamically based on SLA telemetry and rejects requests with HTTP 503 + `Retry-After: 5` when the end-to-end step estimate exceeds `2 × step-target-ms`. When absent, the static `--max-queue-depth` is used unchanged. |
 | `--step-target-ms` | u64 | 500 | End-to-end step SLA target in milliseconds for the adaptive controller. Anticipatory 503 fires when `est_step > 2 × this`. Requires `--adaptive-admission`. `--ttft-target-ms` is accepted as a hidden alias for backward compatibility. |