diff --git a/crates/rmlx-cli/src/commands/serve.rs b/crates/rmlx-cli/src/commands/serve.rs index e22c7f0..0ede8bf 100644 --- a/crates/rmlx-cli/src/commands/serve.rs +++ b/crates/rmlx-cli/src/commands/serve.rs @@ -1132,15 +1132,29 @@ pub(crate) fn run_serve( tts_model: Arc::new(parking_lot::RwLock::new(None)), }; - // Eager model preload — load every registry entry before - // serving requests so cold TTFT does not include model-load overhead. + // Eager model preload — warm the resident set before serving requests + // so cold TTFT does not include model-load overhead. Bounded to AT MOST + // `max_loaded_models` entries (the `cap` alphabetically-first ids, since + // `registry.list()` iterates a BTreeMap sorted by id — not JSON order): + // anything beyond the resident cap would be evicted by the next + // `ensure_loaded` (see `AppState::ensure_loaded` LRU swap), so preloading + // it is pure load-cost + transient memory pressure with nothing kept. + // The rest stay lazy — the documented load-on-demand / idle-unload path + // handles them on first request. // `ensure_loaded` is synchronous (CPU-bound disk + dequant); run it in // the blocking-thread pool so we do not stall the async runtime. // Best-effort: a load failure logs a warning but does not abort startup // (the first real request will attempt the load again via the normal // on-demand path and surface a 503 if it still fails). { - let ids: Vec = state.registry.list().iter().map(|e| e.id.clone()).collect(); + let cap = max_loaded_models.max(1); + let ids: Vec = state + .registry + .list() + .iter() + .take(cap) + .map(|e| e.id.clone()) + .collect(); let state_ref = state.clone(); tokio::task::spawn_blocking(move || { for id in &ids { diff --git a/crates/rmlx-cli/tests/e2e/manifest.toml b/crates/rmlx-cli/tests/e2e/manifest.toml index 6ccf3a4..fb2a689 100644 --- a/crates/rmlx-cli/tests/e2e/manifest.toml +++ b/crates/rmlx-cli/tests/e2e/manifest.toml @@ -715,8 +715,9 @@ tags = ["phase2"] # Multi-model lifecycle: the runner owns a registry-mode serve (Bonsai + a 2nd # model resolved from GEMMA4_E2B) under single-MLX discipline. Proves: -# (a) load A → loaded; (c) cap=1 eager preload of [A,B] → B resident, A LRU- -# evicted (status flips); (d) explicit unload B → loaded:false, 2nd unload → +# (a) load A → loaded; (c) cap=1 warms the alphabetically-first id (one entry, +# bounded to cap); explicit load B forces the LRU swap → B resident, A evicted +# (status flips); (d) explicit unload B → loaded:false, 2nd unload → # 404; (e) claim enforcement — a 2nd `rmlx serve` on the HELD port is rejected # (exit 11, no competing Metal context). When GEMMA4_E2B is absent the runner # runs the single-model subset (leg a + claim leg) and marks the 2-model legs diff --git a/crates/rmlx-cli/tests/e2e/runner.rs b/crates/rmlx-cli/tests/e2e/runner.rs index 16deaea..528ad2b 100644 --- a/crates/rmlx-cli/tests/e2e/runner.rs +++ b/crates/rmlx-cli/tests/e2e/runner.rs @@ -1957,9 +1957,11 @@ fn assert_cache_hit_equivalence( /// registry path (multiple model entries) instead of a single `--model`, and /// leaves `RUST_LOG=warn` (the lifecycle proof reads the HTTP API, not logs). /// -/// Registry mode eagerly pre-loads every entry at startup, bounded by the slot -/// LRU at `cap` — so on a green `/health` the resident set is already the -/// `cap`-survivor of the eager preload (see serve.rs "Eager model preload"). +/// Registry mode eagerly pre-loads AT MOST `cap` entries at startup (the +/// alphabetically-first `min(cap, N)` model ids, since registry entries are +/// BTreeMap-sorted by id; see serve.rs "Eager model preload") — so on a green +/// `/health` those ids are already resident and the rest stay lazy until their +/// first request. fn spawn_serve_registry( registry_json: &std::path::Path, port: u16, @@ -2131,9 +2133,11 @@ fn assert_model_lifecycle( } // ── Legs (a)+(c)+(d): cap=1 registry with A (+B when present). ─────────── - // With cap=1 + eager preload, the LAST registry entry survives the preload. - // Order [A, B] → B survives → A evicted. That proves leg (c) LRU eviction - // directly out of the eager preload. With only A, A is resident (leg a). + // With cap=1, eager preload warms whichever id sorts alphabetically first + // (registry entries are BTreeMap-sorted by id, not by JSON order). The + // defensive load-B below forces the LRU swap regardless of which was + // preloaded: the resident set ends up B-resident, A-evicted — proving + // cap=1 LRU eviction (leg c). With only A, A is resident (leg a). let entries_1: Vec<(&str, &std::path::Path)> = match (&model_b, &id_b) { (Some(pb), Some(idb)) => vec![(id_a, model_a), (idb.as_str(), pb.as_path())], _ => vec![(id_a, model_a)], @@ -2158,7 +2162,9 @@ fn assert_model_lifecycle( // Two-model legs (a)/(c)/(d) when B is present; single-model leg (a) only // otherwise. if let Some(idb) = id_b.clone() { - // Eager preload of [A,B] at cap=1 → B resident, A evicted: leg (c). + // Eager preload at cap=1 warms the alphabetically-first id (one entry); + // the other stays lazy. The defensive load-B below forces the LRU + // swap regardless of which was preloaded: leg (c). let a_loaded = match model_loaded(port, id_a) { Ok(v) => v, Err(e) => return fail_lc(&cap1_guard, &lc_home, mk, format!("status A (cap1): {e}")), diff --git a/docs/CLI.md b/docs/CLI.md index 018a2af..a2f5e39 100644 --- a/docs/CLI.md +++ b/docs/CLI.md @@ -58,7 +58,7 @@ mutually exclusive. | Flag | Type | Default | Description | |---|---|---|---| | `--model` | path | — | Path to a model snapshot directory. Mutually exclusive with `--registry`. | -| `--registry` | path | — | Path to a JSON registry file. Format: `{"models":[{"id":"name","path":"/abs/path"},…]}`. Mutually exclusive with `--model`. | +| `--registry` | path | — | Path to a JSON registry file. Format: `{"models":[{"id":"name","path":"/abs/path"},…]}`. Mutually exclusive with `--model`. At startup the server eagerly warms **at most `--max-loaded-models`** entries (the alphabetically-first `cap` model ids, since the registry iterates entries sorted by id — not JSON array order); the rest stay lazy and load on first request (load-on-demand + idle-unload). A large registry therefore does not pull every model through GPU memory at boot. | | `--profile` | string | — | Named launch profile from `/profiles.toml`. CLI flags override profile values. See `rmlx profile list`. | | `--port` | u16 | 8080 | TCP port to listen on. | | `--host` | string | `127.0.0.1` | Host or IP to bind. | @@ -83,7 +83,7 @@ mutually exclusive. | `--turbo-flash-lock` | bool flag | off | Enable TurboFlash lock variant. Has no effect unless `--turbo-flash` or `RMLX_TURBO_FLASH=1` is also active. | | `--planar-flash-decode` | `on` \| `off` \| `auto` | `auto` | PlanarK single-pass flash-decode MSL kernel. `auto` (default): resolves OFF on every host — validation confirmed the kernel is bit-for-bit identical to the fused chain (`update_and_sdpa_planar_k_fused` dispatch_delta>0 with output matching OFF byte-for-byte on Bonsai) but did not deliver a measurable decode-TPS gain (-0.19% mean at 4k canary; well below the ≥10% Auto-flip gate). A pre-existing PlanarK-on-Bonsai long-prompt chunked-prefill bug (`docs/KV_QUANT.md` §"Correctness gap") also prevented the NIAH correctness anchor from passing on the only reachable arch. `on` forces `RMLX_PLANAR_FLASH_DECODE=1` (opt-in ablation). `off` **hard-overrides** — removes any pre-existing `RMLX_PLANAR_FLASH_DECODE` from the env so a stale `=1` cannot latch the OnceLock. | | `--require-smoke-probe` | bool flag | off | Run 8-token smoke probe on every model load; reject `BrokenPunctLoop` / `BrokenNan` results with HTTP 503. | -| `--max-loaded-models` | usize | 1 | Maximum models held resident in GPU memory. LRU eviction when exceeded. | +| `--max-loaded-models` | usize | 1 | Maximum models held resident in GPU memory. LRU eviction when exceeded. Also bounds registry eager-preload: only the alphabetically-first `min(cap, N)` model ids are warmed at boot (anything beyond the cap would be evicted by the next load, so preloading it is pure waste). | | `--max-queue-depth` | usize | 64 | FIFO admission queue depth. Requests beyond this limit receive HTTP 429. `0` = unlimited. | | `--adaptive-admission` | bool flag | off | Enable the in-process adaptive admission controller. When set, the controller adjusts `max_queue_depth` dynamically based on SLA telemetry and rejects requests with HTTP 503 + `Retry-After: 5` when the end-to-end step estimate exceeds `2 × step-target-ms`. When absent, the static `--max-queue-depth` is used unchanged. | | `--step-target-ms` | u64 | 500 | End-to-end step SLA target in milliseconds for the adaptive controller. Anticipatory 503 fires when `est_step > 2 × this`. Requires `--adaptive-admission`. `--ttft-target-ms` is accepted as a hidden alias for backward compatibility. |