17 commits
b42aa63
refactor: extract server and UI into mistralrs-serve crate
camalolo Apr 6, 2026
a40d295
feat(core): add idle timeout and pending model support
camalolo Apr 6, 2026
f820523
feat(cli): add model auto-discovery with filesystem watcher
camalolo Apr 6, 2026
c6a8ee3
feat(ui): redesign web UI with disk override
camalolo Apr 5, 2026
a2ba040
feat(ui): show all models with status and refresh
camalolo Apr 5, 2026
be4865a
style: apply cargo fmt to modified files
camalolo Apr 6, 2026
e1d138a
chore: add CUDA build scripts for Windows with MSVC preprocessor fix
camalolo Apr 6, 2026
6188ea7
docs: document ui/ directory override feature for custom web UI
camalolo Apr 6, 2026
717f252
docs: document model auto-discovery, lazy loading, and idle timeout
camalolo Apr 6, 2026
72fdb7c
docs: add pending status and idle timeout to API docs
camalolo Apr 6, 2026
063ac1e
docs: update AGENTS.md with CUDA build findings and fix .gitignore
camalolo Apr 7, 2026
0bf3df0
docs: fix AGENTS.md service manager and deployment details
camalolo Apr 7, 2026
b3eb46c
feat(ui): redesign web UI with dark/light theme and lazy model loading
camalolo Apr 7, 2026
1e4612a
feat(ui): improve web UI robustness with error handling and connectio…
camalolo Apr 7, 2026
92bbc6d
docs: fix NSSM quoting gotcha in AGENTS.md
camalolo Apr 7, 2026
93b002f
feat(ui): replace delete confirm dialogs with click-to-arm pattern
camalolo Apr 7, 2026
4bc7392
fix(core): prevent panics on unsupported GGUF architectures and clien…
camalolo Apr 7, 2026
7 changes: 6 additions & 1 deletion .gitignore
@@ -8,4 +8,9 @@ mistral.rs/
mistralrs-web-chat/cache

# mdbook output
docs/book/
docs/book/

# Local build tooling (machine-specific)
build.bat
nvcc_wrapper.bat
deploy.ps1
62 changes: 61 additions & 1 deletion AGENTS.md
@@ -8,17 +8,22 @@ This file provides instructions for AI agents to understand the layout of the `m
- `/mistralrs/` : Main Rust crate (text & multimodal inference API)
- `/mistralrs-core/` : Core inference logic and tensor operations (text models)
- `/mistralrs-vision/` : Image processing utilities (resizing, preprocessing for multimodal models)
- `/mistralrs-audio/` : Audio processing
- `/mistralrs-quant/` : Quantization support (ISQ, GGUF, GPTQ, AWQ, FP8, HQQ, etc.)
- `/mistralrs-paged-attn/`: PagedAttention implementation
- `/mistralrs-pyo3/` : Python bindings (PyO3)
- `/mistralrs-cli/` : Unified CLI binary (commands: run, serve, bench, from-config)
- `/mistralrs-server-core/`: Shared server core logic
- `/mistralrs-server/` : Server binary (standalone, separate from CLI)
- `/mistralrs-serve/` : Web UI and HTTP API (served by mistralrs-cli/mistralrs-server)
- `/mistralrs-mcp/` : Model Context Protocol client
- `/mistralrs-macros/` : Procedural macros for derive helpers
- `/mistralrs-web-chat/` : (Deprecated) Use `mistralrs serve --ui` instead
- `/mistralrs-bench/` : (Deprecated) Use `mistralrs bench` instead
- `/docs/` : Markdown documentation for models, features, and guides
- `/examples/` : Usage examples (Rust, Python, server samples, notebooks)
- `/chat_templates/` : Chat formatting templates (JSON/Jinja)
- `/scripts/` : Utility scripts (e.g., AWQ conversion)
- `/scripts/` : Utility scripts (e.g., AWQ conversion)

## Feature Organization

@@ -64,6 +69,61 @@ Mistral.rs supports multiple model types and advanced features via dedicated cra
cargo install --path mistralrs-cli --features "<features>"
```

### Building with CUDA on this machine

This machine has CUDA 13.2 at `E:\Cuda` and Visual Studio 2026 (v18) at `E:\Program Files\Microsoft Visual Studio\18\Community`. The shell session does NOT have MSVC in PATH by default — you must source `vcvars64.bat` first.

**The CCCL + MSVC preprocessor problem:** CUDA 13.2's CCCL library requires MSVC's conformant preprocessor (`/Zc:preprocessor`). MSVC defaults to the "traditional" preprocessor, causing a fatal `#error` in `<cuda/std/__cccl/preprocessor.h>`. The fix is to inject `--compiler-options /Zc:preprocessor` into every nvcc invocation. This is done via an nvcc wrapper script — see `nvcc_wrapper.bat` below. The wrapper is transparent to all CUDA builds (mistralrs-quant, mistralrs-core, candle-kernels) because `cudaforge` respects the `NVCC` env var.

**Local build scripts (DO NOT commit):**

`nvcc_wrapper.bat` — wraps nvcc, injecting `/Zc:preprocessor`:
```bat
@echo off
E:\Cuda\bin\nvcc.exe --compiler-options /Zc:preprocessor %*
```

`build.bat` — sets up MSVC toolchain, points NVCC to wrapper, runs cargo:
```bat
@echo off
call "E:\Program Files\Microsoft Visual Studio\18\Community\VC\Auxiliary\Build\vcvars64.bat" >NUL 2>&1
set CUDA_COMPUTE_CAP=86
set NVCC=%~dp0nvcc_wrapper.bat
cargo build --release -p mistralrs-cli --features cuda %*
```

**Build from PowerShell (one-liner, no local scripts needed):**
```powershell
pwsh -NoProfile -Command 'cmd /c "call ""E:\Program Files\Microsoft Visual Studio\18\Community\VC\Auxiliary\Build\vcvars64.bat"" >NUL 2>&1 && set CUDA_COMPUTE_CAP=86 && set NVCC=E:\Development\Rust\mistral.rs\nvcc_wrapper.bat && cargo build --release -p mistralrs-cli --features cuda 2>&1"'
```

**Key env vars:**
- `NVCC=<path_to_wrapper>` — tells `cudaforge` to use the wrapper instead of raw nvcc (required on this machine with CUDA 13.2 + MSVC)
- `CUDA_COMPUTE_CAP=86` — target GPU architecture (RTX 3070)
- `CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING=1` — does NOT work reliably; the wrapper approach above is required instead

**Build only the CLI (no CUDA kernels):**
```bash
cargo build --release -p mistralrs-cli
```

**Deploy to Windows service:**
- Service name: `MistralRs` (managed by NSSM — Non-Sucking Service Manager)
- Deploy dir: `E:\MistralRs\`
- Binary: `target\release\mistralrs.exe` → copy to `E:\MistralRs\mistralrs.exe`
- Start/stop (requires UAC elevation):
```powershell
Start-Process nssm -ArgumentList 'stop','MistralRs' -Verb RunAs -Wait
Start-Process nssm -ArgumentList 'start','MistralRs' -Verb RunAs -Wait
```
- Command: `mistralrs.exe serve --ui --idle-timeout-secs 1800 --models-dir=E:\MistralRs\models -p 1234`
- **NSSM quoting gotcha:** NSSM calls `CreateProcess` directly (not cmd.exe), so quotes around argument values with spaces get stripped. Use `--param=value` syntax (no quotes needed) instead of `--param "value with spaces"`. Example: `--models-dir=E:\MistralRs\models`, NOT `--models-dir "E:\MistralRs\models"`.
- Models dir: `E:\MistralRs\models\` (contains llama-3.1-8b, mistral-7b-v03, phi-3.5-mini, qwen3-4b, qwen3-8b)
- UI override dir: `E:\MistralRs\ui\` (optional disk-based UI files; accessed at `http://127.0.0.1:1234/ui` — note: no trailing slash)

**DO NOT commit:** `build.bat`, `deploy.ps1`, `nvcc_wrapper.bat` (local tooling)
**DO NOT touch:** `E:\MistralRs\` deployment directory (permission-blocked from git)

## Models

When integrating a new model, make sure it respects all of the `VarBuilder` `.pp` calls. In Candle, a `VarBuilder` maintains an internal path vector that acts like a “current working directory” for model weights; every call to `pp("sub")` (alias for `push_prefix`) clones the builder and appends `sub`, so successive calls accumulate a dotted prefix such as `transformer.h.0` while leaving the original builder untouched. When you eventually call `get(...)`, Candle joins that prefix with the tensor name (`prefix + "." + name`) and looks it up in the checkpoint backend, producing keys that exactly match the dot-separated names emitted by PyTorch’s `state_dict`/`named_parameters`, which means PyTorch-trained weights can be loaded without any renaming. This lets you recreate the PyTorch module tree in Rust by “walking” it: e.g. `vb.pp("word_embeddings")` grabs `word_embeddings.*`, while a chain like `vb.pp("encoder").pp("layers").pp(i.to_string())` targets keys such as `encoder.layers.0.*`, exactly as shown in community tutorials porting Transformers models to Candle. As one maintainer put it, the prefix system lets you “cd” around the parameter hierarchy, giving a lightweight namespace mechanism that keeps Candle fully compatible with PyTorch naming conventions while remaining ergonomic to use.
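The prefix mechanism described above can be mimicked in a standalone toy sketch (the real `VarBuilder` lives in `candle-nn` and also carries tensors, dtype, and device state; this only reproduces the path-joining behavior):

```rust
// Toy stand-in for candle-nn's VarBuilder; only the prefix bookkeeping
// is modeled here, not tensor storage or checkpoint lookup.
#[derive(Clone)]
struct ToyVarBuilder {
    prefix: Vec<String>,
}

impl ToyVarBuilder {
    fn new() -> Self {
        Self { prefix: Vec::new() }
    }

    /// Like `pp` / `push_prefix`: clone the builder and append one path
    /// component, leaving the original builder untouched.
    fn pp(&self, sub: &str) -> Self {
        let mut next = self.clone();
        next.prefix.push(sub.to_string());
        next
    }

    /// Like `get`: join the accumulated prefix with the tensor name to
    /// form the dotted key looked up in the checkpoint.
    fn key(&self, name: &str) -> String {
        let mut parts = self.prefix.clone();
        parts.push(name.to_string());
        parts.join(".")
    }
}

fn main() {
    let vb = ToyVarBuilder::new();
    let layer0 = vb.pp("encoder").pp("layers").pp("0");
    // Produces "encoder.layers.0.weight", matching PyTorch state_dict naming.
    println!("{}", layer0.key("weight"));
    // `vb` itself is unchanged, so sibling modules can branch from it:
    println!("{}", vb.pp("word_embeddings").key("weight"));
}
```

Because `pp` clones rather than mutates, sibling modules can branch from the same parent builder, exactly like walking a directory tree.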
33 changes: 25 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
"mistralrs-cli",
"mistralrs-core",
"mistralrs-pyo3",
"mistralrs-serve",
"mistralrs",
"mistralrs-bench",
"mistralrs-vision",
@@ -87,6 +88,7 @@ half = "2.7.1"
rayon = "1.11.0"
url = "2.5.8"
utoipa = "5.4.0"
comfy-table = "7.1"
walkdir = "2.5.0"
data-url = "0.3.2"
float8 = "0.7.0"
@@ -187,6 +189,7 @@ mistralrs-paged-attn = { path = "mistralrs-paged-attn", version = "0.8.1" }
mistralrs-quant = { path = "mistralrs-quant", version = "0.8.1" }
mistralrs-vision = { path = "mistralrs-vision", version = "0.8.1" }
mistralrs-server-core = { path = "mistralrs-server-core", version = "0.8.1" }
mistralrs-serve = { path = "mistralrs-serve", version = "0.8.1" }
mistralrs = { path = "mistralrs", version = "0.8.1" }
mistralrs-audio = { path = "mistralrs-audio", version = "0.8.1" }
mistralrs-mcp = { path = "mistralrs-mcp", version = "0.8.1" }
6 changes: 6 additions & 0 deletions build.bat
@@ -0,0 +1,6 @@
@echo off
call "E:\Program Files\Microsoft Visual Studio\18\Community\VC\Auxiliary\Build\vcvars64.bat" >NUL 2>&1
set CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING=1
set CUDA_COMPUTE_CAP=89
set NVCC=%~dp0nvcc_wrapper.bat
cargo build --release -p mistralrs-cli --features cuda %*
10 changes: 10 additions & 0 deletions docs/CLI.md
@@ -129,6 +129,12 @@ mistralrs serve -m Qwen/Qwen3-4B -p 3000

# Start server with MCP support
mistralrs serve -m Qwen/Qwen3-4B --mcp-port 8081

# Serve all models in a directory with lazy loading
mistralrs serve --models-dir ./models --idle-timeout-secs 1800

# Combine auto-discovery with the web UI
mistralrs serve --models-dir ./models --ui --idle-timeout-secs 1800
```

**Server Options:**
@@ -138,6 +144,10 @@ mistralrs serve -m Qwen/Qwen3-4B --mcp-port 8081
| `-p, --port <PORT>` | `1234` | HTTP server port |
| `--host <HOST>` | `0.0.0.0` | Bind address |
| `--ui` | disabled | Serve built-in web UI at `/ui` |
| `--models-dir <PATH>` | none | Directory to scan for models (auto-discovery / lazy loading). Each subdirectory is treated as a separate model, loaded on first request |
| `--idle-timeout-secs <N>` | `0` (disabled) | Auto-unload models after N seconds without requests. Unloaded models reload automatically on next request |
| `--mcp-port <PORT>` | none | MCP protocol server port |
| `--mcp-config <PATH>` | none | MCP client configuration file |

To use a custom web UI, place your files in a `ui/` directory next to the `mistralrs` executable. Requests under `/ui/` are served from this directory when the requested file exists, falling back to the built-in UI otherwise. The server logs a message at startup when a UI override directory is detected.

4 changes: 4 additions & 0 deletions docs/CLI_CONFIG.md
@@ -57,6 +57,10 @@ HTTP server configuration.
| `port` | `1234` | HTTP server port |
| `host` | `"0.0.0.0"` | Bind address |
| `ui` | `false` | Serve built-in web UI at `/ui` |
| `models_dir` | none | Directory to scan for models (auto-discovery / lazy loading). Each subdirectory is treated as a separate model, loaded on first request |
| `idle_timeout_secs` | `0` (disabled) | Auto-unload models after N seconds without requests. Unloaded models reload automatically on next request |
| `mcp_port` | none | MCP protocol server port (enables MCP if set) |
| `mcp_config` | none | MCP client configuration file path |

Custom UI files can be placed in a `ui/` directory next to the executable (no config option needed). Files in this directory override the built-in UI.

12 changes: 12 additions & 0 deletions docs/GETTING_STARTED.md
@@ -87,6 +87,18 @@ mistralrs serve --ui --isq 4 -m Qwen/Qwen3-4B
# Visit http://localhost:1234/ui
```

To override the built-in UI with your own files, create a `ui/` directory next to the `mistralrs` executable. Files placed there are served in place of the embedded UI.

### Auto-Discovery and Lazy Loading

Instead of specifying a single model with `-m`, you can point `--models-dir` at a directory of models. Each subdirectory is treated as a separate model (GGUF, GGML, or Safetensors/Plain format is auto-detected). Models are loaded lazily on first request, and the directory is watched for changes, so adding or removing a model subdirectory is picked up automatically.

Combined with `--idle-timeout-secs`, models are automatically unloaded after a period of inactivity and reloaded on demand. This is useful for managing GPU memory when serving many models.

```bash
mistralrs serve --models-dir ./models --ui --idle-timeout-secs 1800
```
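A minimal sketch of how per-subdirectory format detection could work (a hypothetical illustration; the actual discovery and watcher logic in `mistralrs-cli` may apply different rules, such as inspecting `config.json`):

```rust
use std::fs;
use std::path::Path;

// Hypothetical format classification; the real auto-discovery in
// mistralrs-cli may differ.
#[derive(Debug, PartialEq)]
enum ModelFormat {
    Gguf,
    Ggml,
    Safetensors,
    Unknown,
}

/// Classify one model subdirectory by the file extensions it contains.
fn detect_format(model_dir: &Path) -> ModelFormat {
    let exts: Vec<String> = fs::read_dir(model_dir)
        .into_iter()
        .flatten()
        .flatten()
        .filter_map(|entry| {
            entry
                .path()
                .extension()
                .and_then(|e| e.to_str().map(str::to_string))
        })
        .collect();
    if exts.iter().any(|e| e == "gguf") {
        ModelFormat::Gguf
    } else if exts.iter().any(|e| e == "ggml") {
        ModelFormat::Ggml
    } else if exts.iter().any(|e| e == "safetensors") {
        ModelFormat::Safetensors
    } else {
        ModelFormat::Unknown
    }
}

fn main() {
    // Treat each subdirectory of ./models as its own model, as
    // `--models-dir ./models` does.
    if let Ok(entries) = fs::read_dir("./models") {
        for entry in entries.flatten() {
            if entry.path().is_dir() {
                println!("{:?} -> {:?}", entry.file_name(), detect_format(&entry.path()));
            }
        }
    }
}
```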

## Step 5: Use the Python SDK

```bash
5 changes: 5 additions & 0 deletions docs/HTTP.md
@@ -544,6 +544,7 @@ The `status` field in responses can be one of:
| Status | Description |
|--------|-------------|
| `loaded` | Model is loaded and ready to serve requests |
| `pending` | Model is registered but not yet loaded (will load on first request) |
| `unloaded` | Model is unloaded but can be reloaded |
| `reloading` | Model is currently being reloaded |
| `not_found` | Model ID not recognized |
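Client code consuming this API might mirror the documented status strings with a small enum (a hypothetical sketch; the server's internal type may be shaped differently):

```rust
// Hypothetical mirror of the documented status strings; the server's own
// type in mistralrs-server-core may differ.
#[derive(Debug, Clone, Copy, PartialEq)]
enum ModelStatus {
    Loaded,
    Pending,
    Unloaded,
    Reloading,
    NotFound,
}

impl ModelStatus {
    fn as_str(self) -> &'static str {
        match self {
            ModelStatus::Loaded => "loaded",
            ModelStatus::Pending => "pending",
            ModelStatus::Unloaded => "unloaded",
            ModelStatus::Reloading => "reloading",
            ModelStatus::NotFound => "not_found",
        }
    }

    /// Both `pending` and `unloaded` resolve to a load on the next request.
    fn loads_on_next_request(self) -> bool {
        matches!(self, ModelStatus::Pending | ModelStatus::Unloaded)
    }
}

fn main() {
    assert!(ModelStatus::Pending.loads_on_next_request());
    println!("{}", ModelStatus::Pending.as_str());
}
```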
@@ -559,6 +560,10 @@ When an error occurs, the response may include an `error` field with additional
}
```

### Idle Timeout

When the `--idle-timeout-secs` option is set (or `idle_timeout_secs` in the config file), models are automatically unloaded after the specified number of seconds without any requests. Unloaded models are reloaded automatically on the next request targeting them. This works in both single-model and multi-model modes.

### Auto-Reload Behavior

When a request (e.g., chat completion) is sent to an unloaded model, the model will automatically reload before processing the request. This enables a "lazy loading" pattern where models are only loaded when needed, helping manage GPU memory efficiently.
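The lazy-load / idle-unload lifecycle described in these two sections can be sketched as a tiny state machine (a hypothetical illustration, not the server's actual bookkeeping):

```rust
use std::time::{Duration, Instant};

// Hypothetical per-model slot; the real server also tracks loader config,
// weights, and request queues in addition to this timestamp.
enum Slot {
    Loaded { last_used: Instant },
    Unloaded,
}

impl Slot {
    /// Background sweep: unload when idle longer than the timeout.
    /// A timeout of zero disables idle unloading, mirroring
    /// `--idle-timeout-secs 0`.
    fn sweep(&mut self, idle_timeout: Duration, now: Instant) {
        if idle_timeout.is_zero() {
            return;
        }
        if let Slot::Loaded { last_used } = self {
            if now.duration_since(*last_used) >= idle_timeout {
                // Drop the weights to free GPU memory.
                *self = Slot::Unloaded;
            }
        }
    }

    /// Called on every incoming request: reload if needed, refresh the
    /// last-used timestamp, and report whether a (re)load happened.
    fn touch(&mut self, now: Instant) -> bool {
        let reloaded = matches!(self, Slot::Unloaded);
        *self = Slot::Loaded { last_used: now };
        reloaded
    }
}

fn main() {
    let mut slot = Slot::Unloaded;
    let now = Instant::now();
    assert!(slot.touch(now)); // first request triggers a lazy load
    slot.sweep(Duration::from_secs(1800), now);
    assert!(!slot.touch(now)); // still loaded: no idle time has passed
}
```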
5 changes: 5 additions & 0 deletions docs/multi_model/overview.md
@@ -255,12 +255,17 @@ Response:

Possible status values:
- `loaded`: Model is loaded and ready
- `pending`: Model is registered but not yet loaded (will load on first request)
- `unloaded`: Model is unloaded but can be reloaded
- `reloading`: Model is currently being reloaded
- `not_found`: Model ID not recognized
- `no_loader_config`: Model cannot be reloaded (missing loader configuration)
- `internal_error`: An internal error occurred

### Idle Timeout

When the `--idle-timeout-secs` option is set (or `idle_timeout_secs` in the config file), models are automatically unloaded after the specified number of seconds without any requests. Unloaded models are reloaded automatically on the next request targeting them.

### Auto-Reload

When a request is sent to an unloaded model, it will automatically reload before processing the request. This enables a "lazy loading" pattern where models are only loaded when needed.
31 changes: 12 additions & 19 deletions mistralrs-cli/Cargo.toml
@@ -21,6 +21,7 @@ doc = false
[dependencies]
mistralrs-core = { workspace = true }
mistralrs-server-core = { workspace = true }
mistralrs-serve = { workspace = true }
mistralrs = { workspace = true }
candle-core = { workspace = true }

@@ -36,29 +37,21 @@ toml = { workspace = true }
ctrlc = { workspace = true }
rustyline = { workspace = true }
directories = { workspace = true }
axum = { workspace = true, features = ["multipart", "ws"] }
either = { workspace = true }
indexmap = { workspace = true }
regex = { workspace = true }
include_dir = { workspace = true }
mime_guess = { workspace = true }
tower-http = { workspace = true, features = ["fs"] }
uuid = { workspace = true }
chrono = { workspace = true }
futures-util = { workspace = true }
image = { workspace = true }
comfy-table = "7"
dirs = { workspace = true }
axum = { workspace = true }
comfy-table = { workspace = true }
walkdir = { workspace = true }

[features]
default = []
cuda = ["mistralrs-core/cuda", "mistralrs-server-core/cuda"]
nccl = ["mistralrs-core/nccl", "mistralrs-server-core/nccl"]
cudnn = ["mistralrs-core/cudnn", "mistralrs-server-core/cudnn"]
flash-attn = ["cuda", "mistralrs-core/flash-attn", "mistralrs-server-core/flash-attn"]
flash-attn-v3 = ["cuda", "mistralrs-core/flash-attn-v3", "mistralrs-server-core/flash-attn-v3"]
accelerate = ["mistralrs-core/accelerate", "mistralrs-server-core/accelerate"]
metal = ["mistralrs-core/metal", "mistralrs-server-core/metal"]
mkl = ["mistralrs-core/mkl", "mistralrs-server-core/mkl"]
ring = ["mistralrs-core/ring", "mistralrs-server-core/ring"]
cuda = ["mistralrs-core/cuda", "mistralrs-server-core/cuda", "mistralrs-serve/cuda"]
nccl = ["mistralrs-core/nccl", "mistralrs-server-core/nccl", "mistralrs-serve/nccl"]
cudnn = ["mistralrs-core/cudnn", "mistralrs-server-core/cudnn", "mistralrs-serve/cudnn"]
flash-attn = ["cuda", "mistralrs-core/flash-attn", "mistralrs-server-core/flash-attn", "mistralrs-serve/flash-attn"]
flash-attn-v3 = ["cuda", "mistralrs-core/flash-attn-v3", "mistralrs-server-core/flash-attn-v3", "mistralrs-serve/flash-attn-v3"]
accelerate = ["mistralrs-core/accelerate", "mistralrs-server-core/accelerate", "mistralrs-serve/accelerate"]
metal = ["mistralrs-core/metal", "mistralrs-server-core/metal", "mistralrs-serve/metal"]
mkl = ["mistralrs-core/mkl", "mistralrs-server-core/mkl", "mistralrs-serve/mkl"]
ring = ["mistralrs-core/ring", "mistralrs-server-core/ring", "mistralrs-serve/ring"]
14 changes: 14 additions & 0 deletions mistralrs-cli/src/args/server.rs
@@ -43,6 +43,18 @@ pub struct ServerOptions {
#[arg(long)]
#[serde(default)]
pub tool_dispatch_url: Option<String>,

/// Auto-unload models after this many seconds without requests (0 to disable)
#[arg(long, default_value_t = 0)]
#[serde(default)]
pub idle_timeout_secs: u64,

/// Directory to scan for models (auto-discovery / lazy loading mode).
/// Each subdirectory is treated as a separate model.
/// Models are loaded on first request.
#[arg(long)]
#[serde(default)]
pub models_dir: Option<PathBuf>,
}

impl Default for ServerOptions {
@@ -55,6 +67,8 @@ impl Default for ServerOptions {
ui: false,
max_tool_rounds: None,
tool_dispatch_url: None,
idle_timeout_secs: 0,
models_dir: None,
}
}
}
3 changes: 1 addition & 2 deletions mistralrs-cli/src/commands/config.rs
@@ -12,7 +12,6 @@ use mistralrs_server_core::{
use crate::commands::run::interactive_mode;
use crate::commands::serve::convert_to_model_selected;
use crate::config::{load_cli_config, CliConfig};
use crate::ui::build_ui_router;

/// Execute the CLI using a TOML configuration file.
pub async fn run_from_config(path: std::path::PathBuf) -> Result<()> {
@@ -99,7 +98,7 @@ async fn run_serve_config(cfg: crate::config::ServeConfig) -> Result<()> {
.await?;

if server.ui {
let ui_router = build_ui_router(
let ui_router = mistralrs_serve::ui::build_ui_router(
mistralrs_for_ui,
runtime.enable_search,
runtime.search_embedding_model.map(|m| m.into()),
2 changes: 1 addition & 1 deletion mistralrs-cli/src/commands/mod.rs
@@ -17,5 +17,5 @@ pub use doctor::run_doctor;
pub use login::run_login;
pub use quantize::run_quantize;
pub use run::run_interactive;
pub use serve::run_server;
pub use serve::{run_server, run_server_lazy};
pub use tune::run_tune;