ai-dynamo · zhengluo-nv · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -135,7 +135,6 @@ Cache directory resolution order: `MODEL_EXPRESS_CACHE_DIRECTORY` -> `HF_HUB_CAC
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `MODEL_EXPRESS_URL` | `localhost:8001` | gRPC server address |
-| `MX_REGISTER_LOADERS` | `1` | Auto-register the mx vLLM loader |
 | `MX_POOL_REG` | `0` | Allocation-level NIXL registration (registers cudaMalloc blocks instead of individual tensors) |
 | `MX_EXPECTED_WORKERS` | `8` | Number of GPU workers to wait for |
 | `MX_SYNC_PUBLISH` | `1` | Source: wait for all workers before publishing |

diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ ModelExpress orchestrates the full flow—from download to GPU memory. It ensure
 
 - **Cold start reduction** — GPU-to-GPU P2P transfer over InfiniBand instead of disk load
 - **HuggingFace caching** — PVC-backed cache, `HF_HUB_OFFLINE`, `ignore_weights`, `get_model_path` for Dynamo
-- **P2P GPU transfer** — vLLM `mx` loader and TRT-LLM `PRESHARDED` loader with NVIDIA NIXL over RDMA
+- **P2P GPU transfer** — vLLM `modelexpress` loader (`mx` alias) and TRT-LLM `PRESHARDED` loader with NVIDIA NIXL over RDMA
 - **Metadata backends** — In-memory, Redis, or Kubernetes CRD (layered write-through for HA)
 - **Kubernetes** — Helm chart, CRDs/Redis for P2P, no-shared-storage support
 - **CLI** — Health, download, list, validate, clear; init-container support for pre-warming
@@ -64,7 +64,7 @@ ModelExpress orchestrates the full flow—from download to GPU memory. It ensure
 
 | Runtime | Integration |
 |---------|-------------|
-| vLLM | `--load-format mx` for P2P weight transfer |
+| vLLM | `--load-format modelexpress` for P2P weight transfer; `mx` is a backward-compatible alias |
 | NVIDIA Dynamo (vLLM) | `get_model_path` API; [Dynamo model cache K8s example](examples/dynamo_model_cache_k8s/README.md) |
 | TensorRT-LLM | `LoadFormat.PRESHARDED` with `MxLiveCheckpointLoader` for P2P weight transfer (beta) — [TRT-LLM examples](examples/p2p_transfer_k8s/client/trtllm/) |
 | SGLang | `remote_instance` + `modelexpress` backend with `transport=nixl` or `transport=transfer_engine` — see [`docs/SGLANG.md`](docs/SGLANG.md) |
@@ -75,7 +75,7 @@ ModelExpress orchestrates the full flow—from download to GPU memory. It ensure
 
 ![ModelExpress Architecture: Upload once, then autoscale new pods via NIXL GPUDirect RDMA from seed GPU](model-express-architecture.png)
 
-*Phase 1 — Upload once:* Model Source (HuggingFace Hub, NFS) downloads to the Seed Pod (GPU), which loads and postprocesses weights, registers VRAM with NIXL, and publishes metadata to the MX Server. *Phase 2 — Autoscale:* New pods receive weights via NIXL GPUDirect RDMA (GPU VRAM → GPU VRAM, zero-copy) from the seed GPU, using `--load-format mx` for inference.
+*Phase 1 — Upload once:* Model Source (HuggingFace Hub, NFS) downloads to the Seed Pod (GPU), which loads and postprocesses weights, registers VRAM with NIXL, and publishes metadata to the MX Server. *Phase 2 — Autoscale:* New pods receive weights via NIXL GPUDirect RDMA (GPU VRAM → GPU VRAM, zero-copy) from the seed GPU, using `--load-format modelexpress` for inference.
 
 ```
                     ┌─────────────────────────────────────────────────────────────────┐
@@ -150,7 +150,8 @@ Override [values-production.yaml](helm/values-production.yaml) for your env. Ful
 ```python
 from modelexpress import register_modelexpress_loaders
 register_modelexpress_loaders()
-# vllm serve <model> --load-format mx --worker-cls=modelexpress.vllm_worker.ModelExpressWorker
+# vllm serve <model> --load-format modelexpress
+# The mx load format is kept as a backward-compatible alias.
 ```
 
 First instance loads from disk; subsequent instances receive via RDMA. [P2P guide](examples/p2p_transfer_k8s/README.md) · [Server setup](examples/p2p_transfer_k8s/server/README.md).

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
@@ -156,7 +156,7 @@ ModelExpress/
 │       ├── tensor_utils.py             # Tensor collection, checksums, storage views
 │       ├── transfer_safety.py          # MLA feature gate, TransferFingerprint
 │       ├── rank_utils.py               # Rank detection utilities
-│       ├── vllm_worker.py              # ModelExpressWorker (custom vLLM worker)
+│       ├── vllm_worker.py              # Compatibility worker for older manual registration
 │       ├── types.py                    # TensorDescriptor, WorkerMetadata dataclasses
 │       ├── p2p_pb2.py                  # Generated protobuf stubs
 │       └── p2p_pb2_grpc.py             # Generated gRPC stubs
@@ -516,7 +516,7 @@ Loading precedence: CLI args > environment variables > config file > defaults.
 
 | Module | Purpose |
 |--------|---------|
-| `__init__.py` | Package init, exports `register_modelexpress_loaders()` for callers to register the `mx` loader with vLLM |
+| `__init__.py` | Package init, exports `register_modelexpress_loaders()` for callers to register the `modelexpress` loader (and its backward-compatible `mx` alias) with vLLM |
 | `client.py` | `MxClient` - gRPC client wrapping `PublishMetadata`, `ListSources`, `GetMetadata`, and `UpdateStatus` RPCs |
 | `nixl_transfer.py` | `NixlTransferManager` - NIXL agent lifecycle, tensor registration, RDMA transfers |
 | `gds_transfer.py` | GPUDirect Storage availability check and transfer utilities |
@@ -529,7 +529,7 @@ Loading precedence: CLI args > environment variables > config file > defaults.
 | `engines/sglang/` | `SglangAdapter` and `MxModelLoader` - maps strategy hooks to SGLang's `remote_instance` backend |
 | `tensor_utils.py` | Tensor collection, checksums, storage views, `capture_tensor_attrs` |
 | `rank_utils.py` | `get_global_rank`, `get_worker_rank` |
-| `vllm_worker.py` | `ModelExpressWorker` - custom vLLM worker class (use `--worker-cls=modelexpress.vllm_worker.ModelExpressWorker`) |
+| `vllm_worker.py` | `ModelExpressWorker` - compatibility worker class for older manual-registration workflows |
 | `types.py` | `TensorDescriptor`, `WorkerMetadata`, `GetMetadataResponse` dataclasses |
 | `p2p_pb2.py` / `p2p_pb2_grpc.py` | Generated protobuf/gRPC stubs |
 
@@ -562,7 +562,7 @@ Manages a NIXL agent and RDMA transfers for a single GPU worker:
 
 ### vLLM Loader
 
-**MxModelLoader** (extends `BaseModelLoader`, registered as `--load-format mx`):
+**MxModelLoader** (extends `BaseModelLoader`, registered as `--load-format modelexpress`, with `mx` kept as a backward-compatible alias):
 
 Thin orchestration layer that delegates to `LoadStrategyChain.run()`. Builds a `LoadContext` from vLLM config, initializes the model, runs the strategy chain, and updates global registries.
 
@@ -729,7 +729,6 @@ See [`metadata.md`](metadata.md) for the full storage schema and debugging guide
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `MX_REGISTER_LOADERS` | `1` | Auto-register the mx loader with vLLM |
 | `MODEL_EXPRESS_URL` | `localhost:8001` | gRPC server address |
 | `MX_SERVER_ADDRESS` | `localhost:8001` | Backward-compat alias for `MODEL_EXPRESS_URL` |
 | `MX_METADATA_BACKEND` | (required on server; `""` on client) | Server: `redis` or `kubernetes`. Client: `""` / `server` / `redis` / `kubernetes` (central server) or `k8s-service` (decentralized via K8s Service routing) |

diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
@@ -302,7 +302,7 @@ See [`../examples/dynamo_model_cache_k8s/README.md`](../examples/dynamo_model_ca
 
 ## P2P GPU Weight Transfers
 
-ModelExpress supports GPU-to-GPU model weight transfers between supported inference instances using NVIDIA NIXL over RDMA. vLLM uses `--load-format mx`, which auto-detects whether to load from disk or receive via RDMA. SGLang uses `remote_instance` with the `modelexpress` backend; see [SGLang Clients](#sglang-clients).
+ModelExpress supports GPU-to-GPU model weight transfers between supported inference instances using NVIDIA NIXL over RDMA. vLLM uses `--load-format modelexpress`, which auto-detects whether to load from disk or receive via RDMA; `mx` is a backward-compatible alias. SGLang uses `remote_instance` with the `modelexpress` backend; see [SGLang Clients](#sglang-clients).
 
 ### Choosing a Metadata Backend
 
@@ -328,7 +328,6 @@ See [`K8S_SERVICE_BACKEND.md`](K8S_SERVICE_BACKEND.md) for the design rationale,
 | `MX_METADATA_BACKEND` | (required on server; `""` on client) | Server: `redis` or `kubernetes`. Client: `""`/`server`/`redis`/`kubernetes` (central server) or `k8s-service` (decentralized via K8s Service routing). |
 | `MODEL_EXPRESS_URL` | `localhost:8001` | gRPC server address (ignored when client uses `k8s-service` backend) |
 | `MX_SERVER_ADDRESS` | `localhost:8001` | Backward-compat alias for `MODEL_EXPRESS_URL` |
-| `MX_REGISTER_LOADERS` | `1` | Auto-register the mx loader with vLLM |
 | `MX_POOL_REG` | `0` | Allocation-level NIXL registration via `cuMemGetAddressRange`. Registers each unique cudaMalloc block instead of each tensor, typically 80-99% fewer registrations, without changing transfer semantics. `MX_VMM_ARENA=1` uses direct arena registration and does not require pool-reg. |
 | `MX_VMM_ARENA` | `0` | Route weight allocations into a CUDA VMM arena via PyTorch's `CUDAPluggableAllocator`, then register the used arena range as one NIXL MR with dmabuf at end-of-load. Reserves 16.0 TiB of VA by default, with no physical commit until allocations are mapped. Requires the `modelexpress.vmm._alloc_ext` C extension to have built at install time; if it did not, this flag is a no-op with a warning and the loader falls back to the pool-reg path. See [VMM Arena](#vmm-arena-single-mr-registration). |
 | `UCX_CUDA_COPY_REG_WHOLE_ALLOC` | (UCX default) | Set to `off` with `MX_VMM_ARENA=1` until the upstream UCX `cuda_copy_md` length-truncation fix ships. |
@@ -348,7 +347,7 @@ See [`K8S_SERVICE_BACKEND.md`](K8S_SERVICE_BACKEND.md) for the design rationale,
 | `REDIS_URL` | `redis://localhost:6379` | Redis connection URL (Redis backend only) |
 | `MX_METADATA_NAMESPACE` | `default` | K8s namespace for CRD backend |
 | `VLLM_RPC_TIMEOUT` | `7200000` | vLLM RPC timeout in ms (2 hours for large models) |
-| `VLLM_PLUGINS` | - | Set to `modelexpress` to register the mx loader |
+| `VLLM_PLUGINS` | - | Set to `modelexpress` to register the `modelexpress` load format and its backward-compatible `mx` alias |
 
 Each GPU worker publishes independently using its global rank (`torch.distributed.get_rank()`). No inter-worker coordination or barriers required.
 
@@ -599,7 +598,7 @@ spec:
             - { name: MX_MODEL_REVISION,   value: "<pinned-commit-sha>" }
             - { name: MX_WORKER_GRPC_PORT, value: "6555" }
             # MX_K8S_SERVICE_PATTERN defaults to `mx-sources`; omit unless overriding.
-          args: ["--model", "$(MODEL_NAME)", "--load-format", "mx", "--tensor-parallel-size", "2"]
+          args: ["--model", "$(MODEL_NAME)", "--load-format", "modelexpress", "--tensor-parallel-size", "2"]
           resources: { limits: { nvidia.com/gpu: 2 } }
 ```
 

diff --git a/docs/metadata.md b/docs/metadata.md
@@ -356,7 +356,7 @@ sequenceDiagram
 
 ### Three-Tier Loading Strategy
 
-The `MxModelLoader` (`--load-format mx`) auto-detects the best loading strategy:
+The `MxModelLoader` (`--load-format modelexpress`, with `mx` kept as a backward-compatible alias) auto-detects the best loading strategy:
 
 1. **RDMA** -- If `ListSources` returns READY instances with matching rank, receive weights via NIXL/Mooncake
 2. **GDS** -- If no source available and GPUDirect Storage is available, load directly from file to GPU

diff --git a/examples/dynamo_p2p_transfer_k8s/README.md b/examples/dynamo_p2p_transfer_k8s/README.md
@@ -43,7 +43,7 @@ graph LR
 
 - **ModelExpress server** (Kubernetes CRD backend): tracks which workers have the model Ready, handles heartbeats, and reaps stale entries. Decode and prefill publish under the same `source_id` (derived from model identity), so any worker of either service can serve as an RDMA source for any other worker of either service.
 - **Frontend**: Dynamo's HTTP entry point; routes to decode workers round-robin.
-- **Workers**: `--load-format mx` means the first replica loads from disk and publishes metadata; every subsequent replica receives weights from a Ready source over RDMA.
+- **Workers**: `--load-format modelexpress` means the first replica loads from disk and publishes metadata; every subsequent replica receives weights from a Ready source over RDMA. The `mx` load format is kept as a backward-compatible alias.
 
 ## Prerequisites
 

diff --git a/examples/dynamo_p2p_transfer_k8s/vllm/vllm-multi-node-aggregated.yaml b/examples/dynamo_p2p_transfer_k8s/vllm/vllm-multi-node-aggregated.yaml
@@ -142,7 +142,7 @@ spec:
             - --model
             - nvidia/Kimi-K2.5-NVFP4
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "4"
             - --pipeline-parallel-size

diff --git a/examples/dynamo_p2p_transfer_k8s/vllm/vllm-single-node-disaggregated.yaml b/examples/dynamo_p2p_transfer_k8s/vllm/vllm-single-node-disaggregated.yaml
@@ -138,7 +138,7 @@ spec:
             - --model
             - nvidia/Kimi-K2.5-NVFP4
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "4"
             - --max-model-len
@@ -207,7 +207,7 @@ spec:
             - --model
             - nvidia/Kimi-K2.5-NVFP4
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "4"
             - --max-model-len

diff --git a/examples/k8s_service_sources/sources-tp2-single-pod.yaml b/examples/k8s_service_sources/sources-tp2-single-pod.yaml
@@ -112,7 +112,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "2"
           # Expose both per-rank ports on the pod; the Service routes

diff --git a/examples/k8s_service_sources/sources-tp2.yaml b/examples/k8s_service_sources/sources-tp2.yaml
@@ -106,7 +106,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "1"
           ports:
@@ -211,7 +211,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "1"
           ports:

diff --git a/examples/k8s_service_sources/target.yaml b/examples/k8s_service_sources/target.yaml
@@ -85,7 +85,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "1"
           ports:
@@ -160,7 +160,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "1"
           ports:

diff --git a/examples/model_streamer_k8s/README.md b/examples/model_streamer_k8s/README.md
@@ -1,6 +1,6 @@
 # ModelStreamer Kubernetes Examples
 
-These examples deploy vLLM with ModelExpress `--load-format mx` and stream model weights from storage through RunAI ModelStreamer. They do not require a ModelExpress server, RDMA resources, or a model PVC for object storage sources.
+These examples deploy vLLM with ModelExpress `--load-format modelexpress` and stream model weights from storage through RunAI ModelStreamer. They do not require a ModelExpress server, RDMA resources, or a model PVC for object storage sources. The `mx` load format is kept as a backward-compatible alias.
 
 For P2P RDMA weight transfer between vLLM pods, see [`../p2p_transfer_k8s/`](../p2p_transfer_k8s/).
 
@@ -18,7 +18,7 @@ For the Azure Blob end-to-end setup, see [`client/vllm/README.md`](client/vllm/R
 
 All manifests use:
 
-- `--load-format mx`
+- `--load-format modelexpress`
 - `VLLM_PLUGINS=modelexpress`
 - `MX_MODEL_URI` as the model path passed to vLLM
 

diff --git a/examples/model_streamer_k8s/client/vllm/README.md b/examples/model_streamer_k8s/client/vllm/README.md
@@ -4,7 +4,7 @@
 
 This recipe shows how to start a vLLM pod that loads model weights from Azure Blob Storage through ModelExpress ModelStreamer. It uses [`vllm-single-node-streamer-azure.yaml`](vllm-single-node-streamer-azure.yaml) as the deployment template.
 
-This path does not require a model PVC. The pod streams weights directly from Azure Blob Storage at startup by using `--load-format mx` and an `az://` model URI.
+This path does not require a model PVC. The pod streams weights directly from Azure Blob Storage at startup by using `--load-format modelexpress` and an `az://` model URI. The `mx` load format is kept as a backward-compatible alias.
 
 ### Prerequisites
 

diff --git a/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-azure.yaml b/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-azure.yaml
@@ -93,7 +93,7 @@ spec:
             - --served-model-name
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "8"
             - --enable-expert-parallel

diff --git a/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-local.yaml b/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-local.yaml
@@ -72,7 +72,7 @@ spec:
             - --model
             - $(MX_MODEL_URI)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "8"
             - --enable-expert-parallel

diff --git a/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-s3.yaml b/examples/model_streamer_k8s/client/vllm/vllm-single-node-streamer-s3.yaml
@@ -88,7 +88,7 @@ spec:
             - --served-model-name
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "8"
             - --enable-expert-parallel

diff --git a/examples/p2p_transfer_k8s/README.md b/examples/p2p_transfer_k8s/README.md
@@ -20,7 +20,7 @@ graph TD
 
 ### Key Design Points
 
-1. **Engine loader integration**: vLLM uses `--load-format mx`; SGLang uses `remote_instance` with backend `modelexpress`.
+1. **Engine loader integration**: vLLM uses `--load-format modelexpress`; `mx` is a backward-compatible alias. SGLang uses `remote_instance` with backend `modelexpress`.
 2. **MxClient**: All gRPC communication goes through `MxClient` (workers never access Redis directly).
 3. **Engine post-load hooks**: The ModelExpress adapter handles engine-specific post-load processing and tensor discovery.
 4. **Tensor Parallelism**: Full TP support with rank-matched transfers (one NIXL agent per GPU).

diff --git a/examples/p2p_transfer_k8s/client/vllm/aws_efa/vllm-aws-efa.yaml b/examples/p2p_transfer_k8s/client/vllm/aws_efa/vllm-aws-efa.yaml
@@ -115,7 +115,7 @@ spec:
           args:
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "1"
           ports:

diff --git a/examples/p2p_transfer_k8s/client/vllm/vllm-multi-node.yaml b/examples/p2p_transfer_k8s/client/vllm/vllm-multi-node.yaml
@@ -139,7 +139,7 @@ spec:
                 ray start --head --port=6379 --num-gpus=4
                 until ray status --address=127.0.0.1:6379 >/dev/null 2>&1; do sleep 1; done
                 RAY_ADDRESS=127.0.0.1:6379 vllm serve $MODEL_NAME \
-                  --load-format mx \
+                  --load-format modelexpress \
                   --tensor-parallel-size 4 \
                   --pipeline-parallel-size 2 \
                   --max-model-len 32768 \

diff --git a/examples/p2p_transfer_k8s/client/vllm/vllm-single-node-p2p.yaml b/examples/p2p_transfer_k8s/client/vllm/vllm-single-node-p2p.yaml
@@ -108,7 +108,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "8"
           resources:

diff --git a/examples/p2p_transfer_k8s/client/vllm/vllm-single-node.yaml b/examples/p2p_transfer_k8s/client/vllm/vllm-single-node.yaml
@@ -103,7 +103,7 @@ spec:
             - --model
             - $(MODEL_NAME)
             - --load-format
-            - mx
+            - modelexpress
             - --tensor-parallel-size
             - "8"
           resources: