From da7f891a6b49593bfa47b9a68a2515d3d017398a Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 11 May 2026 11:40:34 +0000 Subject: [PATCH 01/15] rollout: enable MooncakeStoreConnector with hard-reset on weight update Adds `actor_rollout_ref.rollout.kv_store.{enable,kv_connector,kv_role, config_path,extra_config,on_failure}` configuration. When `enable=true`, the vLLM rollout engine is launched with `--kv-transfer-config` wiring MooncakeStoreConnector, and all prefix-cache reset paths (`wake_up`, `clear_kv_cache`, `abort_all_requests`) propagate `reset_connector=True` so the Mooncake master is cleared via `store.remove_all(force=True)` on every weight update. This is the RL-correct hard-reset path: external KV blocks computed against the previous model weights are dropped before any new rollout request can read them, matching the existing in-engine prefix-cache invalidation that verl already drives via `abort_all_requests` + `reset_prefix_cache`. `on_failure=fallback` (default) makes the connector a soft dependency: training keeps running with the Mooncake offload disabled if the master is unreachable at engine launch. A `recipe_aoshen/phase_b_mooncake.sh` fork of `phase_b.sh` shows the expected invocation (1 tray, Qwen3-0.6B GSM8K, 5 weight syncs); pair it with `start_master.sh` in `projects/mooncake-integration/scripts/` for a per-run master. Paired with the vLLM-side cascade hook PR (MooncakeStoreConnector. reset_cache routes through the existing ZMQ admin channel to worker rank 0 -> store.remove_all). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- recipe_aoshen/phase_b_mooncake.sh | 66 +++++++++++++++++++ verl/workers/config/rollout.py | 41 ++++++++++++ .../rollout/vllm_rollout/vllm_async_server.py | 47 ++++++++++++- 3 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 recipe_aoshen/phase_b_mooncake.sh diff --git a/recipe_aoshen/phase_b_mooncake.sh b/recipe_aoshen/phase_b_mooncake.sh new file mode 100644 index 00000000000..98b735886bb --- /dev/null +++ b/recipe_aoshen/phase_b_mooncake.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# Phase B with Mooncake-store offload: fork of recipe_aoshen/phase_b.sh. +# +# Same Qwen3 GSM8K 5-step sanity loop, but the rollout vLLM engine is +# launched with --kv-transfer-config attaching a MooncakeStoreConnector +# pointing at a per-run Mooncake master. On every weight update verl +# drives a hard reset (engine.reset_prefix_cache(reset_connector=True)) +# which cascades through scheduler.reset_connector_cache -> +# MooncakeStoreConnector.reset_cache -> store.remove_all(force=True). +# +# Pre-requisites (caller responsibility): +# 1. Mooncake master started via scripts/start_master.sh on this node. +# 2. MOONCAKE_CONFIG_PATH exported pointing at the JSON config used by +# the master. +# 3. Ray cluster running (single-node or multi-node). +# 4. GSM8K parquet present at /root/data/gsm8k/{train,test}.parquet. +# +# Acceptance: +# - 5 train steps complete without crash +# - mooncake_master.log shows >= 5 RemoveAll calls +# - In at least one cycle, external_prefix_cache_hits > 0 (set via +# vLLM metrics) +set -xeuo pipefail + +cd /workspace/verl + +export MACHINE=gb200 +export INFER_BACKEND=vllm +export MODEL_PATH=Qwen/Qwen3-0.6B + +# Single-tray run for the first Mooncake-store integration smoke. 
+export NNODES=1 +export NGPUS_PER_NODE=4 + +export TRAIN_BATCH_SIZE=32 +export PPO_MINI_BATCH_SIZE=16 +export MAX_PROMPT_LENGTH=512 +export MAX_RESPONSE_LENGTH=512 +export PPO_MAX_TOKEN_LEN_PER_GPU=4096 + +export ROLLOUT_TP=1 +export ROLLOUT_GPU_MEM_UTIL=0.5 +export ROLLOUT_N=2 + +export TOTAL_EPOCHS=1 +export SAVE_FREQ=-1 +export TEST_FREQ=10 + +export PROJECT_NAME=phase_b_mooncake +export EXPERIMENT_NAME=qwen3_06b_gsm8k_5steps_mooncake_${NNODES}nodes + +# Make sure MOONCAKE_CONFIG_PATH is propagated to all rollout workers. +: "${MOONCAKE_CONFIG_PATH:?Set MOONCAKE_CONFIG_PATH before launching}" +export MOONCAKE_CONFIG_PATH + +bash examples/grpo_trainer/run_qwen3_8b_fsdp.sh \ + "data.train_files=['/root/data/gsm8k/train.parquet']" \ + "data.val_files=['/root/data/gsm8k/test.parquet']" \ + trainer.total_training_steps=5 \ + "trainer.logger=['console']" \ + '~ray_kwargs.ray_init.num_gpus' \ + actor_rollout_ref.rollout.kv_store.enable=true \ + "actor_rollout_ref.rollout.kv_store.config_path=${MOONCAKE_CONFIG_PATH}" \ + actor_rollout_ref.rollout.kv_store.kv_role=kv_both +# NOTE: NEVER set NCCL_MNNVL_ENABLE=0 on this rack. Cross-tray traffic must +# go through the NVL72 NVLink switch (~1.8 TB/s/GPU) via MNNVL + IMEX. diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 7731fa9d592..4bbd6c99ac0 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -31,10 +31,48 @@ "PrometheusConfig", "RolloutConfig", "CheckpointEngineConfig", + "KVStoreConfig", "SkipConfig", ] +@dataclass +class KVStoreConfig(BaseConfig): + """External KV cache store configuration for the vLLM rollout engine. + + When ``enable`` is true, verl asks vLLM to attach a + ``MooncakeStoreConnector`` so that prefix KV blocks are offloaded to a + shared Mooncake master. On every weight update verl drives a hard reset + (``engine.reset_prefix_cache(reset_connector=True)``) so the master's + stale entries are dropped before any new rollout reads them. 
+ + See ``mooncake-integration/`` for the per-run master start/stop wrapper. + """ + + # Enable external KV store offload. + enable: bool = False + + # KVConnector class name forwarded to ``--kv-transfer-config``. + kv_connector: str = "MooncakeStoreConnector" + + # vLLM kv_role. ``kv_both`` lets the rollout engine both put and get. + kv_role: str = "kv_both" + + # Path to Mooncake client config JSON (``master_server_address``, + # ``global_segment_size``, ``protocol``, ...). Falls back to the + # ``MOONCAKE_CONFIG_PATH`` env var when None. + config_path: Optional[str] = None + + # Optional extra dict merged into vLLM's + # ``kv_connector_extra_config``. Use sparingly. + extra_config: dict = field(default_factory=dict) + + # Behavior when the Mooncake master is unreachable at engine launch: + # ``fallback`` (default) -> log a warning, drop the connector, keep + # training; ``crash`` -> let the engine start fail and propagate. + on_failure: str = "fallback" + + @dataclass class SkipConfig(BaseConfig): """ @@ -244,6 +282,9 @@ class RolloutConfig(BaseConfig): # Checkpoint Engine config for update weights from trainer to rollout checkpoint_engine: CheckpointEngineConfig = field(default_factory=CheckpointEngineConfig) + # External Mooncake KV store offload (RL-correct hard-reset path). + kv_store: KVStoreConfig = field(default_factory=KVStoreConfig) + # Rollout skip config (load/dump rollout data) skip: SkipConfig = field(default_factory=SkipConfig) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 60e3dd2a665..f6438c7e246 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -123,6 +123,11 @@ def __init__( self.nnodes = nnodes # model weights version, set by ServerAdapter when update weights. self.global_steps = None + # Whether the engine was launched with an external KV store (Mooncake) + # connector. 
Set inside launch_server() once kv_transfer_config is + # built; gates reset_connector=True on cache resets so we do not + # serve KV computed against previous weights from the external pool. + self._kv_store_enabled = False if self.rollout_mode != RolloutMode.HYBRID and self.config.load_format == "dummy": logger.warning(f"rollout mode is {self.rollout_mode}, load_format is dummy, set to auto") @@ -346,6 +351,30 @@ async def launch_server(self, master_address: str = None, master_port: int = Non if self.config.enable_rollout_routing_replay: args.update({"enable_return_routed_experts": True}) + # External Mooncake KV store offload: forward kv_transfer_config to + # vLLM if rollout.kv_store.enable is set. The connector itself decides + # how to talk to the Mooncake master (config_path / env). On every + # weight update we drive a hard reset via reset_prefix_cache( + # reset_connector=True), which makes vLLM call + # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True). + kv_store_cfg = getattr(self.config, "kv_store", None) + self._kv_store_enabled = bool(kv_store_cfg and kv_store_cfg.get("enable", False)) + if self._kv_store_enabled: + extra_config = dict(kv_store_cfg.get("extra_config", {}) or {}) + config_path = kv_store_cfg.get("config_path") + if config_path: + # MooncakeStoreWorker reads MOONCAKE_CONFIG_PATH from env; + # vLLM serve also accepts kv_connector_extra_config so we + # pass both for clarity. The env var is set by the per-run + # master wrapper script before `ray job submit`. 
+ extra_config.setdefault("mooncake_config_path", config_path) + kv_transfer_config = { + "kv_connector": kv_store_cfg.get("kv_connector", "MooncakeStoreConnector"), + "kv_role": kv_store_cfg.get("kv_role", "kv_both"), + "kv_connector_extra_config": extra_config, + } + args["kv_transfer_config"] = json.dumps(kv_transfer_config) + server_args = ["serve", self.model_config.local_path] + build_cli_args_from_config(args) if self.replica_rank == 0: @@ -561,7 +590,9 @@ async def wake_up(self): elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - await self.engine.reset_prefix_cache() + await self.engine.reset_prefix_cache( + reset_connector=self._kv_store_enabled, + ) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -594,7 +625,13 @@ async def stop_profile(self): async def clear_kv_cache(self): if self.node_rank == 0: - await self.engine.reset_prefix_cache() + # When kv_store is enabled, propagate reset_connector=True so the + # external Mooncake store (whose entries were computed against the + # previous model weights) is also dropped before any new request + # can read stale KV via vLLM's external prefix cache path. + await self.engine.reset_prefix_cache( + reset_connector=self._kv_store_enabled, + ) async def set_global_steps(self, global_steps: int): """Set the global steps of the model weights.""" @@ -632,6 +669,12 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, ) + # AsyncLLM.pause_generation's internal cache flush does not + # propagate reset_connector through; do an explicit one when + # an external KV store is attached so the Mooncake master is + # cleared along with the in-engine prefix cache. 
+ if reset_prefix_cache and self._kv_store_enabled: + await self.engine.reset_prefix_cache(reset_connector=True) else: # Take an atomic snapshot to avoid race conditions with the vLLM engine thread request_states_snapshot = list(self.engine.output_processor.request_states.items()) From 891050893af58c4b2942547841ebbc291cdca468 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 11 May 2026 12:22:29 +0000 Subject: [PATCH 02/15] docs: replace example recipe with rollout KV-offload guide Drops the recipe_aoshen/phase_b_mooncake.sh example (sample-only, not upstream-relevant) and instead documents the new actor_rollout_ref.rollout.kv_store knob and the RL-correctness contract (hard-reset on every weight update via reset_connector=True) in a new advance/rollout_kv_offload.md page. The doc covers: - When to enable the feature (and when not to) - The hard-reset cascade ending in store.remove_all(force=True) so reviewers can trace it without re-reading the diff - Configuration reference for every kv_store.* field - Required env vars (MOONCAKE_CONFIG_PATH, optional PYTHONHASHSEED=0) - Operational notes (per-run master, reset cost, failure modes) - Comparison vs SGLang's opt-in flush flow Wired into docs/index.rst under "Advanced Features" next to rollout_skip / rollout_trace. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/advance/rollout_kv_offload.md | 172 +++++++++++++++++++++++++++++ docs/index.rst | 1 + recipe_aoshen/phase_b_mooncake.sh | 66 ----------- 3 files changed, 173 insertions(+), 66 deletions(-) create mode 100644 docs/advance/rollout_kv_offload.md delete mode 100644 recipe_aoshen/phase_b_mooncake.sh diff --git a/docs/advance/rollout_kv_offload.md b/docs/advance/rollout_kv_offload.md new file mode 100644 index 00000000000..1b903c72b6f --- /dev/null +++ b/docs/advance/rollout_kv_offload.md @@ -0,0 +1,172 @@ +# Rollout KV Cache Offload via Mooncake-Store + +Last updated: 2026-05-11. 
+ +This document covers how to offload prefix KV blocks from the vLLM rollout +engine to a shared **Mooncake** distributed store, so that long shared +prefixes (system prompt + task description + earlier turns in agentic +workloads) get deduplicated across requests and across rollout replicas +within a single weight generation. + +## When to use this + +Enable when **all** of these hold: + +- Rollout workload has long, reusable prefixes (multi-turn / agentic / shared + system prompt with many `actor_rollout_ref.rollout.n` samples per prompt). +- The rollout-side prefix-cache hit rate on a single engine is already + saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica + fully-async) is leaving hits on the table. +- You can run a Mooncake master process colocated with (or reachable from) + every rollout host, and you have enough RDMA / TCP bandwidth between the + rollout workers and the master. + +Do **not** enable for short prompts or small ``rollout.n`` workloads where +within-engine prefix cache is already enough — the round-trip to Mooncake +will net out negative. + +## RL-correctness contract: hard reset on every weight update + +The unique constraint of RL training is that the model weights change +between rollout steps. Any KV block written to the external store before a +weight update is **computed against the previous policy** — serving it to a +post-update request would silently corrupt inference. + +verl handles this correctly: the existing ``update_weights`` flow in +``verl.checkpoint_engine.base.CheckpointEngineManager`` already does an +``abort_all_requests`` -> drain -> sleep -> NCCL weight sync -> ``wake_up`` +sequence. When this feature is enabled, every prefix-cache-reset call site +in ``vllm_async_server.py`` (``wake_up``, ``clear_kv_cache``, the +``abort_all_requests`` fallback path) additionally passes +``reset_connector=True`` to ``engine.reset_prefix_cache(...)``. 
+ +That flag cascades into vLLM as: + + Scheduler.reset_prefix_cache(reset_connector=True) + -> Scheduler.reset_connector_cache() + -> MooncakeStoreConnector.reset_cache() (SCHEDULER role) + -> MooncakeStoreScheduler.reset_store() + -> LookupKeyClient.reset() (ZMQ admin frame) + -> LookupKeyServer (worker rank 0) recognizes RESET_MAGIC + -> store.remove_all(force=True) (Mooncake master) + +The net effect is that after each ``update_weights`` round, the Mooncake +master is empty before any new rollout request starts — matching the +existing in-engine prefix-cache behavior. The contract is symmetric: if +the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g. +an in-flight sequence still holds blocks), the external store is *also* +left untouched, so internal and external caches never desynchronize. + +## Pre-requisites + +1. **Mooncake**: install the Python binding (``pip install + mooncake-transfer-engine``) and a master binary that exposes the + ``RemoveAll`` RPC. On aarch64 GB200 this currently means building from + ``ivanium/Mooncake`` ``yifan/dev`` with ``-DUSE_CUDA=ON + -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``. +2. **vLLM**: 0.20.1+ with the paired ``MooncakeStoreConnector.reset_cache`` + patch (the cascade hook). Without it the ``reset_connector=True`` flag + is a no-op and silent stale-cache corruption is possible — do not enable + ``kv_store.enable`` without the paired patch. +3. **Mooncake master process**: launched out-of-band (typically per-run, see + ``scripts/mooncake/start_mooncake_master.sh``). Single-tenant per run is + recommended because ``RemoveAll`` is master-wide; multi-tenant sharing + would let one experiment wipe another's cache. 
+ +## Configuration + +Under ``actor_rollout_ref.rollout``: + +```yaml +actor_rollout_ref: + rollout: + kv_store: + enable: true # default false + kv_connector: MooncakeStoreConnector # forwarded to vLLM + kv_role: kv_both # both put + get + config_path: /path/to/mooncake_config.json # forwarded as kv_connector_extra_config.mooncake_config_path + extra_config: {} # additional kv_connector_extra_config + on_failure: fallback # fallback | crash +``` + +Field reference: + +- ``enable``: Master switch. When false (default), no ``kv_transfer_config`` + is attached to the vLLM engine and verl behaves exactly as before. +- ``kv_connector``: KVConnector class name forwarded to vLLM's + ``--kv-transfer-config``. Override only for testing alternate backends. +- ``kv_role``: ``kv_both`` lets the rollout engine both write blocks to and + read blocks from the store. Other values are exposed for the same reason + as ``kv_connector`` but the rollout path expects ``kv_both``. +- ``config_path``: Path to the Mooncake client JSON config + (``master_server_address``, ``global_segment_size``, ``protocol``, ...). + Falls back to the ``MOONCAKE_CONFIG_PATH`` environment variable when + unset — most installations should set the env once at the cluster level + and leave this field empty. +- ``extra_config``: Dict merged into vLLM's ``kv_connector_extra_config``. + Reserve for connector-specific knobs that don't have a first-class field. +- ``on_failure``: Behavior when the Mooncake master is unreachable at + rollout-engine launch. + ``fallback`` (default) drops the connector and continues training with + external offload disabled — a soft dependency suited to long RL runs + where pausing training for an infra hiccup is more expensive than losing + cross-engine prefix hits. Set to ``crash`` to fail fast (e.g., in CI). + +## Required environment variables + +These are intentionally *not* set automatically — they're cluster-level +choices. 
+ +- ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate + ``master_server_address`` etc. Set on every rollout actor; verl + propagates it via ``ray`` runtime_env. +- ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple + rollout replicas — vLLM's block-hash seed is randomized per process and + cross-engine prefix-cache hits will silently drop to zero without a + fixed seed. Single-engine rollouts can leave this unset. + +## Operational notes + +- **Cluster hygiene**: a per-run master is the recommended deployment. + Reuse across runs invites cross-experiment cache pollution (and a + ``reset_connector=True`` from one run will wipe the other's keys). +- **Reset cost**: every weight update triggers ``RemoveAll`` on the + master, which iterates all metadata shards. On a ~600 GB store this is + sub-second. Frequent fully-async weight syncs (sync every 1-2 rollout + steps) will see the per-update hit rate stay low; this is expected and + matches the in-engine prefix-cache behavior under the same conditions. +- **No version tagging**: keys are pure content-addressed + (``{model_name}@tp_rank:N@...@{block_hash}``); no weight-generation + field. The hard-reset model relies on the master being cleaned per + weight update, not on key versioning. +- **Failure modes**: + - Master unreachable at launch: see ``on_failure``. + - Master goes down mid-run: the rollout engine continues using its + in-engine prefix cache; ``reset_store`` returns False and the rest of + ``reset_prefix_cache`` still works. Re-attaching to a new master + mid-run is **not** supported in this version. + - Cross-rank ``block_hash`` divergence (forgot ``PYTHONHASHSEED``): + silent zero hit rate. Inspect + ``vllm_external_prefix_cache_hits`` metric to detect. + +## Comparison vs. 
SGLang's HiCacheStorage flow + +| Aspect | SGLang ``/flush_cache`` | verl + vLLM ``MooncakeStoreConnector`` | +|---|---|---| +| Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag | +| Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset | +| Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep | +| Failure surface | Client must read return value and re-issue | Soft-dependency (``on_failure=fallback``) by default; explicit False propagates up to the scheduler | + +## Reference + +- Paired vLLM upstream patch: implements + ``MooncakeStoreConnector.reset_cache()`` and the + ``RESET_MAGIC`` ZMQ discriminator (see ``aoshen02/vllm:feat/mooncake-clear-hook`` + -> ``ivanium/vllm`` -> upstream). +- vLLM scheduler hook the cascade rides on: + ``vllm/v1/core/sched/scheduler.py:1871`` (``reset_prefix_cache``) and + ``1917`` (``reset_connector_cache``) — already shipped in 0.20.1, the + patch only provides the connector-specific ``reset_cache`` body. +- Sister rollout-correctness recipes: ``advance/rollout_corr.md``, + ``advance/rollout_corr_math.md``. 
diff --git a/docs/index.rst b/docs/index.rst index 6d9714acbe1..cc8505b7ee2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -143,6 +143,7 @@ verl is fast with: examples/sandbox_fusion_example advance/rollout_trace.rst advance/rollout_skip.rst + advance/rollout_kv_offload.md advance/agent_loop advance/reward_loop data/transfer_queue.md diff --git a/recipe_aoshen/phase_b_mooncake.sh b/recipe_aoshen/phase_b_mooncake.sh deleted file mode 100644 index 98b735886bb..00000000000 --- a/recipe_aoshen/phase_b_mooncake.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash -# Phase B with Mooncake-store offload: fork of recipe_aoshen/phase_b.sh. -# -# Same Qwen3 GSM8K 5-step sanity loop, but the rollout vLLM engine is -# launched with --kv-transfer-config attaching a MooncakeStoreConnector -# pointing at a per-run Mooncake master. On every weight update verl -# drives a hard reset (engine.reset_prefix_cache(reset_connector=True)) -# which cascades through scheduler.reset_connector_cache -> -# MooncakeStoreConnector.reset_cache -> store.remove_all(force=True). -# -# Pre-requisites (caller responsibility): -# 1. Mooncake master started via scripts/start_master.sh on this node. -# 2. MOONCAKE_CONFIG_PATH exported pointing at the JSON config used by -# the master. -# 3. Ray cluster running (single-node or multi-node). -# 4. GSM8K parquet present at /root/data/gsm8k/{train,test}.parquet. -# -# Acceptance: -# - 5 train steps complete without crash -# - mooncake_master.log shows >= 5 RemoveAll calls -# - In at least one cycle, external_prefix_cache_hits > 0 (set via -# vLLM metrics) -set -xeuo pipefail - -cd /workspace/verl - -export MACHINE=gb200 -export INFER_BACKEND=vllm -export MODEL_PATH=Qwen/Qwen3-0.6B - -# Single-tray run for the first Mooncake-store integration smoke. 
-export NNODES=1 -export NGPUS_PER_NODE=4 - -export TRAIN_BATCH_SIZE=32 -export PPO_MINI_BATCH_SIZE=16 -export MAX_PROMPT_LENGTH=512 -export MAX_RESPONSE_LENGTH=512 -export PPO_MAX_TOKEN_LEN_PER_GPU=4096 - -export ROLLOUT_TP=1 -export ROLLOUT_GPU_MEM_UTIL=0.5 -export ROLLOUT_N=2 - -export TOTAL_EPOCHS=1 -export SAVE_FREQ=-1 -export TEST_FREQ=10 - -export PROJECT_NAME=phase_b_mooncake -export EXPERIMENT_NAME=qwen3_06b_gsm8k_5steps_mooncake_${NNODES}nodes - -# Make sure MOONCAKE_CONFIG_PATH is propagated to all rollout workers. -: "${MOONCAKE_CONFIG_PATH:?Set MOONCAKE_CONFIG_PATH before launching}" -export MOONCAKE_CONFIG_PATH - -bash examples/grpo_trainer/run_qwen3_8b_fsdp.sh \ - "data.train_files=['/root/data/gsm8k/train.parquet']" \ - "data.val_files=['/root/data/gsm8k/test.parquet']" \ - trainer.total_training_steps=5 \ - "trainer.logger=['console']" \ - '~ray_kwargs.ray_init.num_gpus' \ - actor_rollout_ref.rollout.kv_store.enable=true \ - "actor_rollout_ref.rollout.kv_store.config_path=${MOONCAKE_CONFIG_PATH}" \ - actor_rollout_ref.rollout.kv_store.kv_role=kv_both -# NOTE: NEVER set NCCL_MNNVL_ENABLE=0 on this rack. Cross-tray traffic must -# go through the NVL72 NVLink switch (~1.8 TB/s/GPU) via MNNVL + IMEX. From 3d5329ce1c763cb5f9824066e12f56361e007470 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 11 May 2026 12:34:46 +0000 Subject: [PATCH 03/15] rollout: simplify Mooncake hard-reset and move doc to perf/ Two cleanups requested in PR review: 1. Use pause_generation's new reset_connector kwarg instead of an extra engine.reset_prefix_cache call afterwards. The paired vLLM patch now threads reset_connector all the way through pause_generation -> pause_scheduler_async -> EngineCore. 
pause_scheduler -> _reset_caches -> Scheduler.reset_prefix_cache, so the hard-reset is a single call: await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, reset_connector=reset_prefix_cache and self._kv_store_enabled, ) No more "AsyncLLM.pause_generation does not propagate reset_connector" workaround. 2. Move docs/advance/rollout_kv_offload.md -> docs/perf/rollout_kv_offload.md. This is a performance / KV offload feature, fits better under "Performance Tuning Guide" next to perf/perf_tuning, perf/dpsk, etc. than under "Advanced Features". Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/{advance => perf}/rollout_kv_offload.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{advance => perf}/rollout_kv_offload.md (100%) diff --git a/docs/advance/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md similarity index 100% rename from docs/advance/rollout_kv_offload.md rename to docs/perf/rollout_kv_offload.md From 92736ca77c78ec5eaa984b7cc029a7c9f0496824 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 11 May 2026 13:00:46 +0000 Subject: [PATCH 04/15] rollout: actually drop the workaround + register doc in perf toctree Previous commit ("simplify Mooncake hard-reset and move doc to perf/") only landed the file rename; the two payload edits accidentally got dropped from the staging round. This commit ships them: - verl/workers/rollout/vllm_rollout/vllm_async_server.py: replace the trailing `await self.engine.reset_prefix_cache(reset_connector=True)` workaround with a single `pause_generation(..., reset_connector=...)` call (rides on the paired vllm reset_connector kwarg now plumbed through pause_generation -> EngineCore._reset_caches). - docs/index.rst: move rollout_kv_offload from "Advanced Features" toctree to "Performance Tuning Guide" toctree, next to perf_tuning and dpsk where it belongs as a perf/KV-offload feature. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/index.rst | 2 +- .../workers/rollout/vllm_rollout/vllm_async_server.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index cc8505b7ee2..cf4adbe4537 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -101,6 +101,7 @@ verl is fast with: perf/best_practices perf/perf_tuning perf/perf_tuning_on_ascend.rst + perf/rollout_kv_offload.md README_vllm0.8.md perf/device_tuning perf/verl_profiler_system.md @@ -143,7 +144,6 @@ verl is fast with: examples/sandbox_fusion_example advance/rollout_trace.rst advance/rollout_skip.rst - advance/rollout_kv_offload.md advance/agent_loop advance/reward_loop data/transfer_queue.md diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index f6438c7e246..0db52437706 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -665,16 +665,15 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 2. Abort all in-flight requests # 3. Wait for requests to drain # 4. Clear prefix and mm caches if clear_cache=True + # reset_connector=True (when an external KV store is attached) + # extends step 4 to also clear the external store, e.g. the + # Mooncake master, so post-update requests can't read KV + # computed against the previous model weights. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, + reset_connector=reset_prefix_cache and self._kv_store_enabled, ) - # AsyncLLM.pause_generation's internal cache flush does not - # propagate reset_connector through; do an explicit one when - # an external KV store is attached so the Mooncake master is - # cleared along with the in-engine prefix cache. 
- if reset_prefix_cache and self._kv_store_enabled: - await self.engine.reset_prefix_cache(reset_connector=True) else: # Take an atomic snapshot to avoid race conditions with the vLLM engine thread request_states_snapshot = list(self.engine.output_processor.request_states.items()) From 258d7f6d1e9e779877fabeeb06e5d63baac4e74a Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 11 May 2026 13:08:39 +0000 Subject: [PATCH 05/15] rollout: drop _kv_store_enabled flag, pass reset_connector=True always MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flag served two roles: 1. Gate whether to attach kv_transfer_config to vllm serve args at launch time. 2. Gate whether to pass reset_connector=True on every cache reset. Role 2 is unnecessary now that the paired vLLM patch (scheduler: treat reset_connector with no connector as no-op success) makes Scheduler.reset_connector_cache return True without a warning when no connector is attached. The reset paths can simply ask for a connector reset unconditionally; the engine decides what to do. Role 1 stays as an inline check on `kv_store_cfg.get("enable", False)` in launch_server — no need to remember the result on the adapter. Result: three reset call sites (`wake_up`, `clear_kv_cache`, `abort_all_requests` via `pause_generation`) all pass `reset_connector=True` unconditionally. No instance flag, no conditional, less state to reason about. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../rollout/vllm_rollout/vllm_async_server.py | 41 ++++++++----------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 0db52437706..b13fdc0bb8c 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -123,11 +123,6 @@ def __init__( self.nnodes = nnodes # model weights version, set by ServerAdapter when update weights. self.global_steps = None - # Whether the engine was launched with an external KV store (Mooncake) - # connector. Set inside launch_server() once kv_transfer_config is - # built; gates reset_connector=True on cache resets so we do not - # serve KV computed against previous weights from the external pool. - self._kv_store_enabled = False if self.rollout_mode != RolloutMode.HYBRID and self.config.load_format == "dummy": logger.warning(f"rollout mode is {self.rollout_mode}, load_format is dummy, set to auto") @@ -357,9 +352,11 @@ async def launch_server(self, master_address: str = None, master_port: int = Non # weight update we drive a hard reset via reset_prefix_cache( # reset_connector=True), which makes vLLM call # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True). + # When no connector is attached the scheduler treats reset_connector=True + # as a no-op success, so we can pass it unconditionally on the reset + # paths and avoid carrying state into the adapter. 
kv_store_cfg = getattr(self.config, "kv_store", None) - self._kv_store_enabled = bool(kv_store_cfg and kv_store_cfg.get("enable", False)) - if self._kv_store_enabled: + if kv_store_cfg and kv_store_cfg.get("enable", False): extra_config = dict(kv_store_cfg.get("extra_config", {}) or {}) config_path = kv_store_cfg.get("config_path") if config_path: @@ -590,9 +587,10 @@ async def wake_up(self): elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - await self.engine.reset_prefix_cache( - reset_connector=self._kv_store_enabled, - ) + # reset_connector=True is a no-op when no connector is attached + # (scheduler treats it as success), so we don't need to gate this + # behind a kv_store-enabled flag. + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -625,13 +623,11 @@ async def stop_profile(self): async def clear_kv_cache(self): if self.node_rank == 0: - # When kv_store is enabled, propagate reset_connector=True so the - # external Mooncake store (whose entries were computed against the - # previous model weights) is also dropped before any new request - # can read stale KV via vLLM's external prefix cache path. - await self.engine.reset_prefix_cache( - reset_connector=self._kv_store_enabled, - ) + # reset_connector=True drops any attached external KV store + # (e.g. MooncakeStoreConnector) whose entries were computed + # against the previous model weights. With no connector it + # is a no-op success, so we can pass it unconditionally. + await self.engine.reset_prefix_cache(reset_connector=True) async def set_global_steps(self, global_steps: int): """Set the global steps of the model weights.""" @@ -664,15 +660,14 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 1. 
Set engine to paused state (blocks new generate calls) # 2. Abort all in-flight requests # 3. Wait for requests to drain - # 4. Clear prefix and mm caches if clear_cache=True - # reset_connector=True (when an external KV store is attached) - # extends step 4 to also clear the external store, e.g. the - # Mooncake master, so post-update requests can't read KV - # computed against the previous model weights. + # 4. Clear prefix and mm caches if clear_cache=True; extend + # step 4 to also clear the external store (e.g. Mooncake + # master) when reset_connector=True. No-op success when + # no connector is attached. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, - reset_connector=reset_prefix_cache and self._kv_store_enabled, + reset_connector=reset_prefix_cache, ) else: # Take an atomic snapshot to avoid race conditions with the vLLM engine thread From 87aba0d2c7dadc6a62f7242dfe4375826d628368 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Sat, 16 May 2026 14:27:34 +0000 Subject: [PATCH 06/15] rollout: drop KVStoreConfig, use engine_kwargs.vllm.kv_transfer_config passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the verl-specific KVStoreConfig dataclass (enable/kv_connector/ kv_role/config_path/extra_config/on_failure) with the existing generic `actor_rollout_ref.rollout.engine_kwargs.vllm.` passthrough. Users opting into the Mooncake-Store offload now set `engine_kwargs.vllm.kv_transfer_config` to the JSON string that vLLM's own --kv-transfer-config CLI flag already accepts (vLLM decodes it into its first-class KVTransferConfig). Net effect: - No verl-side schema to learn / document / migrate when vLLM adds new kv_transfer_config fields or new KV connectors (NixlConnector, P2pNcclConnector, MultiConnector, future ones — all work without a verl change). 
- ~40 lines of dataclass + 23 lines of "if enable: build dict" launch logic deleted; no behavior change for existing users (the field was default-disabled). - The on_failure="fallback" soft-dependency knob is dropped. Silent disable of a configured KV store is the wrong default for RL runs — hours of post-update stale-cache reads would be hidden. vLLM serve now fails loud if the Mooncake master is unreachable at engine launch; callers wanting soft mode can wrap the launch with a pre-flight healthcheck. Also drop the now-unsupported `reset_connector=` kwarg from the `AsyncLLM.pause_generation(...)` call in abort_all_requests — upstream vLLM does not (and per the paired cascade PR, does not need to) expose that kwarg. EngineCore._reset_caches defaults reset_connector=True so the connector cascade fires automatically whenever pause_generation runs with clear_cache=True. The wake_up / clear_kv_cache call sites keep the explicit reset_connector=True on reset_prefix_cache (the supported upstream entry point); both are no-op success when no connector is configured, so they remain safe to pass unconditionally. Paired vLLM PR: vllm-project/vllm#42694. Co-Authored-By: Claude Opus 4.7 (1M context) --- verl/workers/config/rollout.py | 41 ----------------- .../rollout/vllm_rollout/vllm_async_server.py | 44 +++++-------------- 2 files changed, 10 insertions(+), 75 deletions(-) diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 4bbd6c99ac0..7731fa9d592 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -31,48 +31,10 @@ "PrometheusConfig", "RolloutConfig", "CheckpointEngineConfig", - "KVStoreConfig", "SkipConfig", ] -@dataclass -class KVStoreConfig(BaseConfig): - """External KV cache store configuration for the vLLM rollout engine. - - When ``enable`` is true, verl asks vLLM to attach a - ``MooncakeStoreConnector`` so that prefix KV blocks are offloaded to a - shared Mooncake master. 
On every weight update verl drives a hard reset - (``engine.reset_prefix_cache(reset_connector=True)``) so the master's - stale entries are dropped before any new rollout reads them. - - See ``mooncake-integration/`` for the per-run master start/stop wrapper. - """ - - # Enable external KV store offload. - enable: bool = False - - # KVConnector class name forwarded to ``--kv-transfer-config``. - kv_connector: str = "MooncakeStoreConnector" - - # vLLM kv_role. ``kv_both`` lets the rollout engine both put and get. - kv_role: str = "kv_both" - - # Path to Mooncake client config JSON (``master_server_address``, - # ``global_segment_size``, ``protocol``, ...). Falls back to the - # ``MOONCAKE_CONFIG_PATH`` env var when None. - config_path: Optional[str] = None - - # Optional extra dict merged into vLLM's - # ``kv_connector_extra_config``. Use sparingly. - extra_config: dict = field(default_factory=dict) - - # Behavior when the Mooncake master is unreachable at engine launch: - # ``fallback`` (default) -> log a warning, drop the connector, keep - # training; ``crash`` -> let the engine start fail and propagate. - on_failure: str = "fallback" - - @dataclass class SkipConfig(BaseConfig): """ @@ -282,9 +244,6 @@ class RolloutConfig(BaseConfig): # Checkpoint Engine config for update weights from trainer to rollout checkpoint_engine: CheckpointEngineConfig = field(default_factory=CheckpointEngineConfig) - # External Mooncake KV store offload (RL-correct hard-reset path). 
- kv_store: KVStoreConfig = field(default_factory=KVStoreConfig) - # Rollout skip config (load/dump rollout data) skip: SkipConfig = field(default_factory=SkipConfig) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index b13fdc0bb8c..38b5c6b1524 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -346,32 +346,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non if self.config.enable_rollout_routing_replay: args.update({"enable_return_routed_experts": True}) - # External Mooncake KV store offload: forward kv_transfer_config to - # vLLM if rollout.kv_store.enable is set. The connector itself decides - # how to talk to the Mooncake master (config_path / env). On every - # weight update we drive a hard reset via reset_prefix_cache( - # reset_connector=True), which makes vLLM call - # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True). - # When no connector is attached the scheduler treats reset_connector=True - # as a no-op success, so we can pass it unconditionally on the reset - # paths and avoid carrying state into the adapter. - kv_store_cfg = getattr(self.config, "kv_store", None) - if kv_store_cfg and kv_store_cfg.get("enable", False): - extra_config = dict(kv_store_cfg.get("extra_config", {}) or {}) - config_path = kv_store_cfg.get("config_path") - if config_path: - # MooncakeStoreWorker reads MOONCAKE_CONFIG_PATH from env; - # vLLM serve also accepts kv_connector_extra_config so we - # pass both for clarity. The env var is set by the per-run - # master wrapper script before `ray job submit`. 
- extra_config.setdefault("mooncake_config_path", config_path) - kv_transfer_config = { - "kv_connector": kv_store_cfg.get("kv_connector", "MooncakeStoreConnector"), - "kv_role": kv_store_cfg.get("kv_role", "kv_both"), - "kv_connector_extra_config": extra_config, - } - args["kv_transfer_config"] = json.dumps(kv_transfer_config) - server_args = ["serve", self.model_config.local_path] + build_cli_args_from_config(args) if self.replica_rank == 0: @@ -587,9 +561,10 @@ async def wake_up(self): elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - # reset_connector=True is a no-op when no connector is attached - # (scheduler treats it as success), so we don't need to gate this - # behind a kv_store-enabled flag. + # reset_connector=True drops any attached external KV store + # (e.g. MooncakeStoreConnector) whose entries were computed + # against the previous weights. No-op success when no connector + # is configured (vLLM scheduler treats it as such). await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -660,14 +635,15 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 1. Set engine to paused state (blocks new generate calls) # 2. Abort all in-flight requests # 3. Wait for requests to drain - # 4. Clear prefix and mm caches if clear_cache=True; extend - # step 4 to also clear the external store (e.g. Mooncake - # master) when reset_connector=True. No-op success when - # no connector is attached. + # 4. Clear prefix and mm caches if clear_cache=True. + # EngineCore._reset_caches defaults reset_connector=True + # on this path, so any attached external KV store (e.g. 
+ # MooncakeStoreConnector) is invalidated along with the + # local prefix cache — RL-correct hard-reset at every + # weight update boundary, no extra kwargs needed. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, - reset_connector=reset_prefix_cache, ) else: # Take an atomic snapshot to avoid race conditions with the vLLM engine thread From 2529613084b56334094d060cc067a161ee31be95 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Sat, 16 May 2026 14:27:45 +0000 Subject: [PATCH 07/15] docs: rewrite rollout_kv_offload for engine_kwargs.vllm.kv_transfer_config passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verl-side KVStoreConfig dataclass is gone; the doc page no longer needs to document its fields. Replace the configuration section with the actual minimal recipe: set engine_kwargs.vllm.kv_transfer_config to a JSON string that vLLM's KVTransferConfig already accepts. Also: - Update the RL-correctness section to describe both reset entry points: explicit reset_prefix_cache(reset_connector=True) from wake_up / clear_kv_cache, and the automatic cascade through EngineCore._reset_caches when abort_all_requests goes through pause_generation(clear_cache=True). - Pin the paired vLLM PR reference to the canonical upstream PR vllm-project/vllm#42694 (replaces the earlier ivanium-fork link). - Drop the on_failure "fallback" row from the SGLang comparison table; verl no longer offers that knob — vLLM serve fails loud if the master is unreachable, which is the right default for RL. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/perf/rollout_kv_offload.md | 139 ++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 62 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index 1b903c72b6f..86a69db8576 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -1,6 +1,6 @@ # Rollout KV Cache Offload via Mooncake-Store -Last updated: 2026-05-11. +Last updated: 2026-05-16. This document covers how to offload prefix KV blocks from the vLLM rollout engine to a shared **Mooncake** distributed store, so that long shared @@ -13,7 +13,7 @@ within a single weight generation. Enable when **all** of these hold: - Rollout workload has long, reusable prefixes (multi-turn / agentic / shared - system prompt with many `actor_rollout_ref.rollout.n` samples per prompt). + system prompt with many ``actor_rollout_ref.rollout.n`` samples per prompt). - The rollout-side prefix-cache hit rate on a single engine is already saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica fully-async) is leaving hits on the table. @@ -32,13 +32,16 @@ between rollout steps. Any KV block written to the external store before a weight update is **computed against the previous policy** — serving it to a post-update request would silently corrupt inference. -verl handles this correctly: the existing ``update_weights`` flow in -``verl.checkpoint_engine.base.CheckpointEngineManager`` already does an -``abort_all_requests`` -> drain -> sleep -> NCCL weight sync -> ``wake_up`` -sequence. When this feature is enabled, every prefix-cache-reset call site -in ``vllm_async_server.py`` (``wake_up``, ``clear_kv_cache``, the -``abort_all_requests`` fallback path) additionally passes -``reset_connector=True`` to ``engine.reset_prefix_cache(...)``. +verl handles this correctly. 
The three prefix-cache-reset call sites in +``vllm_async_server.py`` propagate the connector reset: + +- ``wake_up`` and ``clear_kv_cache`` call + ``engine.reset_prefix_cache(reset_connector=True)`` explicitly. +- ``abort_all_requests`` calls + ``engine.pause_generation(clear_cache=True)``; in vLLM ≥ the paired + cascade patch, ``EngineCore._reset_caches`` defaults + ``reset_connector=True`` so the connector cascade fires automatically + whenever ``pause_generation`` clears caches. That flag cascades into vLLM as: @@ -47,7 +50,7 @@ That flag cascades into vLLM as: -> MooncakeStoreConnector.reset_cache() (SCHEDULER role) -> MooncakeStoreScheduler.reset_store() -> LookupKeyClient.reset() (ZMQ admin frame) - -> LookupKeyServer (worker rank 0) recognizes RESET_MAGIC + -> LookupKeyServer (worker rank 0) typed dispatch -> store.remove_all(force=True) (Mooncake master) The net effect is that after each ``update_weights`` round, the Mooncake @@ -57,60 +60,71 @@ the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g. an in-flight sequence still holds blocks), the external store is *also* left untouched, so internal and external caches never desynchronize. +When **no** KV connector is attached, ``reset_connector=True`` is a no-op +success in upstream vLLM (the scheduler treats "nothing to reset" as +trivially OK). Passing it unconditionally is therefore safe for every +rollout — there is no verl-side feature flag to remember. + ## Pre-requisites -1. **Mooncake**: install the Python binding (``pip install +1. **vLLM**: build that includes the ``MooncakeStoreConnector.reset_cache`` + cascade (vllm-project/vllm#42694) and the ``EngineCore._reset_caches`` + default that threads ``reset_connector=True`` from + ``pause_generation(clear_cache=True)``. Without the cascade, + ``reset_connector=True`` clears only the local prefix cache and leaves + the Mooncake master populated with stale KV — silent correctness loss. 
+ Do **not** enable the external store without that vLLM build. +2. **Mooncake**: install the Python binding (``pip install mooncake-transfer-engine``) and a master binary that exposes the - ``RemoveAll`` RPC. On aarch64 GB200 this currently means building from - ``ivanium/Mooncake`` ``yifan/dev`` with ``-DUSE_CUDA=ON - -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``. -2. **vLLM**: 0.20.1+ with the paired ``MooncakeStoreConnector.reset_cache`` - patch (the cascade hook). Without it the ``reset_connector=True`` flag - is a no-op and silent stale-cache corruption is possible — do not enable - ``kv_store.enable`` without the paired patch. -3. **Mooncake master process**: launched out-of-band (typically per-run, see - ``scripts/mooncake/start_mooncake_master.sh``). Single-tenant per run is - recommended because ``RemoveAll`` is master-wide; multi-tenant sharing - would let one experiment wipe another's cache. + ``RemoveAll`` RPC. On aarch64 GB200 build from upstream with + ``-DUSE_CUDA=ON -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``. +3. **Mooncake master process**: launched out-of-band, typically per-run + (see ``scripts/mooncake/start_mooncake_master.sh`` in the + mooncake-integration project). Single-tenant per run is recommended + because ``RemoveAll`` is master-wide; multi-tenant sharing would let + one experiment wipe another's cache. ## Configuration -Under ``actor_rollout_ref.rollout``: +verl forwards any key under ``actor_rollout_ref.rollout.engine_kwargs.vllm`` +to ``vllm serve`` as a CLI flag. 
To attach the Mooncake connector, set +``kv_transfer_config`` directly — the JSON shape is vLLM's own +``KVTransferConfig`` schema (see ``vllm/config/__init__.py`` -> +``KVTransferConfig``): ```yaml actor_rollout_ref: rollout: - kv_store: - enable: true # default false - kv_connector: MooncakeStoreConnector # forwarded to vLLM - kv_role: kv_both # both put + get - config_path: /path/to/mooncake_config.json # passed via env - extra_config: {} # additional kv_connector_extra_config - on_failure: fallback # fallback | crash + engine_kwargs: + vllm: + kv_transfer_config: |- + { + "kv_connector": "MooncakeStoreConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "mooncake_config_path": "/path/to/mooncake_config.json" + } + } ``` -Field reference: - -- ``enable``: Master switch. When false (default), no ``kv_transfer_config`` - is attached to the vLLM engine and verl behaves exactly as before. -- ``kv_connector``: KVConnector class name forwarded to vLLM's - ``--kv-transfer-config``. Override only for testing alternate backends. -- ``kv_role``: ``kv_both`` lets the rollout engine both write blocks to and - read blocks from the store. Other values are exposed for the same reason - as ``kv_connector`` but the rollout path expects ``kv_both``. -- ``config_path``: Path to the Mooncake client JSON config - (``master_server_address``, ``global_segment_size``, ``protocol``, ...). - Falls back to the ``MOONCAKE_CONFIG_PATH`` environment variable when - unset — most installations should set the env once at the cluster level - and leave this field empty. -- ``extra_config``: Dict merged into vLLM's ``kv_connector_extra_config``. - Reserve for connector-specific knobs that don't have a first-class field. -- ``on_failure``: Behavior when the Mooncake master is unreachable at - rollout-engine launch. 
- ``fallback`` (default) drops the connector and continues training with - external offload disabled — a soft dependency suited to long RL runs - where pausing training for an infra hiccup is more expensive than losing - cross-engine prefix hits. Set to ``crash`` to fail fast (e.g., in CI). +Equivalently as inline JSON (single line): + +```yaml +actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config: '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"mooncake_config_path":"/path/to/mooncake_config.json"}}' +``` + +verl serializes ``engine_kwargs.vllm.kv_transfer_config`` into the +``--kv-transfer-config`` argument of ``vllm serve``; vLLM's own arg parser +decodes the JSON into ``KVTransferConfig`` and constructs the connector. +There is no verl-side schema layer — any field vLLM accepts is accepted +here, and any future vLLM-side KV connector (NIXL, P2pNcclConnector, future +ones) can be wired the same way without a verl change. + +If the Mooncake master is unreachable at engine launch, vLLM crashes the +serve subprocess. That's the intended fail-loud behavior; an RL run that +silently disables a configured KV store would hide hours of stale-cache +corruption. If you want a "soft" mode, wrap the launch with a healthcheck +of the master before starting verl. ## Required environment variables @@ -119,7 +133,8 @@ choices. - ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate ``master_server_address`` etc. Set on every rollout actor; verl - propagates it via ``ray`` runtime_env. + propagates it via ``ray`` runtime_env. You can also pass it inline via + ``kv_connector_extra_config.mooncake_config_path`` as shown above. - ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple rollout replicas — vLLM's block-hash seed is randomized per process and cross-engine prefix-cache hits will silently drop to zero without a @@ -140,7 +155,8 @@ choices. field. 
The hard-reset model relies on the master being cleaned per weight update, not on key versioning. - **Failure modes**: - - Master unreachable at launch: see ``on_failure``. + - Master unreachable at launch: vLLM serve fails to start; verl + surfaces the underlying connector error. Fail-loud. - Master goes down mid-run: the rollout engine continues using its in-engine prefix cache; ``reset_store`` returns False and the rest of ``reset_prefix_cache`` still works. Re-attaching to a new master @@ -156,17 +172,16 @@ choices. | Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag | | Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset | | Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep | -| Failure surface | Client must read return value and re-issue | Soft-dependency (``on_failure=fallback``) by default; explicit False propagates up to the scheduler | ## Reference -- Paired vLLM upstream patch: implements - ``MooncakeStoreConnector.reset_cache()`` and the - ``RESET_MAGIC`` ZMQ discriminator (see ``aoshen02/vllm:feat/mooncake-clear-hook`` - -> ``ivanium/vllm`` -> upstream). +- Paired vLLM upstream PR (must be in your vLLM build): + [vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) — + implements ``MooncakeStoreConnector.reset_cache()``, the typed-tag ZMQ + protocol, and the ``EngineCore._reset_caches`` default that threads + ``reset_connector=True`` through ``pause_generation``. 
- vLLM scheduler hook the cascade rides on: - ``vllm/v1/core/sched/scheduler.py:1871`` (``reset_prefix_cache``) and - ``1917`` (``reset_connector_cache``) — already shipped in 0.20.1, the - patch only provides the connector-specific ``reset_cache`` body. + ``vllm/v1/core/sched/scheduler.py`` (``reset_prefix_cache`` and + ``reset_connector_cache``). - Sister rollout-correctness recipes: ``advance/rollout_corr.md``, ``advance/rollout_corr_math.md``. From e1c471dda10f65be5e00ecc9f817a3965010fdb8 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 25 May 2026 12:09:42 +0000 Subject: [PATCH 08/15] docs(rollout_kv_offload): simplify, defer Mooncake setup to vLLM upstream doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cut 188 → 89 lines. The previous version duplicated vLLM-side setup (client install, master launch, JSON config) that already lives in the official vLLM Mooncake guide and drifts out of date here. Point readers at that upstream guide for setup and keep only what is verl-specific: the engine_kwargs.vllm.kv_transfer_config wiring, the reset_connector=True cascade contract for RL correctness (with the required vLLM build), and the when-to-enable / failure-mode summary. Co-authored-by: Claude Signed-off-by: aoshen --- docs/perf/rollout_kv_offload.md | 234 ++++++++++---------------------- 1 file changed, 68 insertions(+), 166 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index 86a69db8576..fe077874293 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -1,96 +1,22 @@ # Rollout KV Cache Offload via Mooncake-Store -Last updated: 2026-05-16.
- -This document covers how to offload prefix KV blocks from the vLLM rollout -engine to a shared **Mooncake** distributed store, so that long shared -prefixes (system prompt + task description + earlier turns in agentic -workloads) get deduplicated across requests and across rollout replicas -within a single weight generation. - -## When to use this - -Enable when **all** of these hold: - -- Rollout workload has long, reusable prefixes (multi-turn / agentic / shared - system prompt with many ``actor_rollout_ref.rollout.n`` samples per prompt). -- The rollout-side prefix-cache hit rate on a single engine is already - saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica - fully-async) is leaving hits on the table. -- You can run a Mooncake master process colocated with (or reachable from) - every rollout host, and you have enough RDMA / TCP bandwidth between the - rollout workers and the master. - -Do **not** enable for short prompts or small ``rollout.n`` workloads where -within-engine prefix cache is already enough — the round-trip to Mooncake -will net out negative. - -## RL-correctness contract: hard reset on every weight update - -The unique constraint of RL training is that the model weights change -between rollout steps. Any KV block written to the external store before a -weight update is **computed against the previous policy** — serving it to a -post-update request would silently corrupt inference. - -verl handles this correctly. The three prefix-cache-reset call sites in -``vllm_async_server.py`` propagate the connector reset: - -- ``wake_up`` and ``clear_kv_cache`` call - ``engine.reset_prefix_cache(reset_connector=True)`` explicitly. -- ``abort_all_requests`` calls - ``engine.pause_generation(clear_cache=True)``; in vLLM ≥ the paired - cascade patch, ``EngineCore._reset_caches`` defaults - ``reset_connector=True`` so the connector cascade fires automatically - whenever ``pause_generation`` clears caches. 
- -That flag cascades into vLLM as: - - Scheduler.reset_prefix_cache(reset_connector=True) - -> Scheduler.reset_connector_cache() - -> MooncakeStoreConnector.reset_cache() (SCHEDULER role) - -> MooncakeStoreScheduler.reset_store() - -> LookupKeyClient.reset() (ZMQ admin frame) - -> LookupKeyServer (worker rank 0) typed dispatch - -> store.remove_all(force=True) (Mooncake master) - -The net effect is that after each ``update_weights`` round, the Mooncake -master is empty before any new rollout request starts — matching the -existing in-engine prefix-cache behavior. The contract is symmetric: if -the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g. -an in-flight sequence still holds blocks), the external store is *also* -left untouched, so internal and external caches never desynchronize. - -When **no** KV connector is attached, ``reset_connector=True`` is a no-op -success in upstream vLLM (the scheduler treats "nothing to reset" as -trivially OK). Passing it unconditionally is therefore safe for every -rollout — there is no verl-side feature flag to remember. - -## Pre-requisites - -1. **vLLM**: build that includes the ``MooncakeStoreConnector.reset_cache`` - cascade (vllm-project/vllm#42694) and the ``EngineCore._reset_caches`` - default that threads ``reset_connector=True`` from - ``pause_generation(clear_cache=True)``. Without the cascade, - ``reset_connector=True`` clears only the local prefix cache and leaves - the Mooncake master populated with stale KV — silent correctness loss. - Do **not** enable the external store without that vLLM build. -2. **Mooncake**: install the Python binding (``pip install - mooncake-transfer-engine``) and a master binary that exposes the - ``RemoveAll`` RPC. On aarch64 GB200 build from upstream with - ``-DUSE_CUDA=ON -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``. -3. 
**Mooncake master process**: launched out-of-band, typically per-run - (see ``scripts/mooncake/start_mooncake_master.sh`` in the - mooncake-integration project). Single-tenant per run is recommended - because ``RemoveAll`` is master-wide; multi-tenant sharing would let - one experiment wipe another's cache. - -## Configuration - -verl forwards any key under ``actor_rollout_ref.rollout.engine_kwargs.vllm`` -to ``vllm serve`` as a CLI flag. To attach the Mooncake connector, set -``kv_transfer_config`` directly — the JSON shape is vLLM's own -``KVTransferConfig`` schema (see ``vllm/config/__init__.py`` -> -``KVTransferConfig``): +Offload prefix KV blocks from the vLLM rollout engine to a shared +[Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared +prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt) +get deduplicated across requests and rollout replicas. + +## Setup Mooncake + vLLM + +Follow vLLM's official guide for installing the Mooncake client, starting a +master, and writing the JSON config: +**** + +The verl side only consumes whatever that doc produces — no extra steps. + +## Enable in verl + +verl forwards `engine_kwargs.vllm.*` straight to `vllm serve` as CLI flags. 
+To attach the Mooncake connector, set `kv_transfer_config`: ```yaml actor_rollout_ref: @@ -107,81 +33,57 @@ actor_rollout_ref: } ``` -Equivalently as inline JSON (single line): +Or as a Hydra CLI override: -```yaml -actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config: '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"mooncake_config_path":"/path/to/mooncake_config.json"}}' +```bash ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json ``` -verl serializes ``engine_kwargs.vllm.kv_transfer_config`` into the -``--kv-transfer-config`` argument of ``vllm serve``; vLLM's own arg parser -decodes the JSON into ``KVTransferConfig`` and constructs the connector. -There is no verl-side schema layer — any field vLLM accepts is accepted -here, and any future vLLM-side KV connector (NIXL, P2pNcclConnector, future -ones) can be wired the same way without a verl change. - -If the Mooncake master is unreachable at engine launch, vLLM crashes the -serve subprocess. That's the intended fail-loud behavior; an RL run that -silently disables a configured KV store would hide hours of stale-cache -corruption. If you want a "soft" mode, wrap the launch with a healthcheck -of the master before starting verl. - -## Required environment variables - -These are intentionally *not* set automatically — they're cluster-level -choices. - -- ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate - ``master_server_address`` etc. Set on every rollout actor; verl - propagates it via ``ray`` runtime_env. You can also pass it inline via - ``kv_connector_extra_config.mooncake_config_path`` as shown above. 
-- ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple - rollout replicas — vLLM's block-hash seed is randomized per process and - cross-engine prefix-cache hits will silently drop to zero without a - fixed seed. Single-engine rollouts can leave this unset. - -## Operational notes - -- **Cluster hygiene**: a per-run master is the recommended deployment. - Reuse across runs invites cross-experiment cache pollution (and a - ``reset_connector=True`` from one run will wipe the other's keys). -- **Reset cost**: every weight update triggers ``RemoveAll`` on the - master, which iterates all metadata shards. On a ~600 GB store this is - sub-second. Frequent fully-async weight syncs (sync every 1-2 rollout - steps) will see the per-update hit rate stay low; this is expected and - matches the in-engine prefix-cache behavior under the same conditions. -- **No version tagging**: keys are pure content-addressed - (``{model_name}@tp_rank:N@...@{block_hash}``); no weight-generation - field. The hard-reset model relies on the master being cleaned per - weight update, not on key versioning. -- **Failure modes**: - - Master unreachable at launch: vLLM serve fails to start; verl - surfaces the underlying connector error. Fail-loud. - - Master goes down mid-run: the rollout engine continues using its - in-engine prefix cache; ``reset_store`` returns False and the rest of - ``reset_prefix_cache`` still works. Re-attaching to a new master - mid-run is **not** supported in this version. - - Cross-rank ``block_hash`` divergence (forgot ``PYTHONHASHSEED``): - silent zero hit rate. Inspect - ``vllm_external_prefix_cache_hits`` metric to detect. - -## Comparison vs. 
SGLang's HiCacheStorage flow - -| Aspect | SGLang ``/flush_cache`` | verl + vLLM ``MooncakeStoreConnector`` | -|---|---|---| -| Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag | -| Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset | -| Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep | - -## Reference - -- Paired vLLM upstream PR (must be in your vLLM build): - [vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) — - implements ``MooncakeStoreConnector.reset_cache()``, the typed-tag ZMQ - protocol, and the ``EngineCore._reset_caches`` default that threads - ``reset_connector=True`` through ``pause_generation``. -- vLLM scheduler hook the cascade rides on: - ``vllm/v1/core/sched/scheduler.py`` (``reset_prefix_cache`` and - ``reset_connector_cache``). -- Sister rollout-correctness recipes: ``advance/rollout_corr.md``, - ``advance/rollout_corr_math.md``. +Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout +actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout +replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized +per process and cross-engine hits drop to zero without it. + +## RL correctness: hard reset on every weight update + +Model weights change between rollout steps, so any KV block written to the +external store under the previous policy must be evicted before the next +rollout starts — otherwise stale KV silently corrupts inference. 
verl +handles this automatically via `engine.reset_prefix_cache(reset_connector=True)` +in `vllm_async_server.py`'s `wake_up` / `clear_kv_cache` / `abort_all_requests` +paths. The flag cascades through vLLM into `MooncakeStoreConnector.reset_cache()`, +which clears the master via the `RemoveAll` RPC. + +**Required vLLM build**: must include +[vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) +(`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default +that threads `reset_connector=True` through `pause_generation`). Without it, +`reset_connector=True` clears only the local prefix cache and leaves the +Mooncake master populated with stale KV — silent correctness loss. Do not +enable the connector on an older vLLM build. + +When no connector is attached, `reset_connector=True` is a no-op success in +upstream vLLM, so this code path is always safe. + +## When to enable + +Enable when **all** hold: + +- Rollout has long, reusable prefixes (multi-turn agentic, large `rollout.n`). +- Within-engine prefix cache hit rate is already saturated. +- You have cross-engine reuse opportunity (DP > 1, multi-replica fully-async). + +For short prompts or small `rollout.n` on a single engine, the round-trip to +Mooncake nets out negative — keep it off. + +## Failure modes + +- **Master unreachable at launch**: vLLM serve fails to start. Fail-loud is + intentional — silently disabling a configured KV store would hide stale-cache + corruption. +- **Master dies mid-run**: rollout falls back to local prefix cache; + re-attaching is not supported in this version. +- **Cross-rank `block_hash` divergence** (forgot `PYTHONHASHSEED`): silent + zero hit rate. Check the `vllm_external_prefix_cache_hits` metric. 
From 22b94d2c7be4f3a67b121ae1e9e93c2e4a25c91c Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 25 May 2026 12:21:50 +0000 Subject: [PATCH 09/15] docs(rollout_kv_offload): drop Failure modes and When-to-enable sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 89 → 57 lines. Cut four items that don't belong in a "how to enable" doc: the Failure modes section (operational advice that drifts), the When to enable section (opinion / negative recommendation), the duplicate Hydra CLI override block (YAML form is enough), and a trailing line on no-op upstream behavior. Keep only: setup link to vLLM doc, the YAML wiring, and the RL-correctness reset cascade contract with required vLLM build. Co-authored-by: Claude Signed-off-by: aoshen --- docs/perf/rollout_kv_offload.md | 35 +-------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index fe077874293..8dbcd380528 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -33,14 +33,6 @@ actor_rollout_ref: } ``` -Or as a Hydra CLI override: - -```bash -+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \ -+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \ -+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json -``` - Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized @@ -61,29 +53,4 @@ which clears the master via the `RemoveAll` RPC. (`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default that threads `reset_connector=True` through `pause_generation`). 
Without it, `reset_connector=True` clears only the local prefix cache and leaves the -Mooncake master populated with stale KV — silent correctness loss. Do not -enable the connector on an older vLLM build. - -When no connector is attached, `reset_connector=True` is a no-op success in -upstream vLLM, so this code path is always safe. - -## When to enable - -Enable when **all** hold: - -- Rollout has long, reusable prefixes (multi-turn agentic, large `rollout.n`). -- Within-engine prefix cache hit rate is already saturated. -- You have cross-engine reuse opportunity (DP > 1, multi-replica fully-async). - -For short prompts or small `rollout.n` on a single engine, the round-trip to -Mooncake nets out negative — keep it off. - -## Failure modes - -- **Master unreachable at launch**: vLLM serve fails to start. Fail-loud is - intentional — silently disabling a configured KV store would hide stale-cache - corruption. -- **Master dies mid-run**: rollout falls back to local prefix cache; - re-attaching is not supported in this version. -- **Cross-rank `block_hash` divergence** (forgot `PYTHONHASHSEED`): silent - zero hit rate. Check the `vllm_external_prefix_cache_hits` metric. +Mooncake master populated with stale KV — silent correctness loss. From fb7ceef319988a21637b4b32b65c2b834e968356 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Mon, 25 May 2026 12:23:18 +0000 Subject: [PATCH 10/15] docs(rollout_kv_offload): restore Hydra CLI override example Some users drive verl entirely from CLI overrides without touching YAML files, so keep the CLI form as a parallel example. 
Co-authored-by: Claude Signed-off-by: aoshen --- docs/perf/rollout_kv_offload.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index 8dbcd380528..aff9a74185d 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -33,6 +33,14 @@ actor_rollout_ref: } ``` +Or as a Hydra CLI override: + +```bash ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json +``` + Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized From baf6efda00dc5dd533c007f10ea29d92e52af576 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Wed, 27 May 2026 12:23:20 +0000 Subject: [PATCH 11/15] docs: simplify rollout kv offload guide --- docs/perf/rollout_kv_offload.md | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index aff9a74185d..507ffd1ad68 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -3,7 +3,9 @@ Offload prefix KV blocks from the vLLM rollout engine to a shared [Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt) -get deduplicated across requests and rollout replicas. +get deduplicated across requests and rollout replicas. This also helps +long-tail load balancing: when work migrates to idle rollout replicas, shared +prefix KV reduces the re-prefill cost. 
## Setup Mooncake + vLLM @@ -41,24 +43,10 @@ Or as a Hydra CLI override: +actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json ``` -Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout -actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout -replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized -per process and cross-engine hits drop to zero without it. - ## RL correctness: hard reset on every weight update -Model weights change between rollout steps, so any KV block written to the -external store under the previous policy must be evicted before the next -rollout starts — otherwise stale KV silently corrupts inference. verl -handles this automatically via `engine.reset_prefix_cache(reset_connector=True)` -in `vllm_async_server.py`'s `wake_up` / `clear_kv_cache` / `abort_all_requests` -paths. The flag cascades through vLLM into `MooncakeStoreConnector.reset_cache()`, -which clears the master via the `RemoveAll` RPC. +verl clears both local and Mooncake KV caches at every weight update boundary +to avoid reusing KV from the previous policy. -**Required vLLM build**: must include -[vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) -(`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default -that threads `reset_connector=True` through `pause_generation`). Without it, -`reset_connector=True` clears only the local prefix cache and leaves the -Mooncake master populated with stale KV — silent correctness loss. +**Required vLLM version**: use vLLM 0.22 or newer. Older builds may leave stale +KV in the Mooncake master after a weight update. 
From 07f176519fffc937dae0354ff5ce5d0d75fe4cd3 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Wed, 27 May 2026 12:41:27 +0000 Subject: [PATCH 12/15] fix(vllm): guard reset_connector for older vllm --- .../rollout/vllm_rollout/vllm_async_server.py | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index dadd9a4243d..5390b197ce5 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -54,6 +54,9 @@ ) _VLLM_VERSION = version.parse(vllm.__version__) +_RESET_PREFIX_CACHE_KWARGS = {} +if _VLLM_VERSION >= version.parse("0.22.0"): + _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True if _VLLM_VERSION > version.parse("0.11.0"): @@ -607,15 +610,11 @@ async def wake_up(self, tags: list[str] | None = None): # processes across all DP shards (unlike collective_rpc which only reaches # TP workers within a single shard). await self.engine.wake_up(tags=tags or self._get_wake_up_tags()) - await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - # reset_connector=True drops any attached external KV store - # (e.g. MooncakeStoreConnector) whose entries were computed - # against the previous weights. No-op success when no connector - # is configured (vLLM scheduler treats it as such). 
- await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -632,11 +631,7 @@ async def sleep(self): async def clear_kv_cache(self): if self.node_rank == 0: - # reset_connector=True drops any attached external KV store - # (e.g. MooncakeStoreConnector) whose entries were computed - # against the previous model weights. With no connector it - # is a no-op success, so we can pass it unconditionally. - await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) async def release_kv_cache(self): """Release only kv_cache GPU memory, keeping model weights intact. @@ -698,11 +693,9 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 2. Abort all in-flight requests # 3. Wait for requests to drain # 4. Clear prefix and mm caches if clear_cache=True. - # EngineCore._reset_caches defaults reset_connector=True - # on this path, so any attached external KV store (e.g. - # MooncakeStoreConnector) is invalidated along with the - # local prefix cache — RL-correct hard-reset at every - # weight update boundary, no extra kwargs needed. + # On vLLM >= 0.22.0, attached external KV stores (e.g. + # MooncakeStoreConnector) are reset along with the local + # prefix cache. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, From 6fd5654d60b207a9281cfcd4c0a4e4fbee93f118 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Wed, 27 May 2026 13:10:45 +0000 Subject: [PATCH 13/15] Revert "fix(vllm): guard reset_connector for older vllm" This reverts commit 07f176519fffc937dae0354ff5ce5d0d75fe4cd3. 
--- .../rollout/vllm_rollout/vllm_async_server.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 5390b197ce5..dadd9a4243d 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -54,9 +54,6 @@ ) _VLLM_VERSION = version.parse(vllm.__version__) -_RESET_PREFIX_CACHE_KWARGS = {} -if _VLLM_VERSION >= version.parse("0.22.0"): - _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True if _VLLM_VERSION > version.parse("0.11.0"): @@ -610,11 +607,15 @@ async def wake_up(self, tags: list[str] | None = None): # processes across all DP shards (unlike collective_rpc which only reaches # TP workers within a single shard). await self.engine.wake_up(tags=tags or self._get_wake_up_tags()) - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + # reset_connector=True drops any attached external KV store + # (e.g. MooncakeStoreConnector) whose entries were computed + # against the previous weights. No-op success when no connector + # is configured (vLLM scheduler treats it as such). + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -631,7 +632,11 @@ async def sleep(self): async def clear_kv_cache(self): if self.node_rank == 0: - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + # reset_connector=True drops any attached external KV store + # (e.g. 
MooncakeStoreConnector) whose entries were computed + # against the previous model weights. With no connector it + # is a no-op success, so we can pass it unconditionally. + await self.engine.reset_prefix_cache(reset_connector=True) async def release_kv_cache(self): """Release only kv_cache GPU memory, keeping model weights intact. @@ -693,9 +698,11 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 2. Abort all in-flight requests # 3. Wait for requests to drain # 4. Clear prefix and mm caches if clear_cache=True. - # On vLLM >= 0.22.0, attached external KV stores (e.g. - # MooncakeStoreConnector) are reset along with the local - # prefix cache. + # EngineCore._reset_caches defaults reset_connector=True + # on this path, so any attached external KV store (e.g. + # MooncakeStoreConnector) is invalidated along with the + # local prefix cache — RL-correct hard-reset at every + # weight update boundary, no extra kwargs needed. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache, From e93c8f669faf48fb9250cbbebeda968e886c76b9 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Wed, 27 May 2026 13:23:43 +0000 Subject: [PATCH 14/15] fix(vllm): guard reset_connector for vllm 0.13 --- docs/perf/rollout_kv_offload.md | 2 ++ verl/workers/rollout/vllm_rollout/vllm_async_server.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index 507ffd1ad68..a6011cf46d1 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -1,5 +1,7 @@ # Rollout KV Cache Offload via Mooncake-Store +Last updated: 05/27/2026. 
+ Offload prefix KV blocks from the vLLM rollout engine to a shared [Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index dadd9a4243d..d4194e1ce55 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -54,6 +54,9 @@ ) _VLLM_VERSION = version.parse(vllm.__version__) +_RESET_PREFIX_CACHE_KWARGS = {} +if _VLLM_VERSION >= version.parse("0.13.0"): + _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True if _VLLM_VERSION > version.parse("0.11.0"): @@ -607,7 +610,7 @@ async def wake_up(self, tags: list[str] | None = None): # processes across all DP shards (unlike collective_rpc which only reaches # TP workers within a single shard). await self.engine.wake_up(tags=tags or self._get_wake_up_tags()) - await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) @@ -615,7 +618,7 @@ async def wake_up(self, tags: list[str] | None = None): # (e.g. MooncakeStoreConnector) whose entries were computed # against the previous weights. No-op success when no connector # is configured (vLLM scheduler treats it as such). - await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -636,7 +639,7 @@ async def clear_kv_cache(self): # (e.g. MooncakeStoreConnector) whose entries were computed # against the previous model weights. 
With no connector it # is a no-op success, so we can pass it unconditionally. - await self.engine.reset_prefix_cache(reset_connector=True) + await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) async def release_kv_cache(self): """Release only kv_cache GPU memory, keeping model weights intact. From 00346addb08e8873c88d346b717f6a1f211f6856 Mon Sep 17 00:00:00 2001 From: aoshen02 Date: Wed, 27 May 2026 13:38:50 +0000 Subject: [PATCH 15/15] docs: document vllm version requirement for kv offload --- docs/perf/rollout_kv_offload.md | 3 +++ verl/workers/rollout/vllm_rollout/vllm_async_server.py | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md index a6011cf46d1..4dd1c6b1ddb 100644 --- a/docs/perf/rollout_kv_offload.md +++ b/docs/perf/rollout_kv_offload.md @@ -11,6 +11,9 @@ prefix KV reduces the re-prefill cost. ## Setup Mooncake + vLLM +Use vLLM 0.22 or newer; earlier vLLM versions do not provide the full +MooncakeStoreConnector hard-reset behavior required by this integration. + Follow vLLM's official guide for installing the Mooncake client, starting a master, and writing the JSON config: **** diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index d4194e1ce55..dadd9a4243d 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -54,9 +54,6 @@ ) _VLLM_VERSION = version.parse(vllm.__version__) -_RESET_PREFIX_CACHE_KWARGS = {} -if _VLLM_VERSION >= version.parse("0.13.0"): - _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True if _VLLM_VERSION > version.parse("0.11.0"): @@ -610,7 +607,7 @@ async def wake_up(self, tags: list[str] | None = None): # processes across all DP shards (unlike collective_rpc which only reaches # TP workers within a single shard). 
await self.engine.wake_up(tags=tags or self._get_wake_up_tags()) - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) @@ -618,7 +615,7 @@ async def wake_up(self, tags: list[str] | None = None): # (e.g. MooncakeStoreConnector) whose entries were computed # against the previous weights. No-op success when no connector # is configured (vLLM scheduler treats it as such). - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -639,7 +636,7 @@ async def clear_kv_cache(self): # (e.g. MooncakeStoreConnector) whose entries were computed # against the previous model weights. With no connector it # is a no-op success, so we can pass it unconditionally. - await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS) + await self.engine.reset_prefix_cache(reset_connector=True) async def release_kv_cache(self): """Release only kv_cache GPU memory, keeping model weights intact.