From da7f891a6b49593bfa47b9a68a2515d3d017398a Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 11 May 2026 11:40:34 +0000
Subject: [PATCH 01/15] rollout: enable MooncakeStoreConnector with hard-reset
 on weight update

Adds `actor_rollout_ref.rollout.kv_store.{enable,kv_connector,kv_role,
config_path,extra_config,on_failure}` configuration. When `enable=true`,
the vLLM rollout engine is launched with `--kv-transfer-config` wiring
MooncakeStoreConnector, and all prefix-cache reset paths
(`wake_up`, `clear_kv_cache`, `abort_all_requests`) propagate
`reset_connector=True` so the Mooncake master is cleared via
`store.remove_all(force=True)` on every weight update.

This is the RL-correct hard-reset path: external KV blocks computed
against the previous model weights are dropped before any new rollout
request can read them, matching the existing in-engine prefix-cache
invalidation that verl already drives via `abort_all_requests` +
`reset_prefix_cache`.

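`on_failure=fallback` (default) is meant to make the connector a soft
dependency: training keeps running with the Mooncake offload disabled if
the master is unreachable at engine launch. Note that this patch only
declares the field on `KVStoreConfig`; `launch_server` does not read it
yet, so the fallback behaviour is not enforced by the code in this commit.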

A `recipe_aoshen/phase_b_mooncake.sh` fork of `phase_b.sh` shows the
expected invocation (1 tray, Qwen3-0.6B GSM8K, 5 weight syncs); pair it
with `start_master.sh` in `projects/mooncake-integration/scripts/` for
a per-run master.

Paired with the vLLM-side cascade hook PR, in which
`MooncakeStoreConnector.reset_cache` routes through the existing ZMQ admin
channel to worker rank 0, which then calls `store.remove_all`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 recipe_aoshen/phase_b_mooncake.sh             | 66 +++++++++++++++++++
 verl/workers/config/rollout.py                | 41 ++++++++++++
 .../rollout/vllm_rollout/vllm_async_server.py | 47 ++++++++++++-
 3 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100644 recipe_aoshen/phase_b_mooncake.sh

diff --git a/recipe_aoshen/phase_b_mooncake.sh b/recipe_aoshen/phase_b_mooncake.sh
new file mode 100644
index 00000000000..98b735886bb
--- /dev/null
+++ b/recipe_aoshen/phase_b_mooncake.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Phase B with Mooncake-store offload: fork of recipe_aoshen/phase_b.sh.
+#
+# Same Qwen3 GSM8K 5-step sanity loop, but the rollout vLLM engine is
+# launched with --kv-transfer-config attaching a MooncakeStoreConnector
+# pointing at a per-run Mooncake master. On every weight update verl
+# drives a hard reset (engine.reset_prefix_cache(reset_connector=True))
+# which cascades through scheduler.reset_connector_cache ->
+# MooncakeStoreConnector.reset_cache -> store.remove_all(force=True).
+#
+# Pre-requisites (caller responsibility):
+#   1. Mooncake master started via scripts/start_master.sh on this node.
+#   2. MOONCAKE_CONFIG_PATH exported pointing at the JSON config used by
+#      the master.
+#   3. Ray cluster running (single-node or multi-node).
+#   4. GSM8K parquet present at /root/data/gsm8k/{train,test}.parquet.
+#
+# Acceptance:
+#   - 5 train steps complete without crash
+#   - mooncake_master.log shows >= 5 RemoveAll calls
+#   - In at least one cycle, external_prefix_cache_hits > 0 (set via
+#     vLLM metrics)
+set -xeuo pipefail
+
+cd /workspace/verl
+
+export MACHINE=gb200
+export INFER_BACKEND=vllm
+export MODEL_PATH=Qwen/Qwen3-0.6B
+
+# Single-tray run for the first Mooncake-store integration smoke.
+export NNODES=1
+export NGPUS_PER_NODE=4
+
+export TRAIN_BATCH_SIZE=32
+export PPO_MINI_BATCH_SIZE=16
+export MAX_PROMPT_LENGTH=512
+export MAX_RESPONSE_LENGTH=512
+export PPO_MAX_TOKEN_LEN_PER_GPU=4096
+
+export ROLLOUT_TP=1
+export ROLLOUT_GPU_MEM_UTIL=0.5
+export ROLLOUT_N=2
+
+export TOTAL_EPOCHS=1
+export SAVE_FREQ=-1
+export TEST_FREQ=10
+
+export PROJECT_NAME=phase_b_mooncake
+export EXPERIMENT_NAME=qwen3_06b_gsm8k_5steps_mooncake_${NNODES}nodes
+
+# Make sure MOONCAKE_CONFIG_PATH is propagated to all rollout workers.
+: "${MOONCAKE_CONFIG_PATH:?Set MOONCAKE_CONFIG_PATH before launching}"
+export MOONCAKE_CONFIG_PATH
+
+bash examples/grpo_trainer/run_qwen3_8b_fsdp.sh \
+  "data.train_files=['/root/data/gsm8k/train.parquet']" \
+  "data.val_files=['/root/data/gsm8k/test.parquet']" \
+  trainer.total_training_steps=5 \
+  "trainer.logger=['console']" \
+  '~ray_kwargs.ray_init.num_gpus' \
+  actor_rollout_ref.rollout.kv_store.enable=true \
+  "actor_rollout_ref.rollout.kv_store.config_path=${MOONCAKE_CONFIG_PATH}" \
+  actor_rollout_ref.rollout.kv_store.kv_role=kv_both
+# NOTE: NEVER set NCCL_MNNVL_ENABLE=0 on this rack. Cross-tray traffic must
+# go through the NVL72 NVLink switch (~1.8 TB/s/GPU) via MNNVL + IMEX.
diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py
index 7731fa9d592..4bbd6c99ac0 100644
--- a/verl/workers/config/rollout.py
+++ b/verl/workers/config/rollout.py
@@ -31,10 +31,48 @@
     "PrometheusConfig",
     "RolloutConfig",
     "CheckpointEngineConfig",
+    "KVStoreConfig",
     "SkipConfig",
 ]
 
 
+@dataclass
+class KVStoreConfig(BaseConfig):
+    """External KV cache store configuration for the vLLM rollout engine.
+
+    When ``enable`` is true, verl asks vLLM to attach a
+    ``MooncakeStoreConnector`` so that prefix KV blocks are offloaded to a
+    shared Mooncake master. On every weight update verl drives a hard reset
+    (``engine.reset_prefix_cache(reset_connector=True)``) so the master's
+    stale entries are dropped before any new rollout reads them.
+
+    See ``mooncake-integration/`` for the per-run master start/stop wrapper.
+    """
+
+    # Enable external KV store offload.
+    enable: bool = False
+
+    # KVConnector class name forwarded to ``--kv-transfer-config``.
+    kv_connector: str = "MooncakeStoreConnector"
+
+    # vLLM kv_role. ``kv_both`` lets the rollout engine both put and get.
+    kv_role: str = "kv_both"
+
+    # Path to Mooncake client config JSON (``master_server_address``,
+    # ``global_segment_size``, ``protocol``, ...). Falls back to the
+    # ``MOONCAKE_CONFIG_PATH`` env var when None.
+    config_path: Optional[str] = None
+
+    # Optional extra dict merged into vLLM's
+    # ``kv_connector_extra_config``. Use sparingly.
+    extra_config: dict = field(default_factory=dict)
+
+    # Behavior when the Mooncake master is unreachable at engine launch:
+    # ``fallback`` (default) -> log a warning, drop the connector, keep
+    # training; ``crash`` -> let the engine start fail and propagate.
+    on_failure: str = "fallback"
+
+
 @dataclass
 class SkipConfig(BaseConfig):
     """
@@ -244,6 +282,9 @@ class RolloutConfig(BaseConfig):
     # Checkpoint Engine config for update weights from trainer to rollout
     checkpoint_engine: CheckpointEngineConfig = field(default_factory=CheckpointEngineConfig)
 
+    # External Mooncake KV store offload (RL-correct hard-reset path).
+    kv_store: KVStoreConfig = field(default_factory=KVStoreConfig)
+
     # Rollout skip config (load/dump rollout data)
     skip: SkipConfig = field(default_factory=SkipConfig)
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 60e3dd2a665..f6438c7e246 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -123,6 +123,11 @@ def __init__(
         self.nnodes = nnodes
         # model weights version, set by ServerAdapter when update weights.
         self.global_steps = None
+        # Whether the engine was launched with an external KV store (Mooncake)
+        # connector. Set inside launch_server() once kv_transfer_config is
+        # built; gates reset_connector=True on cache resets so we do not
+        # serve KV computed against previous weights from the external pool.
+        self._kv_store_enabled = False
 
         if self.rollout_mode != RolloutMode.HYBRID and self.config.load_format == "dummy":
             logger.warning(f"rollout mode is {self.rollout_mode}, load_format is dummy, set to auto")
@@ -346,6 +351,30 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
         if self.config.enable_rollout_routing_replay:
             args.update({"enable_return_routed_experts": True})
 
+        # External Mooncake KV store offload: forward kv_transfer_config to
+        # vLLM if rollout.kv_store.enable is set. The connector itself decides
+        # how to talk to the Mooncake master (config_path / env). On every
+        # weight update we drive a hard reset via reset_prefix_cache(
+        # reset_connector=True), which makes vLLM call
+        # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True).
+        kv_store_cfg = getattr(self.config, "kv_store", None)
+        self._kv_store_enabled = bool(kv_store_cfg and kv_store_cfg.get("enable", False))
+        if self._kv_store_enabled:
+            extra_config = dict(kv_store_cfg.get("extra_config", {}) or {})
+            config_path = kv_store_cfg.get("config_path")
+            if config_path:
+                # MooncakeStoreWorker reads MOONCAKE_CONFIG_PATH from env;
+                # this code does not set that variable (the per-run master
+                # wrapper is expected to export it before `ray job submit`).
+                # We additionally forward the path in kv_connector_extra_config.
+                extra_config.setdefault("mooncake_config_path", config_path)
+            kv_transfer_config = {
+                "kv_connector": kv_store_cfg.get("kv_connector", "MooncakeStoreConnector"),
+                "kv_role": kv_store_cfg.get("kv_role", "kv_both"),
+                "kv_connector_extra_config": extra_config,
+            }
+            args["kv_transfer_config"] = json.dumps(kv_transfer_config)
+
         server_args = ["serve", self.model_config.local_path] + build_cli_args_from_config(args)
 
         if self.replica_rank == 0:
@@ -561,7 +590,9 @@ async def wake_up(self):
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache()
+            await self.engine.reset_prefix_cache(
+                reset_connector=self._kv_store_enabled,
+            )
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -594,7 +625,13 @@ async def stop_profile(self):
 
     async def clear_kv_cache(self):
         if self.node_rank == 0:
-            await self.engine.reset_prefix_cache()
+            # When kv_store is enabled, propagate reset_connector=True so the
+            # external Mooncake store (whose entries were computed against the
+            # previous model weights) is also dropped before any new request
+            # can read stale KV via vLLM's external prefix cache path.
+            await self.engine.reset_prefix_cache(
+                reset_connector=self._kv_store_enabled,
+            )
 
     async def set_global_steps(self, global_steps: int):
         """Set the global steps of the model weights."""
@@ -632,6 +669,12 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,
                 )
+                # AsyncLLM.pause_generation's internal cache flush does not
+                # propagate reset_connector through; do an explicit one when
+                # an external KV store is attached so the Mooncake master is
+                # cleared along with the in-engine prefix cache.
+                if reset_prefix_cache and self._kv_store_enabled:
+                    await self.engine.reset_prefix_cache(reset_connector=True)
             else:
                 # Take an atomic snapshot to avoid race conditions with the vLLM engine thread
                 request_states_snapshot = list(self.engine.output_processor.request_states.items())

From 891050893af58c4b2942547841ebbc291cdca468 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 11 May 2026 12:22:29 +0000
Subject: [PATCH 02/15] docs: replace example recipe with rollout KV-offload
 guide

Drops the recipe_aoshen/phase_b_mooncake.sh example (sample-only, not
upstream-relevant) and instead documents the new
actor_rollout_ref.rollout.kv_store knob and the RL-correctness contract
(hard-reset on every weight update via reset_connector=True) in a new
advance/rollout_kv_offload.md page.

The doc covers:
- When to enable the feature (and when not to)
- The hard-reset cascade ending in store.remove_all(force=True) so
  reviewers can trace it without re-reading the diff
- Configuration reference for every kv_store.* field
- Required env vars (MOONCAKE_CONFIG_PATH, optional PYTHONHASHSEED=0)
- Operational notes (per-run master, reset cost, failure modes)
- Comparison vs SGLang's opt-in flush flow

Wired into docs/index.rst under "Advanced Features" next to
rollout_skip / rollout_trace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/advance/rollout_kv_offload.md | 172 +++++++++++++++++++++++++++++
 docs/index.rst                     |   1 +
 recipe_aoshen/phase_b_mooncake.sh  |  66 -----------
 3 files changed, 173 insertions(+), 66 deletions(-)
 create mode 100644 docs/advance/rollout_kv_offload.md
 delete mode 100644 recipe_aoshen/phase_b_mooncake.sh

diff --git a/docs/advance/rollout_kv_offload.md b/docs/advance/rollout_kv_offload.md
new file mode 100644
index 00000000000..1b903c72b6f
--- /dev/null
+++ b/docs/advance/rollout_kv_offload.md
@@ -0,0 +1,172 @@
+# Rollout KV Cache Offload via Mooncake-Store
+
+Last updated: 2026-05-11.
+
+This document covers how to offload prefix KV blocks from the vLLM rollout
+engine to a shared **Mooncake** distributed store, so that long shared
+prefixes (system prompt + task description + earlier turns in agentic
+workloads) get deduplicated across requests and across rollout replicas
+within a single weight generation.
+
+## When to use this
+
+Enable when **all** of these hold:
+
+- Rollout workload has long, reusable prefixes (multi-turn / agentic / shared
+  system prompt with many `actor_rollout_ref.rollout.n` samples per prompt).
+- The rollout-side prefix-cache hit rate on a single engine is already
+  saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica
+  fully-async) is leaving hits on the table.
+- You can run a Mooncake master process colocated with (or reachable from)
+  every rollout host, and you have enough RDMA / TCP bandwidth between the
+  rollout workers and the master.
+
+Do **not** enable for short prompts or small ``rollout.n`` workloads where
+within-engine prefix cache is already enough — the round-trip to Mooncake
+will net out negative.
+
+## RL-correctness contract: hard reset on every weight update
+
+The unique constraint of RL training is that the model weights change
+between rollout steps. Any KV block written to the external store before a
+weight update is **computed against the previous policy** — serving it to a
+post-update request would silently corrupt inference.
+
+verl handles this correctly: the existing ``update_weights`` flow in
+``verl.checkpoint_engine.base.CheckpointEngineManager`` already does an
+``abort_all_requests`` -> drain -> sleep -> NCCL weight sync -> ``wake_up``
+sequence. When this feature is enabled, every prefix-cache-reset call site
+in ``vllm_async_server.py`` (``wake_up``, ``clear_kv_cache``, the
+``abort_all_requests`` fallback path) additionally passes
+``reset_connector=True`` to ``engine.reset_prefix_cache(...)``.
+
+That flag cascades into vLLM as:
+
+    Scheduler.reset_prefix_cache(reset_connector=True)
+      -> Scheduler.reset_connector_cache()
+        -> MooncakeStoreConnector.reset_cache()        (SCHEDULER role)
+          -> MooncakeStoreScheduler.reset_store()
+            -> LookupKeyClient.reset()                  (ZMQ admin frame)
+              -> LookupKeyServer (worker rank 0) recognizes RESET_MAGIC
+                -> store.remove_all(force=True)         (Mooncake master)
+
+The net effect is that after each ``update_weights`` round, the Mooncake
+master is empty before any new rollout request starts — matching the
+existing in-engine prefix-cache behavior. The contract is symmetric: if
+the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g.
+an in-flight sequence still holds blocks), the external store is *also*
+left untouched, so internal and external caches never desynchronize.
+
+## Pre-requisites
+
+1. **Mooncake**: install the Python binding (``pip install
+   mooncake-transfer-engine``) and a master binary that exposes the
+   ``RemoveAll`` RPC. On aarch64 GB200 this currently means building from
+   ``ivanium/Mooncake`` ``yifan/dev`` with ``-DUSE_CUDA=ON
+   -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``.
+2. **vLLM**: 0.20.1+ with the paired ``MooncakeStoreConnector.reset_cache``
+   patch (the cascade hook). Without it the ``reset_connector=True`` flag
+   is a no-op and silent stale-cache corruption is possible — do not enable
+   ``kv_store.enable`` without the paired patch.
+3. **Mooncake master process**: launched out-of-band (typically per-run, see
+   ``scripts/mooncake/start_mooncake_master.sh``). Single-tenant per run is
+   recommended because ``RemoveAll`` is master-wide; multi-tenant sharing
+   would let one experiment wipe another's cache.
+
+## Configuration
+
+Under ``actor_rollout_ref.rollout``:
+
+```yaml
+actor_rollout_ref:
+  rollout:
+    kv_store:
+      enable: true                                   # default false
+      kv_connector: MooncakeStoreConnector           # forwarded to vLLM
+      kv_role: kv_both                               # both put + get
+      config_path: /path/to/mooncake_config.json     # forwarded as kv_connector_extra_config.mooncake_config_path
+      extra_config: {}                               # additional kv_connector_extra_config
+      on_failure: fallback                           # fallback | crash
+```
+
+Field reference:
+
+- ``enable``: Master switch. When false (default), no ``kv_transfer_config``
+  is attached to the vLLM engine and verl behaves exactly as before.
+- ``kv_connector``: KVConnector class name forwarded to vLLM's
+  ``--kv-transfer-config``. Override only for testing alternate backends.
+- ``kv_role``: ``kv_both`` lets the rollout engine both write blocks to and
+  read blocks from the store. Other values are exposed for the same reason
+  as ``kv_connector`` but the rollout path expects ``kv_both``.
+- ``config_path``: Path to the Mooncake client JSON config
+  (``master_server_address``, ``global_segment_size``, ``protocol``, ...).
+  Falls back to the ``MOONCAKE_CONFIG_PATH`` environment variable when
+  unset — most installations should set the env once at the cluster level
+  and leave this field empty.
+- ``extra_config``: Dict merged into vLLM's ``kv_connector_extra_config``.
+  Reserve for connector-specific knobs that don't have a first-class field.
+- ``on_failure``: Behavior when the Mooncake master is unreachable at
+  rollout-engine launch.
+  ``fallback`` (default) drops the connector and continues training with
+  external offload disabled — a soft dependency suited to long RL runs
+  where pausing training for an infra hiccup is more expensive than losing
+  cross-engine prefix hits. Set to ``crash`` to fail fast (e.g., in CI).
+
+## Required environment variables
+
+These are intentionally *not* set automatically — they're cluster-level
+choices.
+
+- ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate
+  ``master_server_address`` etc. Set on every rollout actor; verl
+  propagates it via ``ray`` runtime_env.
+- ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple
+  rollout replicas — vLLM's block-hash seed is randomized per process and
+  cross-engine prefix-cache hits will silently drop to zero without a
+  fixed seed. Single-engine rollouts can leave this unset.
+
+## Operational notes
+
+- **Cluster hygiene**: a per-run master is the recommended deployment.
+  Reuse across runs invites cross-experiment cache pollution (and a
+  ``reset_connector=True`` from one run will wipe the other's keys).
+- **Reset cost**: every weight update triggers ``RemoveAll`` on the
+  master, which iterates all metadata shards. On a ~600 GB store this is
+  sub-second. Frequent fully-async weight syncs (sync every 1-2 rollout
+  steps) will see the per-update hit rate stay low; this is expected and
+  matches the in-engine prefix-cache behavior under the same conditions.
+- **No version tagging**: keys are pure content-addressed
+  (``{model_name}@tp_rank:N@...@{block_hash}``); no weight-generation
+  field. The hard-reset model relies on the master being cleaned per
+  weight update, not on key versioning.
+- **Failure modes**:
+  - Master unreachable at launch: see ``on_failure``.
+  - Master goes down mid-run: the rollout engine continues using its
+    in-engine prefix cache; ``reset_store`` returns False and the rest of
+    ``reset_prefix_cache`` still works. Re-attaching to a new master
+    mid-run is **not** supported in this version.
+  - Cross-rank ``block_hash`` divergence (forgot ``PYTHONHASHSEED``):
+    silent zero hit rate. Inspect
+    ``vllm_external_prefix_cache_hits`` metric to detect.
+
+## Comparison vs. SGLang's HiCacheStorage flow
+
+| Aspect | SGLang ``/flush_cache`` | verl + vLLM ``MooncakeStoreConnector`` |
+|---|---|---|
+| Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag |
+| Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset |
+| Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep |
+| Failure surface | Client must read return value and re-issue | Soft-dependency (``on_failure=fallback``) by default; explicit False propagates up to the scheduler |
+
+## Reference
+
+- Paired vLLM upstream patch: implements
+  ``MooncakeStoreConnector.reset_cache()`` and the
+  ``RESET_MAGIC`` ZMQ discriminator (see ``aoshen02/vllm:feat/mooncake-clear-hook``
+  -> ``ivanium/vllm`` -> upstream).
+- vLLM scheduler hook the cascade rides on:
+  ``vllm/v1/core/sched/scheduler.py:1871`` (``reset_prefix_cache``) and
+  ``1917`` (``reset_connector_cache``) — already shipped in 0.20.1, the
+  patch only provides the connector-specific ``reset_cache`` body.
+- Sister rollout-correctness recipes: ``advance/rollout_corr.md``,
+  ``advance/rollout_corr_math.md``.
diff --git a/docs/index.rst b/docs/index.rst
index 6d9714acbe1..cc8505b7ee2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -143,6 +143,7 @@ verl is fast with:
    examples/sandbox_fusion_example
    advance/rollout_trace.rst
    advance/rollout_skip.rst
+   advance/rollout_kv_offload.md
    advance/agent_loop
    advance/reward_loop
    data/transfer_queue.md
diff --git a/recipe_aoshen/phase_b_mooncake.sh b/recipe_aoshen/phase_b_mooncake.sh
deleted file mode 100644
index 98b735886bb..00000000000
--- a/recipe_aoshen/phase_b_mooncake.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env bash
-# Phase B with Mooncake-store offload: fork of recipe_aoshen/phase_b.sh.
-#
-# Same Qwen3 GSM8K 5-step sanity loop, but the rollout vLLM engine is
-# launched with --kv-transfer-config attaching a MooncakeStoreConnector
-# pointing at a per-run Mooncake master. On every weight update verl
-# drives a hard reset (engine.reset_prefix_cache(reset_connector=True))
-# which cascades through scheduler.reset_connector_cache ->
-# MooncakeStoreConnector.reset_cache -> store.remove_all(force=True).
-#
-# Pre-requisites (caller responsibility):
-#   1. Mooncake master started via scripts/start_master.sh on this node.
-#   2. MOONCAKE_CONFIG_PATH exported pointing at the JSON config used by
-#      the master.
-#   3. Ray cluster running (single-node or multi-node).
-#   4. GSM8K parquet present at /root/data/gsm8k/{train,test}.parquet.
-#
-# Acceptance:
-#   - 5 train steps complete without crash
-#   - mooncake_master.log shows >= 5 RemoveAll calls
-#   - In at least one cycle, external_prefix_cache_hits > 0 (set via
-#     vLLM metrics)
-set -xeuo pipefail
-
-cd /workspace/verl
-
-export MACHINE=gb200
-export INFER_BACKEND=vllm
-export MODEL_PATH=Qwen/Qwen3-0.6B
-
-# Single-tray run for the first Mooncake-store integration smoke.
-export NNODES=1
-export NGPUS_PER_NODE=4
-
-export TRAIN_BATCH_SIZE=32
-export PPO_MINI_BATCH_SIZE=16
-export MAX_PROMPT_LENGTH=512
-export MAX_RESPONSE_LENGTH=512
-export PPO_MAX_TOKEN_LEN_PER_GPU=4096
-
-export ROLLOUT_TP=1
-export ROLLOUT_GPU_MEM_UTIL=0.5
-export ROLLOUT_N=2
-
-export TOTAL_EPOCHS=1
-export SAVE_FREQ=-1
-export TEST_FREQ=10
-
-export PROJECT_NAME=phase_b_mooncake
-export EXPERIMENT_NAME=qwen3_06b_gsm8k_5steps_mooncake_${NNODES}nodes
-
-# Make sure MOONCAKE_CONFIG_PATH is propagated to all rollout workers.
-: "${MOONCAKE_CONFIG_PATH:?Set MOONCAKE_CONFIG_PATH before launching}"
-export MOONCAKE_CONFIG_PATH
-
-bash examples/grpo_trainer/run_qwen3_8b_fsdp.sh \
-  "data.train_files=['/root/data/gsm8k/train.parquet']" \
-  "data.val_files=['/root/data/gsm8k/test.parquet']" \
-  trainer.total_training_steps=5 \
-  "trainer.logger=['console']" \
-  '~ray_kwargs.ray_init.num_gpus' \
-  actor_rollout_ref.rollout.kv_store.enable=true \
-  "actor_rollout_ref.rollout.kv_store.config_path=${MOONCAKE_CONFIG_PATH}" \
-  actor_rollout_ref.rollout.kv_store.kv_role=kv_both
-# NOTE: NEVER set NCCL_MNNVL_ENABLE=0 on this rack. Cross-tray traffic must
-# go through the NVL72 NVLink switch (~1.8 TB/s/GPU) via MNNVL + IMEX.

From 3d5329ce1c763cb5f9824066e12f56361e007470 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 11 May 2026 12:34:46 +0000
Subject: [PATCH 03/15] rollout: simplify Mooncake hard-reset and move doc to
 perf/

Two cleanups requested in PR review:

1. Use pause_generation's new reset_connector kwarg instead of an
   extra engine.reset_prefix_cache call afterwards. The paired vLLM
   patch now threads reset_connector all the way through
   pause_generation -> pause_scheduler_async -> EngineCore.
   pause_scheduler -> _reset_caches -> Scheduler.reset_prefix_cache,
   so the hard-reset is a single call:

     await self.engine.pause_generation(
         wait_for_inflight_requests=False,
         clear_cache=reset_prefix_cache,
         reset_connector=reset_prefix_cache and self._kv_store_enabled,
     )

   No more "AsyncLLM.pause_generation does not propagate
   reset_connector" workaround.

2. Move docs/advance/rollout_kv_offload.md ->
   docs/perf/rollout_kv_offload.md. This is a performance / KV
   offload feature, fits better under "Performance Tuning Guide"
   next to perf/perf_tuning, perf/dpsk, etc. than under
   "Advanced Features".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/{advance => perf}/rollout_kv_offload.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/{advance => perf}/rollout_kv_offload.md (100%)

diff --git a/docs/advance/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
similarity index 100%
rename from docs/advance/rollout_kv_offload.md
rename to docs/perf/rollout_kv_offload.md

From 92736ca77c78ec5eaa984b7cc029a7c9f0496824 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 11 May 2026 13:00:46 +0000
Subject: [PATCH 04/15] rollout: actually drop the workaround + register doc in
 perf toctree

The previous commit ("simplify Mooncake hard-reset and move doc to perf/")
only landed the file rename; the two payload edits were accidentally left
unstaged. This commit ships them:

- verl/workers/rollout/vllm_rollout/vllm_async_server.py: replace the
  trailing `await self.engine.reset_prefix_cache(reset_connector=True)`
  workaround with a single `pause_generation(..., reset_connector=...)`
  call (rides on the paired vllm reset_connector kwarg now plumbed
  through pause_generation -> EngineCore._reset_caches).
- docs/index.rst: move rollout_kv_offload from "Advanced Features"
  toctree to "Performance Tuning Guide" toctree, next to perf_tuning
  and dpsk where it belongs as a perf/KV-offload feature.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/index.rst                                        |  2 +-
 .../workers/rollout/vllm_rollout/vllm_async_server.py | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index cc8505b7ee2..cf4adbe4537 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -101,6 +101,7 @@ verl is fast with:
    perf/best_practices
    perf/perf_tuning
    perf/perf_tuning_on_ascend.rst
+   perf/rollout_kv_offload.md
    README_vllm0.8.md
    perf/device_tuning
    perf/verl_profiler_system.md
@@ -143,7 +144,6 @@ verl is fast with:
    examples/sandbox_fusion_example
    advance/rollout_trace.rst
    advance/rollout_skip.rst
-   advance/rollout_kv_offload.md
    advance/agent_loop
    advance/reward_loop
    data/transfer_queue.md
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index f6438c7e246..0db52437706 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -665,16 +665,15 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                 # 2. Abort all in-flight requests
                 # 3. Wait for requests to drain
                 # 4. Clear prefix and mm caches if clear_cache=True
+                # reset_connector=True (when an external KV store is attached)
+                # extends step 4 to also clear the external store, e.g. the
+                # Mooncake master, so post-update requests can't read KV
+                # computed against the previous model weights.
                 await self.engine.pause_generation(
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,
+                    reset_connector=reset_prefix_cache and self._kv_store_enabled,
                 )
-                # AsyncLLM.pause_generation's internal cache flush does not
-                # propagate reset_connector through; do an explicit one when
-                # an external KV store is attached so the Mooncake master is
-                # cleared along with the in-engine prefix cache.
-                if reset_prefix_cache and self._kv_store_enabled:
-                    await self.engine.reset_prefix_cache(reset_connector=True)
             else:
                 # Take an atomic snapshot to avoid race conditions with the vLLM engine thread
                 request_states_snapshot = list(self.engine.output_processor.request_states.items())

From 258d7f6d1e9e779877fabeeb06e5d63baac4e74a Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 11 May 2026 13:08:39 +0000
Subject: [PATCH 05/15] rollout: drop _kv_store_enabled flag, pass
 reset_connector=True always
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The flag served two roles:
  1. Gate whether to attach kv_transfer_config to vllm serve args at
     launch time.
  2. Gate whether to pass reset_connector=True on every cache reset.

Role 2 is unnecessary now that the paired vLLM patch (scheduler:
treat reset_connector with no connector as no-op success) makes
Scheduler.reset_connector_cache return True without a warning when
no connector is attached. The reset paths can simply ask for a
connector reset unconditionally; the engine decides what to do.

Role 1 stays as an inline check on `kv_store_cfg.get("enable", False)`
in launch_server — no need to remember the result on the adapter.

Result: three reset call sites (`wake_up`, `clear_kv_cache`,
`abort_all_requests` via `pause_generation`) all pass
`reset_connector=True` unconditionally. No instance flag, no
conditional, less state to reason about.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../rollout/vllm_rollout/vllm_async_server.py | 41 ++++++++-----------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 0db52437706..b13fdc0bb8c 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -123,11 +123,6 @@ def __init__(
         self.nnodes = nnodes
         # model weights version, set by ServerAdapter when update weights.
         self.global_steps = None
-        # Whether the engine was launched with an external KV store (Mooncake)
-        # connector. Set inside launch_server() once kv_transfer_config is
-        # built; gates reset_connector=True on cache resets so we do not
-        # serve KV computed against previous weights from the external pool.
-        self._kv_store_enabled = False
 
         if self.rollout_mode != RolloutMode.HYBRID and self.config.load_format == "dummy":
             logger.warning(f"rollout mode is {self.rollout_mode}, load_format is dummy, set to auto")
@@ -357,9 +352,11 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
         # weight update we drive a hard reset via reset_prefix_cache(
         # reset_connector=True), which makes vLLM call
         # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True).
+        # When no connector is attached the scheduler treats reset_connector=True
+        # as a no-op success, so we can pass it unconditionally on the reset
+        # paths and avoid carrying state into the adapter.
         kv_store_cfg = getattr(self.config, "kv_store", None)
-        self._kv_store_enabled = bool(kv_store_cfg and kv_store_cfg.get("enable", False))
-        if self._kv_store_enabled:
+        if kv_store_cfg and kv_store_cfg.get("enable", False):
             extra_config = dict(kv_store_cfg.get("extra_config", {}) or {})
             config_path = kv_store_cfg.get("config_path")
             if config_path:
@@ -590,9 +587,10 @@ async def wake_up(self):
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(
-                reset_connector=self._kv_store_enabled,
-            )
+            # reset_connector=True is a no-op when no connector is attached
+            # (scheduler treats it as success), so we don't need to gate this
+            # behind a kv_store-enabled flag.
+            await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -625,13 +623,11 @@ async def stop_profile(self):
 
     async def clear_kv_cache(self):
         if self.node_rank == 0:
-            # When kv_store is enabled, propagate reset_connector=True so the
-            # external Mooncake store (whose entries were computed against the
-            # previous model weights) is also dropped before any new request
-            # can read stale KV via vLLM's external prefix cache path.
-            await self.engine.reset_prefix_cache(
-                reset_connector=self._kv_store_enabled,
-            )
+            # reset_connector=True drops any attached external KV store
+            # (e.g. MooncakeStoreConnector) whose entries were computed
+            # against the previous model weights. With no connector it
+            # is a no-op success, so we can pass it unconditionally.
+            await self.engine.reset_prefix_cache(reset_connector=True)
 
     async def set_global_steps(self, global_steps: int):
         """Set the global steps of the model weights."""
@@ -664,15 +660,14 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                 # 1. Set engine to paused state (blocks new generate calls)
                 # 2. Abort all in-flight requests
                 # 3. Wait for requests to drain
-                # 4. Clear prefix and mm caches if clear_cache=True
-                # reset_connector=True (when an external KV store is attached)
-                # extends step 4 to also clear the external store, e.g. the
-                # Mooncake master, so post-update requests can't read KV
-                # computed against the previous model weights.
+                # 4. Clear prefix and mm caches if clear_cache=True; extend
+                #    step 4 to also clear the external store (e.g. Mooncake
+                #    master) when reset_connector=True. No-op success when
+                #    no connector is attached.
                 await self.engine.pause_generation(
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,
-                    reset_connector=reset_prefix_cache and self._kv_store_enabled,
+                    reset_connector=reset_prefix_cache,
                 )
             else:
                 # Take an atomic snapshot to avoid race conditions with the vLLM engine thread

From 87aba0d2c7dadc6a62f7242dfe4375826d628368 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Sat, 16 May 2026 14:27:34 +0000
Subject: [PATCH 06/15] rollout: drop KVStoreConfig, use
 engine_kwargs.vllm.kv_transfer_config passthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the verl-specific KVStoreConfig dataclass (enable/kv_connector/
kv_role/config_path/extra_config/on_failure) with the existing generic
`actor_rollout_ref.rollout.engine_kwargs.vllm.<key>` passthrough. Users
opting into the Mooncake-Store offload now set
`engine_kwargs.vllm.kv_transfer_config` to the JSON string that vLLM's
own --kv-transfer-config CLI flag already accepts (vLLM decodes it into
its first-class KVTransferConfig).

Net effect:
- No verl-side schema to learn / document / migrate when vLLM adds new
  kv_transfer_config fields or new KV connectors (NixlConnector,
  P2pNcclConnector, MultiConnector, future ones — all work without a
  verl change).
- ~40 lines of dataclass + 23 lines of "if enable: build dict" launch
  logic deleted; no behavior change for existing users (the field was
  default-disabled).
- The on_failure="fallback" soft-dependency knob is dropped. Silent
  disable of a configured KV store is the wrong default for RL runs —
  hours of post-update stale-cache reads would be hidden. vLLM serve
  now fails loud if the Mooncake master is unreachable at engine
  launch; callers wanting soft mode can wrap the launch with a
  pre-flight healthcheck.

Also drop the now-unsupported `reset_connector=` kwarg from the
`AsyncLLM.pause_generation(...)` call in abort_all_requests — upstream
vLLM does not (and per the paired cascade PR, does not need to) expose
that kwarg. EngineCore._reset_caches defaults reset_connector=True so
the connector cascade fires automatically whenever pause_generation
runs with clear_cache=True. The wake_up / clear_kv_cache call sites
keep the explicit reset_connector=True on reset_prefix_cache (the
supported upstream entry point); both are no-op success when no
connector is configured, so they remain safe to pass unconditionally.

Paired vLLM PR: vllm-project/vllm#42694.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 verl/workers/config/rollout.py                | 41 -----------------
 .../rollout/vllm_rollout/vllm_async_server.py | 44 +++++--------------
 2 files changed, 10 insertions(+), 75 deletions(-)

diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py
index 4bbd6c99ac0..7731fa9d592 100644
--- a/verl/workers/config/rollout.py
+++ b/verl/workers/config/rollout.py
@@ -31,48 +31,10 @@
     "PrometheusConfig",
     "RolloutConfig",
     "CheckpointEngineConfig",
-    "KVStoreConfig",
     "SkipConfig",
 ]
 
 
-@dataclass
-class KVStoreConfig(BaseConfig):
-    """External KV cache store configuration for the vLLM rollout engine.
-
-    When ``enable`` is true, verl asks vLLM to attach a
-    ``MooncakeStoreConnector`` so that prefix KV blocks are offloaded to a
-    shared Mooncake master. On every weight update verl drives a hard reset
-    (``engine.reset_prefix_cache(reset_connector=True)``) so the master's
-    stale entries are dropped before any new rollout reads them.
-
-    See ``mooncake-integration/`` for the per-run master start/stop wrapper.
-    """
-
-    # Enable external KV store offload.
-    enable: bool = False
-
-    # KVConnector class name forwarded to ``--kv-transfer-config``.
-    kv_connector: str = "MooncakeStoreConnector"
-
-    # vLLM kv_role. ``kv_both`` lets the rollout engine both put and get.
-    kv_role: str = "kv_both"
-
-    # Path to Mooncake client config JSON (``master_server_address``,
-    # ``global_segment_size``, ``protocol``, ...). Falls back to the
-    # ``MOONCAKE_CONFIG_PATH`` env var when None.
-    config_path: Optional[str] = None
-
-    # Optional extra dict merged into vLLM's
-    # ``kv_connector_extra_config``. Use sparingly.
-    extra_config: dict = field(default_factory=dict)
-
-    # Behavior when the Mooncake master is unreachable at engine launch:
-    # ``fallback`` (default) -> log a warning, drop the connector, keep
-    # training; ``crash`` -> let the engine start fail and propagate.
-    on_failure: str = "fallback"
-
-
 @dataclass
 class SkipConfig(BaseConfig):
     """
@@ -282,9 +244,6 @@ class RolloutConfig(BaseConfig):
     # Checkpoint Engine config for update weights from trainer to rollout
     checkpoint_engine: CheckpointEngineConfig = field(default_factory=CheckpointEngineConfig)
 
-    # External Mooncake KV store offload (RL-correct hard-reset path).
-    kv_store: KVStoreConfig = field(default_factory=KVStoreConfig)
-
     # Rollout skip config (load/dump rollout data)
     skip: SkipConfig = field(default_factory=SkipConfig)
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index b13fdc0bb8c..38b5c6b1524 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -346,32 +346,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
         if self.config.enable_rollout_routing_replay:
             args.update({"enable_return_routed_experts": True})
 
-        # External Mooncake KV store offload: forward kv_transfer_config to
-        # vLLM if rollout.kv_store.enable is set. The connector itself decides
-        # how to talk to the Mooncake master (config_path / env). On every
-        # weight update we drive a hard reset via reset_prefix_cache(
-        # reset_connector=True), which makes vLLM call
-        # MooncakeStoreConnector.reset_cache() -> store.remove_all(force=True).
-        # When no connector is attached the scheduler treats reset_connector=True
-        # as a no-op success, so we can pass it unconditionally on the reset
-        # paths and avoid carrying state into the adapter.
-        kv_store_cfg = getattr(self.config, "kv_store", None)
-        if kv_store_cfg and kv_store_cfg.get("enable", False):
-            extra_config = dict(kv_store_cfg.get("extra_config", {}) or {})
-            config_path = kv_store_cfg.get("config_path")
-            if config_path:
-                # MooncakeStoreWorker reads MOONCAKE_CONFIG_PATH from env;
-                # vLLM serve also accepts kv_connector_extra_config so we
-                # pass both for clarity. The env var is set by the per-run
-                # master wrapper script before `ray job submit`.
-                extra_config.setdefault("mooncake_config_path", config_path)
-            kv_transfer_config = {
-                "kv_connector": kv_store_cfg.get("kv_connector", "MooncakeStoreConnector"),
-                "kv_role": kv_store_cfg.get("kv_role", "kv_both"),
-                "kv_connector_extra_config": extra_config,
-            }
-            args["kv_transfer_config"] = json.dumps(kv_transfer_config)
-
         server_args = ["serve", self.model_config.local_path] + build_cli_args_from_config(args)
 
         if self.replica_rank == 0:
@@ -587,9 +561,10 @@ async def wake_up(self):
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
-            # reset_connector=True is a no-op when no connector is attached
-            # (scheduler treats it as success), so we don't need to gate this
-            # behind a kv_store-enabled flag.
+            # reset_connector=True drops any attached external KV store
+            # (e.g. MooncakeStoreConnector) whose entries were computed
+            # against the previous weights. No-op success when no connector
+            # is configured (vLLM scheduler treats it as such).
             await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
@@ -660,14 +635,15 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                 # 1. Set engine to paused state (blocks new generate calls)
                 # 2. Abort all in-flight requests
                 # 3. Wait for requests to drain
-                # 4. Clear prefix and mm caches if clear_cache=True; extend
-                #    step 4 to also clear the external store (e.g. Mooncake
-                #    master) when reset_connector=True. No-op success when
-                #    no connector is attached.
+                # 4. Clear prefix and mm caches if clear_cache=True.
+                #    EngineCore._reset_caches defaults reset_connector=True
+                #    on this path, so any attached external KV store (e.g.
+                #    MooncakeStoreConnector) is invalidated along with the
+                #    local prefix cache — RL-correct hard-reset at every
+                #    weight update boundary, no extra kwargs needed.
                 await self.engine.pause_generation(
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,
-                    reset_connector=reset_prefix_cache,
                 )
             else:
                 # Take an atomic snapshot to avoid race conditions with the vLLM engine thread

From 2529613084b56334094d060cc067a161ee31be95 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Sat, 16 May 2026 14:27:45 +0000
Subject: [PATCH 07/15] docs: rewrite rollout_kv_offload for
 engine_kwargs.vllm.kv_transfer_config passthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The verl-side KVStoreConfig dataclass is gone; the doc page no longer
needs to document its fields. Replace the configuration section with
the actual minimal recipe: set engine_kwargs.vllm.kv_transfer_config
to a JSON string that vLLM's KVTransferConfig already accepts.

Also:
- Update the RL-correctness section to describe both reset entry
  points: explicit reset_prefix_cache(reset_connector=True) from
  wake_up / clear_kv_cache, and the automatic cascade through
  EngineCore._reset_caches when abort_all_requests goes through
  pause_generation(clear_cache=True).
- Pin the paired vLLM PR reference to the canonical upstream PR
  vllm-project/vllm#42694 (replaces the earlier ivanium-fork link).
- Drop the on_failure "fallback" row from the SGLang comparison
  table; verl no longer offers that knob — vLLM serve fails loud if
  the master is unreachable, which is the right default for RL.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/perf/rollout_kv_offload.md | 139 ++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 62 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index 1b903c72b6f..86a69db8576 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -1,6 +1,6 @@
 # Rollout KV Cache Offload via Mooncake-Store
 
-Last updated: 2026-05-11.
+Last updated: 2026-05-16.
 
 This document covers how to offload prefix KV blocks from the vLLM rollout
 engine to a shared **Mooncake** distributed store, so that long shared
@@ -13,7 +13,7 @@ within a single weight generation.
 Enable when **all** of these hold:
 
 - Rollout workload has long, reusable prefixes (multi-turn / agentic / shared
-  system prompt with many `actor_rollout_ref.rollout.n` samples per prompt).
+  system prompt with many ``actor_rollout_ref.rollout.n`` samples per prompt).
 - The rollout-side prefix-cache hit rate on a single engine is already
   saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica
   fully-async) is leaving hits on the table.
@@ -32,13 +32,16 @@ between rollout steps. Any KV block written to the external store before a
 weight update is **computed against the previous policy** — serving it to a
 post-update request would silently corrupt inference.
 
-verl handles this correctly: the existing ``update_weights`` flow in
-``verl.checkpoint_engine.base.CheckpointEngineManager`` already does an
-``abort_all_requests`` -> drain -> sleep -> NCCL weight sync -> ``wake_up``
-sequence. When this feature is enabled, every prefix-cache-reset call site
-in ``vllm_async_server.py`` (``wake_up``, ``clear_kv_cache``, the
-``abort_all_requests`` fallback path) additionally passes
-``reset_connector=True`` to ``engine.reset_prefix_cache(...)``.
+verl handles this correctly. The three prefix-cache-reset call sites in
+``vllm_async_server.py`` propagate the connector reset:
+
+- ``wake_up`` and ``clear_kv_cache`` call
+  ``engine.reset_prefix_cache(reset_connector=True)`` explicitly.
+- ``abort_all_requests`` calls
+  ``engine.pause_generation(clear_cache=True)``; in vLLM ≥ the paired
+  cascade patch, ``EngineCore._reset_caches`` defaults
+  ``reset_connector=True`` so the connector cascade fires automatically
+  whenever ``pause_generation`` clears caches.
 
 That flag cascades into vLLM as:
 
@@ -47,7 +50,7 @@ That flag cascades into vLLM as:
         -> MooncakeStoreConnector.reset_cache()        (SCHEDULER role)
           -> MooncakeStoreScheduler.reset_store()
             -> LookupKeyClient.reset()                  (ZMQ admin frame)
-              -> LookupKeyServer (worker rank 0) recognizes RESET_MAGIC
+              -> LookupKeyServer (worker rank 0) typed dispatch
                 -> store.remove_all(force=True)         (Mooncake master)
 
 The net effect is that after each ``update_weights`` round, the Mooncake
@@ -57,60 +60,71 @@ the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g.
 an in-flight sequence still holds blocks), the external store is *also*
 left untouched, so internal and external caches never desynchronize.
 
+When **no** KV connector is attached, ``reset_connector=True`` is a no-op
+success in upstream vLLM (the scheduler treats "nothing to reset" as
+trivially OK). Passing it unconditionally is therefore safe for every
+rollout — there is no verl-side feature flag to remember.
+
 ## Pre-requisites
 
-1. **Mooncake**: install the Python binding (``pip install
+1. **vLLM**: build that includes the ``MooncakeStoreConnector.reset_cache``
+   cascade (vllm-project/vllm#42694) and the ``EngineCore._reset_caches``
+   default that threads ``reset_connector=True`` from
+   ``pause_generation(clear_cache=True)``. Without the cascade,
+   ``reset_connector=True`` clears only the local prefix cache and leaves
+   the Mooncake master populated with stale KV — silent correctness loss.
+   Do **not** enable the external store without that vLLM build.
+2. **Mooncake**: install the Python binding (``pip install
    mooncake-transfer-engine``) and a master binary that exposes the
-   ``RemoveAll`` RPC. On aarch64 GB200 this currently means building from
-   ``ivanium/Mooncake`` ``yifan/dev`` with ``-DUSE_CUDA=ON
-   -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``.
-2. **vLLM**: 0.20.1+ with the paired ``MooncakeStoreConnector.reset_cache``
-   patch (the cascade hook). Without it the ``reset_connector=True`` flag
-   is a no-op and silent stale-cache corruption is possible — do not enable
-   ``kv_store.enable`` without the paired patch.
-3. **Mooncake master process**: launched out-of-band (typically per-run, see
-   ``scripts/mooncake/start_mooncake_master.sh``). Single-tenant per run is
-   recommended because ``RemoveAll`` is master-wide; multi-tenant sharing
-   would let one experiment wipe another's cache.
+   ``RemoveAll`` RPC. On aarch64 GB200 build from upstream with
+   ``-DUSE_CUDA=ON -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``.
+3. **Mooncake master process**: launched out-of-band, typically per-run
+   (see ``scripts/mooncake/start_mooncake_master.sh`` in the
+   mooncake-integration project). Single-tenant per run is recommended
+   because ``RemoveAll`` is master-wide; multi-tenant sharing would let
+   one experiment wipe another's cache.
 
 ## Configuration
 
-Under ``actor_rollout_ref.rollout``:
+verl forwards any key under ``actor_rollout_ref.rollout.engine_kwargs.vllm``
+to ``vllm serve`` as a CLI flag. To attach the Mooncake connector, set
+``kv_transfer_config`` directly — the JSON shape is vLLM's own
+``KVTransferConfig`` schema (see ``vllm/config/__init__.py`` ->
+``KVTransferConfig``):
 
 ```yaml
 actor_rollout_ref:
   rollout:
-    kv_store:
-      enable: true                                   # default false
-      kv_connector: MooncakeStoreConnector           # forwarded to vLLM
-      kv_role: kv_both                               # both put + get
-      config_path: /path/to/mooncake_config.json     # passed via env
-      extra_config: {}                               # additional kv_connector_extra_config
-      on_failure: fallback                           # fallback | crash
+    engine_kwargs:
+      vllm:
+        kv_transfer_config: |-
+          {
+            "kv_connector": "MooncakeStoreConnector",
+            "kv_role": "kv_both",
+            "kv_connector_extra_config": {
+              "mooncake_config_path": "/path/to/mooncake_config.json"
+            }
+          }
 ```
 
-Field reference:
-
-- ``enable``: Master switch. When false (default), no ``kv_transfer_config``
-  is attached to the vLLM engine and verl behaves exactly as before.
-- ``kv_connector``: KVConnector class name forwarded to vLLM's
-  ``--kv-transfer-config``. Override only for testing alternate backends.
-- ``kv_role``: ``kv_both`` lets the rollout engine both write blocks to and
-  read blocks from the store. Other values are exposed for the same reason
-  as ``kv_connector`` but the rollout path expects ``kv_both``.
-- ``config_path``: Path to the Mooncake client JSON config
-  (``master_server_address``, ``global_segment_size``, ``protocol``, ...).
-  Falls back to the ``MOONCAKE_CONFIG_PATH`` environment variable when
-  unset — most installations should set the env once at the cluster level
-  and leave this field empty.
-- ``extra_config``: Dict merged into vLLM's ``kv_connector_extra_config``.
-  Reserve for connector-specific knobs that don't have a first-class field.
-- ``on_failure``: Behavior when the Mooncake master is unreachable at
-  rollout-engine launch.
-  ``fallback`` (default) drops the connector and continues training with
-  external offload disabled — a soft dependency suited to long RL runs
-  where pausing training for an infra hiccup is more expensive than losing
-  cross-engine prefix hits. Set to ``crash`` to fail fast (e.g., in CI).
+Equivalently as inline JSON (single line):
+
+```yaml
+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config: '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"mooncake_config_path":"/path/to/mooncake_config.json"}}'
+```
+
+verl serializes ``engine_kwargs.vllm.kv_transfer_config`` into the
+``--kv-transfer-config`` argument of ``vllm serve``; vLLM's own arg parser
+decodes the JSON into ``KVTransferConfig`` and constructs the connector.
+There is no verl-side schema layer — any field vLLM accepts is accepted
+here, and any future vLLM-side KV connector (NIXL, P2pNcclConnector, future
+ones) can be wired the same way without a verl change.
+
+If the Mooncake master is unreachable at engine launch, vLLM crashes the
+serve subprocess. That's the intended fail-loud behavior; an RL run that
+silently disables a configured KV store would hide hours of stale-cache
+corruption. If you want a "soft" mode, wrap the launch with a healthcheck
+of the master before starting verl.
 
 ## Required environment variables
 
@@ -119,7 +133,8 @@ choices.
 
 - ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate
   ``master_server_address`` etc. Set on every rollout actor; verl
-  propagates it via ``ray`` runtime_env.
+  propagates it via ``ray`` runtime_env. You can also pass it inline via
+  ``kv_connector_extra_config.mooncake_config_path`` as shown above.
 - ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple
   rollout replicas — vLLM's block-hash seed is randomized per process and
   cross-engine prefix-cache hits will silently drop to zero without a
@@ -140,7 +155,8 @@ choices.
   field. The hard-reset model relies on the master being cleaned per
   weight update, not on key versioning.
 - **Failure modes**:
-  - Master unreachable at launch: see ``on_failure``.
+  - Master unreachable at launch: vLLM serve fails to start; verl
+    surfaces the underlying connector error. Fail-loud.
   - Master goes down mid-run: the rollout engine continues using its
     in-engine prefix cache; ``reset_store`` returns False and the rest of
     ``reset_prefix_cache`` still works. Re-attaching to a new master
@@ -156,17 +172,16 @@ choices.
 | Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag |
 | Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset |
 | Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep |
-| Failure surface | Client must read return value and re-issue | Soft-dependency (``on_failure=fallback``) by default; explicit False propagates up to the scheduler |
 
 ## Reference
 
-- Paired vLLM upstream patch: implements
-  ``MooncakeStoreConnector.reset_cache()`` and the
-  ``RESET_MAGIC`` ZMQ discriminator (see ``aoshen02/vllm:feat/mooncake-clear-hook``
-  -> ``ivanium/vllm`` -> upstream).
+- Paired vLLM upstream PR (must be in your vLLM build):
+  [vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) —
+  implements ``MooncakeStoreConnector.reset_cache()``, the typed-tag ZMQ
+  protocol, and the ``EngineCore._reset_caches`` default that threads
+  ``reset_connector=True`` through ``pause_generation``.
 - vLLM scheduler hook the cascade rides on:
-  ``vllm/v1/core/sched/scheduler.py:1871`` (``reset_prefix_cache``) and
-  ``1917`` (``reset_connector_cache``) — already shipped in 0.20.1, the
-  patch only provides the connector-specific ``reset_cache`` body.
+  ``vllm/v1/core/sched/scheduler.py`` (``reset_prefix_cache`` and
+  ``reset_connector_cache``).
 - Sister rollout-correctness recipes: ``advance/rollout_corr.md``,
   ``advance/rollout_corr_math.md``.

From e1c471dda10f65be5e00ecc9f817a3965010fdb8 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 25 May 2026 12:09:42 +0000
Subject: [PATCH 08/15] docs(rollout_kv_offload): simplify, defer Mooncake
 setup to vLLM upstream doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cut 188 → 89 lines. The previous version duplicated vLLM-side setup
(client install, master launch, JSON config) that already lives in the
official vLLM Mooncake guide and drifts out of date here. Point readers
at <https://docs.vllm.ai/en/latest/features/mooncake_store_connector_usage/>
for setup and keep only what is verl-specific: the
engine_kwargs.vllm.kv_transfer_config wiring, the reset_connector=True
cascade contract for RL correctness (with the required vLLM build), and
the when-to-enable / failure-mode summary.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: aoshen <aoshen@inferact.ai>
---
 docs/perf/rollout_kv_offload.md | 234 ++++++++++----------------------
 1 file changed, 68 insertions(+), 166 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index 86a69db8576..fe077874293 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -1,96 +1,22 @@
 # Rollout KV Cache Offload via Mooncake-Store
 
-Last updated: 2026-05-16.
-
-This document covers how to offload prefix KV blocks from the vLLM rollout
-engine to a shared **Mooncake** distributed store, so that long shared
-prefixes (system prompt + task description + earlier turns in agentic
-workloads) get deduplicated across requests and across rollout replicas
-within a single weight generation.
-
-## When to use this
-
-Enable when **all** of these hold:
-
-- Rollout workload has long, reusable prefixes (multi-turn / agentic / shared
-  system prompt with many ``actor_rollout_ref.rollout.n`` samples per prompt).
-- The rollout-side prefix-cache hit rate on a single engine is already
-  saturated, but cross-engine prefix sharing (e.g., DP > 1, multi-replica
-  fully-async) is leaving hits on the table.
-- You can run a Mooncake master process colocated with (or reachable from)
-  every rollout host, and you have enough RDMA / TCP bandwidth between the
-  rollout workers and the master.
-
-Do **not** enable for short prompts or small ``rollout.n`` workloads where
-within-engine prefix cache is already enough — the round-trip to Mooncake
-will net out negative.
-
-## RL-correctness contract: hard reset on every weight update
-
-The unique constraint of RL training is that the model weights change
-between rollout steps. Any KV block written to the external store before a
-weight update is **computed against the previous policy** — serving it to a
-post-update request would silently corrupt inference.
-
-verl handles this correctly. The three prefix-cache-reset call sites in
-``vllm_async_server.py`` propagate the connector reset:
-
-- ``wake_up`` and ``clear_kv_cache`` call
-  ``engine.reset_prefix_cache(reset_connector=True)`` explicitly.
-- ``abort_all_requests`` calls
-  ``engine.pause_generation(clear_cache=True)``; in vLLM ≥ the paired
-  cascade patch, ``EngineCore._reset_caches`` defaults
-  ``reset_connector=True`` so the connector cascade fires automatically
-  whenever ``pause_generation`` clears caches.
-
-That flag cascades into vLLM as:
-
-    Scheduler.reset_prefix_cache(reset_connector=True)
-      -> Scheduler.reset_connector_cache()
-        -> MooncakeStoreConnector.reset_cache()        (SCHEDULER role)
-          -> MooncakeStoreScheduler.reset_store()
-            -> LookupKeyClient.reset()                  (ZMQ admin frame)
-              -> LookupKeyServer (worker rank 0) typed dispatch
-                -> store.remove_all(force=True)         (Mooncake master)
-
-The net effect is that after each ``update_weights`` round, the Mooncake
-master is empty before any new rollout request starts — matching the
-existing in-engine prefix-cache behavior. The contract is symmetric: if
-the in-engine guard (``BlockPool.reset_prefix_cache``) fails to clear (e.g.
-an in-flight sequence still holds blocks), the external store is *also*
-left untouched, so internal and external caches never desynchronize.
-
-When **no** KV connector is attached, ``reset_connector=True`` is a no-op
-success in upstream vLLM (the scheduler treats "nothing to reset" as
-trivially OK). Passing it unconditionally is therefore safe for every
-rollout — there is no verl-side feature flag to remember.
-
-## Pre-requisites
-
-1. **vLLM**: build that includes the ``MooncakeStoreConnector.reset_cache``
-   cascade (vllm-project/vllm#42694) and the ``EngineCore._reset_caches``
-   default that threads ``reset_connector=True`` from
-   ``pause_generation(clear_cache=True)``. Without the cascade,
-   ``reset_connector=True`` clears only the local prefix cache and leaves
-   the Mooncake master populated with stale KV — silent correctness loss.
-   Do **not** enable the external store without that vLLM build.
-2. **Mooncake**: install the Python binding (``pip install
-   mooncake-transfer-engine``) and a master binary that exposes the
-   ``RemoveAll`` RPC. On aarch64 GB200 build from upstream with
-   ``-DUSE_CUDA=ON -DWITH_NVIDIA_PEERMEM=OFF -DUSE_MNNVL=ON``.
-3. **Mooncake master process**: launched out-of-band, typically per-run
-   (see ``scripts/mooncake/start_mooncake_master.sh`` in the
-   mooncake-integration project). Single-tenant per run is recommended
-   because ``RemoveAll`` is master-wide; multi-tenant sharing would let
-   one experiment wipe another's cache.
-
-## Configuration
-
-verl forwards any key under ``actor_rollout_ref.rollout.engine_kwargs.vllm``
-to ``vllm serve`` as a CLI flag. To attach the Mooncake connector, set
-``kv_transfer_config`` directly — the JSON shape is vLLM's own
-``KVTransferConfig`` schema (see ``vllm/config/__init__.py`` ->
-``KVTransferConfig``):
+Offload prefix KV blocks from the vLLM rollout engine to a shared
+[Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared
+prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt)
+get deduplicated across requests and rollout replicas.
+
+## Setup Mooncake + vLLM
+
+Follow vLLM's official guide for installing the Mooncake client, starting a
+master, and writing the JSON config:
+**<https://docs.vllm.ai/en/latest/features/mooncake_store_connector_usage/>**
+
+The verl side only consumes whatever that doc produces — no extra steps.
+
+## Enable in verl
+
+verl forwards `engine_kwargs.vllm.*` straight to `vllm serve` as CLI flags.
+To attach the Mooncake connector, set `kv_transfer_config`:
 
 ```yaml
 actor_rollout_ref:
@@ -107,81 +33,57 @@ actor_rollout_ref:
           }
 ```
 
-Equivalently as inline JSON (single line):
+Or as a Hydra CLI override:
 
-```yaml
-actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config: '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"mooncake_config_path":"/path/to/mooncake_config.json"}}'
+```bash
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json
 ```
 
-verl serializes ``engine_kwargs.vllm.kv_transfer_config`` into the
-``--kv-transfer-config`` argument of ``vllm serve``; vLLM's own arg parser
-decodes the JSON into ``KVTransferConfig`` and constructs the connector.
-There is no verl-side schema layer — any field vLLM accepts is accepted
-here, and any future vLLM-side KV connector (NIXL, P2pNcclConnector, future
-ones) can be wired the same way without a verl change.
-
-If the Mooncake master is unreachable at engine launch, vLLM crashes the
-serve subprocess. That's the intended fail-loud behavior; an RL run that
-silently disables a configured KV store would hide hours of stale-cache
-corruption. If you want a "soft" mode, wrap the launch with a healthcheck
-of the master before starting verl.
-
-## Required environment variables
-
-These are intentionally *not* set automatically — they're cluster-level
-choices.
-
-- ``MOONCAKE_CONFIG_PATH``: required by the Mooncake client to locate
-  ``master_server_address`` etc. Set on every rollout actor; verl
-  propagates it via ``ray`` runtime_env. You can also pass it inline via
-  ``kv_connector_extra_config.mooncake_config_path`` as shown above.
-- ``PYTHONHASHSEED=0``: **required if** you have ``DP > 1`` or multiple
-  rollout replicas — vLLM's block-hash seed is randomized per process and
-  cross-engine prefix-cache hits will silently drop to zero without a
-  fixed seed. Single-engine rollouts can leave this unset.
-
-## Operational notes
-
-- **Cluster hygiene**: a per-run master is the recommended deployment.
-  Reuse across runs invites cross-experiment cache pollution (and a
-  ``reset_connector=True`` from one run will wipe the other's keys).
-- **Reset cost**: every weight update triggers ``RemoveAll`` on the
-  master, which iterates all metadata shards. On a ~600 GB store this is
-  sub-second. Frequent fully-async weight syncs (sync every 1-2 rollout
-  steps) will see the per-update hit rate stay low; this is expected and
-  matches the in-engine prefix-cache behavior under the same conditions.
-- **No version tagging**: keys are pure content-addressed
-  (``{model_name}@tp_rank:N@...@{block_hash}``); no weight-generation
-  field. The hard-reset model relies on the master being cleaned per
-  weight update, not on key versioning.
-- **Failure modes**:
-  - Master unreachable at launch: vLLM serve fails to start; verl
-    surfaces the underlying connector error. Fail-loud.
-  - Master goes down mid-run: the rollout engine continues using its
-    in-engine prefix cache; ``reset_store`` returns False and the rest of
-    ``reset_prefix_cache`` still works. Re-attaching to a new master
-    mid-run is **not** supported in this version.
-  - Cross-rank ``block_hash`` divergence (forgot ``PYTHONHASHSEED``):
-    silent zero hit rate. Inspect
-    ``vllm_external_prefix_cache_hits`` metric to detect.
-
-## Comparison vs. SGLang's HiCacheStorage flow
-
-| Aspect | SGLang ``/flush_cache`` | verl + vLLM ``MooncakeStoreConnector`` |
-|---|---|---|
-| Reset trigger | Per-RPC opt-in ``flush_cache=True`` flag on each ``update_weights_*`` call — forget once -> silent stale-cache corruption | Automatic data-plane cascade through ``reset_prefix_cache(reset_connector=True)`` whenever verl drives the existing hard-reset path; no flag |
-| Multi-rank coordination | Each scheduler instance calls ``remove_all`` (idempotent, but N redundant RPCs) | Scheduler-side ZMQ admin RPC to rank-0 worker only -> one ``RemoveAll`` per reset |
-| Guard / cache consistency | ``is_fully_idle()`` silently false on failure | Cascades only when in-engine guard passes; internal and external caches stay in lockstep |
-
-## Reference
-
-- Paired vLLM upstream PR (must be in your vLLM build):
-  [vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694) —
-  implements ``MooncakeStoreConnector.reset_cache()``, the typed-tag ZMQ
-  protocol, and the ``EngineCore._reset_caches`` default that threads
-  ``reset_connector=True`` through ``pause_generation``.
-- vLLM scheduler hook the cascade rides on:
-  ``vllm/v1/core/sched/scheduler.py`` (``reset_prefix_cache`` and
-  ``reset_connector_cache``).
-- Sister rollout-correctness recipes: ``advance/rollout_corr.md``,
-  ``advance/rollout_corr_math.md``.
+Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout
+actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout
+replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized
+per process and cross-engine hits drop to zero without it.
+
+## RL correctness: hard reset on every weight update
+
+Model weights change between rollout steps, so any KV block written to the
+external store under the previous policy must be evicted before the next
+rollout starts — otherwise stale KV silently corrupts inference. verl
+handles this automatically via `engine.reset_prefix_cache(reset_connector=True)`
+in `vllm_async_server.py`'s `wake_up` / `clear_kv_cache` / `abort_all_requests`
+paths. The flag cascades through vLLM into `MooncakeStoreConnector.reset_cache()`,
+which clears the master via the `RemoveAll` RPC.
+
+**Required vLLM build**: must include
+[vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694)
+(`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default
+that threads `reset_connector=True` through `pause_generation`). Without it,
+`reset_connector=True` clears only the local prefix cache and leaves the
+Mooncake master populated with stale KV — silent correctness loss. Do not
+enable the connector on an older vLLM build.
+
+When no connector is attached, `reset_connector=True` is a no-op success in
+upstream vLLM, so this code path is always safe.
+
+## When to enable
+
+Enable when **all** hold:
+
+- Rollout has long, reusable prefixes (multi-turn agentic, large `rollout.n`).
+- Within-engine prefix cache hit rate is already saturated.
+- You have cross-engine reuse opportunity (DP > 1, multi-replica fully-async).
+
+For short prompts or small `rollout.n` on a single engine, the round-trip to
+Mooncake nets out negative — keep it off.
+
+## Failure modes
+
+- **Master unreachable at launch**: vLLM serve fails to start. Fail-loud is
+  intentional — silently disabling a configured KV store would hide stale-cache
+  corruption.
+- **Master dies mid-run**: rollout falls back to local prefix cache;
+  re-attaching is not supported in this version.
+- **Cross-rank `block_hash` divergence** (forgot `PYTHONHASHSEED`): silent
+  zero hit rate. Check the `vllm_external_prefix_cache_hits` metric.

From 22b94d2c7be4f3a67b121ae1e9e93c2e4a25c91c Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 25 May 2026 12:21:50 +0000
Subject: [PATCH 09/15] docs(rollout_kv_offload): drop Failure modes and
 When-to-enable sections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

89 → 57 lines. Cut four pieces that don't belong in a "how to enable"
doc: the Failure modes section (operational advice that drifts), the
When to enable section (opinion / negative recommendation), the duplicate
Hydra CLI override block (YAML form is enough), and a trailing note on
no-op upstream behavior. Keep only: setup link to vLLM doc, the YAML
wiring, and the RL-correctness reset cascade contract with required vLLM
build.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: aoshen <aoshen@inferact.ai>
---
 docs/perf/rollout_kv_offload.md | 35 +--------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index fe077874293..8dbcd380528 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -33,14 +33,6 @@ actor_rollout_ref:
           }
 ```
 
-Or as a Hydra CLI override:
-
-```bash
-+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \
-+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \
-+actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json
-```
-
 Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout
 actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout
 replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized
@@ -61,29 +53,4 @@ which clears the master via the `RemoveAll` RPC.
 (`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default
 that threads `reset_connector=True` through `pause_generation`). Without it,
 `reset_connector=True` clears only the local prefix cache and leaves the
-Mooncake master populated with stale KV — silent correctness loss. Do not
-enable the connector on an older vLLM build.
-
-When no connector is attached, `reset_connector=True` is a no-op success in
-upstream vLLM, so this code path is always safe.
-
-## When to enable
-
-Enable when **all** hold:
-
-- Rollout has long, reusable prefixes (multi-turn agentic, large `rollout.n`).
-- Within-engine prefix cache hit rate is already saturated.
-- You have cross-engine reuse opportunity (DP > 1, multi-replica fully-async).
-
-For short prompts or small `rollout.n` on a single engine, the round-trip to
-Mooncake nets out negative — keep it off.
-
-## Failure modes
-
-- **Master unreachable at launch**: vLLM serve fails to start. Fail-loud is
-  intentional — silently disabling a configured KV store would hide stale-cache
-  corruption.
-- **Master dies mid-run**: rollout falls back to local prefix cache;
-  re-attaching is not supported in this version.
-- **Cross-rank `block_hash` divergence** (forgot `PYTHONHASHSEED`): silent
-  zero hit rate. Check the `vllm_external_prefix_cache_hits` metric.
+Mooncake master populated with stale KV — silent correctness loss.

From fb7ceef319988a21637b4b32b65c2b834e968356 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Mon, 25 May 2026 12:23:18 +0000
Subject: [PATCH 10/15] docs(rollout_kv_offload): restore Hydra CLI override
 example

Some users drive verl entirely from CLI overrides without touching YAML
files, so keep the CLI form as a parallel example.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: aoshen <aoshen@inferact.ai>
---
 docs/perf/rollout_kv_offload.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index 8dbcd380528..aff9a74185d 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -33,6 +33,14 @@ actor_rollout_ref:
           }
 ```
 
+Or as a Hydra CLI override:
+
+```bash
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \
++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json
+```
+
 Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout
 actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout
 replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized

From baf6efda00dc5dd533c007f10ea29d92e52af576 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Wed, 27 May 2026 12:23:20 +0000
Subject: [PATCH 11/15] docs: simplify rollout kv offload guide

---
 docs/perf/rollout_kv_offload.md | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index aff9a74185d..507ffd1ad68 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -3,7 +3,9 @@
 Offload prefix KV blocks from the vLLM rollout engine to a shared
 [Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared
 prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt)
-get deduplicated across requests and rollout replicas.
+get deduplicated across requests and rollout replicas. This also helps
+long-tail load balancing: when work migrates to idle rollout replicas, shared
+prefix KV reduces the re-prefill cost.
 
 ## Setup Mooncake + vLLM
 
@@ -41,24 +43,10 @@ Or as a Hydra CLI override:
 +actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json
 ```
 
-Set `MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json` on every rollout
-actor (verl propagates via Ray `runtime_env`). For `DP>1` or multiple rollout
-replicas, also set `PYTHONHASHSEED=0` — vLLM's block-hash seed is randomized
-per process and cross-engine hits drop to zero without it.
-
 ## RL correctness: hard reset on every weight update
 
-Model weights change between rollout steps, so any KV block written to the
-external store under the previous policy must be evicted before the next
-rollout starts — otherwise stale KV silently corrupts inference. verl
-handles this automatically via `engine.reset_prefix_cache(reset_connector=True)`
-in `vllm_async_server.py`'s `wake_up` / `clear_kv_cache` / `abort_all_requests`
-paths. The flag cascades through vLLM into `MooncakeStoreConnector.reset_cache()`,
-which clears the master via the `RemoveAll` RPC.
+verl clears both local and Mooncake KV caches at every weight update boundary
+to avoid reusing KV from the previous policy.
 
-**Required vLLM build**: must include
-[vllm-project/vllm#42694](https://github.com/vllm-project/vllm/pull/42694)
-(`MooncakeStoreConnector.reset_cache` + the `EngineCore._reset_caches` default
-that threads `reset_connector=True` through `pause_generation`). Without it,
-`reset_connector=True` clears only the local prefix cache and leaves the
-Mooncake master populated with stale KV — silent correctness loss.
+**Required vLLM version**: use vLLM 0.22 or newer. Older builds may leave stale
+KV in the Mooncake master after a weight update.

From 07f176519fffc937dae0354ff5ce5d0d75fe4cd3 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Wed, 27 May 2026 12:41:27 +0000
Subject: [PATCH 12/15] fix(vllm): guard reset_connector for older vllm

---
 .../rollout/vllm_rollout/vllm_async_server.py | 25 +++++++------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index dadd9a4243d..5390b197ce5 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -54,6 +54,9 @@
 )
 
 _VLLM_VERSION = version.parse(vllm.__version__)
+_RESET_PREFIX_CACHE_KWARGS = {}
+if _VLLM_VERSION >= version.parse("0.22.0"):
+    _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True
 
 
 if _VLLM_VERSION > version.parse("0.11.0"):
@@ -607,15 +610,11 @@ async def wake_up(self, tags: list[str] | None = None):
             # processes across all DP shards (unlike collective_rpc which only reaches
             # TP workers within a single shard).
             await self.engine.wake_up(tags=tags or self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
-            # reset_connector=True drops any attached external KV store
-            # (e.g. MooncakeStoreConnector) whose entries were computed
-            # against the previous weights. No-op success when no connector
-            # is configured (vLLM scheduler treats it as such).
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -632,11 +631,7 @@ async def sleep(self):
 
     async def clear_kv_cache(self):
         if self.node_rank == 0:
-            # reset_connector=True drops any attached external KV store
-            # (e.g. MooncakeStoreConnector) whose entries were computed
-            # against the previous model weights. With no connector it
-            # is a no-op success, so we can pass it unconditionally.
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
 
     async def release_kv_cache(self):
         """Release only kv_cache GPU memory, keeping model weights intact.
@@ -698,11 +693,9 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                 # 2. Abort all in-flight requests
                 # 3. Wait for requests to drain
                 # 4. Clear prefix and mm caches if clear_cache=True.
-                #    EngineCore._reset_caches defaults reset_connector=True
-                #    on this path, so any attached external KV store (e.g.
-                #    MooncakeStoreConnector) is invalidated along with the
-                #    local prefix cache — RL-correct hard-reset at every
-                #    weight update boundary, no extra kwargs needed.
+                #    On vLLM >= 0.22.0, attached external KV stores (e.g.
+                #    MooncakeStoreConnector) are reset along with the local
+                #    prefix cache.
                 await self.engine.pause_generation(
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,

From 6fd5654d60b207a9281cfcd4c0a4e4fbee93f118 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Wed, 27 May 2026 13:10:45 +0000
Subject: [PATCH 13/15] Revert "fix(vllm): guard reset_connector for older
 vllm"

This reverts commit 07f176519fffc937dae0354ff5ce5d0d75fe4cd3.
---
 .../rollout/vllm_rollout/vllm_async_server.py | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 5390b197ce5..dadd9a4243d 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -54,9 +54,6 @@
 )
 
 _VLLM_VERSION = version.parse(vllm.__version__)
-_RESET_PREFIX_CACHE_KWARGS = {}
-if _VLLM_VERSION >= version.parse("0.22.0"):
-    _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True
 
 
 if _VLLM_VERSION > version.parse("0.11.0"):
@@ -610,11 +607,15 @@ async def wake_up(self, tags: list[str] | None = None):
             # processes across all DP shards (unlike collective_rpc which only reaches
             # TP workers within a single shard).
             await self.engine.wake_up(tags=tags or self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            # reset_connector=True drops any attached external KV store
+            # (e.g. MooncakeStoreConnector) whose entries were computed
+            # against the previous weights. No-op success when no connector
+            # is configured (vLLM scheduler treats it as such).
+            await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -631,7 +632,11 @@ async def sleep(self):
 
     async def clear_kv_cache(self):
         if self.node_rank == 0:
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            # reset_connector=True drops any attached external KV store
+            # (e.g. MooncakeStoreConnector) whose entries were computed
+            # against the previous model weights. With no connector it
+            # is a no-op success, so we can pass it unconditionally.
+            await self.engine.reset_prefix_cache(reset_connector=True)
 
     async def release_kv_cache(self):
         """Release only kv_cache GPU memory, keeping model weights intact.
@@ -693,9 +698,11 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str,
                 # 2. Abort all in-flight requests
                 # 3. Wait for requests to drain
                 # 4. Clear prefix and mm caches if clear_cache=True.
-                #    On vLLM >= 0.22.0, attached external KV stores (e.g.
-                #    MooncakeStoreConnector) are reset along with the local
-                #    prefix cache.
+                #    EngineCore._reset_caches defaults reset_connector=True
+                #    on this path, so any attached external KV store (e.g.
+                #    MooncakeStoreConnector) is invalidated along with the
+                #    local prefix cache — RL-correct hard-reset at every
+                #    weight update boundary, no extra kwargs needed.
                 await self.engine.pause_generation(
                     wait_for_inflight_requests=False,
                     clear_cache=reset_prefix_cache,

From e93c8f669faf48fb9250cbbebeda968e886c76b9 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Wed, 27 May 2026 13:23:43 +0000
Subject: [PATCH 14/15] fix(vllm): guard reset_connector for vllm 0.13

---
 docs/perf/rollout_kv_offload.md                        | 2 ++
 verl/workers/rollout/vllm_rollout/vllm_async_server.py | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index 507ffd1ad68..a6011cf46d1 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -1,5 +1,7 @@
 # Rollout KV Cache Offload via Mooncake-Store
 
+Last updated: 05/27/2026.
+
 Offload prefix KV blocks from the vLLM rollout engine to a shared
 [Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared
 prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt)
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index dadd9a4243d..d4194e1ce55 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -54,6 +54,9 @@
 )
 
 _VLLM_VERSION = version.parse(vllm.__version__)
+_RESET_PREFIX_CACHE_KWARGS = {}
+if _VLLM_VERSION >= version.parse("0.13.0"):
+    _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True
 
 
 if _VLLM_VERSION > version.parse("0.11.0"):
@@ -607,7 +610,7 @@ async def wake_up(self, tags: list[str] | None = None):
             # processes across all DP shards (unlike collective_rpc which only reaches
             # TP workers within a single shard).
             await self.engine.wake_up(tags=tags or self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
@@ -615,7 +618,7 @@ async def wake_up(self, tags: list[str] | None = None):
             # (e.g. MooncakeStoreConnector) whose entries were computed
             # against the previous weights. No-op success when no connector
             # is configured (vLLM scheduler treats it as such).
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -636,7 +639,7 @@ async def clear_kv_cache(self):
             # (e.g. MooncakeStoreConnector) whose entries were computed
             # against the previous model weights. With no connector it
             # is a no-op success, so we can pass it unconditionally.
-            await self.engine.reset_prefix_cache(reset_connector=True)
+            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
 
     async def release_kv_cache(self):
         """Release only kv_cache GPU memory, keeping model weights intact.

From 00346addb08e8873c88d346b717f6a1f211f6856 Mon Sep 17 00:00:00 2001
From: aoshen02 <aoshen@inferact.ai>
Date: Wed, 27 May 2026 13:38:50 +0000
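Subject: [PATCH 15/15] docs: document vllm version requirement for kv offload

Also drop the `_RESET_PREFIX_CACHE_KWARGS` version guard in
vllm_async_server.py: `reset_connector=True` is passed unconditionally
again, and the vLLM >= 0.22 requirement is stated in the docs instead.

---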
 docs/perf/rollout_kv_offload.md                        | 3 +++
 verl/workers/rollout/vllm_rollout/vllm_async_server.py | 9 +++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md
index a6011cf46d1..4dd1c6b1ddb 100644
--- a/docs/perf/rollout_kv_offload.md
+++ b/docs/perf/rollout_kv_offload.md
@@ -11,6 +11,9 @@ prefix KV reduces the re-prefill cost.
 
 ## Setup Mooncake + vLLM
 
+Use vLLM 0.22 or newer; earlier vLLM versions do not provide the full
+MooncakeStoreConnector hard-reset behavior required by this integration.
+
 Follow vLLM's official guide for installing the Mooncake client, starting a
 master, and writing the JSON config:
 **<https://docs.vllm.ai/en/latest/features/mooncake_store_connector_usage/>**
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index d4194e1ce55..dadd9a4243d 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -54,9 +54,6 @@
 )
 
 _VLLM_VERSION = version.parse(vllm.__version__)
-_RESET_PREFIX_CACHE_KWARGS = {}
-if _VLLM_VERSION >= version.parse("0.13.0"):
-    _RESET_PREFIX_CACHE_KWARGS["reset_connector"] = True
 
 
 if _VLLM_VERSION > version.parse("0.11.0"):
@@ -610,7 +607,7 @@ async def wake_up(self, tags: list[str] | None = None):
             # processes across all DP shards (unlike collective_rpc which only reaches
             # TP workers within a single shard).
             await self.engine.wake_up(tags=tags or self._get_wake_up_tags())
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.COLOCATED:
             # Directly call engine to wake up without sync weights.
             await self.engine.wake_up(tags=self._get_wake_up_tags())
@@ -618,7 +615,7 @@ async def wake_up(self, tags: list[str] | None = None):
             # (e.g. MooncakeStoreConnector) whose entries were computed
             # against the previous weights. No-op success when no connector
             # is configured (vLLM scheduler treats it as such).
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            await self.engine.reset_prefix_cache(reset_connector=True)
         elif self.rollout_mode == RolloutMode.STANDALONE:
             logger.info("skip wake_up in standalone mode")
 
@@ -639,7 +636,7 @@ async def clear_kv_cache(self):
             # (e.g. MooncakeStoreConnector) whose entries were computed
             # against the previous model weights. With no connector it
             # is a no-op success, so we can pass it unconditionally.
-            await self.engine.reset_prefix_cache(**_RESET_PREFIX_CACHE_KWARGS)
+            await self.engine.reset_prefix_cache(reset_connector=True)
 
     async def release_kv_cache(self):
         """Release only kv_cache GPU memory, keeping model weights intact.