diff --git a/docs/index.rst b/docs/index.rst index 85a5da22155..cb3f7227613 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -102,6 +102,7 @@ verl is fast with: perf/dpsk.md perf/best_practices perf/perf_tuning + perf/rollout_kv_offload.md README_vllm0.8.md perf/device_tuning perf/verl_profiler_system.md diff --git a/docs/perf/rollout_kv_offload.md b/docs/perf/rollout_kv_offload.md new file mode 100644 index 00000000000..4dd1c6b1ddb --- /dev/null +++ b/docs/perf/rollout_kv_offload.md @@ -0,0 +1,57 @@ +# Rollout KV Cache Offload via Mooncake-Store + +Last updated: 05/27/2026. + +Offload prefix KV blocks from the vLLM rollout engine to a shared +[Mooncake](https://github.com/kvcache-ai/Mooncake) store so long shared +prefixes (system prompt, agentic tool history, `rollout.n` samples per prompt) +get deduplicated across requests and rollout replicas. This also helps +long-tail load balancing: when work migrates to idle rollout replicas, shared +prefix KV reduces the re-prefill cost. + +## Setup Mooncake + vLLM + +Use vLLM 0.22 or newer; earlier vLLM versions do not provide the full +MooncakeStoreConnector hard-reset behavior required by this integration. + +Follow vLLM's official guide for installing the Mooncake client, starting a +master, and writing the JSON config: +**see the MooncakeStoreConnector guide in the vLLM documentation** + +The verl side only consumes whatever that doc produces — no extra steps. + +## Enable in verl + +verl forwards `engine_kwargs.vllm.*` straight to `vllm serve` as CLI flags. 
+To attach the Mooncake connector, set `kv_transfer_config`: + +```yaml +actor_rollout_ref: + rollout: + engine_kwargs: + vllm: + kv_transfer_config: |- + { + "kv_connector": "MooncakeStoreConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "mooncake_config_path": "/path/to/mooncake_config.json" + } + } +``` + +Or as a Hydra CLI override: + +```bash ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector=MooncakeStoreConnector \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_role=kv_both \ ++actor_rollout_ref.rollout.engine_kwargs.vllm.kv_transfer_config.kv_connector_extra_config.mooncake_config_path=/path/to/mooncake_config.json +``` + +## RL correctness: hard reset on every weight update + +verl clears both local and Mooncake KV caches at every weight update boundary +to avoid reusing KV from the previous policy. + +**Required vLLM version**: use vLLM 0.22 or newer. Older builds may leave stale +KV in the Mooncake master after a weight update. diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index fee46046cc3..dadd9a4243d 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -607,11 +607,15 @@ async def wake_up(self, tags: list[str] | None = None): # processes across all DP shards (unlike collective_rpc which only reaches # TP workers within a single shard). await self.engine.wake_up(tags=tags or self._get_wake_up_tags()) - await self.engine.reset_prefix_cache() + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.COLOCATED: # Directly call engine to wake up without sync weights. await self.engine.wake_up(tags=self._get_wake_up_tags()) - await self.engine.reset_prefix_cache() + # reset_connector=True drops any attached external KV store + # (e.g. 
MooncakeStoreConnector) whose entries were computed + # against the previous weights. No-op success when no connector + # is configured (vLLM scheduler treats it as such). + await self.engine.reset_prefix_cache(reset_connector=True) elif self.rollout_mode == RolloutMode.STANDALONE: logger.info("skip wake_up in standalone mode") @@ -628,7 +632,11 @@ async def sleep(self): async def clear_kv_cache(self): if self.node_rank == 0: - await self.engine.reset_prefix_cache() + # reset_connector=True drops any attached external KV store + # (e.g. MooncakeStoreConnector) whose entries were computed + # against the previous model weights. With no connector it + # is a no-op success, so we can pass it unconditionally. + await self.engine.reset_prefix_cache(reset_connector=True) async def release_kv_cache(self): """Release only kv_cache GPU memory, keeping model weights intact. @@ -689,7 +697,12 @@ async def abort_all_requests(self, reset_prefix_cache: bool = True) -> dict[str, # 1. Set engine to paused state (blocks new generate calls) # 2. Abort all in-flight requests # 3. Wait for requests to drain - # 4. Clear prefix and mm caches if clear_cache=True + # 4. Clear prefix and mm caches if clear_cache=True. + # EngineCore._reset_caches defaults reset_connector=True + # on this path, so any attached external KV store (e.g. + # MooncakeStoreConnector) is invalidated along with the + # local prefix cache — RL-correct hard-reset at every + # weight update boundary, no extra kwargs needed. await self.engine.pause_generation( wait_for_inflight_requests=False, clear_cache=reset_prefix_cache,