From 36d78cc468c1242e8e59c1ab7f6d7a05899a0eef Mon Sep 17 00:00:00 2001 From: kevincheng2 Date: Mon, 30 Mar 2026 20:47:27 +0800 Subject: [PATCH 1/2] [Cherry-Pick][BugFix][KVCache] Add enc_dec_block_num to prefill_kvcache_block_num check ## Motivation Cherry-pick from release/2.5: the original assertion only checked `prefill_kvcache_block_num >= max_block_num_per_seq`, but for encoder-decoder models the kvcache must also reserve blocks for the encoder side (`enc_dec_block_num`). Without this check, the service could silently allocate insufficient blocks for enc-dec sequences. ## Modifications - `CacheConfig.postprocess`: tighten assertion to `prefill_kvcache_block_num >= max_block_num_per_seq + enc_dec_block_num` - `CacheConfig.reset`: same tightening - Improve error message to guide users to reduce `max_model_len` or increase `num_gpu_blocks_override` ## Usage or Command No change to launch command. If the assertion fires, adjust: ```bash # Option 1: reduce max_model_len python -m fastdeploy.entrypoints.openai.api_server \ --max-model-len ... # Option 2: increase GPU block count python -m fastdeploy.entrypoints.openai.api_server \ --num-gpu-blocks-override ... 
``` Co-Authored-By: Claude Sonnet 4.6 --- fastdeploy/config.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index ea13434dd97..9f810974491 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1409,9 +1409,11 @@ def postprocess(self, num_total_tokens, number_of_tasks): self.prefill_kvcache_block_num = self.total_block_num else: self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, ( + f"prefill_kvcache_block_num: {self.prefill_kvcache_block_num} should be larger " + f"than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " + "the max_model_len or increase num_gpu_blocks_override" + ) else: length = num_total_tokens // number_of_tasks block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size @@ -1432,9 +1434,11 @@ def reset(self, num_gpu_blocks): f"Reset block num, the total_block_num:{self.total_block_num}," f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" ) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, ( + f"prefill_kvcache_block_num: {self.prefill_kvcache_block_num} should be larger " + f"than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " + "the max_model_len or increase num_gpu_blocks_override" + ) def 
print(self): """ From 255f57254f7aa50b90607701277ef61a4eff3acd Mon Sep 17 00:00:00 2001 From: kevincheng2 Date: Mon, 30 Mar 2026 20:48:24 +0800 Subject: [PATCH 2/2] [Cherry-Pick][BugFix][KVCache] Add enc_dec_block_num to prefill_kvcache_block_num check ## Motivation Cherry-pick from release/2.5: the original assertion only checked `prefill_kvcache_block_num >= max_block_num_per_seq`, but for encoder-decoder models the kvcache must also reserve blocks for the encoder side (`enc_dec_block_num`). Without this check, the service could silently allocate insufficient blocks for enc-dec sequences. ## Modifications - `CacheConfig.postprocess`: tighten assertion to `prefill_kvcache_block_num >= max_block_num_per_seq + enc_dec_block_num`; the error message guides the user to reduce `max_model_len` or increase `num_gpu_blocks_override` - `CacheConfig.reset`: same tightening; the error message guides the user to reduce `max_model_len` or replace the machine with one that has larger GPU cards (`num_gpu_blocks_override` is not applicable here) ## Usage or Command No change to launch command. If the assertion fires, adjust: ```bash # Option 1: reduce max_model_len python -m fastdeploy.entrypoints.openai.api_server \ --max-model-len ... # Option 2 (postprocess only): increase GPU block count python -m fastdeploy.entrypoints.openai.api_server \ --num-gpu-blocks-override ... 
``` Co-Authored-By: Claude Sonnet 4.6 --- fastdeploy/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 9f810974491..01b2a44c9de 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1435,9 +1435,9 @@ def reset(self, num_gpu_blocks): f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" ) assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, ( - f"prefill_kvcache_block_num: {self.prefill_kvcache_block_num} should be larger " - f"than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " - "the max_model_len or increase num_gpu_blocks_override" + f"current device block num: {self.prefill_kvcache_block_num} " + f"should be larger than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " + "the max_model_len or replace the machine with larger GPU cards" ) def print(self):