diff --git a/fastdeploy/config.py b/fastdeploy/config.py index ea13434dd97..01b2a44c9de 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1409,9 +1409,11 @@ def postprocess(self, num_total_tokens, number_of_tasks): self.prefill_kvcache_block_num = self.total_block_num else: self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, ( + f"prefill_kvcache_block_num: {self.prefill_kvcache_block_num} should be larger " + f"than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " + "the max_model_len or increase num_gpu_blocks_override" + ) else: length = num_total_tokens // number_of_tasks block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size @@ -1432,9 +1434,11 @@ def reset(self, num_gpu_blocks): f"Reset block num, the total_block_num:{self.total_block_num}," f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" ) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, ( + f"current device block num: {self.prefill_kvcache_block_num} " + f"should be larger than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce " + "the max_model_len or replace the machine with larger GPU cards" + ) def print(self): """