diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 0dd1d1ccff..71e9c18012 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -8,7 +8,6 @@ import torch from vllm.config import CUDAGraphMode, VllmConfig ->>>>>>> v0.15.1 from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -33,6 +32,7 @@ ) from vllm.v1.attention.ops.triton_unified_attention import unified_attention from vllm.v1.kv_cache_interface import AttentionSpec +import vllm.envs as envs logger = init_logger(__name__) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a36f484e42..fa4f68bc63 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1707,14 +1707,14 @@ def _get_block_table(kv_cache_gid: int): blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) return blk_table_tensor - if not hasattr(self, "rotate"): - if not isinstance(self.model.model.layers[0], PPMissingLayer): - self.rotate = self.model.model.layers[0].self_attn.rotary_emb - else: - for lay in self.model.model.layers: - if not isinstance(lay, PPMissingLayer): - self.rotate = lay.self_attn.rotary_emb - break + if not hasattr(self, "rotate"): + if not isinstance(self.model.model.layers[0], PPMissingLayer): + self.rotate = self.model.model.layers[0].self_attn.rotary_emb + else: + for lay in self.model.model.layers: + if not isinstance(lay, PPMissingLayer): + self.rotate = lay.self_attn.rotary_emb + break assert slot_mappings is not None block_table_gid_0 = _get_block_table(0)