From 917ae97afa38fb980562caf3b185c3f60a0b435e Mon Sep 17 00:00:00 2001 From: Nathan Ordonez Date: Tue, 24 Feb 2026 10:38:11 -0500 Subject: [PATCH 1/2] simple fixes from latest upstream merge Signed-off-by: Nathan Ordonez --- examples/offline_inference/spans/spans.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/offline_inference/spans/spans.py b/examples/offline_inference/spans/spans.py index ebe42f7ba7..c6c147efcb 100644 --- a/examples/offline_inference/spans/spans.py +++ b/examples/offline_inference/spans/spans.py @@ -47,7 +47,7 @@ def initialize_vllm( def main(): model_names = [ - "ldsjmdy/Tulu3-Block-FT", # <- finetuned to handle block-attention + "dev/data/Tulu3-Block-FT", # <- finetuned to handle block-attention "ldsjmdy/Tulu3-RAG", # <- baseline ] model_name = model_names[0] diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 0dd1d1ccff..71e9c18012 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -8,7 +8,6 @@ import torch from vllm.config import CUDAGraphMode, VllmConfig ->>>>>>> v0.15.1 from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -33,6 +32,7 @@ ) from vllm.v1.attention.ops.triton_unified_attention import unified_attention from vllm.v1.kv_cache_interface import AttentionSpec +import vllm.envs as envs logger = init_logger(__name__) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a36f484e42..fa4f68bc63 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1707,14 +1707,14 @@ def _get_block_table(kv_cache_gid: int): blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) return blk_table_tensor - if not hasattr(self, "rotate"): - if not isinstance(self.model.model.layers[0], PPMissingLayer): - self.rotate = self.model.model.layers[0].self_attn.rotary_emb - else: - for lay in self.model.model.layers: - if not isinstance(lay, PPMissingLayer): - self.rotate = lay.self_attn.rotary_emb - break + if not hasattr(self, "rotate"): + if not isinstance(self.model.model.layers[0], PPMissingLayer): + self.rotate = self.model.model.layers[0].self_attn.rotary_emb + else: + for lay in self.model.model.layers: + if not isinstance(lay, PPMissingLayer): + self.rotate = lay.self_attn.rotary_emb + break assert slot_mappings is not None block_table_gid_0 = _get_block_table(0) From d65a9b76fd2385d1674293b142b3ad899afc340d Mon Sep 17 00:00:00 2001 From: Nathan Ordonez Date: Tue, 24 Feb 2026 10:46:17 -0500 Subject: [PATCH 2/2] pushed this detail by mistake Signed-off-by: Nathan Ordonez --- examples/offline_inference/spans/spans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/spans/spans.py b/examples/offline_inference/spans/spans.py index c6c147efcb..ebe42f7ba7 100644 --- a/examples/offline_inference/spans/spans.py +++ b/examples/offline_inference/spans/spans.py @@ -47,7 +47,7 @@ def initialize_vllm( def main(): model_names = [ - "dev/data/Tulu3-Block-FT", # <- finetuned to handle block-attention + "ldsjmdy/Tulu3-Block-FT", # <- finetuned to handle block-attention "ldsjmdy/Tulu3-RAG", # <- baseline ] model_name = model_names[0]