From 0d6b1cb75f04a3ad091dd7b0f9b5b6e3ff5bd9d5 Mon Sep 17 00:00:00 2001
From: Tahmid Muttaki
Date: Fri, 3 Apr 2026 16:59:55 -0400
Subject: [PATCH] Fix TypeError when disabling on-device sampling

Fixed incorrect import of Sampler class that caused a TypeError when
NEURON_ON_DEVICE_SAMPLING_DISABLED=1 was set or when
on_device_sampling_config was explicitly set to None.

The bug occurred because the code was importing the sampler module
instead of the Sampler class:
- Before: from vllm.v1.sample import sampler as Sampler
- After: from vllm.v1.sample.sampler import Sampler

This caused a "TypeError: 'module' object is not callable" error when
trying to instantiate the sampler at line 81.

This fix enables CPU sampling mode, which is required for structured
outputs and grammar-constrained generation that are not supported by
on-device sampling.

Tested on AWS Trainium (trn1.2xlarge) with TinyLlama-1.1B-Chat-v1.0
using structured output via response_format parameter.

Signed-off-by: Tahmid Muttaki
---
 vllm_neuron/worker/neuronx_distributed_model_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_neuron/worker/neuronx_distributed_model_loader.py b/vllm_neuron/worker/neuronx_distributed_model_loader.py
index 6d76487..f170375 100644
--- a/vllm_neuron/worker/neuronx_distributed_model_loader.py
+++ b/vllm_neuron/worker/neuronx_distributed_model_loader.py
@@ -51,7 +51,7 @@
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.v1.outputs import SamplerOutput
-from vllm.v1.sample import sampler as Sampler
+from vllm.v1.sample.sampler import Sampler
 
 from vllm_neuron.worker.constants import (
     NEURON_MULTI_MODAL_MODELS,