From 0d6b1cb75f04a3ad091dd7b0f9b5b6e3ff5bd9d5 Mon Sep 17 00:00:00 2001
From: Tahmid Muttaki <tmuttaki@redhat.com>
Date: Fri, 3 Apr 2026 16:59:55 -0400
Subject: [PATCH] Fix TypeError when disabling on-device sampling

Fixed an incorrect import of the Sampler class that caused a TypeError when
NEURON_ON_DEVICE_SAMPLING_DISABLED=1 was set or when on_device_sampling_config
was explicitly set to None.

The bug occurred because the code was importing the sampler module instead of
the Sampler class:
  - Before: from vllm.v1.sample import sampler as Sampler
  - After: from vllm.v1.sample.sampler import Sampler

This caused a "TypeError: 'module' object is not callable" error when trying
to instantiate the sampler at line 81 of neuronx_distributed_model_loader.py.

This fix enables CPU sampling mode, which is required for structured outputs
and grammar-constrained generation, neither of which is supported by on-device
sampling.

Tested on AWS Trainium (trn1.2xlarge) with TinyLlama-1.1B-Chat-v1.0 using
structured output via response_format parameter.

Signed-off-by: Tahmid Muttaki <tmuttaki@redhat.com>
---
 vllm_neuron/worker/neuronx_distributed_model_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_neuron/worker/neuronx_distributed_model_loader.py b/vllm_neuron/worker/neuronx_distributed_model_loader.py
index 6d76487..f170375 100644
--- a/vllm_neuron/worker/neuronx_distributed_model_loader.py
+++ b/vllm_neuron/worker/neuronx_distributed_model_loader.py
@@ -51,7 +51,7 @@
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.v1.outputs import SamplerOutput
-from vllm.v1.sample import sampler as Sampler
+from vllm.v1.sample.sampler import Sampler
 
 from vllm_neuron.worker.constants import (
     NEURON_MULTI_MODAL_MODELS,