zhenwei-intel · zhenwei-intel · Oct 10, 2025 · Oct 11, 2025 · Oct 11, 2025 · zhenwei-intel
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -44,6 +44,5 @@ docker run \
     pytest -v -s v1/structured_output
     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-    pytest -v -s v1/test_metrics
     pytest -v -s v1/test_serial_utils.py
 '
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
+
+# Install NIXL from source code
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
+
 ENTRYPOINT ["vllm", "serve"]
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
@@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision

diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
@@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
         "--enable-devel-headers",
         "--with-verbs",
         "--enable-mt",
+        "--with-ze=no",
     ]
     run_command(configure_command, cwd=ucx_source_path)
     run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)

diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
@@ -54,6 +54,14 @@ def get_attn_backend_cls(
         has_sink: bool,
         use_sparse,
     ) -> str:
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+        set_kv_cache_layout("NHD")
+        logger.info(
+            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+            "only NHD layout is supported by XPU attention kernels."
+        )
+
         from vllm.attention.backends.registry import _Backend
 
         if use_sparse:
@@ -190,13 +198,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 vllm_config.scheduler_config.max_model_len,
                 DEFAULT_MAX_NUM_BATCHED_TOKENS,
             )
-        from vllm.v1.attention.backends.utils import set_kv_cache_layout
-
-        set_kv_cache_layout("NHD")
-        logger.info(
-            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
-            "only NHD layout is supported by XPU attention kernels."
-        )
 
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool: