zhenwei-intel · zhenwei-intel · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3-8B",
             "tensor_parallel_size": 1,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },

diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,7 +10,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -37,7 +36,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -64,7 +62,6 @@
         "server_parameters": {
             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "tensor_parallel_size": 2,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -91,7 +88,6 @@
         "server_parameters": {
             "model": "deepseek-ai/DeepSeek-R1",
             "tensor_parallel_size": 8,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,

diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,7 +5,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -23,7 +22,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -41,7 +39,6 @@
         "server_parameters": {
             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "tensor_parallel_size": 2,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -59,7 +56,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "speculative_config": {
                 "model": "turboderp/Qwama-0.5B-Instruct",
                 "num_speculative_tokens": 4,

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -1486,6 +1486,20 @@ steps:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
     - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
+- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+    - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####
 ##### A100 test #####
 
@@ -3136,6 +3150,20 @@ steps:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
     - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
+- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi355_4
+  # grade: Blocking
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+    - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####
 ##### A100 test #####
 

diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py
@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
-        swap_space=0,
         cache_dtype="auto",
         enable_prefix_caching=False,
     )

diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py
@@ -141,7 +141,6 @@ def _create_vllm_config(
     cache_config = CacheConfig(
         block_size=config.block_size,
         cache_dtype="auto",
-        swap_space=0,
     )
     cache_config.num_gpu_blocks = max_num_blocks
     cache_config.num_cpu_blocks = 0

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
@@ -206,7 +206,7 @@ configuration.
 |---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------|
 | `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
 | `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
-| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
+| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
 | `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
 | `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
 | `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |

diff --git a/docs/design/metrics.md b/docs/design/metrics.md
@@ -507,10 +507,10 @@ longer relevant in v1:
 - `vllm:num_requests_swapped`
 - `vllm:cpu_cache_usage_perc`
 
-In this mode, when a request is preempted (e.g. to make room in KV
-cache to complete other requests), we swap kv cache blocks out to CPU
-memory. This is also known as "KV cache offloading" and is configured
-with `--swap-space` and `--preemption-mode`.
+In this mode, when a request was preempted (e.g. to make room in the KV
+cache to complete other requests), KV cache blocks were swapped out to
+CPU memory. The `--swap-space` flag has been removed because this
+feature is no longer used in V1.
 
 Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
 SequenceGroup encapsulated the idea of N Sequences which

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
@@ -469,6 +469,8 @@ th {
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
 | `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ |
+| `SarvamMoEForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-30b-a3b`, etc. | ✅︎ | ✅︎ |
+| `SarvamMLAForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-105b-a9b`, etc. | | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
 | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |

diff --git a/docs/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md
@@ -17,7 +17,7 @@ llm = Vllm(
     model="microsoft/Orca-2-7b",
     tensor_parallel_size=4,
     max_new_tokens=100,
-    vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
+    vllm_kwargs={"gpu_memory_utilization": 0.5},
 )
 ```
 

diff --git a/tests/compile/test_graph_partition.py b/tests/compile/test_graph_partition.py
@@ -184,3 +184,56 @@ def model_fn(x: torch.Tensor) -> torch.Tensor:
     assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
         "call_function"
     ] + ["output"]
+
+
+def test_empty_only_partition_is_merged():
+    """
+    Test that an empty-allocation-only partition is merged into its previous
+    partition during Dynamo FX splitting.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        y = torch.sin(x)
+        out = torch.empty_like(y)
+        torch.ops.aten.cos.out(y, out=out)
+        return out
+
+    x = torch.randn(4, 3)
+    gm = make_fx(model_fn)(x)
+
+    split_ops = ["aten::sin", "aten::cos.out"]
+    split_gm, split_items = split_graph(gm, split_ops)
+
+    # Without the merge, this graph is split into 3 partitions where the
+    # middle partition contains only aten::empty_like.
+    assert len(split_items) == 2, "Empty-only partition should be merged"
+
+    output_original = gm(x)
+    output_split = split_gm(x)
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+def test_builtin_empty_only_partition_is_merged():
+    """
+    In Dynamo graphs, torch.empty/empty_like may appear as builtin call targets
+    (not aten OpOverload). Ensure empty-only partitions are still merged.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        out1 = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, out1)
+        out2 = torch.empty_like(x)
+        torch.ops.silly.attention(out1, out1, out1, out2)
+        return out2
+
+    gm = torch.fx.symbolic_trace(model_fn)
+    split_gm, split_items = split_graph(gm, ["silly::attention"])
+
+    # Without the empty-only merge, this graph creates 4 partitions:
+    # [empty_like], [attention], [empty_like], [attention].
+    assert len(split_items) == 3, "Builtin empty-only partition should be merged"
+
+    x = torch.randn(2, 3, device="cuda")
+    output_original = gm(x)
+    output_split = split_gm(x)
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -794,7 +794,6 @@ def __init__(
         tensor_parallel_size: int = 1,
         block_size: int = 16 if not torch.xpu.is_available() else 64,
         enable_chunked_prefill: bool | None = False,
-        swap_space: int = 4,
         enforce_eager: bool | None = False,
         # Set this to avoid hanging issue
         default_torch_num_threads: int | None = None,
@@ -831,7 +830,6 @@ def __init__(
                 trust_remote_code=trust_remote_code,
                 dtype=dtype,
                 seed=seed,
-                swap_space=swap_space,
                 enforce_eager=enforce_eager,
                 disable_log_stats=disable_log_stats,
                 tensor_parallel_size=tensor_parallel_size,

diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
@@ -22,15 +22,14 @@
 
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
     model="facebook/opt-125m",
     tensor_parallel_size=2,
     pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
     distributed_executor_backend="external_launcher",
     gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
     seed=0,
 )
 

diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py
@@ -28,7 +28,7 @@
 
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
     model="microsoft/Phi-mini-MoE-instruct",
@@ -37,7 +37,6 @@
     enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
     distributed_executor_backend="external_launcher",
     gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
     seed=0,
 )