Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
ee8a295
[Bugfix] Fix compressed-tensors quantization failure for DeepSeek-R1 …
vllmellm Mar 7, 2026
00b814b
[V0 Deprecation] Remove unused swap_space parameter (#36216)
majiayu000 Mar 7, 2026
5261223
[Misc] Remove duplicate parser registration (#36303)
taneem-ibrahim Mar 7, 2026
85f50eb
Adding support to Sarvam's MoE models (#33942)
rahul-sarvam Mar 7, 2026
ebb9cc5
[UX][Startup] Account for CUDA graphs during memory profiling (#30515)
MatthewBonanni Mar 7, 2026
eebd146
[CI] Enable Crosslayer KV layout tests for ROCm platforms (#35416)
qli88 Mar 7, 2026
fc46577
[ROCm][CI] Enable AITER for failing `test_gpt_oss` test case on MI355…
micah-wil Mar 7, 2026
ee54f9c
[ROCm][CI] Accept Different But Valid Output for `test_olmoe_tp` (#35…
micah-wil Mar 7, 2026
a6be75d
[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)
PatchouliTIS Mar 7, 2026
379689d
[Perf] Support FP8 KV cache for Flashinfer MLA Sparse (#35891)
wzhao18 Mar 7, 2026
2dde535
[compile] Split compile/warmup monitoring (#36098)
zou3519 Mar 7, 2026
63298ee
[Bugfix][LMCache][KVConnector] fix potential memory leak in LMCache m…
royyhuang Mar 7, 2026
5d6aae4
[LMCache MP Patch]: Race Condition + Duplicated Block Ids (#35831)
sammshen Mar 7, 2026
40077ea
[CI] fix flaky empty responses and add diagnostic assertions in visio…
AndreasKaratzas Mar 8, 2026
b7332b0
[Model] Nano Nemotron VL - fast media preprocessing (#35657)
nvnbagrov Mar 8, 2026
4497431
[Frontend] Add GPU-less render serving path (`vllm launch render`) (#…
sagearc Mar 8, 2026
0a6a3a1
Add support for ModelOpt MXFP8 MoE models (#35986)
danisereb Mar 8, 2026
e5ff140
[cudagraph] fix cudagraph warning in deepseekv32 (#28044)
ZJY0516 Mar 9, 2026
f999e2c
Add script for pd disagg on XPU
zhenwei-intel Mar 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .buildkite/performance-benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -37,7 +36,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -64,7 +62,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -91,7 +88,6 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand Down
4 changes: 0 additions & 4 deletions .buildkite/performance-benchmarks/tests/serving-tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -23,7 +22,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -41,7 +39,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -59,7 +56,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
Expand Down
28 changes: 28 additions & 0 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,20 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

##### multi gpus test #####
##### A100 test #####

Expand Down Expand Up @@ -3136,6 +3150,20 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_4
# grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

##### multi gpus test #####
##### A100 test #####

Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/mla_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
Expand Down
2 changes: 1 addition & 1 deletion docs/design/attention_backends.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ configuration.
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------|
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
Expand Down
8 changes: 4 additions & 4 deletions docs/design/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -507,10 +507,10 @@ longer relevant in v1:
- `vllm:num_requests_swapped`
- `vllm:cpu_cache_usage_perc`

In this mode, when a request is preempted (e.g. to make room in KV
cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In this mode, when a request was preempted (e.g. to make room in the KV
cache to complete other requests), KV cache blocks were swapped out to
CPU memory. The `--swap-space` flag has been removed because this feature
is no longer used in V1.

Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
Expand Down
2 changes: 2 additions & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,8 @@ th {
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
| `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ |
| `SarvamMoEForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-30b-a3b`, etc. | ✅︎ | ✅︎ |
| `SarvamMLAForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-105b-a9b`, etc. | | ✅︎ |
| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
Expand Down
2 changes: 1 addition & 1 deletion docs/serving/integrations/llamaindex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ llm = Vllm(
model="microsoft/Orca-2-7b",
tensor_parallel_size=4,
max_new_tokens=100,
vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
vllm_kwargs={"gpu_memory_utilization": 0.5},
)
```

Expand Down
53 changes: 53 additions & 0 deletions tests/compile/test_graph_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,56 @@ def model_fn(x: torch.Tensor) -> torch.Tensor:
assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
"call_function"
] + ["output"]


def test_empty_only_partition_is_merged():
    """
    A partition that holds nothing but an empty-tensor allocation must be
    merged into the partition preceding it during Dynamo FX splitting.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        sin_out = torch.sin(x)
        buffer = torch.empty_like(sin_out)
        torch.ops.aten.cos.out(sin_out, out=buffer)
        return buffer

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)

    split_gm, split_items = split_graph(traced, ["aten::sin", "aten::cos.out"])

    # If the merge did not happen there would be 3 partitions, the middle
    # one consisting solely of aten::empty_like.
    assert len(split_items) == 2, "Empty-only partition should be merged"

    # The split module must reproduce the unsplit module's result.
    expected = traced(sample)
    actual = split_gm(sample)
    assert torch.allclose(expected, actual), "Output mismatch after split"


def test_builtin_empty_only_partition_is_merged():
    """
    Dynamo graphs may carry torch.empty/empty_like as builtin call targets
    instead of aten OpOverloads. Partitions made up only of such calls must
    still be merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        first_out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, first_out)
        second_out = torch.empty_like(x)
        torch.ops.silly.attention(first_out, first_out, first_out, second_out)
        return second_out

    traced = torch.fx.symbolic_trace(model_fn)
    split_gm, split_items = split_graph(traced, ["silly::attention"])

    # Four partitions would come out if the empty-only merge were absent:
    # [empty_like], [attention], [empty_like], [attention].
    assert len(split_items) == 3, "Builtin empty-only partition should be merged"

    # The split module must reproduce the unsplit module's result.
    sample = torch.randn(2, 3, device="cuda")
    expected = traced(sample)
    actual = split_gm(sample)
    assert torch.allclose(expected, actual), "Output mismatch after split"
2 changes: 0 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,6 @@ def __init__(
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: int | None = None,
Expand Down Expand Up @@ -831,7 +830,6 @@ def __init__(
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="facebook/opt-125m",
tensor_parallel_size=2,
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
Expand All @@ -37,7 +37,6 @@
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
Loading