diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 289877e504bb..3a321c0fefdf 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "server_parameters": { "model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json index a2e42aa16fd3..3929aa5fbbe0 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -10,7 +10,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -37,7 +36,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -64,7 +62,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -91,7 +88,6 @@ "server_parameters": { "model": "deepseek-ai/DeepSeek-R1", "tensor_parallel_size": 8, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json index a6d4141d5c2d..66d52abc1206 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests.json @@ -5,7 +5,6 @@ "server_parameters": { "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -23,7 +22,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -41,7 +39,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -59,7 +56,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "speculative_config": { "model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4, diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f69713a335df..9323310b411b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1486,6 +1486,20 @@ steps: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + ##### multi gpus test ##### ##### A100 test ##### @@ -3136,6 +3150,20 @@ steps: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +- label: CrossLayer KV 
layout Distributed NixlConnector PD accuracy tests (4 GPUs) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi355_4 + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + ##### multi gpus test ##### ##### A100 test ##### diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 867f55fa9ef7..110f580fb7bd 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -145,7 +145,6 @@ def create_minimal_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=False, ) diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 9744b857d96b..7f968cfec148 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -141,7 +141,6 @@ def _create_vllm_config( cache_config = CacheConfig( block_size=config.block_size, cache_dtype="auto", - swap_space=0, ) cache_config.num_gpu_blocks = max_num_blocks cache_config.num_cpu_blocks = 0 diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md index e7170babb6c9..a2079e70d7e8 100644 --- a/docs/design/attention_backends.md +++ b/docs/design/attention_backends.md @@ -206,7 +206,7 @@ configuration. 
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------| | `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x | | `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x | -| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x | +| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x | | `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x | | `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x | | `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x | diff --git a/docs/design/metrics.md b/docs/design/metrics.md index a977ce9b9bb2..b24ff64b6783 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -507,10 +507,10 @@ longer relevant in v1: - `vllm:num_requests_swapped` - `vllm:cpu_cache_usage_perc` -In this mode, when a request is preempted (e.g. to make room in KV -cache to complete other requests), we swap kv cache blocks out to CPU -memory. This is also known as "KV cache offloading" and is configured -with `--swap-space` and `--preemption-mode`. +In this mode, when a request was preempted (e.g. to make room in KV +cache to complete other requests), kv cache blocks were swapped out to +CPU memory. The `--swap-space` flag has been removed as this feature +is no longer used in V1. Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). 
The SequenceGroup encapsulated the idea of N Sequences which diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 967f3cfb6ddb..5ceea6228d9e 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -469,6 +469,8 @@ th { | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | | `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | | `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ | +| `SarvamMoEForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-30b-a3b`, etc. | ✅︎ | ✅︎ | +| `SarvamMLAForCausalLM` | Sarvam 2 | `sarvamai/sarvam2-105b-a9b`, etc. | | ✅︎ | | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | diff --git a/docs/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md index 4b838cbcaa9d..3d669f169e01 100644 --- a/docs/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -17,7 +17,7 @@ llm = Vllm( model="microsoft/Orca-2-7b", tensor_parallel_size=4, max_new_tokens=100, - vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, + vllm_kwargs={"gpu_memory_utilization": 0.5}, ) ``` diff --git a/tests/compile/test_graph_partition.py b/tests/compile/test_graph_partition.py index 6d1e2daf989b..9aa11dbe2ca4 100644 --- a/tests/compile/test_graph_partition.py +++ b/tests/compile/test_graph_partition.py @@ -184,3 +184,56 @@ def model_fn(x: torch.Tensor) -> torch.Tensor: assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [ "call_function" ] + ["output"] + + +def test_empty_only_partition_is_merged(): + """ + Test that an empty-allocation-only partition is merged into its previous + partition during Dynamo FX 
splitting. + """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + y = torch.sin(x) + out = torch.empty_like(y) + torch.ops.aten.cos.out(y, out=out) + return out + + x = torch.randn(4, 3) + gm = make_fx(model_fn)(x) + + split_ops = ["aten::sin", "aten::cos.out"] + split_gm, split_items = split_graph(gm, split_ops) + + # Without the merge, this graph is split into 3 partitions where the + # middle partition contains only aten::empty_like. + assert len(split_items) == 2, "Empty-only partition should be merged" + + output_original = gm(x) + output_split = split_gm(x) + assert torch.allclose(output_original, output_split), "Output mismatch after split" + + +def test_builtin_empty_only_partition_is_merged(): + """ + In Dynamo graphs, torch.empty/empty_like may appear as builtin call targets + (not aten OpOverload). Ensure empty-only partitions are still merged. + """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + out1 = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out1) + out2 = torch.empty_like(x) + torch.ops.silly.attention(out1, out1, out1, out2) + return out2 + + gm = torch.fx.symbolic_trace(model_fn) + split_gm, split_items = split_graph(gm, ["silly::attention"]) + + # Without the empty-only merge, this graph creates 4 partitions: + # [empty_like], [attention], [empty_like], [attention]. 
+ assert len(split_items) == 3, "Builtin empty-only partition should be merged" + + x = torch.randn(2, 3, device="cuda") + output_original = gm(x) + output_split = split_gm(x) + assert torch.allclose(output_original, output_split), "Output mismatch after split" diff --git a/tests/conftest.py b/tests/conftest.py index 1e9d46d3c169..4b907b7dd760 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -794,7 +794,6 @@ def __init__( tensor_parallel_size: int = 1, block_size: int = 16 if not torch.xpu.is_available() else 64, enable_chunked_prefill: bool | None = False, - swap_space: int = 4, enforce_eager: bool | None = False, # Set this to avoid hanging issue default_torch_num_threads: int | None = None, @@ -831,7 +830,6 @@ def __init__( trust_remote_code=trust_remote_code, dtype=dtype, seed=seed, - swap_space=swap_space, enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index f415409d7b37..8c9898ca20f3 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -22,7 +22,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# set different `gpu_memory_utilization` for different ranks, # to test if all ranks agree on the same kv cache configuration. 
llm = LLM( model="facebook/opt-125m", @@ -30,7 +30,6 @@ pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)), distributed_executor_backend="external_launcher", gpu_memory_utilization=random.uniform(0.7, 0.9), - swap_space=random.randint(1, 4), seed=0, ) diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py index 1aa7f1793570..a6298d1b6739 100644 --- a/tests/distributed/test_torchrun_example_moe.py +++ b/tests/distributed/test_torchrun_example_moe.py @@ -28,7 +28,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# set different `gpu_memory_utilization` for different ranks, # to test if all ranks agree on the same kv cache configuration. llm = LLM( model="microsoft/Phi-mini-MoE-instruct", @@ -37,7 +37,6 @@ enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1, distributed_executor_backend="external_launcher", gpu_memory_utilization=random.uniform(0.7, 0.9), - swap_space=random.randint(1, 4), seed=0, ) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index cbab74145433..58742f186851 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -6,7 +6,7 @@ import pytest -from ...utils import RemoteOpenAIServer +from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer from .conftest import add_attention_backend MISTRAL_FORMAT_ARGS = [ @@ -19,12 +19,55 @@ ] +async def transcribe_and_check( + client, + model_name: str, + file, + *, + language: str, + expected_text: str, + expected_seconds: int | None = None, + case_sensitive: bool = False, +): + """Run a transcription request and assert the output contains + *expected_text* and optionally that usage reports *expected_seconds*. 
+ + Provides detailed failure messages with the actual transcription output. + """ + transcription = await client.audio.transcriptions.create( + model=model_name, + file=file, + language=language, + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + + if case_sensitive: + assert expected_text in out_text, ( + f"Expected {expected_text!r} in transcription output, got: {out_text!r}" + ) + else: + assert expected_text.lower() in out_text.lower(), ( + f"Expected {expected_text!r} (case-insensitive) in transcription " + f"output, got: {out_text!r}" + ) + + if expected_seconds is not None: + assert out_usage["seconds"] == expected_seconds, ( + f"Expected {expected_seconds}s of audio, " + f"got {out_usage['seconds']}s. Full usage: {out_usage!r}" + ) + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"] ) async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): - server_args = ["--enforce-eager"] + server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS] if model_name.startswith("mistralai"): server_args += MISTRAL_FORMAT_ARGS @@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): add_attention_backend(server_args, rocm_aiter_fa_attention) # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. 
- with RemoteOpenAIServer(model_name, server_args) as remote_server: + with RemoteOpenAIServer( + model_name, server_args, env_dict=ROCM_ENV_OVERRIDES + ) as remote_server: client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, + await transcribe_and_check( + client, + model_name, + mary_had_lamb, language="en", - response_format="text", - temperature=0.0, + expected_text="Mary had a little lamb", + expected_seconds=16, ) - out = json.loads(transcription) - out_text = out["text"] - out_usage = out["usage"] - assert "Mary had a little lamb" in out_text - assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio @@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): add_attention_backend(server_args, rocm_aiter_fa_attention) # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. - with RemoteOpenAIServer(model_name, server_args) as remote_server: + with RemoteOpenAIServer( + model_name, server_args, env_dict=ROCM_ENV_OVERRIDES + ) as remote_server: client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=lora_model_name, - file=mary_had_lamb, + await transcribe_and_check( + client, + lora_model_name, + mary_had_lamb, language="en", - response_format="text", - temperature=0.0, + expected_text="mary had a little lamb", + expected_seconds=16, ) - out = json.loads(transcription) - out_text = out["text"] - out_usage = out["usage"] - assert "mary had a little lamb" in out_text - assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio @@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name): # Gemma accuracy on some of the audio samples we use is particularly bad, # hence we use a 
different one here. WER is evaluated separately. - server_args = ["--enforce-eager"] + server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS] add_attention_backend(server_args, rocm_aiter_fa_attention) with RemoteOpenAIServer( - model_name, server_args, max_wait_seconds=480 + model_name, + server_args, + max_wait_seconds=480, + env_dict=ROCM_ENV_OVERRIDES, ) as remote_server: client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=foscolo, + await transcribe_and_check( + client, + model_name, + foscolo, language="it", - response_format="text", - temperature=0.0, + expected_text="ove il mio corpo fanciulletto giacque", ) - out = json.loads(transcription)["text"] - assert "ove il mio corpo fanciulletto giacque" in out diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 6c5a08ae2f91..c0d8b0532830 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -12,7 +12,7 @@ from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.platforms import current_platform -from ...utils import RemoteOpenAIServer +from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 @@ -48,10 +48,37 @@ def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bo All term groups must be satisfied. """ content_lower = content.lower() - for group in term_groups: - if not any(term.lower() in content_lower for term in group): - return False - return True + return all( + any(term.lower() in content_lower for term in group) for group in term_groups + ) + + +def assert_non_empty_content(chat_completion, *, context: str = "") -> str: + """Assert the first choice has non-empty string content; return it. 
+ + Provides a detailed failure message including the full ChatCompletion + response so flaky / model-quality issues are easy to diagnose. + """ + prefix = f"[{context}] " if context else "" + choice = chat_completion.choices[0] + content = choice.message.content + + assert content is not None, ( + f"{prefix}Expected non-None content but got None. " + f"finish_reason={choice.finish_reason!r}, " + f"full message={choice.message!r}, " + f"usage={chat_completion.usage!r}" + ) + assert isinstance(content, str), ( + f"{prefix}Expected str content, got {type(content).__name__}: {content!r}" + ) + assert len(content) > 0, ( + f"{prefix}Expected non-empty content but got empty string. " + f"finish_reason={choice.finish_reason!r}, " + f"full message={choice.message!r}, " + f"usage={chat_completion.usage!r}" + ) + return content @pytest.fixture(scope="module") @@ -67,16 +94,22 @@ def server(): "--trust-remote-code", "--limit-mm-per-prompt", json.dumps({"image": MAXIMUM_IMAGES}), + *ROCM_EXTRA_ARGS, ] # ROCm: Increase timeouts to handle potential network delays and slower # video processing when downloading multiple videos from external sources - env_overrides = {} - if current_platform.is_rocm(): - env_overrides = { - "VLLM_VIDEO_FETCH_TIMEOUT": "120", - "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", - } + env_overrides = { + **ROCM_ENV_OVERRIDES, + **( + { + "VLLM_VIDEO_FETCH_TIMEOUT": "120", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", + } + if current_platform.is_rocm() + else {} + ), + } with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @@ -117,6 +150,51 @@ def dummy_messages_from_image_url( ] +def describe_image_messages( + image_url: str, *, extra_image_fields: dict | None = None +) -> list[dict]: + """Build the system + user messages used by the completions-with-image + family of tests. 
*extra_image_fields* is merged into the top-level + image content block (for uuid / bad-key tests).""" + image_block: dict = { + "type": "image_url", + "image_url": {"url": image_url}, + } + if extra_image_fields: + image_block.update(extra_image_fields) + + return [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image."}, + image_block, + ], + }, + ] + + +async def complete_and_check( + client: openai.AsyncOpenAI, + model_name: str, + messages: list[dict], + *, + context: str, + max_completion_tokens: int = 50, + temperature: float = 0.0, +) -> str: + """Run a chat completion and assert the output is non-empty. + Returns the content string.""" + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=temperature, + ) + return assert_non_empty_content(chat_completion, context=context) + + def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 @@ -153,7 +231,6 @@ async def test_single_chat_session_image( messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 - # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -162,32 +239,46 @@ async def test_single_chat_session_image( temperature=0.0, top_logprobs=5, ) - assert len(chat_completion.choices) == 1 + assert len(chat_completion.choices) == 1, ( + f"Expected 1 choice, got {len(chat_completion.choices)}" + ) choice = chat_completion.choices[0] - assert choice.finish_reason == "length" + assert choice.finish_reason == "length", ( + f"Expected finish_reason='length' (capped at {max_completion_tokens} " + f"tokens), got {choice.finish_reason!r}. 
" + f"content={choice.message.content!r}" + ) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) - assert chat_completion.usage == openai.types.CompletionUsage( + expected_usage = openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, total_tokens=hf_prompt_tokens + max_completion_tokens, ) + assert chat_completion.usage == expected_usage, ( + f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}" + ) message = choice.message - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 10 - assert message.role == "assistant" + assert message.content is not None and len(message.content) >= 10, ( + f"Expected content with >=10 chars, got {message.content!r}" + ) + assert message.role == "assistant", ( + f"Expected role='assistant', got {message.role!r}" + ) + messages.append({"role": "assistant", "content": message.content}) # test multi-turn dialogue messages.append({"role": "user", "content": "express your result in json"}) - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, + await complete_and_check( + client, + model_name, + messages, + context=f"multi-turn follow-up for {image_url}", max_completion_tokens=10, ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 @pytest.mark.asyncio @@ -209,7 +300,7 @@ async def test_error_on_invalid_image_url_type( # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create( + await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, @@ -235,10 +326,15 @@ async def test_single_chat_session_image_beamsearch( top_logprobs=5, extra_body=dict(use_beam_search=True), ) - assert len(chat_completion.choices) == 2 - assert ( - 
chat_completion.choices[0].message.content - != chat_completion.choices[1].message.content + assert len(chat_completion.choices) == 2, ( + f"Expected 2 beam search choices, got {len(chat_completion.choices)}" + ) + + content_0 = chat_completion.choices[0].message.content + content_1 = chat_completion.choices[1].message.content + assert content_0 != content_1, ( + f"Beam search should produce different outputs for {image_url}, " + f"but both returned: {content_0!r}" ) @@ -269,33 +365,46 @@ async def test_single_chat_session_image_base64encoded( temperature=0.0, top_logprobs=5, ) - assert len(chat_completion.choices) == 1 + assert len(chat_completion.choices) == 1, ( + f"Expected 1 choice, got {len(chat_completion.choices)}" + ) choice = chat_completion.choices[0] - assert choice.finish_reason == "length" + assert choice.finish_reason == "length", ( + f"Expected finish_reason='length', got {choice.finish_reason!r}. " + f"content={choice.message.content!r}" + ) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) - assert chat_completion.usage == openai.types.CompletionUsage( + expected_usage = openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, total_tokens=hf_prompt_tokens + max_completion_tokens, ) + assert chat_completion.usage == expected_usage, ( + f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}" + ) message = choice.message - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 10 - assert message.role == "assistant" + assert message.content is not None and len(message.content) >= 10, ( + f"Expected content with >=10 chars, got {message.content!r}" + ) + assert message.role == "assistant", ( + f"Expected role='assistant', got {message.role!r}" + ) + messages.append({"role": "assistant", "content": message.content}) # test multi-turn dialogue messages.append({"role": "user", "content": "express your 
result in json"}) - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, + await complete_and_check( + client, + model_name, + messages, + context=f"multi-turn base64 follow-up for {raw_image_url}", max_completion_tokens=10, temperature=0.0, ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 @pytest.mark.asyncio @@ -321,7 +430,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch( temperature=0.0, extra_body=dict(use_beam_search=True), ) - assert len(chat_completion.choices) == 2 + assert len(chat_completion.choices) == 2, ( + f"Expected 2 beam search choices for image {image_idx} " + f"({raw_image_url}), got {len(chat_completion.choices)}" + ) # Verify beam search produces two different non-empty outputs content_0 = chat_completion.choices[0].message.content @@ -333,18 +445,28 @@ async def test_single_chat_session_image_base64encoded_beamsearch( f"Output 0: {content_0!r}, Output 1: {content_1!r}" ) - assert content_0, "First beam search output should not be empty" - assert content_1, "Second beam search output should not be empty" - assert content_0 != content_1, "Beam search should produce different outputs" + assert content_0, ( + f"First beam output is empty for image {image_idx} ({raw_image_url}). " + f"finish_reason={chat_completion.choices[0].finish_reason!r}" + ) + assert content_1, ( + f"Second beam output is empty for image {image_idx} " + f"({raw_image_url}). " + f"finish_reason={chat_completion.choices[1].finish_reason!r}" + ) + assert content_0 != content_1, ( + f"Beam search produced identical outputs for image {image_idx} " + f"({raw_image_url}): {content_0!r}" + ) # Verify each output contains the required terms for this image for i, content in enumerate([content_0, content_1]): - if not check_output_matches_terms(content, required_terms): - pytest.fail( - f"Output {i} '{content}' doesn't contain required terms. 
" - f"Expected all of these term groups (at least one from each): " - f"{required_terms}" - ) + assert check_output_matches_terms(content, required_terms), ( + f"Beam output {i} for image {image_idx} ({raw_image_url}) " + f"doesn't match required terms.\n" + f" content: {content!r}\n" + f" required (all groups, >=1 per group): {required_terms}" + ) @pytest.mark.asyncio @@ -378,16 +500,29 @@ async def test_chat_streaming_image( async for chunk in stream: delta = chunk.choices[0].delta if delta.role: - assert delta.role == "assistant" + assert delta.role == "assistant", ( + f"Expected role='assistant' in stream delta, got {delta.role!r}" + ) if delta.content: chunks.append(delta.content) if chunk.choices[0].finish_reason is not None: finish_reason_count += 1 # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == stop_reason - assert delta.content - assert "".join(chunks) == output + assert finish_reason_count == 1, ( + f"Expected exactly 1 finish_reason across stream chunks, " + f"got {finish_reason_count}" + ) + assert chunk.choices[0].finish_reason == stop_reason, ( + f"Stream finish_reason={chunk.choices[0].finish_reason!r} " + f"doesn't match non-stream finish_reason={stop_reason!r}" + ) + + streamed_text = "".join(chunks) + assert streamed_text == output, ( + f"Streamed output doesn't match non-streamed for {image_url}.\n" + f" streamed: {streamed_text!r}\n" + f" non-streamed: {output!r}" + ) @pytest.mark.asyncio @@ -418,17 +553,19 @@ async def test_multi_image_input( max_tokens=5, temperature=0.0, ) - completion = completion.choices[0].text - assert completion is not None and len(completion) >= 0 + assert completion.choices[0].text is not None, ( + "Server failed to produce output after rejecting over-limit " + "multi-image request" + ) else: - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, + await complete_and_check( + client, + model_name, 
+ messages, + context=f"multi-image input ({len(image_urls)} images)", max_completion_tokens=10, temperature=0.0, ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 @pytest.mark.asyncio @@ -444,30 +581,13 @@ async def test_completions_with_image( image_urls: list[str], ): for image_url in image_urls: - chat_completion = await client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this image.", - }, - { - "type": "image_url", - "image_url": { - "url": image_url, - }, - }, - ], - }, - ], - model=model_name, + messages = describe_image_messages(image_url) + await complete_and_check( + client, + model_name, + messages, + context=f"completions_with_image url={image_url}", ) - assert chat_completion.choices[0].message.content is not None - assert isinstance(chat_completion.choices[0].message.content, str) - assert len(chat_completion.choices[0].message.content) > 0 @pytest.mark.asyncio @@ -483,54 +603,33 @@ async def test_completions_with_image_with_uuid( image_urls: list[str], ): for image_url in image_urls: - chat_completion = await client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this image.", - }, - { - "type": "image_url", - "image_url": { - "url": image_url, - }, - "uuid": image_url, - }, - ], - }, - ], - model=model_name, + messages = describe_image_messages( + image_url, + extra_image_fields={"uuid": image_url}, ) - assert chat_completion.choices[0].message.content is not None - assert isinstance(chat_completion.choices[0].message.content, str) - assert len(chat_completion.choices[0].message.content) > 0 - - # Second request, with empty image but the same uuid. 
- chat_completion_with_empty_image = await client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this image.", - }, - {"type": "image_url", "image_url": {}, "uuid": image_url}, - ], - }, - ], - model=model_name, + await complete_and_check( + client, + model_name, + messages, + context=f"uuid first request url={image_url}", ) - assert chat_completion_with_empty_image.choices[0].message.content is not None - assert isinstance( - chat_completion_with_empty_image.choices[0].message.content, str + + cached_messages: list[dict] = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image."}, + {"type": "image_url", "image_url": {}, "uuid": image_url}, + ], + }, + ] + await complete_and_check( + client, + model_name, + cached_messages, + context=f"uuid cached (empty image) uuid={image_url}", ) - assert len(chat_completion_with_empty_image.choices[0].message.content) > 0 @pytest.mark.asyncio @@ -540,16 +639,13 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( model_name: str, ): with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create( + await client.chat.completions.create( messages=[ {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", "content": [ - { - "type": "text", - "text": "Describe this image.", - }, + {"type": "text", "text": "Describe this image."}, { "type": "image_url", "image_url": {}, @@ -575,29 +671,18 @@ async def test_completions_with_image_with_incorrect_uuid_format( image_urls: list[str], ): for image_url in image_urls: - chat_completion = await client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this image.", - 
}, - { - "type": "image_url", - "image_url": { - "url": image_url, - "incorrect_uuid_key": image_url, - }, - "also_incorrect_uuid_key": image_url, - }, - ], - }, - ], - model=model_name, + messages = describe_image_messages( + image_url, + extra_image_fields={ + "also_incorrect_uuid_key": image_url, + }, + ) + # Inject the bad key inside image_url dict too + messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url + + await complete_and_check( + client, + model_name, + messages, + context=f"incorrect uuid format url={image_url}", ) - assert chat_completion.choices[0].message.content is not None - assert isinstance(chat_completion.choices[0].message.content, str) - assert len(chat_completion.choices[0].message.content) > 0 diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index c9b2b85f004a..73502932dba1 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -20,6 +20,8 @@ current_platform.is_cuda() and current_platform.is_device_capability_family(100) ) +TRTLLM_GEN_MXFP8_AVAILABLE = TRTLLM_GEN_MXFP4_AVAILABLE + HOPPER_MXFP4_BF16_AVAILABLE = ( current_platform.is_cuda() and current_platform.is_device_capability(90) @@ -34,9 +36,15 @@ shuffle_matrix_a, shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe, + trtllm_fp8_block_scale_moe, ) from flashinfer.fp4_quantization import nvfp4_block_scale_interleave - from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache + +if TRTLLM_GEN_MXFP8_AVAILABLE: + from flashinfer.fused_moe.core import ( + Fp8QuantizationType, + get_w2_permute_indices_with_cache, + ) @dataclass @@ -160,6 +168,7 @@ def reference_moe( beta, limit, act_type, + is_gated, ): # renormalize routing experts = torch.topk(roouting_logits, k=topk, dim=-1, sorted=True) @@ -170,7 +179,12 @@ def reference_moe( mlp1_weight = w13[expert_indices, ...] mlp1_bias = bias13[expert_indices, ...] 
t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias - t = swiglu(t, alpha=alpha, beta=beta, limit=limit) + if is_gated: + t = swiglu(t, alpha=alpha, beta=beta, limit=limit) + else: + # RELU2_NO_MUL: relu(x)^2 + t = torch.relu(t) + t = t * t if act_type == "mxfp8": t_quantized, t_scale = mxfp8_quantize( @@ -569,6 +583,7 @@ def test_trtllm_gen_mxfp4_fused_moe( beta, limit, act_type, + is_gated=True, ) ref_result[start_idx:end_idx].copy_(chunk_result) @@ -705,6 +720,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe( beta, limit, "bf16", + is_gated=True, ) from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe @@ -890,6 +906,7 @@ def dequant_mxfp4_batches(mat_fp4: torch.Tensor, scale_tensor: torch.Tensor): beta, limit, "mxfp8", + is_gated=True, ) # Prepare inputs for FlashInfer CUTLASS fused MoE @@ -965,3 +982,169 @@ def dequant_mxfp4_batches(mat_fp4: torch.Tensor, scale_tensor: torch.Tensor): # Allow some mismatch due to MXFP4 quantization check_accuracy(ref, out, atol=0, rtol=0.3, percent=0.8) + + +@pytest.mark.parametrize("topk", [1, 4]) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("num_tokens", [1, 128]) +@pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)]) +@pytest.mark.parametrize("is_gated", [True], ids=["gated"]) +@pytest.mark.skipif( + not TRTLLM_GEN_MXFP8_AVAILABLE, + reason="nvidia gpu and compute capability sm100 is required for this test", +) +def test_trtllm_gen_mxfp8_block_scale_moe( + topk: int, + num_experts: int, + num_tokens: int, + intermediate_size: int, + hidden_size: int, + is_gated: bool, +): + torch.manual_seed(42) + device = "cuda:0" + + inter_size = intermediate_size * (2 if is_gated else 1) + + hidden_states = ( + torch.randn(num_tokens, hidden_size, device=device, dtype=torch.bfloat16) / 20 + ) + w13 = ( + torch.randn( + num_experts, + inter_size, + hidden_size, + device=device, + dtype=torch.bfloat16, + ) + / 20 + ) + w2 = ( + torch.randn( + num_experts, + hidden_size, + 
intermediate_size, + device=device, + dtype=torch.bfloat16, + ) + / 20 + ) + router_logits = torch.rand( + num_tokens, num_experts, dtype=torch.float32, device=device + ) + router_logits_kernel = router_logits.to(torch.bfloat16) + + # Quantize weights to MXFP8 and normalize scales to [E, M, K//32]. + w13_q, w13_scale = mxfp8_quantize(w13, is_sf_swizzled_layout=False) + w2_q, w2_scale = mxfp8_quantize(w2, is_sf_swizzled_layout=False) + if w13_scale.ndim == 1: + w13_scale = w13_scale.view( + num_experts, + inter_size, + hidden_size // 32, + ) + if w2_scale.ndim == 1: + w2_scale = w2_scale.view(num_experts, hidden_size, intermediate_size // 32) + + # Quantize activations to MXFP8. + hidden_states_q, hidden_states_scale = mxfp8_quantize( + hidden_states, is_sf_swizzled_layout=False + ) + if hidden_states_scale.ndim == 1: + hidden_states_scale = hidden_states_scale.view(num_tokens, hidden_size // 32) + + # Reference output using dequantized tensors + MXFP8 intermediate quantization. + w13_ref = mxfp8_dequantize(w13_q, w13_scale).to(torch.float32) + w2_ref = mxfp8_dequantize(w2_q, w2_scale).to(torch.float32) + hidden_states_ref = mxfp8_dequantize(hidden_states_q, hidden_states_scale).to( + torch.float32 + ) + bias13 = torch.zeros( + num_experts, + intermediate_size * (2 if is_gated else 1), + device=device, + ) + bias2 = torch.zeros(num_experts, hidden_size, device=device) + ref = reference_moe( + router_logits_kernel.to(torch.float32), + topk, + num_experts, + hidden_states_ref, + w13_ref, + bias13, + w2_ref, + bias2, + alpha=1.0, + beta=0.0, + limit=None, + act_type="mxfp8", + is_gated=is_gated, + ) + + # Shuffle weights/scales with the same indexed layout used by TRTLLM kernels. 
+ epilogue_tile_m = 128 + gemm1_weights_shuffled = [] + gemm1_scales_shuffled = [] + gemm2_weights_shuffled = [] + gemm2_scales_shuffled = [] + for i in range(num_experts): + w13_rows = intermediate_size * (2 if is_gated else 1) + w13_interleaved = w13_q[i].clone().reshape(w13_rows, -1) + w13_scale_interleaved = w13_scale[i].clone().reshape(w13_rows, -1) + if is_gated: + w13_interleaved = reorder_rows_for_gated_act_gemm(w13_interleaved) + w13_scale_interleaved = reorder_rows_for_gated_act_gemm( + w13_scale_interleaved + ) + gemm1_weights_shuffled.append( + shuffle_matrix_a(w13_interleaved.view(torch.uint8), epilogue_tile_m) + .contiguous() + .view(w13_q.dtype) + ) + gemm2_weights_shuffled.append( + shuffle_matrix_a(w2_q[i].view(torch.uint8), epilogue_tile_m) + .contiguous() + .view(w2_q.dtype) + ) + + gemm1_scales_shuffled.append( + shuffle_matrix_sf_a( + w13_scale_interleaved.view(torch.uint8).reshape(w13_rows, -1), + epilogue_tile_m, + ) + .contiguous() + .view(w13_scale.dtype) + ) + gemm2_scales_shuffled.append( + shuffle_matrix_sf_a( + w2_scale[i].view(torch.uint8).reshape(hidden_size, -1), epilogue_tile_m + ) + .contiguous() + .view(w2_scale.dtype) + ) + + out = trtllm_fp8_block_scale_moe( + routing_logits=router_logits_kernel, + routing_bias=None, + hidden_states=hidden_states_q, + hidden_states_scale=hidden_states_scale, + gemm1_weights=torch.stack(gemm1_weights_shuffled), + gemm1_weights_scale=torch.stack(gemm1_scales_shuffled), + gemm2_weights=torch.stack(gemm2_weights_shuffled), + gemm2_weights_scale=torch.stack(gemm2_scales_shuffled), + num_experts=num_experts, + top_k=topk, + n_group=None, + topk_group=None, + intermediate_size=intermediate_size, + local_expert_offset=0, + local_num_experts=num_experts, + routed_scaling_factor=None, + routing_method_type=1, # renormalize routing + use_shuffled_weight=True, + weight_layout=0, # MajorK + fp8_quantization_type=Fp8QuantizationType.MxFp8, + ) + + # Block-scale MXFP8 kernels are approximate; require majority 
close. + check_accuracy(ref, out, atol=0.1, rtol=0.85, percent=0.8) diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index 5e38638b9b6f..492716b46451 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -3,6 +3,7 @@ import shutil +from collections.abc import Sequence import pytest import torch @@ -15,7 +16,7 @@ MODEL_PATH = "allenai/OLMoE-1B-7B-0125-Instruct" -PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. +PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me. Do not return any additional explanation. Below is an instruction that describes a task, Write a response that appropriately completes the request. " ##Instruction: candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key. 
@@ -39,10 +40,20 @@ "SELECT COUNT(Candidate_ID) FROM candidate", "SELECT COUNT(Candidate_ID) FROM candidate", "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID", # noqa: E501 - "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1", # noqa: E501 + # There are multiple acceptable responses + ( + "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1", # noqa: E501 + "SELECT Candidate_ID, Poll_Source FROM candidate WHERE COUNT(People_ID) = (SELECT COUNT(People_ID) FROM people) ORDER BY Candidate_ID DESC LIMIT 1", # noqa: E501 + ), ] +def _output_matches(generated: str, accepted: str | Sequence[str]) -> bool: + if isinstance(accepted, str): + accepted = (accepted,) + return any(generated.startswith(s) for s in accepted) + + def generate_and_test( llm: vllm.LLM, lora_path: str, @@ -90,9 +101,13 @@ def generate_and_test( if compare_lower: generated_text = generated_text.lower() - expected_output = expected_output.lower() - - assert generated_text.startswith(expected_output) + if isinstance(expected_output, str): + expected_output = (expected_output.lower(),) + else: + expected_output = tuple(s.lower() for s in expected_output) + assert _output_matches(generated_text, expected_output), ( + f"Output {i}: {generated_text!r} does not match any of {expected_output!r}" + ) def test_olmoe_lora(olmoe_lora_files): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 274142e8d66e..4af3ccf893ff 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -64,7 +64,6 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): device_config=DeviceConfig("cuda"), cache_config=CacheConfig( block_size=16, - swap_space=0, cache_dtype="auto", ), lora_config=LoRAConfig( diff --git 
a/tests/models/quantization/test_gpt_oss.py b/tests/models/quantization/test_gpt_oss.py index 7599a5a5ee4c..21cc9555bfde 100644 --- a/tests/models/quantization/test_gpt_oss.py +++ b/tests/models/quantization/test_gpt_oss.py @@ -21,6 +21,7 @@ import pytest from packaging import version +from vllm.platforms.rocm import on_gfx950 from vllm.utils.torch_utils import cuda_device_count_stateless MODEL_ACCURACIES = { @@ -83,11 +84,17 @@ def get_model_args(self, tp_size: int): @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) @pytest.mark.parametrize("model_name, expected_accuracy", MODEL_ACCURACIES.items()) def test_gpt_oss_attention_quantization( - model_name: str, tp_size: int, expected_accuracy: float + model_name: str, + tp_size: int, + expected_accuracy: float, + monkeypatch: pytest.MonkeyPatch, ): if tp_size > cuda_device_count_stateless(): pytest.skip("Not enough GPUs to run this test case") + if "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8" in model_name and on_gfx950(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + model_args = EvaluationConfig(model_name).get_model_args(tp_size) extra_run_kwargs = { diff --git a/tests/models/registry.py b/tests/models/registry.py index 40c4d0d311bc..48e5c251d7a6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -480,6 +480,18 @@ def check_available_online( min_transformers_version="4.56.3", ), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), + "SarvamMoEForCausalLM": _HfExamplesInfo( + "sarvamai/sarvam-30b", + trust_remote_code=True, + max_model_len=4096, + is_available_online=True, + ), + "SarvamMLAForCausalLM": _HfExamplesInfo( + "sarvamai/sarvam-105b", + trust_remote_code=True, + max_model_len=4096, + is_available_online=True, + ), "SeedOssForCausalLM": _HfExamplesInfo( "ByteDance-Seed/Seed-OSS-36B-Instruct", trust_remote_code=True, diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 32c0b9064275..86efefc3740f 100644 --- 
a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -327,6 +327,12 @@ def __init__( self._k_scale_float = 1.0 self._v_scale_float = 1.0 + self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8( + static=True, + group_shape=GroupShape.PER_TENSOR, + compile_native=True, + ) + def forward_impl( self, q: torch.Tensor, @@ -338,6 +344,7 @@ def forward_impl( ) -> torch.Tensor: """Forward for sparse MLA - uses forward_mqa for all tokens.""" kv_cache_dtype = getattr(self.impl, "kv_cache_dtype", "auto") + fp8_attention = kv_cache_dtype.startswith("fp8") # Write to KV cache if kv_cache.numel() > 0: @@ -350,6 +357,9 @@ def forward_impl( scale=self._k_scale, ) + if fp8_attention and kv_cache_dtype != "fp8_ds_mla": + kv_cache = kv_cache.view(current_platform.fp8_dtype()) + num_tokens = q.shape[0] # Sparse MLA uses forward_mqa for all tokens @@ -367,8 +377,14 @@ def forward_impl( # Convert from (N, B, L) to (B, N, L) mqa_ql_nope = mqa_ql_nope.transpose(0, 1) - # Pass as tuple to forward_mqa - mqa_q = (mqa_ql_nope, mqa_q_pe) + if fp8_attention and self.impl.supports_quant_query_input: + assert mqa_ql_nope.shape[0] == mqa_q_pe.shape[0] + assert mqa_ql_nope.shape[1] == mqa_q_pe.shape[1] + mqa_q = self._decode_concat_quant_fp8_op( + mqa_ql_nope, mqa_q_pe, self._q_scale + ) + else: + mqa_q = (mqa_ql_nope, mqa_q_pe) attn_out, _ = self.impl.forward_mqa(mqa_q, kv_cache, attn_metadata, self) diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 86cefa036b40..0fd0ba6fab0d 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -191,6 +191,16 @@ def test_sparse_backend_decode_correctness( if kv_cache_dtype not in backend_cls.supported_kv_cache_dtypes: pytest.skip(f"{backend_cls.get_name()} does not support {kv_cache_dtype}") + if ( + backend_cls == FlashMLASparseBackend + and kv_cache_dtype.startswith("fp8") + and kv_cache_dtype != 
"fp8_ds_mla" + ): + pytest.skip( + "FlashMLA Sparse Attention backend fp8 only supports " + "fp8_ds_mla kv-cache dtype" + ) + supported_block_sizes = backend_cls.get_supported_kernel_block_sizes() if block_size not in supported_block_sizes: pytest.skip( @@ -419,7 +429,7 @@ def test_sparse_backend_decode_correctness( num_blocks=vllm_config.cache_config.num_gpu_blocks, common_attn_metadata=common_attn_metadata, randomize_blocks=False, - kv_cache_dtype=kv_cache_dtype if use_fp8_ds_mla_quantization else "auto", + kv_cache_dtype=kv_cache_dtype, scale=kv_cache_scale, ) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 3cff52929146..91decf6658a5 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -182,7 +182,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, cache_dtype="auto", - swap_space=0, ) # Set cache blocks for testing # (these may be set during initialization normally) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 24edfadb9b53..bbeca6ef7dba 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1776,7 +1776,6 @@ def create_scheduler_with_priority( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) @@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=False, ) diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 90c174adf8c8..92122bcb0ba4 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -94,7 +94,6 @@ def create_scheduler( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) diff --git a/tests/v1/e2e/test_async_scheduling.py 
b/tests/v1/e2e/test_async_scheduling.py index 042e953866cf..c703d6aae9f9 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -98,7 +98,7 @@ def test_without_spec_decoding( @single_gpu_only @large_gpu_mark(min_gb=16) -def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch): +def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch): """Test consistency and acceptance rates with some different combos of preemption, executor, async scheduling, prefill chunking, spec decoding model length. @@ -154,6 +154,42 @@ def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch) ) +def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch): + """Test ngram_gpu speculative decoding with different configurations. + + This test specifically validates ngram_gpu behavior with various: + - Number of speculative tokens (2-6) + - Prompt lookup window sizes (min/max) + - Async scheduling enabled (as in production) + - Different executors and chunking settings + """ + + # Variant with larger speculation window + ngram_gpu_config = { + "method": "ngram_gpu", + "num_speculative_tokens": 3, + "prompt_lookup_max": 3, + "prompt_lookup_min": 2, + } + + # Test configurations covering various scenarios + # test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking + test_configs = [ + (False, "mp", False, None, False), + (False, "mp", False, ngram_gpu_config, False), + (True, "mp", False, ngram_gpu_config, True), + (False, "mp", True, ngram_gpu_config, False), + (True, "mp", True, ngram_gpu_config, False), + (True, "uni", True, ngram_gpu_config, False), + (True, "mp", True, ngram_gpu_config, True), + ] + + # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight + # and ngram_gpu doesn't require a specific draft model + run_tests(monkeypatch, MODEL, test_configs, [{}]) + + @dynamo_config.patch(cache_size_limit=16) def run_tests( 
monkeypatch: pytest.MonkeyPatch, @@ -282,11 +318,12 @@ def run_test( else dict(gpu_memory_utilization=0.9) ) spec_mml = (spec_config or {}).get("max_model_len") + spec_method = (spec_config or {}).get("method", "none") test_config = ( f"executor={executor}, preemption={test_preemption}, " f"async_sched={async_scheduling}, " f"chunk_prefill={test_prefill_chunking}, " - f"spec_decoding={spec_decoding}, spec_mml={spec_mml}" + f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}" ) print("-" * 80) print(f"---- TESTING {test_str}: {test_config}") @@ -294,7 +331,7 @@ def run_test( with VllmRunner( model, - max_model_len=512, + max_model_len=4096, enable_chunked_prefill=test_prefill_chunking, # Force prefill chunking max_num_batched_tokens=48 if test_prefill_chunking else None, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 4066dfe9e34d..3988070ca759 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness( cleanup_dist_env_and_memory() +@pytest.mark.parametrize("async_scheduling", [True], ids=["async"]) +@single_gpu_only +@large_gpu_mark(min_gb=20) +def test_ngram_gpu_default_with_async_scheduling( + async_scheduling: bool, +): + """ + Test ngram_gpu speculative decoding (k=3) correctness with and without + async scheduling, validated via GSM8K accuracy. + Uses Qwen/Qwen3-8B (ref GSM8K accuracy: 87%-92%). 
+ """ + qwen3_model = "Qwen/Qwen3-8B" + spec_llm = LLM( + model=qwen3_model, + speculative_config={ + "method": "ngram_gpu", + "prompt_lookup_max": 3, + "prompt_lookup_min": 2, + "num_speculative_tokens": 2, + }, + max_model_len=4096, + async_scheduling=async_scheduling, + ) + evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8) + del spec_llm + cleanup_dist_env_and_memory() + + @single_gpu_only @large_gpu_mark(min_gb=20) def test_suffix_decoding_acceptance( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8d7377c286ac..ae674919ae91 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache( cache_config = CacheConfig( block_size=16, gpu_memory_utilization=gpu_memory_utilization, - swap_space=0, cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index c35f4bfe8890..684e2ec4d7b9 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -56,24 +56,27 @@ run_tests() { echo "✅ All ${label} tests passed!" 
} -# Run tests +# Set backend +label="default backend" +cmdline_args="" if [[ -n "${ROCM_ATTN:-}" ]]; then echo "ROCM_ATTN is set, running with --attention-backend ROCM_ATTN" - run_tests "ROCM_ATTN backend" "--attention-backend ROCM_ATTN" -else - run_tests "default backend" "" -fi - -# Check if FLASHINFER is set (non-empty) -if [[ -n "${FLASHINFER:-}" ]]; then - echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER" - run_tests "FLASHINFER backend" "--attention-backend FLASHINFER" + label="ROCM_ATTN backend" + cmdline_args=" --attention-backend ROCM_ATTN " +elif [[ -n "${FLASHINFER:-}" ]]; then + echo "FLASHINFER is set, running with --attention-backend FLASHINFER" + label="FLASHINFER backend" + cmdline_args=" --attention-backend FLASHINFER " else - echo "FLASHINFER not set, skipping FLASHINFER runs." + echo "running with default attention backend" fi # Check if cross-layers is enabled (non-empty) if [[ -n "${CROSS_LAYERS_BLOCKS:-}" ]]; then - echo "CROSS_LAYERS_BLOCKS is set, rerunning with --enable-cross-layers" - run_tests "default backend" "--enable-cross-layers" + echo "CROSS_LAYERS_BLOCKS is set, running with --enable-cross-layers" + label+=" - CROSS_LAYERS_BLOCKS enabled" + cmdline_args+=" --enable-cross-layers " fi + +# Run tests +run_tests "${label}" "${cmdline_args}" diff --git a/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh new file mode 100644 index 000000000000..79863123b729 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh @@ -0,0 +1,174 @@ +#!/bin/bash +set -e + +# Hosts / ports +PREFILL_HOST=${PREFILL_HOST:-"localhost"} +PREFILL_PORT=${PREFILL_PORT:-8100} +PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577} +DECODE_HOST=${DECODE_HOST:-"localhost"} +DECODE_PORT=${DECODE_PORT:-8200} +PROXY_HOST=${PROXY_HOST:-"localhost"} +PROXY_PORT=${PROXY_PORT:-8192} 
+BASELINE_HOST=${BASELINE_HOST:-"localhost"} +BASELINE_PORT=${BASELINE_PORT:-9290} + +# Model to run. +MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen3-0.6B"} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024} +BLOCK_SIZE=${BLOCK_SIZE:-64} +PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} +KV_BUFFER_DEVICE=${KV_BUFFER_DEVICE:-"xpu"} +GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.8} + +generate_affinity_mask() { + local count=$1 + local start=${2:-0} + local mask="" + local i + + for ((i=0; i /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +launch_baseline() { + BASELINE_BASE_CMD=" + ZE_AFFINITY_MASK=0 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ + --host ${BASELINE_HOST} \ + --port ${BASELINE_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + -tp 1 \ + --block-size ${BLOCK_SIZE} \ + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ + --dtype float16 \ + --enforce-eager" + echo ${BASELINE_BASE_CMD} + bash -c "${BASELINE_BASE_CMD}" & + sleep 10 + wait_for_server ${BASELINE_HOST} ${BASELINE_PORT} +} + +launch_pd() { + PREFILL_BASE_CMD=" + ZE_AFFINITY_MASK=${PREFILLER_ZE_AFFINITY_MASK} \ + VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ + VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ + VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ + --host ${PREFILL_HOST} \ + --port ${PREFILL_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + --block-size ${BLOCK_SIZE} \ + --enforce-eager \ + --dtype float16 \ + -tp ${PREFILLER_TP_SIZE} \ + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}'" + + + DECODE_BASE_CMD=" + ZE_AFFINITY_MASK=${DECODER_ZE_AFFINITY_MASK} \ + VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + 
VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ + --host ${DECODE_HOST} \ + --port ${DECODE_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + --block-size ${BLOCK_SIZE} \ + --enforce-eager \ + -tp ${DECODER_TP_SIZE} \ + --dtype float16 \ + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}'" + + echo ${PREFILL_BASE_CMD} + echo ${DECODE_BASE_CMD} + sleep 2 + + # execute on hosts + bash -c "${PREFILL_BASE_CMD}" & + bash -c "${DECODE_BASE_CMD}" & + sleep 1 + wait_for_server ${PREFILL_HOST} ${PREFILL_PORT} + sleep 1 + wait_for_server ${DECODE_HOST} ${DECODE_PORT} + sleep 1 +} + +launch_pd_proxy(){ + PROXY_BASE_CMD=" + python3 ${EXP_ROOT}/toy_proxy_server.py \ + --prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \ + --decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \ + --host=${PROXY_HOST} --port ${PROXY_PORT}" + echo ${PROXY_BASE_CMD} + bash -c "${PROXY_BASE_CMD}" & + sleep 2 +} + +run_tests(){ + local service_url=$1 + local mode=$2 + python3 ${EXP_ROOT}/test_disagg_accuracy.py --service_url=${service_url} --model_name=${MODEL_NAME} --mode=${mode} --file_name=${OUTPUT_FILE} +} + + +# run non-disagg. baseline & save outputs +launch_baseline +run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline" +cleanup +sleep 10 + + +# run disagg. 
& do exact-match with the outputs from baseline +launch_pd +launch_pd_proxy +run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg" +echo "-----P/D success----" + +rm ${OUTPUT_FILE} +cleanup + +exit 0 diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py index 7aa824609b7e..2ee224013131 100644 --- a/tests/v1/kv_connector/unit/test_moriio_connector.py +++ b/tests/v1/kv_connector/unit/test_moriio_connector.py @@ -206,7 +206,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=True, ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index d267299815a6..f03d7c479eb2 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -118,7 +118,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype=cache_dtype, enable_prefix_caching=True, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index a2c1466ca61a..c8a6c1301444 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -96,7 +96,6 @@ def get_vllm_config(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) parallel_config = ParallelConfig() @@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) parallel_config = ParallelConfig() @@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) parallel_config = ParallelConfig() diff --git 
a/tools/pre_commit/generate_attention_backend_docs.py b/tools/pre_commit/generate_attention_backend_docs.py index 628656f0df1a..3ec2248a82a4 100644 --- a/tools/pre_commit/generate_attention_backend_docs.py +++ b/tools/pre_commit/generate_attention_backend_docs.py @@ -49,6 +49,11 @@ # Backends to skip during doc generation SKIP_BACKENDS = {"CUSTOM", "TORCH_SDPA"} +BACKEND_KV_DTYPE_EXCLUDES: dict[str, set[str]] = { + # fp8 is an alias for fp8_ds_mla for FlashMLA Sparse + "FLASHMLA_SPARSE": {"fp8"}, +} + def is_relevant_file(filepath: str) -> bool: """Check if a file matches any of the relevant patterns.""" @@ -546,10 +551,19 @@ def analyze_backend(backend_name: str, class_path: str) -> dict[str, Any] | None tree, impl_class_name, "can_return_lse_for_decode", False, file_path ) + kv_cache_dtypes = parse_kv_cache_dtypes(class_node) + if backend_name in BACKEND_KV_DTYPE_EXCLUDES: + excluded = BACKEND_KV_DTYPE_EXCLUDES[backend_name] + kv_cache_dtypes = ", ".join( + d + for d in (d.strip() for d in kv_cache_dtypes.split(",")) + if d not in excluded + ) + return { "name": backend_name, "dtypes": parse_supported_dtypes(class_node), - "kv_cache_dtypes": parse_kv_cache_dtypes(class_node), + "kv_cache_dtypes": kv_cache_dtypes, "block_sizes": parse_block_sizes(class_node), "head_sizes": parse_head_sizes(class_node), "attn_types": parse_attention_types(class_node), diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 9d37a5331c96..6325d91a13fb 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -9,6 +9,7 @@ import os import pprint import time +from collections import defaultdict from collections.abc import Callable, Generator, Sequence from contextlib import contextmanager from copy import deepcopy @@ -405,6 +406,58 @@ class SplitItem: graph: fx.GraphModule +def _is_empty_allocation_node(node: fx.Node) -> bool: + if node.op == "call_method": + return node.target == "new_empty" + + if node.op != "call_function": + return False 
+ + target = node.target + if target in (torch.empty, torch.empty_like, torch.empty_strided): + return True + + if isinstance(target, torch._ops.OpOverloadPacket): + packet_name = target._qualified_op_name + elif isinstance(target, torch._ops.OpOverload): + packet_name = target.name() + else: + return False + + return packet_name.startswith("aten::empty") or packet_name.startswith( + "aten::new_empty" + ) + + +def _merge_empty_only_subgraphs( + node_to_subgraph_id: dict[fx.Node, int], +) -> None: + """ + Merge a partition that only contains an empty allocation op into the + previous partition. This avoids generating standalone empty submodules, + which can lead to empty cudagraph captures. + """ + + nodes_by_subgraph_id: dict[int, list[fx.Node]] = defaultdict(list) + subgraph_id_order: list[int] = [] + for node, subgraph_id in node_to_subgraph_id.items(): + if subgraph_id not in nodes_by_subgraph_id: + subgraph_id_order.append(subgraph_id) + nodes_by_subgraph_id[subgraph_id].append(node) + + prev_subgraph_id: int | None = None + for subgraph_id in subgraph_id_order: + nodes = nodes_by_subgraph_id[subgraph_id] + if ( + len(nodes) == 1 + and _is_empty_allocation_node(nodes[0]) + and prev_subgraph_id is not None + ): + node_to_subgraph_id[nodes[0]] = prev_subgraph_id + continue + prev_subgraph_id = subgraph_id + + def split_graph( graph: fx.GraphModule, splitting_ops: list[str] ) -> tuple[fx.GraphModule, list[SplitItem]]: @@ -443,6 +496,8 @@ def split_graph( else: node_to_subgraph_id[node] = subgraph_id + _merge_empty_only_subgraphs(node_to_subgraph_id) + # `keep_original_order` is important! # otherwise pytorch might reorder the nodes and # the semantics of the graph will change when we @@ -907,6 +962,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any: # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. 
disable_cache = not is_compile_cache_enabled(self.inductor_config) + # TODO(patchy): ngram gpu kernel will cause vllm torch compile cache errors. + is_ngram_gpu_enabled = ( + vllm_config.speculative_config is not None + and vllm_config.speculative_config.use_ngram_gpu() + ) + disable_cache = disable_cache or is_ngram_gpu_enabled + if disable_cache: logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") else: diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 3eda948b693f..70fbaabb4aac 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -189,13 +189,13 @@ def __init__( self.shape_env = None self.vllm_backend = vllm_backend self.sym_tensor_indices = sym_tensor_indices + self._fake_mode: Any | None = None import torch._functorch.config as functorch_config self.aot_autograd_config = ( aot_autograd_config or functorch_config.save_config_portable() ) - sym_input = next( (i for i in self.example_inputs if isinstance(i, torch.SymInt)), None ) @@ -217,6 +217,7 @@ def serialize_compile_artifacts( state.pop("optimized_call") state.pop("shape_env") state.pop("vllm_backend", None) + state.pop("_fake_mode", None) for node in state["graph_module"].graph.nodes: node.meta.pop("source_fn_stack", None) node.meta.pop("nn_module_stack", None) @@ -351,8 +352,31 @@ def optimized_call(*example_inputs: Any) -> Any: return fn.optimized_call(*example_inputs) fn = cls(**state, optimized_call=optimized_call) + fn._fake_mode = fake_mode return fn + def finalize_loading(self, vllm_config: VllmConfig) -> None: + """Eagerly initialize the compiled backend and perform all loading. + + Must be called after _verify_source_unchanged has populated + compilation_config.traced_files, which is needed for cache dir + computation. 
+ """ + if self._fake_mode is None: + return # Already finalized, or mega path (no _fake_mode set) + + from torch._guards import TracingContext, tracing + + from vllm.compilation.backends import VllmBackend + + vllm_backend = VllmBackend(vllm_config, self.prefix, self.is_encoder) + with tracing(TracingContext(self._fake_mode)): + result = vllm_backend(self.graph_module, list(self.example_inputs)) + self.optimized_call = result.optimized_call + self.vllm_backend = vllm_backend + + self._fake_mode = None + @property def co_name(self) -> Literal["VllmSerializableFunction"]: """ diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index 41db70155e38..13e88448c0f1 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses +import weakref from collections import Counter from collections.abc import Callable from contextlib import ExitStack -from typing import Any +from typing import Any, ClassVar from unittest.mock import patch import torch @@ -162,6 +163,14 @@ class CUDAGraphWrapper: guaranteed when VLLM_LOGGING_LEVEL == "DEBUG". """ + _all_instances: ClassVar[weakref.WeakSet["CUDAGraphWrapper"]] = weakref.WeakSet() + + @classmethod + def clear_all_graphs(cls) -> None: + """Clear captured graphs from all CUDAGraphWrapper instances.""" + for instance in list(cls._all_instances): + instance.clear_graphs() + def __init__( self, runnable: Callable[..., Any], @@ -192,6 +201,8 @@ def __init__( # cudagraphs for. self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry] = {} + CUDAGraphWrapper._all_instances.add(self) + def __getattr__(self, key: str) -> Any: # allow accessing the attributes of the runnable. if hasattr(self.runnable, key): @@ -205,6 +216,13 @@ def unwrap(self) -> Callable[..., Any]: # in case we need to access the original runnable. 
return self.runnable + @property + def cudagraph_wrapper(self) -> "CUDAGraphWrapper": + return self + + def clear_graphs(self) -> None: + self.concrete_cudagraph_entries.clear() + def __call__(self, *args: Any, **kwargs: Any) -> Any | None: forward_context = get_forward_context() batch_descriptor = forward_context.batch_descriptor diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe0984baf97c..f8629be34b53 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -30,7 +30,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.torch_utils import is_torch_equal_or_newer -from .monitor import start_monitoring_torch_compile +from .monitor import monitor_profiling_run, monitor_torch_compile if TYPE_CHECKING: # Only added on nightly/2.10 so wrap @@ -434,17 +434,24 @@ def __call__(self: type[_T], *args: Any, **kwargs: Any) -> Any: cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}") aot_compilation_path = os.path.join(cache_dir, "model") try: - with ( - set_current_vllm_config(self.vllm_config), - open(aot_compilation_path, "rb") as f, - ): - start_monitoring_torch_compile(self.vllm_config) - loaded_fn = torch.compiler.load_compiled_function( - f, f_globals=self.forward.__globals__ - ) - _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config) - if not self.compilation_config.dynamic_shapes_config.evaluate_guards: - loaded_fn.disable_guard_check() + with monitor_torch_compile(self.vllm_config): + with ( + set_current_vllm_config(self.vllm_config), + open(aot_compilation_path, "rb") as f, + ): + loaded_fn = torch.compiler.load_compiled_function( + f, f_globals=self.forward.__globals__ + ) + _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config) + ds_config = self.compilation_config.dynamic_shapes_config + if not ds_config.evaluate_guards: + loaded_fn.disable_guard_check() + # Eagerly load compiled artifacts now that traced_files + # is populated by 
_verify_source_unchanged. + with maybe_use_cudagraph_partition_wrapper(self.vllm_config): + loaded_fn._artifacts.compiled_fn.finalize_loading( + self.vllm_config + ) self.aot_compiled_fn = loaded_fn self.was_aot_compile_fn_loaded_from_disk = True except Exception as e: @@ -465,12 +472,11 @@ def __call__(self: type[_T], *args: Any, **kwargs: Any) -> Any: logger.info( "Directly load AOT compilation from path %s", aot_compilation_path ) - # Apply partition wrapper context for proper CUDA graph capture - from .monitor import end_monitoring_torch_compile - - with maybe_use_cudagraph_partition_wrapper(self.vllm_config): + with ( + monitor_profiling_run(), + maybe_use_cudagraph_partition_wrapper(self.vllm_config), + ): output = self.aot_compiled_fn(self, *args, **kwargs) - end_monitoring_torch_compile(self.vllm_config) return output if self.compiled: @@ -489,8 +495,6 @@ def __call__(self: type[_T], *args: Any, **kwargs: Any) -> Any: **kwargs, ) - # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config) original_code_object = self.original_code_object() logger.debug("Start compiling function %s", original_code_object) @@ -559,16 +563,26 @@ def patched_inline_call(self_: Any) -> Any: # store the path for saving after warmup self._aot_compilation_path = aot_compilation_path self._aot_cache_dir = cache_dir - self.aot_compiled_fn = self.aot_compile(*args, **kwargs) - # All compilation is done at this point, save the AOT artifact. - self.save_aot_compiled_function() - output = self.aot_compiled_fn(self, *args, **kwargs) - else: - output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) # type: ignore[arg-type] + with monitor_torch_compile(self.vllm_config): + self.aot_compiled_fn = self.aot_compile(*args, **kwargs) + # All compilation is done at this point, save the + # AOT artifact. 
+ self.save_aot_compiled_function() - from .monitor import end_monitoring_torch_compile + with monitor_profiling_run(): + output = self.aot_compiled_fn(self, *args, **kwargs) + else: + with monitor_torch_compile( + self.vllm_config, + "torch.compile and initial profiling/warmup " + "run together took %.2f s in total", + ): + output = TorchCompileWithNoGuardsWrapper.__call__( + self, # type: ignore[arg-type] + *args, + **kwargs, + ) - end_monitoring_torch_compile(self.vllm_config) self.compiled = True return output diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index fb9dfa3ac127..f584f526f08f 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,46 +1,83 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import time +from collections.abc import Generator -from vllm.config import CompilationConfig, CompilationMode, VllmConfig +from vllm.config import CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) -context_manager = None +# Shared global so backends.py can read the start time for Dynamo timing. torch_compile_start_time: float = 0.0 -def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: +@contextlib.contextmanager +def monitor_torch_compile( + vllm_config: VllmConfig, + message: str = "torch.compile took %.2f s in total", +) -> Generator[None, None, None]: + """Context manager that times torch.compile and manages depyf debugging. + + On normal exit: logs the compile time and exits depyf. + On exception: cleans up depyf without logging (compilation failed). 
+ """ global torch_compile_start_time torch_compile_start_time = time.perf_counter() - compilation_config: CompilationConfig = vllm_config.compilation_config + compilation_config = vllm_config.compilation_config + depyf_cm = None path = vllm_config.compile_debug_dump_path() if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) logger.debug("Dumping depyf output to %s", path) - global context_manager - context_manager = depyf.prepare_debug(path.as_posix()) - context_manager.__enter__() - - -def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: - compilation_config: CompilationConfig = vllm_config.compilation_config - total_compile_time: float = time.perf_counter() - torch_compile_start_time - if compilation_config.mode == CompilationMode.VLLM_COMPILE: - logger.info_once( - "torch.compile and initial profiling run took %.2f s in total", - total_compile_time, - scope="local", - ) - global context_manager - if context_manager is not None: - context_manager.__exit__(None, None, None) - context_manager = None + depyf_cm = depyf.prepare_debug(path.as_posix()) + depyf_cm.__enter__() + + try: + yield + except Exception: + raise + else: + total_compile_time = time.perf_counter() - torch_compile_start_time + if compilation_config.mode == CompilationMode.VLLM_COMPILE: + logger.info_once(message, total_compile_time, scope="local") + finally: + if depyf_cm is not None: + try: + depyf_cm.__exit__(None, None, None) + except Exception: + logger.warning("Exception during depyf cleanup.", exc_info=True) + + +@contextlib.contextmanager +def monitor_profiling_run() -> Generator[None, None, None]: + """Context manager that times the initial profiling run. + + Asserts that no backend compilation occurs during the profiling run + (all compilation should have completed before this point). 
+ """ + from vllm.compilation.counter import compilation_counter + + backend_compilations_before = compilation_counter.num_backend_compilations + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + assert ( + compilation_counter.num_backend_compilations == backend_compilations_before + ), ( + "backend compilation occurred during the initial profiling run; " + "all compilation should be complete before the profiling run starts." + ) + logger.info_once( + "Initial profiling/warmup run took %.2f s", + elapsed, + scope="local", + ) cudagraph_capturing_enabled: bool = True diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 8a94141c91b6..71603d8c883e 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -1,21 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math from dataclasses import field -from typing import TYPE_CHECKING, Any, Literal +from typing import Literal from pydantic import Field, SkipValidation, field_validator from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import format_gib, get_cpu_memory - -if TYPE_CHECKING: - from vllm.config.parallel import ParallelConfig -else: - ParallelConfig = Any logger = init_logger(__name__) @@ -53,8 +45,6 @@ class CacheConfig: not matter if you have another vLLM instance running on the same GPU. For example, if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.""" - swap_space: float = Field(default=4, ge=0) - """Size of the CPU swap space per GPU (in GiB).""" cache_dtype: CacheDType = "auto" """Data type for kv cache storage. If "auto", will use model data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
ROCm (AMD GPU) supports @@ -173,7 +163,6 @@ def compute_hash(self) -> str: ignored_factors = { # Runtime/derived knobs that don't affect compiled graph shape "gpu_memory_utilization", - "swap_space", "is_attention_free", "num_gpu_blocks_override", "enable_prefix_caching", @@ -208,24 +197,3 @@ def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType: "scaling factor." ) return cache_dtype - - def verify_with_parallel_config( - self, - parallel_config: ParallelConfig, - ) -> None: - swap_space_bytes = math.ceil(self.swap_space * GiB_bytes) - total_cpu_memory = get_cpu_memory() - # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel - # group are in the same node. However, the GPUs may span multiple nodes. - num_gpus_per_node = parallel_config.tensor_parallel_size - cpu_memory_usage = swap_space_bytes * num_gpus_per_node - - msg = ( - f"{format_gib(cpu_memory_usage)} GiB out of the " - f"{format_gib(total_cpu_memory)} GiB total CPU memory " - "is allocated for the swap space." - ) - if cpu_memory_usage > 0.7 * total_cpu_memory: - raise ValueError("Too large swap space. " + msg) - elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. 
%s", msg) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index a950ba531ad2..27b5188eb52d 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -47,6 +47,7 @@ "step3p5_mtp", ] EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes] +NgramGPUTypes = Literal["ngram_gpu"] SpeculativeMethod = Literal[ "ngram", "medusa", @@ -54,6 +55,7 @@ "draft_model", "suffix", EagleModelTypes, + NgramGPUTypes, ] @@ -364,6 +366,8 @@ def __post_init__(self): self.quantization = self.target_model_config.quantization elif self.method in ("ngram", "[ngram]"): self.model = "ngram" + elif self.method == "ngram_gpu": + self.model = "ngram_gpu" elif self.method == "suffix": self.model = "suffix" elif self.method == "extract_hidden_states": @@ -374,8 +378,9 @@ def __post_init__(self): ) if self.method in ("ngram", "[ngram]"): - # Unified to "ngram" internally self.method = "ngram" + + if self.method in ("ngram", "ngram_gpu"): # Set default values if not provided if self.prompt_lookup_min is None and self.prompt_lookup_max is None: # TODO(woosuk): Tune these values. They are arbitrarily chosen. 
@@ -832,6 +837,9 @@ def uses_draft_model(self) -> bool: def uses_extract_hidden_states(self) -> bool: return self.method == "extract_hidden_states" + def use_ngram_gpu(self) -> bool: + return self.method == "ngram_gpu" + def __repr__(self) -> str: method = self.method model = ( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 34c668362d40..16f2c375d5fd 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -41,7 +41,7 @@ from .parallel import ParallelConfig from .profiler import ProfilerConfig from .scheduler import SchedulerConfig -from .speculative import EagleModelTypes, SpeculativeConfig +from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig from .structured_outputs import StructuredOutputsConfig from .utils import SupportsHash, config, replace from .weight_transfer import WeightTransferConfig @@ -674,8 +674,6 @@ def __post_init__(self): self.parallel_config.is_moe_model = self.model_config.is_moe - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config is not None: self.lora_config.verify_with_model_config(self.model_config) @@ -698,11 +696,13 @@ def __post_init__(self): if self.speculative_config is not None: if ( self.speculative_config.method not in get_args(EagleModelTypes) + and self.speculative_config.method not in get_args(NgramGPUTypes) and self.speculative_config.method != "draft_model" ): raise ValueError( "Currently, async scheduling is only supported " - "with EAGLE/MTP/Draft Model kind of speculative decoding." 
+ "with EAGLE/MTP/Draft Model/NGram GPU kind of " + "speculative decoding" ) if self.speculative_config.disable_padded_drafter_batch: raise ValueError( @@ -720,6 +720,7 @@ def __post_init__(self): if ( self.speculative_config is not None and self.speculative_config.method not in get_args(EagleModelTypes) + and self.speculative_config.method not in get_args(NgramGPUTypes) ): logger.warning_once( "Async scheduling not supported with %s-based " diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index fc31836aa7e1..38dd980c62d6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -336,11 +336,21 @@ def GetRetrieveMetadata( start_token_idx = start * vllm_block_size end_token_idx = end * vllm_block_size token_ids = list(tracker.all_token_ids) + + # Compute how many tokens at the start of the retrieve range + # overlap with APC-shared blocks. The server must skip writing + # to these positions to avoid a cross-stream data race: the + # retrieve writes on the LMCache CUDA stream while concurrent + # requests may read these APC-shared blocks on the vLLM stream. + apc_overlap_blocks = tracker.num_vllm_hit_blocks - start + skip_first_n_tokens = apc_overlap_blocks * vllm_block_size + op = LoadStoreOp( token_ids=token_ids, block_ids=block_ids, start=start_token_idx, end=end_token_idx, + skip_first_n_tokens=skip_first_n_tokens, ) ret = LMCacheMPRequestMetadata( @@ -700,13 +710,22 @@ def update_state_after_alloc( num_external_tokens (int): the number of tokens that will be loaded from the external KV cache. """ - # NOTE: the `blocks` are NEW BLOCKS allocated for this request. + # NOTE: `blocks` comes from kv_cache_manager.get_blocks(request_id), + # which returns ALL blocks for the request (not just newly allocated). 
+ # This function may be called twice for async-load requests: + # 1st call: blocks = initial allocation (APC + fresh) + # 2nd call: blocks = all blocks + # (initial + newly allocated for remaining tokens) + # We must only append the NEW blocks beyond what's already tracked + # to avoid duplication, which would corrupt the store path's block indexing. tracker = self._get_request_tracker(request.request_id) block_ids = reformat_block_ids(blocks.get_block_ids()) - # No matter we need to retrieve or not, we need to update - # the block ids into the tracker - tracker.append_block_ids(block_ids) + # Only append blocks beyond what's already tracked + existing_count = len(tracker.allocated_block_ids) + new_block_ids = block_ids[existing_count:] + if new_block_ids: + tracker.append_block_ids(new_block_ids) # Update the state of the tracker condition = tracker.needs_retrieve() @@ -721,6 +740,34 @@ def update_state_after_alloc( # Clean up lookup future in scheduler adapter self.scheduler_adapter.cleanup_lookup_result(request.request_id) + # Free locks on chunks that vLLM already computed and won't + # retrieve from LMCache. + if tracker.num_lmcache_hit_blocks > 0: + if not condition: + # No retrieve needed — free ALL locked chunks + free_end = tracker.num_lmcache_hit_blocks * self.vllm_block_size + else: + # Note(Roy): Boundary misalignment between vLLM blocks and LMCache + # blocks is handled in free_lookup_locks. It makes sure that if + # the last vLLM computed block ends in the middle of a LMCache + # block, the end LMCache block is not freed (i.e., floor division) + # since it will still be needed by vLLM and such block's lock will + # be freed by vLLM's retrieve. 
+ free_end = tracker.num_vllm_hit_blocks * self.vllm_block_size + + if free_end > 0: + self.scheduler_adapter.free_lookup_locks( + token_ids=list(tracker.all_token_ids), + start=0, + end=free_end, + request_id=request.request_id, + ) + logger.debug( + "Free locks of tokens %d-%d since it is cached by vLLM.", + 0, + free_end, + ) + def build_connector_meta( self, scheduler_output: SchedulerOutput ) -> KVConnectorMetadata: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 09ffd5e121cc..dc1735a01788 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -447,7 +447,6 @@ class EngineArgs: ) disable_sliding_window: bool = ModelConfig.disable_sliding_window disable_cascade_attn: bool = ModelConfig.disable_cascade_attn - swap_space: float = CacheConfig.swap_space offload_backend: str = OffloadConfig.offload_backend cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params") @@ -961,7 +960,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: cache_group.add_argument( "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"] ) - cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"]) cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"]) cache_group.add_argument( "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] @@ -1526,7 +1524,6 @@ def create_engine_config( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, - swap_space=self.swap_space, cache_dtype=resolved_cache_dtype, # type: ignore[arg-type] is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py index f04a77d485c9..6afa2435365a 100644 --- a/vllm/entrypoints/cli/launch.py +++ b/vllm/entrypoints/cli/launch.py @@ 
-8,7 +8,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.cli.types import CLISubcommand from vllm.entrypoints.openai.api_server import ( - build_and_serve, + build_and_serve_renderer, setup_server, ) from vllm.entrypoints.openai.cli_args import ( @@ -109,19 +109,17 @@ def cmd_init() -> list[CLISubcommand]: async def run_launch_fastapi(args: argparse.Namespace) -> None: """Run the online serving layer with FastAPI (no GPU inference).""" from vllm.config import VllmConfig - from vllm.v1.engine.launch import LaunchEngineClient # 1. Socket binding listen_address, sock = setup_server(args) - # 2. Create LaunchEngineClient (no GPU) + # 2. Build and serve the API server engine_args = AsyncEngineArgs.from_cli_args(args) model_config = engine_args.create_model_config() vllm_config = VllmConfig(model_config=model_config) - engine_client = LaunchEngineClient.from_vllm_config(vllm_config) - - # 3. Build app, initialize state, and start serving - shutdown_task = await build_and_serve(engine_client, listen_address, sock, args) + shutdown_task = await build_and_serve_renderer( + vllm_config, listen_address, sock, args + ) try: await shutdown_task finally: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index eb1d4dbeb365..9c6d6ddcdf75 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -164,12 +164,6 @@ class LLM: compared with using gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization - swap_space: The size (GiB) of CPU memory per GPU to use as swap space. - This can be used for temporarily storing the states of the requests - when their `best_of` sampling parameters are larger than 1. If all - requests will have `best_of=1`, you can safely set this to 0. - Noting that `best_of` is only supported in V0. Otherwise, too small - values may cause out-of-memory (OOM) errors. cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights. 
This virtually increases the GPU memory space you can use to hold the model weights, at the cost of CPU-GPU data @@ -240,7 +234,6 @@ def __init__( chat_template: Path | str | None = None, seed: int = 0, gpu_memory_utilization: float = 0.9, - swap_space: float = 4, cpu_offload_gb: float = 0, offload_group_size: int = 0, offload_num_in_group: int = 1, @@ -265,6 +258,17 @@ def __init__( ) -> None: """LLM constructor.""" + if "swap_space" in kwargs: + kwargs.pop("swap_space") + import warnings + + warnings.warn( + "The 'swap_space' parameter is deprecated and ignored. " + "It will be removed in a future version.", + DeprecationWarning, + stacklevel=2, + ) + if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True @@ -353,7 +357,6 @@ def _make_config(value: Any, cls: type[_R]) -> _R: seed=seed, gpu_memory_utilization=gpu_memory_utilization, kv_cache_memory_bytes=kv_cache_memory_bytes, - swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, offload_group_size=offload_group_size, offload_num_in_group=offload_num_in_group, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ee0b7115dd3c..7961daf160b4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -22,6 +22,7 @@ from starlette.datastructures import State import vllm.envs as envs +from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template @@ -198,7 +199,7 @@ def build_app( register_sagemaker_api_router(app, supported_tasks) - if any(task in supported_tasks for task in ("generate", "render")): + if "generate" in supported_tasks: from vllm.entrypoints.openai.generate.api_router import ( register_generate_api_routers, ) @@ -223,6 +224,13 @@ def build_app( elastic_ep_attach_router(app) + if "generate" in supported_tasks or "render" in supported_tasks: + from 
vllm.entrypoints.serve.render.api_router import ( + attach_router as attach_render_router, + ) + + attach_render_router(app) + if "transcription" in supported_tasks: from vllm.entrypoints.openai.speech_to_text.api_router import ( attach_router as register_speech_to_text_api_router, @@ -363,7 +371,7 @@ async def init_app_state( trust_request_chat_template=args.trust_request_chat_template, ) - if any(task in supported_tasks for task in ("generate", "render")): + if "generate" in supported_tasks: from vllm.entrypoints.openai.generate.api_router import init_generate_state await init_generate_state( @@ -393,6 +401,64 @@ async def init_app_state( state.server_load_metrics = 0 +async def init_render_app_state( + vllm_config: VllmConfig, + state: State, + args: Namespace, +) -> None: + """Initialise FastAPI app state for a CPU-only render server. + + Unlike :func:`init_app_state` this function does not require an + :class:`~vllm.engine.protocol.EngineClient`; it bootstraps the + preprocessing pipeline (renderer, io_processor, input_processor) + directly from the :class:`~vllm.config.VllmConfig`. 
+ """ + from vllm.entrypoints.chat_utils import load_chat_template + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + from vllm.plugins.io_processors import get_io_processor + from vllm.renderers import renderer_from_config + + served_model_names = args.served_model_name or [args.model] + + if args.enable_log_requests: + request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None + + renderer = renderer_from_config(vllm_config) + io_processor = get_io_processor( + vllm_config, renderer, vllm_config.model_config.io_processor_plugin + ) + resolved_chat_template = load_chat_template(args.chat_template) + + state.openai_serving_render = OpenAIServingRender( + model_config=vllm_config.model_config, + renderer=renderer, + io_processor=io_processor, + served_model_names=served_model_names, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + default_chat_template_kwargs=args.default_chat_template_kwargs, + log_error_stack=args.log_error_stack, + ) + + # Expose models endpoint via the render handler. + state.openai_serving_models = state.openai_serving_render + + state.vllm_config = vllm_config + # Disable stats logging — there is no engine to poll. 
+ state.log_stats = False + state.engine_client = None + state.args = args + state.enable_server_load_tracking = False + state.server_load_metrics = 0 + + def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET if is_valid_ipv6_address(addr[0]): @@ -494,7 +560,6 @@ async def build_and_serve( supported_tasks = await engine_client.get_supported_tasks() logger.info("Supported tasks: %s", supported_tasks) - app = build_app(args, supported_tasks) await init_app_state(engine_client, app.state, args, supported_tasks) @@ -522,6 +587,51 @@ async def build_and_serve( ) +async def build_and_serve_renderer( + vllm_config: VllmConfig, + listen_address: str, + sock: socket.socket, + args: Namespace, + **uvicorn_kwargs, +) -> asyncio.Task: + """Build FastAPI app for a CPU-only render server, initialize state, and + start serving. + + Returns the shutdown task for the caller to await. + """ + + # Get uvicorn log config (from file or with endpoint filter) + log_config = get_uvicorn_log_config(args) + if log_config is not None: + uvicorn_kwargs["log_config"] = log_config + + app = build_app(args, ("render",)) + await init_render_app_state(vllm_config, app.state, args) + + logger.info("Starting vLLM server on %s", listen_address) + + return await serve_http( + app, + sock=sock, + enable_ssl_refresh=args.enable_ssl_refresh, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + # NOTE: When the 'disable_uvicorn_access_log' value is True, + # no access log will be output. 
+ access_log=not args.disable_uvicorn_access_log, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + ssl_ciphers=args.ssl_ciphers, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, + **uvicorn_kwargs, + ) + + async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py index 8f2c5c14f23c..f5569f5aba3e 100644 --- a/vllm/entrypoints/openai/chat_completion/api_router.py +++ b/vllm/entrypoints/openai/chat_completion/api_router.py @@ -71,34 +71,5 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post( - "/v1/chat/completions/render", - dependencies=[Depends(validate_json_request)], - response_model=list, - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse}, - }, -) -async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request): - """Render chat completion request and return conversation and engine - prompts without generating.""" - handler = chat(raw_request) - if handler is None: - base_server = raw_request.app.state.openai_serving_tokenization - return base_server.create_error_response( - message="The model does not support Chat Completions API" - ) - - result = await handler.render_chat_request(request) - - if isinstance(result, ErrorResponse): - return JSONResponse(content=result.model_dump(), status_code=result.error.code) - - return JSONResponse(content=result) - - 
def attach_router(app: FastAPI): app.include_router(router) diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py index 466c059aae94..56e961bef408 100644 --- a/vllm/entrypoints/openai/completion/api_router.py +++ b/vllm/entrypoints/openai/completion/api_router.py @@ -69,32 +69,5 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post( - "/v1/completions/render", - dependencies=[Depends(validate_json_request)], - response_model=list, - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -async def render_completion(request: CompletionRequest, raw_request: Request): - """render completion request and return engine prompts without generating.""" - handler = completion(raw_request) - if handler is None: - base_server = raw_request.app.state.openai_serving_tokenization - return base_server.create_error_response( - message="The model does not support Completions API" - ) - - result = await handler.render_completion_request(request) - - if isinstance(result, ErrorResponse): - return JSONResponse(content=result.model_dump(), status_code=result.error.code) - - return JSONResponse(content=result) - - def attach_router(app: FastAPI): app.include_router(router) diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index 5e4f184a0145..f07f42f0c07d 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -111,7 +111,7 @@ async def init_generate_state( enable_log_outputs=args.enable_log_outputs, enable_log_deltas=args.enable_log_deltas, ) - if any(task in supported_tasks for task in ("generate", "render")) + if "generate" in supported_tasks else 
None ) # Warm up chat template processing to avoid first-request latency @@ -126,7 +126,7 @@ async def init_generate_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, ) - if any(task in supported_tasks for task in ("generate", "render")) + if "generate" in supported_tasks else None ) state.anthropic_serving_messages = ( @@ -160,3 +160,26 @@ async def init_generate_state( if "generate" in supported_tasks else None ) + + # Render endpoints are always backed by OpenAIServingRender so that + # /v1/chat/completions/render and /v1/completions/render work on both + # generate-mode and render-only servers. + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + + state.openai_serving_render = OpenAIServingRender( + model_config=engine_client.model_config, + renderer=engine_client.renderer, + io_processor=engine_client.io_processor, + served_model_names=[ + mp.name for mp in state.openai_serving_models.base_model_paths + ], + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + default_chat_template_kwargs=args.default_chat_template_kwargs, + log_error_stack=args.log_error_stack, + ) diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py index 8b079ce31618..5c0b2d1855d9 100644 --- a/vllm/entrypoints/serve/instrumentator/health.py +++ b/vllm/entrypoints/serve/instrumentator/health.py @@ -22,8 +22,12 @@ def engine_client(request: Request) -> EngineClient: @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" + client = engine_client(raw_request) + 
if client is None: + # Render-only servers have no engine; they are always healthy. + return Response(status_code=200) try: - await engine_client(raw_request).check_health() + await client.check_health() return Response(status_code=200) except EngineDeadError: return Response(status_code=503) diff --git a/vllm/entrypoints/serve/render/__init__.py b/vllm/entrypoints/serve/render/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/entrypoints/serve/render/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/serve/render/api_router.py b/vllm/entrypoints/serve/render/api_router.py new file mode 100644 index 000000000000..a9f62e450ad7 --- /dev/null +++ b/vllm/entrypoints/serve/render/api_router.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, Request +from fastapi.responses import JSONResponse + +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.completion.protocol import CompletionRequest +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.serve.render.serving import OpenAIServingRender +from vllm.entrypoints.utils import create_error_response +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def render(request: Request) -> OpenAIServingRender | None: + return getattr(request.app.state, "openai_serving_render", None) + + +@router.post( + "/v1/chat/completions/render", + dependencies=[Depends(validate_json_request)], + response_model=list, + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": 
ErrorResponse}, + HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request): + handler = render(raw_request) + if handler is None: + error = create_error_response( + message="The model does not support Chat Completions Render API", + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND, + ) + return JSONResponse( + status_code=HTTPStatus.NOT_FOUND, content=error.model_dump() + ) + + result = await handler.render_chat_request(request) + + if isinstance(result, ErrorResponse): + return JSONResponse(content=result.model_dump(), status_code=result.error.code) + + return JSONResponse(content=result) + + +@router.post( + "/v1/completions/render", + dependencies=[Depends(validate_json_request)], + response_model=list, + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def render_completion(request: CompletionRequest, raw_request: Request): + handler = render(raw_request) + if handler is None: + error = create_error_response( + message="The model does not support Completions Render API", + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND, + ) + return JSONResponse( + status_code=HTTPStatus.NOT_FOUND, content=error.model_dump() + ) + + result = await handler.render_completion_request(request) + + if isinstance(result, ErrorResponse): + return JSONResponse(content=result.model_dump(), status_code=result.error.code) + + return JSONResponse(content=result) + + +def attach_router(app: FastAPI) -> None: + app.include_router(router) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py new file mode 100644 index 000000000000..c0e32be7ea5e --- /dev/null +++ b/vllm/entrypoints/serve/render/serving.py 
@@ -0,0 +1,475 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import sys +import traceback +from collections.abc import Callable, Sequence +from http import HTTPStatus +from typing import Any + +import jinja2 +from openai_harmony import Message as OpenAIMessage + +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import ( + ChatTemplateContentFormatOption, + ConversationMessage, +) +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.completion.protocol import CompletionRequest +from vllm.entrypoints.openai.engine.protocol import ( + ErrorInfo, + ErrorResponse, + ModelCard, + ModelList, + ModelPermission, +) +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_developer_message, + get_system_message, + parse_chat_inputs_to_harmony_messages, + render_for_completion, +) +from vllm.entrypoints.utils import sanitize_message +from vllm.inputs.data import ProcessorInputs, PromptType, SingletonPrompt, TokensPrompt +from vllm.logger import init_logger +from vllm.parser import ParserManager +from vllm.renderers import BaseRenderer, merge_kwargs +from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq +from vllm.tokenizers import TokenizerLike +from vllm.tool_parsers import ToolParser +from vllm.utils.mistral import is_mistral_tokenizer +from vllm.utils.mistral import mt as _mt + +logger = init_logger(__name__) + + +class OpenAIServingRender: + def __init__( + self, + model_config: ModelConfig, + renderer: BaseRenderer, + io_processor: Any, + served_model_names: list[str], + *, + request_logger: RequestLogger | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + trust_request_chat_template: bool = False, + enable_auto_tools: bool = False, + exclude_tools_when_tool_choice_none: bool = False, + 
tool_parser: str | None = None, + default_chat_template_kwargs: dict[str, Any] | None = None, + log_error_stack: bool = False, + ) -> None: + self.model_config = model_config + self.renderer = renderer + self.io_processor = io_processor + self.served_model_names = served_model_names + self.request_logger = request_logger + self.chat_template = chat_template + self.chat_template_content_format: ChatTemplateContentFormatOption = ( + chat_template_content_format + ) + self.trust_request_chat_template = trust_request_chat_template + self.enable_auto_tools = enable_auto_tools + self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none + self.tool_parser: Callable[[TokenizerLike], ToolParser] | None = ( + ParserManager.get_tool_parser( + tool_parser_name=tool_parser, + enable_auto_tools=enable_auto_tools, + model_name=model_config.model, + ) + ) + self.default_chat_template_kwargs: dict[str, Any] = ( + default_chat_template_kwargs or {} + ) + self.log_error_stack = log_error_stack + self.use_harmony = model_config.hf_config.model_type == "gpt_oss" + self.supports_browsing = False + self.supports_code_interpreter = False + + async def render_chat_request( + self, + request: ChatCompletionRequest, + ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: + """Copied from OpenAIServingChat.render_chat_request. + + Differences: engine_client.errored check removed (no engine client). 
+ """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) + return error_check_ret + + try: + tokenizer = self.renderer.tokenizer + + tool_parser = self.tool_parser + + if is_mistral_tokenizer(tokenizer): + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] + _mt.truncate_tool_call_ids(request) # type: ignore[arg-type] + _mt.validate_request_params(request) + + # Check if tool parsing is unavailable (common condition) + tool_parsing_unavailable = ( + tool_parser is None + and not is_mistral_tokenizer(tokenizer) + and not self.use_harmony + ) + + # Validate tool_choice when tool parsing is required but unavailable + if tool_parsing_unavailable and request.tool_choice not in ( + None, + "none", + ): + if request.tool_choice == "auto" and not self.enable_auto_tools: + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + '"auto" tool choice requires ' + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) + elif request.tool_choice != "auto": + # "required" or named tool requires tool parser + return self.create_error_response( + f'tool_choice="{request.tool_choice}" requires ' + "--tool-call-parser to be set" + ) + + if request.tools is None or ( + request.tool_choice == "none" + and self.exclude_tools_when_tool_choice_none + ): + tool_dicts = None + else: + tool_dicts = [tool.model_dump() for tool in request.tools] + + if not self.use_harmony: + # Common case. 
+ error_check_ret = self._validate_chat_template( + request_chat_template=request.chat_template, + chat_template_kwargs=request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, + ) + if error_check_ret is not None: + return error_check_ret + + conversation, engine_prompts = await self._preprocess_chat( + request, + request.messages, + default_template=self.chat_template, + default_template_content_format=self.chat_template_content_format, + default_template_kwargs=self.default_chat_template_kwargs, + tool_dicts=tool_dicts, + tool_parser=tool_parser, + ) + else: + # For GPT-OSS. + should_include_tools = tool_dicts is not None + conversation, engine_prompts = self._make_request_with_harmony( + request, should_include_tools + ) + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(e) + + return conversation, engine_prompts + + async def render_completion_request( + self, + request: CompletionRequest, + ) -> list[ProcessorInputs] | ErrorResponse: + """Copied from OpenAIServingCompletion.render_completion_request. + + Differences: engine_client.errored check removed (no engine client). + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # Return error for unsupported features. + if request.suffix is not None: + return self.create_error_response("suffix is not currently supported") + + if request.echo and request.prompt_embeds is not None: + return self.create_error_response("Echo is unsupported with prompt embeds.") + + if request.prompt_logprobs is not None and request.prompt_embeds is not None: + return self.create_error_response( + "prompt_logprobs is not compatible with prompt embeds." 
+ ) + + try: + engine_prompts = await self._preprocess_completion( + request, + prompt_input=request.prompt, + prompt_embeds=request.prompt_embeds, + ) + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(e) + + return engine_prompts + + def _make_request_with_harmony( + self, + request: ChatCompletionRequest, + should_include_tools: bool = True, + ): + """Copied from OpenAIServingChat._make_request_with_harmony.""" + messages: list[OpenAIMessage] = [] + + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] + + # Add system message. + # NOTE: In Chat Completion API, browsing is enabled by default + # if the model supports it. TODO: Support browsing. + assert not self.supports_browsing + assert not self.supports_code_interpreter + sys_msg = get_system_message( + reasoning_effort=request.reasoning_effort, + browser_description=None, + python_description=None, + with_custom_tools=should_include_tools, + ) + messages.append(sys_msg) + + # Add developer message. + if request.tools: + dev_msg = get_developer_message( + tools=request.tools if should_include_tools else None # type: ignore[arg-type] + ) + messages.append(dev_msg) + + # Add user message. + messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) + + # Render prompt token ids. 
+ prompt_token_ids = render_for_completion(messages) + engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) + + # Add cache_salt if provided in the request + if request.cache_salt is not None: + engine_prompt["cache_salt"] = request.cache_salt + + return messages, [engine_prompt] + + async def show_available_models(self) -> ModelList: + """Returns the models served by this render server.""" + max_model_len = self.model_config.max_model_len + return ModelList( + data=[ + ModelCard( + id=name, + max_model_len=max_model_len, + root=self.model_config.model, + permission=[ModelPermission()], + ) + for name in self.served_model_names + ] + ) + + def create_error_response( + self, + message: str | Exception, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, + ) -> ErrorResponse: + """Copied from OpenAIServing.create_error_response.""" + exc: Exception | None = None + + if isinstance(message, Exception): + exc = message + + from vllm.exceptions import VLLMValidationError + + if isinstance(exc, VLLMValidationError): + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = exc.parameter + elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)): + # Common validation errors from user input + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + elif isinstance(exc, NotImplementedError): + err_type = "NotImplementedError" + status_code = HTTPStatus.NOT_IMPLEMENTED + param = None + elif exc.__class__.__name__ == "TemplateError": + # jinja2.TemplateError (avoid importing jinja2) + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + else: + err_type = "InternalServerError" + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + param = None + + message = str(exc) + + if self.log_error_stack: + exc_type, _, _ = sys.exc_info() + if exc_type is not None: + traceback.print_exc() + else: + 
traceback.print_stack() + + return ErrorResponse( + error=ErrorInfo( + message=sanitize_message(message), + type=err_type, + code=status_code.value, + param=param, + ) + ) + + def _is_model_supported(self, model_name: str) -> bool: + """Simplified from OpenAIServing._is_model_supported (no LoRA support).""" + return model_name in self.served_model_names + + async def _check_model( + self, + request: Any, + ) -> ErrorResponse | None: + """Simplified from OpenAIServing._check_model (no LoRA support).""" + if self._is_model_supported(request.model): + return None + return self.create_error_response( + message=f"The model `{request.model}` does not exist.", + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND, + param="model", + ) + + def _validate_chat_template( + self, + request_chat_template: str | None, + chat_template_kwargs: dict[str, Any] | None, + trust_request_chat_template: bool, + ) -> ErrorResponse | None: + """Copied from OpenAIServing._validate_chat_template.""" + if not trust_request_chat_template and ( + request_chat_template is not None + or ( + chat_template_kwargs + and chat_template_kwargs.get("chat_template") is not None + ) + ): + return self.create_error_response( + "Chat template is passed with request, but " + "--trust-request-chat-template is not set. " + "Refused request with untrusted chat template." 
+ ) + return None + + async def _preprocess_completion( + self, + request: Any, + prompt_input: str | list[str] | list[int] | list[list[int]] | None, + prompt_embeds: bytes | list[bytes] | None, + ) -> list[ProcessorInputs]: + """Copied from OpenAIServing._preprocess_completion.""" + prompts = list[SingletonPrompt | bytes]() + if prompt_embeds is not None: # embeds take higher priority + prompts.extend(prompt_to_seq(prompt_embeds)) + if prompt_input is not None: + prompts.extend(prompt_to_seq(prompt_input)) + return await self._preprocess_cmpl(request, prompts) + + async def _preprocess_cmpl( + self, + request: Any, + prompts: Sequence[PromptType | bytes], + ) -> list[ProcessorInputs]: + """Copied from OpenAIServing._preprocess_cmpl.""" + renderer = self.renderer + model_config = self.model_config + + parsed_prompts = [ + ( + prompt + if isinstance(prompt, bytes) + else parse_model_prompt(model_config, prompt) + ) + for prompt in prompts + ] + tok_params = request.build_tok_params(model_config) + + return await renderer.render_cmpl_async( + parsed_prompts, + tok_params, + prompt_extras={ + k: v + for k in ("mm_processor_kwargs", "cache_salt") + if (v := getattr(request, k, None)) is not None + }, + ) + + async def _preprocess_chat( + self, + request: Any, + messages: list[Any], + default_template: str | None, + default_template_content_format: ChatTemplateContentFormatOption, + default_template_kwargs: dict[str, Any] | None, + tool_dicts: list[dict[str, Any]] | None = None, + tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, + ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]: + """Copied from OpenAIServing._preprocess_chat. + + Differences: isinstance check is ChatCompletionRequest-only + (ResponsesRequest not supported here); TODO comment dropped accordingly. 
+ """ + renderer = self.renderer + + default_template_kwargs = merge_kwargs( + default_template_kwargs, + dict( + tools=tool_dicts, + tokenize=is_mistral_tokenizer(renderer.tokenizer), + ), + ) + + tok_params = request.build_tok_params(self.model_config) + chat_params = request.build_chat_params( + default_template, default_template_content_format + ).with_defaults(default_template_kwargs) + + (conversation,), (engine_prompt,) = await renderer.render_chat_async( + [messages], + chat_params, + tok_params, + prompt_extras={ + k: v + for k in ("mm_processor_kwargs", "cache_salt") + if (v := getattr(request, k, None)) is not None + }, + ) + + # tool parsing is done only if a tool_parser has been set and if + # tool_choice is not "none" (if tool_choice is "none" but a tool_parser + # is set, we want to prevent parsing a tool_call hallucinated by the LLM + if tool_parser is not None: + tool_choice = getattr(request, "tool_choice", "none") + if tool_choice != "none": + if not isinstance(request, ChatCompletionRequest): + msg = ( + "Tool usage is only supported " + " for ChatCompletionRequest, but got " + f"{type(request).__name__}" + ) + raise NotImplementedError(msg) + tokenizer = renderer.get_tokenizer() + request = tool_parser(tokenizer).adjust_request(request=request) # type: ignore[arg-type] + + return conversation, [engine_prompt] diff --git a/vllm/envs.py b/vllm/envs.py index 66ddd7918768..716810da1c27 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -244,6 +244,7 @@ VLLM_CUDA_COMPATIBILITY_PATH: str | None = None VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False + VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False def get_default_cache_root(): @@ -1628,6 +1629,12 @@ def _get_or_set_default() -> str: "VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool( int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0")) ), + # If set to 1, enable CUDA graph memory estimation during memory profiling. 
+ # This profiles CUDA graph memory usage to provide more accurate KV cache + # memory allocation. Disabled by default to preserve existing behavior. + "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool( + int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0")) + ), } diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index b0e16fa5240d..97ae3ef1b9d7 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -331,11 +331,6 @@ def __init__( calculate_kv_scales = False self.quant_config = quant_config - # Initialize KV cache quantization attributes - self.kv_cache_dtype = kv_cache_dtype - self.calculate_kv_scales = calculate_kv_scales - _init_kv_cache_quant(self, quant_config, prefix) - dtype = torch.get_default_dtype() self.attn_backend = get_attn_backend( self.head_size, @@ -347,6 +342,36 @@ def __init__( num_heads=self.num_heads, ) + # FlashMLA Sparse Attention fp8 backend uses "fp8_ds_mla" kv-cache format + # Automatically convert fp8 kv-cache format to "fp8_ds_mla" + if ( + self.attn_backend.get_name() == "FLASHMLA_SPARSE" + and kv_cache_dtype.startswith("fp8") + and kv_cache_dtype != "fp8_ds_mla" + ): + assert cache_config is not None + cache_config.cache_dtype = "fp8_ds_mla" + kv_cache_dtype = "fp8_ds_mla" + logger.info_once( + "Using DeepSeek's fp8_ds_mla KV cache format. To use standard " + "fp8 kv-cache format, please set `--attention-backend " + "FLASHINFER_MLA_SPARSE`" + ) + + if ( + self.attn_backend.get_name() == "FLASHINFER_MLA_SPARSE" + and kv_cache_dtype.startswith("fp8") + ): + logger.info_once( + "Using standard fp8 KV cache format. 
To use DeepSeek's fp8_ds_mla " + "KV cache format, please set `--attention-backend FLASHMLA_SPARSE`" + ) + + # Initialize KV cache quantization attributes + self.kv_cache_dtype = kv_cache_dtype + self.calculate_kv_scales = calculate_kv_scales + _init_kv_cache_quant(self, quant_config, prefix) + if ( cache_config is not None and cache_config.enable_prefix_caching diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6200477092ab..92b0f0e0da9d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1204,17 +1204,26 @@ def weight_loader( # Determine per-tensor weight scale patterns based on variant # Use the dedicated method instead of brittle string matching uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + quant_method = getattr(param, "quant_method", None) # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) # weights scales. # Input scales are always per-tensor. # Weight scales: FP4 uses "weight_scale_2" and FP8 uses # "weight_scale" for per-tensor scales. + # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale + # tensors (quant_method=BLOCK), so those must not be treated + # as per-tensor scalars here. 
+ is_block_weight_scale = ( + "weight_scale" in weight_name + and quant_method == FusedMoeWeightScaleSupported.BLOCK.value + ) is_per_tensor = ( "weight_scale_2" in weight_name if uses_weight_scale_2 else "weight_scale" in weight_name ) or "input_scale" in weight_name + is_per_tensor = is_per_tensor and not is_block_weight_scale if is_per_tensor: self._load_per_tensor_weight_scale( shard_id=shard_id, diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py new file mode 100644 index 000000000000..49406ba935e2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum + +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig + +logger = init_logger(__name__) + + +class MxFp8MoeBackend(Enum): + FLASHINFER_TRTLLM = "FLASHINFER_TRTLLM" + + +def select_mxfp8_moe_backend( + config: FusedMoEConfig, +) -> MxFp8MoeBackend: + if config.is_lora_enabled: + raise NotImplementedError("LoRA is not supported for MXFP8 MoE.") + + AVAILABLE_BACKENDS = [ + MxFp8MoeBackend.FLASHINFER_TRTLLM, + ] + + runner_backend = config.moe_backend + if runner_backend != "auto": + mapping = { + "flashinfer_trtllm": MxFp8MoeBackend.FLASHINFER_TRTLLM, + } + if backend := mapping.get(runner_backend): + logger.info_once( + "Using '%s' MxFp8 MoE backend (user-requested).", + backend.value, + ) + return backend + raise ValueError( + f"moe_backend='{runner_backend}' is not supported for MXFP8 MoE. " + f"Expected one of {list(mapping.keys())}." + ) + + # Auto-select: only one backend available for now. 
+ backend = AVAILABLE_BACKENDS[0] + logger.info_once("Using '%s' MxFp8 MoE backend.", backend.value) + return backend diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index f167e2134470..977612313f63 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -9,17 +9,19 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger -from vllm.model_executor.kernels.linear import ( - init_fp8_linear_kernel, -) +from vllm.model_executor.kernels.linear import init_fp8_linear_kernel from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, + RoutingMethodType, +) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, ) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, - FusedMoEMethodBase, FusedMoeWeightScaleSupported, ) from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( @@ -28,6 +30,10 @@ make_fp8_moe_quant_config, select_fp8_moe_backend, ) +from vllm.model_executor.layers.fused_moe.oracle.mxfp8 import ( + MxFp8MoeBackend, + select_mxfp8_moe_backend, +) from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( convert_to_nvfp4_moe_kernel_format, is_global_sf_supported_for_nvfp4_backend, @@ -46,6 +52,9 @@ QuantizeMethodBase, ) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + swap_w13_to_w31, +) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( W8A8BlockFp8LinearOp, process_fp8_input_tensor_strategy_moe, @@ -60,6 +69,7 @@ MXFP8_VALUE_DTYPE, Mxfp8LinearBackend, Mxfp8LinearOp, + mxfp8_e4m3_quantize, 
class ModelOptMxFp8FusedMoE(FusedMoEMethodBase):
    """FlashInfer TRTLLM MXFP8 block-scale MoE for ModelOpt checkpoints.

    The method allocates FP8 expert weights together with their per-block
    scales, re-lays them out once after loading into the shuffled format the
    FlashInfer TRTLLM kernel is called with, and runs the forward pass through
    the monolithic ``flashinfer_trtllm_fp8_block_scale_moe`` entry point.
    """

    def __init__(
        self,
        quant_config: ModelOptMxFp8Config,
        moe_config: FusedMoEConfig,
    ) -> None:
        """Store the quantization config and pick the MXFP8 MoE backend.

        Args:
            quant_config: ModelOpt MXFP8 config; its
                ``is_checkpoint_mxfp8_serialized`` flag must be truthy.
            moe_config: MoE configuration, forwarded to the base class.
        """
        super().__init__(moe_config)
        self.quant_config = quant_config
        assert self.quant_config.is_checkpoint_mxfp8_serialized

        # Select MXFP8 MoE backend
        self.mxfp8_backend = select_mxfp8_moe_backend(self.moe)

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Allocate and register the expert weights and block scales on ``layer``.

        Registers ``w13_weight``, ``w2_weight``, ``w13_weight_scale`` and
        ``w2_weight_scale``. ``params_dtype`` is only recorded as
        ``layer.orig_dtype``; the tensors themselves always use
        ``MXFP8_VALUE_DTYPE`` / ``MXFP8_SCALE_DTYPE``.

        Args:
            layer: Module that receives the parameters and size attributes.
            num_experts: Size of the leading (expert) dimension.
            hidden_size: Model hidden size ``H``.
            intermediate_size_per_partition: Intermediate size ``I`` of this
                partition.
            params_dtype: Original parameter dtype, stored on the layer.
            **extra_weight_attrs: Only the optional ``"weight_loader"`` entry
                is read here.

        Raises:
            ValueError: If ``hidden_size`` or
                ``intermediate_size_per_partition`` is not a multiple of
                ``MXFP8_BLOCK_SIZE``.
        """
        layer.intermediate_size_per_partition = intermediate_size_per_partition
        layer.hidden_size = hidden_size
        layer.orig_dtype = params_dtype

        if hidden_size % MXFP8_BLOCK_SIZE != 0:
            raise ValueError(
                f"MXFP8 MoE requires hidden_size divisible by {MXFP8_BLOCK_SIZE}, "
                f"got {hidden_size}."
            )
        if intermediate_size_per_partition % MXFP8_BLOCK_SIZE != 0:
            raise ValueError(
                "MXFP8 MoE requires intermediate_size_per_partition divisible by "
                f"{MXFP8_BLOCK_SIZE}, got {intermediate_size_per_partition}."
            )

        layer.num_experts = num_experts
        weight_loader = extra_weight_attrs.get("weight_loader")
        # Gated (act-and-mul) MLPs fuse two projections into w13; otherwise
        # w13 holds a single projection.
        w13_num_shards = 2 if self.moe.is_act_and_mul else 1

        # GEMM 1 weights: [E, (2I or I), H]
        w13_weight = ModelWeightParameter(
            data=torch.empty(
                num_experts,
                w13_num_shards * intermediate_size_per_partition,
                hidden_size,
                dtype=MXFP8_VALUE_DTYPE,
            ),
            input_dim=2,
            output_dim=1,
            weight_loader=weight_loader,
        )
        layer.register_parameter("w13_weight", w13_weight)

        # GEMM 2 weights: [E, H, I]
        w2_weight = ModelWeightParameter(
            data=torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=MXFP8_VALUE_DTYPE,
            ),
            input_dim=2,
            output_dim=1,
            weight_loader=weight_loader,
        )
        layer.register_parameter("w2_weight", w2_weight)

        # Per-block (K=32) E8M0 scales.
        w13_weight_scale = ModelWeightParameter(
            data=torch.empty(
                num_experts,
                w13_num_shards * intermediate_size_per_partition,
                hidden_size // MXFP8_BLOCK_SIZE,
                dtype=MXFP8_SCALE_DTYPE,
            ),
            input_dim=2,
            output_dim=1,
            weight_loader=weight_loader,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)

        w2_weight_scale = ModelWeightParameter(
            data=torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition // MXFP8_BLOCK_SIZE,
                dtype=MXFP8_SCALE_DTYPE,
            ),
            input_dim=2,
            output_dim=1,
            weight_loader=weight_loader,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)

        # Ensure the generic MoE weight-loader treats these as block scales.
        set_weight_attrs(
            layer.w13_weight_scale,
            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value},
        )
        set_weight_attrs(
            layer.w2_weight_scale,
            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value},
        )

    @staticmethod
    def _check_weight_dtypes(layer: torch.nn.Module) -> None:
        """Validate weight and scale dtypes before processing.

        Raises:
            ValueError: If any of the four registered tensors does not have
                the dtype it was allocated with in ``create_weights``.
        """
        expected = {
            "w13_weight": MXFP8_VALUE_DTYPE,
            "w2_weight": MXFP8_VALUE_DTYPE,
            "w13_weight_scale": MXFP8_SCALE_DTYPE,
            "w2_weight_scale": MXFP8_SCALE_DTYPE,
        }
        for name, expected_dtype in expected.items():
            actual = getattr(layer, name).dtype
            if actual != expected_dtype:
                raise ValueError(
                    f"Expected {name} dtype {expected_dtype}, got {actual}."
                )

    def _shuffle_weights_for_trtllm(self, layer: torch.nn.Module) -> None:
        """Shuffle weights and scales into FlashInfer TRTLLM MXFP8 layout.

        Works expert by expert and then replaces the four parameters on
        ``layer`` with the stacked, contiguous results.
        """
        # Imported lazily so the module can be loaded without flashinfer.
        from flashinfer import (
            reorder_rows_for_gated_act_gemm,
            shuffle_matrix_a,
            shuffle_matrix_sf_a,
        )

        epilogue_tile_m = 128
        num_experts = layer.w13_weight.shape[0]
        is_gated = self.moe.is_act_and_mul
        intermediate_size_factor = 2 if is_gated else 1
        # Row count of every per-expert w13 matrix; invariant across experts.
        w13_rows = intermediate_size_factor * layer.intermediate_size_per_partition

        w13_weight = layer.w13_weight.data
        w13_scale = layer.w13_weight_scale.data
        if is_gated:
            # FI TRTLLM gated kernels use W31 ordering. Model checkpoints store
            # gated projection as W13, so convert once before shuffling.
            w13_weight = swap_w13_to_w31(w13_weight)
            w13_scale = swap_w13_to_w31(w13_scale)

        w13_weight_shuffled = []
        w2_weight_shuffled = []
        w13_scale_shuffled = []
        w2_scale_shuffled = []
        for i in range(num_experts):
            w13_i = w13_weight[i].reshape(w13_rows, -1)
            w13_sf_i = w13_scale[i].reshape(w13_rows, -1)
            if is_gated:
                # Reorder rows for gated activation layout expected by TRTLLM.
                w13_i = reorder_rows_for_gated_act_gemm(w13_i.clone())
                w13_sf_i = reorder_rows_for_gated_act_gemm(w13_sf_i.clone())

            # The shuffle helpers are fed raw bytes; results are viewed back
            # to the FP8 value dtype afterwards.
            w13_shuffled_i = shuffle_matrix_a(w13_i.view(torch.uint8), epilogue_tile_m)
            w2_shuffled_i = shuffle_matrix_a(
                layer.w2_weight.data[i].view(torch.uint8), epilogue_tile_m
            )
            w13_weight_shuffled.append(
                w13_shuffled_i.contiguous().view(MXFP8_VALUE_DTYPE)
            )
            w2_weight_shuffled.append(
                w2_shuffled_i.contiguous().view(MXFP8_VALUE_DTYPE)
            )

            # Scale factors go through the dedicated scale-factor shuffle.
            w13_sf_shuffled_i = shuffle_matrix_sf_a(
                w13_sf_i.view(torch.uint8).reshape(w13_rows, -1),
                epilogue_tile_m,
            )
            w2_sf_shuffled_i = shuffle_matrix_sf_a(
                layer.w2_weight_scale.data[i]
                .view(torch.uint8)
                .reshape(layer.hidden_size, -1),
                epilogue_tile_m,
            )
            w13_scale_shuffled.append(
                w13_sf_shuffled_i.contiguous().view(MXFP8_SCALE_DTYPE)
            )
            w2_scale_shuffled.append(
                w2_sf_shuffled_i.contiguous().view(MXFP8_SCALE_DTYPE)
            )

        replace_parameter(
            layer, "w13_weight", torch.stack(w13_weight_shuffled).contiguous()
        )
        replace_parameter(
            layer, "w2_weight", torch.stack(w2_weight_shuffled).contiguous()
        )
        replace_parameter(
            layer,
            "w13_weight_scale",
            torch.stack(w13_scale_shuffled).contiguous(),
        )
        replace_parameter(
            layer,
            "w2_weight_scale",
            torch.stack(w2_scale_shuffled).contiguous(),
        )

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Validate dtypes and shuffle the loaded weights, at most once per layer.

        A flag stored on ``layer`` makes repeated calls a no-op, so already
        shuffled weights are never shuffled a second time.
        """
        if getattr(layer, "_already_called_process_weights_after_loading", False):
            return

        self._check_weight_dtypes(layer)
        self._shuffle_weights_for_trtllm(layer)
        layer._already_called_process_weights_after_loading = True

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEPrepareAndFinalizeModular | None:
        """Unsupported for this method; always raises.

        Raises:
            ValueError: Unconditionally.
        """
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )

    def select_gemm_impl(
        self,
        prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular,
        layer: torch.nn.Module,
    ) -> mk.FusedMoEExpertsModular:
        """Unsupported for this method; always raises.

        Raises:
            ValueError: Unconditionally.
        """
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Return ``None``: no modular-kernel quant config is produced here."""
        # TRTLLM MXFP8 path is monolithic and does not use modular kernel config.
        return None

    @property
    def is_monolithic(self) -> bool:
        """Whether the selected backend is the FlashInfer TRTLLM one."""
        return self.mxfp8_backend == MxFp8MoeBackend.FLASHINFER_TRTLLM

    def apply_monolithic(
        self,
        layer: FusedMoE,
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run routing and both expert GEMMs through the FlashInfer TRTLLM kernel.

        ``x`` is quantized with ``mxfp8_e4m3_quantize`` and handed, together
        with the shuffled weights stored on ``layer``, to
        ``flashinfer_trtllm_fp8_block_scale_moe``.

        Args:
            layer: The fused MoE layer holding weights and routing settings.
            x: Input activations.
            router_logits: Router logits. They must already be float32 for
                DeepSeekV3 routing and are cast to bfloat16 otherwise.

        Returns:
            Whatever ``flashinfer_trtllm_fp8_block_scale_moe`` returns.

        Raises:
            NotImplementedError: If EPLB is enabled on the layer or the layer
                activation is not ``MoEActivation.SILU``.
        """
        from flashinfer.fused_moe.core import Fp8QuantizationType

        assert self.mxfp8_backend == MxFp8MoeBackend.FLASHINFER_TRTLLM

        if layer.enable_eplb:
            raise NotImplementedError(
                "EPLB is not supported for FlashInfer TRTLLM MXFP8 MoE backend."
            )

        # All validation happens up front, before any quantization work is
        # done. SILU is the only activation this path accepts.
        supported_activations = [MoEActivation.SILU]
        if layer.activation not in supported_activations:
            raise NotImplementedError(
                "FlashInfer TRTLLM MXFP8 MoE supports only "
                f"{supported_activations}, got {layer.activation}."
            )

        # DeepSeekV3 routing requires float32 logits; others expect bfloat16.
        if layer.routing_method_type == RoutingMethodType.DeepSeekV3:
            assert router_logits.dtype == torch.float32, (
                "DeepSeekV3 routing requires float32 router_logits, "
                f"got {router_logits.dtype}."
            )
        else:
            router_logits = router_logits.to(torch.bfloat16)

        # Treat 0 as "unset" for compatibility with ungrouped routing configs.
        n_group = layer.num_expert_group or None
        topk_group = layer.topk_group or None

        hidden_states_mxfp8, hidden_states_scale = mxfp8_e4m3_quantize(
            x,
            is_sf_swizzled_layout=False,
        )

        return flashinfer_trtllm_fp8_block_scale_moe(
            routing_logits=router_logits,
            routing_bias=layer.e_score_correction_bias,
            hidden_states=hidden_states_mxfp8,
            hidden_states_scale=hidden_states_scale,
            gemm1_weights=layer.w13_weight,
            gemm1_weights_scale=layer.w13_weight_scale,
            gemm2_weights=layer.w2_weight,
            gemm2_weights_scale=layer.w2_weight_scale,
            num_experts=layer.global_num_experts,
            top_k=layer.top_k,
            # Keep Optional semantics: FlashInfer expects None for non-grouped
            # routing (e.g. Qwen3 Renormalize), not 0.
            n_group=n_group,
            topk_group=topk_group,
            intermediate_size=layer.intermediate_size_per_partition,
            # First global expert index owned by this expert-parallel rank.
            local_expert_offset=layer.ep_rank * layer.local_num_experts,
            local_num_experts=layer.local_num_experts,
            routed_scaling_factor=layer.routed_scaling_factor,
            routing_method_type=layer.routing_method_type,
            # Weights were shuffled in process_weights_after_loading.
            use_shuffled_weight=True,
            # NOTE(review): meaning of layout id 0 is defined by FlashInfer;
            # confirm against its WeightLayout documentation.
            weight_layout=0,
            fp8_quantization_type=Fp8QuantizationType.MxFp8,
        )

    def apply(
        self,
        layer: FusedMoE,
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Non-monolithic entry point; not implemented for MXFP8.

        Raises:
            NotImplementedError: Always, once the monolithic assertion passes.
        """
        assert not self.is_monolithic
        raise NotImplementedError(
            "Non-monolithic MXFP8 MoE path is not yet implemented."
        )
cache_config = vllm_config.cache_config - if cache_config.cache_dtype.startswith("fp8"): - cache_config.cache_dtype = "fp8_ds_mla" - logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2") if cache_config.cache_dtype == "bfloat16": cache_config.cache_dtype = "auto" logger.info("Using bfloat16 kv-cache for DeepSeekV3.2") diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 5dd883f222e5..8277e99fdc37 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -756,7 +756,7 @@ def _min_latency_fused_qkv_a_proj_fake( ) -class DeepSeekV2FusedQkvAProj(MergedColumnParallelLinear): +class DeepSeekV2FusedQkvAProjLinear(MergedColumnParallelLinear): def __init__( self, input_size: int, @@ -848,7 +848,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: - self.fused_qkv_a_proj = DeepSeekV2FusedQkvAProj( + self.fused_qkv_a_proj = DeepSeekV2FusedQkvAProjLinear( self.hidden_size, [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], quant_config=quant_config, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 9b9beadc099e..b32067557622 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -17,11 +17,11 @@ from typing import Annotated, Any, Literal, TypeAlias, TypeVar import einops +import numpy as np import numpy.typing as npt import regex as re import torch import torch.nn as nn -import torchvision.transforms as T from PIL import Image from transformers import BatchFeature, PretrainedConfig, TensorType @@ -214,7 +214,12 @@ class NanoNemotronVLVideoEmbeddingInputs(TensorSchema): def dynamic_preprocess( - image, *, image_size=512, max_num_tiles=12, use_thumbnail=True, idx=0 + image, + *, + image_size=512, + max_num_tiles=12, + use_thumbnail=True, + idx=0, ): orig_width, orig_height = 
image.size @@ -227,35 +232,44 @@ def dynamic_preprocess( image_size=image_size, use_thumbnail=False, ) - # resize the image - resized_img = image.resize((target_width, target_height)) - processed_images = [] - for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size, - ) - # split the image - split_img = resized_img.crop(box) - processed_images.append(split_img) - assert len(processed_images) == blocks - if use_thumbnail and len(processed_images) != 1: - thumbnail_img = image.resize((image_size, image_size)) - processed_images.append(thumbnail_img) - - processed_images = [ - img.convert("RGB") if img.mode != "RGB" else img for img in processed_images - ] - processed_images = [ - T.Resize((image_size, image_size), interpolation=T.InterpolationMode.BICUBIC)( - img + + image = np.asarray( + image.convert("RGB") if image.mode != "RGB" else image, dtype=np.uint8 + ) + + image = torch.from_numpy(image).unsqueeze(0) # (1, H, W, 3) + image = image.permute(0, 3, 1, 2) # (1, 3, H, W) + + resized_img = torch.nn.functional.interpolate( + image, + size=(target_height, target_width), + mode="bicubic", + align_corners=False, + antialias=True, + ) + B, C, H, W = resized_img.shape + hp, wp = H // image_size, W // image_size + patches = ( + resized_img.reshape(B, C, hp, image_size, wp, image_size) + .permute(0, 2, 4, 1, 3, 5) + .reshape(B * hp * wp, C, image_size, image_size) + / 255.0 + ) + + if use_thumbnail and patches.shape[0] > 1: + thumb = ( + torch.nn.functional.interpolate( + image, + size=(image_size, image_size), + mode="bicubic", + align_corners=False, + antialias=True, + ) + / 255.0 ) - for img in processed_images - ] - processed_images = [T.ToTensor()(img) for img in processed_images] - return processed_images + patches = torch.cat([patches, thumb], dim=0) + + return 
list(patches) def image_to_pixel_values( @@ -287,22 +301,21 @@ def video_to_pixel_values( ) -> torch.Tensor: assert max_num_tiles == 1, "Video modality always uses one tile" - # Convert each frame to a single resized tile tensor consistent - # with image path - frames_tensors: list[torch.Tensor] = [] - for frame in video: - pil_frame = dynamic_preprocess( - Image.fromarray(frame, mode="RGB"), - image_size=input_size, - max_num_tiles=max_num_tiles, - use_thumbnail=use_thumbnail, - idx=0, + # (num_frames, H, W, C) -> (num_frames, C, H, W) + video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2) + + if video_tensor.shape[2] != input_size or video_tensor.shape[3] != input_size: + video_tensor = torch.nn.functional.interpolate( + video_tensor, + size=(input_size, input_size), + mode="bicubic", + align_corners=False, + antialias=True, ) - # dynamic_preprocess returns tensors already; take the single tile - assert len(pil_frame) >= 1 - frames_tensors.append(pil_frame[-1]) - return torch.stack(frames_tensors) + video_tensor = video_tensor / 255.0 + + return video_tensor def input_conditioner(x, norm_mean, norm_std): @@ -346,12 +359,6 @@ def __init__( self._factor_max = factor_max self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1) self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1) - self._transform = T.Compose( - [ - T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), - T.ToTensor(), - ] - ) assert downsample_ratio < 1 reduction_factor = 1 / downsample_ratio assert reduction_factor == 2.0 @@ -441,15 +448,25 @@ class DynamicResolutionParams: patch_size: tuple[int, int] def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]: - resized_img = params.media.resize( - ( - params.patch_size[0] * self._patch_size, - params.patch_size[1] * self._patch_size, + target_size = ( + params.patch_size[1] * self._patch_size, + params.patch_size[0] * self._patch_size, + ) + image = np.asarray( + params.media.convert("RGB") if 
params.media.mode != "RGB" else params.media, + dtype=np.uint8, + ) + resized_img = ( + torch.nn.functional.interpolate( + torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2), + size=target_size, + mode="bicubic", + align_corners=False, + antialias=True, ) + / 255.0 ) - processed_images = [resized_img] - - return [self._transform(img) for img in processed_images] + return list(resized_img) def process_media( self, @@ -803,6 +820,7 @@ def _preprocess_image( image_repl = self.get_image_repl(feature_size, num_patches) parts[i] = parts[i].replace("", image_repl.full) text = ["".join(parts)] + return text, image_inputs def _make_batch_input(self, input_item: Any | list[Any] | None = None): @@ -922,14 +940,14 @@ def _preprocess_video( frames_indices_lst = [ metadata["frames_indices"] for metadata in video_metadata_lst ] - + video_num_patches = torch.tensor( + [len(item) for item in pixel_values_lst_video] + ) video_inputs = { "pixel_values_flat_video": input_conditioner( torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std ), - "video_num_patches": torch.tensor( - [len(item) for item in pixel_values_lst_video] - ), + "video_num_patches": video_num_patches, "frames_indices": frames_indices_lst, "frame_duration_ms": torch.tensor(frame_duration_ms_lst), } @@ -985,6 +1003,7 @@ def _preprocess_video( video_repl.full, skip_special_tokens=False ) text = [t.replace("