From 833a7b2fdb586cb7948c3ee7646fa22006d25051 Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Mon, 30 Mar 2026 17:07:50 -0700
Subject: [PATCH 1/6] test(completions): add E2E tests for /v1/completions gRPC
 endpoint

Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>
---
 e2e_test/completions/__init__.py   |   8 ++
 e2e_test/completions/test_basic.py | 219 +++++++++++++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100644 e2e_test/completions/__init__.py
 create mode 100644 e2e_test/completions/test_basic.py

diff --git a/e2e_test/completions/__init__.py b/e2e_test/completions/__init__.py
new file mode 100644
index 000000000..d4bd72635
--- /dev/null
+++ b/e2e_test/completions/__init__.py
@@ -0,0 +1,8 @@
+"""OpenAI Completions API E2E tests.
+
+Tests for the OpenAI Completions API endpoint (/v1/completions), including:
+- Basic non-streaming and streaming text completion
+- Stop sequences, echo, and suffix handling
+- Parallel sampling (n > 1)
+- Usage statistics validation
+"""
diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py
new file mode 100644
index 000000000..fef02e738
--- /dev/null
+++ b/e2e_test/completions/test_basic.py
@@ -0,0 +1,219 @@
+"""Basic tests for OpenAI Completions API (/v1/completions).
+
+Tests for non-streaming and streaming text completion, echo, suffix,
+stop sequences, and parallel sampling via the OpenAI SDK.
+"""
+
+from __future__ import annotations
+
+import logging
+
+import pytest
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.engine("sglang", "vllm")
+@pytest.mark.gpu(1)
+@pytest.mark.model("meta-llama/Llama-3.1-8B-Instruct")
+@pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True)
+class TestCompletionBasic:
+    """Tests for OpenAI-compatible /v1/completions API (non-streaming)."""
+
+    def test_non_streaming_basic(self, model, api_client):
+        """Test basic non-streaming text completion with response structure."""
+
+        response = api_client.completions.create(
+            model=model,
+            prompt="The capital of France is",
+            max_tokens=20,
+            temperature=0,
+        )
+
+        assert response.id is not None
+        assert response.object == "text_completion"
+        assert response.model is not None
+        assert response.created is not None
+        assert len(response.choices) == 1
+
+        choice = response.choices[0]
+        assert choice.index == 0
+        assert isinstance(choice.text, str)
+        assert len(choice.text) > 0
+        assert choice.finish_reason in ("stop", "length")
+
+        assert response.usage is not None
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens == (
+            response.usage.prompt_tokens + response.usage.completion_tokens
+        )
+
+    def test_non_streaming_max_tokens(self, model, api_client):
+        """Test that max_tokens limits output length."""
+
+        response = api_client.completions.create(
+            model=model,
+            prompt="Count from 1 to 100: 1, 2, 3,",
+            max_tokens=5,
+            temperature=0,
+        )
+
+        assert len(response.choices) == 1
+        assert response.choices[0].finish_reason == "length"
+        assert response.usage.completion_tokens <= 5
+
+    def test_non_streaming_stop_sequence(self, model, api_client):
+        """Test that stop sequences cause the model to stop generating."""
+
+        response = api_client.completions.create(
+            model=model,
+            prompt="Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10",
+            max_tokens=200,
+            temperature=0,
+            stop=[","],
+        )
+
+        assert response.choices[0].finish_reason == "stop"
+        assert "," not in response.choices[0].text
+
+    def test_non_streaming_echo(self, model, api_client):
+        """Test that echo=True prepends the prompt to the output."""
+
+        prompt = "The capital of France is"
+        response = api_client.completions.create(
+            model=model,
+            prompt=prompt,
+            max_tokens=20,
+            temperature=0,
+            echo=True,
+        )
+
+        assert response.choices[0].text.startswith(prompt)
+
+    def test_non_streaming_suffix(self, model, api_client):
+        """Test that suffix is appended to the output."""
+
+        suffix = " -- END"
+        response = api_client.completions.create(
+            model=model,
+            prompt="The capital of France is",
+            max_tokens=20,
+            temperature=0,
+            suffix=suffix,
+        )
+
+        assert response.choices[0].text.endswith(suffix)
+
+    @pytest.mark.parametrize("n", [1, 2])
+    def test_non_streaming_parallel_sampling(self, model, api_client, n):
+        """Test that n choices are returned, including parallel sampling (n > 1)."""
+
+        temperature = 0.7 if n > 1 else 0
+        response = api_client.completions.create(
+            model=model,
+            prompt="The meaning of life is",
+            max_tokens=30,
+            temperature=temperature,
+            n=n,
+        )
+
+        assert len(response.choices) == n
+        for i, choice in enumerate(response.choices):
+            assert choice.index == i
+            assert isinstance(choice.text, str)
+            assert len(choice.text) > 0
+
+    def test_non_streaming_usage(self, model, api_client):
+        """Test that usage statistics are returned correctly."""
+
+        response = api_client.completions.create(
+            model=model,
+            prompt="Hello",
+            max_tokens=10,
+            temperature=0,
+        )
+
+        assert response.usage is not None
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens == (
+            response.usage.prompt_tokens + response.usage.completion_tokens
+        )
+
+
+@pytest.mark.engine("sglang", "vllm")
+@pytest.mark.gpu(1)
+@pytest.mark.model("meta-llama/Llama-3.1-8B-Instruct")
+@pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True)
+class TestCompletionStreaming:
+    """Tests for streaming /v1/completions API."""
+
+    def test_streaming_basic(self, model, api_client):
+        """Test streaming completion returns chunks with text deltas."""
+
+        stream = api_client.completions.create(
+            model=model,
+            prompt="The capital of France is",
+            max_tokens=20,
+            temperature=0,
+            stream=True,
+        )
+
+        texts = []
+        finish_reasons = []
+        for chunk in stream:
+            assert chunk.object == "text_completion"
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.text:
+                    texts.append(choice.text)
+                if choice.finish_reason:
+                    finish_reasons.append(choice.finish_reason)
+
+        assert len(texts) > 0, "No text chunks received"
+        full_text = "".join(texts)
+        assert len(full_text) > 0
+        assert len(finish_reasons) == 1
+        assert finish_reasons[0] in ("stop", "length")
+
+    def test_streaming_stop_sequence(self, model, api_client):
+        """Test that stop sequences work in streaming mode."""
+
+        stream = api_client.completions.create(
+            model=model,
+            prompt="Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10",
+            max_tokens=200,
+            temperature=0,
+            stop=[","],
+            stream=True,
+        )
+
+        texts = []
+        finish_reasons = []
+        for chunk in stream:
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.text:
+                    texts.append(choice.text)
+                if choice.finish_reason:
+                    finish_reasons.append(choice.finish_reason)
+
+        assert "stop" in finish_reasons
+        full_text = "".join(texts)
+        assert "," not in full_text
+
+    def test_streaming_collects_full_text(self, model, api_client):
+        """Test that streaming deltas concatenate to a non-empty completion."""
+
+        stream = api_client.completions.create(
+            model=model,
+            prompt="The capital of France is",
+            max_tokens=20,
+            temperature=0,
+            stream=True,
+        )
+        full_text = "".join(c.choices[0].text for c in stream if c.choices and c.choices[0].text)
+
+        assert len(full_text) > 0
+        assert "Paris" in full_text

From 3eaa520cfedd01f18cf39336630f709cffeaa794 Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Wed, 1 Apr 2026 15:51:40 -0700
Subject: [PATCH 2/6] test(completions): add max_tokens=0 echo tests for
 streaming and non-streaming

Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>
---
 e2e_test/completions/test_basic.py | 43 ++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py
index fef02e738..371cb51a7 100644
--- a/e2e_test/completions/test_basic.py
+++ b/e2e_test/completions/test_basic.py
@@ -141,6 +141,21 @@ def test_non_streaming_usage(self, model, api_client):
             response.usage.prompt_tokens + response.usage.completion_tokens
         )
 
+    def test_non_streaming_echo_max_tokens_zero(self, model, api_client):
+        """Test that echo=True with max_tokens=0 returns just the prompt."""
+
+        prompt = "The capital of France is"
+        response = api_client.completions.create(
+            model=model,
+            prompt=prompt,
+            max_tokens=0,
+            temperature=0,
+            echo=True,
+        )
+
+        assert response.choices[0].text == prompt
+        assert response.usage.completion_tokens == 0
+
 
 @pytest.mark.engine("sglang", "vllm")
 @pytest.mark.gpu(1)
@@ -217,3 +232,31 @@ def test_streaming_collects_full_text(self, model, api_client):
 
         assert len(full_text) > 0
         assert "Paris" in full_text
+
+    def test_streaming_echo_max_tokens_zero(self, model, api_client):
+        """Test that echo=True with max_tokens=0 streams just the prompt."""
+
+        prompt = "The capital of France is"
+        stream = api_client.completions.create(
+            model=model,
+            prompt=prompt,
+            max_tokens=0,
+            temperature=0,
+            echo=True,
+            stream=True,
+        )
+
+        texts = []
+        finish_reasons = []
+        for chunk in stream:
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.text:
+                    texts.append(choice.text)
+                if choice.finish_reason:
+                    finish_reasons.append(choice.finish_reason)
+
+        full_text = "".join(texts)
+        assert full_text == prompt, f"Expected echoed prompt, got: {full_text!r}"
+        assert len(finish_reasons) == 1
+        assert finish_reasons[0] == "stop"

From 32877e8021245242fca971f21474fab01e38da1b Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Wed, 1 Apr 2026 16:41:06 -0700
Subject: [PATCH 3/6] fix(completions): address review nits on E2E tests

- Extract _collect_stream helper to deduplicate streaming loop
- Assert chunk.object == "text_completion" on every streaming chunk
- Strengthen streaming stop sequence assertions (exactly one finish_reason, non-empty text)
- Accept "stop" or "length" for max_tokens=0 finish_reason
- Add finish_reason assertion to non-streaming max_tokens=0 test

Refs: #1021
Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>
---
 e2e_test/completions/test_basic.py | 62 +++++++++++++-----------------
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py
index 371cb51a7..a4517d51e 100644
--- a/e2e_test/completions/test_basic.py
+++ b/e2e_test/completions/test_basic.py
@@ -154,6 +154,7 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client):
         )
 
         assert response.choices[0].text == prompt
+        assert response.choices[0].finish_reason in ("stop", "length")
         assert response.usage.completion_tokens == 0
 
 
@@ -164,6 +165,21 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client):
 class TestCompletionStreaming:
     """Tests for streaming /v1/completions API."""
 
+    @staticmethod
+    def _collect_stream(stream):
+        """Consume a streaming response, returning (full_text, finish_reasons)."""
+        texts = []
+        finish_reasons = []
+        for chunk in stream:
+            assert chunk.object == "text_completion"
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.text:
+                    texts.append(choice.text)
+                if choice.finish_reason:
+                    finish_reasons.append(choice.finish_reason)
+        return "".join(texts), finish_reasons
+
     def test_streaming_basic(self, model, api_client):
         """Test streaming completion returns chunks with text deltas."""
 
@@ -175,20 +191,9 @@ def test_streaming_basic(self, model, api_client):
             stream=True,
         )
 
-        texts = []
-        finish_reasons = []
-        for chunk in stream:
-            assert chunk.object == "text_completion"
-            if chunk.choices:
-                choice = chunk.choices[0]
-                if choice.text:
-                    texts.append(choice.text)
-                if choice.finish_reason:
-                    finish_reasons.append(choice.finish_reason)
+        full_text, finish_reasons = self._collect_stream(stream)
 
-        assert len(texts) > 0, "No text chunks received"
-        full_text = "".join(texts)
-        assert len(full_text) > 0
+        assert len(full_text) > 0, "No text chunks received"
         assert len(finish_reasons) == 1
         assert finish_reasons[0] in ("stop", "length")
 
@@ -204,18 +209,11 @@ def test_streaming_stop_sequence(self, model, api_client):
             stream=True,
         )
 
-        texts = []
-        finish_reasons = []
-        for chunk in stream:
-            if chunk.choices:
-                choice = chunk.choices[0]
-                if choice.text:
-                    texts.append(choice.text)
-                if choice.finish_reason:
-                    finish_reasons.append(choice.finish_reason)
+        full_text, finish_reasons = self._collect_stream(stream)
 
-        assert "stop" in finish_reasons
-        full_text = "".join(texts)
+        assert len(finish_reasons) == 1
+        assert finish_reasons[0] == "stop"
+        assert len(full_text) > 0, "No text chunks received"
         assert "," not in full_text
 
     def test_streaming_collects_full_text(self, model, api_client):
@@ -228,7 +226,8 @@ def test_streaming_collects_full_text(self, model, api_client):
             temperature=0,
             stream=True,
         )
-        full_text = "".join(c.choices[0].text for c in stream if c.choices and c.choices[0].text)
+
+        full_text, _ = self._collect_stream(stream)
 
         assert len(full_text) > 0
         assert "Paris" in full_text
@@ -246,17 +245,8 @@ def test_streaming_echo_max_tokens_zero(self, model, api_client):
             stream=True,
         )
 
-        texts = []
-        finish_reasons = []
-        for chunk in stream:
-            if chunk.choices:
-                choice = chunk.choices[0]
-                if choice.text:
-                    texts.append(choice.text)
-                if choice.finish_reason:
-                    finish_reasons.append(choice.finish_reason)
+        full_text, finish_reasons = self._collect_stream(stream)
 
-        full_text = "".join(texts)
         assert full_text == prompt, f"Expected echoed prompt, got: {full_text!r}"
         assert len(finish_reasons) == 1
-        assert finish_reasons[0] == "stop"
+        assert finish_reasons[0] in ("stop", "length")

From 830952d26d3e94e0aa1f53a61da2830698342887 Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Wed, 8 Apr 2026 09:26:57 -0700
Subject: [PATCH 4/6] ci(completions): add e2e-1gpu-completions job to PR
 pipeline

Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>
---
 .github/workflows/pr-test-rust.yml | 37 +++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml
index 67188d40d..dbd1fbe72 100644
--- a/.github/workflows/pr-test-rust.yml
+++ b/.github/workflows/pr-test-rust.yml
@@ -367,6 +367,7 @@ jobs:
     outputs:
       common: ${{ steps.filter.outputs.common }}
       chat-completions: ${{ steps.filter.outputs.chat-completions }}
+      completions: ${{ steps.filter.outputs.completions }}
       agentic: ${{ steps.filter.outputs.agentic }}
       embeddings: ${{ steps.filter.outputs.embeddings }}
       go-bindings: ${{ steps.filter.outputs.go-bindings }}
@@ -410,6 +411,10 @@ jobs:
               - 'e2e_test/messages/**'
               - 'scripts/ci_agentic_svc_deps.sh'
               - 'scripts/oracle_flyway/**'
+            completions:
+              - 'crates/grpc_client/**'
+              - 'grpc_servicer/**'
+              - 'e2e_test/completions/**'
             embeddings:
               - 'e2e_test/embeddings/**'
             go-bindings:
@@ -450,6 +455,35 @@ jobs:
       test_dirs: e2e_test/chat_completions
     secrets: inherit
 
+  e2e-1gpu-completions:
+    name: e2e-1gpu-completions (${{ matrix.engine }})
+    needs: [build-wheel, detect-changes]
+    if: >-
+      always()
+      && !cancelled()
+      && needs.build-wheel.result == 'success'
+      && github.actor != 'dependabot[bot]'
+      && (github.event_name != 'pull_request'
+          || (needs.detect-changes.result == 'success'
+              && (needs.detect-changes.outputs.common == 'true'
+                  || needs.detect-changes.outputs.completions == 'true')))
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - engine: sglang
+            timeout: 20
+          - engine: vllm
+            timeout: 20
+    uses: ./.github/workflows/e2e-gpu-job.yml
+    with:
+      engine: ${{ matrix.engine }}
+      gpu_tier: "1"
+      runner: 1-gpu-h100
+      timeout: ${{ matrix.timeout }}
+      test_dirs: e2e_test/completions
+    secrets: inherit
+
   e2e-1gpu-embeddings:
     name: e2e-1gpu-embeddings (${{ matrix.engine }})
     needs: [build-wheel, detect-changes]
@@ -913,7 +947,7 @@ jobs:
           path: benchmark_go_bindings/
 
   finish:
-    needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-2gpu-chat, e2e-2gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e]
+    needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-completions, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-2gpu-chat, e2e-2gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e]
     if: always()
     runs-on: k8s-runner-cpu
     permissions: {}
@@ -928,6 +962,7 @@ jobs:
                 "${{ needs.unit-tests.result }}" == "failure" || \
                 "${{ needs.benchmarks.result }}" == "failure" || \
                 "${{ needs.e2e-1gpu-chat.result }}" == "failure" || \
+                "${{ needs.e2e-1gpu-completions.result }}" == "failure" || \
                 "${{ needs.e2e-1gpu-embeddings.result }}" == "failure" || \
                 "${{ needs.e2e-1gpu-gateway.result }}" == "failure" || \
                 "${{ needs.e2e-2gpu-chat.result }}" == "failure" || \

From bf7c93a08bbaa18f47aa1a828f3ecda9480a99b8 Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Wed, 8 Apr 2026 09:27:32 -0700
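Subject: [PATCH 5/6] fix(completions): address review nits on E2E tests

- Remove the unused logging import and module-level logger
- Add a STOP_SEQUENCE_TRIMMED class flag so the stop sequence tests can
  assert either that the stop sequence is absent from the output or that
  the output ends with it
- Include the generated text in the stop sequence assertion messages
- Drop the "Paris" content assertion from
  test_streaming_collects_full_text

Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>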
---
 e2e_test/completions/test_basic.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py
index a4517d51e..1568c2fe9 100644
--- a/e2e_test/completions/test_basic.py
+++ b/e2e_test/completions/test_basic.py
@@ -6,12 +6,8 @@
 
 from __future__ import annotations
 
-import logging
-
 import pytest
 
-logger = logging.getLogger(__name__)
-
 
 @pytest.mark.engine("sglang", "vllm")
 @pytest.mark.gpu(1)
@@ -20,6 +16,8 @@
 class TestCompletionBasic:
     """Tests for OpenAI-compatible /v1/completions API (non-streaming)."""
 
+    STOP_SEQUENCE_TRIMMED = True
+
     def test_non_streaming_basic(self, model, api_client):
         """Test basic non-streaming text completion with response structure."""
 
@@ -75,7 +73,11 @@ def test_non_streaming_stop_sequence(self, model, api_client):
         )
 
         assert response.choices[0].finish_reason == "stop"
-        assert "," not in response.choices[0].text
+        text = response.choices[0].text
+        if self.STOP_SEQUENCE_TRIMMED:
+            assert "," not in text, f"Stop sequence ',' should not appear in output: {text}"
+        else:
+            assert text.endswith(","), f"Stop sequence ',' should be the suffix of output: {text}"
 
     def test_non_streaming_echo(self, model, api_client):
         """Test that echo=True prepends the prompt to the output."""
@@ -165,6 +167,8 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client):
 class TestCompletionStreaming:
     """Tests for streaming /v1/completions API."""
 
+    STOP_SEQUENCE_TRIMMED = True
+
     @staticmethod
     def _collect_stream(stream):
         """Consume a streaming response, returning (full_text, finish_reasons)."""
@@ -214,7 +218,14 @@ def test_streaming_stop_sequence(self, model, api_client):
         assert len(finish_reasons) == 1
         assert finish_reasons[0] == "stop"
         assert len(full_text) > 0, "No text chunks received"
-        assert "," not in full_text
+        if self.STOP_SEQUENCE_TRIMMED:
+            assert "," not in full_text, (
+                f"Stop sequence ',' should not appear in output: {full_text}"
+            )
+        else:
+            assert full_text.endswith(","), (
+                f"Stop sequence ',' should be the suffix of output: {full_text}"
+            )
 
     def test_streaming_collects_full_text(self, model, api_client):
         """Test that streaming deltas concatenate to a non-empty completion."""
@@ -230,7 +241,6 @@ def test_streaming_collects_full_text(self, model, api_client):
         full_text, _ = self._collect_stream(stream)
 
         assert len(full_text) > 0
-        assert "Paris" in full_text
 
     def test_streaming_echo_max_tokens_zero(self, model, api_client):
         """Test that echo=True with max_tokens=0 streams just the prompt."""

From 836154c8f38770bff92b4da69bf33857cc80ddfb Mon Sep 17 00:00:00 2001
From: VS Chandra Mourya <msrinivasa@together.ai>
Date: Wed, 8 Apr 2026 10:20:51 -0700
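Subject: [PATCH 6/6] fix(completions): stabilize E2E tests for CI (vllm
 max_tokens=0, flaky stop seq)

- Skip the streaming and non-streaming max_tokens=0 echo tests on vLLM,
  which rejects max_tokens=0
- Drop the non-empty-text assertion from test_streaming_stop_sequence;
  the remaining checks (a single "stop" finish_reason and the stop
  sequence placement) no longer depend on text being streamed before
  the stop sequence is hit

Signed-off-by: VS Chandra Mourya <msrinivasa@together.ai>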
---
 e2e_test/completions/test_basic.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py
index 1568c2fe9..e91d3f90f 100644
--- a/e2e_test/completions/test_basic.py
+++ b/e2e_test/completions/test_basic.py
@@ -143,6 +143,7 @@ def test_non_streaming_usage(self, model, api_client):
             response.usage.prompt_tokens + response.usage.completion_tokens
         )
 
+    @pytest.mark.skip_for_runtime("vllm", reason="vLLM rejects max_tokens=0")
     def test_non_streaming_echo_max_tokens_zero(self, model, api_client):
         """Test that echo=True with max_tokens=0 returns just the prompt."""
 
@@ -217,7 +218,6 @@ def test_streaming_stop_sequence(self, model, api_client):
 
         assert len(finish_reasons) == 1
         assert finish_reasons[0] == "stop"
-        assert len(full_text) > 0, "No text chunks received"
         if self.STOP_SEQUENCE_TRIMMED:
             assert "," not in full_text, (
                 f"Stop sequence ',' should not appear in output: {full_text}"
@@ -242,6 +242,7 @@ def test_streaming_collects_full_text(self, model, api_client):
 
         assert len(full_text) > 0
 
+    @pytest.mark.skip_for_runtime("vllm", reason="vLLM rejects max_tokens=0")
     def test_streaming_echo_max_tokens_zero(self, model, api_client):
         """Test that echo=True with max_tokens=0 streams just the prompt."""