From 833a7b2fdb586cb7948c3ee7646fa22006d25051 Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Mon, 30 Mar 2026 17:07:50 -0700 Subject: [PATCH 1/6] test(completions): add E2E tests for /v1/completions gRPC endpoint Signed-off-by: VS Chandra Mourya --- e2e_test/completions/__init__.py | 8 ++ e2e_test/completions/test_basic.py | 219 +++++++++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 e2e_test/completions/__init__.py create mode 100644 e2e_test/completions/test_basic.py diff --git a/e2e_test/completions/__init__.py b/e2e_test/completions/__init__.py new file mode 100644 index 000000000..d4bd72635 --- /dev/null +++ b/e2e_test/completions/__init__.py @@ -0,0 +1,8 @@ +"""OpenAI Completions API E2E tests. + +Tests for the OpenAI Completions API endpoints (/v1/completions) including: +- Basic non-streaming and streaming text completion +- Stop sequences, echo, and suffix handling +- Parallel sampling (n > 1) +- Usage statistics validation +""" diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py new file mode 100644 index 000000000..fef02e738 --- /dev/null +++ b/e2e_test/completions/test_basic.py @@ -0,0 +1,219 @@ +"""Basic tests for OpenAI Completions API (/v1/completions). + +Tests for non-streaming and streaming text completion, echo, suffix, +stop sequences, and parallel sampling via the OpenAI SDK. +""" + +from __future__ import annotations + +import logging + +import pytest + +logger = logging.getLogger(__name__) + + +@pytest.mark.engine("sglang", "vllm") +@pytest.mark.gpu(1) +@pytest.mark.model("meta-llama/Llama-3.1-8B-Instruct") +@pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True) +class TestCompletionBasic: + """Tests for OpenAI-compatible /v1/completions API (non-streaming).""" + + def test_non_streaming_basic(self, model, api_client): + """Test basic non-streaming text completion with response structure.""" + + response = api_client.completions.create( + model=model, + prompt="The capital of France is", + max_tokens=20, + temperature=0, + ) + + assert response.id is not None + assert response.object == "text_completion" + assert response.model is not None + assert response.created is not None + assert len(response.choices) == 1 + + choice = response.choices[0] + assert choice.index == 0 + assert isinstance(choice.text, str) + assert len(choice.text) > 0 + assert choice.finish_reason in ("stop", "length") + + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens == ( + response.usage.prompt_tokens + response.usage.completion_tokens + ) + + def test_non_streaming_max_tokens(self, model, api_client): + """Test that max_tokens limits output length.""" + + response = api_client.completions.create( + model=model, + prompt="Count from 1 to 100: 1, 2, 3,", + max_tokens=5, + temperature=0, + ) + + assert len(response.choices) == 1 + assert response.choices[0].finish_reason == "length" + assert response.usage.completion_tokens <= 5 + + def test_non_streaming_stop_sequence(self, model, api_client): + """Test that stop sequences cause the model to stop generating.""" + + response = api_client.completions.create( + model=model, + prompt="Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", + max_tokens=200, + temperature=0, + stop=[","], + ) + + assert response.choices[0].finish_reason == "stop" + assert "," not in response.choices[0].text + + def test_non_streaming_echo(self, model, api_client): + """Test that echo=True prepends the prompt to the output.""" + + prompt = "The capital of France is" + response = api_client.completions.create( + model=model, + prompt=prompt, + max_tokens=20, + temperature=0, + echo=True, + ) + + assert response.choices[0].text.startswith(prompt) + + def test_non_streaming_suffix(self, model, api_client): + """Test that suffix is appended to the output.""" + + suffix = " -- END" + response = api_client.completions.create( + model=model, + prompt="The capital of France is", + max_tokens=20, + temperature=0, + suffix=suffix, + ) + + assert response.choices[0].text.endswith(suffix) + + @pytest.mark.parametrize("n", [1, 2]) + def test_non_streaming_parallel_sampling(self, model, api_client, n): + """Test parallel sampling with n > 1.""" + + temperature = 0.7 if n > 1 else 0 + response = api_client.completions.create( + model=model, + prompt="The meaning of life is", + max_tokens=30, + temperature=temperature, + n=n, + ) + + assert len(response.choices) == n + for i, choice in enumerate(response.choices): + assert choice.index == i + assert isinstance(choice.text, str) + assert len(choice.text) > 0 + + def test_non_streaming_usage(self, model, api_client): + """Test that usage statistics are returned correctly.""" + + response = api_client.completions.create( + model=model, + prompt="Hello", + max_tokens=10, + temperature=0, + ) + + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens == ( + response.usage.prompt_tokens + response.usage.completion_tokens + ) + + +@pytest.mark.engine("sglang", "vllm") +@pytest.mark.gpu(1) +@pytest.mark.model("meta-llama/Llama-3.1-8B-Instruct") +@pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True) +class TestCompletionStreaming: + """Tests for streaming /v1/completions API.""" + + def test_streaming_basic(self, model, api_client): + """Test streaming completion returns chunks with text deltas.""" + + stream = api_client.completions.create( + model=model, + prompt="The capital of France is", + max_tokens=20, + temperature=0, + stream=True, + ) + + texts = [] + finish_reasons = [] + for chunk in stream: + assert chunk.object == "text_completion" + if chunk.choices: + choice = chunk.choices[0] + if choice.text: + texts.append(choice.text) + if choice.finish_reason: + finish_reasons.append(choice.finish_reason) + + assert len(texts) > 0, "No text chunks received" + full_text = "".join(texts) + assert len(full_text) > 0 + assert len(finish_reasons) == 1 + assert finish_reasons[0] in ("stop", "length") + + def test_streaming_stop_sequence(self, model, api_client): + """Test that stop sequences work in streaming mode.""" + + stream = api_client.completions.create( + model=model, + prompt="Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", + max_tokens=200, + temperature=0, + stop=[","], + stream=True, + ) + + texts = [] + finish_reasons = [] + for chunk in stream: + if chunk.choices: + choice = chunk.choices[0] + if choice.text: + texts.append(choice.text) + if choice.finish_reason: + finish_reasons.append(choice.finish_reason) + + assert "stop" in finish_reasons + full_text = "".join(texts) + assert "," not in full_text + + def test_streaming_collects_full_text(self, model, api_client): + """Test that streaming deltas concatenate to a non-empty completion.""" + + stream = api_client.completions.create( + model=model, + prompt="The capital of France is", + max_tokens=20, + temperature=0, + stream=True, + ) + full_text = "".join(c.choices[0].text for c in stream if c.choices and c.choices[0].text) + + assert len(full_text) > 0 + assert "Paris" in full_text From 3eaa520cfedd01f18cf39336630f709cffeaa794 Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Wed, 1 Apr 2026 15:51:40 -0700 Subject: [PATCH 2/6] test(completions): add max_tokens=0 echo tests for streaming and non-streaming Signed-off-by: VS Chandra Mourya --- e2e_test/completions/test_basic.py | 43 ++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py index fef02e738..371cb51a7 100644 --- a/e2e_test/completions/test_basic.py +++ b/e2e_test/completions/test_basic.py @@ -141,6 +141,21 @@ def test_non_streaming_usage(self, model, api_client): response.usage.prompt_tokens + response.usage.completion_tokens ) + def test_non_streaming_echo_max_tokens_zero(self, model, api_client): + """Test that echo=True with max_tokens=0 returns just the prompt.""" + + prompt = "The capital of France is" + response = api_client.completions.create( + model=model, + prompt=prompt, + max_tokens=0, + temperature=0, + echo=True, + ) + + assert response.choices[0].text == prompt + assert response.usage.completion_tokens == 0 + @pytest.mark.engine("sglang", "vllm") @pytest.mark.gpu(1) @@ -217,3 +232,31 @@ def test_streaming_collects_full_text(self, model, api_client): assert len(full_text) > 0 assert "Paris" in full_text + + def test_streaming_echo_max_tokens_zero(self, model, api_client): + """Test that echo=True with max_tokens=0 streams just the prompt.""" + + prompt = "The capital of France is" + stream = api_client.completions.create( + model=model, + prompt=prompt, + max_tokens=0, + temperature=0, + echo=True, + stream=True, + ) + + texts = [] + finish_reasons = [] + for chunk in stream: + if chunk.choices: + choice = chunk.choices[0] + if choice.text: + texts.append(choice.text) + if choice.finish_reason: + finish_reasons.append(choice.finish_reason) + + full_text = "".join(texts) + assert full_text == prompt, f"Expected echoed prompt, got: {full_text!r}" + assert len(finish_reasons) == 1 + assert finish_reasons[0] == "stop" From 32877e8021245242fca971f21474fab01e38da1b Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Wed, 1 Apr 2026 16:41:06 -0700 Subject: [PATCH 3/6] fix(completions): address review nits on E2E tests - Extract _collect_stream helper to deduplicate streaming loop - Assert chunk.object == "text_completion" on every streaming chunk - Strengthen stop sequence assertions (exactly 1 finish_reason + text) - Accept "stop" or "length" for max_tokens=0 finish_reason - Add finish_reason assertion to non-streaming max_tokens=0 test Refs: #1021 Signed-off-by: VS Chandra Mourya --- e2e_test/completions/test_basic.py | 62 +++++++++++++----------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py index 371cb51a7..a4517d51e 100644 --- a/e2e_test/completions/test_basic.py +++ b/e2e_test/completions/test_basic.py @@ -154,6 +154,7 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client): ) assert response.choices[0].text == prompt + assert response.choices[0].finish_reason in ("stop", "length") assert response.usage.completion_tokens == 0 @@ -164,6 +165,21 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client): class TestCompletionStreaming: """Tests for streaming /v1/completions API.""" + @staticmethod + def _collect_stream(stream): + """Consume a streaming response, returning (full_text, finish_reasons).""" + texts = [] + finish_reasons = [] + for chunk in stream: + assert chunk.object == "text_completion" + if chunk.choices: + choice = chunk.choices[0] + if choice.text: + texts.append(choice.text) + if choice.finish_reason: + finish_reasons.append(choice.finish_reason) + return "".join(texts), finish_reasons + def test_streaming_basic(self, model, api_client): """Test streaming completion returns chunks with text deltas.""" @@ -175,20 +191,9 @@ def test_streaming_basic(self, model, api_client): stream=True, ) - texts = [] - finish_reasons = [] - for chunk in stream: - assert chunk.object == "text_completion" - if chunk.choices: - choice = chunk.choices[0] - if choice.text: - texts.append(choice.text) - if choice.finish_reason: - finish_reasons.append(choice.finish_reason) + full_text, finish_reasons = self._collect_stream(stream) - assert len(texts) > 0, "No text chunks received" - full_text = "".join(texts) - assert len(full_text) > 0 + assert len(full_text) > 0, "No text chunks received" assert len(finish_reasons) == 1 assert finish_reasons[0] in ("stop", "length") @@ -204,18 +209,11 @@ def test_streaming_stop_sequence(self, model, api_client): stream=True, ) - texts = [] - finish_reasons = [] - for chunk in stream: - if chunk.choices: - choice = chunk.choices[0] - if choice.text: - texts.append(choice.text) - if choice.finish_reason: - finish_reasons.append(choice.finish_reason) + full_text, finish_reasons = self._collect_stream(stream) - assert "stop" in finish_reasons - full_text = "".join(texts) + assert len(finish_reasons) == 1 + assert finish_reasons[0] == "stop" + assert len(full_text) > 0, "No text chunks received" assert "," not in full_text def test_streaming_collects_full_text(self, model, api_client): @@ -228,7 +226,8 @@ def test_streaming_collects_full_text(self, model, api_client): temperature=0, stream=True, ) - full_text = "".join(c.choices[0].text for c in stream if c.choices and c.choices[0].text) + + full_text, _ = self._collect_stream(stream) assert len(full_text) > 0 assert "Paris" in full_text @@ -246,17 +245,8 @@ def test_streaming_echo_max_tokens_zero(self, model, api_client): stream=True, ) - texts = [] - finish_reasons = [] - for chunk in stream: - if chunk.choices: - choice = chunk.choices[0] - if choice.text: - texts.append(choice.text) - if choice.finish_reason: - finish_reasons.append(choice.finish_reason) + full_text, finish_reasons = self._collect_stream(stream) - full_text = "".join(texts) assert full_text == prompt, f"Expected echoed prompt, got: {full_text!r}" assert len(finish_reasons) == 1 - assert finish_reasons[0] == "stop" + assert finish_reasons[0] in ("stop", "length") From 830952d26d3e94e0aa1f53a61da2830698342887 Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Wed, 8 Apr 2026 09:26:57 -0700 Subject: [PATCH 4/6] ci(completions): add e2e-1gpu-completions job to PR pipeline Signed-off-by: VS Chandra Mourya --- .github/workflows/pr-test-rust.yml | 37 +++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml index 67188d40d..dbd1fbe72 100644 --- a/.github/workflows/pr-test-rust.yml +++ b/.github/workflows/pr-test-rust.yml @@ -367,6 +367,7 @@ jobs: outputs: common: ${{ steps.filter.outputs.common }} chat-completions: ${{ steps.filter.outputs.chat-completions }} + completions: ${{ steps.filter.outputs.completions }} agentic: ${{ steps.filter.outputs.agentic }} embeddings: ${{ steps.filter.outputs.embeddings }} go-bindings: ${{ steps.filter.outputs.go-bindings }} @@ -410,6 +411,10 @@ jobs: - 'e2e_test/messages/**' - 'scripts/ci_agentic_svc_deps.sh' - 'scripts/oracle_flyway/**' + completions: + - 'crates/grpc_client/**' + - 'grpc_servicer/**' + - 'e2e_test/completions/**' embeddings: - 'e2e_test/embeddings/**' go-bindings: @@ -450,6 +455,35 @@ jobs: test_dirs: e2e_test/chat_completions secrets: inherit + e2e-1gpu-completions: + name: e2e-1gpu-completions (${{ matrix.engine }}) + needs: [build-wheel, detect-changes] + if: >- + always() + && !cancelled() + && needs.build-wheel.result == 'success' + && github.actor != 'dependabot[bot]' + && (github.event_name != 'pull_request' + || (needs.detect-changes.result == 'success' + && (needs.detect-changes.outputs.common == 'true' + || needs.detect-changes.outputs.completions == 'true'))) + strategy: + fail-fast: false + matrix: + include: + - engine: sglang + timeout: 20 + - engine: vllm + timeout: 20 + uses: ./.github/workflows/e2e-gpu-job.yml + with: + engine: ${{ matrix.engine }} + gpu_tier: "1" + runner: 1-gpu-h100 + timeout: ${{ matrix.timeout }} + test_dirs: e2e_test/completions + secrets: inherit + e2e-1gpu-embeddings: name: e2e-1gpu-embeddings (${{ matrix.engine }}) needs: [build-wheel, detect-changes] @@ -913,7 +947,7 @@ jobs: path: benchmark_go_bindings/ finish: - needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-2gpu-chat, e2e-2gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e] + needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-completions, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-2gpu-chat, e2e-2gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e] if: always() runs-on: k8s-runner-cpu permissions: {} @@ -928,6 +962,7 @@ jobs: "${{ needs.unit-tests.result }}" == "failure" || \ "${{ needs.benchmarks.result }}" == "failure" || \ "${{ needs.e2e-1gpu-chat.result }}" == "failure" || \ + "${{ needs.e2e-1gpu-completions.result }}" == "failure" || \ "${{ needs.e2e-1gpu-embeddings.result }}" == "failure" || \ "${{ needs.e2e-1gpu-gateway.result }}" == "failure" || \ "${{ needs.e2e-2gpu-chat.result }}" == "failure" || \ From bf7c93a08bbaa18f47aa1a828f3ecda9480a99b8 Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Wed, 8 Apr 2026 09:27:32 -0700 Subject: [PATCH 5/6] fix(completions): address review nits on E2E tests Signed-off-by: VS Chandra Mourya --- e2e_test/completions/test_basic.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py index a4517d51e..1568c2fe9 100644 --- a/e2e_test/completions/test_basic.py +++ b/e2e_test/completions/test_basic.py @@ -6,12 +6,8 @@ from __future__ import annotations -import logging - import pytest -logger = logging.getLogger(__name__) - @pytest.mark.engine("sglang", "vllm") @pytest.mark.gpu(1) @@ -20,6 +16,8 @@ class TestCompletionBasic: """Tests for OpenAI-compatible /v1/completions API (non-streaming).""" + STOP_SEQUENCE_TRIMMED = True + def test_non_streaming_basic(self, model, api_client): """Test basic non-streaming text completion with response structure.""" @@ -75,7 +73,11 @@ def test_non_streaming_stop_sequence(self, model, api_client): ) assert response.choices[0].finish_reason == "stop" - assert "," not in response.choices[0].text + text = response.choices[0].text + if self.STOP_SEQUENCE_TRIMMED: + assert "," not in text, f"Stop sequence ',' should not appear in output: {text}" + else: + assert text.endswith(","), f"Stop sequence ',' should be the suffix of output: {text}" def test_non_streaming_echo(self, model, api_client): """Test that echo=True prepends the prompt to the output.""" @@ -165,6 +167,8 @@ def test_non_streaming_echo_max_tokens_zero(self, model, api_client): class TestCompletionStreaming: """Tests for streaming /v1/completions API.""" + STOP_SEQUENCE_TRIMMED = True + @staticmethod def _collect_stream(stream): """Consume a streaming response, returning (full_text, finish_reasons).""" @@ -214,7 +218,14 @@ def test_streaming_stop_sequence(self, model, api_client): assert len(finish_reasons) == 1 assert finish_reasons[0] == "stop" assert len(full_text) > 0, "No text chunks received" - assert "," not in full_text + if self.STOP_SEQUENCE_TRIMMED: + assert "," not in full_text, ( + f"Stop sequence ',' should not appear in output: {full_text}" + ) + else: + assert full_text.endswith(","), ( + f"Stop sequence ',' should be the suffix of output: {full_text}" + ) def test_streaming_collects_full_text(self, model, api_client): """Test that streaming deltas concatenate to a non-empty completion.""" @@ -230,7 +241,6 @@ def test_streaming_collects_full_text(self, model, api_client): full_text, _ = self._collect_stream(stream) assert len(full_text) > 0 - assert "Paris" in full_text def test_streaming_echo_max_tokens_zero(self, model, api_client): """Test that echo=True with max_tokens=0 streams just the prompt.""" From 836154c8f38770bff92b4da69bf33857cc80ddfb Mon Sep 17 00:00:00 2001 From: VS Chandra Mourya Date: Wed, 8 Apr 2026 10:20:51 -0700 Subject: [PATCH 6/6] fix(completions): stabilize E2E tests for CI (vllm max_tokens=0, flaky stop seq) Signed-off-by: VS Chandra Mourya --- e2e_test/completions/test_basic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/e2e_test/completions/test_basic.py b/e2e_test/completions/test_basic.py index 1568c2fe9..e91d3f90f 100644 --- a/e2e_test/completions/test_basic.py +++ b/e2e_test/completions/test_basic.py @@ -143,6 +143,7 @@ def test_non_streaming_usage(self, model, api_client): response.usage.prompt_tokens + response.usage.completion_tokens ) + @pytest.mark.skip_for_runtime("vllm", reason="vLLM rejects max_tokens=0") def test_non_streaming_echo_max_tokens_zero(self, model, api_client): """Test that echo=True with max_tokens=0 returns just the prompt.""" @@ -217,7 +218,6 @@ def test_streaming_stop_sequence(self, model, api_client): assert len(finish_reasons) == 1 assert finish_reasons[0] == "stop" - assert len(full_text) > 0, "No text chunks received" if self.STOP_SEQUENCE_TRIMMED: assert "," not in full_text, ( f"Stop sequence ',' should not appear in output: {full_text}" @@ -242,6 +242,7 @@ def test_streaming_collects_full_text(self, model, api_client): assert len(full_text) > 0 + @pytest.mark.skip_for_runtime("vllm", reason="vLLM rejects max_tokens=0") def test_streaming_echo_max_tokens_zero(self, model, api_client): """Test that echo=True with max_tokens=0 streams just the prompt."""