From 2a649892401365e447161799e9fac8a12d950021 Mon Sep 17 00:00:00 2001 From: Tom Wieczorek Date: Mon, 15 Jun 2026 15:51:52 +0200 Subject: [PATCH 1/3] Add a fail-fast input to the OS test workflow This is useful for the release workflow, where we're only interested in successful runs and a quicker feedback loop. Signed-off-by: Tom Wieczorek (cherry picked from commit cccefbfa00d48da3cd03eac594f648a8b15f9b4a) (cherry picked from commit 36dc6250e67747ad008bf30107c3e638d5d3c074) (cherry picked from commit 03fb7f19db876b24ef66984fcc4f2a3d17aa5467) (cherry picked from commit ea24fa55de7fa5d7e37a9ad839cf1b908ce8f273) --- .github/workflows/ostests-e2e.yaml | 15 ++++++++++++++- .github/workflows/ostests-matrix.yaml | 5 +++++ .github/workflows/release.yml | 1 + hack/ostests/README.md | 3 +++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ostests-e2e.yaml b/.github/workflows/ostests-e2e.yaml index 7921e666414e..c6e8738ed0e5 100644 --- a/.github/workflows/ostests-e2e.yaml +++ b/.github/workflows/ostests-e2e.yaml @@ -16,6 +16,10 @@ on: type: string description: The selector for the e2e tests to be run. default: \[Conformance\] + fail-fast: + type: boolean + description: Stop the e2e test suite after the first test failure. + default: false os: type: string description: The operating system to test. 
@@ -64,6 +68,7 @@ jobs: env: E2E_FOCUS: ${{ inputs.e2e-focus }} + E2E_FAIL_FAST: ${{ inputs.fail-fast }} TF_VAR_os: ${{ inputs.os }} TF_VAR_additional_tags: '{ "ostests.k0sproject.io/github-run-id"="${{ github.run_id }}", @@ -167,12 +172,16 @@ jobs: E2E_CONCURRENCY_LEVEL: ${{ inputs.e2e-concurrency-level }} run: | make bin/sonobuoy + ginkgoArgs="-v --timeout=120m --procs=$E2E_CONCURRENCY_LEVEL" + if [ "$E2E_FAIL_FAST" = true ]; then + ginkgoArgs="$ginkgoArgs --fail-fast" + fi bin/sonobuoy run -p e2e --wait=150 \ --kubernetes-version=v"$KUBERNETES_VERSION" \ --plugin-env=e2e.E2E_PARALLEL=true \ --plugin-env=e2e.E2E_FOCUS="$E2E_FOCUS" \ --plugin-env=e2e.E2E_SKIP='\[Serial\]' \ - --plugin-env=e2e.E2E_EXTRA_GINKGO_ARGS="-v --timeout=120m --procs=$E2E_CONCURRENCY_LEVEL" + --plugin-env=e2e.E2E_EXTRA_GINKGO_ARGS="$ginkgoArgs" - name: "e2e tests :: Retrieve parallel results" id: e2e-retrieve-parallel @@ -195,6 +204,10 @@ jobs: timeout-minutes: 180 # three hours run: | make bin/sonobuoy + ginkgoArgs="-v --timeout=120m" + if [ "$E2E_FAIL_FAST" = true ]; then + ginkgoArgs="$ginkgoArgs --fail-fast" + fi bin/sonobuoy run -p e2e --wait=150 \ --kubernetes-version=v"$KUBERNETES_VERSION" \ --plugin-env=e2e.E2E_FOCUS="$E2E_FOCUS" \ diff --git a/.github/workflows/ostests-matrix.yaml b/.github/workflows/ostests-matrix.yaml index 6bf395dbe1fa..dea1f0d49be3 100644 --- a/.github/workflows/ostests-matrix.yaml +++ b/.github/workflows/ostests-matrix.yaml @@ -20,6 +20,10 @@ on: type: string description: The selector for the e2e tests to be run. default: \[Conformance\] + fail-fast: + type: boolean + description: Stop each e2e test suite after the first test failure. + default: false oses: type: string description: The operating systems to test. 
@@ -89,6 +93,7 @@ jobs: k0sctl-version: ${{ inputs.k0sctl-version }} e2e-concurrency-level: ${{ fromJSON(inputs.e2e-concurrency-level) }} # infamous GH workflows bug that looses type information (actions/runner#2206) e2e-focus: ${{ inputs.e2e-focus }} + fail-fast: ${{ inputs.fail-fast }} os: ${{ matrix.os }} arch: ${{ inputs.arch }} network-provider: ${{ matrix.network-provider }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b0c1faf875bc..ca7601b208cf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -572,6 +572,7 @@ jobs: os: debian_12 arch: ${{ matrix.arch }} network-provider: kuberouter + fail-fast: true secrets: aws-access-key-id: ${{ secrets.AWS_TERRAFORM_ID }} aws-secret-access-key: ${{ secrets.AWS_TERRAFORM_KEY }} diff --git a/hack/ostests/README.md b/hack/ostests/README.md index 0a1fce212788..5a2c38505535 100644 --- a/hack/ostests/README.md +++ b/hack/ostests/README.md @@ -178,6 +178,9 @@ $ gh workflow run ostests-matrix.yaml --ref some/experimental/branch \ To see runs for this workflow, try: gh run list --workflow=ostests-matrix.yaml ``` +Add `-f fail-fast=true` to the above command line to stop the e2e test suite +after the first test failure has been recorded. + [gh]: https://github.com/cli/cli ## TODO From 31bbbdbeb4de9eb3737ec91a79ab1b2c4983e0d6 Mon Sep 17 00:00:00 2001 From: Tom Wieczorek Date: Mon, 15 Jun 2026 16:31:32 +0200 Subject: [PATCH 2/3] Tune OS test timeouts for e2e runs This mostly applies to full conformance test runs that are called from the release workflow. In order to obtain the "Certified Kubernetes" badge, these cannot be run in parallel. They currently take around two hours to complete, close to the Ginkgo test timeout. Funnily enough, the 64-bit ARM VMs are slightly faster than the 64-bit x86_64 VMs, resulting in mostly successful arm64 runs and very unreliable amd64 runs. 
Moreover, Ginkgo/Sonobuoy simply report any overall test timeouts as failures of the pending tests, with no indication that the tests did not even complete. This makes it very difficult to realize that this is a timeout issue rather than a flaky test. For an all-serial workflow step, use the following timeouts: * three hours for Ginkgo, * three and a half hours for Sonobuoy, * and four hours for the workflow step. When the serial step follows a parallel step, use tighter timeouts because only the remaining serial tests should run: * thirty minutes for Ginkgo, * forty-five minutes for Sonobuoy, * and one hour for the workflow step. See: 71b3209d57 ("Use OS tests in release workflow") Signed-off-by: Tom Wieczorek (cherry picked from commit 9746895e25f5dba0f51969d5a5026e7f251b37db) (cherry picked from commit a23c8ccd95aa046edc5c90bf15464270a97b614a) (cherry picked from commit cca1b474bbc9c8aef0b96875a29de1c80b480638) (cherry picked from commit 7a535b55b89a77d0b0c785d71fe6b305a0650bf7) --- .github/workflows/ostests-e2e.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ostests-e2e.yaml b/.github/workflows/ostests-e2e.yaml index c6e8738ed0e5..719ace734b99 100644 --- a/.github/workflows/ostests-e2e.yaml +++ b/.github/workflows/ostests-e2e.yaml @@ -201,14 +201,17 @@ jobs: - name: "e2e tests :: Run serial tests" working-directory: inttest - timeout-minutes: 180 # three hours + timeout-minutes: ${{ inputs.e2e-concurrency-level > 1 && 60 || 240 }} # 1 hour in parallel mode, 4 hours in serial mode + env: + K0S_SONOBUOY_WAIT: "${{ inputs.e2e-concurrency-level > 1 && 45 || 210 }}" # 45 mins in parallel mode, 3.5 hours in serial mode + K0S_GINKGO_TIMEOUT: "${{ inputs.e2e-concurrency-level > 1 && '30m' || '3h' }}" # 30 mins in parallel mode, 3 hours in serial mode run: | make bin/sonobuoy - ginkgoArgs="-v --timeout=120m" + ginkgoArgs="-v --timeout=$K0S_GINKGO_TIMEOUT" if [ "$E2E_FAIL_FAST" = true ]; then ginkgoArgs="$ginkgoArgs
--fail-fast" fi - bin/sonobuoy run -p e2e --wait=150 \ + bin/sonobuoy run -p e2e --wait="$K0S_SONOBUOY_WAIT" \ --kubernetes-version=v"$KUBERNETES_VERSION" \ --plugin-env=e2e.E2E_FOCUS="$E2E_FOCUS" \ --plugin-env=e2e.E2E_EXTRA_GINKGO_ARGS='-v --timeout=120m' From 0b2db08e6e86871440f59934b7f067873bb9541a Mon Sep 17 00:00:00 2001 From: Tom Wieczorek Date: Tue, 16 Jun 2026 09:26:54 +0200 Subject: [PATCH 3/3] Publish OS test results as GitHub step summaries Keep the format, but instead of writing it into a console group, write it into a step summary. Signed-off-by: Tom Wieczorek (cherry picked from commit cfc1ab1d4ace082d52d7126814e1e59e0f3030a8) (cherry picked from commit 255f47973ed41bcdb92648addbbcf3fe750397a7) (cherry picked from commit c3b437b899c096ce8e65a6c08953ccb6936d2406) (cherry picked from commit c39a6ba0e4fb3366d95357a7caff986020273c9f) --- .github/workflows/ostests-e2e.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ostests-e2e.yaml b/.github/workflows/ostests-e2e.yaml index 719ace734b99..ca114c943ffb 100644 --- a/.github/workflows/ostests-e2e.yaml +++ b/.github/workflows/ostests-e2e.yaml @@ -245,11 +245,18 @@ jobs: run: | fail=0 for f in sonobuoy-e2e-*.tar.gz; do - echo "::group::$f" - bin/sonobuoy results "$f" - numNotPassedOrSkipped=$(bin/sonobuoy results "$f" -p=e2e --mode=detailed | jq --slurp '[.[] | select(.status != "passed" and .status != "skipped")] | length') - echo "Number of tests that didn't pass and weren't skipped: $numNotPassedOrSkipped" - echo ::endgroup:: + { + echo "### $f" + numNotPassedOrSkipped=$(bin/sonobuoy results "$f" -p=e2e --mode=detailed | jq --slurp '[.[] | select(.status != "passed" and .status != "skipped")] | length') + echo '
' + echo '' + echo "Number of tests that didn't pass and weren't skipped: $numNotPassedOrSkipped" + echo '' + echo + echo '```text' + bin/sonobuoy results "$f" + echo '```' + } >>"$GITHUB_STEP_SUMMARY" [ "$numNotPassedOrSkipped" = 0 ] || fail=1 done [ "$fail" = 0 ] || exit 1