test(e2e): address gpu workload review feedback

elezar · elezar · commit 8426fac56a61 · 2026-06-10T22:49:56.000+02:00
Signed-off-by: Evan Lezar <elezar@nvidia.com>
diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md
@@ -3,8 +3,7 @@
 
 # GPU workload images
 
-This directory defines workload test images currently used by the OpenShell GPU
-e2e suite.
+This directory defines workload test images for OpenShell GPU validation.
 
 ## Contract
 
@@ -23,10 +22,11 @@ Each workload image must:
   command explicitly.
 
 OpenShell sandbox creation replaces the image entrypoint with the supervisor and
-does not run the OCI image `CMD`. E2e tests that use these images through
-OpenShell run the command from each manifest entry explicitly.
+does not run the OCI image `CMD`. When these images are used through OpenShell,
+the workload command from each manifest entry must be passed explicitly.
 
-The test harness is manifest-driven. Each workload entry carries:
+The image build task writes a local workload manifest. Each workload entry
+carries:
 
 - `name`
 - `image`
@@ -61,24 +61,27 @@ The build task uses `tasks/scripts/container-engine.sh`. Set
 `CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine
 explicitly. When unset, the helper uses its existing auto-detection behavior.
 
-Local tags use the current commit short SHA. Dirty local trees append `-dirty`.
-Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag.
+Local tags use the current commit short SHA plus a short fingerprint of the
+external build inputs. Dirty local trees append `-dirty`. Set
+`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag.
 
 The task writes the latest build refs to:
 
 ```text
 e2e/gpu/images/.build/latest.env
 ```
 
-The task also writes the local workload manifest used by the Rust e2e runner:
+The task also writes a local workload manifest for downstream tooling and
+future workload-runner integration:
 
 ```text
 e2e/gpu/images/.build/workloads.yaml
 ```
 
 That local manifest is created by `mise run e2e:workloads:build`. It contains
 the full image reference, command, expected outcome, and requirements for each
-selected workload.
+selected workload. It also records the external build inputs used to produce
+the workload images.
 
 Use the env file in later commands:
 
@@ -87,7 +90,8 @@ source e2e/gpu/images/.build/latest.env
 ```
 
 That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local
-manifest. The per-image refs remain available as a convenience for direct
+manifest. The current checked-in Rust GPU e2e target does not consume this
+manifest yet. The per-image refs remain available as a convenience for direct
 container-engine validation.
 
 ## Direct Validation
@@ -120,57 +124,14 @@ where Podman CDI is configured.
 Direct container-engine validation catches image, CDI, CUDA, and host GPU setup
 issues before OpenShell sandbox behavior is involved.
 
-## Manifest-Driven Validation
+## OpenShell GPU E2E
 
-The Rust GPU validation target is:
+The current Rust GPU validation target is:
 
 ```shell
-cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture
+mise run e2e:gpu
 ```
 
-The workload validation path reads:
-
-```text
-OPENSHELL_E2E_WORKLOAD_MANIFEST
-```
-
-When that variable is unset, the runner uses the default local manifest path:
-
-```text
-e2e/gpu/images/.build/workloads.yaml
-```
-
-If neither path exists, the workload validation test prints a clear skip
-message telling you to run:
-
-```shell
-mise run e2e:workloads:build
-```
-
-or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest.
-
-Each manifest entry supplies the sandbox image and command. OpenShell runs that
-command through `openshell sandbox create --gpu --from <image> -- <command>`.
-The test runner iterates all GPU-tagged workload entries and enforces each
-entry's declared expectation:
-
-- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS`
-- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE`
-
-The current local manifest includes three workloads:
-
-- `smoke-pass` expected to pass
-- `smoke-fail` expected to fail
-- `cuda-basic` expected to pass
-
-## External Manifests
-
-External workload catalogs can use the same schema. Point the runner at one
-with:
-
-```shell
-export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml
-```
-
-That lets alternate workload manifests use the same test runner without
-introducing per-workload env vars.
+That target runs `gpu_device_selection`. It validates GPU request and device
+selection behavior against a Docker-backed gateway. It does not run the
+workload manifest generated by `mise run e2e:workloads:build`.
diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh
@@ -16,6 +16,7 @@ BASE_IMAGE="${OPENSHELL_SANDBOX_BASE_IMAGE:-ghcr.io/nvidia/openshell-community/s
 CUDA_BUILD_IMAGE="${CUDA_BUILD_IMAGE:-nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04}"
 CUDA_SAMPLES_REPO="${CUDA_SAMPLES_REPO:-https://github.com/NVIDIA/cuda-samples}"
 CUDA_SAMPLES_REF="${CUDA_SAMPLES_REF:-v12.8}"
+SUPPORTED_IMAGES=(smoke-pass smoke-fail cuda-basic)
 
 shell_quote() {
   local value=$1
@@ -39,22 +40,13 @@ yaml_quote() {
 }
 
 available_image_dirs() {
-  local dockerfile
   local preferred
-  local seen=" "
 
-  for preferred in smoke-pass smoke-fail cuda-basic; do
+  for preferred in "${SUPPORTED_IMAGES[@]}"; do
     if [[ -f "${IMAGES_ROOT}/${preferred}/Dockerfile" ]]; then
       echo "${preferred}"
-      seen+="${preferred} "
     fi
   done
-
-  find "${IMAGES_ROOT}" -mindepth 2 -maxdepth 2 -name Dockerfile -type f | sort | while IFS= read -r dockerfile; do
-    name="$(basename "$(dirname "${dockerfile}")")"
-    [[ "${seen}" == *" ${name} "* ]] && continue
-    echo "${name}"
-  done
 }
 
 contains_image() {
@@ -90,6 +82,19 @@ image_expectation() {
   esac
 }
 
+workload_input_fingerprint() {
+  local -a names=("$@")
+  # Hash the base image (plus CUDA inputs only when cuda-basic is given); print 8 hash chars.
+  {
+    printf 'OPENSHELL_SANDBOX_BASE_IMAGE=%s\n' "${BASE_IMAGE}"
+    if contains_image cuda-basic "${names[@]}"; then
+      printf 'CUDA_BUILD_IMAGE=%s\n' "${CUDA_BUILD_IMAGE}"
+      printf 'CUDA_SAMPLES_REPO=%s\n' "${CUDA_SAMPLES_REPO}"
+      printf 'CUDA_SAMPLES_REF=%s\n' "${CUDA_SAMPLES_REF}"
+    fi
+  } | git -C "${ROOT}" hash-object --stdin | cut -c1-8
+}
+
 mapfile -t available < <(available_image_dirs)
 if [[ ${#available[@]} -eq 0 ]]; then
   echo "No GPU workload image Dockerfiles found under ${IMAGES_ROOT}" >&2
@@ -128,11 +133,13 @@ fi
 if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then
   image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}"
 else
-  image_tag="${source_short_sha}"
+  input_fingerprint="$(workload_input_fingerprint "${selected[@]}")"
+  image_tag="${source_short_sha}-${input_fingerprint}"
   if [[ "${source_dirty}" == "true" ]]; then
     image_tag="${image_tag}-dirty"
   fi
 fi
+input_fingerprint="$(workload_input_fingerprint "${selected[@]}")"
 
 declare -A image_refs=()
 
@@ -148,12 +155,23 @@ for name in "${selected[@]}"; do
   build_args=(
     --build-arg "OPENSHELL_SANDBOX_BASE_IMAGE=${BASE_IMAGE}"
   )
+  build_labels=(
+    --label "com.nvidia.openshell.gpu-workload.source=${name}"
+    --label "com.nvidia.openshell.gpu-workload.base-image=${BASE_IMAGE}"
+    --label "com.nvidia.openshell.gpu-workload.input-fingerprint=${input_fingerprint}"
+    --label "org.opencontainers.image.revision=${source_sha}"
+  )
   if [[ "${name}" == "cuda-basic" ]]; then
     build_args+=(
       --build-arg "CUDA_BUILD_IMAGE=${CUDA_BUILD_IMAGE}"
       --build-arg "CUDA_SAMPLES_REPO=${CUDA_SAMPLES_REPO}"
       --build-arg "CUDA_SAMPLES_REF=${CUDA_SAMPLES_REF}"
     )
+    build_labels+=(
+      --label "com.nvidia.openshell.gpu-workload.cuda-build-image=${CUDA_BUILD_IMAGE}"
+      --label "com.nvidia.openshell.gpu-workload.cuda-samples-repo=${CUDA_SAMPLES_REPO}"
+      --label "com.nvidia.openshell.gpu-workload.cuda-samples-ref=${CUDA_SAMPLES_REF}"
+    )
   fi
 
   echo
@@ -162,8 +180,7 @@ for name in "${selected[@]}"; do
     --load \
     --provenance=false \
     -t "${image_ref}" \
-    --label "com.nvidia.openshell.gpu-workload.source=${name}" \
-    --label "org.opencontainers.image.revision=${source_sha}" \
+    "${build_labels[@]}" \
     "${build_args[@]}" \
     "${context}"
 
@@ -180,6 +197,11 @@ manifest_path="${BUILD_DIR}/workloads.yaml"
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}"
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}"
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}"
+  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_INPUT_FINGERPRINT "${input_fingerprint}"
+  write_env_var OPENSHELL_SANDBOX_BASE_IMAGE "${BASE_IMAGE}"
+  write_env_var CUDA_BUILD_IMAGE "${CUDA_BUILD_IMAGE}"
+  write_env_var CUDA_SAMPLES_REPO "${CUDA_SAMPLES_REPO}"
+  write_env_var CUDA_SAMPLES_REF "${CUDA_SAMPLES_REF}"
   write_env_var OPENSHELL_GPU_WORKLOAD_CONTAINER_ENGINE "${CONTAINER_ENGINE}"
   write_env_var OPENSHELL_E2E_WORKLOAD_MANIFEST "${manifest_path}"
   for name in "${selected[@]}"; do
@@ -194,11 +216,17 @@ manifest_path="${BUILD_DIR}/workloads.yaml"
   echo "  path: $(yaml_quote "${IMAGES_ROOT}")"
   echo "  revision: $(yaml_quote "${source_sha}")"
   echo "  dirty: ${source_dirty}"
+  echo "  input_fingerprint: $(yaml_quote "${input_fingerprint}")"
   echo "  container_engine: $(yaml_quote "${CONTAINER_ENGINE}")"
+  echo "  inputs:"
+  echo "    openshell_sandbox_base_image: $(yaml_quote "${BASE_IMAGE}")"
+  echo "    cuda_build_image: $(yaml_quote "${CUDA_BUILD_IMAGE}")"
+  echo "    cuda_samples_repo: $(yaml_quote "${CUDA_SAMPLES_REPO}")"
+  echo "    cuda_samples_ref: $(yaml_quote "${CUDA_SAMPLES_REF}")"
   echo "workloads:"
   for name in "${selected[@]}"; do
     echo "  - name: $(yaml_quote "${name}")"
-    echo "    image: $(yaml_quote "${image_refs[${name}]}" )"
+    echo "    image: $(yaml_quote "${image_refs[${name}]}")"
     echo "    command:"
     echo "      - $(yaml_quote "/usr/local/bin/openshell-gpu-workload")"
     echo "    expect: $(yaml_quote "$(image_expectation "${name}")")"
diff --git a/tasks/test.toml b/tasks/test.toml
@@ -26,7 +26,7 @@ description = "Run Docker GPU end-to-end tests"
 depends = ["e2e:docker:gpu"]
 
 ["e2e:workloads:build"]
-description = "Build local workload test images and manifest for e2e validation"
+description = "Build local GPU workload test images and manifest"
 run = "bash tasks/scripts/e2e-gpu-build-images.sh"
 
 ["e2e:k3s:gpu"]