From c7e03203cb151a73e1f211976878b1d59471e4dc Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 16 Jun 2026 16:32:48 +0200 Subject: [PATCH] fix(docker): honor configured supervisor image Signed-off-by: Evan Lezar --- crates/openshell-driver-docker/README.md | 9 +- crates/openshell-driver-docker/src/lib.rs | 103 +++++++++++++------- crates/openshell-driver-docker/src/tests.rs | 30 ++++++ docs/reference/gateway-config.mdx | 1 + e2e/with-docker-gateway.sh | 101 ++++++------------- 5 files changed, 133 insertions(+), 111 deletions(-) diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 71159fe66..754802cbe 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -79,10 +79,11 @@ The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into each sandbox container. Resolution order is: 1. `supervisor_bin` in `[openshell.drivers.docker]`. -2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. -3. A local Linux cargo target build for the Docker daemon architecture. -4. `supervisor_image` in `[openshell.drivers.docker]`, or the - release-matched default supervisor image, extracting `/openshell-sandbox`. +2. `supervisor_image` in `[openshell.drivers.docker]`, extracting + `/openshell-sandbox` from that image. +3. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. +4. A local Linux cargo target build for the Docker daemon architecture. +5. The release-matched default supervisor image, extracting `/openshell-sandbox`. Release and Docker-image gateway builds bake the matching supervisor image tag into the binary at compile time. The default Docker supervisor image is not diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 963e7a0f7..c415151db 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -79,7 +79,8 @@ const DOCKER_NETWORK_DRIVER: &str = "bridge"; /// Default image holding the Linux `openshell-sandbox` binary. The gateway /// pulls this image and extracts the binary to a host-side cache when no -/// explicit `supervisor_bin` override or local build is available. +/// explicit `supervisor_bin`, configured `supervisor_image`, sibling binary, +/// or local build is available. const DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO: &str = "ghcr.io/nvidia/openshell/supervisor"; /// Return the default `ghcr.io/nvidia/openshell/supervisor:` reference @@ -155,10 +156,9 @@ pub struct DockerComputeConfig { /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. pub supervisor_bin: Option, - /// Optional override for the image the gateway pulls to extract the - /// Linux `openshell-sandbox` binary when no explicit binary path or - /// local build is available. Defaults to - /// `ghcr.io/nvidia/openshell/supervisor:`. + /// Optional image used to extract the Linux `openshell-sandbox` binary. + /// Ignored when `supervisor_bin` is set. See `resolve_supervisor_bin` for + /// the full resolution order. pub supervisor_image: Option, /// Host-side CA certificate for Docker sandbox mTLS. @@ -2948,56 +2948,89 @@ fn normalize_docker_arch(arch: &str) -> String { } } -pub(crate) async fn resolve_supervisor_bin( - docker: &Docker, +#[derive(Debug, Eq, PartialEq)] +enum SupervisorBinSource { + Binary(PathBuf), + Image(String), +} + +fn resolve_supervisor_bin_source( docker_config: &DockerComputeConfig, - daemon_arch: &str, -) -> CoreResult { + current_exe: Option<&Path>, + target_candidates: &[PathBuf], +) -> CoreResult { // Tier 1: explicit supervisor_bin in [openshell.drivers.docker]. if let Some(path) = docker_config.supervisor_bin.clone() { let path = canonicalize_existing_file(&path, "docker supervisor binary")?; validate_linux_elf_binary(&path)?; - return Ok(path); + return Ok(SupervisorBinSource::Binary(path)); + } + + // Tier 2: explicit supervisor_image in [openshell.drivers.docker]. + // A configured image should be the source of truth even when a local + // developer build is present under target/. + if let Some(image) = docker_config.supervisor_image.clone() { + return Ok(SupervisorBinSource::Image(image)); } - // Tier 2: sibling `openshell-sandbox` next to the running gateway + // Tier 3: sibling `openshell-sandbox` next to the running gateway // (release artifact layout). Linux-only because the sibling must be a // Linux ELF to bind-mount into a Linux container. - if cfg!(target_os = "linux") { - let current_exe = std::env::current_exe() - .map_err(|err| Error::config(format!("failed to resolve current executable: {err}")))?; - if let Some(parent) = current_exe.parent() { - let sibling = parent.join("openshell-sandbox"); - if sibling.is_file() { - let path = canonicalize_existing_file(&sibling, "docker supervisor binary")?; - if validate_linux_elf_binary(&path).is_ok() { - return Ok(path); - } + if cfg!(target_os = "linux") + && let Some(current_exe) = current_exe + && let Some(parent) = current_exe.parent() + { + let sibling = parent.join("openshell-sandbox"); + if sibling.is_file() { + let path = canonicalize_existing_file(&sibling, "docker supervisor binary")?; + if validate_linux_elf_binary(&path).is_ok() { + return Ok(SupervisorBinSource::Binary(path)); } } } - // Tier 3: local cargo target build (developer workflow). Preferred - // over a registry pull when available because it matches whatever the - // developer just built. - let target_candidates = linux_supervisor_candidates(daemon_arch); - for candidate in &target_candidates { + // Tier 4: local cargo target build (developer workflow). Preferred + // over the default registry image when available because it matches + // whatever the developer just built. + for candidate in target_candidates { if candidate.is_file() { let path = canonicalize_existing_file(candidate, "docker supervisor binary")?; if validate_linux_elf_binary(&path).is_ok() { - return Ok(path); + return Ok(SupervisorBinSource::Binary(path)); } } } - // Tier 4: pull the supervisor image from a registry and extract the - // binary to a host-side cache keyed by image content digest. This is - // the default path for released gateway binaries. - let image = docker_config - .supervisor_image - .clone() - .unwrap_or_else(default_docker_supervisor_image); - extract_supervisor_bin_from_image(docker, &image).await + // Tier 5: pull the release-matched default supervisor image and extract + // the binary to a host-side cache keyed by image content digest. + Ok(SupervisorBinSource::Image(default_docker_supervisor_image())) +} + +pub(crate) async fn resolve_supervisor_bin( + docker: &Docker, + docker_config: &DockerComputeConfig, + daemon_arch: &str, +) -> CoreResult { + let current_exe = + if cfg!(target_os = "linux") + && docker_config.supervisor_bin.is_none() + && docker_config.supervisor_image.is_none() + { + Some(std::env::current_exe().map_err(|err| { + Error::config(format!("failed to resolve current executable: {err}")) + })?) + } else { + None + }; + let target_candidates = linux_supervisor_candidates(daemon_arch); + + match resolve_supervisor_bin_source(docker_config, current_exe.as_deref(), &target_candidates)? + { + SupervisorBinSource::Binary(path) => Ok(path), + SupervisorBinSource::Image(image) => { + extract_supervisor_bin_from_image(docker, &image).await + } + } } fn linux_supervisor_candidates(daemon_arch: &str) -> Vec { diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index d5132fe1a..b7124b228 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -1711,6 +1711,36 @@ fn default_docker_supervisor_image_uses_nvidia_ghcr_repo() { ); } +#[test] +fn configured_supervisor_image_takes_precedence_over_local_binaries() { + let tempdir = TempDir::new().unwrap(); + let bin_dir = tempdir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let current_exe = bin_dir.join("openshell-gateway"); + let sibling = bin_dir.join("openshell-sandbox"); + fs::write(¤t_exe, b"gateway").unwrap(); + fs::write(&sibling, b"\x7fELFsibling").unwrap(); + + let local_build = tempdir.path().join("target/openshell-sandbox"); + fs::create_dir_all(local_build.parent().unwrap()).unwrap(); + fs::write(&local_build, b"\x7fELFlocal").unwrap(); + + let source = resolve_supervisor_bin_source( + &DockerComputeConfig { + supervisor_image: Some("example.com/openshell/supervisor:test".to_string()), + ..Default::default() + }, + Some(¤t_exe), + &[local_build], + ) + .unwrap(); + + assert_eq!( + source, + SupervisorBinSource::Image("example.com/openshell/supervisor:test".to_string()) + ); +} + #[test] fn docker_supervisor_image_tag_prefers_explicit_build_tags() { assert_eq!( diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index ff4542136..d820a131d 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -218,6 +218,7 @@ sandbox_namespace = "docker-dev" grpc_endpoint = "https://host.openshell.internal:17670" # Skip the image-pull-and-extract step by pointing at a locally built binary. supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" +# When supervisor_bin is omitted, Docker extracts /openshell-sandbox from this image. supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" guest_tls_ca = "/etc/openshell/certs/ca.pem" guest_tls_cert = "/etc/openshell/certs/client.pem" diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index 5dc61e6c1..fe9156f98 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -64,6 +64,9 @@ require_container_engine_lane() { } require_container_engine_lane docker Docker +CONTAINER_ENGINE_QUIET="${CONTAINER_ENGINE_QUIET:-1}" +# shellcheck source=tasks/scripts/container-engine.sh +source "${ROOT}/tasks/scripts/container-engine.sh" github_actions_host_docker_tmpdir() { if [ "${GITHUB_ACTIONS:-}" != "true" ] \ @@ -111,7 +114,6 @@ DOCKER_NETWORK_NAME="" DOCKER_NETWORK_CONNECTED_CONTAINER="" DOCKER_NETWORK_MANAGED=0 GPU_MODE="${OPENSHELL_E2E_DOCKER_GPU:-0}" -DOCKER_SUPERVISOR_ARGS=() # Isolate CLI/SDK gateway metadata from the developer's real config. export XDG_CONFIG_HOME="${WORKDIR}/config" @@ -292,25 +294,6 @@ if [ "${GPU_MODE}" = "1" ]; then fi fi -normalize_arch() { - case "$1" in - x86_64|amd64) echo "amd64" ;; - aarch64|arm64) echo "arm64" ;; - *) echo "$1" ;; - esac -} - -linux_target_triple() { - case "$1" in - amd64) echo "x86_64-unknown-linux-gnu" ;; - arm64) echo "aarch64-unknown-linux-gnu" ;; - *) - echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 - exit 2 - ;; - esac -} - resolve_docker_supervisor_image() { if [ -n "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE:-}" ]; then printf '%s\n' "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE}" @@ -333,7 +316,7 @@ resolve_docker_supervisor_image() { return 0 fi - printf '%s\n' "" + printf '%s\n' "openshell/supervisor:dev" } docker_pull_with_retry() { @@ -362,6 +345,27 @@ docker_pull_with_retry() { return 1 } +build_local_docker_supervisor_image_if_required() { + local image=$1 + + if [ "${image}" != "openshell/supervisor:dev" ]; then + return 0 + fi + + local daemon_arch + daemon_arch="$(ce_info_arch)" + + echo "Building local Docker supervisor image ${image} for linux/${daemon_arch}..." + CONTAINER_ENGINE=docker DOCKER_PLATFORM="linux/${daemon_arch}" IMAGE_TAG=dev \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor + if docker image inspect "${image}" >/dev/null 2>&1; then + return 0 + fi + + echo "ERROR: expected supervisor image '${image}' after local build." >&2 + exit 2 +} + ensure_docker_supervisor_image() { local image=$1 @@ -414,47 +418,12 @@ ensure_sandbox_image_available() { docker_pull_with_retry "${image}" } -DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" -SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" -HOST_OS="$(uname -s)" -HOST_ARCH="$(normalize_arch "$(uname -m)")" -SUPERVISOR_OUT_DIR="${WORKDIR}/supervisor/${DAEMON_ARCH}" -SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" - -CARGO_BUILD_JOBS_ARG=() -if [ -n "${CARGO_BUILD_JOBS:-}" ]; then - CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") -fi - e2e_build_gateway_binaries "${ROOT}" TARGET_DIR GATEWAY_BIN CLI_BIN SUPERVISOR_IMAGE="$(resolve_docker_supervisor_image)" -if [ -n "${SUPERVISOR_IMAGE}" ]; then - ensure_docker_supervisor_image "${SUPERVISOR_IMAGE}" - echo "Using Docker supervisor image: ${SUPERVISOR_IMAGE}" - DOCKER_SUPERVISOR_ARGS=(--docker-supervisor-image "${SUPERVISOR_IMAGE}") -else - echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." - mkdir -p "${SUPERVISOR_OUT_DIR}" - if [ "${HOST_OS}" = "Linux" ] && [ "${HOST_ARCH}" = "${DAEMON_ARCH}" ]; then - rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true - cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ - --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" - cp "${TARGET_DIR}/${SUPERVISOR_TARGET}/release/openshell-sandbox" "${SUPERVISOR_BIN}" - else - CONTAINER_ENGINE=docker \ - DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ - DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ - bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output - fi - - if [ ! -f "${SUPERVISOR_BIN}" ]; then - echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 - exit 1 - fi - chmod +x "${SUPERVISOR_BIN}" - DOCKER_SUPERVISOR_ARGS=(--docker-supervisor-bin "${SUPERVISOR_BIN}") -fi +build_local_docker_supervisor_image_if_required "${SUPERVISOR_IMAGE}" +ensure_docker_supervisor_image "${SUPERVISOR_IMAGE}" +echo "Using Docker supervisor image: ${SUPERVISOR_IMAGE}" DEFAULT_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-${OPENSHELL_SANDBOX_IMAGE:-${DEFAULT_SANDBOX_IMAGE}}}" @@ -521,19 +490,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" printf 'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client/tls.crt")" printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client/tls.key")" printf 'enable_bind_mounts = true\n' - # DOCKER_SUPERVISOR_ARGS holds either ("--docker-supervisor-bin" "") - # or ("--docker-supervisor-image" ""); both map to TOML keys on - # the docker driver config. - for ((i=0; i<${#DOCKER_SUPERVISOR_ARGS[@]}; i+=2)); do - case "${DOCKER_SUPERVISOR_ARGS[$i]}" in - --docker-supervisor-bin) - printf 'supervisor_bin = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" - ;; - --docker-supervisor-image) - printf 'supervisor_image = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" - ;; - esac - done + printf 'supervisor_image = %s\n' "$(toml_string "${SUPERVISOR_IMAGE}")" if [ -n "${GATEWAY_HOST_ALIAS_IP}" ]; then printf 'host_gateway_ip = %s\n' "$(toml_string "${GATEWAY_HOST_ALIAS_IP}")" fi