Skip to content

Commit 62619ee

Browse files
authored
fix(docker): use supervisor image entrypoint path (#1259)
Signed-off-by: Drew Newberry <anewberry@nvidia.com>
1 parent 084c93b commit 62619ee

12 files changed

Lines changed: 57 additions & 24 deletions

File tree

.agents/skills/debug-openshell-cluster/SKILL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Use gateway metadata, deployment values, or the user's setup notes to identify t
6363
docker info
6464
docker ps --filter name=openshell
6565
docker logs <container> --tail=200
66+
docker run --rm --entrypoint /openshell-sandbox "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE:-ghcr.io/nvidia/openshell/supervisor:latest}" --version
6667
openshell status
6768
```
6869

@@ -71,6 +72,7 @@ Common findings:
7172
- Docker daemon unavailable: start Docker Desktop or Docker Engine.
7273
- Gateway process stopped: inspect exit status and logs.
7374
- Sandbox image missing or pull denied: verify image reference and registry credentials.
75+
- Docker driver cannot initialize because it cannot find `openshell-sandbox`: verify that `OPENSHELL_DOCKER_SUPERVISOR_BIN` points to a valid binary, that a sibling `openshell-sandbox` binary exists next to `openshell-gateway`, or that the configured supervisor image contains `/openshell-sandbox`.
7476
- Sandbox never registers: check gateway logs and supervisor callback endpoint.
7577

7678
For source checkout development, restart the local gateway with:

.github/workflows/docker-build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ jobs:
155155
component: ${{ needs.resolve.outputs.binary_component }}
156156
arch: ${{ matrix.arch }}
157157
cargo-version: ${{ inputs['cargo-version'] }}
158+
image-tag: ${{ needs.resolve.outputs.image_tag_base }}
158159
checkout-ref: ${{ inputs['checkout-ref'] }}
159160
features: openshell-core/dev-settings
160161
artifact-name: ${{ needs.resolve.outputs.artifact_prefix }}-linux-${{ matrix.arch }}
@@ -238,7 +239,6 @@ jobs:
238239
--cache-to "type=gha,mode=max,scope=${{ inputs.component }}-${{ matrix.arch }}"
239240
240241
- name: Smoke check ${{ inputs.component }} image
241-
if: ${{ !inputs.push }}
242242
run: |
243243
set -euo pipefail
244244
image="${IMAGE_REGISTRY}/${{ inputs.component }}:${IMAGE_TAG}"
@@ -249,7 +249,7 @@ jobs:
249249
grep -q '^openshell-gateway ' <<<"$output"
250250
;;
251251
supervisor)
252-
output="$(docker run --rm --platform "${{ matrix.platform }}" "$image" --version)"
252+
output="$(docker run --rm --platform "${{ matrix.platform }}" --entrypoint /openshell-sandbox "$image" --version)"
253253
echo "$output"
254254
grep -q '^openshell-sandbox ' <<<"$output"
255255
;;

.github/workflows/release-dev.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,8 @@ jobs:
432432
sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ needs.compute-versions.outputs.cargo_version }}"'"/}' Cargo.toml
433433
434434
- name: Build ${{ matrix.target }}
435+
env:
436+
OPENSHELL_IMAGE_TAG: ${{ github.sha }}
435437
run: |
436438
set -euo pipefail
437439
mise x -- cargo build --release --target ${{ matrix.target }} -p openshell-server

.github/workflows/release-tag.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,8 @@ jobs:
466466
sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ needs.compute-versions.outputs.cargo_version }}"'"/}' Cargo.toml
467467
468468
- name: Build ${{ matrix.target }}
469+
env:
470+
OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.source_sha }}
469471
run: |
470472
set -euo pipefail
471473
mise x -- cargo build --release --target ${{ matrix.target }} -p openshell-server

.github/workflows/shadow-rust-native-build.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ on:
4242
required: false
4343
type: string
4444
default: ""
45+
image-tag:
46+
description: "Supervisor image tag to bake into gateway binaries"
47+
required: false
48+
type: string
49+
default: ""
4550
workflow_dispatch:
4651
inputs:
4752
component:
@@ -85,6 +90,11 @@ on:
8590
required: false
8691
type: string
8792
default: ""
93+
image-tag:
94+
description: "Supervisor image tag to bake into gateway binaries"
95+
required: false
96+
type: string
97+
default: ""
8898

8999
permissions:
90100
contents: read
@@ -207,6 +217,7 @@ jobs:
207217
# Preserve the release-codegen setting used by the old Dockerfile
208218
# Rust build path so image artifacts keep the same release profile.
209219
CARGO_PROFILE_RELEASE_CODEGEN_UNITS: "1"
220+
OPENSHELL_IMAGE_TAG: ${{ inputs['image-tag'] }}
210221
run: |
211222
set -euo pipefail
212223
args=(

architecture/build.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ OpenShell builds these main artifacts:
1212
|---|---|
1313
| Gateway binary | `crates/openshell-server` |
1414
| CLI package and Python SDK | `python/openshell` plus Rust binaries where packaged |
15-
| Gateway container image | `deploy/docker/Dockerfile.images` |
15+
| Gateway and supervisor container images | `deploy/docker/Dockerfile.images` |
1616
| Helm chart | `deploy/helm/openshell` |
1717
| VM driver/runtime assets | `crates/openshell-driver-vm` |
1818
| Published docs site | `docs/` rendered by Fern config in `fern/` |
@@ -25,6 +25,8 @@ The Docker image pipeline stages prebuilt Rust binaries, then builds container
2525
images from `deploy/docker/Dockerfile.images`. CI builds native artifacts on the
2626
target architecture, stages them under `deploy/docker/.build/`, and then uses
2727
Buildx to publish per-architecture images and multi-architecture tags.
28+
Gateway image builds bake the corresponding supervisor image tag into the
29+
gateway binary so Docker sandboxes do not depend on `:latest` by default.
2830

2931
Local image work should use `mise` tasks rather than direct Docker commands so
3032
the same staging and tagging assumptions are used locally and in CI.

architecture/compute-runtimes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ The supervisor must be available inside each sandbox workload:
3838

3939
| Runtime | Delivery model |
4040
|---|---|
41-
| Docker | Bind-mounted or extracted supervisor binary configured by the gateway. |
41+
| Docker | Bind-mounted local supervisor binary, or a binary extracted from the configured supervisor image. |
4242
| Podman | Read-only OCI image volume containing the supervisor binary. |
4343
| Kubernetes | Sandbox pod image or pod template configuration. |
4444
| VM | Embedded in the guest rootfs bundle. |

crates/openshell-driver-docker/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,21 @@ contract:
3434

3535
The agent child process does not retain these supervisor privileges.
3636

37+
## Supervisor Binary Resolution
38+
39+
The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into
40+
each sandbox container. Resolution order is:
41+
42+
1. `--docker-supervisor-bin` / `OPENSHELL_DOCKER_SUPERVISOR_BIN`.
43+
2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary.
44+
3. A local Linux cargo target build for the Docker daemon architecture.
45+
4. `--docker-supervisor-image` / `OPENSHELL_DOCKER_SUPERVISOR_IMAGE`, or the
46+
release-matched default supervisor image, extracting `/openshell-sandbox`.
47+
48+
Release and Docker-image gateway builds bake the matching supervisor image tag
49+
into the binary at compile time. The default Docker supervisor image is not
50+
`:latest` unless a custom build explicitly sets that tag.
51+
3752
## Callback and TLS
3853

3954
`OPENSHELL_ENDPOINT` is injected from the gateway's configured gRPC endpoint

crates/openshell-driver-docker/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1759,7 +1759,7 @@ async fn extract_supervisor_binary_bytes(docker: &Docker, image: &str) -> CoreRe
17591759
),
17601760
ContainerCreateBody {
17611761
image: Some(image.to_string()),
1762-
entrypoint: Some(vec!["/openshell-sandbox".to_string()]),
1762+
entrypoint: Some(vec![SUPERVISOR_IMAGE_BINARY_PATH.to_string()]),
17631763
cmd: Some(Vec::new()),
17641764
..Default::default()
17651765
},

crates/openshell-driver-kubernetes/src/driver.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -688,19 +688,19 @@ fn supervisor_volume_mount() -> serde_json::Value {
688688

689689
/// Path of the supervisor binary inside the supervisor image.
690690
///
691-
/// The supervisor image places the binary at the filesystem root and ships
692-
/// nothing else. We invoke it directly — there is no shell, `cp`, or PATH
693-
/// resolution available inside the image.
691+
/// The supervisor image places the binary at the filesystem root. We invoke
692+
/// it directly so the init path does not depend on shell utilities or PATH
693+
/// resolution inside the image.
694694
const SUPERVISOR_IMAGE_BINARY_PATH: &str = "/openshell-sandbox";
695695

696696
/// Build the init container that copies the supervisor binary into the emptyDir.
697697
///
698-
/// The supervisor image contains only the supervisor binary at
699-
/// `/openshell-sandbox`. We invoke that binary with the `copy-self`
700-
/// subcommand so it copies itself into the shared emptyDir volume, where the
701-
/// agent container then executes it from a fixed, writable path. This pattern
702-
/// (binary self-copy) avoids requiring `sh`/`cp` in the supervisor image and
703-
/// mirrors the approach used by argoexec's emissary executor.
698+
/// The supervisor image contains the supervisor binary at `/openshell-sandbox`.
699+
/// We invoke that binary with the `copy-self` subcommand so it copies itself
700+
/// into the shared emptyDir volume, where the agent container then executes it
701+
/// from a fixed, writable path. This pattern (binary self-copy) avoids requiring
702+
/// `sh`/`cp` in the supervisor image and mirrors the approach used by argoexec's
703+
/// emissary executor.
704704
fn supervisor_init_container(
705705
supervisor_image: &str,
706706
supervisor_image_pull_policy: &str,
@@ -1559,8 +1559,8 @@ mod tests {
15591559
assert_eq!(init_containers[0]["image"], "supervisor-image:latest");
15601560
assert_eq!(init_containers[0]["imagePullPolicy"], "IfNotPresent");
15611561

1562-
// The supervisor image ships only the binary (no shell). The init
1563-
// container must invoke the binary directly with `copy-self <DEST>`.
1562+
// The init container must invoke the binary directly with
1563+
// `copy-self <DEST>` rather than depending on shell utilities.
15641564
let init_command = init_containers[0]["command"]
15651565
.as_array()
15661566
.expect("init container command should be set");
@@ -1573,7 +1573,7 @@ mod tests {
15731573
);
15741574
assert!(
15751575
!init_command.iter().any(|v| v == "sh"),
1576-
"init container must not depend on a shell (supervisor image ships only the binary)"
1576+
"init container must not depend on a shell"
15771577
);
15781578

15791579
// Agent container command should be overridden to the emptyDir path

0 commit comments

Comments
 (0)