Skip to content

Commit 5c98604

Browse files
authored
feat(gpu): honor device IDs in Docker and Podman (#1253)
* feat(gpu): honor device IDs in Docker and Podman Signed-off-by: Evan Lezar <elezar@nvidia.com> * test(gpu): add Docker and Podman device selection e2e Signed-off-by: Evan Lezar <elezar@nvidia.com> * ci(gpu): run Docker GPU e2e workflow Signed-off-by: Evan Lezar <elezar@nvidia.com> --------- Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent dfd4768 commit 5c98604

17 files changed

Lines changed: 607 additions & 75 deletions

File tree

.github/workflows/e2e-gpu-test.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ permissions:
1414

1515
jobs:
1616
e2e-gpu:
17-
name: "E2E GPU (${{ matrix.name }})"
17+
name: "E2E Docker GPU (${{ matrix.name }})"
1818
runs-on: ${{ matrix.runner }}
1919
continue-on-error: ${{ matrix.experimental }}
2020
timeout-minutes: 30
@@ -55,8 +55,12 @@ jobs:
5555
- name: Log in to GHCR
5656
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
5757

58-
- name: Install Python dependencies and generate protobuf stubs
59-
run: uv sync --frozen && mise run --no-deps python:proto
58+
- name: Check Docker GPU prerequisites
59+
run: |
60+
docker info --format '{{json .CDISpecDirs}}'
61+
GPU_PROBE_IMAGE="$(awk '$1 == "FROM" && $3 == "AS" && $4 == "gateway" { print $2; exit }' deploy/docker/Dockerfile.images)"
62+
test -n "${GPU_PROBE_IMAGE}"
63+
docker run --rm --device nvidia.com/gpu=all "${GPU_PROBE_IMAGE}" nvidia-smi -L
6064
6165
- name: Run tests
62-
run: mise run --no-deps --skip-deps e2e:python:gpu
66+
run: mise run --no-deps --skip-deps e2e:docker:gpu

TESTING.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,14 @@ Suites:
151151
- Docker suite (`--features e2e-docker`) - common suite plus Docker-only coverage such as Dockerfile image builds, Docker preflight checks, and managed Docker gateway resume.
152152
- Docker GPU suite (`--features e2e-docker-gpu`) - Docker suite plus GPU sandbox smoke coverage.
153153

154+
GPU device-selection tests compare OpenShell sandboxes against a plain Docker or
155+
Podman container that requests `--device nvidia.com/gpu=all`. The probe image
156+
defaults to the image used by the `gateway` stage in
157+
`deploy/docker/Dockerfile.images`; set `OPENSHELL_E2E_GPU_PROBE_IMAGE` to
158+
override it. Per-device checks run only for NVIDIA CDI device IDs reported by
159+
the runtime's discovered devices list, so WSL2 hosts that expose only
160+
`nvidia.com/gpu=all` skip the index-based cases.
161+
154162
Run the Docker-backed Rust CLI e2e suite:
155163

156164
```shell

crates/openshell-cli/src/main.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,8 +1061,9 @@ enum SandboxCommands {
10611061
#[arg(long)]
10621062
gpu: bool,
10631063

1064-
/// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1").
1065-
/// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned.
1064+
/// Target a driver-specific GPU device. Docker and Podman use CDI device IDs
1065+
/// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index.
1066+
/// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection.
10661067
#[arg(long, requires = "gpu")]
10671068
gpu_device: Option<String>,
10681069

crates/openshell-core/src/gpu.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Shared GPU request helpers.
5+
6+
use crate::config::CDI_GPU_DEVICE_ALL;
7+
8+
/// Resolve the existing GPU request fields into CDI device identifiers.
9+
///
10+
/// `None` means no GPU was requested. A GPU request with no explicit device
11+
/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes
12+
/// through unchanged.
13+
#[must_use]
14+
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
15+
gpu.then(|| {
16+
if gpu_device.is_empty() {
17+
vec![CDI_GPU_DEVICE_ALL.to_string()]
18+
} else {
19+
vec![gpu_device.to_string()]
20+
}
21+
})
22+
}
23+
24+
#[cfg(test)]
mod tests {
    use super::*;

    /// No GPU request resolves to no CDI devices at all.
    #[test]
    fn cdi_gpu_device_ids_returns_none_when_absent() {
        let resolved = cdi_gpu_device_ids(false, "");

        assert_eq!(resolved, None);
    }

    /// A GPU request without a device string resolves to the all-GPU CDI ID.
    #[test]
    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
        let expected = vec![CDI_GPU_DEVICE_ALL.to_string()];

        assert_eq!(cdi_gpu_device_ids(true, ""), Some(expected));
    }

    /// An explicit device string is returned as the only resolved ID.
    #[test]
    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
        let expected = vec!["nvidia.com/gpu=0".to_string()];

        assert_eq!(cdi_gpu_device_ids(true, "nvidia.com/gpu=0"), Some(expected));
    }
}

crates/openshell-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
pub mod config;
1313
pub mod error;
1414
pub mod forward;
15+
pub mod gpu;
1516
pub mod image;
1617
pub mod inference;
1718
pub mod metadata;

crates/openshell-driver-docker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ contract:
3030
| `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
3131
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
3232
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
33-
| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
33+
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
3434

3535
The agent child process does not retain these supervisor privileges.
3636

crates/openshell-driver-docker/src/lib.rs

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ use bollard::query_parameters::{
1818
};
1919
use bytes::Bytes;
2020
use futures::{Stream, StreamExt};
21-
use openshell_core::config::{
22-
CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS,
23-
};
21+
use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS};
22+
use openshell_core::gpu::cdi_gpu_device_ids;
2423
use openshell_core::proto::compute::v1::{
2524
CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
2625
DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate,
@@ -311,11 +310,7 @@ impl DockerComputeDriver {
311310
"docker sandboxes require a template image",
312311
));
313312
}
314-
if spec.gpu && !config.supports_gpu {
315-
return Err(Status::failed_precondition(
316-
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
317-
));
318-
}
313+
Self::validate_gpu_request(spec.gpu, config.supports_gpu)?;
319314
if !template.agent_socket_path.trim().is_empty() {
320315
return Err(Status::failed_precondition(
321316
"docker compute driver does not support template.agent_socket_path",
@@ -335,6 +330,15 @@ impl DockerComputeDriver {
335330
Ok(())
336331
}
337332

333+
fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> {
334+
if gpu && !supports_gpu {
335+
return Err(Status::failed_precondition(
336+
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
337+
));
338+
}
339+
Ok(())
340+
}
341+
338342
async fn get_sandbox_snapshot(
339343
&self,
340344
sandbox_id: &str,
@@ -941,11 +945,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
941945
.collect()
942946
}
943947

944-
fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
945-
gpu.then(|| {
948+
fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
949+
cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
946950
vec![DeviceRequest {
947951
driver: Some("cdi".to_string()),
948-
device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
952+
device_ids: Some(device_ids),
949953
..Default::default()
950954
}]
951955
})
@@ -992,7 +996,7 @@ fn build_container_create_body(
992996
host_config: Some(HostConfig {
993997
nano_cpus: resource_limits.nano_cpus,
994998
memory: resource_limits.memory_bytes,
995-
device_requests: docker_gpu_device_requests(spec.gpu),
999+
device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
9961000
binds: Some(build_binds(config)),
9971001
restart_policy: Some(RestartPolicy {
9981002
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),

crates/openshell-driver-docker/src/tests.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
use super::*;
5-
use openshell_core::config::DEFAULT_SERVER_PORT;
5+
use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
66
use openshell_core::proto::compute::v1::{
77
DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
88
};
@@ -507,6 +507,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
507507
);
508508
}
509509

510+
/// An explicit `gpu_device` on the sandbox spec must reach the Docker create
/// body as the CDI device request's only device ID.
#[test]
fn build_container_create_body_passes_explicit_cdi_device_id_through() {
    let mut sandbox = test_sandbox();
    {
        let spec = sandbox.spec.as_mut().unwrap();
        spec.gpu = true;
        spec.gpu_device = "nvidia.com/gpu=0".to_string();
    }
    let mut config = runtime_config();
    config.supports_gpu = true;

    let create_body = build_container_create_body(&sandbox, &config).unwrap();
    let host_config = create_body.host_config.as_ref();
    let device_requests = host_config.and_then(|host| host.device_requests.as_ref());
    let request = device_requests
        .and_then(|requests| requests.first())
        .expect("GPU request should add a Docker device request");

    let expected_ids = vec!["nvidia.com/gpu=0".to_string()];
    assert_eq!(request.driver.as_deref(), Some("cdi"));
    assert_eq!(request.device_ids.as_ref().unwrap(), &expected_ids);
}
533+
510534
#[test]
511535
fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() {
512536
// Regression test: `delete_sandbox` (and the other identifier-keyed

crates/openshell-driver-podman/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields:
4646
| `no_new_privileges` | `true` | Prevents privilege escalation after exec. |
4747
| `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. |
4848
| `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. |
49+
| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. |
4950

5051
The restricted agent child does not retain these supervisor privileges.
5152

crates/openshell-driver-podman/src/container.rs

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
//! Container spec construction for the Podman driver.
55
66
use crate::config::PodmanComputeConfig;
7-
use openshell_core::config::CDI_GPU_DEVICE_ALL;
7+
use openshell_core::gpu::cdi_gpu_device_ids;
88
use openshell_core::proto::compute::v1::DriverSandbox;
99
use serde::Serialize;
1010
use serde_json::Value;
@@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {
345345

346346
/// Build CDI GPU device list if GPU is requested.
347347
fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
348-
if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
349-
Some(vec![LinuxDevice {
350-
path: CDI_GPU_DEVICE_ALL.into(),
351-
}])
352-
} else {
353-
None
354-
}
348+
let spec = sandbox.spec.as_ref()?;
349+
cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
350+
device_ids
351+
.into_iter()
352+
.map(|path| LinuxDevice { path })
353+
.collect()
354+
})
355355
}
356356

357357
/// Build the Podman container creation JSON spec.
@@ -687,6 +687,53 @@ mod tests {
687687
assert_eq!(short_id("short"), "short");
688688
}
689689

690+
/// The unmodified test sandbox must produce a spec with no `devices` key.
#[test]
fn container_spec_omits_devices_without_gpu_request() {
    let config = test_config();
    let sandbox = test_sandbox("test-id", "test-name");

    let rendered = build_container_spec(&sandbox, &config);

    assert!(rendered.get("devices").is_none());
}
698+
699+
/// A GPU request with no `gpu_device` must render the all-GPU CDI ID as the
/// first device path.
#[test]
fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
    use openshell_core::config::CDI_GPU_DEVICE_ALL;
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    let gpu_spec = DriverSandboxSpec {
        gpu: true,
        ..Default::default()
    };
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(gpu_spec);
    let config = test_config();

    let rendered = build_container_spec(&sandbox, &config);
    let first_device_path = rendered["devices"][0]["path"].as_str();

    assert_eq!(first_device_path, Some(CDI_GPU_DEVICE_ALL));
}
717+
718+
/// An explicit `gpu_device` must be rendered unchanged as the first device path.
#[test]
fn container_spec_passes_explicit_cdi_device_id_through() {
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    let gpu_spec = DriverSandboxSpec {
        gpu: true,
        gpu_device: "nvidia.com/gpu=0".to_string(),
        ..Default::default()
    };
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(gpu_spec);
    let config = test_config();

    let rendered = build_container_spec(&sandbox, &config);
    let first_device_path = rendered["devices"][0]["path"].as_str();

    assert_eq!(first_device_path, Some("nvidia.com/gpu=0"));
}
736+
690737
#[test]
691738
fn container_spec_includes_required_capabilities() {
692739
let sandbox = test_sandbox("test-id", "test-name");

0 commit comments

Comments
 (0)