Skip to content

Commit 4cedfec

Browse files
committed
fix(gpu): prefer single CDI devices for local runtimes
Prefer a single CDI-qualified device when Docker or Podman resolves the default GPU request to one GPU. Allow nvidia.com/gpu=all only as a WSL2 all-only compatibility fallback, using Docker daemon info and Podman's /dev/dxg probe to identify that case. Update driver docs, architecture notes, and GPU e2e coverage for the default selection behavior.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 69764d8 commit 4cedfec

10 files changed

Lines changed: 945 additions & 85 deletions

File tree

architecture/compute-runtimes.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ template resource limits. Docker and Podman apply them as runtime limits.
4040
Kubernetes mirrors each limit into the matching request. VM accepts the fields
4141
but currently ignores them.
4242

43+
GPU requests enter the driver layer through `SandboxSpec.gpu` and
44+
`SandboxSpec.gpu_device`. Docker and Podman map default GPU requests to one
45+
concrete NVIDIA CDI device when individual CDI devices are available, use
46+
`nvidia.com/gpu=all` only for WSL2/all-only compatibility, and pass explicit
47+
driver-native device IDs through.
48+
4349
VM runtime state paths are derived only from driver-validated sandbox IDs
4450
matching `[A-Za-z0-9._-]{1,128}`. The gateway-owned VM driver socket uses a
4551
private `run/` directory plus Unix peer UID/PID checks. Standalone

crates/openshell-core/src/gpu.rs

Lines changed: 296 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,183 @@
33

44
//! Shared GPU request helpers.
55
6+
use std::fmt;
7+
use std::sync::atomic::{AtomicUsize, Ordering};
8+
69
use crate::config::CDI_GPU_DEVICE_ALL;
710

8-
/// Resolve the existing GPU request fields into CDI device identifiers.
9-
///
10-
/// `None` means no GPU was requested. A GPU request with no explicit device
11-
/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes
12-
/// through unchanged.
13-
#[must_use]
14-
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
15-
gpu.then(|| {
16-
if gpu_device.is_empty() {
17-
vec![CDI_GPU_DEVICE_ALL.to_string()]
11+
const CDI_NVIDIA_GPU_PREFIX: &str = "nvidia.com/gpu=";
12+
const CDI_NVIDIA_GPU_ALL_SUFFIX: &str = "all";
13+
14+
/// Normalized CDI GPU inventory used by local container drivers.
15+
#[derive(Debug, Clone, Default, PartialEq, Eq)]
16+
pub struct CdiGpuInventory {
17+
device_ids: Vec<String>,
18+
}
19+
20+
impl CdiGpuInventory {
21+
/// Build a normalized inventory from runtime-reported CDI device IDs.
22+
#[must_use]
23+
pub fn new(device_ids: impl IntoIterator<Item = impl AsRef<str>>) -> Self {
24+
let mut device_ids = device_ids
25+
.into_iter()
26+
.filter_map(|id| {
27+
let id = id.as_ref().trim();
28+
id.starts_with(CDI_NVIDIA_GPU_PREFIX)
29+
.then(|| id.to_string())
30+
})
31+
.collect::<Vec<_>>();
32+
device_ids.sort();
33+
device_ids.dedup();
34+
Self { device_ids }
35+
}
36+
37+
#[must_use]
38+
pub fn as_slice(&self) -> &[String] {
39+
&self.device_ids
40+
}
41+
42+
#[must_use]
43+
pub fn is_empty(&self) -> bool {
44+
self.device_ids.is_empty()
45+
}
46+
47+
fn default_device_family(
48+
&self,
49+
allow_all_devices: bool,
50+
) -> Result<Vec<String>, CdiGpuSelectionError> {
51+
let mut indexed = self
52+
.device_ids
53+
.iter()
54+
.filter_map(|id| {
55+
let suffix = cdi_nvidia_gpu_suffix(id)?;
56+
let index = suffix.parse::<u64>().ok()?;
57+
Some((index, id.clone()))
58+
})
59+
.collect::<Vec<_>>();
60+
if !indexed.is_empty() {
61+
indexed.sort_by(|left, right| left.0.cmp(&right.0).then_with(|| left.1.cmp(&right.1)));
62+
return Ok(indexed.into_iter().map(|(_, id)| id).collect());
63+
}
64+
65+
let mut named = self
66+
.device_ids
67+
.iter()
68+
.filter_map(|id| {
69+
let suffix = cdi_nvidia_gpu_suffix(id)?;
70+
(suffix != CDI_NVIDIA_GPU_ALL_SUFFIX).then(|| id.clone())
71+
})
72+
.collect::<Vec<_>>();
73+
if !named.is_empty() {
74+
named.sort();
75+
return Ok(named);
76+
}
77+
78+
if self.device_ids.iter().any(|id| id == CDI_GPU_DEVICE_ALL) {
79+
if !allow_all_devices {
80+
return Err(CdiGpuSelectionError::AllDevicesDefaultUnsupported);
81+
}
82+
return Ok(vec![CDI_GPU_DEVICE_ALL.to_string()]);
83+
}
84+
85+
Err(CdiGpuSelectionError::NoAvailableDevices)
86+
}
87+
}
88+
89+
/// Concurrency-safe round-robin cursor for default CDI GPU selection.
90+
#[derive(Debug, Default)]
91+
pub struct CdiGpuRoundRobin {
92+
next: AtomicUsize,
93+
}
94+
95+
impl CdiGpuRoundRobin {
96+
#[must_use]
97+
pub const fn new() -> Self {
98+
Self {
99+
next: AtomicUsize::new(0),
100+
}
101+
}
102+
103+
/// Return the next default device ID and advance the cursor.
104+
pub fn next_default_device_id(
105+
&self,
106+
inventory: &CdiGpuInventory,
107+
allow_all_devices: bool,
108+
) -> Result<String, CdiGpuSelectionError> {
109+
self.selected_default_device_id(inventory, true, allow_all_devices)
110+
}
111+
112+
/// Return the current default device ID without advancing the cursor.
113+
pub fn peek_default_device_id(
114+
&self,
115+
inventory: &CdiGpuInventory,
116+
allow_all_devices: bool,
117+
) -> Result<String, CdiGpuSelectionError> {
118+
self.selected_default_device_id(inventory, false, allow_all_devices)
119+
}
120+
121+
fn selected_default_device_id(
122+
&self,
123+
inventory: &CdiGpuInventory,
124+
consume: bool,
125+
allow_all_devices: bool,
126+
) -> Result<String, CdiGpuSelectionError> {
127+
let devices = inventory.default_device_family(allow_all_devices)?;
128+
let base = if consume {
129+
self.next.fetch_add(1, Ordering::Relaxed)
18130
} else {
19-
vec![gpu_device.to_string()]
131+
self.next.load(Ordering::Relaxed)
132+
};
133+
Ok(devices[base % devices.len()].clone())
134+
}
135+
}
136+
137+
/// CDI GPU selection failed.
138+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
139+
pub enum CdiGpuSelectionError {
140+
NoAvailableDevices,
141+
MissingDefaultDevice,
142+
AllDevicesDefaultUnsupported,
143+
}
144+
145+
impl fmt::Display for CdiGpuSelectionError {
146+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
147+
match self {
148+
Self::NoAvailableDevices => f.write_str("no NVIDIA CDI GPU devices were discovered"),
149+
Self::MissingDefaultDevice => {
150+
f.write_str("GPU request requires a selected default CDI GPU device")
151+
}
152+
Self::AllDevicesDefaultUnsupported => f.write_str(
153+
"default GPU request resolved only to nvidia.com/gpu=all, which is not allowed on this platform; pass --gpu-device nvidia.com/gpu=all explicitly to request all GPUs",
154+
),
20155
}
21-
})
156+
}
157+
}
158+
159+
impl std::error::Error for CdiGpuSelectionError {}
160+
161+
/// Resolve the existing GPU request fields into CDI device identifiers.
162+
///
163+
/// `None` means no GPU was requested. A GPU request with an explicit device ID
164+
/// passes through unchanged. A default GPU request uses the driver-selected
165+
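/// default CDI ID and fails with [`CdiGpuSelectionError::MissingDefaultDevice`]
/// when no default device was selected.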
166+
pub fn cdi_gpu_device_ids(
167+
gpu: bool,
168+
gpu_device: &str,
169+
selected_default_device: Option<&str>,
170+
) -> Result<Option<Vec<String>>, CdiGpuSelectionError> {
171+
if !gpu {
172+
return Ok(None);
173+
}
174+
if !gpu_device.is_empty() {
175+
return Ok(Some(vec![gpu_device.to_string()]));
176+
}
177+
let device = selected_default_device.ok_or(CdiGpuSelectionError::MissingDefaultDevice)?;
178+
Ok(Some(vec![device.to_string()]))
179+
}
180+
181+
fn cdi_nvidia_gpu_suffix(id: &str) -> Option<&str> {
182+
id.strip_prefix(CDI_NVIDIA_GPU_PREFIX)
22183
}
23184

24185
#[cfg(test)]
@@ -27,22 +188,139 @@ mod tests {
27188

28189
#[test]
29190
fn cdi_gpu_device_ids_returns_none_when_absent() {
30-
assert_eq!(cdi_gpu_device_ids(false, ""), None);
191+
assert_eq!(cdi_gpu_device_ids(false, "", None), Ok(None));
31192
}
32193

33194
#[test]
34-
fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
195+
fn cdi_gpu_device_ids_uses_selected_default_device() {
35196
assert_eq!(
36-
cdi_gpu_device_ids(true, ""),
37-
Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
197+
cdi_gpu_device_ids(true, "", Some("nvidia.com/gpu=0")),
198+
Ok(Some(vec!["nvidia.com/gpu=0".to_string()]))
199+
);
200+
}
201+
202+
#[test]
203+
fn cdi_gpu_device_ids_rejects_missing_default_device() {
204+
assert_eq!(
205+
cdi_gpu_device_ids(true, "", None),
206+
Err(CdiGpuSelectionError::MissingDefaultDevice)
38207
);
39208
}
40209

41210
#[test]
42211
fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
43212
assert_eq!(
44-
cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
45-
Some(vec!["nvidia.com/gpu=0".to_string()])
213+
cdi_gpu_device_ids(true, "nvidia.com/gpu=0", None),
214+
Ok(Some(vec!["nvidia.com/gpu=0".to_string()]))
215+
);
216+
}
217+
218+
#[test]
219+
fn inventory_filters_and_deduplicates_nvidia_gpu_ids() {
220+
let inventory = CdiGpuInventory::new([
221+
"nvidia.com/gpu=1",
222+
"vendor.example/device=0",
223+
"nvidia.com/gpu=1",
224+
" nvidia.com/gpu=0 ",
225+
]);
226+
227+
assert_eq!(
228+
inventory.as_slice(),
229+
&vec![
230+
"nvidia.com/gpu=0".to_string(),
231+
"nvidia.com/gpu=1".to_string()
232+
]
233+
);
234+
}
235+
236+
#[test]
237+
fn round_robin_prefers_indexed_family_and_sorts_numerically() {
238+
let inventory = CdiGpuInventory::new([
239+
"nvidia.com/gpu=10",
240+
"nvidia.com/gpu=UUID-b",
241+
"nvidia.com/gpu=2",
242+
"nvidia.com/gpu=all",
243+
]);
244+
let selector = CdiGpuRoundRobin::new();
245+
246+
assert_eq!(
247+
selector.next_default_device_id(&inventory, false),
248+
Ok("nvidia.com/gpu=2".to_string())
249+
);
250+
assert_eq!(
251+
selector.next_default_device_id(&inventory, false),
252+
Ok("nvidia.com/gpu=10".to_string())
253+
);
254+
assert_eq!(
255+
selector.next_default_device_id(&inventory, false),
256+
Ok("nvidia.com/gpu=2".to_string())
257+
);
258+
}
259+
260+
#[test]
261+
fn round_robin_uses_named_family_when_no_indexed_ids_exist() {
262+
let inventory = CdiGpuInventory::new(["nvidia.com/gpu=UUID-b", "nvidia.com/gpu=UUID-a"]);
263+
let selector = CdiGpuRoundRobin::new();
264+
265+
assert_eq!(
266+
selector.next_default_device_id(&inventory, false),
267+
Ok("nvidia.com/gpu=UUID-a".to_string())
268+
);
269+
}
270+
271+
#[test]
272+
fn round_robin_uses_all_only_inventory_when_allowed() {
273+
let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]);
274+
let selector = CdiGpuRoundRobin::new();
275+
276+
assert_eq!(
277+
selector.next_default_device_id(&inventory, true),
278+
Ok(CDI_GPU_DEVICE_ALL.to_string())
279+
);
280+
}
281+
282+
#[test]
283+
fn round_robin_rejects_all_only_inventory_when_not_allowed() {
284+
let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]);
285+
let selector = CdiGpuRoundRobin::new();
286+
287+
assert_eq!(
288+
selector.next_default_device_id(&inventory, false),
289+
Err(CdiGpuSelectionError::AllDevicesDefaultUnsupported)
290+
);
291+
}
292+
293+
#[test]
294+
fn round_robin_rejects_empty_inventory() {
295+
let inventory = CdiGpuInventory::new(["vendor.example/device=0"]);
296+
let selector = CdiGpuRoundRobin::new();
297+
298+
assert_eq!(
299+
selector.next_default_device_id(&inventory, false),
300+
Err(CdiGpuSelectionError::NoAvailableDevices)
301+
);
302+
}
303+
304+
#[test]
305+
fn peek_does_not_advance_round_robin_cursor() {
306+
let inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]);
307+
let selector = CdiGpuRoundRobin::new();
308+
309+
assert_eq!(
310+
selector.peek_default_device_id(&inventory, false),
311+
Ok("nvidia.com/gpu=0".to_string())
312+
);
313+
assert_eq!(
314+
selector.peek_default_device_id(&inventory, false),
315+
Ok("nvidia.com/gpu=0".to_string())
316+
);
317+
assert_eq!(
318+
selector.next_default_device_id(&inventory, false),
319+
Ok("nvidia.com/gpu=0".to_string())
320+
);
321+
assert_eq!(
322+
selector.next_default_device_id(&inventory, false),
323+
Ok("nvidia.com/gpu=1".to_string())
46324
);
47325
}
48326
}

crates/openshell-driver-docker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ contract:
3232
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
3333
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
3434
| `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. |
35-
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
35+
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise selects one concrete NVIDIA CDI GPU when the sandbox spec asks for GPU support and daemon CDI support is detected. `nvidia.com/gpu=all` is selected by default only as a WSL2 all-only compatibility fallback, which is identified from Docker daemon `/info`. |
3636

3737
The agent child process does not retain these supervisor privileges.
3838

0 commit comments

Comments
 (0)