From 9506f162427e16dd2c262815f429aac2e2c71e9d Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 8 Jun 2026 18:22:20 +0200 Subject: [PATCH 1/2] feat(gpu)!: add resource requirements BREAKING CHANGE: SandboxSpec.gpu and DriverSandboxSpec.gpu were replaced with resource_requirements.gpu. This intentionally reuses protobuf field 9, changing it from a bool to a message for both public and driver APIs. Signed-off-by: Evan Lezar --- architecture/compute-runtimes.md | 4 +- crates/openshell-cli/src/main.rs | 37 +++++++- crates/openshell-cli/src/run.rs | 51 ++++++---- .../sandbox_create_lifecycle_integration.rs | 93 +++++++++++++++---- crates/openshell-core/src/gpu.rs | 60 +++++++++--- crates/openshell-driver-docker/README.md | 2 +- crates/openshell-driver-docker/src/lib.rs | 28 +++--- crates/openshell-driver-docker/src/tests.rs | 26 ++++-- crates/openshell-driver-kubernetes/README.md | 11 ++- .../openshell-driver-kubernetes/src/driver.rs | 88 +++++++++++++----- crates/openshell-driver-podman/README.md | 2 +- .../openshell-driver-podman/src/container.rs | 22 +++-- crates/openshell-driver-podman/src/driver.rs | 31 ++++++- crates/openshell-driver-vm/src/driver.rs | 44 ++++++--- crates/openshell-server/src/compute/mod.rs | 62 +++++++++---- crates/openshell-server/src/grpc/sandbox.rs | 4 +- .../openshell-server/src/grpc/validation.rs | 4 +- docs/reference/sandbox-compute-drivers.mdx | 2 +- proto/compute_driver.proto | 13 ++- proto/openshell.proto | 13 ++- 20 files changed, 444 insertions(+), 153 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 092f2c049..24e0d78ff 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -98,7 +98,9 @@ users. Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to include the gateway. GPU-capable images must include the user-space libraries required by the workload. The -runtime still owns GPU device injection. +runtime still owns GPU device injection. GPU requests are explicit, and can be +refined with a driver-native device identifier; the gateway validates the +request shape and each runtime enforces the GPU allocation modes it supports. ## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index a21431e81..9d0044886 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -19,6 +19,7 @@ use openshell_bootstrap::{ use openshell_cli::completers; use openshell_cli::run; use openshell_cli::tls::TlsOptions; +use openshell_core::proto::GpuResourceRequirements; /// Resolved gateway context: name + gateway endpoint. struct GatewayContext { @@ -2625,6 +2626,7 @@ async fn main() -> Result<()> { .map(|s| openshell_core::forward::ForwardSpec::parse(&s)) .transpose()?; let keep = keep || !no_keep || editor.is_some() || forward.is_some(); + let gpu_requirements = gpu.then_some(GpuResourceRequirements {}); let ctx = resolve_gateway(&cli.gateway, &cli.gateway_endpoint)?; let endpoint = &ctx.endpoint; @@ -2637,7 +2639,7 @@ async fn main() -> Result<()> { &ctx.name, &upload_specs, keep, - gpu, + gpu_requirements, cpu.as_deref(), memory.as_deref(), driver_config_json.as_deref(), @@ -4495,6 +4497,39 @@ mod tests { } } + #[test] + fn sandbox_create_gpu_parses_driver_default() { + let cli = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu"]) + .expect("sandbox create --gpu should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { gpu, .. }), + .. + }) => { + assert!(gpu); + } + other => panic!("expected SandboxCommands::Create, got: {other:?}"), + } + } + + #[test] + fn sandbox_create_gpu_driver_default_allows_trailing_command() { + let cli = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu", "--", "claude"]) + .expect("sandbox create --gpu -- claude should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { gpu, command, .. }), + .. + }) => { + assert!(gpu); + assert_eq!(command, vec!["claude".to_string()]); + } + other => panic!("expected SandboxCommands::Create, got: {other:?}"), + } + } + #[test] fn service_expose_accepts_positional_target_port_and_service() { let cli = Cli::try_parse_from([ diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 6f5520755..418b200ec 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -42,12 +42,13 @@ use openshell_core::proto::{ GetClusterInferenceRequest, GetDraftHistoryRequest, GetDraftPolicyRequest, GetGatewayConfigRequest, GetProviderProfileRequest, GetProviderRefreshStatusRequest, GetProviderRequest, GetSandboxConfigRequest, GetSandboxLogsRequest, - GetSandboxPolicyStatusRequest, GetSandboxRequest, GetServiceRequest, HealthRequest, - ImportProviderProfilesRequest, LintProviderProfilesRequest, ListProviderProfilesRequest, - ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxProvidersRequest, - ListSandboxesRequest, ListServicesRequest, PlatformEvent, PolicySource, PolicyStatus, Provider, - ProviderCredentialRefreshStatus, ProviderCredentialRefreshStrategy, ProviderProfile, - ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, + GetSandboxPolicyStatusRequest, GetSandboxRequest, GetServiceRequest, GpuResourceRequirements, + HealthRequest, ImportProviderProfilesRequest, LintProviderProfilesRequest, + ListProviderProfilesRequest, ListProvidersRequest, ListSandboxPoliciesRequest, + ListSandboxProvidersRequest, ListSandboxesRequest, ListServicesRequest, PlatformEvent, + PolicySource, PolicyStatus, Provider, ProviderCredentialRefreshStatus, + ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileDiagnostic, + ProviderProfileImportItem, RejectDraftChunkRequest, ResourceRequirements, RevokeSshSessionRequest, RotateProviderCredentialRequest, Sandbox, SandboxPhase, SandboxPolicy, SandboxSpec, SandboxTemplate, ServiceEndpointResponse, SetClusterInferenceRequest, SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit, TcpRelayTarget, @@ -123,7 +124,7 @@ fn ready_false_condition_message( fn provisioning_timeout_message( timeout_secs: u64, - requested_gpu: bool, + resource_requirements: Option<&ResourceRequirements>, condition_message: Option<&str>, ) -> String { let mut message = format!("sandbox provisioning timed out after {timeout_secs}s"); @@ -133,7 +134,7 @@ fn provisioning_timeout_message( message.push_str(condition_message); } - if requested_gpu { + if resource_requirements.is_some_and(|requirements| requirements.gpu.is_some()) { message.push_str( ". Hint: this may be because the available GPU is already in use by another sandbox.", ); @@ -1753,7 +1754,7 @@ pub async fn sandbox_create( gateway_name: &str, uploads: &[(String, Option, bool)], keep: bool, - gpu: bool, + gpu_requirements: Option, cpu: Option<&str>, memory: Option<&str>, driver_config_json: Option<&str>, @@ -1809,8 +1810,6 @@ pub async fn sandbox_create( } None => None, }; - let requested_gpu = gpu; - let providers_v2_enabled = gateway_providers_v2_enabled(&mut client).await?; let inferred_types: Vec = if providers_v2_enabled { Vec::new() @@ -1842,9 +1841,11 @@ pub async fn sandbox_create( None }; + let resource_requirements = gpu_requirements.map(|gpu| ResourceRequirements { gpu: Some(gpu) }); + let request = CreateSandboxRequest { spec: Some(SandboxSpec { - gpu: requested_gpu, + resource_requirements, environment: environment.clone(), policy, providers: configured_providers, @@ -1989,7 +1990,7 @@ pub async fn sandbox_create( if remaining.is_zero() { let timeout_message = provisioning_timeout_message( provision_timeout.as_secs(), - requested_gpu, + resource_requirements.as_ref(), last_condition_message.as_deref(), ); if let Some(d) = display.as_mut() { @@ -2008,7 +2009,7 @@ pub async fn sandbox_create( // Timeout fired — the stream was idle for too long. let timeout_message = provisioning_timeout_message( provision_timeout.as_secs(), - requested_gpu, + resource_requirements.as_ref(), last_condition_message.as_deref(), ); if let Some(d) = display.as_mut() { @@ -7686,9 +7687,10 @@ mod tests { PROGRESS_STEP_STARTING_SANDBOX, }; use openshell_core::proto::{ - Provider, ProviderCredentialRefresh, ProviderCredentialRefreshStatus, - ProviderCredentialRefreshStrategy, ProviderCredentialTokenGrant, ProviderProfile, - ProviderProfileCredential, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, + GpuResourceRequirements, Provider, ProviderCredentialRefresh, + ProviderCredentialRefreshStatus, ProviderCredentialRefreshStrategy, + ProviderCredentialTokenGrant, ProviderProfile, ProviderProfileCredential, + ResourceRequirements, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, }; struct EnvVarGuard { @@ -8392,9 +8394,12 @@ mod tests { #[test] fn provisioning_timeout_message_includes_condition_and_gpu_hint() { + let resource_requirements = ResourceRequirements { + gpu: Some(GpuResourceRequirements {}), + }; let message = provisioning_timeout_message( 120, - true, + Some(&resource_requirements), Some("DependenciesNotReady: Pod exists with phase: Pending; Service Exists"), ); @@ -8405,7 +8410,15 @@ mod tests { #[test] fn provisioning_timeout_message_omits_gpu_hint_for_non_gpu_requests() { - let message = provisioning_timeout_message(120, false, None); + let message = provisioning_timeout_message(120, None, None); + + assert_eq!(message, "sandbox provisioning timed out after 120s"); + } + + #[test] + fn provisioning_timeout_message_omits_gpu_hint_without_gpu_requirements() { + let resource_requirements = ResourceRequirements { gpu: None }; + let message = provisioning_timeout_message(120, Some(&resource_requirements), None); assert_eq!(message, "sandbox provisioning timed out after 120s"); } diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index ef89cdf1e..3565c3b2c 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -18,13 +18,14 @@ use openshell_core::proto::{ ExecSandboxInput, ExecSandboxRequest, GatewayMessage, GetGatewayConfigRequest, GetGatewayConfigResponse, GetProviderRequest, GetSandboxConfigRequest, GetSandboxConfigResponse, GetSandboxProviderEnvironmentRequest, - GetSandboxProviderEnvironmentResponse, GetSandboxRequest, HealthRequest, HealthResponse, - ListProvidersRequest, ListProvidersResponse, ListSandboxProvidersRequest, - ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, PlatformEvent, - ProviderResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, Sandbox, SandboxCondition, - SandboxLogLine, SandboxPhase, SandboxResponse, SandboxStatus, SandboxStreamEvent, - ServiceStatus, SettingValue, SupervisorMessage, UpdateProviderRequest, WatchSandboxRequest, - sandbox_stream_event, setting_value, + GetSandboxProviderEnvironmentResponse, GetSandboxRequest, GpuResourceRequirements, + HealthRequest, HealthResponse, ListProvidersRequest, ListProvidersResponse, + ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, + ListSandboxesResponse, PlatformEvent, ProviderResponse, RevokeSshSessionRequest, + RevokeSshSessionResponse, Sandbox, SandboxCondition, SandboxLogLine, SandboxPhase, + SandboxResponse, SandboxStatus, SandboxStreamEvent, ServiceStatus, SettingValue, + SupervisorMessage, UpdateProviderRequest, WatchSandboxRequest, sandbox_stream_event, + setting_value, }; use std::collections::HashMap; use std::fs; @@ -1087,6 +1088,10 @@ fn test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } +fn gpu_requirements() -> GpuResourceRequirements { + GpuResourceRequirements {} +} + #[tokio::test] async fn sandbox_create_keeps_command_sessions_by_default() { let server = run_server().await; @@ -1103,7 +1108,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { "openshell", &[], true, - false, + None, None, None, None, @@ -1146,7 +1151,7 @@ async fn sandbox_create_sends_cpu_and_memory_limits_only() { "openshell", &[], true, - false, + None, Some("500m"), Some("2Gi"), None, @@ -1223,7 +1228,7 @@ async fn sandbox_create_sends_driver_config_json() { "openshell", &[], true, - false, + None, None, None, Some(r#"{"kubernetes":{"pod":{"priority_class_name":"batch-low"}}}"#), @@ -1280,6 +1285,56 @@ async fn sandbox_create_sends_driver_config_json() { ); } +#[tokio::test] +async fn sandbox_create_sends_gpu_default_request() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("gpu-default"), + None, + "openshell", + &[], + true, + Some(gpu_requirements()), + None, + None, + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &HashMap::new(), + "manual", + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = create_requests(&server).await; + let gpu = requests[0] + .spec + .as_ref() + .and_then(|spec| spec.resource_requirements.as_ref()) + .and_then(|requirements| requirements.gpu.as_ref()) + .expect("GPU requirement should be sent"); + + assert!(requests[0] + .spec + .as_ref() + .and_then(|spec| spec.resource_requirements.as_ref()) + .is_some_and(|requirements| requirements.gpu.is_some())); +} + #[tokio::test] async fn sandbox_create_does_not_infer_command_providers_when_v2_enabled() { let server = run_server().await; @@ -1297,7 +1352,7 @@ async fn sandbox_create_does_not_infer_command_providers_when_v2_enabled() { "openshell", &[], true, - false, + None, None, None, None, @@ -1355,7 +1410,7 @@ async fn sandbox_create_returns_vm_error_without_waiting_for_timeout() { "openshell", &[], true, - false, + None, None, None, None, @@ -1409,7 +1464,7 @@ async fn sandbox_create_keeps_waiting_while_vm_progress_arrives() { "openshell", &[], true, - false, + None, None, None, None, @@ -1455,7 +1510,7 @@ async fn sandbox_create_times_out_when_only_logs_arrive() { "openshell", &[], true, - false, + None, None, None, None, @@ -1497,7 +1552,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { "openshell", &[], false, - false, + None, None, None, None, @@ -1543,7 +1598,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { "openshell", &[], false, - false, + None, None, None, None, @@ -1589,7 +1644,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { "openshell", &[], true, - false, + None, None, None, None, @@ -1635,7 +1690,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { "openshell", &[], false, - false, + None, None, None, None, @@ -1787,7 +1842,7 @@ async fn sandbox_create_sends_environment_variables() { "openshell", &[], true, - false, + None, None, None, None, diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs index 3e5b38d7a..fe6744a28 100644 --- a/crates/openshell-core/src/gpu.rs +++ b/crates/openshell-core/src/gpu.rs @@ -1,12 +1,39 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Shared GPU request helpers. +//! Shared GPU resource requirement helpers. use std::fmt; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::config::CDI_GPU_DEVICE_ALL; +use crate::proto::ResourceRequirements as SandboxResourceRequirements; +use crate::proto::compute::v1::{ + GpuResourceRequirements as DriverGpuResourceRequirements, + ResourceRequirements as DriverResourceRequirements, +}; + +/// Return whether sandbox resource requirements request a GPU. +#[must_use] +pub fn sandbox_gpu_requested(resources: Option<&SandboxResourceRequirements>) -> bool { + resources + .and_then(|resources| resources.gpu.as_ref()) + .is_some() +} + +/// Return whether compute-driver resource requirements request a GPU. +#[must_use] +pub fn driver_gpu_requested(resources: Option<&DriverResourceRequirements>) -> bool { + driver_gpu_requirements(resources).is_some() +} + +/// Return the requested compute-driver GPU requirements, if present. +#[must_use] +pub fn driver_gpu_requirements( + resources: Option<&DriverResourceRequirements>, +) -> Option<&DriverGpuResourceRequirements> { + resources.and_then(|resources| resources.gpu.as_ref()) +} const CDI_NVIDIA_GPU_PREFIX: &str = "nvidia.com/gpu="; const CDI_NVIDIA_GPU_ALL_SUFFIX: &str = "all"; @@ -158,17 +185,17 @@ impl fmt::Display for CdiGpuSelectionError { impl std::error::Error for CdiGpuSelectionError {} -/// Resolve a local runtime GPU request into CDI device identifiers. +/// Resolve a compute-driver GPU request into CDI device identifiers. /// /// `None` means no GPU was requested. Explicit driver-configured CDI devices /// pass through unchanged. A default GPU request uses the driver-selected /// default CDI ID. pub fn cdi_gpu_device_ids( - gpu: bool, + gpu: Option<&DriverGpuResourceRequirements>, cdi_devices: &[String], selected_default_device: Option<&str>, ) -> Result>, CdiGpuSelectionError> { - if !gpu { + if gpu.is_none() { return Ok(None); } if !cdi_devices.is_empty() { @@ -180,8 +207,11 @@ pub fn cdi_gpu_device_ids( /// Resolve a GPU request with the legacy all-GPU default. #[must_use] -pub fn cdi_gpu_device_ids_or_all(gpu: bool, cdi_devices: &[String]) -> Option> { - gpu.then(|| { +pub fn cdi_gpu_device_ids_or_all( + gpu: Option<&DriverGpuResourceRequirements>, + cdi_devices: &[String], +) -> Option> { + gpu.map(|_| { if cdi_devices.is_empty() { vec![CDI_GPU_DEVICE_ALL.to_string()] } else { @@ -200,30 +230,36 @@ mod tests { #[test] fn cdi_gpu_device_ids_returns_none_when_absent() { - assert_eq!(cdi_gpu_device_ids(false, &[], None), Ok(None)); + assert_eq!(cdi_gpu_device_ids(None, &[], None), Ok(None)); } #[test] fn cdi_gpu_device_ids_uses_selected_default_device() { + let gpu = DriverGpuResourceRequirements {}; + assert_eq!( - cdi_gpu_device_ids(true, &[], Some("nvidia.com/gpu=0")), + cdi_gpu_device_ids(Some(&gpu), &[], Some("nvidia.com/gpu=0")), Ok(Some(vec!["nvidia.com/gpu=0".to_string()])) ); } #[test] fn cdi_gpu_device_ids_rejects_missing_default_device() { + let gpu = DriverGpuResourceRequirements {}; + assert_eq!( - cdi_gpu_device_ids(true, &[], None), + cdi_gpu_device_ids(Some(&gpu), &[], None), Err(CdiGpuSelectionError::MissingDefaultDevice) ); } #[test] fn cdi_gpu_device_ids_passes_explicit_device_ids_through() { + let gpu = DriverGpuResourceRequirements {}; + assert_eq!( cdi_gpu_device_ids( - true, + Some(&gpu), &[ "nvidia.com/gpu=0".to_string(), "nvidia.com/gpu=1".to_string() @@ -239,8 +275,10 @@ mod tests { #[test] fn cdi_gpu_device_ids_or_all_uses_all_when_no_devices_are_configured() { + let gpu = DriverGpuResourceRequirements {}; + assert_eq!( - cdi_gpu_device_ids_or_all(true, &[]), + cdi_gpu_device_ids_or_all(Some(&gpu), &[]), Some(vec![CDI_GPU_DEVICE_ALL.to_string()]) ); } diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 71159fe66..468d6d8cd 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -32,7 +32,7 @@ contract: | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. | | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. | | `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. | -| CDI GPU request | Uses `driver_config.cdi_devices` when set; otherwise selects one concrete NVIDIA CDI GPU when the sandbox spec asks for GPU support and daemon CDI support is detected. Docker daemon `/info` can permit `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. | +| CDI GPU request | Uses `driver_config.cdi_devices` when set; otherwise selects one concrete NVIDIA CDI GPU when the sandbox resource requirements ask for GPU support and daemon CDI support is detected. Docker daemon `/info` can permit `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. Counted GPU requests are rejected. | The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 963e7a0f7..ae3da7798 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -29,6 +29,7 @@ use openshell_core::driver_utils::{ }; use openshell_core::gpu::{ CdiGpuInventory, CdiGpuRoundRobin, CdiGpuSelectionError, cdi_gpu_device_ids, + driver_gpu_requirements, }; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, @@ -38,11 +39,11 @@ use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverPlatformEvent, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, - GetSandboxResponse, ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, - StopSandboxResponse, ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, - WatchSandboxesDeletedEvent, WatchSandboxesEvent, WatchSandboxesPlatformEvent, - WatchSandboxesRequest, WatchSandboxesSandboxEvent, compute_driver_server::ComputeDriver, - watch_sandboxes_event, + GetSandboxResponse, GpuResourceRequirements, ListSandboxesRequest, ListSandboxesResponse, + StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, + ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, + WatchSandboxesPlatformEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, + compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::proto_struct::{ deserialize_optional_non_empty_string_list, struct_to_json_value, @@ -471,7 +472,8 @@ impl DockerComputeDriver { let driver_config = DockerSandboxDriverConfig::from_template(template).map_err(Status::invalid_argument)?; - Self::validate_gpu_request(spec.gpu, config.supports_gpu, &driver_config)?; + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + Self::validate_gpu_request(gpu_requirements, config.supports_gpu, &driver_config)?; Ok(()) } @@ -519,17 +521,17 @@ impl DockerComputeDriver { } fn validate_gpu_request( - gpu: bool, + gpu_requirements: Option<&GpuResourceRequirements>, supports_gpu: bool, driver_config: &DockerSandboxDriverConfig, ) -> Result<(), Status> { - if !gpu && driver_config.cdi_devices.is_some() { + if gpu_requirements.is_none() && driver_config.cdi_devices.is_some() { return Err(Status::invalid_argument( "driver_config.cdi_devices requires gpu=true", )); } - if gpu && !supports_gpu { + if gpu_requirements.is_some() && !supports_gpu { return Err(Status::failed_precondition( "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", )); @@ -592,7 +594,8 @@ impl DockerComputeDriver { }; let driver_config = DockerSandboxDriverConfig::from_sandbox(sandbox).map_err(Status::invalid_argument)?; - if !spec.gpu || driver_config.cdi_devices.is_some() { + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + if gpu_requirements.is_none() || driver_config.cdi_devices.is_some() { return Ok(None); } @@ -2219,13 +2222,14 @@ fn build_device_requests( .map_err(Status::invalid_argument)? .cdi_devices .unwrap_or_default(); - if !spec.gpu && !cdi_devices.is_empty() { + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + if gpu_requirements.is_none() && !cdi_devices.is_empty() { return Err(Status::invalid_argument( "driver_config.cdi_devices requires gpu=true", )); } - cdi_gpu_device_ids(spec.gpu, &cdi_devices, selected_default_device) + cdi_gpu_device_ids(gpu_requirements, &cdi_devices, selected_default_device) .map(|device_ids| { device_ids.map(|device_ids| { vec![DeviceRequest { diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index d5132fe1a..a48b82f74 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -13,7 +13,8 @@ use openshell_core::progress::{ PROGRESS_STEP_STARTING_SANDBOX, }; use openshell_core::proto::compute::v1::{ - DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, + DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, GpuResourceRequirements, + ResourceRequirements, }; use std::fs; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; @@ -41,7 +42,7 @@ fn test_sandbox() -> DriverSandbox { environment: HashMap::from([("TEMPLATE_ENV".to_string(), "template".to_string())]), ..Default::default() }), - gpu: false, + resource_requirements: None, sandbox_token: String::new(), }), status: None, @@ -79,6 +80,12 @@ fn list_string_driver_config(field: &str, values: &[&str]) -> prost_types::Struc } } +fn gpu_resources() -> ResourceRequirements { + ResourceRequirements { + gpu: Some(GpuResourceRequirements {}), + } +} + fn runtime_config() -> DockerDriverRuntimeConfig { DockerDriverRuntimeConfig { default_image: "image:latest".to_string(), @@ -1045,7 +1052,7 @@ fn build_container_create_body_clears_inherited_cmd() { fn validate_sandbox_rejects_gpu_when_cdi_unavailable() { let config = runtime_config(); let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().gpu = true; + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources()); let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); @@ -1058,7 +1065,7 @@ fn validate_sandbox_rejects_invalid_cdi_devices_before_gpu_capability() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.gpu = true; + spec.resource_requirements = Some(gpu_resources()); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[])); let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); @@ -1073,7 +1080,7 @@ fn validate_sandbox_rejects_unknown_driver_config_fields() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.gpu = true; + spec.resource_requirements = Some(gpu_resources()); spec.template.as_mut().unwrap().driver_config = Some(cdi_device_typo_config(&["nvidia.com/gpu=0"])); @@ -1083,12 +1090,11 @@ fn validate_sandbox_rejects_unknown_driver_config_fields() { assert!(err.message().contains("unknown field")); } -#[test] fn validate_sandbox_rejects_template_errors_before_device_config() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.gpu = true; + spec.resource_requirements = Some(gpu_resources()); let template = spec.template.as_mut().unwrap(); template.agent_socket_path = "/tmp/agent.sock".to_string(); template.driver_config = Some(cdi_devices_config(&[])); @@ -1126,7 +1132,7 @@ fn build_container_create_body_maps_default_gpu_to_selected_cdi_device() { let mut config = runtime_config(); config.supports_gpu = true; let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().gpu = true; + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources()); let create_body = build_container_create_body_with_default(&sandbox, &config, Some("nvidia.com/gpu=1")) @@ -1168,7 +1174,7 @@ fn build_container_create_body_passes_explicit_cdi_device_id_through() { config.supports_gpu = true; let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.gpu = true; + spec.resource_requirements = Some(gpu_resources()); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); let create_body = build_container_create_body(&sandbox, &config).unwrap(); @@ -1207,7 +1213,7 @@ fn build_container_create_body_rejects_cdi_devices_without_gpu() { fn build_container_create_body_rejects_empty_cdi_devices() { let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.gpu = true; + spec.resource_requirements = Some(gpu_resources()); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[])); let err = build_container_create_body(&sandbox, &runtime_config()).unwrap_err(); diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 6ad0b27c8..3cdb9fa57 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -62,9 +62,9 @@ the supervisor's network namespace mount setup on AppArmor-enabled nodes. ## GPU Support When a sandbox requests GPU support, the driver checks node allocatable capacity -for `nvidia.com/gpu` and requests one GPU resource in the workload spec. The -sandbox image must provide the user-space libraries needed by the agent -workload. +for `nvidia.com/gpu` and requests the configured GPU count in the workload spec. +When no count is set, the driver requests one GPU resource. The sandbox image +must provide the user-space libraries needed by the agent workload. ## Driver Config POC @@ -97,5 +97,6 @@ POC parser renders the keys listed above and rejects unknown fields. `pod.runtime_class_name` maps to PodSpec `runtimeClassName` and overrides the driver's configured `default_runtime_class_name`; the typed public `SandboxTemplate.runtime_class_name` still takes precedence when set. Use the -public `gpu` flag for the default GPU request and `driver_config` only for -additional driver-owned resource details. +public `--gpu` flag for the default GPU request, pass a count to `--gpu` for +counted GPU requests, and use `driver_config` only for additional driver-owned +resource details. diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index dc636efc3..d634cea9a 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -17,6 +17,7 @@ use kube::{Client, Error as KubeError}; use openshell_core::driver_utils::{ LABEL_MANAGED_BY, LABEL_MANAGED_BY_VALUE, LABEL_SANDBOX_ID, SUPERVISOR_IMAGE_BINARY_PATH, }; +use openshell_core::gpu::driver_gpu_requirements; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, format_bytes, mark_progress_active, mark_progress_complete, mark_progress_detail, @@ -25,8 +26,9 @@ use openshell_core::proto::compute::v1::{ DriverCondition as SandboxCondition, DriverPlatformEvent as PlatformEvent, DriverSandbox as Sandbox, DriverSandboxSpec as SandboxSpec, DriverSandboxStatus as SandboxStatus, DriverSandboxTemplate as SandboxTemplate, - GetCapabilitiesResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, - WatchSandboxesPlatformEvent, WatchSandboxesSandboxEvent, watch_sandboxes_event, + GetCapabilitiesResponse, GpuResourceRequirements, WatchSandboxesDeletedEvent, + WatchSandboxesEvent, WatchSandboxesPlatformEvent, WatchSandboxesSandboxEvent, + watch_sandboxes_event, }; use openshell_core::proto_struct::{struct_to_json_object, value_to_json}; use serde::Deserialize; @@ -281,8 +283,12 @@ impl KubernetesComputeDriver { pub async fn validate_sandbox_create(&self, sandbox: &Sandbox) -> Result<(), tonic::Status> { let _ = KubernetesSandboxDriverConfig::from_sandbox(sandbox) .map_err(tonic::Status::invalid_argument)?; - let gpu_requested = sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); - if gpu_requested + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| driver_gpu_requirements(spec.resource_requirements.as_ref())); + validate_gpu_request(gpu_requirements)?; + if gpu_requirements.is_some() && !self.has_gpu_capacity().await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) })? @@ -376,6 +382,13 @@ impl KubernetesComputeDriver { pub async fn create_sandbox(&self, sandbox: &Sandbox) -> Result<(), KubernetesDriverError> { let _ = KubernetesSandboxDriverConfig::from_sandbox(sandbox) .map_err(KubernetesDriverError::InvalidArgument)?; + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| driver_gpu_requirements(spec.resource_requirements.as_ref())); + validate_gpu_request(gpu_requirements).map_err(|status| { + KubernetesDriverError::InvalidArgument(status.message().to_string()) + })?; let name = sandbox.name.as_str(); info!( sandbox_id = %sandbox.id, @@ -638,6 +651,12 @@ impl KubernetesComputeDriver { } } +fn validate_gpu_request( + _gpu_requirements: Option<&GpuResourceRequirements>, +) -> Result<(), tonic::Status> { + Ok(()) +} + fn sandbox_labels(sandbox: &Sandbox) -> BTreeMap { let mut labels = BTreeMap::new(); labels.insert(LABEL_SANDBOX_ID.to_string(), sandbox.id.clone()); @@ -1201,7 +1220,13 @@ fn sandbox_to_k8s_spec( if let Some(template) = spec.template.as_ref() { root.insert( "podTemplate".to_string(), - sandbox_template_to_k8s(template, spec.gpu, &pod_env, inject_workspace, params), + sandbox_template_to_k8s_with_gpu_requirements( + template, + driver_gpu_requirements(spec.resource_requirements.as_ref()), + &pod_env, + inject_workspace, + params, + ), ); if !template.agent_socket_path.is_empty() { root.insert( @@ -1231,9 +1256,9 @@ fn sandbox_to_k8s_spec( let pod_env = spec_pod_env(spec); root.insert( "podTemplate".to_string(), - sandbox_template_to_k8s( + sandbox_template_to_k8s_with_gpu_requirements( &SandboxTemplate::default(), - spec.is_some_and(|s| s.gpu), + driver_gpu_requirements(spec.and_then(|s| s.resource_requirements.as_ref())), &pod_env, inject_workspace, params, @@ -1246,12 +1271,30 @@ fn sandbox_to_k8s_spec( ) } +#[cfg(test)] fn sandbox_template_to_k8s( template: &SandboxTemplate, gpu: bool, spec_environment: &std::collections::HashMap, inject_workspace: bool, params: &SandboxPodParams<'_>, +) -> serde_json::Value { + let gpu_requirements = gpu.then_some(GpuResourceRequirements {}); + sandbox_template_to_k8s_with_gpu_requirements( + template, + gpu_requirements.as_ref(), + spec_environment, + inject_workspace, + params, + ) +} + +fn sandbox_template_to_k8s_with_gpu_requirements( + template: &SandboxTemplate, + gpu_requirements: Option<&GpuResourceRequirements>, + spec_environment: &std::collections::HashMap, + inject_workspace: bool, + params: &SandboxPodParams<'_>, ) -> serde_json::Value { let driver_config = kubernetes_driver_config(template); @@ -1331,7 +1374,7 @@ fn sandbox_template_to_k8s( if use_user_namespaces { spec.insert("hostUsers".to_string(), serde_json::json!(false)); - if gpu { + if gpu_requirements.is_some() { warn!( "GPU sandbox with user namespaces enabled — \ NVIDIA device plugin compatibility is unverified" @@ -1440,7 +1483,7 @@ fn sandbox_template_to_k8s( serde_json::Value::Array(volume_mounts), ); - if let Some(resources) = container_resources(template, gpu) { + if let Some(resources) = container_resources(template, gpu_requirements) { container.insert("resources".to_string(), resources); } apply_agent_driver_resources(&mut container, &driver_config.containers.agent.resources); @@ -1618,7 +1661,10 @@ fn app_armor_profile_to_k8s(profile: &AppArmorProfile) -> serde_json::Value { value } -fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option { +fn container_resources( + template: &SandboxTemplate, + gpu_requirements: Option<&GpuResourceRequirements>, +) -> Option { // Start from the raw resources passthrough in platform_config (preserves // custom resource types like GPU limits that users set via the public API // Struct), then overlay the typed DriverResourceRequirements on top. @@ -1651,8 +1697,8 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option Option> = @@ -2000,10 +2044,9 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, template: Some(SandboxTemplate { driver_config: Some(json_struct(serde_json::json!({ - "cdi_devices": ["nvidia.com/gpu=0"] + "gpu_device_ids": ["0000:2d:00.0"] }))), ..Default::default() }), @@ -2014,9 +2057,9 @@ mod tests { let err = KubernetesSandboxDriverConfig::from_sandbox(&sandbox).unwrap_err(); assert!(err.contains("unknown field")); + assert!(err.contains("gpu_device_ids")); } - #[test] fn kube_pulling_event_adds_image_progress_metadata() { let mut metadata = std::collections::HashMap::new(); @@ -2345,7 +2388,6 @@ mod tests { ); } - #[test] fn gpu_sandbox_uses_template_runtime_class_name_when_set() { let template = SandboxTemplate { platform_config: Some(Struct { diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index 6342c760e..1291dbc29 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -46,7 +46,7 @@ The container spec in `container.rs` sets these security-critical fields: | `no_new_privileges` | `true` | Prevents privilege escalation after exec. | | `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. | | `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. | -| CDI GPU devices | `driver_config.cdi_devices` when set, otherwise one concrete NVIDIA CDI GPU. Local `/dev/dxg` permits `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. | Exposes requested GPUs to GPU-enabled sandbox containers. | +| CDI GPU devices | `driver_config.cdi_devices` when set, otherwise one concrete NVIDIA CDI GPU. Local `/dev/dxg` permits `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. | Exposes requested GPUs to GPU-enabled sandbox containers. Counted GPU requests are rejected. | The restricted agent child does not retain these supervisor privileges. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index e1a511819..348ffe265 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -5,7 +5,7 @@ use crate::config::PodmanComputeConfig; use openshell_core::ComputeDriverError; -use openshell_core::gpu::cdi_gpu_device_ids; +use openshell_core::gpu::{cdi_gpu_device_ids, driver_gpu_requirements}; use openshell_core::proto::compute::v1::{DriverSandbox, DriverSandboxTemplate}; use openshell_core::proto_struct::deserialize_optional_non_empty_string_list; use openshell_core::{driver_mounts, proto_struct}; @@ -487,13 +487,14 @@ fn build_devices( let cdi_devices = PodmanSandboxDriverConfig::from_sandbox(sandbox)? .cdi_devices .unwrap_or_default(); - if !spec.gpu && !cdi_devices.is_empty() { + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + if gpu_requirements.is_none() && !cdi_devices.is_empty() { return Err(ComputeDriverError::InvalidArgument( "driver_config.cdi_devices requires gpu=true".to_string(), )); } - cdi_gpu_device_ids(spec.gpu, &cdi_devices, selected_default_device) + cdi_gpu_device_ids(gpu_requirements, &cdi_devices, selected_default_device) .map(|device_ids| { device_ids.map(|device_ids| { device_ids @@ -1107,6 +1108,7 @@ fn parse_memory_to_bytes(quantity: &str) -> Option { #[cfg(test)] mod tests { use super::*; + use openshell_core::proto::compute::v1::{GpuResourceRequirements, ResourceRequirements}; static ENV_LOCK: std::sync::LazyLock> = std::sync::LazyLock::new(|| std::sync::Mutex::new(())); @@ -1148,6 +1150,12 @@ mod tests { } } + fn gpu_resources() -> ResourceRequirements { + ResourceRequirements { + gpu: Some(GpuResourceRequirements {}), + } + } + #[test] fn parse_cpu_millicore() { assert_eq!(parse_cpu_to_microseconds("500m"), Some(50_000)); @@ -1260,7 +1268,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), ..Default::default() }); let config = test_config(); @@ -1302,7 +1310,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_devices_config(&["nvidia.com/gpu=0"])), ..Default::default() @@ -1343,7 +1351,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_devices_config(&[])), ..Default::default() @@ -1363,7 +1371,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_device_typo_config(&["nvidia.com/gpu=0"])), ..Default::default() diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index f6d266ee4..927c02c28 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -12,8 +12,12 @@ use crate::watcher::{ use openshell_core::ComputeDriverError; use openshell_core::config::CDI_GPU_DEVICE_ALL; use openshell_core::driver_utils::supervisor_image_should_refresh; -use openshell_core::gpu::{CdiGpuInventory, CdiGpuRoundRobin, CdiGpuSelectionError}; -use openshell_core::proto::compute::v1::{DriverSandbox, GetCapabilitiesResponse}; +use openshell_core::gpu::{ + CdiGpuInventory, CdiGpuRoundRobin, CdiGpuSelectionError, driver_gpu_requirements, +}; +use openshell_core::proto::compute::v1::{ + DriverSandbox, GetCapabilitiesResponse, GpuResourceRequirements, +}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; @@ -338,18 +342,34 @@ impl PodmanComputeDriver { &self, sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { - let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu); + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| spec.resource_requirements.as_ref()) + .and_then(|requirements| driver_gpu_requirements(Some(requirements))); let driver_config = PodmanSandboxDriverConfig::from_sandbox(sandbox)?; - if !gpu_requested && driver_config.cdi_devices.is_some() { + if gpu_requirements.is_none() && driver_config.cdi_devices.is_some() { return Err(ComputeDriverError::InvalidArgument( "driver_config.cdi_devices requires gpu=true".to_string(), )); } + Self::validate_gpu_request(gpu_requirements)?; self.validate_user_volume_mounts_available(sandbox).await?; let _ = self.peek_default_gpu_device(sandbox)?; Ok(()) } + fn validate_gpu_request( + gpu_requirements: Option<&GpuResourceRequirements>, + ) -> Result<(), ComputeDriverError> { + if gpu_requirements.is_some() && !Self::has_gpu_capacity() { + return Err(ComputeDriverError::Precondition( + "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), + )); + } + Ok(()) + } + fn peek_default_gpu_device( &self, sandbox: &DriverSandbox, @@ -373,7 +393,8 @@ impl PodmanComputeDriver { return Ok(None); }; let driver_config = PodmanSandboxDriverConfig::from_sandbox(sandbox)?; - if !spec.gpu || driver_config.cdi_devices.is_some() { + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + if gpu_requirements.is_none() || driver_config.cdi_devices.is_some() { return Ok(None); } diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 30fecd8be..f0c669b87 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -29,6 +29,7 @@ use oci_client::manifest::{ }; use oci_client::secrets::RegistryAuth; use oci_client::{Reference, RegistryOperation}; +use openshell_core::gpu::driver_gpu_requirements; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, format_bytes, mark_progress_active, mark_progress_complete, mark_progress_detail, @@ -627,7 +628,12 @@ impl VmDriver { overlay_preparation: OverlayPreparation, ) -> Result<(), Status> { self.ensure_provisioning_active(&sandbox.id).await?; - let is_gpu = sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); + let is_gpu = sandbox + .spec + .as_ref() + .and_then(|spec| spec.resource_requirements.as_ref()) + .and_then(|requirements| driver_gpu_requirements(Some(requirements))) + .is_some(); self.publish_platform_event( sandbox.id.clone(), platform_event( @@ -3079,7 +3085,7 @@ fn validate_vm_sandbox(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Statu if let Some(template) = spec.template.as_ref() { validate_vm_sandbox_template(template)?; } - validate_vm_gpu_request(sandbox, gpu_enabled)?; + validate_gpu_request(sandbox, gpu_enabled)?; Ok(()) } @@ -3100,14 +3106,16 @@ fn validate_vm_sandbox_template(template: &SandboxTemplate) -> Result<(), Status } #[allow(clippy::result_large_err)] -fn validate_vm_gpu_request(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Status> { +fn validate_gpu_request(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Status> { let spec = sandbox .spec .as_ref() .ok_or_else(|| Status::invalid_argument("sandbox spec is required"))?; + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + let _ = vm_gpu_device_id(sandbox)?; - if spec.gpu && !gpu_enabled { + if gpu_requirements.is_some() && !gpu_enabled { return Err(Status::failed_precondition( "GPU support is not enabled on this driver; start with --gpu", )); @@ -3124,7 +3132,8 @@ fn vm_gpu_device_id(sandbox: &Sandbox) -> Result, Status> { .map_err(Status::invalid_argument)? .gpu_device_ids .unwrap_or_default(); - if !spec.gpu && !gpu_device_ids.is_empty() { + let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + if gpu_requirements.is_none() && !gpu_device_ids.is_empty() { return Err(Status::invalid_argument( "driver_config.gpu_device_ids requires gpu=true", )); @@ -3135,8 +3144,8 @@ fn vm_gpu_device_id(sandbox: &Sandbox) -> Result, Status> { )); } - Ok(spec - .gpu + Ok(gpu_requirements + .is_some() .then(|| gpu_device_ids.into_iter().next().unwrap_or_default())) } @@ -5064,6 +5073,7 @@ mod tests { }; use openshell_core::proto::compute::v1::{ DriverSandboxSpec as SandboxSpec, DriverSandboxTemplate as SandboxTemplate, + GpuResourceRequirements, ResourceRequirements, }; use prost_types::{Struct, Value, value::Kind}; use std::fs; @@ -5102,6 +5112,12 @@ mod tests { } } + fn gpu_resources() -> ResourceRequirements { + ResourceRequirements { + gpu: Some(GpuResourceRequirements {}), + } + } + #[test] fn vm_pulling_layer_event_adds_progress_detail_metadata() { let mut event = platform_event( @@ -5169,7 +5185,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), ..Default::default() }), ..Default::default() @@ -5185,7 +5201,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), ..Default::default() }), ..Default::default() @@ -5193,12 +5209,10 @@ mod tests { validate_vm_sandbox(&sandbox, true).expect("gpu should be accepted when enabled"); } - #[test] fn validate_vm_sandbox_rejects_gpu_device_without_gpu() { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: false, template: Some(SandboxTemplate { driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0"])), ..Default::default() @@ -5218,7 +5232,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(SandboxTemplate { driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0", "0000:31:00.0"])), ..Default::default() @@ -5238,7 +5252,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(SandboxTemplate { driver_config: Some(gpu_device_ids_config(&[])), ..Default::default() @@ -5258,7 +5272,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(SandboxTemplate { driver_config: Some(gpu_device_id_typo_config(&["0000:2d:00.0"])), ..Default::default() @@ -5278,7 +5292,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources()), template: Some(SandboxTemplate { agent_socket_path: "/tmp/agent.sock".to_string(), driver_config: Some(gpu_device_ids_config(&[])), diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 272c3907f..3dc28943f 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -20,10 +20,11 @@ use openshell_core::ComputeDriverKind; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, DriverResourceRequirements, DriverSandbox, DriverSandboxSpec, DriverSandboxStatus, - DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, ListSandboxesRequest, - ValidateSandboxCreateRequest, WatchSandboxesEvent, WatchSandboxesRequest, - compute_driver_client::ComputeDriverClient, compute_driver_server::ComputeDriver, - watch_sandboxes_event, + DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, + GpuResourceRequirements as DriverGpuResourceRequirements, ListSandboxesRequest, + ResourceRequirements as DriverSandboxResourceRequirements, ValidateSandboxCreateRequest, + WatchSandboxesEvent, WatchSandboxesRequest, compute_driver_client::ComputeDriverClient, + compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::proto::{ PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, SandboxStatus, @@ -1500,7 +1501,14 @@ fn driver_sandbox_spec_from_public( .as_ref() .map(|template| driver_sandbox_template_from_public(template, driver_kind)) .transpose()?, - gpu: spec.gpu, + resource_requirements: spec.resource_requirements.as_ref().map(|requirements| { + DriverSandboxResourceRequirements { + gpu: requirements + .gpu + .as_ref() + .map(|_| DriverGpuResourceRequirements {}), + } + }), sandbox_token: String::new(), }) } @@ -1881,7 +1889,9 @@ fn derive_phase(status: Option<&DriverSandboxStatus>) -> SandboxPhase { } fn rewrite_user_facing_conditions(status: &mut Option, spec: Option<&SandboxSpec>) { - let gpu_requested = spec.is_some_and(|sandbox_spec| sandbox_spec.gpu); + let gpu_requested = spec + .and_then(|sandbox_spec| sandbox_spec.resource_requirements.as_ref()) + .is_some_and(|requirements| openshell_core::gpu::sandbox_gpu_requested(Some(requirements))); if !gpu_requested { return; } @@ -2078,6 +2088,26 @@ mod tests { } } + #[test] + fn driver_sandbox_spec_from_public_preserves_gpu_requirement() { + let public = SandboxSpec { + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + }), + ..Default::default() + }; + + let driver = + driver_sandbox_spec_from_public(&public, None).expect("driver spec should map"); + + let gpu = driver + .resource_requirements + .as_ref() + .and_then(|requirements| requirements.gpu.as_ref()) + .expect("driver GPU requirement should be set"); + assert_eq!(gpu, &DriverGpuResourceRequirements {}); + } + #[test] fn select_driver_config_forwards_only_matching_driver_block() { let config = prost_types::Struct { @@ -2584,7 +2614,9 @@ mod tests { rewrite_user_facing_conditions( &mut status, Some(&SandboxSpec { - gpu: true, + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + }), ..Default::default() }), ); @@ -2612,13 +2644,7 @@ mod tests { ..Default::default() }); - rewrite_user_facing_conditions( - &mut status, - Some(&SandboxSpec { - gpu: false, - ..Default::default() - }), - ); + rewrite_user_facing_conditions(&mut status, Some(&SandboxSpec::default())); assert_eq!(status.unwrap().conditions[0].message, original); } @@ -2949,7 +2975,9 @@ mod tests { let sandbox = Sandbox { spec: Some(SandboxSpec { - gpu: true, + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + }), ..Default::default() }), ..sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning) @@ -2972,7 +3000,9 @@ mod tests { SandboxPhase::try_from(stored.phase()).unwrap(), SandboxPhase::Ready ); - assert!(stored.spec.as_ref().is_some_and(|spec| spec.gpu)); + assert!(stored.spec.as_ref().is_some_and(|spec| { + openshell_core::gpu::sandbox_gpu_requested(spec.resource_requirements.as_ref()) + })); } #[tokio::test] diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index e60ce3995..b32901bb2 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -98,9 +98,11 @@ fn emit_sandbox_create_telemetry( } else { SandboxTemplateSource::Default }; + let gpu_requested = + openshell_core::gpu::sandbox_gpu_requested(spec.resource_requirements.as_ref()); openshell_core::telemetry::emit_sandbox_create( outcome, - spec.gpu, + gpu_requested, spec.providers.len() as u64, spec.policy.is_some(), template_source, diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 03a69d6e9..2ca71f7b3 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -760,7 +760,9 @@ mod tests { #[test] fn validate_sandbox_spec_accepts_gpu_flag() { let spec = SandboxSpec { - gpu: true, + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + }), ..Default::default() }; assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 95a319c37..e76a4b1ca 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -53,7 +53,7 @@ openshell sandbox create \ ``` Driver config is for fields without a stable public flag. Prefer `--cpu`, -`--memory`, and `--gpu` for portable resource intent. +`--memory`, and `--gpu` for supported resource intent. Exact GPU device selection remains driver-owned and requires `--gpu`. Docker and Podman accept `cdi_devices`; replace the top-level `docker` key with diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index dbcb9e818..bfff3d42d 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -83,8 +83,9 @@ message DriverSandboxSpec { map environment = 5; // Runtime template consumed by the driver during provisioning. DriverSandboxTemplate template = 6; - // Request NVIDIA GPU resources for this sandbox. - bool gpu = 9; + // Portable resource requirements used by the gateway for driver selection + // and by drivers for provisioning. + ResourceRequirements resource_requirements = 9; reserved 10; reserved "gpu_device"; // Gateway-minted JWT identifying this sandbox to the gateway. Set by @@ -96,6 +97,14 @@ message DriverSandboxSpec { string sandbox_token = 11; } +message ResourceRequirements { + // GPU requirements for the sandbox. Presence indicates a GPU request. + GpuResourceRequirements gpu = 1; +} + +// Driver GPU resource requirements. +message GpuResourceRequirements {} + // Driver-owned runtime template consumed by the compute platform. // // This message describes the sandbox workload in backend-neutral terms. diff --git a/proto/openshell.proto b/proto/openshell.proto index d701956d3..b7884f6e7 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -317,8 +317,9 @@ message SandboxSpec { openshell.sandbox.v1.SandboxPolicy policy = 7; // Provider names to attach to this sandbox. repeated string providers = 8; - // Request NVIDIA GPU resources for this sandbox. - bool gpu = 9; + // Portable resource requirements used by the gateway for driver selection + // and by drivers for provisioning. + ResourceRequirements resource_requirements = 9; reserved 10; reserved "gpu_device"; // Field 11 was `proposal_approval_mode`. The approval mode is now a @@ -329,6 +330,14 @@ message SandboxSpec { reserved "proposal_approval_mode"; } +message ResourceRequirements { + // GPU requirements for the sandbox. Presence indicates a GPU request. + GpuResourceRequirements gpu = 1; +} + +// Public GPU resource requirements. +message GpuResourceRequirements {} + // Public sandbox template mapped onto compute-driver template inputs. message SandboxTemplate { // Fully-qualified OCI image reference used to boot the sandbox. From 3a244beff8d7fc84689b19fec8d744893b48cacb Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Jun 2026 11:19:16 +0200 Subject: [PATCH 2/2] feat(gpu): add GPU resource count Signed-off-by: Evan Lezar --- architecture/compute-runtimes.md | 13 +- crates/openshell-cli/src/main.rs | 138 ++++- crates/openshell-cli/src/run.rs | 2 +- .../sandbox_create_lifecycle_integration.rs | 52 +- crates/openshell-core/src/gpu.rs | 538 +++++++++++++----- crates/openshell-driver-docker/README.md | 2 +- crates/openshell-driver-docker/src/lib.rs | 294 +++++----- crates/openshell-driver-docker/src/tests.rs | 222 ++++++-- .../openshell-driver-kubernetes/src/driver.rs | 66 ++- crates/openshell-driver-podman/README.md | 2 +- .../openshell-driver-podman/src/container.rs | 148 +++-- crates/openshell-driver-podman/src/driver.rs | 262 ++++++--- crates/openshell-driver-vm/src/driver.rs | 183 +++++- crates/openshell-server/src/compute/mod.rs | 10 +- .../openshell-server/src/grpc/validation.rs | 37 +- docs/reference/sandbox-compute-drivers.mdx | 27 +- docs/sandboxes/manage-sandboxes.mdx | 24 +- proto/compute_driver.proto | 6 +- proto/openshell.proto | 6 +- .../README.md | 228 ++++++-- 20 files changed, 1693 insertions(+), 567 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 24e0d78ff..f5c795c52 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -55,11 +55,9 @@ through the driver configuration. The Helm chart defaults sandbox agents to `Unconfined` so runtime/default AppArmor profiles do not block supervisor network namespace setup on AppArmor-enabled nodes. -GPU requests enter the driver layer through `SandboxSpec.gpu` and -`SandboxSpec.gpu_device`. Docker and Podman map default GPU requests to one -concrete NVIDIA CDI device when individual CDI devices are available, use -`nvidia.com/gpu=all` only for WSL2/all-only compatibility, and pass explicit -driver-native device IDs through. +Resource requirements enter the driver layer through `SandboxSpec.resource_requirements`. This includes a set of GPU requirements, where a user +can request a specific number of GPUs or the driver-specific default behaviour. +For all in-tree drivers, this is equivalent to selecting a single GPU. VM runtime state paths are derived only from driver-validated sandbox IDs matching `[A-Za-z0-9._-]{1,128}`. The gateway-owned VM driver socket uses a @@ -99,8 +97,9 @@ Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to include the gateway. GPU-capable images must include the user-space libraries required by the workload. The runtime still owns GPU device injection. GPU requests are explicit, and can be -refined with a driver-native device identifier; the gateway validates the -request shape and each runtime enforces the GPU allocation modes it supports. +refined with a driver-native device identifier or requested count; the gateway +validates the request shape and each runtime enforces the GPU allocation modes it +supports. ## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 9d0044886..ee138b430 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -29,6 +29,21 @@ struct GatewayContext { endpoint: String, } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum GpuCliRequest { + DriverDefault, + Count(u32), +} + +impl From for GpuResourceRequirements { + fn from(gpu: GpuCliRequest) -> Self { + match gpu { + GpuCliRequest::Count(count) => Self { count: Some(count) }, + GpuCliRequest::DriverDefault => Self { count: None }, + } + } +} + /// Resolve the gateway name to a [`GatewayContext`] with the gateway endpoint. /// /// Resolution priority: @@ -110,6 +125,21 @@ fn resolve_gateway( }) } +fn parse_gpu_request(value: &str) -> std::result::Result { + if value.is_empty() { + return Ok(GpuCliRequest::DriverDefault); + } + + let count = value + .parse::() + .map_err(|_| "GPU count must be a positive integer".to_string())?; + if count == 0 { + return Err("GPU count must be greater than 0".to_string()); + } + + Ok(GpuCliRequest::Count(count)) +} + fn resolve_gateway_name(gateway_flag: &Option) -> Option { gateway_flag .clone() @@ -1217,8 +1247,11 @@ enum SandboxCommands { editor: Option, /// Request GPU resources for the sandbox. - #[arg(long)] - gpu: bool, + /// + /// Omit COUNT for the driver's default GPU selection, or pass COUNT + /// to request a specific number of GPUs. + #[arg(long, num_args = 0..=1, value_name = "COUNT", default_missing_value = "", value_parser = parse_gpu_request)] + gpu: Option, /// CPU limit for the sandbox (for example: 500m, 1, 2.5). #[arg(long)] @@ -2626,7 +2659,7 @@ async fn main() -> Result<()> { .map(|s| openshell_core::forward::ForwardSpec::parse(&s)) .transpose()?; let keep = keep || !no_keep || editor.is_some() || forward.is_some(); - let gpu_requirements = gpu.then_some(GpuResourceRequirements {}); + let gpu_requirements: Option = gpu.map(Into::into); let ctx = resolve_gateway(&cli.gateway, &cli.gateway_endpoint)?; let endpoint = &ctx.endpoint; @@ -3636,6 +3669,27 @@ mod tests { }); } + #[test] + fn gpu_cli_request_option_maps_absent_gpu_to_no_requirements() { + let gpu: Option = Option::::None.map(Into::into); + + assert_eq!(gpu, None); + } + + #[test] + fn gpu_cli_request_driver_default_converts_to_requirements() { + let gpu = GpuResourceRequirements::from(GpuCliRequest::DriverDefault); + + assert_eq!(gpu.count, None); + } + + #[test] + fn gpu_cli_request_count_converts_to_requirements() { + let gpu = GpuResourceRequirements::from(GpuCliRequest::Count(2)); + + assert_eq!(gpu.count, Some(2)); + } + #[test] fn apply_auth_uses_stored_token() { let tmp = tempfile::tempdir().unwrap(); @@ -4507,7 +4561,23 @@ mod tests { command: Some(SandboxCommands::Create { gpu, .. }), .. }) => { - assert!(gpu); + assert_eq!(gpu, Some(GpuCliRequest::DriverDefault)); + } + other => panic!("expected SandboxCommands::Create, got: {other:?}"), + } + } + + #[test] + fn sandbox_create_gpu_count_parses_from_gpu_flag() { + let cli = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu", "2"]) + .expect("sandbox create --gpu 2 should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { gpu, .. }), + .. + }) => { + assert_eq!(gpu, Some(GpuCliRequest::Count(2))); } other => panic!("expected SandboxCommands::Create, got: {other:?}"), } @@ -4523,13 +4593,71 @@ mod tests { command: Some(SandboxCommands::Create { gpu, command, .. }), .. }) => { - assert!(gpu); + assert_eq!(gpu, Some(GpuCliRequest::DriverDefault)); + assert_eq!(command, vec!["claude".to_string()]); + } + other => panic!("expected SandboxCommands::Create, got: {other:?}"), + } + } + + #[test] + fn sandbox_create_gpu_count_allows_trailing_command() { + let cli = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--gpu", + "2", + "--", + "claude", + ]) + .expect("sandbox create --gpu 2 -- claude should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { gpu, command, .. }), + .. + }) => { + assert_eq!(gpu, Some(GpuCliRequest::Count(2))); assert_eq!(command, vec!["claude".to_string()]); } other => panic!("expected SandboxCommands::Create, got: {other:?}"), } } + #[test] + fn sandbox_create_gpu_count_rejects_zero() { + let result = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu", "0"]); + + assert!(result.is_err(), "sandbox create --gpu 0 should be rejected"); + } + + #[test] + fn sandbox_create_gpu_count_accepts_equals_syntax() { + let cli = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu=2"]) + .expect("sandbox create --gpu=2 should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { gpu, .. }), + .. + }) => { + assert_eq!(gpu, Some(GpuCliRequest::Count(2))); + } + other => panic!("expected SandboxCommands::Create, got: {other:?}"), + } + } + + #[test] + fn sandbox_create_gpu_count_rejects_non_integer() { + let result = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu", "many"]); + + assert!( + result.is_err(), + "sandbox create --gpu many should be rejected" + ); + } + #[test] fn service_expose_accepts_positional_target_port_and_service() { let cli = Cli::try_parse_from([ diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 418b200ec..a77e986d4 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -8395,7 +8395,7 @@ mod tests { #[test] fn provisioning_timeout_message_includes_condition_and_gpu_hint() { let resource_requirements = ResourceRequirements { - gpu: Some(GpuResourceRequirements {}), + gpu: Some(GpuResourceRequirements { count: None }), }; let message = provisioning_timeout_message( 120, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 3565c3b2c..4d01f33a9 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -1088,8 +1088,8 @@ fn test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } -fn gpu_requirements() -> GpuResourceRequirements { - GpuResourceRequirements {} +fn gpu_requirements(count: Option) -> GpuResourceRequirements { + GpuResourceRequirements { count } } #[tokio::test] @@ -1301,7 +1301,7 @@ async fn sandbox_create_sends_gpu_default_request() { "openshell", &[], true, - Some(gpu_requirements()), + Some(gpu_requirements(None)), None, None, None, @@ -1328,11 +1328,53 @@ async fn sandbox_create_sends_gpu_default_request() { .and_then(|requirements| requirements.gpu.as_ref()) .expect("GPU requirement should be sent"); - assert!(requests[0] + assert_eq!(gpu.count, None); +} + +#[tokio::test] +async fn sandbox_create_sends_gpu_count_request() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("gpu-two"), + None, + "openshell", + &[], + true, + Some(gpu_requirements(Some(2))), + None, + None, + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &HashMap::new(), + "manual", + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = create_requests(&server).await; + let gpu = requests[0] .spec .as_ref() .and_then(|spec| spec.resource_requirements.as_ref()) - .is_some_and(|requirements| requirements.gpu.is_some())); + .and_then(|requirements| requirements.gpu.as_ref()) + .expect("GPU requirement should be sent"); + + assert_eq!(gpu.count, Some(2)); } #[tokio::test] diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs index fe6744a28..f5ff67cd3 100644 --- a/crates/openshell-core/src/gpu.rs +++ b/crates/openshell-core/src/gpu.rs @@ -3,7 +3,9 @@ //! Shared GPU resource requirement helpers. +use std::collections::HashSet; use std::fmt; +use std::sync::RwLock; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::config::CDI_GPU_DEVICE_ALL; @@ -21,10 +23,31 @@ pub fn sandbox_gpu_requested(resources: Option<&SandboxResourceRequirements>) -> .is_some() } -/// Return whether compute-driver resource requirements request a GPU. +/// Return the requested sandbox GPU count, if one was specified. #[must_use] -pub fn driver_gpu_requested(resources: Option<&DriverResourceRequirements>) -> bool { - driver_gpu_requirements(resources).is_some() +pub fn sandbox_gpu_count(resources: Option<&SandboxResourceRequirements>) -> Option { + resources + .and_then(|resources| resources.gpu.as_ref()) + .and_then(|gpu| gpu.count) +} + +/// Return the effective compute-driver GPU count. +/// +/// A present GPU requirement with an omitted count requests one GPU. +/// +/// # Errors +/// Returns an error when a GPU requirement explicitly requests zero GPUs. +pub fn effective_driver_gpu_count( + gpu: Option<&DriverGpuResourceRequirements>, +) -> Result, String> { + let Some(gpu) = gpu else { + return Ok(None); + }; + let count = gpu.count.unwrap_or(1); + if count == 0 { + return Err("gpu count must be greater than 0".to_string()); + } + Ok(Some(count)) } /// Return the requested compute-driver GPU requirements, if present. @@ -113,9 +136,84 @@ impl CdiGpuInventory { } } +#[derive(Debug)] +struct CdiGpuSelectorState { + inventory: CdiGpuInventory, + allow_all_devices: bool, +} + +/// Concurrency-safe default CDI GPU selector used by local container drivers. +#[derive(Debug)] +pub struct CdiGpuDefaultSelector { + state: RwLock, + round_robin: CdiGpuRoundRobin, +} + +impl CdiGpuDefaultSelector { + /// Create a selector with an initial discovered CDI GPU inventory snapshot. + #[must_use] + pub fn new(inventory: CdiGpuInventory, allow_all_devices: bool) -> Self { + Self { + state: RwLock::new(CdiGpuSelectorState { + inventory, + allow_all_devices, + }), + round_robin: CdiGpuRoundRobin::new(), + } + } + + /// Replace the cached inventory snapshot without resetting the cursor. + pub fn refresh(&self, inventory: CdiGpuInventory, allow_all_devices: bool) { + let mut state = self + .state + .write() + .expect("CDI GPU selector state lock poisoned"); + state.inventory = inventory; + state.allow_all_devices = allow_all_devices; + } + + /// Return the cached normalized inventory snapshot. + #[must_use] + pub fn device_ids(&self) -> Vec { + self.state + .read() + .expect("CDI GPU selector state lock poisoned") + .inventory + .as_slice() + .to_vec() + } + + /// Return the current default device IDs without advancing the cursor. + pub fn peek_device_ids(&self, count: u32) -> Result, CdiGpuSelectionError> { + self.selected_device_ids(count, false) + } + + /// Return the next default device IDs and advance the cursor. + pub fn next_device_ids(&self, count: u32) -> Result, CdiGpuSelectionError> { + self.selected_device_ids(count, true) + } + + fn selected_device_ids( + &self, + count: u32, + consume: bool, + ) -> Result, CdiGpuSelectionError> { + let state = self + .state + .read() + .expect("CDI GPU selector state lock poisoned"); + self.round_robin.selected_default_device_ids( + &state.inventory, + count, + consume, + state.allow_all_devices, + ) + } +} + /// Concurrency-safe round-robin cursor for default CDI GPU selection. #[derive(Debug, Default)] -pub struct CdiGpuRoundRobin { +struct CdiGpuRoundRobin { next: AtomicUsize, } @@ -127,37 +225,34 @@ impl CdiGpuRoundRobin { } } - /// Return the next default device ID and advance the cursor. - pub fn next_default_device_id( - &self, - inventory: &CdiGpuInventory, - allow_all_devices: bool, - ) -> Result { - self.selected_default_device_id(inventory, true, allow_all_devices) - } - - /// Return the current default device ID without advancing the cursor. - pub fn peek_default_device_id( - &self, - inventory: &CdiGpuInventory, - allow_all_devices: bool, - ) -> Result { - self.selected_default_device_id(inventory, false, allow_all_devices) - } - - fn selected_default_device_id( + fn selected_default_device_ids( &self, inventory: &CdiGpuInventory, + count: u32, consume: bool, allow_all_devices: bool, - ) -> Result { + ) -> Result, CdiGpuSelectionError> { let devices = inventory.default_device_family(allow_all_devices)?; + let count = + usize::try_from(count).map_err(|_| CdiGpuSelectionError::InsufficientDevices { + requested: count, + available: u32::try_from(devices.len()).unwrap_or(u32::MAX), + })?; + let available = devices.len(); + if count > available { + return Err(CdiGpuSelectionError::InsufficientDevices { + requested: u32::try_from(count).unwrap_or(u32::MAX), + available: u32::try_from(available).unwrap_or(u32::MAX), + }); + } let base = if consume { - self.next.fetch_add(1, Ordering::Relaxed) + self.next.fetch_add(count, Ordering::Relaxed) } else { self.next.load(Ordering::Relaxed) }; - Ok(devices[base % devices.len()].clone()) + Ok((0..count) + .map(|offset| devices[(base + offset) % available].clone()) + .collect()) } } @@ -165,121 +260,108 @@ impl CdiGpuRoundRobin { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CdiGpuSelectionError { NoAvailableDevices, - MissingDefaultDevice, AllDevicesDefaultUnsupported, + InsufficientDevices { requested: u32, available: u32 }, } impl fmt::Display for CdiGpuSelectionError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::NoAvailableDevices => f.write_str("no NVIDIA CDI GPU devices were discovered"), - Self::MissingDefaultDevice => { - f.write_str("GPU request requires a selected default CDI GPU device") - } Self::AllDevicesDefaultUnsupported => f.write_str( "default GPU request resolved only to nvidia.com/gpu=all, which is not allowed on this platform; set driver_config.cdi_devices to [\"nvidia.com/gpu=all\"] explicitly to request all GPUs", ), + Self::InsufficientDevices { + requested, + available: 0, + } => write!( + f, + "GPU sandbox requested {requested} GPUs, but no selectable NVIDIA CDI GPU devices are available" + ), + Self::InsufficientDevices { + requested, + available: 1, + } => write!( + f, + "GPU sandbox requested {requested} GPUs, but only 1 selectable NVIDIA CDI GPU device is available" + ), + Self::InsufficientDevices { + requested, + available, + } => write!( + f, + "GPU sandbox requested {requested} GPUs, but only {available} selectable NVIDIA CDI GPU devices are available" + ), } } } impl std::error::Error for CdiGpuSelectionError {} -/// Resolve a compute-driver GPU request into CDI device identifiers. -/// -/// `None` means no GPU was requested. Explicit driver-configured CDI devices -/// pass through unchanged. A default GPU request uses the driver-selected -/// default CDI ID. -pub fn cdi_gpu_device_ids( - gpu: Option<&DriverGpuResourceRequirements>, - cdi_devices: &[String], - selected_default_device: Option<&str>, -) -> Result>, CdiGpuSelectionError> { - if gpu.is_none() { - return Ok(None); - } - if !cdi_devices.is_empty() { - return Ok(Some(cdi_devices.to_vec())); - } - let device = selected_default_device.ok_or(CdiGpuSelectionError::MissingDefaultDevice)?; - Ok(Some(vec![device.to_string()])) -} - -/// Resolve a GPU request with the legacy all-GPU default. -#[must_use] -pub fn cdi_gpu_device_ids_or_all( - gpu: Option<&DriverGpuResourceRequirements>, - cdi_devices: &[String], -) -> Option> { - gpu.map(|_| { - if cdi_devices.is_empty() { - vec![CDI_GPU_DEVICE_ALL.to_string()] - } else { - cdi_devices.to_vec() - } - }) -} - fn cdi_nvidia_gpu_suffix(id: &str) -> Option<&str> { id.strip_prefix(CDI_NVIDIA_GPU_PREFIX) } -#[cfg(test)] -mod tests { - use super::*; +/// Validate a compute-driver GPU request against driver-owned specific devices. +/// +/// Drivers call this when a sandbox request combines portable GPU requirements +/// with exact device identifiers in `driver_config`. +/// +/// # Errors +/// Returns an error when the sandbox GPU request is absent, when `gpu.count` +/// is zero, when device IDs are duplicated, or when the effective GPU count +/// does not equal the number of specific devices. +pub fn validate_specific_gpu_device_request( + gpu: Option<&DriverGpuResourceRequirements>, + specific_devices: &[String], + driver_config_field: &str, +) -> Result<(), String> { + let device_count = specific_devices.len(); + if device_count == 0 { + return Ok(()); + } - #[test] - fn cdi_gpu_device_ids_returns_none_when_absent() { - assert_eq!(cdi_gpu_device_ids(None, &[], None), Ok(None)); + let mut seen = HashSet::with_capacity(device_count); + for device in specific_devices { + if !seen.insert(device.as_str()) { + return Err(format!( + "{driver_config_field} contains duplicate device ID '{device}'" + )); + } } - #[test] - fn cdi_gpu_device_ids_uses_selected_default_device() { - let gpu = DriverGpuResourceRequirements {}; + let Some(count) = effective_driver_gpu_count(gpu)? else { + return Err(format!("{driver_config_field} requires a gpu request")); + }; - assert_eq!( - cdi_gpu_device_ids(Some(&gpu), &[], Some("nvidia.com/gpu=0")), - Ok(Some(vec!["nvidia.com/gpu=0".to_string()])) - ); + if usize::try_from(count).ok() != Some(device_count) { + return Err(format!( + "gpu count ({count}) must match {driver_config_field} length ({device_count})" + )); } - #[test] - fn cdi_gpu_device_ids_rejects_missing_default_device() { - let gpu = DriverGpuResourceRequirements {}; + Ok(()) +} - assert_eq!( - cdi_gpu_device_ids(Some(&gpu), &[], None), - Err(CdiGpuSelectionError::MissingDefaultDevice) - ); - } +#[cfg(test)] +mod tests { + use super::*; #[test] - fn cdi_gpu_device_ids_passes_explicit_device_ids_through() { - let gpu = DriverGpuResourceRequirements {}; + fn effective_driver_gpu_count_normalizes_missing_count() { + let gpu = DriverGpuResourceRequirements { count: None }; - assert_eq!( - cdi_gpu_device_ids( - Some(&gpu), - &[ - "nvidia.com/gpu=0".to_string(), - "nvidia.com/gpu=1".to_string() - ], - None - ), - Ok(Some(vec![ - "nvidia.com/gpu=0".to_string(), - "nvidia.com/gpu=1".to_string() - ])) - ); + assert_eq!(effective_driver_gpu_count(Some(&gpu)), Ok(Some(1))); + assert_eq!(effective_driver_gpu_count(None), Ok(None)); } #[test] - fn cdi_gpu_device_ids_or_all_uses_all_when_no_devices_are_configured() { - let gpu = DriverGpuResourceRequirements {}; + fn effective_driver_gpu_count_rejects_zero_count() { + let gpu = DriverGpuResourceRequirements { count: Some(0) }; assert_eq!( - cdi_gpu_device_ids_or_all(Some(&gpu), &[]), - Some(vec![CDI_GPU_DEVICE_ALL.to_string()]) + effective_driver_gpu_count(Some(&gpu)), + Err("gpu count must be greater than 0".to_string()) ); } @@ -302,69 +384,69 @@ mod tests { } #[test] - fn round_robin_prefers_indexed_family_and_sorts_numerically() { + fn selector_prefers_indexed_family_and_sorts_numerically() { let inventory = CdiGpuInventory::new([ "nvidia.com/gpu=10", "nvidia.com/gpu=UUID-b", "nvidia.com/gpu=2", "nvidia.com/gpu=all", ]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, false); assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=2".to_string()) + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=2".to_string()]) ); assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=10".to_string()) + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=10".to_string()]) ); assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=2".to_string()) + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=2".to_string()]) ); } #[test] - fn round_robin_uses_named_family_when_no_indexed_ids_exist() { + fn selector_uses_named_family_when_no_indexed_ids_exist() { let inventory = CdiGpuInventory::new(["nvidia.com/gpu=UUID-b", "nvidia.com/gpu=UUID-a"]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, false); assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=UUID-a".to_string()) + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=UUID-a".to_string()]) ); } #[test] - fn round_robin_uses_all_only_inventory_when_allowed() { + fn selector_uses_all_only_inventory_when_allowed() { let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, true); assert_eq!( - selector.next_default_device_id(&inventory, true), - Ok(CDI_GPU_DEVICE_ALL.to_string()) + selector.next_device_ids(1), + Ok(vec![CDI_GPU_DEVICE_ALL.to_string()]) ); } #[test] - fn round_robin_rejects_all_only_inventory_when_not_allowed() { + fn selector_rejects_all_only_inventory_when_not_allowed() { let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, false); assert_eq!( - selector.next_default_device_id(&inventory, false), + selector.next_device_ids(1), Err(CdiGpuSelectionError::AllDevicesDefaultUnsupported) ); } #[test] - fn round_robin_rejects_empty_inventory() { + fn selector_rejects_empty_inventory() { let inventory = CdiGpuInventory::new(["vendor.example/device=0"]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, false); assert_eq!( - selector.next_default_device_id(&inventory, false), + selector.next_device_ids(1), Err(CdiGpuSelectionError::NoAvailableDevices) ); } @@ -372,23 +454,205 @@ mod tests { #[test] fn peek_does_not_advance_round_robin_cursor() { let inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]); - let selector = CdiGpuRoundRobin::new(); + let selector = CdiGpuDefaultSelector::new(inventory, false); + + assert_eq!( + selector.peek_device_ids(1), + Ok(vec!["nvidia.com/gpu=0".to_string()]) + ); + assert_eq!( + selector.peek_device_ids(1), + Ok(vec!["nvidia.com/gpu=0".to_string()]) + ); + assert_eq!( + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=0".to_string()]) + ); + assert_eq!( + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=1".to_string()]) + ); + } + + #[test] + fn selector_selects_multiple_distinct_devices_in_cursor_order() { + let inventory = + CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"]); + let selector = CdiGpuDefaultSelector::new(inventory, false); + + assert_eq!( + selector.next_device_ids(2), + Ok(vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string() + ]) + ); + assert_eq!( + selector.next_device_ids(2), + Ok(vec![ + "nvidia.com/gpu=2".to_string(), + "nvidia.com/gpu=0".to_string() + ]) + ); + } + + #[test] + fn selector_rejects_count_larger_than_selectable_family_without_advancing() { + let inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]); + let selector = CdiGpuDefaultSelector::new(inventory, false); assert_eq!( - selector.peek_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=0".to_string()) + selector.next_device_ids(3), + Err(CdiGpuSelectionError::InsufficientDevices { + requested: 3, + available: 2 + }) ); assert_eq!( - selector.peek_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=0".to_string()) + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=0".to_string()]) ); + } + + #[test] + fn selector_treats_all_only_inventory_as_one_selectable_device() { + let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]); + let selector = CdiGpuDefaultSelector::new(inventory, true); + assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=0".to_string()) + selector.next_device_ids(2), + Err(CdiGpuSelectionError::InsufficientDevices { + requested: 2, + available: 1 + }) ); + } + + #[test] + fn selector_refreshes_inventory_without_resetting_cursor() { + let inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]); + let selector = CdiGpuDefaultSelector::new(inventory, false); + + assert_eq!( + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=0".to_string()]) + ); + selector.refresh( + CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"]), + false, + ); + assert_eq!( + selector.next_device_ids(1), + Ok(vec!["nvidia.com/gpu=1".to_string()]) + ); + } + + #[test] + fn validate_specific_gpu_device_request_ignores_empty_devices() { + validate_specific_gpu_device_request(None, &[], "driver_config.cdi_devices") + .expect("empty exact device lists should not be validated"); + } + + #[test] + fn validate_specific_gpu_device_request_accepts_matching_count() { + let gpu = DriverGpuResourceRequirements { count: Some(2) }; + let specific_devices = vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ]; + + validate_specific_gpu_device_request( + Some(&gpu), + &specific_devices, + "driver_config.cdi_devices", + ) + .expect("matching count should be accepted"); + } + + #[test] + fn validate_specific_gpu_device_request_accepts_missing_count_for_one_device() { + let gpu = DriverGpuResourceRequirements { count: None }; + let specific_devices = vec!["nvidia.com/gpu=0".to_string()]; + + validate_specific_gpu_device_request( + Some(&gpu), + &specific_devices, + "driver_config.cdi_devices", + ) + .expect("single exact device should be compatible with a default GPU request"); + } + + #[test] + fn validate_specific_gpu_device_request_rejects_missing_gpu_request() { + let specific_devices = vec!["nvidia.com/gpu=0".to_string()]; + + let err = validate_specific_gpu_device_request( + None, + &specific_devices, + "driver_config.cdi_devices", + ) + .expect_err("missing GPU request should be rejected"); + + assert_eq!(err, "driver_config.cdi_devices requires a gpu request"); + } + + #[test] + fn validate_specific_gpu_device_request_rejects_missing_count_for_multiple_devices() { + let gpu = DriverGpuResourceRequirements { count: None }; + let specific_devices = vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ]; + + let err = validate_specific_gpu_device_request( + Some(&gpu), + &specific_devices, + "driver_config.cdi_devices", + ) + .expect_err("missing count should be rejected for multiple devices"); + + assert_eq!( + err, + "gpu count (1) must match driver_config.cdi_devices length (2)" + ); + } + + #[test] + fn validate_specific_gpu_device_request_rejects_mismatch() { + let gpu = DriverGpuResourceRequirements { count: Some(2) }; + let specific_devices = vec!["nvidia.com/gpu=0".to_string()]; + + let err = validate_specific_gpu_device_request( + Some(&gpu), + &specific_devices, + "driver_config.cdi_devices", + ) + .expect_err("mismatched count should be rejected"); + + assert_eq!( + err, + "gpu count (2) must match driver_config.cdi_devices length (1)" + ); + } + + #[test] + fn validate_specific_gpu_device_request_rejects_duplicate_ids() { + let gpu = DriverGpuResourceRequirements { count: Some(2) }; + let specific_devices = vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=0".to_string(), + ]; + + let err = validate_specific_gpu_device_request( + Some(&gpu), + &specific_devices, + "driver_config.cdi_devices", + ) + .expect_err("duplicates should be rejected"); + assert_eq!( - selector.next_default_device_id(&inventory, false), - Ok("nvidia.com/gpu=1".to_string()) + err, + "driver_config.cdi_devices contains duplicate device ID 'nvidia.com/gpu=0'" ); } } diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 468d6d8cd..aedfa4e44 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -32,7 +32,7 @@ contract: | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. | | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. | | `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. | -| CDI GPU request | Uses `driver_config.cdi_devices` when set; otherwise selects one concrete NVIDIA CDI GPU when the sandbox resource requirements ask for GPU support and daemon CDI support is detected. Docker daemon `/info` can permit `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. Counted GPU requests are rejected. | +| CDI GPU request | Uses opaque `driver_config.cdi_devices` values when set; otherwise selects the requested count of NVIDIA CDI GPUs in round-robin order when daemon CDI support is detected. Docker daemon `/info` can permit `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback, where it counts as one selectable device. Exact CDI device lists must not contain duplicates and must match the effective GPU count. | The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index ae3da7798..6ced54892 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -28,8 +28,8 @@ use openshell_core::driver_utils::{ LABEL_SANDBOX_NAMESPACE, SUPERVISOR_IMAGE_BINARY_PATH, supervisor_image_should_refresh, }; use openshell_core::gpu::{ - CdiGpuInventory, CdiGpuRoundRobin, CdiGpuSelectionError, cdi_gpu_device_ids, - driver_gpu_requirements, + CdiGpuDefaultSelector, CdiGpuInventory, CdiGpuSelectionError, driver_gpu_requirements, + effective_driver_gpu_count, validate_specific_gpu_device_request, }; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, @@ -234,7 +234,6 @@ struct DockerDriverRuntimeConfig { guest_tls: Option, daemon_version: String, supports_gpu: bool, - cdi_gpu_inventory: CdiGpuInventory, allow_all_default_gpu: bool, sandbox_pids_limit: i64, enable_bind_mounts: bool, @@ -256,7 +255,7 @@ pub struct DockerComputeDriver { events: broadcast::Sender, pending: Arc>>, supervisor_readiness: Arc, - gpu_selector: Arc, + gpu_selector: Arc, } struct PendingSandboxRecord { @@ -287,19 +286,13 @@ struct DockerSandboxDriverConfig { mounts: Vec, } -impl DockerSandboxDriverConfig { - fn from_sandbox(sandbox: &DriverSandbox) -> Result { - let Some(template) = sandbox - .spec - .as_ref() - .and_then(|spec| spec.template.as_ref()) - else { - return Ok(Self::default()); - }; - - Self::from_template(template) - } +struct ValidatedDockerSandbox<'a> { + template: &'a DriverSandboxTemplate, + driver_config: DockerSandboxDriverConfig, + gpu_requirements: Option<&'a GpuResourceRequirements>, +} +impl DockerSandboxDriverConfig { fn from_template(template: &DriverSandboxTemplate) -> Result { let Some(config) = template.driver_config.as_ref() else { return Ok(Self::default()); @@ -420,7 +413,6 @@ impl DockerComputeDriver { guest_tls, daemon_version: version.version.unwrap_or_else(|| "unknown".to_string()), supports_gpu, - cdi_gpu_inventory, allow_all_default_gpu, sandbox_pids_limit: docker_config.sandbox_pids_limit, enable_bind_mounts: docker_config.enable_bind_mounts, @@ -428,7 +420,10 @@ impl DockerComputeDriver { events: broadcast::channel(WATCH_BUFFER).0, pending: Arc::new(Mutex::new(HashMap::new())), supervisor_readiness, - gpu_selector: Arc::new(CdiGpuRoundRobin::new()), + gpu_selector: Arc::new(CdiGpuDefaultSelector::new( + cdi_gpu_inventory, + allow_all_default_gpu, + )), }; let poll_driver = driver.clone(); @@ -455,10 +450,19 @@ impl DockerComputeDriver { ) } + #[cfg(test)] fn validate_sandbox( sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig, ) -> Result<(), Status> { + let _ = Self::validated_sandbox(sandbox, config)?; + Ok(()) + } + + fn validated_sandbox<'a>( + sandbox: &'a DriverSandbox, + config: &DockerDriverRuntimeConfig, + ) -> Result, Status> { let spec = sandbox .spec .as_ref() @@ -468,19 +472,21 @@ impl DockerComputeDriver { .as_ref() .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; - Self::validate_sandbox_template(template, config)?; - + Self::validate_sandbox_template_base(template)?; + let _ = docker_resource_limits(template)?; let driver_config = DockerSandboxDriverConfig::from_template(template).map_err(Status::invalid_argument)?; + validate_docker_driver_mounts(&driver_config.mounts, config.enable_bind_mounts)?; let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); Self::validate_gpu_request(gpu_requirements, config.supports_gpu, &driver_config)?; - Ok(()) + Ok(ValidatedDockerSandbox { + template, + driver_config, + gpu_requirements, + }) } - fn validate_sandbox_template( - template: &DriverSandboxTemplate, - config: &DockerDriverRuntimeConfig, - ) -> Result<(), Status> { + fn validate_sandbox_template_base(template: &DriverSandboxTemplate) -> Result<(), Status> { if template.image.trim().is_empty() { return Err(Status::failed_precondition( "docker sandboxes require a template image", @@ -501,8 +507,6 @@ impl DockerComputeDriver { )); } - let _ = docker_driver_config(template, config.enable_bind_mounts)?; - let _ = docker_resource_limits(template)?; Ok(()) } @@ -525,31 +529,31 @@ impl DockerComputeDriver { supports_gpu: bool, driver_config: &DockerSandboxDriverConfig, ) -> Result<(), Status> { - if gpu_requirements.is_none() && driver_config.cdi_devices.is_some() { - return Err(Status::invalid_argument( - "driver_config.cdi_devices requires gpu=true", - )); - } - - if gpu_requirements.is_some() && !supports_gpu { + let requested_count = + effective_driver_gpu_count(gpu_requirements).map_err(Status::invalid_argument)?; + if requested_count.is_some() && !supports_gpu { return Err(Status::failed_precondition( "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", )); } + + if let Some(cdi_devices) = driver_config.cdi_devices.as_deref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(Status::invalid_argument)?; + } + Ok(()) } async fn validate_user_volume_mounts_available( &self, - sandbox: &DriverSandbox, + driver_config: &DockerSandboxDriverConfig, ) -> Result<(), Status> { - let template = sandbox - .spec - .as_ref() - .and_then(|spec| spec.template.as_ref()) - .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; - let config = docker_driver_config(template, self.config.enable_bind_mounts)?; - for mount in config.mounts { + for mount in &driver_config.mounts { if let DockerDriverMountConfig::Volume { source, .. } = mount { match self.docker.inspect_volume(source.trim()).await { Ok(volume) => { @@ -576,42 +580,48 @@ impl DockerComputeDriver { Ok(()) } - fn peek_default_gpu_device(&self, sandbox: &DriverSandbox) -> Result, Status> { - self.selected_default_gpu_device(sandbox, false) - } - - fn next_default_gpu_device(&self, sandbox: &DriverSandbox) -> Result, Status> { - self.selected_default_gpu_device(sandbox, true) + async fn refresh_gpu_inventory(&self) -> Result<(), Status> { + let info = self + .docker + .info() + .await + .map_err(|err| internal_status("query Docker daemon info", err))?; + self.gpu_selector.refresh( + docker_cdi_gpu_inventory(&info), + self.config.allow_all_default_gpu, + ); + Ok(()) } - fn selected_default_gpu_device( + async fn resolve_gpu_cdi_devices( &self, - sandbox: &DriverSandbox, - consume: bool, - ) -> Result, Status> { - let Some(spec) = sandbox.spec.as_ref() else { + gpu_requirements: Option<&GpuResourceRequirements>, + driver_config: &DockerSandboxDriverConfig, + select_default_devices: fn( + &CdiGpuDefaultSelector, + u32, + ) -> Result, CdiGpuSelectionError>, + ) -> Result>, Status> { + if let Some(cdi_devices) = driver_config.cdi_devices.as_deref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(Status::invalid_argument)?; + return Ok(Some(cdi_devices.to_vec())); + } + + let Some(count) = + effective_driver_gpu_count(gpu_requirements).map_err(Status::invalid_argument)? + else { return Ok(None); }; - let driver_config = - DockerSandboxDriverConfig::from_sandbox(sandbox).map_err(Status::invalid_argument)?; - let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); - if gpu_requirements.is_none() || driver_config.cdi_devices.is_some() { - return Ok(None); - } - let selected = if consume { - self.gpu_selector.next_default_device_id( - &self.config.cdi_gpu_inventory, - self.config.allow_all_default_gpu, - ) - } else { - self.gpu_selector.peek_default_device_id( - &self.config.cdi_gpu_inventory, - self.config.allow_all_default_gpu, - ) - } - .map_err(docker_gpu_selection_status)?; - Ok(Some(selected)) + self.refresh_gpu_inventory().await?; + select_default_devices(&self.gpu_selector, count) + .map(Some) + .map_err(docker_gpu_selection_status) } async fn get_sandbox_snapshot( @@ -649,14 +659,22 @@ impl DockerComputeDriver { } async fn create_sandbox_inner(&self, sandbox: &DriverSandbox) -> Result<(), Status> { - Self::validate_sandbox(sandbox, &self.config)?; + let validated = Self::validated_sandbox(sandbox, &self.config)?; Self::validate_sandbox_auth(sandbox)?; - self.validate_user_volume_mounts_available(sandbox).await?; - let selected_default_gpu = self.peek_default_gpu_device(sandbox)?; - let _ = build_container_create_body_with_default( + self.validate_user_volume_mounts_available(&validated.driver_config) + .await?; + let gpu_devices = self + .resolve_gpu_cdi_devices( + validated.gpu_requirements, + &validated.driver_config, + CdiGpuDefaultSelector::peek_device_ids, + ) + .await?; + let _ = build_container_create_body_with_gpu_devices( sandbox, &self.config, - selected_default_gpu.as_deref(), + &validated.driver_config, + gpu_devices.as_deref(), )?; if self @@ -714,11 +732,10 @@ impl DockerComputeDriver { &self, sandbox: &DriverSandbox, ) -> Result<(), DockerProvisioningFailure> { - let template = sandbox - .spec - .as_ref() - .and_then(|spec| spec.template.as_ref()) - .expect("validated sandbox has template"); + let validated = Self::validated_sandbox(sandbox, &self.config).map_err(|status| { + DockerProvisioningFailure::new("ContainerCreateFailed", status.message()) + })?; + let template = validated.template; self.ensure_image_available(&sandbox.id, &template.image) .await .map_err(|status| { @@ -731,16 +748,24 @@ impl DockerComputeDriver { })?; let container_name = container_name_for_sandbox(sandbox); - let selected_default_gpu = self.next_default_gpu_device(sandbox).map_err(|status| { - if token_file_created { - cleanup_sandbox_token_file(sandbox, &self.config); - } - DockerProvisioningFailure::new("ContainerCreateFailed", status.message()) - })?; - let create_body = build_container_create_body_with_default( + let gpu_devices = self + .resolve_gpu_cdi_devices( + validated.gpu_requirements, + &validated.driver_config, + CdiGpuDefaultSelector::next_device_ids, + ) + .await + .map_err(|status| { + if token_file_created { + cleanup_sandbox_token_file(sandbox, &self.config); + } + DockerProvisioningFailure::new("ContainerCreateFailed", status.message()) + })?; + let create_body = build_container_create_body_with_gpu_devices( sandbox, &self.config, - selected_default_gpu.as_deref(), + &validated.driver_config, + gpu_devices.as_deref(), ) .map_err(|status| { if token_file_created { @@ -1377,9 +1402,16 @@ impl ComputeDriver for DockerComputeDriver { .into_inner() .sandbox .ok_or_else(|| Status::invalid_argument("sandbox is required"))?; - Self::validate_sandbox(&sandbox, &self.config)?; - self.validate_user_volume_mounts_available(&sandbox).await?; - let _ = self.peek_default_gpu_device(&sandbox)?; + let validated = Self::validated_sandbox(&sandbox, &self.config)?; + self.validate_user_volume_mounts_available(&validated.driver_config) + .await?; + let _ = self + .resolve_gpu_cdi_devices( + validated.gpu_requirements, + &validated.driver_config, + CdiGpuDefaultSelector::peek_device_ids, + ) + .await?; Ok(Response::new(ValidateSandboxCreateResponse {})) } @@ -1720,6 +1752,7 @@ fn attach_docker_progress_metadata( } } +#[cfg(test)] fn docker_driver_config( template: &DriverSandboxTemplate, enable_bind_mounts: bool, @@ -1730,11 +1763,7 @@ fn docker_driver_config( Ok(config) } -fn docker_driver_mounts( - template: &DriverSandboxTemplate, - enable_bind_mounts: bool, -) -> Result, Status> { - let config = docker_driver_config(template, enable_bind_mounts)?; +fn docker_driver_mounts(config: &DockerSandboxDriverConfig) -> Result, Status> { config.mounts.iter().map(docker_mount_from_config).collect() } @@ -2211,49 +2240,40 @@ fn docker_gpu_selection_status(err: CdiGpuSelectionError) -> Status { Status::failed_precondition(err.to_string()) } -fn build_device_requests( - sandbox: &DriverSandbox, - selected_default_device: Option<&str>, -) -> Result>, Status> { - let Some(spec) = sandbox.spec.as_ref() else { - return Ok(None); - }; - let cdi_devices = DockerSandboxDriverConfig::from_sandbox(sandbox) - .map_err(Status::invalid_argument)? - .cdi_devices - .unwrap_or_default(); - let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); - if gpu_requirements.is_none() && !cdi_devices.is_empty() { - return Err(Status::invalid_argument( - "driver_config.cdi_devices requires gpu=true", - )); - } - - cdi_gpu_device_ids(gpu_requirements, &cdi_devices, selected_default_device) - .map(|device_ids| { - device_ids.map(|device_ids| { - vec![DeviceRequest { - driver: Some("cdi".to_string()), - device_ids: Some(device_ids), - ..Default::default() - }] - }) - }) - .map_err(docker_gpu_selection_status) -} - #[cfg(test)] fn build_container_create_body( sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig, ) -> Result { - build_container_create_body_with_default(sandbox, config, None) + let template = sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; + let driver_config = docker_driver_config(template, config.enable_bind_mounts)?; + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| driver_gpu_requirements(spec.resource_requirements.as_ref())); + let cdi_devices = if let Some(cdi_devices) = driver_config.cdi_devices.as_ref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(Status::invalid_argument)?; + Some(cdi_devices.as_slice()) + } else { + None + }; + build_container_create_body_with_gpu_devices(sandbox, config, &driver_config, cdi_devices) } -fn build_container_create_body_with_default( +fn build_container_create_body_with_gpu_devices( sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig, - selected_default_device: Option<&str>, + driver_config: &DockerSandboxDriverConfig, + gpu_device_ids: Option<&[String]>, ) -> Result { let spec = sandbox .spec @@ -2264,8 +2284,14 @@ fn build_container_create_body_with_default( .as_ref() .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; let resource_limits = docker_resource_limits(template)?; - let user_mounts = docker_driver_mounts(template, config.enable_bind_mounts)?; - let device_requests = build_device_requests(sandbox, selected_default_device)?; + let user_mounts = docker_driver_mounts(driver_config)?; + let device_requests = gpu_device_ids.map(|device_ids| { + vec![DeviceRequest { + driver: Some("cdi".to_string()), + device_ids: Some(device_ids.to_vec()), + ..Default::default() + }] + }); let mut labels = template.labels.clone(); labels.insert( LABEL_MANAGED_BY.to_string(), diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index a48b82f74..4f5b26617 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -80,9 +80,9 @@ fn list_string_driver_config(field: &str, values: &[&str]) -> prost_types::Struc } } -fn gpu_resources() -> ResourceRequirements { +fn gpu_resources(count: Option) -> ResourceRequirements { ResourceRequirements { - gpu: Some(GpuResourceRequirements {}), + gpu: Some(GpuResourceRequirements { count }), } } @@ -111,7 +111,6 @@ fn runtime_config() -> DockerDriverRuntimeConfig { }), daemon_version: "28.0.0".to_string(), supports_gpu: false, - cdi_gpu_inventory: CdiGpuInventory::default(), allow_all_default_gpu: false, sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT, enable_bind_mounts: false, @@ -179,6 +178,7 @@ impl SupervisorReadiness for DisconnectedSupervisorReadiness { } fn test_driver_with_config(config: DockerDriverRuntimeConfig) -> DockerComputeDriver { + let allow_all_default_gpu = config.allow_all_default_gpu; DockerComputeDriver { docker: Arc::new( Docker::connect_with_http("http://127.0.0.1:2375", 1, bollard::API_DEFAULT_VERSION) @@ -188,7 +188,10 @@ fn test_driver_with_config(config: DockerDriverRuntimeConfig) -> DockerComputeDr events: broadcast::channel(WATCH_BUFFER).0, pending: Arc::new(tokio::sync::Mutex::new(HashMap::new())), supervisor_readiness: Arc::new(DisconnectedSupervisorReadiness), - gpu_selector: Arc::new(CdiGpuRoundRobin::new()), + gpu_selector: Arc::new(CdiGpuDefaultSelector::new( + CdiGpuInventory::default(), + allow_all_default_gpu, + )), } } @@ -1052,7 +1055,21 @@ fn build_container_create_body_clears_inherited_cmd() { fn validate_sandbox_rejects_gpu_when_cdi_unavailable() { let config = runtime_config(); let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources()); + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(None)); + + let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); + + assert_eq!(err.code(), tonic::Code::FailedPrecondition); + assert!(err.message().contains("Docker CDI")); +} + +#[test] +fn validate_sandbox_rejects_missing_gpu_support_before_request_shape() { + let config = runtime_config(); + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(Some(2))); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); @@ -1065,7 +1082,7 @@ fn validate_sandbox_rejects_invalid_cdi_devices_before_gpu_capability() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.resource_requirements = Some(gpu_resources()); + spec.resource_requirements = Some(gpu_resources(None)); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[])); let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); @@ -1080,7 +1097,7 @@ fn validate_sandbox_rejects_unknown_driver_config_fields() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.resource_requirements = Some(gpu_resources()); + spec.resource_requirements = Some(gpu_resources(None)); spec.template.as_mut().unwrap().driver_config = Some(cdi_device_typo_config(&["nvidia.com/gpu=0"])); @@ -1090,11 +1107,111 @@ fn validate_sandbox_rejects_unknown_driver_config_fields() { assert!(err.message().contains("unknown field")); } +#[test] +fn validate_sandbox_accepts_gpu_count_request_shape() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(Some(2))); + + DockerComputeDriver::validate_sandbox(&sandbox, &config) + .expect("default GPU count shape should be accepted before inventory selection"); +} + +#[test] +fn validate_sandbox_accepts_gpu_count_matching_cdi_devices() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(Some(2))); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[ + "nvidia.com/gpu=0", + "nvidia.com/gpu=1", + ])); + + DockerComputeDriver::validate_sandbox(&sandbox, &config) + .expect("matching explicit CDI device count should be accepted"); +} + +#[test] +fn validate_sandbox_accepts_single_cdi_device_without_gpu_count() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(None)); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); + + DockerComputeDriver::validate_sandbox(&sandbox, &config) + .expect("single exact CDI device should be compatible with a default GPU request"); +} + +#[test] +fn validate_sandbox_rejects_multiple_cdi_devices_without_gpu_count() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(None)); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[ + "nvidia.com/gpu=0", + "nvidia.com/gpu=1", + ])); + + let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); + + assert_eq!(err.code(), tonic::Code::InvalidArgument); + assert!( + err.message() + .contains("gpu count (1) must match driver_config.cdi_devices length (2)") + ); +} + +#[test] +fn validate_sandbox_rejects_cdi_devices_without_gpu_request() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + sandbox + .spec + .as_mut() + .unwrap() + .template + .as_mut() + .unwrap() + .driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); + + let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); + + assert_eq!(err.code(), tonic::Code::InvalidArgument); + assert!(err.message().contains("requires a gpu request")); +} + +#[test] +fn validate_sandbox_rejects_gpu_count_mismatched_cdi_devices() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(Some(2))); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); + + let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); + + assert_eq!(err.code(), tonic::Code::InvalidArgument); + assert!( + err.message() + .contains("gpu count (2) must match driver_config.cdi_devices length (1)") + ); +} + +#[test] fn validate_sandbox_rejects_template_errors_before_device_config() { let config = runtime_config(); let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.resource_requirements = Some(gpu_resources()); + spec.resource_requirements = Some(gpu_resources(None)); let template = spec.template.as_mut().unwrap(); template.agent_socket_path = "/tmp/agent.sock".to_string(); template.driver_config = Some(cdi_devices_config(&[])); @@ -1132,11 +1249,17 @@ fn build_container_create_body_maps_default_gpu_to_selected_cdi_device() { let mut config = runtime_config(); config.supports_gpu = true; let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources()); - - let create_body = - build_container_create_body_with_default(&sandbox, &config, Some("nvidia.com/gpu=1")) - .unwrap(); + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(None)); + + let driver_config = DockerSandboxDriverConfig::default(); + let gpu_devices = vec!["nvidia.com/gpu=1".to_string()]; + let create_body = build_container_create_body_with_gpu_devices( + &sandbox, + &config, + &driver_config, + Some(&gpu_devices), + ) + .unwrap(); let request = create_body .host_config .as_ref() @@ -1152,19 +1275,20 @@ fn build_container_create_body_maps_default_gpu_to_selected_cdi_device() { } #[test] -fn build_container_create_body_rejects_missing_default_cdi_device() { +fn build_container_create_body_omits_devices_without_resolved_default_cdi_devices() { let mut config = runtime_config(); config.supports_gpu = true; let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().gpu = true; + sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(None)); - let err = build_container_create_body(&sandbox, &config).unwrap_err(); + let create_body = build_container_create_body(&sandbox, &config).unwrap(); - assert_eq!(err.code(), tonic::Code::FailedPrecondition); assert!( - err.message().contains("selected default CDI GPU device"), - "unexpected error: {}", - err.message() + create_body + .host_config + .as_ref() + .and_then(|host_config| host_config.device_requests.as_ref()) + .is_none() ); } @@ -1174,7 +1298,7 @@ fn build_container_create_body_passes_explicit_cdi_device_id_through() { config.supports_gpu = true; let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.resource_requirements = Some(gpu_resources()); + spec.resource_requirements = Some(gpu_resources(None)); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); let create_body = build_container_create_body(&sandbox, &config).unwrap(); @@ -1193,7 +1317,25 @@ fn build_container_create_body_passes_explicit_cdi_device_id_through() { } #[test] -fn build_container_create_body_rejects_cdi_devices_without_gpu() { +fn build_container_create_body_rejects_gpu_count_mismatched_cdi_devices() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.resource_requirements = Some(gpu_resources(Some(2))); + spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&["nvidia.com/gpu=0"])); + + let err = build_container_create_body(&sandbox, &config).unwrap_err(); + + assert_eq!(err.code(), tonic::Code::InvalidArgument); + assert!( + err.message() + .contains("gpu count (2) must match driver_config.cdi_devices length (1)") + ); +} + +#[test] +fn build_container_create_body_rejects_cdi_devices_without_gpu_request() { let mut sandbox = test_sandbox(); sandbox .spec @@ -1206,14 +1348,14 @@ fn build_container_create_body_rejects_cdi_devices_without_gpu() { let err = build_container_create_body(&sandbox, &runtime_config()).unwrap_err(); assert_eq!(err.code(), tonic::Code::InvalidArgument); - assert!(err.message().contains("requires gpu=true")); + assert!(err.message().contains("requires a gpu request")); } #[test] fn build_container_create_body_rejects_empty_cdi_devices() { let mut sandbox = test_sandbox(); let spec = sandbox.spec.as_mut().unwrap(); - spec.resource_requirements = Some(gpu_resources()); + spec.resource_requirements = Some(gpu_resources(None)); spec.template.as_mut().unwrap().driver_config = Some(cdi_devices_config(&[])); let err = build_container_create_body(&sandbox, &runtime_config()).unwrap_err(); @@ -1225,40 +1367,46 @@ fn build_container_create_body_rejects_empty_cdi_devices() { fn driver_default_gpu_selection_consumes_distinct_devices_for_creates() { let mut config = runtime_config(); config.supports_gpu = true; - config.cdi_gpu_inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]); let driver = test_driver_with_config(config); + driver.gpu_selector.refresh( + CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]), + false, + ); let mut first_sandbox = test_sandbox(); first_sandbox.id = "sbx-first".to_string(); first_sandbox.name = "first".to_string(); - first_sandbox.spec.as_mut().unwrap().gpu = true; + first_sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(None)); let mut second_sandbox = test_sandbox(); second_sandbox.id = "sbx-second".to_string(); second_sandbox.name = "second".to_string(); - second_sandbox.spec.as_mut().unwrap().gpu = true; + second_sandbox.spec.as_mut().unwrap().resource_requirements = Some(gpu_resources(None)); DockerComputeDriver::validate_sandbox(&first_sandbox, &driver.config).unwrap(); assert_eq!( - driver.peek_default_gpu_device(&first_sandbox).unwrap(), - Some("nvidia.com/gpu=0".to_string()) + driver.gpu_selector.peek_device_ids(1).unwrap(), + vec!["nvidia.com/gpu=0".to_string()] ); - let first_device = driver.next_default_gpu_device(&first_sandbox).unwrap(); - let first_create_body = build_container_create_body_with_default( + let first_devices = driver.gpu_selector.next_device_ids(1).unwrap(); + let driver_config = DockerSandboxDriverConfig::default(); + let first_create_body = build_container_create_body_with_gpu_devices( &first_sandbox, &driver.config, - first_device.as_deref(), + &driver_config, + Some(&first_devices), ) .unwrap(); DockerComputeDriver::validate_sandbox(&second_sandbox, &driver.config).unwrap(); assert_eq!( - driver.peek_default_gpu_device(&second_sandbox).unwrap(), - Some("nvidia.com/gpu=1".to_string()) + driver.gpu_selector.peek_device_ids(1).unwrap(), + vec!["nvidia.com/gpu=1".to_string()] ); - let second_device = driver.next_default_gpu_device(&second_sandbox).unwrap(); - let second_create_body = build_container_create_body_with_default( + let second_devices = driver.gpu_selector.next_device_ids(1).unwrap(); + let second_create_body = build_container_create_body_with_gpu_devices( &second_sandbox, &driver.config, - second_device.as_deref(), + &driver_config, + Some(&second_devices), ) .unwrap(); diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index d634cea9a..a32034f3a 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -17,7 +17,7 @@ use kube::{Client, Error as KubeError}; use openshell_core::driver_utils::{ LABEL_MANAGED_BY, LABEL_MANAGED_BY_VALUE, LABEL_SANDBOX_ID, SUPERVISOR_IMAGE_BINARY_PATH, }; -use openshell_core::gpu::driver_gpu_requirements; +use openshell_core::gpu::{driver_gpu_requirements, effective_driver_gpu_count}; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, format_bytes, mark_progress_active, mark_progress_complete, mark_progress_detail, @@ -84,7 +84,6 @@ const SANDBOX_VERSION: &str = "v1alpha1"; pub const SANDBOX_KIND: &str = "Sandbox"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; -const GPU_RESOURCE_QUANTITY: &str = "1"; const SPIFFE_WORKLOAD_API_VOLUME_NAME: &str = "spiffe-workload-api"; // This POC treats the selected Struct as a driver-local typed schema. Once the @@ -652,8 +651,10 @@ impl KubernetesComputeDriver { } fn validate_gpu_request( - _gpu_requirements: Option<&GpuResourceRequirements>, + gpu_requirements: Option<&GpuResourceRequirements>, ) -> Result<(), tonic::Status> { + let _ = + effective_driver_gpu_count(gpu_requirements).map_err(tonic::Status::invalid_argument)?; Ok(()) } @@ -1279,7 +1280,7 @@ fn sandbox_template_to_k8s( inject_workspace: bool, params: &SandboxPodParams<'_>, ) -> serde_json::Value { - let gpu_requirements = gpu.then_some(GpuResourceRequirements {}); + let gpu_requirements = gpu.then_some(GpuResourceRequirements { count: None }); sandbox_template_to_k8s_with_gpu_requirements( template, gpu_requirements.as_ref(), @@ -1697,8 +1698,9 @@ fn container_resources( apply("requests", "memory", memory_request); } - if gpu_requirements.is_some() { - apply_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY); + if let Some(gpu) = gpu_requirements { + let quantity = gpu.count.unwrap_or(1).to_string(); + apply_gpu_limit(&mut resources, &quantity); } if resources.as_object().is_some_and(serde_json::Map::is_empty) { None @@ -1970,7 +1972,7 @@ mod tests { PROGRESS_ACTIVE_DETAIL_KEY, PROGRESS_ACTIVE_STEP_KEY, PROGRESS_COMPLETE_LABEL_KEY, PROGRESS_COMPLETE_STEP_KEY, }; - use openshell_core::proto::compute::v1::GpuResourceRequirements; + use openshell_core::proto::compute::v1::{GpuResourceRequirements, ResourceRequirements}; use prost_types::{Struct, Value, value::Kind}; static ENV_LOCK: std::sync::LazyLock> = @@ -2060,6 +2062,28 @@ mod tests { assert!(err.contains("gpu_device_ids")); } + #[test] + fn validate_rejects_zero_gpu_count() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + resource_requirements: Some(ResourceRequirements { + gpu: Some(GpuResourceRequirements { count: Some(0) }), + }), + ..SandboxSpec::default() + }), + ..Sandbox::default() + }; + + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| driver_gpu_requirements(spec.resource_requirements.as_ref())); + let err = validate_gpu_request(gpu_requirements).unwrap_err(); + assert_eq!(err.code(), tonic::Code::InvalidArgument); + assert!(err.message().contains("gpu count must be greater than 0")); + } + + #[test] fn kube_pulling_event_adds_image_progress_metadata() { let mut metadata = std::collections::HashMap::new(); @@ -2384,10 +2408,31 @@ mod tests { ); assert_eq!( pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], - serde_json::json!(GPU_RESOURCE_QUANTITY) + serde_json::json!("1") ); } + #[test] + fn gpu_count_sandbox_adds_requested_gpu_limit() { + let pod_template = { + let params = SandboxPodParams::default(); + let gpu_requirements = GpuResourceRequirements { count: Some(2) }; + sandbox_template_to_k8s_with_gpu_requirements( + &SandboxTemplate::default(), + Some(&gpu_requirements), + &std::collections::HashMap::new(), + true, + ¶ms, + ) + }; + + assert_eq!( + pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], + serde_json::json!("2") + ); + } + + #[test] fn gpu_sandbox_uses_template_runtime_class_name_when_set() { let template = SandboxTemplate { platform_config: Some(Struct { @@ -2649,10 +2694,7 @@ mod tests { let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; assert_eq!(limits["cpu"], serde_json::json!("2")); - assert_eq!( - limits[GPU_RESOURCE_NAME], - serde_json::json!(GPU_RESOURCE_QUANTITY) - ); + assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("1")); } #[test] diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index 1291dbc29..c484bf8a4 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -46,7 +46,7 @@ The container spec in `container.rs` sets these security-critical fields: | `no_new_privileges` | `true` | Prevents privilege escalation after exec. | | `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. | | `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. | -| CDI GPU devices | `driver_config.cdi_devices` when set, otherwise one concrete NVIDIA CDI GPU. Local `/dev/dxg` permits `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback. | Exposes requested GPUs to GPU-enabled sandbox containers. Counted GPU requests are rejected. | +| CDI GPU devices | Opaque `driver_config.cdi_devices` values when set, otherwise the requested count of NVIDIA CDI GPUs selected in round-robin order. Local `/dev/dxg` permits `nvidia.com/gpu=all` as a WSL2 all-only compatibility fallback, where it counts as one selectable device. | Exposes requested GPUs to GPU-enabled sandbox containers. Exact CDI device lists must not contain duplicates and must match the effective GPU count. | The restricted agent child does not retain these supervisor privileges. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 348ffe265..16814784a 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -5,7 +5,8 @@ use crate::config::PodmanComputeConfig; use openshell_core::ComputeDriverError; -use openshell_core::gpu::{cdi_gpu_device_ids, driver_gpu_requirements}; +#[cfg(test)] +use openshell_core::gpu::{driver_gpu_requirements, validate_specific_gpu_device_request}; use openshell_core::proto::compute::v1::{DriverSandbox, DriverSandboxTemplate}; use openshell_core::proto_struct::deserialize_optional_non_empty_string_list; use openshell_core::{driver_mounts, proto_struct}; @@ -476,36 +477,6 @@ fn podman_pids_limit(value: i64) -> Option { if value > 0 { Some(value) } else { None } } -/// Build CDI GPU device list if GPU is requested. -fn build_devices( - sandbox: &DriverSandbox, - selected_default_device: Option<&str>, -) -> Result>, ComputeDriverError> { - let Some(spec) = sandbox.spec.as_ref() else { - return Ok(None); - }; - let cdi_devices = PodmanSandboxDriverConfig::from_sandbox(sandbox)? - .cdi_devices - .unwrap_or_default(); - let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); - if gpu_requirements.is_none() && !cdi_devices.is_empty() { - return Err(ComputeDriverError::InvalidArgument( - "driver_config.cdi_devices requires gpu=true".to_string(), - )); - } - - cdi_gpu_device_ids(gpu_requirements, &cdi_devices, selected_default_device) - .map(|device_ids| { - device_ids.map(|device_ids| { - device_ids - .into_iter() - .map(|path| LinuxDevice { path }) - .collect() - }) - }) - .map_err(|err| ComputeDriverError::Precondition(err.to_string())) -} - pub fn podman_driver_volume_mount_sources( sandbox: &DriverSandbox, enable_bind_mounts: bool, @@ -796,14 +767,30 @@ pub fn try_build_container_spec_with_token( config: &PodmanComputeConfig, token_host_path: Option<&Path>, ) -> Result { - build_container_spec_with_token_and_gpu_default(sandbox, config, token_host_path, None) + let driver_config = PodmanSandboxDriverConfig::from_sandbox(sandbox)?; + let gpu_requirements = sandbox + .spec + .as_ref() + .and_then(|spec| driver_gpu_requirements(spec.resource_requirements.as_ref())); + let cdi_devices = if let Some(cdi_devices) = driver_config.cdi_devices.as_ref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(ComputeDriverError::InvalidArgument)?; + Some(cdi_devices.as_slice()) + } else { + None + }; + build_container_spec_with_token_and_gpu_devices(sandbox, config, token_host_path, cdi_devices) } -pub fn build_container_spec_with_token_and_gpu_default( +pub fn build_container_spec_with_token_and_gpu_devices( sandbox: &DriverSandbox, config: &PodmanComputeConfig, token_host_path: Option<&Path>, - selected_default_device: Option<&str>, + gpu_device_ids: Option<&[String]>, ) -> Result { let image = resolve_image(sandbox, config); let name = container_name(&sandbox.name); @@ -814,7 +801,13 @@ pub fn build_container_spec_with_token_and_gpu_default( let resource_limits = build_resource_limits(sandbox, config); let user_mounts = podman_user_mounts(sandbox, config.enable_bind_mounts) .map_err(ComputeDriverError::InvalidArgument)?; - let devices = build_devices(sandbox, selected_default_device)?; + let devices = gpu_device_ids.map(|device_ids| { + device_ids + .iter() + .cloned() + .map(|path| LinuxDevice { path }) + .collect() + }); // Network configuration -- always bridge mode. // Matches libpod's network spec format `{name: {opts}}`; the unit-struct @@ -1150,9 +1143,9 @@ mod tests { } } - fn gpu_resources() -> ResourceRequirements { + fn gpu_resources(count: Option) -> ResourceRequirements { ResourceRequirements { - gpu: Some(GpuResourceRequirements {}), + gpu: Some(GpuResourceRequirements { count }), } } @@ -1268,15 +1261,16 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), ..Default::default() }); let config = test_config(); - let spec = build_container_spec_with_token_and_gpu_default( + let gpu_devices = vec!["nvidia.com/gpu=1".to_string()]; + let spec = build_container_spec_with_token_and_gpu_devices( &sandbox, &config, None, - Some("nvidia.com/gpu=1"), + Some(&gpu_devices), ) .unwrap(); @@ -1287,21 +1281,20 @@ mod tests { } #[test] - fn container_spec_rejects_missing_default_cdi_device() { + fn container_spec_omits_devices_without_resolved_default_cdi_devices() { use openshell_core::proto::compute::v1::DriverSandboxSpec; let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }); let config = test_config(); - let err = build_container_spec_with_token_and_gpu_default(&sandbox, &config, None, None) - .unwrap_err(); + let spec = + build_container_spec_with_token_and_gpu_devices(&sandbox, &config, None, None).unwrap(); - assert!(matches!(err, ComputeDriverError::Precondition(_))); - assert!(err.to_string().contains("selected default CDI GPU device")); + assert!(spec.get("devices").is_none()); } #[test] @@ -1310,7 +1303,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_devices_config(&["nvidia.com/gpu=0"])), ..Default::default() @@ -1327,7 +1320,60 @@ mod tests { } #[test] - fn container_spec_rejects_cdi_devices_without_gpu() { + fn container_spec_accepts_gpu_count_matching_cdi_devices() { + use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + resource_requirements: Some(gpu_resources(Some(2))), + template: Some(DriverSandboxTemplate { + driver_config: Some(cdi_devices_config(&[ + "nvidia.com/gpu=0", + "nvidia.com/gpu=1", + ])), + ..Default::default() + }), + ..Default::default() + }); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!(spec["devices"].as_array().map(Vec::len), Some(2)); + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some("nvidia.com/gpu=0") + ); + assert_eq!( + spec["devices"][1]["path"].as_str(), + Some("nvidia.com/gpu=1") + ); + } + + #[test] + fn container_spec_rejects_gpu_count_mismatched_cdi_devices() { + use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + resource_requirements: Some(gpu_resources(Some(2))), + template: Some(DriverSandboxTemplate { + driver_config: Some(cdi_devices_config(&["nvidia.com/gpu=0"])), + ..Default::default() + }), + ..Default::default() + }); + let config = test_config(); + + let err = try_build_container_spec_with_token(&sandbox, &config, None).unwrap_err(); + assert!(matches!(err, ComputeDriverError::InvalidArgument(_))); + assert!( + err.to_string() + .contains("gpu count (2) must match driver_config.cdi_devices length (1)") + ); + } + + #[test] + fn container_spec_rejects_cdi_devices_without_gpu_request() { use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; let mut sandbox = test_sandbox("test-id", "test-name"); @@ -1342,7 +1388,7 @@ mod tests { let err = try_build_container_spec_with_token(&sandbox, &config, None).unwrap_err(); assert!(matches!(err, ComputeDriverError::InvalidArgument(_))); - assert!(err.to_string().contains("requires gpu=true")); + assert!(err.to_string().contains("requires a gpu request")); } #[test] @@ -1351,7 +1397,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_devices_config(&[])), ..Default::default() @@ -1371,7 +1417,7 @@ mod tests { let mut sandbox = test_sandbox("test-id", "test-name"); sandbox.spec = Some(DriverSandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_device_typo_config(&["nvidia.com/gpu=0"])), ..Default::default() diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index 927c02c28..27d510281 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -13,7 +13,8 @@ use openshell_core::ComputeDriverError; use openshell_core::config::CDI_GPU_DEVICE_ALL; use openshell_core::driver_utils::supervisor_image_should_refresh; use openshell_core::gpu::{ - CdiGpuInventory, CdiGpuRoundRobin, CdiGpuSelectionError, driver_gpu_requirements, + CdiGpuDefaultSelector, CdiGpuInventory, CdiGpuSelectionError, driver_gpu_requirements, + effective_driver_gpu_count, validate_specific_gpu_device_request, }; use openshell_core::proto::compute::v1::{ DriverSandbox, GetCapabilitiesResponse, GpuResourceRequirements, @@ -41,9 +42,8 @@ pub struct PodmanComputeDriver { /// The host's IP on the bridge network. Sandbox containers use this to /// reach the gateway server when no explicit gRPC endpoint is configured. network_gateway_ip: Option, - gpu_inventory: CdiGpuInventory, - allow_all_default_gpu: bool, - gpu_selector: Arc, + gpu_selector: Arc, + gpu_inventory_refresh: Arc (CdiGpuInventory, bool) + Send + Sync>, } impl std::fmt::Debug for PodmanComputeDriver { @@ -52,12 +52,16 @@ impl std::fmt::Debug for PodmanComputeDriver { .field("socket_path", &self.config.socket_path) .field("default_image", &self.config.default_image) .field("network_name", &self.config.network_name) - .field("gpu_inventory", &self.gpu_inventory) - .field("allow_all_default_gpu", &self.allow_all_default_gpu) + .field("gpu_inventory", &self.gpu_selector.device_ids()) .finish() } } +struct ValidatedPodmanSandbox<'a> { + driver_config: PodmanSandboxDriverConfig, + gpu_requirements: Option<&'a GpuResourceRequirements>, +} + /// Construct and validate a container name from a sandbox name. /// /// Combines the prefix with the sandbox name and validates the result @@ -170,6 +174,13 @@ fn local_podman_all_gpu_default_supported() -> bool { local_podman_all_gpu_default_supported_from(Path::new("/dev")) } +fn local_podman_gpu_selector_state() -> (CdiGpuInventory, bool) { + ( + local_podman_cdi_gpu_inventory(), + local_podman_all_gpu_default_supported(), + ) +} + fn podman_gpu_selection_error(err: CdiGpuSelectionError) -> ComputeDriverError { ComputeDriverError::Precondition(err.to_string()) } @@ -271,8 +282,7 @@ impl PodmanComputeDriver { "Bridge network ready" ); - let gpu_inventory = local_podman_cdi_gpu_inventory(); - let allow_all_default_gpu = local_podman_all_gpu_default_supported(); + let (gpu_inventory, allow_all_default_gpu) = local_podman_gpu_selector_state(); if !gpu_inventory.is_empty() { info!( device_count = gpu_inventory.as_slice().len(), @@ -308,9 +318,11 @@ impl PodmanComputeDriver { client, config, network_gateway_ip, - gpu_inventory, - allow_all_default_gpu, - gpu_selector: Arc::new(CdiGpuRoundRobin::new()), + gpu_selector: Arc::new(CdiGpuDefaultSelector::new( + gpu_inventory, + allow_all_default_gpu, + )), + gpu_inventory_refresh: Arc::new(local_podman_gpu_selector_state), }) } @@ -342,71 +354,85 @@ impl PodmanComputeDriver { &self, sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { + let _ = self.validated_sandbox_create(sandbox).await?; + Ok(()) + } + + async fn validated_sandbox_create<'a>( + &self, + sandbox: &'a DriverSandbox, + ) -> Result, ComputeDriverError> { let gpu_requirements = sandbox .spec .as_ref() .and_then(|spec| spec.resource_requirements.as_ref()) .and_then(|requirements| driver_gpu_requirements(Some(requirements))); let driver_config = PodmanSandboxDriverConfig::from_sandbox(sandbox)?; - if gpu_requirements.is_none() && driver_config.cdi_devices.is_some() { - return Err(ComputeDriverError::InvalidArgument( - "driver_config.cdi_devices requires gpu=true".to_string(), - )); - } - Self::validate_gpu_request(gpu_requirements)?; + Self::validate_gpu_request(gpu_requirements, &driver_config)?; self.validate_user_volume_mounts_available(sandbox).await?; - let _ = self.peek_default_gpu_device(sandbox)?; - Ok(()) + let _ = self.resolve_gpu_cdi_devices( + gpu_requirements, + &driver_config, + CdiGpuDefaultSelector::peek_device_ids, + )?; + Ok(ValidatedPodmanSandbox { + driver_config, + gpu_requirements, + }) } fn validate_gpu_request( gpu_requirements: Option<&GpuResourceRequirements>, + driver_config: &PodmanSandboxDriverConfig, ) -> Result<(), ComputeDriverError> { - if gpu_requirements.is_some() && !Self::has_gpu_capacity() { - return Err(ComputeDriverError::Precondition( - "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), - )); + let _ = effective_driver_gpu_count(gpu_requirements) + .map_err(ComputeDriverError::InvalidArgument)?; + if let Some(cdi_devices) = driver_config.cdi_devices.as_deref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(ComputeDriverError::InvalidArgument)?; } + Ok(()) } - fn peek_default_gpu_device( - &self, - sandbox: &DriverSandbox, - ) -> Result, ComputeDriverError> { - self.selected_default_gpu_device(sandbox, false) + fn refresh_gpu_inventory(&self) { + let (inventory, allow_all_default_gpu) = (self.gpu_inventory_refresh)(); + self.gpu_selector.refresh(inventory, allow_all_default_gpu); } - fn next_default_gpu_device( + fn resolve_gpu_cdi_devices( &self, - sandbox: &DriverSandbox, - ) -> Result, ComputeDriverError> { - self.selected_default_gpu_device(sandbox, true) - } + gpu_requirements: Option<&GpuResourceRequirements>, + driver_config: &PodmanSandboxDriverConfig, + select_default_devices: fn( + &CdiGpuDefaultSelector, + u32, + ) -> Result, CdiGpuSelectionError>, + ) -> Result>, ComputeDriverError> { + if let Some(cdi_devices) = driver_config.cdi_devices.as_deref() { + validate_specific_gpu_device_request( + gpu_requirements, + cdi_devices, + "driver_config.cdi_devices", + ) + .map_err(ComputeDriverError::InvalidArgument)?; + return Ok(Some(cdi_devices.to_vec())); + } - fn selected_default_gpu_device( - &self, - sandbox: &DriverSandbox, - consume: bool, - ) -> Result, ComputeDriverError> { - let Some(spec) = sandbox.spec.as_ref() else { + let Some(count) = effective_driver_gpu_count(gpu_requirements) + .map_err(ComputeDriverError::InvalidArgument)? + else { return Ok(None); }; - let driver_config = PodmanSandboxDriverConfig::from_sandbox(sandbox)?; - let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); - if gpu_requirements.is_none() || driver_config.cdi_devices.is_some() { - return Ok(None); - } - let selected = if consume { - self.gpu_selector - .next_default_device_id(&self.gpu_inventory, self.allow_all_default_gpu) - } else { - self.gpu_selector - .peek_default_device_id(&self.gpu_inventory, self.allow_all_default_gpu) - } - .map_err(podman_gpu_selection_error)?; - Ok(Some(selected)) + self.refresh_gpu_inventory(); + select_default_devices(&self.gpu_selector, count) + .map(Some) + .map_err(podman_gpu_selection_error) } async fn validate_user_volume_mounts_available( @@ -454,7 +480,7 @@ impl PodmanComputeDriver { // resources (volume), so we don't leave orphans when the name is // invalid. let name = validated_container_name(&sandbox.name)?; - self.validate_sandbox_create(sandbox).await?; + let validated = self.validated_sandbox_create(sandbox).await?; let vol_name = container::volume_name(&sandbox.id); @@ -520,19 +546,23 @@ impl PodmanComputeDriver { }; // 3. Create container. - let selected_default_gpu = match self.next_default_gpu_device(sandbox) { - Ok(device) => device, + let gpu_devices = match self.resolve_gpu_cdi_devices( + validated.gpu_requirements, + &validated.driver_config, + CdiGpuDefaultSelector::next_device_ids, + ) { + Ok(devices) => devices, Err(e) => { let _ = self.client.remove_volume(&vol_name).await; cleanup_sandbox_token_file(&sandbox.id); return Err(e); } }; - let spec = match container::build_container_spec_with_token_and_gpu_default( + let spec = match container::build_container_spec_with_token_and_gpu_devices( sandbox, &self.config, token_host_path.as_deref(), - selected_default_gpu.as_deref(), + gpu_devices.as_deref(), ) { Ok(spec) => spec, Err(e) => { @@ -763,13 +793,18 @@ impl PodmanComputeDriver { allow_all_default_gpu: bool, ) -> Self { let client = PodmanClient::new(config.socket_path.clone()); + let refresh_inventory = gpu_inventory.clone(); Self { client, config, network_gateway_ip: None, - gpu_inventory, - allow_all_default_gpu, - gpu_selector: Arc::new(CdiGpuRoundRobin::new()), + gpu_selector: Arc::new(CdiGpuDefaultSelector::new( + gpu_inventory, + allow_all_default_gpu, + )), + gpu_inventory_refresh: Arc::new(move || { + (refresh_inventory.clone(), allow_all_default_gpu) + }), } } } @@ -826,7 +861,9 @@ mod tests { use super::*; use crate::test_utils::{StubResponse, spawn_podman_stub}; use hyper::StatusCode; - use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; + use openshell_core::proto::compute::v1::{ + DriverSandboxSpec, DriverSandboxTemplate, ResourceRequirements, + }; use std::collections::HashMap; use std::fs; use std::path::PathBuf; @@ -854,6 +891,12 @@ mod tests { } } + fn gpu_resources(count: Option) -> ResourceRequirements { + ResourceRequirements { + gpu: Some(GpuResourceRequirements { count }), + } + } + #[test] fn podman_driver_error_from_conflict() { let err = ComputeDriverError::from(PodmanApiError::Conflict("exists".into())); @@ -866,6 +909,69 @@ mod tests { assert!(matches!(err, ComputeDriverError::Message(_))); } + #[test] + fn validate_gpu_request_accepts_gpu_count_request_shape() { + let gpu = GpuResourceRequirements { count: Some(2) }; + let driver_config = PodmanSandboxDriverConfig::default(); + + PodmanComputeDriver::validate_gpu_request(Some(&gpu), &driver_config) + .expect("default GPU count shape should be accepted before inventory selection"); + } + + #[test] + fn validate_gpu_request_accepts_single_cdi_device_without_gpu_count() { + let gpu = GpuResourceRequirements { count: None }; + let mut driver_config = PodmanSandboxDriverConfig::default(); + driver_config.cdi_devices = Some(vec!["nvidia.com/gpu=0".to_string()]); + + PodmanComputeDriver::validate_gpu_request(Some(&gpu), &driver_config) + .expect("single exact CDI device should pass count validation"); + } + + #[test] + fn validate_gpu_request_rejects_multiple_cdi_devices_without_gpu_count() { + let gpu = GpuResourceRequirements { count: None }; + let mut driver_config = PodmanSandboxDriverConfig::default(); + driver_config.cdi_devices = Some(vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ]); + let err = PodmanComputeDriver::validate_gpu_request(Some(&gpu), &driver_config) + .expect_err("missing CDI device count should be rejected for multiple devices"); + + assert!(matches!(err, ComputeDriverError::InvalidArgument(_))); + assert!( + err.to_string() + .contains("gpu count (1) must match driver_config.cdi_devices length (2)") + ); + } + + #[test] + fn validate_gpu_request_rejects_cdi_devices_without_gpu_request() { + let mut driver_config = PodmanSandboxDriverConfig::default(); + driver_config.cdi_devices = Some(vec!["nvidia.com/gpu=0".to_string()]); + let err = PodmanComputeDriver::validate_gpu_request(None, &driver_config) + .expect_err("missing GPU request should be rejected"); + + assert!(matches!(err, ComputeDriverError::InvalidArgument(_))); + assert!(err.to_string().contains("requires a gpu request")); + } + + #[test] + fn validate_gpu_request_rejects_mismatched_cdi_device_count() { + let gpu = GpuResourceRequirements { count: Some(2) }; + let mut driver_config = PodmanSandboxDriverConfig::default(); + driver_config.cdi_devices = Some(vec!["nvidia.com/gpu=0".to_string()]); + let err = PodmanComputeDriver::validate_gpu_request(Some(&gpu), &driver_config) + .expect_err("mismatched CDI device count should be rejected"); + + assert!(matches!(err, ComputeDriverError::InvalidArgument(_))); + assert!( + err.to_string() + .contains("gpu count (2) must match driver_config.cdi_devices length (1)") + ); + } + // ── grpc_endpoint auto-detection ─────────────────────────────────── // // PodmanComputeDriver::new() fills grpc_endpoint when it is empty. @@ -996,7 +1102,7 @@ mod tests { ); let sandbox = DriverSandbox { spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -1016,7 +1122,7 @@ mod tests { ); let sandbox = DriverSandbox { spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -1035,7 +1141,7 @@ mod tests { ); let sandbox = DriverSandbox { spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -1053,7 +1159,7 @@ mod tests { let driver = PodmanComputeDriver::for_tests(PodmanComputeConfig::default()); let sandbox = DriverSandbox { spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), template: Some(DriverSandboxTemplate { driver_config: Some(cdi_devices_config(&["nvidia.com/gpu=0"])), ..Default::default() @@ -1078,7 +1184,7 @@ mod tests { id: "sbx-first".to_string(), name: "first".to_string(), spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -1087,35 +1193,35 @@ mod tests { id: "sbx-second".to_string(), name: "second".to_string(), spec: Some(DriverSandboxSpec { - gpu: true, + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() }; assert_eq!( - driver.peek_default_gpu_device(&first_sandbox).unwrap(), - Some("nvidia.com/gpu=0".to_string()) + driver.gpu_selector.peek_device_ids(1).unwrap(), + vec!["nvidia.com/gpu=0".to_string()] ); - let first_device = driver.next_default_gpu_device(&first_sandbox).unwrap(); - let first_spec = container::build_container_spec_with_token_and_gpu_default( + let first_devices = driver.gpu_selector.next_device_ids(1).unwrap(); + let first_spec = container::build_container_spec_with_token_and_gpu_devices( &first_sandbox, &driver.config, None, - first_device.as_deref(), + Some(&first_devices), ) .unwrap(); assert_eq!( - driver.peek_default_gpu_device(&second_sandbox).unwrap(), - Some("nvidia.com/gpu=1".to_string()) + driver.gpu_selector.peek_device_ids(1).unwrap(), + vec!["nvidia.com/gpu=1".to_string()] ); - let second_device = driver.next_default_gpu_device(&second_sandbox).unwrap(); - let second_spec = container::build_container_spec_with_token_and_gpu_default( + let second_devices = driver.gpu_selector.next_device_ids(1).unwrap(); + let second_spec = container::build_container_spec_with_token_and_gpu_devices( &second_sandbox, &driver.config, None, - second_device.as_deref(), + Some(&second_devices), ) .unwrap(); diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index f0c669b87..d5d9565e7 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -29,7 +29,9 @@ use oci_client::manifest::{ }; use oci_client::secrets::RegistryAuth; use oci_client::{Reference, RegistryOperation}; -use openshell_core::gpu::driver_gpu_requirements; +use openshell_core::gpu::{ + driver_gpu_requirements, effective_driver_gpu_count, validate_specific_gpu_device_request, +}; use openshell_core::progress::{ PROGRESS_STEP_PULLING_IMAGE, PROGRESS_STEP_REQUESTING_SANDBOX, PROGRESS_STEP_STARTING_SANDBOX, format_bytes, mark_progress_active, mark_progress_complete, mark_progress_detail, @@ -3113,13 +3115,23 @@ fn validate_gpu_request(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Stat .ok_or_else(|| Status::invalid_argument("sandbox spec is required"))?; let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); + let gpu_count = + effective_driver_gpu_count(gpu_requirements).map_err(Status::invalid_argument)?; - let _ = vm_gpu_device_id(sandbox)?; if gpu_requirements.is_some() && !gpu_enabled { return Err(Status::failed_precondition( "GPU support is not enabled on this driver; start with --gpu", )); } + + let _ = vm_gpu_device_id(sandbox)?; + + if gpu_count.is_some_and(|count| count > 1) { + return Err(Status::invalid_argument( + "VM GPU sandboxes support only one GPU", + )); + } + Ok(()) } @@ -3133,11 +3145,12 @@ fn vm_gpu_device_id(sandbox: &Sandbox) -> Result, Status> { .gpu_device_ids .unwrap_or_default(); let gpu_requirements = driver_gpu_requirements(spec.resource_requirements.as_ref()); - if gpu_requirements.is_none() && !gpu_device_ids.is_empty() { - return Err(Status::invalid_argument( - "driver_config.gpu_device_ids requires gpu=true", - )); - } + validate_specific_gpu_device_request( + gpu_requirements, + &gpu_device_ids, + "driver_config.gpu_device_ids", + ) + .map_err(Status::invalid_argument)?; if gpu_device_ids.len() > 1 { return Err(Status::invalid_argument( "vm driver currently supports at most one gpu_device_ids entry", @@ -5112,9 +5125,9 @@ mod tests { } } - fn gpu_resources() -> ResourceRequirements { + fn gpu_resources(count: Option) -> ResourceRequirements { ResourceRequirements { - gpu: Some(GpuResourceRequirements {}), + gpu: Some(GpuResourceRequirements { count }), } } @@ -5185,7 +5198,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -5196,12 +5209,32 @@ mod tests { assert!(err.message().contains("GPU support is not enabled")); } + #[test] + fn validate_vm_sandbox_rejects_missing_gpu_support_before_request_shape() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(Some(2))), + template: Some(SandboxTemplate { + driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0"])), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox, false) + .expect_err("missing GPU support should be rejected before request shape"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("GPU support is not enabled")); + } + #[test] fn validate_vm_sandbox_accepts_gpu_when_enabled() { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), ..Default::default() }), ..Default::default() @@ -5209,7 +5242,121 @@ mod tests { validate_vm_sandbox(&sandbox, true).expect("gpu should be accepted when enabled"); } - fn validate_vm_sandbox_rejects_gpu_device_without_gpu() { + #[test] + fn validate_vm_sandbox_accepts_gpu_count_one() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(Some(1))), + ..Default::default() + }), + ..Default::default() + }; + validate_vm_sandbox(&sandbox, true).expect("one GPU should be accepted when enabled"); + } + + #[test] + fn validate_vm_sandbox_accepts_single_gpu_device_without_gpu_count() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(None)), + template: Some(SandboxTemplate { + driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0"])), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + validate_vm_sandbox(&sandbox, true) + .expect("single exact GPU device should be compatible with a default GPU request"); + } + + #[test] + fn validate_vm_sandbox_rejects_multiple_gpu_device_ids_without_gpu_count() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(None)), + template: Some(SandboxTemplate { + driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0", "0000:31:00.0"])), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox, true) + .expect_err("multiple GPU device IDs without count should be rejected"); + + assert_eq!(err.code(), Code::InvalidArgument); + assert!( + err.message() + .contains("gpu count (1) must match driver_config.gpu_device_ids length (2)") + ); + } + + #[test] + fn validate_vm_sandbox_accepts_gpu_count_matching_device_id() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(Some(1))), + template: Some(SandboxTemplate { + driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0"])), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + validate_vm_sandbox(&sandbox, true) + .expect("matching explicit GPU device count should be accepted"); + } + + #[test] + fn validate_vm_sandbox_rejects_gpu_count_above_one() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(Some(2))), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox, true) + .expect_err("multiple GPU VM request should be rejected"); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("support only one GPU")); + } + + #[test] + fn validate_vm_sandbox_rejects_gpu_count_mismatched_device_id() { + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + resource_requirements: Some(gpu_resources(Some(2))), + template: Some(SandboxTemplate { + driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0"])), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox, true) + .expect_err("mismatched explicit GPU device count should be rejected"); + + assert_eq!(err.code(), Code::InvalidArgument); + assert!( + err.message() + .contains("gpu count (2) must match driver_config.gpu_device_ids length (1)") + ); + } + + #[test] + fn validate_vm_sandbox_rejects_gpu_device_without_gpu_request() { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { @@ -5222,9 +5369,9 @@ mod tests { ..Default::default() }; let err = validate_vm_sandbox(&sandbox, true) - .expect_err("gpu_device_ids without gpu should be rejected"); + .expect_err("gpu_device_ids without a GPU request should be rejected"); assert_eq!(err.code(), Code::InvalidArgument); - assert!(err.message().contains("gpu_device_ids requires gpu=true")); + assert!(err.message().contains("requires a gpu request")); } #[test] @@ -5232,7 +5379,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(Some(2))), template: Some(SandboxTemplate { driver_config: Some(gpu_device_ids_config(&["0000:2d:00.0", "0000:31:00.0"])), ..Default::default() @@ -5252,7 +5399,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(SandboxTemplate { driver_config: Some(gpu_device_ids_config(&[])), ..Default::default() @@ -5272,7 +5419,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(SandboxTemplate { driver_config: Some(gpu_device_id_typo_config(&["0000:2d:00.0"])), ..Default::default() @@ -5292,7 +5439,7 @@ mod tests { let sandbox = Sandbox { id: "sandbox-123".to_string(), spec: Some(SandboxSpec { - resource_requirements: Some(gpu_resources()), + resource_requirements: Some(gpu_resources(None)), template: Some(SandboxTemplate { agent_socket_path: "/tmp/agent.sock".to_string(), driver_config: Some(gpu_device_ids_config(&[])), diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 3dc28943f..8576e51b9 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -1506,7 +1506,7 @@ fn driver_sandbox_spec_from_public( gpu: requirements .gpu .as_ref() - .map(|_| DriverGpuResourceRequirements {}), + .map(|gpu| DriverGpuResourceRequirements { count: gpu.count }), } }), sandbox_token: String::new(), @@ -2092,7 +2092,7 @@ mod tests { fn driver_sandbox_spec_from_public_preserves_gpu_requirement() { let public = SandboxSpec { resource_requirements: Some(openshell_core::proto::ResourceRequirements { - gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: Some(2) }), }), ..Default::default() }; @@ -2105,7 +2105,7 @@ mod tests { .as_ref() .and_then(|requirements| requirements.gpu.as_ref()) .expect("driver GPU requirement should be set"); - assert_eq!(gpu, &DriverGpuResourceRequirements {}); + assert_eq!(gpu.count, Some(2)); } #[test] @@ -2615,7 +2615,7 @@ mod tests { &mut status, Some(&SandboxSpec { resource_requirements: Some(openshell_core::proto::ResourceRequirements { - gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: None }), }), ..Default::default() }), @@ -2976,7 +2976,7 @@ mod tests { let sandbox = Sandbox { spec: Some(SandboxSpec { resource_requirements: Some(openshell_core::proto::ResourceRequirements { - gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: None }), }), ..Default::default() }), diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 2ca71f7b3..b3680c6e7 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -134,6 +134,9 @@ pub(super) fn validate_sandbox_spec( validate_env_entries(&tmpl.environment, "spec.template.environment")?; } + // --- spec.resource_requirements.gpu --- + validate_gpu_request_fields(spec)?; + // --- spec.policy serialized size --- if let Some(ref policy) = spec.policy { let size = policy.encoded_len(); @@ -147,6 +150,14 @@ pub(super) fn validate_sandbox_spec( Ok(()) } +fn validate_gpu_request_fields(spec: &openshell_core::proto::SandboxSpec) -> Result<(), Status> { + if openshell_core::gpu::sandbox_gpu_count(spec.resource_requirements.as_ref()) == Some(0) { + return Err(Status::invalid_argument("gpu count must be greater than 0")); + } + + Ok(()) +} + /// Validate template-level field sizes. fn validate_sandbox_template(tmpl: &SandboxTemplate) -> Result<(), Status> { // String fields. @@ -761,13 +772,37 @@ mod tests { fn validate_sandbox_spec_accepts_gpu_flag() { let spec = SandboxSpec { resource_requirements: Some(openshell_core::proto::ResourceRequirements { - gpu: Some(openshell_core::proto::GpuResourceRequirements {}), + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: None }), + }), + ..Default::default() + }; + assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); + } + + #[test] + fn validate_sandbox_spec_accepts_gpu_count() { + let spec = SandboxSpec { + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: Some(2) }), }), ..Default::default() }; assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); } + #[test] + fn validate_sandbox_spec_rejects_zero_gpu_count() { + let spec = SandboxSpec { + resource_requirements: Some(openshell_core::proto::ResourceRequirements { + gpu: Some(openshell_core::proto::GpuResourceRequirements { count: Some(0) }), + }), + ..Default::default() + }; + let err = validate_sandbox_spec("gpu-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("gpu count must be greater than 0")); + } + #[test] fn validate_sandbox_spec_accepts_empty_defaults() { assert!(validate_sandbox_spec("", &default_spec()).is_ok()); diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index e76a4b1ca..c81afd66d 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -53,14 +53,29 @@ openshell sandbox create \ ``` Driver config is for fields without a stable public flag. Prefer `--cpu`, -`--memory`, and `--gpu` for supported resource intent. +`--memory`, and `--gpu` for supported resource intent. When `--gpu` is present +without a count, OpenShell treats it as a request for one GPU. Pass +`--gpu COUNT` when requesting more than one GPU. + +Kubernetes maps the GPU count to the `nvidia.com/gpu` pod resource limit. +Docker and Podman satisfy count-only GPU requests by selecting the requested +number of NVIDIA CDI devices from the local CDI inventory in round-robin order. +The drivers refresh the CDI inventory before validating or creating the +sandbox, so CDI devices added or removed after gateway start can affect later +creates. On WSL2 all-only runtimes, Docker or Podman can use +`nvidia.com/gpu=all` as a compatibility fallback, where it counts as one +selectable device. Exact GPU device selection remains driver-owned and requires `--gpu`. Docker -and Podman accept `cdi_devices`; replace the top-level `docker` key with -`podman` when using the Podman driver, for example -`{"docker":{"cdi_devices":["nvidia.com/gpu=0"]}}`. The VM driver accepts -`gpu_device_ids`, for example `{"vm":{"gpu_device_ids":["0000:2d:00.0"]}}`; -the current VM implementation accepts at most one entry. +and Podman accept `cdi_devices` as opaque CDI device names; replace the +top-level `docker` key with `podman` when using the Podman driver, for example +`{"docker":{"cdi_devices":["nvidia.com/gpu=0"]}}`. Explicit CDI device lists +must not contain duplicates, and their length must match the effective GPU +count. A single exact CDI device is compatible with the default `--gpu` +request. The VM driver accepts `gpu_device_ids`, for example +`{"vm":{"gpu_device_ids":["0000:2d:00.0"]}}`; the current VM implementation +accepts at most one entry and allows either `--gpu` or `--gpu 1` when +`gpu_device_ids` is set. For Kubernetes, `pod.runtime_class_name` maps to PodSpec `runtimeClassName`. It overrides the gateway's configured default runtime class for that sandbox, diff --git a/docs/sandboxes/manage-sandboxes.mdx b/docs/sandboxes/manage-sandboxes.mdx index efb2a6b5a..fdbf497f5 100644 --- a/docs/sandboxes/manage-sandboxes.mdx +++ b/docs/sandboxes/manage-sandboxes.mdx @@ -70,17 +70,33 @@ To request GPU resources, add `--gpu`: openshell sandbox create --gpu -- claude ``` +Request a specific number of GPUs by passing a count to `--gpu`: + +```shell +openshell sandbox create --gpu 2 -- claude +``` + +When you omit the count, OpenShell treats the request as `--gpu 1`. +Kubernetes honors counted `--gpu` requests by setting the `nvidia.com/gpu` +limit. Docker and Podman select the requested number of default NVIDIA CDI +devices in round-robin order. VM gateways accept only one GPU, either through +`--gpu` or `--gpu 1`; a single `gpu_device_ids` entry works with either form. + For Docker-backed sandboxes, GPU injection uses Docker CDI. If you enable Docker CDI after the gateway starts, restart the gateway so OpenShell can detect the updated Docker daemon capability. -Docker and Podman gateways map a default `--gpu` request to one concrete -NVIDIA CDI device when individual CDI devices are available. On WSL2 all-only -runtimes, the default can fall back to `nvidia.com/gpu=all`. +Docker and Podman refresh the CDI inventory before validating or creating a +default GPU request, so CDI devices added or removed after the driver starts can +be reflected in later sandbox creates. On WSL2 all-only runtimes, the default +can fall back to `nvidia.com/gpu=all`; that fallback counts as one selectable +device. Exact GPU device selection is driver-specific and still requires `--gpu`. For Docker or Podman, pass CDI IDs through `cdi_devices`. The top-level key must -match the active driver; replace `docker` with `podman` when using Podman: +match the active driver; replace `docker` with `podman` when using Podman. CDI +IDs are treated as opaque strings. The list must not contain duplicate IDs, and +its length must match the effective GPU count: ```shell openshell sandbox create \ diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index bfff3d42d..f471f575a 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -103,7 +103,11 @@ message ResourceRequirements { } // Driver GPU resource requirements. -message GpuResourceRequirements {} +message GpuResourceRequirements { + // Optional number of GPUs requested. When omitted, the request is for one + // GPU using the selected driver's default assignment behavior. + optional uint32 count = 1; +} // Driver-owned runtime template consumed by the compute platform. // diff --git a/proto/openshell.proto b/proto/openshell.proto index b7884f6e7..c5dad4140 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -336,7 +336,11 @@ message ResourceRequirements { } // Public GPU resource requirements. -message GpuResourceRequirements {} +message GpuResourceRequirements { + // Optional number of GPUs requested. When omitted, the request is for one + // GPU using the selected driver's default assignment behavior. + optional uint32 count = 1; +} // Public sandbox template mapped onto compute-driver template inputs. message SandboxTemplate { diff --git a/rfc/0004-sandbox-resource-requirements/README.md b/rfc/0004-sandbox-resource-requirements/README.md index 01b4319dd..c2ceae778 100644 --- a/rfc/0004-sandbox-resource-requirements/README.md +++ b/rfc/0004-sandbox-resource-requirements/README.md @@ -7,6 +7,8 @@ links: - https://github.com/NVIDIA/OpenShell/pull/1340 - https://github.com/NVIDIA/OpenShell/pull/1360 - https://github.com/NVIDIA/OpenShell/issues/1492 + - https://github.com/NVIDIA/OpenShell/pull/1675 + - https://github.com/NVIDIA/OpenShell/pull/1815 --- # RFC 0004 - Sandbox Resource Requirements @@ -26,6 +28,11 @@ then relies on the selected driver to validate and provision the request. `SandboxTemplate.resources` remains a platform-native realization layer and escape hatch. It is not the portable driver-selection interface. +The current implementation lands the first GPU-focused phase of this model: +`resource_requirements.gpu` replaces the legacy GPU-specific request fields, +with an optional count. The broader generic compute/device/dataset model in +this RFC remains the target direction for later expansion. + ## Motivation OpenShell currently treats GPU placement as a special case. The public @@ -59,6 +66,12 @@ configuration remains separate and is interpreted by the resource driver. Exposing a general-purpose driver-specific configuration surface is related, but tracked separately in issue #1492. +Since this RFC was first drafted, `SandboxTemplate.driver_config` has been +added as the driver-owned configuration surface. The gateway selects the block +for the active driver and forwards that inner object unchanged. Exact GPU +selection for Docker, Podman, and VM is now expressed there rather than in the +portable resource requirement. + ## Non-goals - Defining dataset allocation, mount, caching, or access-control semantics. @@ -69,8 +82,10 @@ tracked separately in issue #1492. - Defining the general driver-specific configuration passthrough API. Issue #1492 tracks that related API surface. - Publishing allocated resource identities in sandbox status. -- Preserving long-term compatibility for `gpu`, `gpu_device`, or a - GPU-specific `gpu_count` request field. +- Preserving alpha-era compatibility for `gpu`, `gpu_device`, or a + GPU-specific `gpu_count` request field. The legacy GPU-specific request + fields are intentionally not carried forward into the API shape this RFC + aims to stabilize. ## Proposal @@ -89,26 +104,52 @@ message SandboxSpec { // Portable resource requirements used by the gateway for driver selection // and by drivers for provisioning. - SandboxResourceRequirements resource_requirements = 11; + ResourceRequirements resource_requirements = 9; - reserved 9, 10; - reserved "gpu", "gpu_device"; + reserved 10; + reserved "gpu_device"; } ``` +The public sandbox API is still alpha. This migration intentionally replaces +the old `bool gpu = 9` field with the typed `resource_requirements = 9` message +instead of reserving the legacy field number. Old live requests and persisted +sandbox records that encode GPU intent through the legacy boolean are not +migrated; callers should use a matching OpenShell CLI/API version and recreate +GPU sandboxes after upgrade when they need the new typed shape. Avoiding +alpha-era reserved fields keeps the proto surface closer to the API intended +for stabilization. + `SandboxTemplate.resources` keeps its existing role as platform-native workload configuration. It may contain Kubernetes-style CPU, memory, and extended resource requests and limits, but it is not the portable resource contract. +The implemented first phase uses this narrower proto shape: + +```proto +message ResourceRequirements { + // GPU requirements for the sandbox. Presence indicates a GPU request. + GpuResourceRequirements gpu = 1; +} + +message GpuResourceRequirements { + // Optional number of GPUs requested. When omitted, the request is for one + // GPU using the selected driver's default assignment behavior. + optional uint32 count = 1; +} +``` + +`gpu.count` must be greater than zero when present. When `gpu` is present and +`count` is omitted, the effective count is one. Drivers apply that same +effective-count rule when validating exact driver-config device lists. + The CLI should not expose a JSON flag for `resource_requirements`. Common portable requests should use typed flags such as CPU, memory, and GPU-count flags, and SDK/API callers should use the typed protobuf messages directly. -JSON-formatted driver-specific configuration is a related but separate API -topic. Issue #1492 tracks exposing an opaque driver-owned configuration surface, -potentially named `driver_config`. This RFC only requires that driver-native -configuration remains separate from portable resource requirements. +JSON-formatted driver-specific configuration remains separate from portable +resource requirements and is passed through `SandboxTemplate.driver_config`. -### Resource requirements +### Long-term resource requirements Use typed requirement domains for stable first-party resource concepts instead of making every request stringly typed through a `kind` field. @@ -250,6 +291,11 @@ Portable GPU semantics are limited to: - `count` - exact-match attributes in `selector.match_attributes` +In the current GPU-focused implementation, only the `gpu.count` portion is +implemented in `ResourceRequirements`. Exact device IDs are not portable +selector attributes yet; they remain driver-owned config. Docker and Podman use +`driver_config.cdi_devices`, and VM uses `driver_config.gpu_device_ids`. + Driver-native GPU details are expressed through namespaced parameters. Example parameter namespaces: @@ -317,10 +363,23 @@ Example realizations: | Driver | Realization | |---|---| | Kubernetes | Convert `className=gpu,count=N` into a pod resource limit such as `limits["nvidia.com/gpu"] = "N"` unless Kubernetes-specific parameters select another resource name or class. | -| Docker | Convert CDI parameters into Docker CDI device injection. For a count-only request, select an available CDI GPU device when device inventory is available. | -| Podman | Convert CDI parameters into Podman CDI device injection. For a count-only request, select an available CDI GPU device when device inventory is available. | +| Docker | Use exact `driver_config.cdi_devices` values when present; otherwise select `count` default CDI GPU IDs from the discovered inventory. | +| Podman | Use exact `driver_config.cdi_devices` values when present; otherwise select `count` default CDI GPU IDs from the discovered inventory. | | VM | Convert VM parameters into BDF or index-based device assignment. | +Docker and Podman default selection is round-robin over the normalized NVIDIA +CDI inventory. Indexed IDs are preferred and sorted numerically; named IDs are +used when no indexed IDs exist. On WSL2 all-only runtimes, `nvidia.com/gpu=all` +can be used as a fallback and counts as one selectable device. The selector +refreshes its inventory before validating and creating default GPU requests so +CDI devices added or removed after driver startup can affect later creates. + +Explicit CDI IDs are opaque. The drivers do not parse aliases, normalize +`nvidia.com/gpu=all`, or treat it specially on the explicit path. Exact ID +lists must not contain duplicates and their length must match the effective GPU +count. A list of two explicit IDs with `--gpu` and no count fails because the +effective count is one. + Docker and Podman should not interpret VM BDF/index parameters. The VM driver should not interpret CDI parameters. Gateway namespace prefiltering should avoid sending clearly incompatible requests to those drivers. @@ -505,17 +564,22 @@ other. ### Related driver-specific configuration Driver-specific configuration is intentionally separate from portable resource -requirements. Issue #1492 tracks an opaque driver-owned configuration surface -for backend-native settings such as Kubernetes node selectors, tolerations, -image pull secrets, Docker network mode, or VM disk shape. A future design may -place that surface on `SandboxTemplate.driver_config` and may use a namespaced -map-of-maps shape, but this RFC does not standardize that API or its CLI flags. +requirements. The current API uses `SandboxTemplate.driver_config` as an +opaque envelope keyed by driver name. The gateway selects the block for the +active driver and forwards that inner object to the compute driver; the driver +owns nested schema validation. + +`driver_config` is the right place for backend-native settings such as +Kubernetes node selectors, tolerations, image pull secrets, Docker or Podman +mounts, and exact GPU IDs. These settings are not portable resource +requirements, and the gateway must not interpret them as a scheduling contract. This RFC does not introduce `--resources-json`, `--resource-requirements-json`, or `--template-resources-json`. CPU, memory, GPU count, and exact GPU selection -should use typed resource fields or typed CLI flags. Backend-native settings -that are not modeled by `resource_requirements` should remain driver-specific -and should not be presented as portable resource requests. +should use typed resource fields, typed CLI flags, or the selected driver's +documented `driver_config` fields. Backend-native settings that are not modeled +by `resource_requirements` should remain driver-specific and should not be +presented as portable resource requests. ### Template realization and conflicts @@ -551,21 +615,43 @@ message DriverSandboxSpec { string log_level = 1; map environment = 5; DriverSandboxTemplate template = 6; - DriverSandboxResourceRequirements resource_requirements = 11; + ResourceRequirements resource_requirements = 9; - reserved 9, 10; - reserved "gpu", "gpu_device"; + reserved 10; + reserved "gpu_device"; } ``` Driver-owned resource requirement messages should have the same semantics as the public messages, but live in `compute_driver.proto` to keep the public and -internal contracts separated. +internal contracts separated. In the current implementation, the driver proto +mirrors the narrow GPU-focused shape: + +```proto +message ResourceRequirements { + GpuResourceRequirements gpu = 1; +} + +message GpuResourceRequirements { + optional uint32 count = 1; +} +``` + +The compute-driver API is version-coupled to the gateway in current deployments: +local drivers are launched by the gateway at startup, and the driver proto is +not treated as a public compatibility surface. It follows the same alpha-era +field replacement as the public API rather than preserving transitional GPU +fields. ### Driver capabilities -Replace GPU-specific capability fields with coarse resource capability -summaries: +The current implementation removes the GPU-specific `supports_gpu` and +`gpu_count` capability fields from the compute-driver proto. It does not yet +replace them with a stable resource capability summary. Until that lands, the +selected driver's `ValidateSandboxCreate` remains the authority for whether a +GPU request can be served. + +A later phase should add coarse resource capability summaries: ```proto message GetCapabilitiesResponse { @@ -638,30 +724,44 @@ or if driver-specific validation rejects parameter values. When no resource requirements are present, the gateway should preserve today's default behavior and use the configured default driver. -## Implementation plan - -1. Update public protobufs to add `SandboxResourceRequirements` and remove the - long-term use of `gpu` and `gpu_device`. -2. Update compute-driver protobufs with mirrored driver-owned resource - requirements and coarse resource capability summaries. -3. Update gateway validation and public-to-driver translation. -4. Add validation that rejects conflicts between portable resource requirements - and template resource passthrough. -5. Allow the gateway to consider multiple configured compute drivers for a - create request, using capability prefiltering plus `ValidateSandboxCreate`. -6. Update Kubernetes, Docker, Podman, and VM drivers to advertise compute and - GPU device capability summaries and interpret their supported parameter - namespaces. -7. Update CLI/API request construction so CPU, memory, GPU count, and exact GPU - selection use resource requirements instead of GPU-specific request fields. -8. Do not expose JSON-formatted portable resource request flags. -9. Update user-facing docs and driver README files once behavior is - implemented. - -Because this is a breaking request-spec change, the implementation must either -land in a breaking API version or be explicitly called out as a breaking change -for the current API. Removed protobuf tags should be reserved rather than -reused. +## Implementation state and plan + +Implemented in the first phase: + +1. Replace the public and driver `gpu`/`gpu_device` request fields with + `resource_requirements.gpu`. +2. Treat a present GPU requirement with omitted `count` as an effective request + for one GPU, and reject `count = 0`. +3. Pass GPU requirements through gateway-to-driver create and validation paths. +4. Keep exact device IDs in `driver_config`: + `driver_config.cdi_devices` for Docker and Podman, and + `driver_config.gpu_device_ids` for VM. +5. Reject duplicate exact device IDs and require exact device list length to + match the effective GPU count. +6. Let Kubernetes map GPU count to the `nvidia.com/gpu` limit. +7. Let Docker and Podman satisfy count-only GPU requests by selecting default + CDI devices from a refreshed local inventory. +8. Remove GPU-specific capability fields from the compute-driver proto without + adding replacement resource capability summaries yet. +9. Do not expose JSON-formatted portable resource request flags. + +Remaining follow-up work: + +1. Add the broader compute/device/dataset resource model when those semantics + are stable enough for the public API. +2. Add coarse resource capability summaries and gateway matching across + multiple configured drivers. +3. Add conflict validation between portable resource requirements and + platform-native template resource passthrough where both can express the + same resource. +4. Decide whether and how drivers should expose inventory or capacity for + advisory matching without turning it into a reservation API. + +Because the public and driver APIs are still alpha, the first implementation +intentionally breaks old live and persisted GPU intent instead of preserving +compatibility shims for the legacy GPU fields. Callers should use a matching +OpenShell CLI/API version and recreate GPU sandboxes after upgrade when they +need the new typed request shape. ## Tests @@ -669,23 +769,27 @@ The implementation should include: - protobuf translation tests for public resource requirements into driver resource requirements. -- gateway matching tests for compute capability support, device class, count, - selector, and parameter namespace filtering. -- gateway tests showing that the selected driver is the first validating - candidate in configured order. -- validation tests for conflicts between resource requirements and template - resource passthrough. -- validation tests that unsupported parameter namespaces are rejected. +- shared helper tests showing that omitted GPU count is effective count one and + explicit zero count is rejected. +- validation tests for duplicate exact device IDs and for mismatched exact + device-list length versus effective GPU count. - Kubernetes tests that map compute requirements to pod CPU/memory resources and GPU count to `nvidia.com/gpu` limits. -- Docker and Podman GPU e2e tests that request a CDI GPU with - `cdi.openshell.ai`. +- Docker and Podman selector tests for numeric CDI ID ordering, named CDI ID + fallback, count-based round-robin selection, all-only fallback, insufficient + devices, and inventory refresh without cursor reset. +- Docker and Podman validation tests showing that explicit CDI IDs bypass + default inventory selection and remain opaque. - VM tests that map CPU/memory to VM allocation and request a GPU by BDF or - index with `vm.openshell.ai`. -- tests showing that template-only resources are treated as platform-native - passthrough and are not used for portable driver matching. + index through `driver_config.gpu_device_ids`. - CLI request-shape tests showing that there is no JSON-formatted portable resource request flag. +- future gateway matching tests for compute capability support, device class, + count, selector, and parameter namespace filtering once resource capability + summaries are added. +- future validation tests for conflicts between resource requirements and + template resource passthrough once those conflicts can occur through stable + portable fields. - error-message tests for no matching driver and validation failure across all candidates.