From 7a152fa398fdf056b983f86e1936b02c532c1897 Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Tue, 23 Jun 2026 08:06:36 -0400 Subject: [PATCH 1/2] fix(sandbox): make drop_privileges idempotent When the current euid/egid already equal the policy's resolved target (e.g. a container entrypoint pre-dropped before exec'ing the sandbox), skip initgroups(3), which otherwise fails without CAP_SETGID. No behavioural change when a privilege drop is actually needed. Signed-off-by: Davanum Srinivas --- crates/openshell-supervisor-process/src/process.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index 9f9fe1822..e4f560bd6 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -934,6 +934,13 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { .ok_or_else(|| miette::miette!("Failed to resolve user primary group"))? }; + // Idempotent fast-path: if euid/egid already match the target (e.g. a + // container entrypoint pre-dropped before exec'ing the sandbox), skip + // initgroups(3), which would otherwise fail without CAP_SETGID. + if nix::unistd::geteuid() == user.uid && nix::unistd::getegid() == group.gid { + return Ok(()); + } + if user_name.is_some() { let user_cstr = CString::new(user.name.clone()).map_err(|_| miette::miette!("Invalid user name"))?; From 5112fca71d52fab22d8823cc8f2e3fb776abd17c Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Tue, 23 Jun 2026 08:06:37 -0400 Subject: [PATCH 2/2] feat(sandbox): operator-declared skippable bootstrap subsystems The supervisor performs three privileged startup steps an outer sandbox (gVisor, Firecracker, Kata) may own instead: network-namespace creation, the supervisor seccomp prelude, and the workload seccomp filter. On bare metal all three are attempted and a host refusal is fatal. 
Add --skip-bootstrap (env OPENSHELL_SKIP_BOOTSTRAP): a comma-separated list of subsystems (netns, supervisor-seccomp, workload-seccomp, or all) the operator declares the environment owns. A skipped subsystem is not attempted; a subsystem that is NOT skipped and fails stays fatal. Empty (the default) attempts all three and aborts on any failure -- byte-identical to upstream. This configures the supervisor from the deployment environment instead of catching refusals at runtime: under an outer sandbox the operator declares 'the runtime owns netns/seccomp', so those steps are skipped and every other failure remains a genuine error. Per #1650 the config + the three call sites (run.rs, netns/mod.rs, sandbox/linux/mod.rs) live in openshell-supervisor-process, re-exported from openshell-sandbox. Signed-off-by: Davanum Srinivas --- crates/openshell-sandbox/src/lib.rs | 3 + crates/openshell-sandbox/src/main.rs | 18 ++ .../openshell-supervisor-process/src/lib.rs | 156 ++++++++++++++++++ .../src/netns/mod.rs | 8 + .../openshell-supervisor-process/src/run.rs | 9 +- .../src/sandbox/linux/mod.rs | 9 +- 6 files changed, 201 insertions(+), 2 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index b5232bce9..82a257344 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -62,6 +62,9 @@ use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_network::opa::OpaEngine; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; use openshell_supervisor_process::skills; +pub use openshell_supervisor_process::{ + BootstrapSubsystem, parse_skip_bootstrap, set_skipped_bootstrap, +}; use tokio::sync::mpsc::UnboundedSender; /// Run a command in the sandbox. 
diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 91b145c2e..8ab598399 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -152,6 +152,15 @@ struct Args { /// enforcement runs in another pod. #[arg(long, default_value = DEFAULT_MODE)] mode: Mode, + + /// Bootstrap subsystems the environment (e.g. an outer sandbox like gVisor) + /// owns, so the supervisor SKIPS attempting them instead of failing on the + /// host's refusal. Comma-separated: `netns`, `supervisor-seccomp`, + /// `workload-seccomp`, or `all`. Empty (the default) attempts all three and + /// treats any failure as fatal — byte-identical to upstream. A subsystem + /// that is NOT skipped and fails is still fatal. + #[arg(long, value_delimiter = ',', env = "OPENSHELL_SKIP_BOOTSTRAP")] + skip_bootstrap: Vec<String>, } /// Copy the running executable to `dest`, creating parent directories as @@ -222,6 +231,15 @@ fn main() -> Result<()> { let args = Args::parse(); + // Operator-declared skips: bootstrap subsystems the environment owns are + // not attempted; everything else stays fatal-on-failure (the upstream + // default). Done before run_sandbox so the declaration is in place before + // any bootstrap step runs. Empty (default) skips nothing. + if !args.skip_bootstrap.is_empty() { + let skipped = openshell_sandbox::parse_skip_bootstrap(&args.skip_bootstrap)?; + openshell_sandbox::set_skipped_bootstrap(skipped); + } + // Try to open a rolling log file; fall back to stderr-only logging if it fails // (e.g., /var/log is not writable in custom workload images). // Rotates daily, keeps the 3 most recent files to bound disk usage. 
diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 1a3efd733..1538a5053 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -23,3 +23,159 @@ pub mod supervisor_session; pub mod bypass_monitor; #[cfg(target_os = "linux")] pub mod netns; + +use miette::Result; +use std::sync::OnceLock; + +// Operator-declared bootstrap policy. +// +// The supervisor performs three privileged startup steps that an outer sandbox +// (gVisor, Firecracker, Kata) may own instead of the supervisor: network +// namespace creation, the supervisor seccomp prelude, and the workload seccomp +// filter. On bare-metal Linux all three are attempted and a host refusal is +// fatal. When the operator declares — via `--skip-bootstrap` / +// `OPENSHELL_SKIP_BOOTSTRAP` — that the environment owns one of them, the +// supervisor SKIPS it (never attempts it). Any subsystem that is NOT skipped +// and fails is still fatal. The default (skip nothing) is byte-identical to +// upstream: attempt everything, abort on any failure. + +/// A privileged bootstrap step the supervisor performs at startup. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BootstrapSubsystem { + /// `unshare(CLONE_NEWNET)` + the veth/nftables setup behind the proxy. + NetworkNamespace, + /// The supervisor seccomp prelude (`apply_supervisor_startup_hardening`). + SupervisorSeccomp, + /// The workload per-policy seccomp filter in `sandbox::linux::enforce`. + WorkloadSeccomp, +} + +impl BootstrapSubsystem { + /// Stable short name, used in `--skip-bootstrap` tokens and logs. + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + Self::NetworkNamespace => "netns", + Self::SupervisorSeccomp => "supervisor-seccomp", + Self::WorkloadSeccomp => "workload-seccomp", + } + } + + /// Parse an operator-facing token. Case-insensitive; accepts the short and + /// long spellings. 
Returns `None` for an unknown token. + #[must_use] + pub fn parse_token(token: &str) -> Option<Self> { + match token.trim().to_ascii_lowercase().as_str() { + "netns" | "network-namespace" => Some(Self::NetworkNamespace), + "supervisor-seccomp" | "supervisor_seccomp" => Some(Self::SupervisorSeccomp), + "workload-seccomp" | "workload_seccomp" => Some(Self::WorkloadSeccomp), + _ => None, + } + } +} + +/// Set-once skip declaration. Unset (the default) skips nothing. +static SKIPPED_BOOTSTRAP: OnceLock<[bool; 3]> = OnceLock::new(); + +/// Declare which bootstrap subsystems the environment owns, so the supervisor +/// skips attempting them. Call once at process start, before the supervisor +/// boots; a second call is ignored. +pub fn set_skipped_bootstrap(subsystems: impl IntoIterator<Item = BootstrapSubsystem>) { + let mut skip = [false; 3]; + for subsystem in subsystems { + skip[subsystem as usize] = true; + } + let _ = SKIPPED_BOOTSTRAP.set(skip); +} + +/// Parse operator tokens (`--skip-bootstrap` values / `OPENSHELL_SKIP_BOOTSTRAP`). +/// +/// `all` skips every subsystem; otherwise each token must name one (see +/// [`BootstrapSubsystem::parse_token`]). Empty/blank tokens are ignored; +/// empty input skips nothing. +/// +/// # Errors +/// Returns an error naming the offending token if it is not `all` or a known +/// subsystem. 
+pub fn parse_skip_bootstrap<I, S>(tokens: I) -> Result<Vec<BootstrapSubsystem>> +where + I: IntoIterator<Item = S>, + S: AsRef<str>, +{ + let mut skip = Vec::new(); + for token in tokens { + let token = token.as_ref().trim(); + if token.is_empty() { + continue; + } + if token.eq_ignore_ascii_case("all") { + return Ok(vec![ + BootstrapSubsystem::NetworkNamespace, + BootstrapSubsystem::SupervisorSeccomp, + BootstrapSubsystem::WorkloadSeccomp, + ]); + } + match BootstrapSubsystem::parse_token(token) { + Some(subsystem) => skip.push(subsystem), + None => { + return Err(miette::miette!( + "unknown --skip-bootstrap subsystem '{token}' \ + (expected: netns, supervisor-seccomp, workload-seccomp, or all)" + )); + } + } + } + Ok(skip) +} + +/// Whether the operator declared `subsystem` as environment-owned. A skipped +/// subsystem is not attempted; a non-skipped subsystem's failure stays fatal. +pub(crate) fn bootstrap_skipped(subsystem: BootstrapSubsystem) -> bool { + SKIPPED_BOOTSTRAP + .get() + .is_some_and(|skip| skip[subsystem as usize]) +} + +#[cfg(test)] +mod bootstrap_tests { + use super::{BootstrapSubsystem, parse_skip_bootstrap}; + + const ALL: [BootstrapSubsystem; 3] = [ + BootstrapSubsystem::NetworkNamespace, + BootstrapSubsystem::SupervisorSeccomp, + BootstrapSubsystem::WorkloadSeccomp, + ]; + + #[test] + fn parse_all_expands_to_every_subsystem() { + assert_eq!(parse_skip_bootstrap(["all"]).unwrap().len(), 3); + } + + #[test] + fn parse_named_subset_preserves_order() { + let got = parse_skip_bootstrap(["netns", "workload-seccomp"]).unwrap(); + assert_eq!( + got, + vec![ + BootstrapSubsystem::NetworkNamespace, + BootstrapSubsystem::WorkloadSeccomp + ] + ); + } + + #[test] + fn parse_skips_blanks_and_rejects_unknown() { + assert!(parse_skip_bootstrap(["", " "]).unwrap().is_empty()); + assert!(parse_skip_bootstrap(["bogus"]).is_err()); + } + + #[test] + fn token_roundtrips_via_as_str() { + for subsystem in ALL { + assert_eq!( + BootstrapSubsystem::parse_token(subsystem.as_str()), + Some(subsystem) + ); + } 
+ } +} diff --git a/crates/openshell-supervisor-process/src/netns/mod.rs b/crates/openshell-supervisor-process/src/netns/mod.rs index d2242b1c1..c79b53a8f 100644 --- a/crates/openshell-supervisor-process/src/netns/mod.rs +++ b/crates/openshell-supervisor-process/src/netns/mod.rs @@ -400,6 +400,14 @@ pub fn create_netns_for_proxy( if !matches!(policy.network.mode, NetworkMode::Proxy) { return Ok(None); } + if crate::bootstrap_skipped(crate::BootstrapSubsystem::NetworkNamespace) { + tracing::warn!( + subsystem = "netns", + "Skipping network namespace creation (--skip-bootstrap: outer sandbox owns it); \ + proxy egress is cooperating-client only, not bypass-proof, in this mode" + ); + return Ok(None); + } match NetworkNamespace::create() { Ok(ns) => { let proxy_port = policy diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 5a5c203a2..8da5c0fb8 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -89,7 +89,14 @@ pub async fn run_process( // tasks. By this point the orchestrator has finished privileged startup // helpers (network namespace setup, nftables probes via run_networking), // and the SSH listener and entrypoint child have not been exposed yet. - crate::sandbox::apply_supervisor_startup_hardening()?; + if crate::bootstrap_skipped(crate::BootstrapSubsystem::SupervisorSeccomp) { + tracing::warn!( + subsystem = "supervisor-seccomp", + "Skipping supervisor seccomp prelude (--skip-bootstrap: outer sandbox owns it)" + ); + } else { + crate::sandbox::apply_supervisor_startup_hardening()?; + } // Spawn the bypass detection monitor. 
It tails dmesg for nftables LOG // entries fired by rules installed on the workload's network namespace diff --git a/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs b/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs index b5397ef07..9b710d89c 100644 --- a/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs +++ b/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs @@ -38,7 +38,14 @@ pub fn enforce(prepared: PreparedSandbox) -> Result<()> { if let Some(ruleset) = prepared.landlock { landlock::enforce(ruleset)?; } - seccomp::apply(&prepared.policy)?; + if crate::bootstrap_skipped(crate::BootstrapSubsystem::WorkloadSeccomp) { + tracing::warn!( + subsystem = "workload-seccomp", + "Skipping workload seccomp filter (--skip-bootstrap: outer sandbox owns it)" + ); + } else { + seccomp::apply(&prepared.policy)?; + } Ok(()) }