diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs
index b5232bce9..c09812778 100644
--- a/crates/openshell-sandbox/src/lib.rs
+++ b/crates/openshell-sandbox/src/lib.rs
@@ -61,6 +61,10 @@ use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPol
 use openshell_core::provider_credentials::ProviderCredentialState;
 use openshell_supervisor_network::opa::OpaEngine;
 pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus};
+pub use openshell_supervisor_process::{
+    BestEffortHandler, SandboxFailureHandler, SandboxFailureKind, StrictHandler,
+    best_effort_handler_from_tokens, set_failure_handler,
+};
 use openshell_supervisor_process::skills;
 use tokio::sync::mpsc::UnboundedSender;
 
diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs
index 91b145c2e..a4b59fbbf 100644
--- a/crates/openshell-sandbox/src/main.rs
+++ b/crates/openshell-sandbox/src/main.rs
@@ -152,6 +152,15 @@ struct Args {
     /// enforcement runs in another pod.
     #[arg(long, default_value = DEFAULT_MODE)]
     mode: Mode,
+
+    /// Bootstrap subsystems to run in best-effort (degraded) mode instead of
+    /// aborting when the host or an outer sandbox refuses them. Comma-separated:
+    /// `netns`, `supervisor-seccomp`, `workload-seccomp`, or `all`. Empty
+    /// (the default) is strict — every refusal is fatal, matching upstream.
+    /// Set this when running under an outer sandbox (e.g. gVisor) that owns the
+    /// corresponding boundary and is expected to refuse these syscalls.
+    #[arg(long, value_delimiter = ',', env = "OPENSHELL_BEST_EFFORT_BOOTSTRAP")]
+    best_effort_bootstrap: Vec<String>,
 }
 
 /// Copy the running executable to `dest`, creating parent directories as
@@ -222,6 +231,18 @@ fn main() -> Result<()> {
 
     let args = Args::parse();
 
+    // Operator-declared degraded mode: when --best-effort-bootstrap names one
+    // or more subsystems (or `all`), install a BestEffortHandler that tolerates
+    // exactly those bootstrap refusals. Empty (default) leaves the lazy
+    // StrictHandler in place, so behaviour is byte-identical to upstream unless
+    // the operator opts in. Done before run_sandbox so the handler is set
+    // before any bootstrap step can fail.
+    if !args.best_effort_bootstrap.is_empty() {
+        let handler = openshell_sandbox::best_effort_handler_from_tokens(&args.best_effort_bootstrap)?;
+        // Ignore an already-set slot: nothing else registers in this binary.
+        let _ = openshell_sandbox::set_failure_handler(Box::new(handler));
+    }
+
     // Try to open a rolling log file; fall back to stderr-only logging if it fails
     // (e.g., /var/log is not writable in custom workload images).
     // Rotates daily, keeps the 3 most recent files to bound disk usage.
diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs
index 1a3efd733..480d6c3c4 100644
--- a/crates/openshell-supervisor-process/src/lib.rs
+++ b/crates/openshell-supervisor-process/src/lib.rs
@@ -23,3 +23,251 @@ pub mod supervisor_session;
 pub mod bypass_monitor;
 #[cfg(target_os = "linux")]
 pub mod netns;
+
+use miette::Result;
+use std::sync::OnceLock;
+
+// Pluggable policy for bootstrap subsystems the host kernel may refuse
+// (netns create, supervisor seccomp, workload seccomp). Default
+// `StrictHandler` aborts; outer-sandbox integrations register their own
+// via `set_failure_handler`.
+
+/// Which bootstrap subsystem failed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SandboxFailureKind {
+    /// `unshare(CLONE_NEWNET)` or a follow-up netns op refused by the kernel.
+    NetworkNamespaceCreate,
+    /// Supervisor seccomp prelude install failed.
+    SupervisorSeccompInstall,
+    /// Workload per-policy seccomp filter failed in `sandbox::linux::enforce`.
+    WorkloadSeccompInstall,
+}
+
+/// Policy for handling bootstrap refusals. `Ok(())` continues in degraded
+/// mode; `Err` aborts. Invoked synchronously — do not block.
+pub trait SandboxFailureHandler: Send + Sync + 'static {
+    fn handle(&self, kind: SandboxFailureKind, err: miette::Report) -> Result<()>;
+}
+
+/// Default handler — every refusal aborts.
+pub struct StrictHandler;
+
+impl SandboxFailureHandler for StrictHandler {
+    fn handle(&self, _kind: SandboxFailureKind, err: miette::Report) -> Result<()> {
+        Err(err)
+    }
+}
+
+impl SandboxFailureKind {
+    /// Stable short name, used in operator tokens and degraded-mode logs.
+    #[must_use]
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::NetworkNamespaceCreate => "network-namespace",
+            Self::SupervisorSeccompInstall => "supervisor-seccomp",
+            Self::WorkloadSeccompInstall => "workload-seccomp",
+        }
+    }
+
+    /// Parse an operator-facing subsystem token (from `--best-effort-bootstrap`
+    /// / `OPENSHELL_BEST_EFFORT_BOOTSTRAP`). Case-insensitive; accepts the
+    /// short and long spellings. Returns `None` for an unknown token.
+    #[must_use]
+    pub fn parse_token(token: &str) -> Option<Self> {
+        match token.trim().to_ascii_lowercase().as_str() {
+            "netns" | "network-namespace" => Some(Self::NetworkNamespaceCreate),
+            "supervisor-seccomp" | "supervisor_seccomp" => Some(Self::SupervisorSeccompInstall),
+            "workload-seccomp" | "workload_seccomp" => Some(Self::WorkloadSeccompInstall),
+            _ => None,
+        }
+    }
+}
+
+/// Handler that tolerates an operator-declared set of bootstrap refusals —
+/// logging a warning and continuing in degraded mode for those kinds, while
+/// still aborting on any kind not in the set.
+///
+/// This is the operator-facing policy surface for "the outer sandbox owns this
+/// boundary": when running under gVisor/Firecracker/Kata the host runtime is
+/// the enforcing layer and routinely refuses these syscalls, so the operator
+/// declares which subsystems are expected to be unavailable. Everything else
+/// stays fatal. Default (no declaration) remains [`StrictHandler`].
+pub struct BestEffortHandler {
+    // Indexed by `SandboxFailureKind as usize`; the length must equal the
+    // number of variants or `new`/`tolerates` will panic on out-of-bounds.
+    tolerate: [bool; 3],
+}
+
+impl BestEffortHandler {
+    /// Tolerate exactly the given kinds; abort on all others.
+    #[must_use]
+    pub fn new(kinds: impl IntoIterator<Item = SandboxFailureKind>) -> Self {
+        let mut tolerate = [false; 3];
+        for kind in kinds {
+            tolerate[kind as usize] = true;
+        }
+        Self { tolerate }
+    }
+
+    /// Tolerate every bootstrap kind (full degraded mode).
+    #[must_use]
+    pub fn all() -> Self {
+        Self {
+            tolerate: [true; 3],
+        }
+    }
+
+    fn tolerates(&self, kind: SandboxFailureKind) -> bool {
+        self.tolerate[kind as usize]
+    }
+}
+
+impl SandboxFailureHandler for BestEffortHandler {
+    fn handle(&self, kind: SandboxFailureKind, err: miette::Report) -> Result<()> {
+        if self.tolerates(kind) {
+            tracing::warn!(
+                subsystem = kind.as_str(),
+                error = %err,
+                "Sandbox bootstrap subsystem unavailable; continuing in best-effort (outer-sandbox-managed) mode"
+            );
+            Ok(())
+        } else {
+            Err(err)
+        }
+    }
+}
+
+/// Build a [`BestEffortHandler`] from operator tokens.
+///
+/// Tokens come from `--best-effort-bootstrap` / `OPENSHELL_BEST_EFFORT_BOOTSTRAP`.
+/// `all` tolerates every kind; otherwise each token must name a subsystem (see
+/// [`SandboxFailureKind::parse_token`]). Empty input tolerates nothing, i.e.
+/// equivalent to strict.
+///
+/// Every token is validated, including those that follow `all`, so acceptance
+/// of a list never depends on token order.
+///
+/// # Errors
+/// Returns an error naming the offending token if it is not `all` or a known
+/// subsystem.
+pub fn best_effort_handler_from_tokens<I, S>(tokens: I) -> Result<BestEffortHandler>
+where
+    I: IntoIterator<Item = S>,
+    S: AsRef<str>,
+{
+    let mut tolerate_all = false;
+    let mut kinds = Vec::new();
+    for token in tokens {
+        let token = token.as_ref().trim();
+        if token.is_empty() {
+            continue;
+        }
+        if token.eq_ignore_ascii_case("all") {
+            // Keep scanning: a typo after `all` must still be rejected.
+            tolerate_all = true;
+            continue;
+        }
+        match SandboxFailureKind::parse_token(token) {
+            Some(kind) => kinds.push(kind),
+            None => {
+                return Err(miette::miette!(
+                    "unknown --best-effort-bootstrap subsystem '{token}' \
+                     (expected: netns, supervisor-seccomp, workload-seccomp, or all)"
+                ));
+            }
+        }
+    }
+    if tolerate_all {
+        return Ok(BestEffortHandler::all());
+    }
+    Ok(BestEffortHandler::new(kinds))
+}
+
+/// Set-once handler slot; lazy default is [`StrictHandler`].
+static FAILURE_HANDLER: OnceLock<Box<dyn SandboxFailureHandler>> = OnceLock::new();
+
+/// Register the process-wide handler. Call once at process start, before the
+/// supervisor boots. Returns the handler back on `Err` if the slot is
+/// already set.
+pub fn set_failure_handler(
+    handler: Box<dyn SandboxFailureHandler>,
+) -> Result<(), Box<dyn SandboxFailureHandler>> {
+    FAILURE_HANDLER.set(handler)
+}
+
+/// Return the process-wide handler. The first call with an empty slot locks in
+/// [`StrictHandler`], after which `set_failure_handler` returns `Err`.
+pub(crate) fn failure_handler() -> &'static dyn SandboxFailureHandler {
+    FAILURE_HANDLER
+        .get_or_init(|| Box::new(StrictHandler))
+        .as_ref()
+}
+
+#[cfg(test)]
+mod failure_handler_tests {
+    use super::{BestEffortHandler, SandboxFailureHandler, SandboxFailureKind, StrictHandler};
+    use super::best_effort_handler_from_tokens;
+
+    const ALL_KINDS: [SandboxFailureKind; 3] = [
+        SandboxFailureKind::NetworkNamespaceCreate,
+        SandboxFailureKind::SupervisorSeccompInstall,
+        SandboxFailureKind::WorkloadSeccompInstall,
+    ];
+
+    #[test]
+    fn strict_aborts_every_kind() {
+        let handler = StrictHandler;
+        for kind in ALL_KINDS {
+            assert!(handler.handle(kind, miette::miette!("refused")).is_err());
+        }
+    }
+
+    #[test]
+    fn best_effort_tolerates_only_configured_kinds() {
+        let handler = BestEffortHandler::new([SandboxFailureKind::NetworkNamespaceCreate]);
+        assert!(
+            handler
+                .handle(SandboxFailureKind::NetworkNamespaceCreate, miette::miette!("x"))
+                .is_ok()
+        );
+        assert!(
+            handler
+                .handle(SandboxFailureKind::WorkloadSeccompInstall, miette::miette!("x"))
+                .is_err()
+        );
+    }
+
+    #[test]
+    fn all_tolerates_everything() {
+        let handler = BestEffortHandler::all();
+        for kind in ALL_KINDS {
+            assert!(handler.handle(kind, miette::miette!("refused")).is_ok());
+        }
+    }
+
+    #[test]
+    fn tokens_parse_all_known_and_reject_unknown() {
+        assert!(best_effort_handler_from_tokens(["all"]).is_ok());
+        assert!(best_effort_handler_from_tokens(["netns", "workload-seccomp"]).is_ok());
+        assert!(best_effort_handler_from_tokens(["supervisor-seccomp"]).is_ok());
+        assert!(best_effort_handler_from_tokens(["", " "]).is_ok()); // empties skipped
+        assert!(best_effort_handler_from_tokens(["bogus"]).is_err());
+    }
+
+    #[test]
+    fn all_is_order_independent_and_still_validates_other_tokens() {
+        assert!(best_effort_handler_from_tokens(["bogus", "all"]).is_err());
+        assert!(best_effort_handler_from_tokens(["all", "bogus"]).is_err());
+        let handler = best_effort_handler_from_tokens(["netns", "ALL"]).unwrap();
+        for kind in ALL_KINDS {
+            assert!(handler.handle(kind, miette::miette!("refused")).is_ok());
+        }
+    }
+
+    #[test]
+    fn token_roundtrip_via_as_str() {
+        for kind in ALL_KINDS {
+            assert_eq!(SandboxFailureKind::parse_token(kind.as_str()), Some(kind));
+        }
+    }
+}
diff --git a/crates/openshell-supervisor-process/src/netns/mod.rs b/crates/openshell-supervisor-process/src/netns/mod.rs
index d2242b1c1..98163d52b 100644
--- a/crates/openshell-supervisor-process/src/netns/mod.rs
+++ b/crates/openshell-supervisor-process/src/netns/mod.rs
@@ -422,11 +422,17 @@ pub fn create_netns_for_proxy(
             }
             Ok(Some(ns))
         }
-        Err(e) => Err(miette::miette!(
-            "Network namespace creation failed and proxy mode requires isolation. \
-             Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \
-             Error: {e}"
-        )),
+        Err(e) => {
+            crate::failure_handler().handle(
+                crate::SandboxFailureKind::NetworkNamespaceCreate,
+                miette::miette!(
+                    "Network namespace creation failed and proxy mode requires isolation. \
+                     Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \
+                     Error: {e}"
+                ),
+            )?;
+            Ok(None)
+        }
     }
 }
 
diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs
index 9f9fe1822..e4f560bd6 100644
--- a/crates/openshell-supervisor-process/src/process.rs
+++ b/crates/openshell-supervisor-process/src/process.rs
@@ -934,6 +934,16 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
             .ok_or_else(|| miette::miette!("Failed to resolve user primary group"))?
     };
 
+    // Idempotent fast-path: if euid/egid already match the target (e.g. a
+    // container entrypoint pre-dropped before exec'ing the sandbox), skip
+    // initgroups(3), which would otherwise fail without CAP_SETGID.
+    // NOTE(review): only the effective IDs are compared here. Confirm that a
+    // caller with matching euid/egid is also expected to have matching real and
+    // saved IDs and supplementary groups, since every later step is skipped.
+    if nix::unistd::geteuid() == user.uid && nix::unistd::getegid() == group.gid {
+        return Ok(());
+    }
+
     if user_name.is_some() {
         let user_cstr =
             CString::new(user.name.clone()).map_err(|_| miette::miette!("Invalid user name"))?;
diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs
index 5a5c203a2..c895078ce 100644
--- a/crates/openshell-supervisor-process/src/run.rs
+++ b/crates/openshell-supervisor-process/src/run.rs
@@ -89,7 +89,9 @@ pub async fn run_process(
     // tasks. By this point the orchestrator has finished privileged startup
     // helpers (network namespace setup, nftables probes via run_networking),
     // and the SSH listener and entrypoint child have not been exposed yet.
-    crate::sandbox::apply_supervisor_startup_hardening()?;
+    if let Err(e) = crate::sandbox::apply_supervisor_startup_hardening() {
+        crate::failure_handler().handle(crate::SandboxFailureKind::SupervisorSeccompInstall, e)?;
+    }
 
     // Spawn the bypass detection monitor. It tails dmesg for nftables LOG
     // entries fired by rules installed on the workload's network namespace
diff --git a/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs b/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs
index b5397ef07..bb146e081 100644
--- a/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs
+++ b/crates/openshell-supervisor-process/src/sandbox/linux/mod.rs
@@ -38,7 +38,9 @@ pub fn enforce(prepared: PreparedSandbox) -> Result<()> {
     if let Some(ruleset) = prepared.landlock {
         landlock::enforce(ruleset)?;
     }
-    seccomp::apply(&prepared.policy)?;
+    if let Err(e) = seccomp::apply(&prepared.policy) {
+        crate::failure_handler().handle(crate::SandboxFailureKind::WorkloadSeccompInstall, e)?;
+    }
 
     Ok(())
 }