Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crates/openshell-sandbox/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPol
use openshell_core::provider_credentials::ProviderCredentialState;
use openshell_supervisor_network::opa::OpaEngine;
pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus};
pub use openshell_supervisor_process::{
BestEffortHandler, SandboxFailureHandler, SandboxFailureKind, StrictHandler,
best_effort_handler_from_tokens, set_failure_handler,
};
use openshell_supervisor_process::skills;
use tokio::sync::mpsc::UnboundedSender;

Expand Down
21 changes: 21 additions & 0 deletions crates/openshell-sandbox/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,15 @@ struct Args {
/// enforcement runs in another pod.
#[arg(long, default_value = DEFAULT_MODE)]
mode: Mode,

/// Bootstrap subsystems to run in best-effort (degraded) mode instead of
/// aborting when the host or an outer sandbox refuses them. Comma-separated:
/// `netns`, `supervisor-seccomp`, `workload-seccomp`, or `all`. Empty
/// (the default) is strict — every refusal is fatal, matching upstream.
/// Set this when running under an outer sandbox (e.g. gVisor) that owns the
/// corresponding boundary and is expected to refuse these syscalls.
#[arg(long, value_delimiter = ',', env = "OPENSHELL_BEST_EFFORT_BOOTSTRAP")]
best_effort_bootstrap: Vec<String>,
}

/// Copy the running executable to `dest`, creating parent directories as
Expand Down Expand Up @@ -222,6 +231,18 @@ fn main() -> Result<()> {

let args = Args::parse();

// Operator-declared degraded mode: when --best-effort-bootstrap names one
// or more subsystems (or `all`), install a BestEffortHandler that tolerates
// exactly those bootstrap refusals. Empty (default) leaves the lazy
// StrictHandler in place, so behaviour is byte-identical to upstream unless
// the operator opts in. Done before run_sandbox so the handler is set
// before any bootstrap step can fail.
if !args.best_effort_bootstrap.is_empty() {
let handler = openshell_sandbox::best_effort_handler_from_tokens(&args.best_effort_bootstrap)?;
// Ignore an already-set slot: nothing else registers in this binary.
let _ = openshell_sandbox::set_failure_handler(Box::new(handler));
}

// Try to open a rolling log file; fall back to stderr-only logging if it fails
// (e.g., /var/log is not writable in custom workload images).
// Rotates daily, keeps the 3 most recent files to bound disk usage.
Expand Down
225 changes: 225 additions & 0 deletions crates/openshell-supervisor-process/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,228 @@ pub mod supervisor_session;
pub mod bypass_monitor;
#[cfg(target_os = "linux")]
pub mod netns;

use miette::Result;
use std::sync::OnceLock;

// Pluggable policy for bootstrap subsystems the host kernel may refuse
// (netns create, supervisor seccomp, workload seccomp). Default
// `StrictHandler` aborts; outer-sandbox integrations register their own
// via `set_failure_handler`.

/// Identifies the bootstrap subsystem whose setup was refused.
///
/// Discriminants are written out so the numeric value of each kind stays
/// stable even if the variants are later reordered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SandboxFailureKind {
    /// The kernel refused `unshare(CLONE_NEWNET)` or a follow-up netns op.
    NetworkNamespaceCreate = 0,
    /// Installing the supervisor seccomp prelude failed.
    SupervisorSeccompInstall = 1,
    /// The workload per-policy seccomp filter failed in
    /// `sandbox::linux::enforce`.
    WorkloadSeccompInstall = 2,
}

/// Policy for handling bootstrap refusals.
///
/// Implementations decide, per [`SandboxFailureKind`], whether startup may
/// carry on without the refused subsystem.
///
/// Invoked synchronously — do not block.
pub trait SandboxFailureHandler: Send + Sync + 'static {
/// Decide what to do about a refusal of `kind`, described by `err`.
///
/// Return `Ok(())` to continue in degraded mode, or `Err` to abort.
fn handle(&self, kind: SandboxFailureKind, err: miette::Report) -> Result<()>;
}

/// The default policy: no bootstrap refusal is ever tolerated.
pub struct StrictHandler;

impl SandboxFailureHandler for StrictHandler {
    /// Hands the incoming report straight back as the error, whatever the
    /// kind of refusal.
    fn handle(&self, _: SandboxFailureKind, report: miette::Report) -> Result<()> {
        Err(report)
    }
}

impl SandboxFailureKind {
    /// Stable short name, used in operator tokens and degraded-mode logs.
    ///
    /// Every name returned here is accepted by [`Self::parse_token`].
    #[must_use]
    pub fn as_str(self) -> &'static str {
        match self {
            Self::NetworkNamespaceCreate => "network-namespace",
            Self::SupervisorSeccompInstall => "supervisor-seccomp",
            Self::WorkloadSeccompInstall => "workload-seccomp",
        }
    }

    /// Parse an operator-facing subsystem token (from `--best-effort-bootstrap`
    /// / `OPENSHELL_BEST_EFFORT_BOOTSTRAP`).
    ///
    /// Surrounding whitespace is ignored and matching is ASCII
    /// case-insensitive. Each long name is accepted with either `-` or `_`
    /// as the separator; the network namespace additionally has the short
    /// spelling `netns`.
    ///
    /// Returns `None` for an unknown token. Note that `all` is not a
    /// subsystem and is therefore not recognised here.
    #[must_use]
    pub fn parse_token(token: &str) -> Option<Self> {
        match token.trim().to_ascii_lowercase().as_str() {
            // `network_namespace` is accepted so that all three long names
            // tolerate the underscore form, not just the seccomp ones.
            "netns" | "network-namespace" | "network_namespace" => {
                Some(Self::NetworkNamespaceCreate)
            }
            "supervisor-seccomp" | "supervisor_seccomp" => Some(Self::SupervisorSeccompInstall),
            "workload-seccomp" | "workload_seccomp" => Some(Self::WorkloadSeccompInstall),
            _ => None,
        }
    }
}

/// Handler that tolerates an operator-declared set of bootstrap refusals —
/// logging a warning and continuing in degraded mode for those kinds, while
/// still aborting on any kind not in the set.
///
/// This is the operator-facing policy surface for "the outer sandbox owns this
/// boundary": when running under gVisor/Firecracker/Kata the host runtime is
/// the enforcing layer and routinely refuses these syscalls, so the operator
/// declares which subsystems are expected to be unavailable. Everything else
/// stays fatal. Default (no declaration) remains [`StrictHandler`].
pub struct BestEffortHandler {
tolerate: [bool; 3],
}

impl BestEffortHandler {
/// Tolerate exactly the given kinds; abort on all others.
#[must_use]
pub fn new(kinds: impl IntoIterator<Item = SandboxFailureKind>) -> Self {
let mut tolerate = [false; 3];
for kind in kinds {
tolerate[kind as usize] = true;
}
Self { tolerate }
}

/// Tolerate every bootstrap kind (full degraded mode).
#[must_use]
pub fn all() -> Self {
Self {
tolerate: [true; 3],
}
}

fn tolerates(&self, kind: SandboxFailureKind) -> bool {
self.tolerate[kind as usize]
}
}

impl SandboxFailureHandler for BestEffortHandler {
    /// Lets startup continue, after logging a warning, when `kind` is in the
    /// tolerated set; otherwise returns `err` unchanged.
    fn handle(&self, kind: SandboxFailureKind, err: miette::Report) -> Result<()> {
        // Anything the operator did not opt into stays fatal.
        if !self.tolerates(kind) {
            return Err(err);
        }

        tracing::warn!(
            subsystem = kind.as_str(),
            error = %err,
            "Sandbox bootstrap subsystem unavailable; continuing in best-effort (outer-sandbox-managed) mode"
        );
        Ok(())
    }
}

/// Build a [`BestEffortHandler`] from operator tokens.
///
/// Tokens come from `--best-effort-bootstrap` / `OPENSHELL_BEST_EFFORT_BOOTSTRAP`.
/// `all` (ASCII case-insensitive) tolerates every kind; otherwise each token
/// must name a subsystem (see [`SandboxFailureKind::parse_token`]). Tokens
/// are trimmed, and blank tokens are skipped. Empty input tolerates nothing,
/// i.e. equivalent to strict.
///
/// Every token is validated regardless of position, so `all,bogus` is
/// rejected just like `bogus,all`.
///
/// # Errors
/// Returns an error naming the offending token if it is not `all` or a known
/// subsystem.
pub fn best_effort_handler_from_tokens<I, S>(tokens: I) -> Result<BestEffortHandler>
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let mut tolerate_all = false;
    let mut kinds = Vec::new();

    for token in tokens {
        let token = token.as_ref().trim();
        if token.is_empty() {
            continue;
        }
        if token.eq_ignore_ascii_case("all") {
            // Do not return yet: a typo later in the list must still be
            // reported instead of being masked by an earlier `all`.
            tolerate_all = true;
            continue;
        }
        let Some(kind) = SandboxFailureKind::parse_token(token) else {
            return Err(miette::miette!(
                "unknown --best-effort-bootstrap subsystem '{token}' \
                 (expected: netns, supervisor-seccomp, workload-seccomp, or all)"
            ));
        };
        kinds.push(kind);
    }

    Ok(if tolerate_all {
        BestEffortHandler::all()
    } else {
        BestEffortHandler::new(kinds)
    })
}

/// Set-once handler slot; lazy default is [`StrictHandler`].
///
/// Written by [`set_failure_handler`]. If it is still empty when
/// `failure_handler` first reads it, that read fills it with the default.
static FAILURE_HANDLER: OnceLock<Box<dyn SandboxFailureHandler>> = OnceLock::new();

/// Register the process-wide handler. Call once at process start, before the
/// supervisor boots.
///
/// # Errors
/// Returns the handler back if the slot is already set — either by an
/// earlier call, or because `failure_handler` has already run and installed
/// the lazy [`StrictHandler`] default.
pub fn set_failure_handler(
handler: Box<dyn SandboxFailureHandler>,
) -> Result<(), Box<dyn SandboxFailureHandler>> {
FAILURE_HANDLER.set(handler)
}

/// Returns the active process-wide handler, installing [`StrictHandler`]
/// first if nothing has been registered yet.
pub(crate) fn failure_handler() -> &'static dyn SandboxFailureHandler {
    let installed = FAILURE_HANDLER.get_or_init(|| Box::new(StrictHandler));
    installed.as_ref()
}

#[cfg(test)]
mod failure_handler_tests {
    use super::{
        BestEffortHandler, SandboxFailureHandler, SandboxFailureKind, StrictHandler,
        best_effort_handler_from_tokens,
    };

    /// Every variant, so the loops below cover the whole enum.
    const ALL_KINDS: [SandboxFailureKind; 3] = [
        SandboxFailureKind::NetworkNamespaceCreate,
        SandboxFailureKind::SupervisorSeccompInstall,
        SandboxFailureKind::WorkloadSeccompInstall,
    ];

    /// The strict policy turns every refusal into an error.
    #[test]
    fn strict_aborts_every_kind() {
        for kind in ALL_KINDS {
            let outcome = StrictHandler.handle(kind, miette::miette!("refused"));
            assert!(outcome.is_err());
        }
    }

    /// A handler built for one kind accepts that kind and rejects another.
    #[test]
    fn best_effort_tolerates_only_configured_kinds() {
        let netns_only = BestEffortHandler::new([SandboxFailureKind::NetworkNamespaceCreate]);

        let tolerated = netns_only.handle(
            SandboxFailureKind::NetworkNamespaceCreate,
            miette::miette!("x"),
        );
        assert!(tolerated.is_ok());

        let rejected = netns_only.handle(
            SandboxFailureKind::WorkloadSeccompInstall,
            miette::miette!("x"),
        );
        assert!(rejected.is_err());
    }

    /// `all()` accepts every kind.
    #[test]
    fn all_tolerates_everything() {
        let permissive = BestEffortHandler::all();
        for kind in ALL_KINDS {
            let outcome = permissive.handle(kind, miette::miette!("refused"));
            assert!(outcome.is_ok());
        }
    }

    /// Known tokens, `all`, and blank tokens build a handler; an unknown
    /// token is an error.
    #[test]
    fn tokens_parse_all_known_and_reject_unknown() {
        let accepted: [&[&str]; 4] = [
            &["all"],
            &["netns", "workload-seccomp"],
            &["supervisor-seccomp"],
            &["", " "], // empties skipped
        ];
        for tokens in accepted {
            assert!(best_effort_handler_from_tokens(tokens).is_ok());
        }

        assert!(best_effort_handler_from_tokens(["bogus"]).is_err());
    }

    /// Whatever `as_str` emits, `parse_token` maps back to the same kind.
    #[test]
    fn token_roundtrip_via_as_str() {
        for kind in ALL_KINDS {
            let parsed = SandboxFailureKind::parse_token(kind.as_str());
            assert_eq!(parsed, Some(kind));
        }
    }
}
16 changes: 11 additions & 5 deletions crates/openshell-supervisor-process/src/netns/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -422,11 +422,17 @@ pub fn create_netns_for_proxy(
}
Ok(Some(ns))
}
Err(e) => Err(miette::miette!(
"Network namespace creation failed and proxy mode requires isolation. \
Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \
Error: {e}"
)),
Err(e) => {
crate::failure_handler().handle(
crate::SandboxFailureKind::NetworkNamespaceCreate,
miette::miette!(
"Network namespace creation failed and proxy mode requires isolation. \
Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \
Error: {e}"
),
)?;
Ok(None)
}
}
}

Expand Down
7 changes: 7 additions & 0 deletions crates/openshell-supervisor-process/src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,13 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
.ok_or_else(|| miette::miette!("Failed to resolve user primary group"))?
};

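// Idempotent fast-path: if euid/egid already match the target (e.g. a
// container entrypoint pre-dropped before exec'ing the sandbox), return
// early. This skips every remaining step of this function, not only
// initgroups(3), which would otherwise fail without CAP_SETGID.
// NOTE(review): only the effective IDs are compared; real/saved IDs and
// supplementary groups are not checked here — confirm the pre-dropping
// entrypoint has already reset them.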
if nix::unistd::geteuid() == user.uid && nix::unistd::getegid() == group.gid {
return Ok(());
}

if user_name.is_some() {
let user_cstr =
CString::new(user.name.clone()).map_err(|_| miette::miette!("Invalid user name"))?;
Expand Down
4 changes: 3 additions & 1 deletion crates/openshell-supervisor-process/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ pub async fn run_process(
// tasks. By this point the orchestrator has finished privileged startup
// helpers (network namespace setup, nftables probes via run_networking),
// and the SSH listener and entrypoint child have not been exposed yet.
crate::sandbox::apply_supervisor_startup_hardening()?;
if let Err(e) = crate::sandbox::apply_supervisor_startup_hardening() {
crate::failure_handler().handle(crate::SandboxFailureKind::SupervisorSeccompInstall, e)?;
}

// Spawn the bypass detection monitor. It tails dmesg for nftables LOG
// entries fired by rules installed on the workload's network namespace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ pub fn enforce(prepared: PreparedSandbox) -> Result<()> {
if let Some(ruleset) = prepared.landlock {
landlock::enforce(ruleset)?;
}
seccomp::apply(&prepared.policy)?;
if let Err(e) = seccomp::apply(&prepared.policy) {
crate::failure_handler().handle(crate::SandboxFailureKind::WorkloadSeccompInstall, e)?;
}
Ok(())
}

Expand Down
Loading