From fa6e3794b45f06cced4823fc325f2639476ab6d2 Mon Sep 17 00:00:00 2001 From: Calum Murray Date: Mon, 22 Jun 2026 14:34:03 -0400 Subject: [PATCH] fix(openshell-network-supervisor): gate proxy accept on symlink resolution readiness Signed-off-by: Calum Murray --- .../openshell-supervisor-network/src/proxy.rs | 25 +++++++++++++++++++ .../openshell-supervisor-network/src/run.rs | 14 +++++++++++ 2 files changed, 39 insertions(+) diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index d467b022e..cde6b9458 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -188,6 +188,7 @@ impl ProxyHandle { policy_local_ctx: Option>, denial_tx: Option>, activity_tx: Option, + engine_ready: tokio::sync::watch::Receiver, ) -> Result { // Use override bind_addr, fall back to policy http_addr, then default // to loopback:3128. The default allows the proxy to function when no @@ -229,6 +230,30 @@ impl ProxyHandle { } let join = tokio::spawn(async move { + // Wait for the OPA engine's symlink resolution reload to complete + // before accepting connections. This prevents requests from + // observing a generation transition mid-flight, which would cause + // the generation guard to reject them with a 403. + // + // The TCP listener is already bound, so the OS backlog queues + // incoming SYN packets during this wait. Once we start accepting, + // queued connections drain immediately. 
+ let mut engine_ready = engine_ready; + match tokio::time::timeout( + std::time::Duration::from_secs(15), + engine_ready.wait_for(|v| *v), + ) + .await + { + Ok(_) => {} + Err(_) => { + warn!( + "Engine readiness signal not received within 15s; \ + proceeding with proxy accept loop" + ); + } + } + loop { match listener.accept().await { Ok((stream, _addr)) => { diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index b98923051..68a287f84 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -97,6 +97,13 @@ pub async fn run_networking( .or_else(|| sandbox_id.map(str::to_string)), )); + // Readiness signal for the proxy accept loop: the proxy binds the TCP + // listener immediately (so the OS backlog queues early SYN packets) but + // defers `accept()` until symlink resolution completes. This eliminates + // the race where an in-flight request observes a generation transition + // during the OPA engine reload. + let (engine_ready_tx, engine_ready_rx) = tokio::sync::watch::channel(false); + // Spawn a task to resolve policy binary symlinks once the workload's mount // namespace becomes accessible via /proc/<pid>/root/. The task starts // before run_process spawns the child, so first wait for the orchestrator @@ -125,6 +132,7 @@ pub async fn run_networking( "Entrypoint PID never published; binary symlink resolution skipped. \ Policy binary paths will be matched literally." ); + let _ = engine_ready_tx.send(true); return; } @@ -155,6 +163,7 @@ pub async fn run_networking( ); } } + let _ = engine_ready_tx.send(true); return; } debug!( @@ -170,7 +179,11 @@ pub async fn run_networking( If binaries are symlinks, use canonical paths in your policy \ (run 'readlink -f ' inside the sandbox)" ); + let _ = engine_ready_tx.send(true); }); + } else { + // No symlink resolution needed — unblock the proxy immediately. 
+ let _ = engine_ready_tx.send(true); } // Identity cache for SHA256 TOFU when OPA is active. Only consumed by @@ -279,6 +292,7 @@ pub async fn run_networking( Some(policy_local_ctx.clone()), denial_tx, activity_tx, + engine_ready_rx, ) .await?; Some(proxy_handle)