From f4dd1b6f895047298845c1aaf028a7b66ff23608 Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Wed, 20 May 2026 04:45:12 +0100 Subject: [PATCH 1/4] feat(agent-relay): raise max clients to 128 and wire id_range_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raise relay MAX_CLIENTS from 16 to 128 and extend the handshake to carry the per-client correlation ID range size on the wire, so the SDK no longer needs to know MAX_CLIENTS. Before, the SDK computed its assigned ID range with a hardcoded `u32::MAX / 16` that mirrored the relay's `ID_RANGE_STEP`. The two sides had no way to stay in sync: changing MAX_CLIENTS on the relay without patching the SDK would make the SDK trespass into neighbor slots' ID ranges, causing silent cross-client frame misrouting via `client_slot = frame.id / ID_RANGE_STEP` in the ring reader. Wire change: handshake bytes go from `[id_offset: u32 BE][ready_frame]` to `[id_offset: u32 BE][id_range_step: u32 BE][ready_frame]`. The SDK reads both values and derives `id_start = id_offset + 1` and `id_max = id_offset + id_range_step` from what it was actually told. A new `id_start` field on `AgentClient` replaces the recomputed wrap boundary so the wrap path no longer references the constant either. At MAX_CLIENTS = 128, each client gets `u32::MAX / 128 ≈ 33.5M` correlation IDs per connection, with slot reset on reconnect — ample headroom for any realistic SDK lifetime. Future changes to MAX_CLIENTS are now a one-line edit in `relay.rs` with no SDK rebuild. Python, Node-TS, and Go SDKs are FFI bindings to this Rust client and require no separate update. --- crates/microsandbox/lib/agent/client.rs | 36 +++++++++++++------------ crates/runtime/lib/relay.rs | 11 +++++--- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/crates/microsandbox/lib/agent/client.rs b/crates/microsandbox/lib/agent/client.rs index 249ab2c81..351550fb5 100644 --- a/crates/microsandbox/lib/agent/client.rs +++ b/crates/microsandbox/lib/agent/client.rs @@ -36,9 +36,11 @@ use crate::MicrosandboxResult; pub struct AgentClient { /// Writer half of the Unix socket connection. writer: Arc>, - /// Next correlation ID to allocate (starts at `id_offset + 1`). + /// Next correlation ID to allocate (starts at `id_start`). next_id: AtomicU32, - /// Upper bound (exclusive) of the assigned ID range. + /// Inclusive lower bound of the assigned ID range (`id_offset + 1`). + id_start: u32, + /// Exclusive upper bound of the assigned ID range (`id_offset + id_range_step`). id_max: u32, /// Pending response channels keyed by correlation ID. pending: Arc>>>, @@ -74,19 +76,21 @@ impl AgentClient { let (mut reader, writer) = stream.into_split(); - // Read the handshake: [id_offset: u32 BE][ready_frame_bytes...] - let mut offset_buf = [0u8; 4]; - tokio::time::timeout_at(deadline, reader.read_exact(&mut offset_buf)) + // Read the handshake header: + // [id_offset: u32 BE][id_range_step: u32 BE][ready_frame_bytes...] + let mut header_buf = [0u8; 8]; + tokio::time::timeout_at(deadline, reader.read_exact(&mut header_buf)) .await .map_err(|_| { crate::MicrosandboxError::Runtime( - "handshake read id_offset: timed out before relay sent bytes".into(), + "handshake read header: timed out before relay sent bytes".into(), ) })? .map_err(|e| { - crate::MicrosandboxError::Runtime(format!("handshake read id_offset: {e}")) + crate::MicrosandboxError::Runtime(format!("handshake read header: {e}")) })?; - let id_offset = u32::from_be_bytes(offset_buf); + let id_offset = u32::from_be_bytes(header_buf[0..4].try_into().unwrap()); + let id_range_step = u32::from_be_bytes(header_buf[4..8].try_into().unwrap()); // Read the ready frame using the protocol codec directly. let ready_msg = tokio::time::timeout_at(deadline, codec::read_message(&mut reader)) @@ -105,7 +109,7 @@ impl AgentClient { .map_err(|e| crate::MicrosandboxError::Runtime(format!("decode ready payload: {e}")))?; tracing::info!( - "agent client: connected to relay, id_offset={id_offset}, boot_time={}ns", + "agent client: connected to relay, id_offset={id_offset}, id_range_step={id_range_step}, boot_time={}ns", ready.boot_time_ns ); @@ -116,14 +120,13 @@ impl AgentClient { let writer = Arc::new(Mutex::new(writer)); - // Compute the upper bound of the assigned ID range. - // ID_RANGE_STEP = u32::MAX / 16 ≈ 268M IDs per client. - let id_range_step: u32 = u32::MAX / 16; + let id_start = id_offset.saturating_add(1); let id_max = id_offset.saturating_add(id_range_step); Ok(Self { writer, - next_id: AtomicU32::new(id_offset + 1), + next_id: AtomicU32::new(id_start), + id_start, id_max, pending, reader_handle, @@ -137,12 +140,11 @@ impl AgentClient { pub fn next_id(&self) -> u32 { loop { let id = self.next_id.fetch_add(1, Ordering::Relaxed); - if id != 0 && id < self.id_max { + if id >= self.id_start && id < self.id_max { return id; } - // Wrapped past the range or hit 0 (reserved) — reset to range start. - let start = self.id_max.saturating_sub(u32::MAX / 16) + 1; - self.next_id.store(start, Ordering::Relaxed); + // Wrapped past the range — reset to range start. + self.next_id.store(self.id_start, Ordering::Relaxed); } } diff --git a/crates/runtime/lib/relay.rs b/crates/runtime/lib/relay.rs index aeb65fcc3..4253ed9a9 100644 --- a/crates/runtime/lib/relay.rs +++ b/crates/runtime/lib/relay.rs @@ -62,9 +62,12 @@ type SessionRegistry = std::sync::Mutex>; //-------------------------------------------------------------------------------------------------- /// Maximum number of simultaneous clients. -const MAX_CLIENTS: u32 = 16; +const MAX_CLIENTS: u32 = 128; /// Size of the correlation ID range allocated to each client. +/// +/// Sent to each client during handshake, so the SDK never needs to know +/// `MAX_CLIENTS`. Changing `MAX_CLIENTS` requires no SDK rebuild. const ID_RANGE_STEP: u32 = u32::MAX / MAX_CLIENTS; /// Size of the length prefix in the wire format. @@ -323,11 +326,13 @@ impl AgentRelay { "agent relay: client connected slot={slot} id_offset={id_offset}" ); - // Perform handshake: send [id_offset: u32 BE][ready_frame_bytes...]. + // Perform handshake: send + // [id_offset: u32 BE][id_range_step: u32 BE][ready_frame_bytes...]. let (reader_half, mut writer_half) = stream.into_split(); - let mut handshake = Vec::with_capacity(4 + ready_frame.len()); + let mut handshake = Vec::with_capacity(8 + ready_frame.len()); handshake.extend_from_slice(&id_offset.to_be_bytes()); + handshake.extend_from_slice(&ID_RANGE_STEP.to_be_bytes()); handshake.extend_from_slice(&ready_frame); if let Err(e) = writer_half.write_all(&handshake).await { From cb3573739061cd78ab5b4b36519d4ea565d33792 Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Wed, 20 May 2026 14:28:09 +0100 Subject: [PATCH 2/4] chore(readme): tweak tagline and tidy YC badge spacing - Reword the tagline to "the easiest way to give your agent their own computer". - Drop one extra `
` above the YC badge and add two below for consistent spacing. --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fdd158311..eaef72f6a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@
-
——   every agent deserves its own computer   ——
+
——   the easiest way to give your agent their own computer   ——


@@ -319,10 +319,11 @@ This project is licensed under the [Apache License 2.0](./LICENSE). Special thanks to all our contributors, testers, and community members who help make microsandbox better every day! We'd like to thank the following projects and communities that made `microsandbox` possible: [libkrun](https://github.com/containers/libkrun) and [smoltcp](https://github.com/smoltcp-rs/smoltcp) -
-

Backed by Y Combinator
+ +
+
From 4424fad1aa999d55834f2c4c242e29c1313ce3ff Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Wed, 20 May 2026 15:33:18 +0100 Subject: [PATCH 3/4] refactor(agent-relay): send assigned id bounds Send the client's assigned correlation ID range as explicit start and exclusive end bounds during the relay handshake. This keeps the client contract focused on the allocation it may use instead of exposing the relay's range-step calculation. Validate the range on the SDK side and reject client frames in the relay when their IDs fall outside the assigned interval. --- crates/microsandbox/lib/agent/client.rs | 21 ++++++++++--------- crates/runtime/lib/relay.rs | 27 ++++++++++++++++++------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/crates/microsandbox/lib/agent/client.rs b/crates/microsandbox/lib/agent/client.rs index 351550fb5..68b5fc87f 100644 --- a/crates/microsandbox/lib/agent/client.rs +++ b/crates/microsandbox/lib/agent/client.rs @@ -38,9 +38,9 @@ pub struct AgentClient { writer: Arc>, /// Next correlation ID to allocate (starts at `id_start`). next_id: AtomicU32, - /// Inclusive lower bound of the assigned ID range (`id_offset + 1`). + /// Inclusive lower bound of the assigned ID range. id_start: u32, - /// Exclusive upper bound of the assigned ID range (`id_offset + id_range_step`). + /// Exclusive upper bound of the assigned ID range. id_max: u32, /// Pending response channels keyed by correlation ID. pending: Arc>>>, @@ -77,7 +77,7 @@ impl AgentClient { let (mut reader, writer) = stream.into_split(); // Read the handshake header: - // [id_offset: u32 BE][id_range_step: u32 BE][ready_frame_bytes...] + // [id_start: u32 BE][id_end_exclusive: u32 BE][ready_frame_bytes...] let mut header_buf = [0u8; 8]; tokio::time::timeout_at(deadline, reader.read_exact(&mut header_buf)) .await @@ -89,8 +89,14 @@ impl AgentClient { .map_err(|e| { crate::MicrosandboxError::Runtime(format!("handshake read header: {e}")) })?; - let id_offset = u32::from_be_bytes(header_buf[0..4].try_into().unwrap()); - let id_range_step = u32::from_be_bytes(header_buf[4..8].try_into().unwrap()); + let id_start = u32::from_be_bytes(header_buf[0..4].try_into().unwrap()); + let id_max = u32::from_be_bytes(header_buf[4..8].try_into().unwrap()); + + if id_start >= id_max { + return Err(crate::MicrosandboxError::Runtime(format!( + "invalid relay id range: start={id_start}, end={id_max}" + ))); + } // Read the ready frame using the protocol codec directly. let ready_msg = tokio::time::timeout_at(deadline, codec::read_message(&mut reader)) @@ -109,7 +115,7 @@ impl AgentClient { .map_err(|e| crate::MicrosandboxError::Runtime(format!("decode ready payload: {e}")))?; tracing::info!( - "agent client: connected to relay, id_offset={id_offset}, id_range_step={id_range_step}, boot_time={}ns", + "agent client: connected to relay, id_start={id_start}, id_end_exclusive={id_max}, boot_time={}ns", ready.boot_time_ns ); @@ -120,9 +126,6 @@ impl AgentClient { let writer = Arc::new(Mutex::new(writer)); - let id_start = id_offset.saturating_add(1); - let id_max = id_offset.saturating_add(id_range_step); - Ok(Self { writer, next_id: AtomicU32::new(id_start), diff --git a/crates/runtime/lib/relay.rs b/crates/runtime/lib/relay.rs index 4253ed9a9..192856c3a 100644 --- a/crates/runtime/lib/relay.rs +++ b/crates/runtime/lib/relay.rs @@ -65,9 +65,6 @@ type SessionRegistry = std::sync::Mutex>; const MAX_CLIENTS: u32 = 128; /// Size of the correlation ID range allocated to each client. -/// -/// Sent to each client during handshake, so the SDK never needs to know -/// `MAX_CLIENTS`. Changing `MAX_CLIENTS` requires no SDK rebuild. const ID_RANGE_STEP: u32 = u32::MAX / MAX_CLIENTS; /// Size of the length prefix in the wire format. @@ -322,17 +319,19 @@ impl AgentRelay { }; let id_offset = slot * ID_RANGE_STEP; + let id_start = id_offset.saturating_add(1); + let id_end_exclusive = id_offset.saturating_add(ID_RANGE_STEP); tracing::info!( - "agent relay: client connected slot={slot} id_offset={id_offset}" + "agent relay: client connected slot={slot} id_start={id_start} id_end_exclusive={id_end_exclusive}" ); // Perform handshake: send - // [id_offset: u32 BE][id_range_step: u32 BE][ready_frame_bytes...]. + // [id_start: u32 BE][id_end_exclusive: u32 BE][ready_frame_bytes...]. let (reader_half, mut writer_half) = stream.into_split(); let mut handshake = Vec::with_capacity(8 + ready_frame.len()); - handshake.extend_from_slice(&id_offset.to_be_bytes()); - handshake.extend_from_slice(&ID_RANGE_STEP.to_be_bytes()); + handshake.extend_from_slice(&id_start.to_be_bytes()); + handshake.extend_from_slice(&id_end_exclusive.to_be_bytes()); handshake.extend_from_slice(&ready_frame); if let Err(e) = writer_half.write_all(&handshake).await { @@ -384,6 +383,8 @@ impl AgentRelay { drain_tx_clone, registry_clone, next_id_clone, + id_start, + id_end_exclusive, )); } Err(e) => { @@ -746,6 +747,8 @@ async fn client_reader_task( drain_tx: mpsc::Sender<()>, session_registry: Arc, next_session_id: Arc, + id_start: u32, + id_end_exclusive: u32, ) { loop { let frame = match read_raw_frame(&mut reader).await { @@ -756,6 +759,16 @@ async fn client_reader_task( } }; + if frame.id < id_start || frame.id >= id_end_exclusive { + tracing::warn!( + "agent relay: client slot={slot} sent out-of-range id={} range=[{}, {})", + frame.id, + id_start, + id_end_exclusive + ); + break; + } + // Track session starts for disconnect cleanup. let is_session_start = (frame.flags & FLAG_SESSION_START) != 0; let is_terminal = (frame.flags & FLAG_TERMINAL) != 0; From 1c0084f6c9acf09e60ad52465a7baa00d3fa8f89 Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Thu, 21 May 2026 12:33:58 +0100 Subject: [PATCH 4/4] fix(agent-relay): allow shutdown control id zero Keep relay ID-range validation for correlated client frames, but allow core.shutdown to use correlation ID 0. Shutdown is a process-level control frame rather than a client-owned request, so rejecting it prevents agentd from receiving the graceful shutdown request. Add focused relay validation tests for in-range frames, out-of-range non-shutdown frames, and the shutdown ID 0 exception. Add an ignored VM integration regression test that bounds stop_and_wait so future shutdown routing regressions fail instead of hanging CI indefinitely. --- crates/microsandbox/tests/correlation_ids.rs | 43 +++++++++++++++++ crates/runtime/lib/relay.rs | 50 +++++++++++++++++--- 2 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 crates/microsandbox/tests/correlation_ids.rs diff --git a/crates/microsandbox/tests/correlation_ids.rs b/crates/microsandbox/tests/correlation_ids.rs new file mode 100644 index 000000000..0e3d374ba --- /dev/null +++ b/crates/microsandbox/tests/correlation_ids.rs @@ -0,0 +1,43 @@ +//! Integration tests for relay correlation ID handling. +//! +//! These tests require KVM (or libkrun on macOS). The `#[msb_test]` +//! attribute marks them `#[ignore]`, so plain `cargo test --workspace` +//! skips them. Run them via: +//! +//! cargo nextest run -p microsandbox --test correlation_ids --run-ignored=only + +use std::time::Duration; + +use microsandbox::Sandbox; +use test_utils::msb_test; + +/// `core.shutdown` is a process-level control frame sent with correlation ID +/// 0, not a client-owned request/session ID. The relay must allow it even +/// though normal client frames are restricted to the assigned ID range. +#[msb_test] +async fn shutdown_control_id_zero_stops_sandbox() { + let name = "correlation-shutdown-id-zero"; + + let sandbox = Sandbox::builder(name) + .image("mirror.gcr.io/library/alpine") + .cpus(1) + .memory(256) + .replace() + .create() + .await + .expect("create sandbox"); + + let stop_result = tokio::time::timeout(Duration::from_secs(30), sandbox.stop_and_wait()).await; + + if stop_result.is_err() { + if let Ok(mut h) = Sandbox::get(name).await { + let _ = h.kill().await; + let _ = h.remove().await; + } + } + Sandbox::remove(name).await.ok(); + + stop_result + .expect("stop_and_wait timed out; relay likely rejected core.shutdown id 0") + .expect("stop_and_wait failed"); +} diff --git a/crates/runtime/lib/relay.rs b/crates/runtime/lib/relay.rs index f6eefd6d1..867bb6b1b 100644 --- a/crates/runtime/lib/relay.rs +++ b/crates/runtime/lib/relay.rs @@ -761,7 +761,12 @@ async fn client_reader_task( } }; - if frame.id < id_start || frame.id >= id_end_exclusive { + // Track session starts for disconnect cleanup. + let is_session_start = (frame.flags & FLAG_SESSION_START) != 0; + let is_terminal = (frame.flags & FLAG_TERMINAL) != 0; + let is_shutdown = (frame.flags & FLAG_SHUTDOWN) != 0; + + if !is_client_frame_allowed(frame.id, frame.flags, id_start, id_end_exclusive) { tracing::warn!( "agent relay: client slot={slot} sent out-of-range id={} range=[{}, {})", frame.id, @@ -771,11 +776,6 @@ async fn client_reader_task( break; } - // Track session starts for disconnect cleanup. - let is_session_start = (frame.flags & FLAG_SESSION_START) != 0; - let is_terminal = (frame.flags & FLAG_TERMINAL) != 0; - let is_shutdown = (frame.flags & FLAG_SHUTDOWN) != 0; - // Forward shutdown to agentd (via the agent_tx send below) so the // guest can sync filesystems and power off cleanly. Also notify the // caller so it can start the flush-grace fallback timer — if the @@ -883,3 +883,41 @@ async fn client_reader_task( used_slots.lock().await.remove(&slot); tracing::debug!("agent relay: slot={slot} released"); } + +/// Return whether a client-originated frame may be forwarded to agentd. +/// +/// Most client frames must use a correlation ID from the relay-assigned +/// range so responses route back to the owning client. `core.shutdown` is a +/// process-level control frame, not a correlated request, and the SDK sends it +/// with ID 0. +fn is_client_frame_allowed(id: u32, flags: u8, id_start: u32, id_end_exclusive: u32) -> bool { + let is_shutdown_control = (flags & FLAG_SHUTDOWN) != 0 && id == 0; + is_shutdown_control || (id >= id_start && id < id_end_exclusive) +} + +//-------------------------------------------------------------------------------------------------- +// Tests +//-------------------------------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn client_frame_validation_allows_ids_in_assigned_range() { + assert!(is_client_frame_allowed(10, 0, 10, 20)); + assert!(is_client_frame_allowed(19, FLAG_SESSION_START, 10, 20)); + } + + #[test] + fn client_frame_validation_rejects_non_shutdown_ids_outside_range() { + assert!(!is_client_frame_allowed(0, 0, 10, 20)); + assert!(!is_client_frame_allowed(9, FLAG_SESSION_START, 10, 20)); + assert!(!is_client_frame_allowed(20, FLAG_TERMINAL, 10, 20)); + } + + #[test] + fn client_frame_validation_allows_shutdown_control_id_zero() { + assert!(is_client_frame_allowed(0, FLAG_SHUTDOWN, 10, 20)); + } +}