From 308207373d2ab260a451a5a40ce7f0eba6dc9bf6 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:20:50 +0000
Subject: [PATCH 1/9] feat(appstore): enforce broker exposes + cross-app
 ipc.call grants

Add a deny-by-default authorization gate to the app-store broker:

- manifest.ExposesMethod / HasGrant helpers (cap+target matching with
  exact, prefix-wildcard, and blanket forms).
- supervisor.callFrom enforces, before any dial: target installed,
  method in exposes (all callers), and a matching ipc.call grant for
  cross-app callers. New typed errors ErrMethodNotExposed/ErrGrantMissing.
- Service.CallFrom exposes the caller-aware entry point; Call stays the
  trusted daemon/pilotctl path.

Update existing call-path tests to declare exposed methods.
---
 pkg/manifest/grants.go                        |  58 +++++++++
 pkg/manifest/grants_test.go                   |  67 ++++++++++
 plugin/appstore/grant_test.go                 | 114 ++++++++++++++++++
 plugin/appstore/service.go                    |  18 +++
 plugin/appstore/supervisor.go                 |  60 ++++++++-
 .../appstore/zz2_supervisor_lifecycle_test.go |   4 +-
 plugin/appstore/zz3_supervisor_call_test.go   |   8 +-
 plugin/appstore/zz_supervisor_simple_test.go  |   4 +-
 8 files changed, 325 insertions(+), 8 deletions(-)
 create mode 100644 pkg/manifest/grants.go
 create mode 100644 pkg/manifest/grants_test.go
 create mode 100644 plugin/appstore/grant_test.go
diff --git a/pkg/manifest/grants.go b/pkg/manifest/grants.go
new file mode 100644
index 0000000..958fd90
--- /dev/null
+++ b/pkg/manifest/grants.go
@@ -0,0 +1,58 @@
+package manifest
+
+import "strings"
+
+// ExposesMethod reports whether method appears in the manifest's Exposes
+// list. Exposes is the app's entire broker surface: the daemon's app-store
+// broker dispatches a method into an app only if the app explicitly
+// exposes it. An empty Exposes list therefore means "no broker-callable
+// methods" — fail-closed.
+func (m *Manifest) ExposesMethod(method string) bool {
+	for _, e := range m.Exposes {
+		if e == method {
+			return true
+		}
+	}
+	return false
+}
+
+// HasGrant reports whether the manifest declares a grant whose capability
+// equals capName and whose target matches the requested target. Used by
+// the broker to authorize cross-app ipc.call dispatch: the calling app
+// must declare an `ipc.call` grant targeting the specific "<app>.<method>"
+// it wants to reach.
+//
+// Target matching supports three forms, in order of generality:
+//   - "*"           matches any target (blanket grant)
+//   - "<prefix>.*"  matches any target sharing the prefix (e.g.
+//     "io.pilot.wallet.*" matches "io.pilot.wallet.pay")
+//   - exact         the grant target equals the requested target verbatim
+//
+// Conditions on the grant are NOT evaluated here — HasGrant answers only
+// "is this capability+target declared". Per-request condition evaluation
+// (rate, consent, time-window, …) is a separate, later gate.
+func (m *Manifest) HasGrant(capName, target string) bool {
+	for _, g := range m.Grants {
+		if g.Cap == capName && matchGrantTarget(g.Target, target) {
+			return true
+		}
+	}
+	return false
+}
+
+// matchGrantTarget implements the target pattern rules documented on
+// HasGrant.
+func matchGrantTarget(pattern, target string) bool {
+	switch {
+	case pattern == "*":
+		return true
+	case pattern == target:
+		return true
+	case strings.HasSuffix(pattern, ".*"):
+		// Keep the trailing dot so "io.app.*" matches "io.app.x" but
+		// not "io.apple.x".
+		return strings.HasPrefix(target, strings.TrimSuffix(pattern, "*"))
+	default:
+		return false
+	}
+}
diff --git a/pkg/manifest/grants_test.go b/pkg/manifest/grants_test.go
new file mode 100644
index 0000000..c90499c
--- /dev/null
+++ b/pkg/manifest/grants_test.go
@@ -0,0 +1,67 @@
+package manifest
+
+import "testing"
+
+func TestExposesMethod(t *testing.T) {
+	m := &Manifest{Exposes: []string{"a.read", "a.write"}}
+	cases := map[string]bool{
+		"a.read":  true,
+		"a.write": true,
+		"a.admin": false,
+		"":        false,
+	}
+	for method, want := range cases {
+		if got := m.ExposesMethod(method); got != want {
+			t.Errorf("ExposesMethod(%q) = %v, want %v", method, got, want)
+		}
+	}
+	// Empty exposes is fail-closed: nothing is callable.
+	empty := &Manifest{}
+	if empty.ExposesMethod("a.read") {
+		t.Errorf("empty exposes should expose nothing")
+	}
+}
+
+func TestHasGrant(t *testing.T) {
+	m := &Manifest{Grants: []Grant{
+		{Cap: "ipc.call", Target: "io.pilot.wallet.pay"},
+		{Cap: "ipc.call", Target: "io.pilot.notes.*"},
+		{Cap: "fs.read", Target: "*"},
+	}}
+	type tc struct {
+		cap, target string
+		want        bool
+	}
+	for _, c := range []tc{
+		{"ipc.call", "io.pilot.wallet.pay", true},   // exact
+		{"ipc.call", "io.pilot.wallet.refund", false}, // exact mismatch
+		{"ipc.call", "io.pilot.notes.add", true},    // prefix.*
+		{"ipc.call", "io.pilot.notes.list", true},   // prefix.*
+		{"ipc.call", "io.pilot.notesX.add", false},  // prefix must end at dot
+		{"fs.read", "anything.at.all", true},        // blanket *
+		{"net.dial", "io.pilot.wallet.pay", false},  // cap mismatch
+	} {
+		if got := m.HasGrant(c.cap, c.target); got != c.want {
+			t.Errorf("HasGrant(%q,%q) = %v, want %v", c.cap, c.target, got, c.want)
+		}
+	}
+}
+
+func TestMatchGrantTarget(t *testing.T) {
+	cases := []struct {
+		pattern, target string
+		want            bool
+	}{
+		{"*", "x.y", true},
+		{"x.y", "x.y", true},
+		{"x.*", "x.y", true},
+		{"x.*", "x.y.z", true},
+		{"x.*", "xy.z", false}, // trailing dot kept → "xy.z" lacks "x." prefix
+		{"x.y", "x.z", false},
+	}
+	for _, c := range cases {
+		if got := matchGrantTarget(c.pattern, c.target); got != c.want {
+			t.Errorf("matchGrantTarget(%q,%q) = %v, want %v", c.pattern, c.target, got, c.want)
+		}
+	}
+}
diff --git a/plugin/appstore/grant_test.go b/plugin/appstore/grant_test.go
new file mode 100644
index 0000000..76a1438
--- /dev/null
+++ b/plugin/appstore/grant_test.go
@@ -0,0 +1,114 @@
+package appstore
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/pilot-protocol/app-store/pkg/manifest"
+)
+
+// installTarget registers a ready target app exposing the given methods.
+func installTarget(sup *supervisor, id string, exposes ...string) {
+	sup.mu.Lock()
+	sup.installed[id] = &installedApp{
+		Dir:        "/nonexistent/" + id,
+		SocketPath: "/nonexistent/" + id + "/app.sock",
+		Manifest:   &manifest.Manifest{ID: id, Exposes: exposes},
+	}
+	sup.ready[id] = true
+	sup.mu.Unlock()
+}
+
+// installCaller registers a caller app with the given ipc.call grants.
+func installCaller(sup *supervisor, id string, grantTargets ...string) {
+	var grants []manifest.Grant
+	for _, gt := range grantTargets {
+		grants = append(grants, manifest.Grant{Cap: "ipc.call", Target: gt})
+	}
+	sup.mu.Lock()
+	sup.installed[id] = &installedApp{
+		Dir:      "/nonexistent/" + id,
+		Manifest: &manifest.Manifest{ID: id, Grants: grants},
+	}
+	sup.mu.Unlock()
+}
+
+// TestCallFrom_Gates exercises every authorization branch of callFrom
+// without needing a live socket: each denial returns before the dial,
+// and a fully-authorized call falls through to ErrAppNotReady only when
+// the target socket is absent (here it dials a nonexistent path, so we
+// assert the gate was passed by checking the error is NOT a gate error).
+func TestCallFrom_Gates(t *testing.T) {
+	t.Parallel()
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+
+	t.Run("unknown target → ErrAppNotInstalled", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		err := sup.CallFrom(ctx, "io.caller", "io.missing", "ping", nil, nil)
+		if !errors.Is(err, ErrAppNotInstalled) {
+			t.Fatalf("err = %v, want ErrAppNotInstalled", err)
+		}
+	})
+
+	t.Run("method not exposed → ErrMethodNotExposed", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		err := sup.CallFrom(ctx, "", "io.target", "io.target.secret", nil, nil)
+		if !errors.Is(err, ErrMethodNotExposed) {
+			t.Fatalf("err = %v, want ErrMethodNotExposed", err)
+		}
+	})
+
+	t.Run("trusted caller skips grant gate", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		// Empty callerID = trusted; exposed method → passes gates, then
+		// fails at dial (socket path is nonexistent). Must NOT be a gate err.
+		err := sup.CallFrom(ctx, "", "io.target", "ping", nil, nil)
+		if errors.Is(err, ErrGrantMissing) || errors.Is(err, ErrMethodNotExposed) || errors.Is(err, ErrAppNotInstalled) {
+			t.Fatalf("trusted call should pass gates, got gate err: %v", err)
+		}
+	})
+
+	t.Run("cross-app caller not installed → ErrGrantMissing", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		err := sup.CallFrom(ctx, "io.ghost", "io.target", "ping", nil, nil)
+		if !errors.Is(err, ErrGrantMissing) {
+			t.Fatalf("err = %v, want ErrGrantMissing", err)
+		}
+	})
+
+	t.Run("cross-app caller without matching grant → ErrGrantMissing", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		installCaller(sup, "io.caller", "io.other.method") // wrong target
+		err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil)
+		if !errors.Is(err, ErrGrantMissing) {
+			t.Fatalf("err = %v, want ErrGrantMissing", err)
+		}
+	})
+
+	t.Run("cross-app caller with matching grant passes gates", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		installCaller(sup, "io.caller", "io.target.ping")
+		err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil)
+		if errors.Is(err, ErrGrantMissing) || errors.Is(err, ErrMethodNotExposed) {
+			t.Fatalf("authorized call should pass gates, got: %v", err)
+		}
+	})
+
+	t.Run("cross-app caller with wildcard grant passes gates", func(t *testing.T) {
+		sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t))
+		installTarget(sup, "io.target", "ping")
+		installCaller(sup, "io.caller", "io.target.*")
+		err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil)
+		if errors.Is(err, ErrGrantMissing) {
+			t.Fatalf("wildcard grant should authorize, got: %v", err)
+		}
+	})
+}
diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go
index 2979472..2a00466 100644
--- a/plugin/appstore/service.go
+++ b/plugin/appstore/service.go
@@ -234,3 +234,21 @@ func (s *Service) Call(ctx context.Context, appID, method string, args, out any)
 	}
 	return sup.Call(ctx, appID, method, args, out)
 }
+
+// CallFrom is the cross-app broker entry point. callerID identifies the
+// installed app making the request; the supervisor authorizes the call
+// against that app's manifest grants (it must declare an `ipc.call` grant
+// targeting "<appID>.<method>"). Pass an empty callerID for trusted
+// daemon/pilotctl calls — see Call.
+//
+// Returns ErrAppNotInstalled, ErrMethodNotExposed, ErrGrantMissing, or
+// ErrAppNotReady for the gate failures; otherwise the app's IPC response.
+func (s *Service) CallFrom(ctx context.Context, callerID, appID, method string, args, out any) error {
+	s.startMu.Lock()
+	sup := s.sup
+	s.startMu.Unlock()
+	if sup == nil {
+		return errors.New("appstore: service not started")
+	}
+	return sup.CallFrom(ctx, callerID, appID, method, args, out)
+}
diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go
index f653992..9e05a6d 100644
--- a/plugin/appstore/supervisor.go
+++ b/plugin/appstore/supervisor.go
@@ -860,6 +860,18 @@ var ErrAppNotInstalled = errors.New("appstore: app not installed")
 // appeared yet (still starting up, or it crashed and hasn't respawned).
 var ErrAppNotReady = errors.New("appstore: app not ready")
 
+// ErrMethodNotExposed is returned by Call/CallFrom when the target app's
+// manifest does not list the requested method in its `exposes` set. The
+// exposes list is the app's entire broker surface, so calling anything
+// else is denied — even for trusted (daemon/pilotctl) callers.
+var ErrMethodNotExposed = errors.New("appstore: method not exposed by app")
+
+// ErrGrantMissing is returned by CallFrom when a cross-app caller does
+// not hold an `ipc.call` grant whose target matches "<app>.<method>".
+// Deny-by-default: an app may only reach methods it declared at install
+// time, and the user reviewed, in its manifest grants.
+var ErrGrantMissing = errors.New("appstore: caller lacks ipc.call grant for target")
+
 // Apps returns the public-shaped summary of every installed app, in
 // arbitrary order. Used by pilotctl's `appstore list` and by other
 // apps that want to introspect what's available.
@@ -894,18 +906,58 @@ func (s *supervisor) Get(appID string) *installedApp {
 }
 
 // Call dispatches method+args into the named installed app via its
-// app.sock. The connection is dialed per-call — simple and lets the
-// app's own concurrency handle multiple in-flight calls. Returns
-// ErrAppNotInstalled / ErrAppNotReady for the obvious failure modes,
-// otherwise propagates the app's IPC response (or its error).
+// app.sock on behalf of a trusted caller (the daemon or pilotctl). It is
+// CallFrom with an empty callerID — the broker-surface (exposes) gate
+// still applies, but no cross-app ipc.call grant is required.
 func (s *supervisor) Call(ctx context.Context, appID, method string, args, out any) error {
+	return s.callFrom(ctx, "", appID, method, args, out)
+}
+
+// CallFrom dispatches method+args into the named installed app on behalf
+// of callerID. When callerID is non-empty the call is treated as a
+// cross-app ipc.call and is authorized against the caller's manifest
+// grants. When callerID is empty the caller is trusted (daemon/pilotctl)
+// and only the broker-surface gate applies.
+//
+// Authorization, all enforced before any socket is dialed:
+//  1. target app must be installed            → ErrAppNotInstalled
+//  2. method must be in the target's exposes   → ErrMethodNotExposed
+//  3. cross-app only: caller must be installed and hold an `ipc.call`
+//     grant matching "<app>.<method>"          → ErrGrantMissing
+func (s *supervisor) CallFrom(ctx context.Context, callerID, appID, method string, args, out any) error {
+	return s.callFrom(ctx, callerID, appID, method, args, out)
+}
+
+// callFrom is the shared implementation behind Call and CallFrom. The
+// connection is dialed per-call — simple and lets the app's own
+// concurrency handle multiple in-flight calls. Returns the typed gate
+// errors above, or otherwise propagates the app's IPC response/error.
+func (s *supervisor) callFrom(ctx context.Context, callerID, appID, method string, args, out any) error {
 	s.mu.RLock()
 	app, ok := s.installed[appID]
+	caller, callerOK := s.installed[callerID]
 	ready := s.ready[appID]
 	s.mu.RUnlock()
 	if !ok {
 		return fmt.Errorf("%w: %s", ErrAppNotInstalled, appID)
 	}
+	// Broker-surface gate: only methods the app explicitly exposes are
+	// dispatchable, for every caller including the daemon itself.
+	if !app.Manifest.ExposesMethod(method) {
+		return fmt.Errorf("%w: %s.%s", ErrMethodNotExposed, appID, method)
+	}
+	// Cross-app grant gate: a calling app must have declared an ipc.call
+	// grant targeting the specific method it wants to invoke. Trusted
+	// callers (empty callerID) skip this — they are the broker itself.
+	if callerID != "" {
+		if !callerOK {
+			return fmt.Errorf("%w: caller %q not installed", ErrGrantMissing, callerID)
+		}
+		target := appID + "." + method
+		if !caller.Manifest.HasGrant("ipc.call", target) {
+			return fmt.Errorf("%w: %s -> %s", ErrGrantMissing, callerID, target)
+		}
+	}
 	if !ready {
 		// Give it a brief moment in case it just spawned.
 		if !s.awaitReady(ctx, appID, 1*time.Second) {
diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go
index 7b5a1a7..68e1c19 100644
--- a/plugin/appstore/zz2_supervisor_lifecycle_test.go
+++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go
@@ -668,12 +668,14 @@ func TestSupervisor_Call_DialFailsWhenNoServer(t *testing.T) {
 	if err := os.MkdirAll(appDir, 0o700); err != nil {
 		t.Fatal(err)
 	}
+	md := parseDummyManifest(t, "io.dial.fail")
+	md.Exposes = []string{"any"}
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	sup.mu.Lock()
 	sup.installed["io.dial.fail"] = &installedApp{
 		Dir:        appDir,
 		SocketPath: filepath.Join(appDir, "missing.sock"),
-		Manifest:   parseDummyManifest(t, "io.dial.fail"),
+		Manifest:   md,
 	}
 	sup.ready["io.dial.fail"] = true
 	sup.mu.Unlock()
diff --git a/plugin/appstore/zz3_supervisor_call_test.go b/plugin/appstore/zz3_supervisor_call_test.go
index 37a0ad4..3b6ecf6 100644
--- a/plugin/appstore/zz3_supervisor_call_test.go
+++ b/plugin/appstore/zz3_supervisor_call_test.go
@@ -77,12 +77,14 @@ func TestSupervisor_Call_HappyPath(t *testing.T) {
 		})
 	defer cleanup()
 
+	mh := parseDummyManifest(t, "io.call.happy")
+	mh.Exposes = []string{"echo"}
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	sup.mu.Lock()
 	sup.installed["io.call.happy"] = &installedApp{
 		Dir:        appDir,
 		SocketPath: socketPath,
-		Manifest:   parseDummyManifest(t, "io.call.happy"),
+		Manifest:   mh,
 	}
 	sup.ready["io.call.happy"] = true
 	sup.mu.Unlock()
@@ -114,12 +116,14 @@ func TestSupervisor_Call_PropagatesServerError(t *testing.T) {
 		})
 	defer cleanup()
 
+	me := parseDummyManifest(t, "io.call.err")
+	me.Exposes = []string{"boom"}
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	sup.mu.Lock()
 	sup.installed["io.call.err"] = &installedApp{
 		Dir:        appDir,
 		SocketPath: socketPath,
-		Manifest:   parseDummyManifest(t, "io.call.err"),
+		Manifest:   me,
 	}
 	sup.ready["io.call.err"] = true
 	sup.mu.Unlock()
diff --git a/plugin/appstore/zz_supervisor_simple_test.go b/plugin/appstore/zz_supervisor_simple_test.go
index 4a93c2f..5b9fdba 100644
--- a/plugin/appstore/zz_supervisor_simple_test.go
+++ b/plugin/appstore/zz_supervisor_simple_test.go
@@ -59,10 +59,12 @@ func TestSupervisor_Call_NotReady(t *testing.T) {
 	t.Parallel()
 	dir := t.TempDir()
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
+	m := parseDummyManifest(t, "io.test.app")
+	m.Exposes = []string{"method"} // method must be exposed to reach the ready gate
 	sup.mu.Lock()
 	sup.installed["io.test.app"] = &installedApp{
 		Dir:      dir,
-		Manifest: parseDummyManifest(t, "io.test.app"),
+		Manifest: m,
 	}
 	sup.mu.Unlock()
 

From d0f820ec75dd1040febda2c535cca2f059f1b7ad Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:22:13 +0000
Subject: [PATCH 2/9] feat(appstore): exponential backoff for verify-fail
 retries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the fixed 30s verify-fail retry sleep with a capped exponential
ramp (1s→2s→…→30s) that resets on a successful verify, consistent with
crash-loop restart handling. Extract the shared nextBackoff helper and
unit-test that it grows and saturates at the cap.
---
 plugin/appstore/backoff_test.go | 43 +++++++++++++++++++++++++++++++++
 plugin/appstore/supervisor.go   | 33 ++++++++++++++++++-------
 2 files changed, 67 insertions(+), 9 deletions(-)
 create mode 100644 plugin/appstore/backoff_test.go

diff --git a/plugin/appstore/backoff_test.go b/plugin/appstore/backoff_test.go
new file mode 100644
index 0000000..3d5f553
--- /dev/null
+++ b/plugin/appstore/backoff_test.go
@@ -0,0 +1,43 @@
+package appstore
+
+import (
+	"testing"
+	"time"
+)
+
+// TestNextBackoff_GrowsAndCaps asserts the verify-fail / crash-restart
+// backoff doubles each step and saturates at the cap — i.e. it is not a
+// fixed interval.
+func TestNextBackoff_GrowsAndCaps(t *testing.T) {
+	t.Parallel()
+	const max = 30 * time.Second
+	want := []time.Duration{
+		2 * time.Second,  // 1 → 2
+		4 * time.Second,  // 2 → 4
+		8 * time.Second,  // 4 → 8
+		16 * time.Second, // 8 → 16
+		max,              // 16 → 32 capped to 30
+		max,              // stays at cap
+	}
+	cur := time.Second
+	for i, w := range want {
+		cur = nextBackoff(cur, max)
+		if cur != w {
+			t.Fatalf("step %d: got %s, want %s", i, cur, w)
+		}
+	}
+
+	// Strictly grows until the cap, never exceeds it.
+	prev := time.Duration(0)
+	c := time.Second
+	for i := 0; i < 20; i++ {
+		c = nextBackoff(c, max)
+		if c < prev {
+			t.Fatalf("backoff decreased at step %d: %s < %s", i, c, prev)
+		}
+		if c > max {
+			t.Fatalf("backoff exceeded cap at step %d: %s > %s", i, c, max)
+		}
+		prev = c
+	}
+}
diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go
index 9e05a6d..2ab18a8 100644
--- a/plugin/appstore/supervisor.go
+++ b/plugin/appstore/supervisor.go
@@ -637,6 +637,16 @@ func atoiOrZero(s string) int {
 	return n
 }
 
+// nextBackoff doubles cur, capping at max. Shared by superviseOne's
+// crash-restart and verify-fail backoff ramps so both grow identically.
+func nextBackoff(cur, max time.Duration) time.Duration {
+	next := cur * 2
+	if next > max {
+		return max
+	}
+	return next
+}
+
 // superviseOne runs one app forever, respawning on exit until ctx is canceled.
 func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) {
 	// Mark the start + end of supervision. The pair tells forensics
@@ -671,6 +681,11 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) {
 	}()
 	backoff := time.Second
 	const maxBackoff = 30 * time.Second
+	// verifyBackoff grows independently of the crash-restart backoff so a
+	// binary that fails sha256 verification doesn't hammer the disk every
+	// 30s — it ramps 1s→2s→4s…→30s, consistent with crash-loop handling,
+	// and resets the moment a verification succeeds.
+	verifyBackoff := time.Second
 	verifyFails := 0
 	for {
 		if err := ctx.Err(); err != nil {
@@ -690,17 +705,20 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) {
 				s.markSuspended(a.Manifest.ID)
 				return
 			}
-			// A bad sha256 may be transient; wait + retry in case
-			// the user fixes it (e.g. re-install).
+			// A bad sha256 may be transient; wait + retry with capped
+			// exponential backoff in case the user fixes it (re-install).
 			select {
 			case <-ctx.Done():
 				return
-			case <-time.After(maxBackoff):
-				continue
+			case <-time.After(verifyBackoff):
 			}
+			verifyBackoff = nextBackoff(verifyBackoff, maxBackoff)
+			continue
 		}
-		// verify passed — reset the consecutive-fail counter.
+		// verify passed — reset the consecutive-fail counter and the
+		// verify backoff so a later transient failure starts fresh.
 		verifyFails = 0
+		verifyBackoff = time.Second
 		exitCode := s.spawn(ctx, a)
 		s.markNotReady(a.Manifest.ID)
 		s.writeAuditLine(a, auditEvent{Event: "exit", ExitCode: exitCode})
@@ -728,10 +746,7 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) {
 			return
 		case <-time.After(backoff):
 		}
-		backoff *= 2
-		if backoff > maxBackoff {
-			backoff = maxBackoff
-		}
+		backoff = nextBackoff(backoff, maxBackoff)
 	}
 }
 

From eb69ed0285e459f144c0194949c7675eefd5a766 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:24:39 +0000
Subject: [PATCH 3/9] feat(appstore): keep N rotated audit log generations

Replace single-step supervisor.log->.1 rotation with an N-generation
shift (.1..N), discarding the oldest. Add Config.AuditLogMaxBackups
(default 3); worst-case footprint is (backups+1) x AuditLogMaxBytes per
app. Cover generation count and shift semantics with tests.
---
 plugin/appstore/rotation_test.go | 72 ++++++++++++++++++++++++++++++++
 plugin/appstore/service.go       | 12 ++++--
 plugin/appstore/supervisor.go    | 69 +++++++++++++++++++++++-------
 3 files changed, 135 insertions(+), 18 deletions(-)
 create mode 100644 plugin/appstore/rotation_test.go

diff --git a/plugin/appstore/rotation_test.go b/plugin/appstore/rotation_test.go
new file mode 100644
index 0000000..cbfbc56
--- /dev/null
+++ b/plugin/appstore/rotation_test.go
@@ -0,0 +1,72 @@
+package appstore
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestAuditLogKeepsNGenerations drives repeated rotations and asserts the
+// supervisor keeps exactly AuditLogMaxBackups rotated files
+// (supervisor.log.1 .. .N), never more, and that .1 always holds the most
+// recently rotated content.
+func TestAuditLogKeepsNGenerations(t *testing.T) {
+	t.Parallel()
+	dir := t.TempDir()
+	app := &installedApp{Dir: dir, Manifest: parseDummyManifest(t, "io.test.app")}
+	const backups = 3
+	sup := newSupervisor(Config{
+		InstallRoot:        dir,
+		AuditLogMaxBytes:   200,
+		AuditLogMaxBackups: backups,
+	}, Deps{}, newQuietLogger(t))
+
+	// Write far more than enough to trigger several rotations.
+	for i := 0; i < 200; i++ {
+		sup.writeAuditLine(app, auditEvent{Event: "spawn", PID: i, SHA256: "abc"})
+	}
+
+	// Exactly `backups` rotated generations must exist: .1 .. .N.
+	for i := 1; i <= backups; i++ {
+		p := filepath.Join(dir, fmt.Sprintf("%s.%d", supervisorLogName, i))
+		if _, err := os.Stat(p); err != nil {
+			t.Errorf("expected rotated generation %s to exist: %v", p, err)
+		}
+	}
+	// .N+1 must NOT exist — the oldest is discarded.
+	overflow := filepath.Join(dir, fmt.Sprintf("%s.%d", supervisorLogName, backups+1))
+	if _, err := os.Stat(overflow); !os.IsNotExist(err) {
+		t.Errorf("generation beyond backup count should not exist: %s (err=%v)", overflow, err)
+	}
+	// Active log still present.
+	if _, err := os.Stat(filepath.Join(dir, supervisorLogName)); err != nil {
+		t.Errorf("active log missing after rotations: %v", err)
+	}
+}
+
+// TestRotateGenerationsShiftsContent verifies the shift semantics
+// directly: after a rotation, what was in .1 moves to .2.
+func TestRotateGenerationsShiftsContent(t *testing.T) {
+	t.Parallel()
+	dir := t.TempDir()
+	sup := newSupervisor(Config{InstallRoot: dir, AuditLogMaxBackups: 2}, Deps{}, newQuietLogger(t))
+
+	write := func(name, content string) {
+		if err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0o600); err != nil {
+			t.Fatal(err)
+		}
+	}
+	write(supervisorLogName, "ACTIVE")
+	write(supervisorLogName+".1", "GEN1")
+
+	sup.rotateGenerations(dir, 2)
+
+	// active → .1, old .1 → .2
+	if b, _ := os.ReadFile(filepath.Join(dir, supervisorLogName+".1")); string(b) != "ACTIVE" {
+		t.Errorf(".1 = %q, want ACTIVE", b)
+	}
+	if b, _ := os.ReadFile(filepath.Join(dir, supervisorLogName+".2")); string(b) != "GEN1" {
+		t.Errorf(".2 = %q, want GEN1", b)
+	}
+}
diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go
index 2a00466..34a2163 100644
--- a/plugin/appstore/service.go
+++ b/plugin/appstore/service.go
@@ -47,10 +47,16 @@ type Config struct {
 	RescanInterval time.Duration
 
 	// AuditLogMaxBytes is the per-app supervisor.log size threshold
-	// at which a single-step rotation fires (active → .1). Zero
-	// defaults to maxAuditLogSize (10MB). Tests set this low to
-	// exercise the rotation path without writing megabytes.
+	// at which rotation fires (active → .1). Zero defaults to
+	// maxAuditLogSize (10MB). Tests set this low to exercise the
+	// rotation path without writing megabytes.
 	AuditLogMaxBytes int64
+
+	// AuditLogMaxBackups is the number of rotated generations the
+	// supervisor keeps (supervisor.log.1 .. .N). Zero defaults to
+	// defaultAuditLogBackups (3). Worst-case per-app footprint is
+	// (AuditLogMaxBackups+1) × AuditLogMaxBytes.
+	AuditLogMaxBackups int
 }
 
 // EmbeddedCatalogPubkey is the production trust anchor for the catalog.
diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go
index 2ab18a8..0864d86 100644
--- a/plugin/appstore/supervisor.go
+++ b/plugin/appstore/supervisor.go
@@ -28,13 +28,19 @@ import (
 // `pilotctl appstore audit <id>` can read it without daemon glue.
 const supervisorLogName = "supervisor.log"
 
-// supervisorLogRotated is the single-step rotation target. On reaching
-// maxAuditLogSize, writeAuditLine moves supervisor.log here (overwriting
-// any prior copy) and starts fresh. Single-step keeps the design simple:
-// worst-case footprint is 2 × maxAuditLogSize per app, plenty of history
-// for an incident, and no log.2/log.3 chain to manage.
+// supervisorLogRotated is the first rotated generation (supervisor.log.1).
+// On reaching the size threshold the active log shifts to .1, the prior
+// .1 shifts to .2, and so on up to the configured backup count; the
+// oldest generation beyond that is discarded. Worst-case footprint is
+// (backups+1) × maxAuditLogSize per app.
 const supervisorLogRotated = "supervisor.log.1"
 
+// defaultAuditLogBackups is the number of rotated generations kept
+// (supervisor.log.1 .. .N) when Config.AuditLogMaxBackups is unset.
+// Three generations + the active log keeps a useful incident window
+// without unbounded disk growth.
+const defaultAuditLogBackups = 3
+
 // maxAuditLogSize bounds each app's active audit log. A crash-looping
 // app emits ~5 lines per failed spawn cycle (verify-fail / exit /
 // spawn / spawn-fail / supervise-stop), each ~150B, so 10MB is
@@ -80,10 +86,12 @@ func (s *supervisor) writeAuditLine(a *installedApp, ev auditEvent) {
 	}
 }
 
-// rotateAuditIfLarge renames supervisor.log → supervisor.log.1 when
-// the active log has crossed the configured size threshold. The
-// threshold defaults to maxAuditLogSize but can be lowered for tests
-// via supervisor.cfg.AuditLogMaxBytes. Errors are logged but never
+// rotateAuditIfLarge rotates the audit log when the active
+// supervisor.log has crossed the configured size threshold, keeping up
+// to N rotated generations (supervisor.log.1 .. .N). The threshold
+// defaults to maxAuditLogSize and the generation count to
+// defaultAuditLogBackups; both can be overridden via Config
+// (AuditLogMaxBytes / AuditLogMaxBackups). Errors are logged but never
 // fatal — forensics best-effort, never blocks the lifecycle write.
 func (s *supervisor) rotateAuditIfLarge(appDir string) {
 	max := int64(maxAuditLogSize)
@@ -98,12 +106,43 @@ func (s *supervisor) rotateAuditIfLarge(appDir string) {
 	if info.Size() < max {
 		return
 	}
-	rotated := filepath.Join(appDir, supervisorLogRotated)
-	// os.Rename replaces the destination atomically on Unix, so a
-	// concurrent reader of the rotated path will see either the old
-	// or the new one — never a partial file.
-	if err := os.Rename(active, rotated); err != nil {
-		s.logger.Printf("audit rotate %s → %s: %v", active, rotated, err)
+	keep := defaultAuditLogBackups
+	if s.cfg.AuditLogMaxBackups > 0 {
+		keep = s.cfg.AuditLogMaxBackups
+	}
+	s.rotateGenerations(appDir, keep)
+}
+
+// rotateGenerations shifts the audit log generations down by one,
+// keeping at most `keep` rotated copies: supervisor.log.(keep) is
+// discarded, .(keep-1)→.keep, …, .1→.2, and the active
+// supervisor.log→.1. Each os.Rename replaces its destination
+// atomically on Unix, so a concurrent reader of any generation sees a
+// whole file, never a partial one. Errors are logged, never fatal.
+func (s *supervisor) rotateGenerations(appDir string, keep int) {
+	if keep < 1 {
+		keep = 1
+	}
+	gen := func(i int) string {
+		if i == 0 {
+			return filepath.Join(appDir, supervisorLogName)
+		}
+		return filepath.Join(appDir, fmt.Sprintf("%s.%d", supervisorLogName, i))
+	}
+	// Discard the generation that would fall off the end.
+	if err := os.Remove(gen(keep)); err != nil && !errors.Is(err, os.ErrNotExist) {
+		s.logger.Printf("audit rotate: remove %s: %v", gen(keep), err)
+	}
+	// Shift each surviving generation down one slot, oldest first so we
+	// never clobber a file we still need to move.
+	for i := keep - 1; i >= 0; i-- {
+		src, dst := gen(i), gen(i+1)
+		if _, err := os.Stat(src); errors.Is(err, os.ErrNotExist) {
+			continue // gap in the chain — nothing to move
+		}
+		if err := os.Rename(src, dst); err != nil {
+			s.logger.Printf("audit rotate %s → %s: %v", src, dst, err)
+		}
 	}
 }
 

From af5a48f4c972ad26f384c2b28a12add69da1068f Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:27:53 +0000
Subject: [PATCH 4/9] feat(appstore): cap spawned-app address space (RLIMIT_AS)
 on Linux
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend applyChildResourceLimits to set RLIMIT_AS alongside RLIMIT_NOFILE
for spawned children, configurable via Config.ChildMemoryLimitBytes
(default 4 GiB — generous enough for Go/Node/Python virtual reservations,
still a runaway guard). No-op on non-Linux, documented. Tests read the
limits back via prlimit64 to confirm enforcement.
---
 plugin/appstore/resource_test.go              | 15 ++++
 plugin/appstore/rlimit_linux.go               | 59 ++++++++++------
 plugin/appstore/rlimit_linux_test.go          | 70 +++++++++++++++++++
 plugin/appstore/rlimit_other.go               | 19 ++---
 plugin/appstore/service.go                    |  8 +++
 plugin/appstore/supervisor.go                 | 19 ++++-
 .../appstore/zz2_supervisor_lifecycle_test.go |  2 +-
 7 files changed, 160 insertions(+), 32 deletions(-)
 create mode 100644 plugin/appstore/resource_test.go
 create mode 100644 plugin/appstore/rlimit_linux_test.go

diff --git a/plugin/appstore/resource_test.go b/plugin/appstore/resource_test.go
new file mode 100644
index 0000000..ac7f587
--- /dev/null
+++ b/plugin/appstore/resource_test.go
@@ -0,0 +1,15 @@
+package appstore
+
+import "testing"
+
+// TestChildAddressSpaceLimitResolution covers the default-vs-override
+// resolution of the RLIMIT_AS cap (platform-independent).
+func TestChildAddressSpaceLimitResolution(t *testing.T) {
+	t.Parallel()
+	if got := newSupervisor(Config{}, Deps{}, newQuietLogger(t)).childAddressSpaceLimit(); got != defaultChildAddressSpaceLimit {
+		t.Errorf("unset → %d, want default %d", got, defaultChildAddressSpaceLimit)
+	}
+	if got := newSupervisor(Config{ChildMemoryLimitBytes: 123 << 20}, Deps{}, newQuietLogger(t)).childAddressSpaceLimit(); got != 123<<20 {
+		t.Errorf("override → %d, want %d", got, 123<<20)
+	}
+}
diff --git a/plugin/appstore/rlimit_linux.go b/plugin/appstore/rlimit_linux.go
index cb8d790..66100b9 100644
--- a/plugin/appstore/rlimit_linux.go
+++ b/plugin/appstore/rlimit_linux.go
@@ -27,29 +27,44 @@ type rlimit64 struct {
 	Max uint64
 }
 
-// applyChildResourceLimits sets RLIMIT_NOFILE on a freshly-spawned
-// child after exec. Best-effort: a failure logs but doesn't kill the
-// spawn — the OS-wide ulimit still applies. We use prlimit (not
-// setrlimit) because the parent (supervisor) doesn't want to bound
-// its own fd count; we only want to bound the child's.
+// applyChildResourceLimits sets resource limits on a freshly-spawned
+// child after exec, via prlimit(2) targeting the child pid (so the
+// supervisor's own limits are untouched). Best-effort: any failure logs
+// but does not kill the spawn — the OS-wide ulimit still applies.
+//
+//   - RLIMIT_NOFILE is always set to childFDLimit.
+//   - RLIMIT_AS (virtual address space) is set to addrSpaceLimit when it
+//     is non-zero. NOTE: RLIMIT_AS bounds *address space*, not RSS.
+//     Runtime-managed languages (Go, Node/V8, Python) reserve large
+//     virtual regions well above their working set, so the cap must be
+//     generous — a too-low value makes the runtime fail to mmap and the
+//     child crashes on startup. The supervisor's default
+//     (defaultChildAddressSpaceLimit) is chosen with that headroom.
 //
 // There is a tiny race: between cmd.Start and prlimit landing, the
-// child can open more fds. For a wallet-class app starting from a
-// clean state, that's a handful at most — well under the cap.
-func applyChildResourceLimits(pid int, logger *log.Logger) {
-	want := rlimit64{Cur: childFDLimit, Max: childFDLimit}
-	// SYS_PRLIMIT64 takes (pid, resource, *new_limit, *old_limit).
-	// We don't care about the old limit; pass a nil out-pointer.
-	_, _, errno := syscall.Syscall6(
-		syscall.SYS_PRLIMIT64,
-		uintptr(pid),
-		uintptr(syscall.RLIMIT_NOFILE),
-		uintptr(unsafe.Pointer(&want)),
-		0, 0, 0,
-	)
-	if errno != 0 {
-		logger.Printf("prlimit pid=%d RLIMIT_NOFILE=%d: %v (proceeding; OS-wide ulimit still applies)", pid, childFDLimit, errno)
-		return
+// child can allocate. For an app starting from a clean state that's a
+// handful of fds / a small allocation — well under the caps.
+func applyChildResourceLimits(pid int, addrSpaceLimit uint64, logger *log.Logger) {
+	setLimit := func(resource int, name string, val uint64) {
+		want := rlimit64{Cur: val, Max: val}
+		// SYS_PRLIMIT64 takes (pid, resource, *new_limit, *old_limit).
+		// We don't care about the old limit; pass a nil out-pointer.
+		_, _, errno := syscall.Syscall6(
+			syscall.SYS_PRLIMIT64,
+			uintptr(pid),
+			uintptr(resource),
+			uintptr(unsafe.Pointer(&want)),
+			0, 0, 0,
+		)
+		if errno != 0 {
+			logger.Printf("prlimit pid=%d %s=%d: %v (proceeding; OS-wide ulimit still applies)", pid, name, val, errno)
+			return
+		}
+		logger.Printf("prlimit pid=%d %s=%d ok", pid, name, val)
+	}
+
+	setLimit(syscall.RLIMIT_NOFILE, "RLIMIT_NOFILE", childFDLimit)
+	if addrSpaceLimit > 0 {
+		setLimit(syscall.RLIMIT_AS, "RLIMIT_AS", addrSpaceLimit)
 	}
-	logger.Printf("prlimit pid=%d RLIMIT_NOFILE=%d ok", pid, childFDLimit)
 }
diff --git a/plugin/appstore/rlimit_linux_test.go b/plugin/appstore/rlimit_linux_test.go
new file mode 100644
index 0000000..66de634
--- /dev/null
+++ b/plugin/appstore/rlimit_linux_test.go
@@ -0,0 +1,70 @@
+//go:build linux
+
+package appstore
+
+import (
+	"os/exec"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+// getChildRlimit reads back a resource limit for pid via prlimit64(2).
+func getChildRlimit(t *testing.T, pid, resource int) rlimit64 {
+	t.Helper()
+	var out rlimit64
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_PRLIMIT64,
+		uintptr(pid), uintptr(resource), 0,
+		uintptr(unsafe.Pointer(&out)), 0, 0,
+	)
+	if errno != 0 {
+		t.Fatalf("prlimit get pid=%d res=%d: %v", pid, resource, errno)
+	}
+	return out
+}
+
+func startSleeper(t *testing.T) *exec.Cmd {
+	t.Helper()
+	cmd := exec.Command("sleep", "5")
+	if err := cmd.Start(); err != nil {
+		t.Skipf("cannot start sleep child: %v", err)
+	}
+	t.Cleanup(func() {
+		_ = cmd.Process.Kill()
+		_ = cmd.Wait()
+	})
+	return cmd
+}
+
+// TestApplyChildResourceLimits_SetsAddressSpace asserts RLIMIT_AS and
+// RLIMIT_NOFILE are actually applied to the child by reading them back.
+func TestApplyChildResourceLimits_SetsAddressSpace(t *testing.T) {
+	cmd := startSleeper(t)
+	pid := cmd.Process.Pid
+	const limit uint64 = 2 << 30
+	applyChildResourceLimits(pid, limit, newQuietLogger(t))
+
+	if got := getChildRlimit(t, pid, syscall.RLIMIT_AS); got.Cur != limit {
+		t.Errorf("RLIMIT_AS Cur = %d, want %d", got.Cur, limit)
+	}
+	if got := getChildRlimit(t, pid, syscall.RLIMIT_NOFILE); got.Cur != childFDLimit {
+		t.Errorf("RLIMIT_NOFILE Cur = %d, want %d", got.Cur, childFDLimit)
+	}
+}
+
+// TestApplyChildResourceLimits_ZeroSkipsAddressSpace asserts a zero
+// addrSpaceLimit leaves RLIMIT_AS untouched while still applying NOFILE.
+func TestApplyChildResourceLimits_ZeroSkipsAddressSpace(t *testing.T) {
+	cmd := startSleeper(t)
+	pid := cmd.Process.Pid
+	before := getChildRlimit(t, pid, syscall.RLIMIT_AS)
+	applyChildResourceLimits(pid, 0, newQuietLogger(t))
+	after := getChildRlimit(t, pid, syscall.RLIMIT_AS)
+	if after.Cur != before.Cur {
+		t.Errorf("RLIMIT_AS changed despite zero limit: before=%d after=%d", before.Cur, after.Cur)
+	}
+	if got := getChildRlimit(t, pid, syscall.RLIMIT_NOFILE); got.Cur != childFDLimit {
+		t.Errorf("RLIMIT_NOFILE Cur = %d, want %d", got.Cur, childFDLimit)
+	}
+}
diff --git a/plugin/appstore/rlimit_other.go b/plugin/appstore/rlimit_other.go
index ab9feef..54ad666 100644
--- a/plugin/appstore/rlimit_other.go
+++ b/plugin/appstore/rlimit_other.go
@@ -6,13 +6,16 @@ import "log"
 
 // applyChildResourceLimits is the non-Linux build's no-op. macOS's
 // equivalent (setrlimit) affects the calling process, not children;
-// without a wrapper binary or pre-exec hook there's no portable way
-// to bound the child's fd count from the supervisor. Documented as
-// an RC1 known gap; production deployments on linux get the real
-// limit via rlimit_linux.go's applyChildResourceLimits.
-func applyChildResourceLimits(pid int, logger *log.Logger) {
+// without a wrapper binary or pre-exec hook there's no portable way to
+// bound the child's fd count or address space from the supervisor.
+// Documented as an RC1 known gap; production deployments on linux get
+// the real limits (RLIMIT_NOFILE + RLIMIT_AS) via rlimit_linux.go.
+//
+// The addrSpaceLimit parameter is accepted for signature parity with the
+// Linux build and ignored here.
+func applyChildResourceLimits(pid int, addrSpaceLimit uint64, logger *log.Logger) {
 	// Single startup log line would be nicer than per-spawn, but
-	// inlining here keeps the supervisor call-site identical to
-	// the Linux build. Cheap log line at info level — easy to grep.
-	logger.Printf("resource limits not enforced for pid=%d on this platform (linux-only)", pid)
+	// inlining here keeps the supervisor call-site identical to the
+	// Linux build. Cheap log line at info level — easy to grep.
+	logger.Printf("resource limits not enforced for pid=%d on this platform (linux-only); requested addr-space cap=%d ignored", pid, addrSpaceLimit)
 }
diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go
index 34a2163..a20f8b3 100644
--- a/plugin/appstore/service.go
+++ b/plugin/appstore/service.go
@@ -57,6 +57,14 @@ type Config struct {
 	// defaultAuditLogBackups (3). Worst-case per-app footprint is
 	// (AuditLogMaxBackups+1) × AuditLogMaxBytes.
 	AuditLogMaxBackups int
+
+	// ChildMemoryLimitBytes caps the virtual address space (RLIMIT_AS)
+	// of each spawned app process on Linux. Zero defaults to
+	// defaultChildAddressSpaceLimit (4 GiB). No-op on non-Linux
+	// platforms. Set generously — RLIMIT_AS bounds address space, not
+	// RSS, and runtime-managed languages reserve far more virtual
+	// memory than they use.
+	ChildMemoryLimitBytes uint64
 }
 
 // EmbeddedCatalogPubkey is the production trust anchor for the catalog.
diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go
index 0864d86..c59ad3a 100644
--- a/plugin/appstore/supervisor.go
+++ b/plugin/appstore/supervisor.go
@@ -41,6 +41,14 @@ const supervisorLogRotated = "supervisor.log.1"
 // without unbounded disk growth.
 const defaultAuditLogBackups = 3
 
+// defaultChildAddressSpaceLimit is the RLIMIT_AS cap applied to spawned
+// app processes when Config.ChildMemoryLimitBytes is unset. 4 GiB is
+// generous enough for Go/Node/Python runtime virtual reservations (a
+// trivial Go binary needs ~1 GiB of address space) while still bounding
+// a runaway / hostile app well below host memory. Linux-only; see
+// rlimit_linux.go.
+const defaultChildAddressSpaceLimit uint64 = 4 << 30
+
 // maxAuditLogSize bounds each app's active audit log. A crash-looping
 // app emits ~5 lines per failed spawn cycle (verify-fail / exit /
 // spawn / spawn-fail / supervise-stop), each ~150B, so 10MB is
@@ -676,6 +684,15 @@ func atoiOrZero(s string) int {
 	return n
 }
 
+// childAddressSpaceLimit resolves the RLIMIT_AS cap for spawned apps:
+// the configured value, or defaultChildAddressSpaceLimit when unset.
+func (s *supervisor) childAddressSpaceLimit() uint64 {
+	if s.cfg.ChildMemoryLimitBytes > 0 {
+		return s.cfg.ChildMemoryLimitBytes
+	}
+	return defaultChildAddressSpaceLimit
+}
+
 // nextBackoff doubles cur, capping at max. Shared by superviseOne's
 // crash-restart and verify-fail backoff ramps so both grow identically.
 func nextBackoff(cur, max time.Duration) time.Duration {
@@ -859,7 +876,7 @@ func (s *supervisor) spawn(ctx context.Context, a *installedApp) int {
 	// child. Best-effort: a failure logs but doesn't kill the spawn
 	// (OS-wide ulimits still apply). Linux uses prlimit(2) for a
 	// RLIMIT_NOFILE cap; other platforms log a "not enforced" line.
-	applyChildResourceLimits(cmd.Process.Pid, s.logger)
+	applyChildResourceLimits(cmd.Process.Pid, s.childAddressSpaceLimit(), s.logger)
 	s.writeAuditLine(a, auditEvent{Event: "spawn", PID: cmd.Process.Pid, SHA256: a.Manifest.Binary.SHA256, BinaryAt: a.BinaryPath})
 
 	// Watch for the socket to appear; once it does, mark ready.
diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go
index 68e1c19..9050469 100644
--- a/plugin/appstore/zz2_supervisor_lifecycle_test.go
+++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go
@@ -565,7 +565,7 @@ func TestApplyChildResourceLimits_Smoke(t *testing.T) {
 	t.Parallel()
 	// Pass an invalid PID; the function is best-effort and must not panic.
 	logger := newQuietLogger(t)
-	applyChildResourceLimits(0, logger)
+	applyChildResourceLimits(0, defaultChildAddressSpaceLimit, logger)
 }
 
 // TestService_AppsBeforeStart returns nil per the docstring.

From 3c3a62838ed7435d7535744d5898e1586c4f9ce5 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:31:16 +0000
Subject: [PATCH 5/9] feat(appstore): re-verify binary + reject symlink at
 spawn (TOCTOU)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add verifyAtSpawn, run immediately before execve in spawn(): Lstat
symlink rejection plus a sha256 re-check against the pinned hash. Closes
the time-of-check/time-of-use gap between scanInstalled (runs once at
discovery) and the actual launch — a binary swapped for a symlink or
different bytes after the scan is now refused. Tests cover both swap
cases and the valid-binary pass.
---
 plugin/appstore/supervisor.go                 |  32 ++++++
 plugin/appstore/toctou_test.go                | 105 ++++++++++++++++++
 .../appstore/zz2_supervisor_lifecycle_test.go |  21 +++-
 3 files changed, 154 insertions(+), 4 deletions(-)
 create mode 100644 plugin/appstore/toctou_test.go

diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go
index c59ad3a..e80dfc7 100644
--- a/plugin/appstore/supervisor.go
+++ b/plugin/appstore/supervisor.go
@@ -826,12 +826,44 @@ func (s *supervisor) verifyBinary(a *installedApp) error {
 	return nil
 }
 
+// verifyAtSpawn re-runs the binary trust checks immediately before exec,
+// closing the TOCTOU window between scanInstalled (which checks once, at
+// discovery) and the actual launch. An attacker who swaps the resolved
+// binary for a symlink (→ a host binary like /bin/sh) or for different
+// bytes after the scan but before spawn is caught here, not after the
+// hostile code has already run.
+//
+// Lstat (not Stat) so a symlink is seen as itself, not followed. The
+// sha256 re-hash reuses verifyBinary. A tiny residual window remains
+// between this check and execve — inherent to any check-then-exec — but
+// it is orders of magnitude smaller than scan-to-spawn.
+func (s *supervisor) verifyAtSpawn(a *installedApp) error {
+	fi, err := os.Lstat(a.BinaryPath)
+	if err != nil {
+		return fmt.Errorf("lstat binary: %w", err)
+	}
+	if fi.Mode()&os.ModeSymlink != 0 {
+		return fmt.Errorf("binary path %s is a symlink (refusing)", a.BinaryPath)
+	}
+	return s.verifyBinary(a)
+}
+
 // spawn launches the app's binary, blocks until it exits, returns the
 // exit code (or -1 on error). The caller is responsible for restart logic.
 //
 // Marks the app "ready" once its socket has appeared, so concurrent
 // Call() invocations can know when to dial.
 func (s *supervisor) spawn(ctx context.Context, a *installedApp) int {
+	// TOCTOU guard: re-verify the binary (no symlink, sha256 still
+	// matches) immediately before exec. The supervise loop already
+	// verified at the top of its cycle, but a swap could land in the
+	// window between that check and this exec — refuse to launch if so.
+	if err := s.verifyAtSpawn(a); err != nil {
+		s.logger.Printf("app=%s: spawn-time verify failed: %v — refusing to exec", a.Manifest.ID, err)
+		s.writeAuditLine(a, auditEvent{Event: "verify-fail", Reason: "spawn-time: " + err.Error(), SHA256: a.Manifest.Binary.SHA256, BinaryAt: a.BinaryPath})
+		return -1
+	}
+
 	// Drop a stale socket if a previous instance crashed without cleaning.
 	if _, err := os.Stat(a.SocketPath); err == nil {
 		_ = os.Remove(a.SocketPath)
diff --git a/plugin/appstore/toctou_test.go b/plugin/appstore/toctou_test.go
new file mode 100644
index 0000000..2541630
--- /dev/null
+++ b/plugin/appstore/toctou_test.go
@@ -0,0 +1,105 @@
+package appstore
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// appWithBinary writes content to <dir>/<id>/bin and returns an
+// installedApp whose manifest pins the sha256 of that content.
+func appWithBinary(t *testing.T, root, id string, content []byte) *installedApp {
+	t.Helper()
+	dir := filepath.Join(root, id)
+	if err := os.MkdirAll(dir, 0o700); err != nil {
+		t.Fatal(err)
+	}
+	binPath := filepath.Join(dir, "bin")
+	if err := os.WriteFile(binPath, content, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	sum := sha256.Sum256(content)
+	m := parseDummyManifest(t, id)
+	m.Binary.SHA256 = hex.EncodeToString(sum[:])
+	return &installedApp{
+		Dir:        dir,
+		BinaryPath: binPath,
+		SocketPath: filepath.Join(dir, "app.sock"),
+		Manifest:   m,
+	}
+}
+
+// auditContains reports whether the app's supervisor.log contains needle.
+func auditContains(t *testing.T, a *installedApp, needle string) bool {
+	t.Helper()
+	b, _ := os.ReadFile(filepath.Join(a.Dir, supervisorLogName))
+	return strings.Contains(string(b), needle)
+}
+
+// TestSpawn_RefusesSymlinkSwap covers the TOCTOU case where the resolved
+// binary is replaced by a symlink (e.g. → /bin/sh) after the scan but
+// before exec: spawn must refuse and never launch.
+func TestSpawn_RefusesSymlinkSwap(t *testing.T) {
+	t.Parallel()
+	root := t.TempDir()
+	a := appWithBinary(t, root, "io.toctou.symlink", []byte("#!/bin/true\n"))
+	sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t))
+
+	// Swap the real binary for a symlink to a host binary.
+	if err := os.Remove(a.BinaryPath); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Symlink("/bin/sh", a.BinaryPath); err != nil {
+		t.Fatal(err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+	if code := sup.spawn(ctx, a); code != -1 {
+		t.Fatalf("spawn exit code = %d, want -1 (refused)", code)
+	}
+	if !auditContains(t, a, "spawn-time") || !auditContains(t, a, "symlink") {
+		t.Errorf("expected spawn-time symlink verify-fail audit line; log missing it")
+	}
+}
+
+// TestSpawn_RefusesContentSwap covers the TOCTOU case where the binary
+// bytes change (sha256 no longer matches the pinned hash) between scan
+// and exec.
+func TestSpawn_RefusesContentSwap(t *testing.T) {
+	t.Parallel()
+	root := t.TempDir()
+	a := appWithBinary(t, root, "io.toctou.content", []byte("original"))
+	sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t))
+
+	// Swap the bytes — pinned sha256 no longer matches.
+	if err := os.WriteFile(a.BinaryPath, []byte("tampered-payload"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+	if code := sup.spawn(ctx, a); code != -1 {
+		t.Fatalf("spawn exit code = %d, want -1 (refused)", code)
+	}
+	if !auditContains(t, a, "spawn-time") || !auditContains(t, a, "sha256 mismatch") {
+		t.Errorf("expected spawn-time sha256 verify-fail audit line; log missing it")
+	}
+}
+
+// TestVerifyAtSpawn_AcceptsGoodBinary confirms the guard passes for an
+// untampered binary matching its pinned hash.
+func TestVerifyAtSpawn_AcceptsGoodBinary(t *testing.T) {
+	t.Parallel()
+	root := t.TempDir()
+	a := appWithBinary(t, root, "io.toctou.ok", []byte("good-bytes"))
+	sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t))
+	if err := sup.verifyAtSpawn(a); err != nil {
+		t.Errorf("verifyAtSpawn rejected a valid binary: %v", err)
+	}
+}
diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go
index 9050469..929fc29 100644
--- a/plugin/appstore/zz2_supervisor_lifecycle_test.go
+++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go
@@ -99,7 +99,7 @@ func TestSpawn_FastExitTriggersExitCode(t *testing.T) {
 	if err := os.MkdirAll(appDir, 0o700); err != nil {
 		t.Fatal(err)
 	}
-	path, _ := fakeBinaryScript(t, appDir, "bin", 0, 0)
+	path, sum := fakeBinaryScript(t, appDir, "bin", 0, 0)
 
 	a := &installedApp{
 		Dir:        appDir,
@@ -109,6 +109,7 @@ func TestSpawn_FastExitTriggersExitCode(t *testing.T) {
 		IDPath:     filepath.Join(appDir, "identity.json"),
 		Manifest:   parseDummyManifest(t, "io.spawn.fast"),
 	}
+	a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
 	defer cancel()
@@ -128,7 +129,7 @@ func TestSpawn_NonZeroExitPropagates(t *testing.T) {
 	if err := os.MkdirAll(appDir, 0o700); err != nil {
 		t.Fatal(err)
 	}
-	path, _ := fakeBinaryScript(t, appDir, "bin", 42, 0)
+	path, sum := fakeBinaryScript(t, appDir, "bin", 42, 0)
 
 	a := &installedApp{
 		Dir:        appDir,
@@ -138,6 +139,7 @@ func TestSpawn_NonZeroExitPropagates(t *testing.T) {
 		IDPath:     filepath.Join(appDir, "identity.json"),
 		Manifest:   parseDummyManifest(t, "io.spawn.exit42"),
 	}
+	a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
 	defer cancel()
@@ -157,14 +159,24 @@ func TestSpawn_StartFailure(t *testing.T) {
 	if err := os.MkdirAll(appDir, 0o700); err != nil {
 		t.Fatal(err)
 	}
+	// A real, executable file whose pinned sha256 matches (so it passes
+	// the spawn-time TOCTOU re-verify) but whose interpreter is missing,
+	// so execve fails at cmd.Start — exercising the spawn-fail branch.
+	binPath := filepath.Join(appDir, "bin")
+	body := []byte("#!/nonexistent/interp-xyz-12345\n")
+	if err := os.WriteFile(binPath, body, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	sum := sha256.Sum256(body)
 	a := &installedApp{
 		Dir:        appDir,
-		BinaryPath: filepath.Join(appDir, "does-not-exist"),
+		BinaryPath: binPath,
 		SocketPath: filepath.Join(appDir, "app.sock"),
 		DBPath:     filepath.Join(appDir, "data.db"),
 		IDPath:     filepath.Join(appDir, "identity.json"),
 		Manifest:   parseDummyManifest(t, "io.spawn.nostart"),
 	}
+	a.Manifest.Binary.SHA256 = hex.EncodeToString(sum[:])
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
 	defer cancel()
@@ -195,7 +207,7 @@ func TestSpawn_StaleSocketIsCleaned(t *testing.T) {
 	if err := os.WriteFile(socketPath, []byte("stale"), 0o600); err != nil {
 		t.Fatal(err)
 	}
-	path, _ := fakeBinaryScript(t, appDir, "bin", 0, 0)
+	path, sum := fakeBinaryScript(t, appDir, "bin", 0, 0)
 
 	a := &installedApp{
 		Dir:        appDir,
@@ -205,6 +217,7 @@ func TestSpawn_StaleSocketIsCleaned(t *testing.T) {
 		IDPath:     filepath.Join(appDir, "identity.json"),
 		Manifest:   parseDummyManifest(t, "io.spawn.stale"),
 	}
+	a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify
 	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
 	defer cancel()

From ee7b9472fbb0470fb3e021b5d35f137cabeaee8a Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:33:53 +0000
Subject: [PATCH 6/9] feat(extend): rate-limit hook dispatch + cap dynamic
 registrations

Add two DoS guards to pkg/extend:

- Per-app token-bucket rate limiter on hook dispatch (Registry.Run),
  enabled via SetRateLimit; aborts with ErrRateLimited when an app's
  budget is spent. Off by default (back-compat); injectable clock for
  deterministic tests.
- Cap of maxDynamicRegistrationsPerApp (32) on runtime hook
  registrations per app in DaemonHandler.Register, via Registry.CountForApp;
  refuses growth with ErrTooManyRegistrations.
---
 pkg/extend/extend.go         |  47 +++++++++++++
 pkg/extend/ratelimit.go      |  60 +++++++++++++++++
 pkg/extend/ratelimit_test.go | 126 +++++++++++++++++++++++++++++++++++
 pkg/extend/runtime.go        |  17 +++++
 4 files changed, 250 insertions(+)
 create mode 100644 pkg/extend/ratelimit.go
 create mode 100644 pkg/extend/ratelimit_test.go

diff --git a/pkg/extend/extend.go b/pkg/extend/extend.go
index a1eb8e5..0ee1c7a 100644
--- a/pkg/extend/extend.go
+++ b/pkg/extend/extend.go
@@ -115,8 +115,17 @@ type Registry struct {
 
 	mu      sync.RWMutex
 	byPoint map[HookPoint][]Extension
+
+	// limiter, when non-nil, rate-limits hook dispatch per app in Run.
+	// Nil = unlimited (default); enable via SetRateLimit.
+	limiter *rateLimiter
 }
 
+// ErrRateLimited is returned by Run when an app's hook-invocation rate
+// budget is exhausted. The chain aborts rather than dispatch into an app
+// that's being called too aggressively.
+var ErrRateLimited = errors.New("extend: hook rate limit exceeded")
+
 // NewRegistry constructs an empty Registry. dispatch is required.
 func NewRegistry(dispatch Dispatcher) *Registry {
 	if dispatch == nil {
@@ -133,6 +142,38 @@ func NewRegistry(dispatch Dispatcher) *Registry {
 	}
 }
 
+// SetRateLimit enables per-app hook-dispatch rate limiting: each app may
+// fire `burst` hook invocations instantly and `ratePerSec` sustained
+// thereafter. Once an app's budget is exhausted, Run aborts that app's
+// hook with ErrRateLimited. Call with a non-positive rate or burst to
+// disable. Intended to be set once at daemon wire-up.
+func (r *Registry) SetRateLimit(ratePerSec float64, burst int) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if ratePerSec <= 0 || burst <= 0 {
+		r.limiter = nil
+		return
+	}
+	r.limiter = newRateLimiter(ratePerSec, burst)
+}
+
+// CountForApp returns the number of currently-registered extensions
+// belonging to appID across all hook points. Used to bound how many
+// dynamic registrations a single app may hold.
+func (r *Registry) CountForApp(appID string) int {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	n := 0
+	for _, list := range r.byPoint {
+		for _, ext := range list {
+			if ext.AppID == appID {
+				n++
+			}
+		}
+	}
+	return n
+}
+
 // Register adds one extension to the registry. Returns an error if
 // the primitive is shape-invalid, the method is empty, or a flag is
 // shaped wrong. Multiple apps may register for the same hook point;
@@ -239,11 +280,17 @@ func (r *Registry) FlagsFor(p HookPoint) []FlagSpec {
 // the hook may have populated, then cleared.
 func (r *Registry) Run(ctx context.Context, p HookPoint, args HookArgs) (HookArgs, error) {
 	hooks := r.HooksFor(p)
+	r.mu.RLock()
+	lim := r.limiter
+	r.mu.RUnlock()
 	current := args
 	for _, h := range hooks {
 		if err := ctx.Err(); err != nil {
 			return current, err
 		}
+		if lim != nil && !lim.allow(h.AppID) {
+			return current, fmt.Errorf("extend %s: hook %s.%s: %w", p, h.AppID, h.Method, ErrRateLimited)
+		}
 		next, err := r.dispatch(ctx, h.AppID, h.Method, cloneArgs(current))
 		if err != nil {
 			return current, fmt.Errorf("extend %s: hook %s.%s: %w", p, h.AppID, h.Method, err)
diff --git a/pkg/extend/ratelimit.go b/pkg/extend/ratelimit.go
new file mode 100644
index 0000000..c2755c1
--- /dev/null
+++ b/pkg/extend/ratelimit.go
@@ -0,0 +1,60 @@
+package extend
+
+import (
+	"sync"
+	"time"
+)
+
+// rateLimiter is a per-key token bucket used to bound how often the
+// registry dispatches into any single app's hooks. A misbehaving or
+// hostile app whose hooks fire on every primitive cannot turn the hook
+// chain into a DoS amplifier: once its bucket is empty, further hook
+// invocations are refused until it refills.
+type rateLimiter struct {
+	mu      sync.Mutex
+	rate    float64 // tokens added per second
+	burst   float64 // bucket capacity
+	now     func() time.Time
+	buckets map[string]*tokenBucket
+}
+
+type tokenBucket struct {
+	tokens float64
+	last   time.Time
+}
+
+// newRateLimiter builds a limiter allowing `burst` events instantly and
+// `ratePerSec` sustained thereafter, per key.
+func newRateLimiter(ratePerSec float64, burst int) *rateLimiter {
+	return &rateLimiter{
+		rate:    ratePerSec,
+		burst:   float64(burst),
+		now:     time.Now,
+		buckets: map[string]*tokenBucket{},
+	}
+}
+
+// allow reports whether one event for key may proceed now, consuming a
+// token if so. Refills lazily based on elapsed wall-clock time.
+func (rl *rateLimiter) allow(key string) bool {
+	rl.mu.Lock()
+	defer rl.mu.Unlock()
+	t := rl.now()
+	b := rl.buckets[key]
+	if b == nil {
+		b = &tokenBucket{tokens: rl.burst, last: t}
+		rl.buckets[key] = b
+	}
+	if elapsed := t.Sub(b.last).Seconds(); elapsed > 0 {
+		b.tokens += elapsed * rl.rate
+		if b.tokens > rl.burst {
+			b.tokens = rl.burst
+		}
+		b.last = t
+	}
+	if b.tokens >= 1 {
+		b.tokens--
+		return true
+	}
+	return false
+}
diff --git a/pkg/extend/ratelimit_test.go b/pkg/extend/ratelimit_test.go
new file mode 100644
index 0000000..181ecc1
--- /dev/null
+++ b/pkg/extend/ratelimit_test.go
@@ -0,0 +1,126 @@
+package extend
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+	"time"
+)
+
+// noopDispatch is a Dispatcher that echoes args back with no error.
+func noopDispatch(_ context.Context, _, _ string, a HookArgs) (HookArgs, error) {
+	return a, nil
+}
+
+// TestRun_RateLimitsPerApp asserts that once an app's burst budget is
+// spent, Run refuses further hook dispatch with ErrRateLimited, and that
+// the budget refills as the (injected) clock advances.
+func TestRun_RateLimitsPerApp(t *testing.T) {
+	t.Parallel()
+	r := NewRegistry(noopDispatch)
+	if err := r.Register(Extension{AppID: "io.spammer", Primitive: PreSendMessage, Method: "h"}); err != nil {
+		t.Fatal(err)
+	}
+	// 1 token/sec, burst 3.
+	r.SetRateLimit(1, 3)
+
+	// Pin the clock so refill is deterministic.
+	base := time.Unix(1_000_000, 0)
+	now := base
+	r.mu.Lock()
+	r.limiter.now = func() time.Time { return now }
+	r.mu.Unlock()
+
+	ctx := context.Background()
+	// First 3 calls consume the burst.
+	for i := 0; i < 3; i++ {
+		if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil {
+			t.Fatalf("call %d should pass, got %v", i, err)
+		}
+	}
+	// 4th call (no time elapsed) is rate-limited.
+	if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) {
+		t.Fatalf("4th call err = %v, want ErrRateLimited", err)
+	}
+	// Advance 1s → exactly one token refills → one call passes, next fails.
+	now = base.Add(time.Second)
+	if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil {
+		t.Fatalf("post-refill call should pass, got %v", err)
+	}
+	if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) {
+		t.Fatalf("call after refill drained err = %v, want ErrRateLimited", err)
+	}
+}
+
+// TestRun_RateLimitIsPerApp confirms one app exhausting its budget does
+// not block a different app's hooks.
+func TestRun_RateLimitIsPerApp(t *testing.T) {
+	t.Parallel()
+	r := NewRegistry(noopDispatch)
+	_ = r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h", Order: 1})
+	_ = r.Register(Extension{AppID: "b", Primitive: PostRecvMessage, Method: "h", Order: 1})
+	r.SetRateLimit(1, 1)
+	now := time.Unix(2_000_000, 0)
+	r.mu.Lock()
+	r.limiter.now = func() time.Time { return now }
+	r.mu.Unlock()
+
+	ctx := context.Background()
+	// Drain app "a".
+	if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) {
+		t.Fatalf("app a should be limited, got %v", err)
+	}
+	// App "b" still has its own full budget.
+	if _, err := r.Run(ctx, PostRecvMessage, HookArgs{}); err != nil {
+		t.Fatalf("app b should not be limited, got %v", err)
+	}
+}
+
+// TestRun_NoLimiterByDefault confirms back-compat: without SetRateLimit,
+// many invocations all pass.
+func TestRun_NoLimiterByDefault(t *testing.T) {
+	t.Parallel()
+	r := NewRegistry(noopDispatch)
+	_ = r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h"})
+	for i := 0; i < 1000; i++ {
+		if _, err := r.Run(context.Background(), PreSendMessage, HookArgs{}); err != nil {
+			t.Fatalf("unlimited registry should never rate-limit, got %v at %d", err, i)
+		}
+	}
+}
+
+// TestDaemonHandler_CapsDynamicRegistrations asserts an app cannot exceed
+// maxDynamicRegistrationsPerApp dynamic hook registrations.
+func TestDaemonHandler_CapsDynamicRegistrations(t *testing.T) {
+	t.Parallel()
+	reg := NewRegistry(noopDispatch)
+	h := NewDaemonHandler(reg, AllowAll)
+
+	for i := 0; i < maxDynamicRegistrationsPerApp; i++ {
+		err := h.Register("io.greedy", Extension{
+			Primitive: PreSendMessage,
+			Method:    fmt.Sprintf("m%d", i),
+		})
+		if err != nil {
+			t.Fatalf("registration %d should succeed, got %v", i, err)
+		}
+	}
+	// One past the cap must be refused.
+	err := h.Register("io.greedy", Extension{Primitive: PreSendMessage, Method: "overflow"})
+	if !errors.Is(err, ErrTooManyRegistrations) {
+		t.Fatalf("over-cap registration err = %v, want ErrTooManyRegistrations", err)
+	}
+	// A different app is unaffected.
+	if err := h.Register("io.modest", Extension{Primitive: PreSendMessage, Method: "m"}); err != nil {
+		t.Fatalf("other app should still register, got %v", err)
+	}
+	// After unregistering one of greedy's hooks, it can register again.
+	reg.UnregisterOne("io.greedy", PreSendMessage, "m0")
+	if err := h.Register("io.greedy", Extension{Primitive: PreSendMessage, Method: "again"}); err != nil {
+		t.Fatalf("after freeing a slot, registration should succeed, got %v", err)
+	}
+}
diff --git a/pkg/extend/runtime.go b/pkg/extend/runtime.go
index b077f91..8434d2f 100644
--- a/pkg/extend/runtime.go
+++ b/pkg/extend/runtime.go
@@ -36,6 +36,18 @@ var DenyAll Permission = PermissionFunc(func(string, HookPoint) bool { return fa
 // configured Permission rejects the request.
 var ErrPermissionDenied = errors.New("extend: permission denied")
 
+// maxDynamicRegistrationsPerApp bounds how many hooks a single app may
+// hold at once via the runtime-register IPC. Without a cap, an app
+// permitted to register dynamically could register unboundedly and
+// exhaust memory / slow every primitive's hook lookup — a DoS. The
+// cap is generous (a well-behaved app registers a handful of hooks)
+// while refusing pathological growth.
+const maxDynamicRegistrationsPerApp = 32
+
+// ErrTooManyRegistrations is returned by DaemonHandler.Register when an
+// app already holds maxDynamicRegistrationsPerApp hooks.
+var ErrTooManyRegistrations = errors.New("extend: too many dynamic registrations for app")
+
 // DaemonHandler is the runtime-side IPC surface the daemon exposes to
 // installed apps so they can add/remove their own hooks dynamically
 // (within the bounds Permission allows). Apps call these methods via
@@ -68,6 +80,11 @@ func (h *DaemonHandler) Register(appID string, ext Extension) error {
 	if !h.perms.CanRegister(appID, ext.Primitive) {
 		return fmt.Errorf("%w: %s cannot register %s", ErrPermissionDenied, appID, ext.Primitive)
 	}
+	// Bound the number of hooks one app may hold to prevent unbounded
+	// dynamic registration (memory + per-primitive lookup-cost DoS).
+	if h.reg.CountForApp(appID) >= maxDynamicRegistrationsPerApp {
+		return fmt.Errorf("%w: %s holds %d (max %d)", ErrTooManyRegistrations, appID, h.reg.CountForApp(appID), maxDynamicRegistrationsPerApp)
+	}
 	return h.reg.Register(ext)
 }
 

From 6c8989db8a606c3067074af3b4fac582a3706edb Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 14:35:12 +0000
Subject: [PATCH 7/9] style: gofmt grants_test.go

---
 pkg/manifest/grants_test.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pkg/manifest/grants_test.go b/pkg/manifest/grants_test.go
index c90499c..5ccbd6d 100644
--- a/pkg/manifest/grants_test.go
+++ b/pkg/manifest/grants_test.go
@@ -33,13 +33,13 @@ func TestHasGrant(t *testing.T) {
 		want        bool
 	}
 	for _, c := range []tc{
-		{"ipc.call", "io.pilot.wallet.pay", true},   // exact
+		{"ipc.call", "io.pilot.wallet.pay", true},     // exact
 		{"ipc.call", "io.pilot.wallet.refund", false}, // exact mismatch
-		{"ipc.call", "io.pilot.notes.add", true},    // prefix.*
-		{"ipc.call", "io.pilot.notes.list", true},   // prefix.*
-		{"ipc.call", "io.pilot.notesX.add", false},  // prefix must end at dot
-		{"fs.read", "anything.at.all", true},        // blanket *
-		{"net.dial", "io.pilot.wallet.pay", false},  // cap mismatch
+		{"ipc.call", "io.pilot.notes.add", true},      // prefix.*
+		{"ipc.call", "io.pilot.notes.list", true},     // prefix.*
+		{"ipc.call", "io.pilot.notesX.add", false},    // prefix must end at dot
+		{"fs.read", "anything.at.all", true},          // blanket *
+		{"net.dial", "io.pilot.wallet.pay", false},    // cap mismatch
 	} {
 		if got := m.HasGrant(c.cap, c.target); got != c.want {
 			t.Errorf("HasGrant(%q,%q) = %v, want %v", c.cap, c.target, got, c.want)

From f1493456505f7f6ac5139276ba488d60eb6ed518 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 15:04:26 +0000
Subject: [PATCH 8/9] docs(changelog): note broker + supervisor hardening for
 rc1

---
 CHANGELOG.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5df6f64..b70e90d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,29 @@ First release candidate. Surface is end-to-end working for the
 Production deployment against an untrusted publisher requires the
 catalog-signing chain landed in a follow-up RC (see "Known gaps").
 
+### Security & hardening — broker + supervisor
+
+- **Broker authorization (deny-by-default).** Every brokered call is
+  gated before dialing an app socket: the method must be in the target's
+  `exposes` set, and cross-app `ipc.call` callers must hold a matching
+  grant (`<app>.<method>`, `<app>.*`, or `*`). New `Service.CallFrom` and
+  `manifest.ExposesMethod` / `HasGrant`; errors `ErrMethodNotExposed`,
+  `ErrGrantMissing`.
+- **TOCTOU re-verification at spawn.** The binary is re-checked
+  immediately before `exec` (symlink rejection + sha256), closing the gap
+  between install-scan and launch.
+- **Exponential verify-fail backoff.** Verification failures retry with
+  capped exponential backoff (was a fixed 30s), consistent with crash-loop
+  handling.
+- **Multi-generation audit log rotation.** `supervisor.log` rotates across
+  N generations (`AuditLogMaxBackups`, default 3) instead of a single step.
+- **Address-space cap (Linux).** Spawned apps get `RLIMIT_AS` alongside
+  `RLIMIT_NOFILE`, configurable via `Config.ChildMemoryLimitBytes`
+  (default 4 GiB); no-op on non-Linux.
+- **Extension DoS guards.** `pkg/extend` adds per-app hook-dispatch rate
+  limiting (`Registry.SetRateLimit`, `ErrRateLimited`) and a per-app cap on
+  dynamic registrations (`ErrTooManyRegistrations`).
+
 ### Added — pilotctl
 
 `pilotctl appstore` is the operator surface for the app store.

From 8fead51fbc28292832b961e48ff650435ab71bc9 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 15:18:57 +0000
Subject: [PATCH 9/9] test(appstore): cover Service.CallFrom, rate-limit
 disable, rotate floor

Close the codecov patch gap: Service.CallFrom (started + not-started),
SetRateLimit disable branch, and rotateGenerations keep<1 floor.
---
 pkg/extend/ratelimit_disable_test.go     | 25 +++++++++++++++++
 plugin/appstore/callfrom_service_test.go | 35 ++++++++++++++++++++++++
 plugin/appstore/rotation_floor_test.go   | 22 +++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 pkg/extend/ratelimit_disable_test.go
 create mode 100644 plugin/appstore/callfrom_service_test.go
 create mode 100644 plugin/appstore/rotation_floor_test.go

diff --git a/pkg/extend/ratelimit_disable_test.go b/pkg/extend/ratelimit_disable_test.go
new file mode 100644
index 0000000..fe584a3
--- /dev/null
+++ b/pkg/extend/ratelimit_disable_test.go
@@ -0,0 +1,25 @@
+package extend
+
+import (
+	"context"
+	"testing"
+)
+
+// TestSetRateLimit_DisableClearsLimiter covers the disable branch
+// (non-positive rate/burst clears the limiter): a previously-enabled
+// limiter is removed and Run stops rate-limiting.
+func TestSetRateLimit_DisableClearsLimiter(t *testing.T) {
+	t.Parallel()
+	r := NewRegistry(noopDispatch)
+	if err := r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h"}); err != nil {
+		t.Fatal(err)
+	}
+	r.SetRateLimit(1, 1) // enable, tiny budget
+	r.SetRateLimit(0, 0) // disable again
+
+	for i := 0; i < 50; i++ {
+		if _, err := r.Run(context.Background(), PreSendMessage, HookArgs{}); err != nil {
+			t.Fatalf("disabled limiter must not block (call %d): %v", i, err)
+		}
+	}
+}
diff --git a/plugin/appstore/callfrom_service_test.go b/plugin/appstore/callfrom_service_test.go
new file mode 100644
index 0000000..b7136a1
--- /dev/null
+++ b/plugin/appstore/callfrom_service_test.go
@@ -0,0 +1,35 @@
+package appstore
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+)
+
+// TestService_CallFrom_NotStarted covers the nil-supervisor guard.
+func TestService_CallFrom_NotStarted(t *testing.T) {
+	t.Parallel()
+	err := (&Service{}).CallFrom(context.Background(), "io.caller", "io.target", "m", nil, nil)
+	if err == nil {
+		t.Fatal("expected 'service not started' error")
+	}
+}
+
+// TestService_CallFrom_DelegatesToSupervisor drives the started-service
+// path: an unknown target returns ErrAppNotInstalled, proving CallFrom
+// delegates to the supervisor gate.
+func TestService_CallFrom_DelegatesToSupervisor(t *testing.T) {
+	t.Parallel()
+	s := NewService(Config{InstallRoot: t.TempDir()})
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	if err := s.Start(ctx, Deps{}); err != nil {
+		t.Fatalf("Start: %v", err)
+	}
+	defer s.Stop(ctx)
+
+	if err := s.CallFrom(ctx, "io.caller", "io.unknown", "m", nil, nil); !errors.Is(err, ErrAppNotInstalled) {
+		t.Fatalf("CallFrom unknown app: err = %v, want ErrAppNotInstalled", err)
+	}
+}
diff --git a/plugin/appstore/rotation_floor_test.go b/plugin/appstore/rotation_floor_test.go
new file mode 100644
index 0000000..6379428
--- /dev/null
+++ b/plugin/appstore/rotation_floor_test.go
@@ -0,0 +1,22 @@
+package appstore
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestRotateGenerations_KeepFloorIsOne covers the keep<1 guard: a
+// non-positive keep is floored to 1, so the active log still rotates to .1.
+func TestRotateGenerations_KeepFloorIsOne(t *testing.T) {
+	t.Parallel()
+	dir := t.TempDir()
+	sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t))
+	if err := os.WriteFile(filepath.Join(dir, supervisorLogName), []byte("X"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	sup.rotateGenerations(dir, 0) // keep<1 → floored to 1
+	if _, err := os.Stat(filepath.Join(dir, supervisorLogName+".1")); err != nil {
+		t.Errorf("expected supervisor.log.1 after floored rotation: %v", err)
+	}
+}