From 308207373d2ab260a451a5a40ce7f0eba6dc9bf6 Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:20:50 +0000 Subject: [PATCH 1/9] feat(appstore): enforce broker exposes + cross-app ipc.call grants Add a deny-by-default authorization gate to the app-store broker: - manifest.ExposesMethod / HasGrant helpers (cap+target matching with exact, prefix-wildcard, and blanket forms). - supervisor.callFrom enforces, before any dial: target installed, method in exposes (all callers), and a matching ipc.call grant for cross-app callers. New typed errors ErrMethodNotExposed/ErrGrantMissing. - Service.CallFrom exposes the caller-aware entry point; Call stays the trusted daemon/pilotctl path. Update existing call-path tests to declare exposed methods. --- pkg/manifest/grants.go | 58 +++++++++ pkg/manifest/grants_test.go | 67 ++++++++++ plugin/appstore/grant_test.go | 114 ++++++++++++++++++ plugin/appstore/service.go | 18 +++ plugin/appstore/supervisor.go | 60 ++++++++- .../appstore/zz2_supervisor_lifecycle_test.go | 4 +- plugin/appstore/zz3_supervisor_call_test.go | 8 +- plugin/appstore/zz_supervisor_simple_test.go | 4 +- 8 files changed, 325 insertions(+), 8 deletions(-) create mode 100644 pkg/manifest/grants.go create mode 100644 pkg/manifest/grants_test.go create mode 100644 plugin/appstore/grant_test.go diff --git a/pkg/manifest/grants.go b/pkg/manifest/grants.go new file mode 100644 index 0000000..958fd90 --- /dev/null +++ b/pkg/manifest/grants.go @@ -0,0 +1,58 @@ +package manifest + +import "strings" + +// ExposesMethod reports whether method appears in the manifest's Exposes +// list. Exposes is the app's entire broker surface: the daemon's app-store +// broker dispatches a method into an app only if the app explicitly +// exposes it. An empty Exposes list therefore means "no broker-callable +// methods" — fail-closed. +func (m *Manifest) ExposesMethod(method string) bool { + for _, e := range m.Exposes { + if e == method { + return true + } + } + return false +} + +// HasGrant reports whether the manifest declares a grant whose capability +// equals capName and whose target matches the requested target. Used by +// the broker to authorize cross-app ipc.call dispatch: the calling app +// must declare an `ipc.call` grant targeting the specific "." +// it wants to reach. +// +// Target matching supports three forms, in order of generality: +// - "*" matches any target (blanket grant) +// - ".*" matches any target sharing the prefix (e.g. +// "io.pilot.wallet.*" matches "io.pilot.wallet.pay") +// - exact the grant target equals the requested target verbatim +// +// Conditions on the grant are NOT evaluated here — HasGrant answers only +// "is this capability+target declared". Per-request condition evaluation +// (rate, consent, time-window, …) is a separate, later gate. +func (m *Manifest) HasGrant(capName, target string) bool { + for _, g := range m.Grants { + if g.Cap == capName && matchGrantTarget(g.Target, target) { + return true + } + } + return false +} + +// matchGrantTarget implements the target pattern rules documented on +// HasGrant. +func matchGrantTarget(pattern, target string) bool { + switch { + case pattern == "*": + return true + case pattern == target: + return true + case strings.HasSuffix(pattern, ".*"): + // Keep the trailing dot so "io.app.*" matches "io.app.x" but + // not "io.apple.x". + return strings.HasPrefix(target, strings.TrimSuffix(pattern, "*")) + default: + return false + } +} diff --git a/pkg/manifest/grants_test.go b/pkg/manifest/grants_test.go new file mode 100644 index 0000000..c90499c --- /dev/null +++ b/pkg/manifest/grants_test.go @@ -0,0 +1,67 @@ +package manifest + +import "testing" + +func TestExposesMethod(t *testing.T) { + m := &Manifest{Exposes: []string{"a.read", "a.write"}} + cases := map[string]bool{ + "a.read": true, + "a.write": true, + "a.admin": false, + "": false, + } + for method, want := range cases { + if got := m.ExposesMethod(method); got != want { + t.Errorf("ExposesMethod(%q) = %v, want %v", method, got, want) + } + } + // Empty exposes is fail-closed: nothing is callable. + empty := &Manifest{} + if empty.ExposesMethod("a.read") { + t.Errorf("empty exposes should expose nothing") + } +} + +func TestHasGrant(t *testing.T) { + m := &Manifest{Grants: []Grant{ + {Cap: "ipc.call", Target: "io.pilot.wallet.pay"}, + {Cap: "ipc.call", Target: "io.pilot.notes.*"}, + {Cap: "fs.read", Target: "*"}, + }} + type tc struct { + cap, target string + want bool + } + for _, c := range []tc{ + {"ipc.call", "io.pilot.wallet.pay", true}, // exact + {"ipc.call", "io.pilot.wallet.refund", false}, // exact mismatch + {"ipc.call", "io.pilot.notes.add", true}, // prefix.* + {"ipc.call", "io.pilot.notes.list", true}, // prefix.* + {"ipc.call", "io.pilot.notesX.add", false}, // prefix must end at dot + {"fs.read", "anything.at.all", true}, // blanket * + {"net.dial", "io.pilot.wallet.pay", false}, // cap mismatch + } { + if got := m.HasGrant(c.cap, c.target); got != c.want { + t.Errorf("HasGrant(%q,%q) = %v, want %v", c.cap, c.target, got, c.want) + } + } +} + +func TestMatchGrantTarget(t *testing.T) { + cases := []struct { + pattern, target string + want bool + }{ + {"*", "x.y", true}, + {"x.y", "x.y", true}, + {"x.*", "x.y", true}, + {"x.*", "x.y.z", true}, + {"x.*", "xy.z", false}, // trailing dot kept → "xy.z" lacks "x." prefix + {"x.y", "x.z", false}, + } + for _, c := range cases { + if got := matchGrantTarget(c.pattern, c.target); got != c.want { + t.Errorf("matchGrantTarget(%q,%q) = %v, want %v", c.pattern, c.target, got, c.want) + } + } +} diff --git a/plugin/appstore/grant_test.go b/plugin/appstore/grant_test.go new file mode 100644 index 0000000..76a1438 --- /dev/null +++ b/plugin/appstore/grant_test.go @@ -0,0 +1,114 @@ +package appstore + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/pilot-protocol/app-store/pkg/manifest" +) + +// installTarget registers a ready target app exposing the given methods. +func installTarget(sup *supervisor, id string, exposes ...string) { + sup.mu.Lock() + sup.installed[id] = &installedApp{ + Dir: "/nonexistent/" + id, + SocketPath: "/nonexistent/" + id + "/app.sock", + Manifest: &manifest.Manifest{ID: id, Exposes: exposes}, + } + sup.ready[id] = true + sup.mu.Unlock() +} + +// installCaller registers a caller app with the given ipc.call grants. +func installCaller(sup *supervisor, id string, grantTargets ...string) { + var grants []manifest.Grant + for _, gt := range grantTargets { + grants = append(grants, manifest.Grant{Cap: "ipc.call", Target: gt}) + } + sup.mu.Lock() + sup.installed[id] = &installedApp{ + Dir: "/nonexistent/" + id, + Manifest: &manifest.Manifest{ID: id, Grants: grants}, + } + sup.mu.Unlock() +} + +// TestCallFrom_Gates exercises every authorization branch of callFrom +// without needing a live socket: each denial returns before the dial, +// and a fully-authorized call falls through to ErrAppNotReady only when +// the target socket is absent (here it dials a nonexistent path, so we +// assert the gate was passed by checking the error is NOT a gate error). +func TestCallFrom_Gates(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + t.Run("unknown target → ErrAppNotInstalled", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + err := sup.CallFrom(ctx, "io.caller", "io.missing", "ping", nil, nil) + if !errors.Is(err, ErrAppNotInstalled) { + t.Fatalf("err = %v, want ErrAppNotInstalled", err) + } + }) + + t.Run("method not exposed → ErrMethodNotExposed", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + err := sup.CallFrom(ctx, "", "io.target", "io.target.secret", nil, nil) + if !errors.Is(err, ErrMethodNotExposed) { + t.Fatalf("err = %v, want ErrMethodNotExposed", err) + } + }) + + t.Run("trusted caller skips grant gate", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + // Empty callerID = trusted; exposed method → passes gates, then + // fails at dial (socket path is nonexistent). Must NOT be a gate err. + err := sup.CallFrom(ctx, "", "io.target", "ping", nil, nil) + if errors.Is(err, ErrGrantMissing) || errors.Is(err, ErrMethodNotExposed) || errors.Is(err, ErrAppNotInstalled) { + t.Fatalf("trusted call should pass gates, got gate err: %v", err) + } + }) + + t.Run("cross-app caller not installed → ErrGrantMissing", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + err := sup.CallFrom(ctx, "io.ghost", "io.target", "ping", nil, nil) + if !errors.Is(err, ErrGrantMissing) { + t.Fatalf("err = %v, want ErrGrantMissing", err) + } + }) + + t.Run("cross-app caller without matching grant → ErrGrantMissing", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + installCaller(sup, "io.caller", "io.other.method") // wrong target + err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil) + if !errors.Is(err, ErrGrantMissing) { + t.Fatalf("err = %v, want ErrGrantMissing", err) + } + }) + + t.Run("cross-app caller with matching grant passes gates", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + installCaller(sup, "io.caller", "io.target.ping") + err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil) + if errors.Is(err, ErrGrantMissing) || errors.Is(err, ErrMethodNotExposed) { + t.Fatalf("authorized call should pass gates, got: %v", err) + } + }) + + t.Run("cross-app caller with wildcard grant passes gates", func(t *testing.T) { + sup := newSupervisor(Config{}, Deps{}, newQuietLogger(t)) + installTarget(sup, "io.target", "ping") + installCaller(sup, "io.caller", "io.target.*") + err := sup.CallFrom(ctx, "io.caller", "io.target", "ping", nil, nil) + if errors.Is(err, ErrGrantMissing) { + t.Fatalf("wildcard grant should authorize, got: %v", err) + } + }) +} diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go index 2979472..2a00466 100644 --- a/plugin/appstore/service.go +++ b/plugin/appstore/service.go @@ -234,3 +234,21 @@ func (s *Service) Call(ctx context.Context, appID, method string, args, out any) } return sup.Call(ctx, appID, method, args, out) } + +// CallFrom is the cross-app broker entry point. callerID identifies the +// installed app making the request; the supervisor authorizes the call +// against that app's manifest grants (it must declare an `ipc.call` grant +// targeting "."). Pass an empty callerID for trusted +// daemon/pilotctl calls — see Call. +// +// Returns ErrAppNotInstalled, ErrMethodNotExposed, ErrGrantMissing, or +// ErrAppNotReady for the gate failures; otherwise the app's IPC response. +func (s *Service) CallFrom(ctx context.Context, callerID, appID, method string, args, out any) error { + s.startMu.Lock() + sup := s.sup + s.startMu.Unlock() + if sup == nil { + return errors.New("appstore: service not started") + } + return sup.CallFrom(ctx, callerID, appID, method, args, out) +} diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go index f653992..9e05a6d 100644 --- a/plugin/appstore/supervisor.go +++ b/plugin/appstore/supervisor.go @@ -860,6 +860,18 @@ var ErrAppNotInstalled = errors.New("appstore: app not installed") // appeared yet (still starting up, or it crashed and hasn't respawned). var ErrAppNotReady = errors.New("appstore: app not ready") +// ErrMethodNotExposed is returned by Call/CallFrom when the target app's +// manifest does not list the requested method in its `exposes` set. The +// exposes list is the app's entire broker surface, so calling anything +// else is denied — even for trusted (daemon/pilotctl) callers. +var ErrMethodNotExposed = errors.New("appstore: method not exposed by app") + +// ErrGrantMissing is returned by CallFrom when a cross-app caller does +// not hold an `ipc.call` grant whose target matches ".". +// Deny-by-default: an app may only reach methods it declared at install +// time, and the user reviewed, in its manifest grants. +var ErrGrantMissing = errors.New("appstore: caller lacks ipc.call grant for target") + // Apps returns the public-shaped summary of every installed app, in // arbitrary order. Used by pilotctl's `appstore list` and by other // apps that want to introspect what's available. @@ -894,18 +906,58 @@ func (s *supervisor) Get(appID string) *installedApp { } // Call dispatches method+args into the named installed app via its -// app.sock. The connection is dialed per-call — simple and lets the -// app's own concurrency handle multiple in-flight calls. Returns -// ErrAppNotInstalled / ErrAppNotReady for the obvious failure modes, -// otherwise propagates the app's IPC response (or its error). +// app.sock on behalf of a trusted caller (the daemon or pilotctl). It is +// CallFrom with an empty callerID — the broker-surface (exposes) gate +// still applies, but no cross-app ipc.call grant is required. func (s *supervisor) Call(ctx context.Context, appID, method string, args, out any) error { + return s.callFrom(ctx, "", appID, method, args, out) +} + +// CallFrom dispatches method+args into the named installed app on behalf +// of callerID. When callerID is non-empty the call is treated as a +// cross-app ipc.call and is authorized against the caller's manifest +// grants. When callerID is empty the caller is trusted (daemon/pilotctl) +// and only the broker-surface gate applies. +// +// Authorization, all enforced before any socket is dialed: +// 1. target app must be installed → ErrAppNotInstalled +// 2. method must be in the target's exposes → ErrMethodNotExposed +// 3. cross-app only: caller must be installed and hold an `ipc.call` +// grant matching "." → ErrGrantMissing +func (s *supervisor) CallFrom(ctx context.Context, callerID, appID, method string, args, out any) error { + return s.callFrom(ctx, callerID, appID, method, args, out) +} + +// callFrom is the shared implementation behind Call and CallFrom. The +// connection is dialed per-call — simple and lets the app's own +// concurrency handle multiple in-flight calls. Returns the typed gate +// errors above, or otherwise propagates the app's IPC response/error. +func (s *supervisor) callFrom(ctx context.Context, callerID, appID, method string, args, out any) error { s.mu.RLock() app, ok := s.installed[appID] + caller, callerOK := s.installed[callerID] ready := s.ready[appID] s.mu.RUnlock() if !ok { return fmt.Errorf("%w: %s", ErrAppNotInstalled, appID) } + // Broker-surface gate: only methods the app explicitly exposes are + // dispatchable, for every caller including the daemon itself. + if !app.Manifest.ExposesMethod(method) { + return fmt.Errorf("%w: %s.%s", ErrMethodNotExposed, appID, method) + } + // Cross-app grant gate: a calling app must have declared an ipc.call + // grant targeting the specific method it wants to invoke. Trusted + // callers (empty callerID) skip this — they are the broker itself. + if callerID != "" { + if !callerOK { + return fmt.Errorf("%w: caller %q not installed", ErrGrantMissing, callerID) + } + target := appID + "." + method + if !caller.Manifest.HasGrant("ipc.call", target) { + return fmt.Errorf("%w: %s -> %s", ErrGrantMissing, callerID, target) + } + } if !ready { // Give it a brief moment in case it just spawned. if !s.awaitReady(ctx, appID, 1*time.Second) { diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go index 7b5a1a7..68e1c19 100644 --- a/plugin/appstore/zz2_supervisor_lifecycle_test.go +++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go @@ -668,12 +668,14 @@ func TestSupervisor_Call_DialFailsWhenNoServer(t *testing.T) { if err := os.MkdirAll(appDir, 0o700); err != nil { t.Fatal(err) } + md := parseDummyManifest(t, "io.dial.fail") + md.Exposes = []string{"any"} sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) sup.mu.Lock() sup.installed["io.dial.fail"] = &installedApp{ Dir: appDir, SocketPath: filepath.Join(appDir, "missing.sock"), - Manifest: parseDummyManifest(t, "io.dial.fail"), + Manifest: md, } sup.ready["io.dial.fail"] = true sup.mu.Unlock() diff --git a/plugin/appstore/zz3_supervisor_call_test.go b/plugin/appstore/zz3_supervisor_call_test.go index 37a0ad4..3b6ecf6 100644 --- a/plugin/appstore/zz3_supervisor_call_test.go +++ b/plugin/appstore/zz3_supervisor_call_test.go @@ -77,12 +77,14 @@ func TestSupervisor_Call_HappyPath(t *testing.T) { }) defer cleanup() + mh := parseDummyManifest(t, "io.call.happy") + mh.Exposes = []string{"echo"} sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) sup.mu.Lock() sup.installed["io.call.happy"] = &installedApp{ Dir: appDir, SocketPath: socketPath, - Manifest: parseDummyManifest(t, "io.call.happy"), + Manifest: mh, } sup.ready["io.call.happy"] = true sup.mu.Unlock() @@ -114,12 +116,14 @@ func TestSupervisor_Call_PropagatesServerError(t *testing.T) { }) defer cleanup() + me := parseDummyManifest(t, "io.call.err") + me.Exposes = []string{"boom"} sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) sup.mu.Lock() sup.installed["io.call.err"] = &installedApp{ Dir: appDir, SocketPath: socketPath, - Manifest: parseDummyManifest(t, "io.call.err"), + Manifest: me, } sup.ready["io.call.err"] = true sup.mu.Unlock() diff --git a/plugin/appstore/zz_supervisor_simple_test.go b/plugin/appstore/zz_supervisor_simple_test.go index 4a93c2f..5b9fdba 100644 --- a/plugin/appstore/zz_supervisor_simple_test.go +++ b/plugin/appstore/zz_supervisor_simple_test.go @@ -59,10 +59,12 @@ func TestSupervisor_Call_NotReady(t *testing.T) { t.Parallel() dir := t.TempDir() sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) + m := parseDummyManifest(t, "io.test.app") + m.Exposes = []string{"method"} // method must be exposed to reach the ready gate sup.mu.Lock() sup.installed["io.test.app"] = &installedApp{ Dir: dir, - Manifest: parseDummyManifest(t, "io.test.app"), + Manifest: m, } sup.mu.Unlock() From d0f820ec75dd1040febda2c535cca2f059f1b7ad Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:22:13 +0000 Subject: [PATCH 2/9] feat(appstore): exponential backoff for verify-fail retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the fixed 30s verify-fail retry sleep with a capped exponential ramp (1s→2s→…→30s) that resets on a successful verify, consistent with crash-loop restart handling. Extract the shared nextBackoff helper and unit-test that it grows and saturates at the cap. --- plugin/appstore/backoff_test.go | 43 +++++++++++++++++++++++++++++++++ plugin/appstore/supervisor.go | 33 ++++++++++++++++++------- 2 files changed, 67 insertions(+), 9 deletions(-) create mode 100644 plugin/appstore/backoff_test.go diff --git a/plugin/appstore/backoff_test.go b/plugin/appstore/backoff_test.go new file mode 100644 index 0000000..3d5f553 --- /dev/null +++ b/plugin/appstore/backoff_test.go @@ -0,0 +1,43 @@ +package appstore + +import ( + "testing" + "time" +) + +// TestNextBackoff_GrowsAndCaps asserts the verify-fail / crash-restart +// backoff doubles each step and saturates at the cap — i.e. it is not a +// fixed interval. +func TestNextBackoff_GrowsAndCaps(t *testing.T) { + t.Parallel() + const max = 30 * time.Second + want := []time.Duration{ + 2 * time.Second, // 1 → 2 + 4 * time.Second, // 2 → 4 + 8 * time.Second, // 4 → 8 + 16 * time.Second, // 8 → 16 + max, // 16 → 32 capped to 30 + max, // stays at cap + } + cur := time.Second + for i, w := range want { + cur = nextBackoff(cur, max) + if cur != w { + t.Fatalf("step %d: got %s, want %s", i, cur, w) + } + } + + // Strictly grows until the cap, never exceeds it. + prev := time.Duration(0) + c := time.Second + for i := 0; i < 20; i++ { + c = nextBackoff(c, max) + if c < prev { + t.Fatalf("backoff decreased at step %d: %s < %s", i, c, prev) + } + if c > max { + t.Fatalf("backoff exceeded cap at step %d: %s > %s", i, c, max) + } + prev = c + } +} diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go index 9e05a6d..2ab18a8 100644 --- a/plugin/appstore/supervisor.go +++ b/plugin/appstore/supervisor.go @@ -637,6 +637,16 @@ func atoiOrZero(s string) int { return n } +// nextBackoff doubles cur, capping at max. Shared by superviseOne's +// crash-restart and verify-fail backoff ramps so both grow identically. +func nextBackoff(cur, max time.Duration) time.Duration { + next := cur * 2 + if next > max { + return max + } + return next +} + // superviseOne runs one app forever, respawning on exit until ctx is canceled. func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) { // Mark the start + end of supervision. The pair tells forensics @@ -671,6 +681,11 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) { }() backoff := time.Second const maxBackoff = 30 * time.Second + // verifyBackoff grows independently of the crash-restart backoff so a + // binary that fails sha256 verification doesn't hammer the disk every + // 30s — it ramps 1s→2s→4s…→30s, consistent with crash-loop handling, + // and resets the moment a verification succeeds. + verifyBackoff := time.Second verifyFails := 0 for { if err := ctx.Err(); err != nil { @@ -690,17 +705,20 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) { s.markSuspended(a.Manifest.ID) return } - // A bad sha256 may be transient; wait + retry in case - // the user fixes it (e.g. re-install). + // A bad sha256 may be transient; wait + retry with capped + // exponential backoff in case the user fixes it (re-install). select { case <-ctx.Done(): return - case <-time.After(maxBackoff): - continue + case <-time.After(verifyBackoff): } + verifyBackoff = nextBackoff(verifyBackoff, maxBackoff) + continue } - // verify passed — reset the consecutive-fail counter. + // verify passed — reset the consecutive-fail counter and the + // verify backoff so a later transient failure starts fresh. verifyFails = 0 + verifyBackoff = time.Second exitCode := s.spawn(ctx, a) s.markNotReady(a.Manifest.ID) s.writeAuditLine(a, auditEvent{Event: "exit", ExitCode: exitCode}) @@ -728,10 +746,7 @@ func (s *supervisor) superviseOne(ctx context.Context, a *installedApp) { return case <-time.After(backoff): } - backoff *= 2 - if backoff > maxBackoff { - backoff = maxBackoff - } + backoff = nextBackoff(backoff, maxBackoff) } } From eb69ed0285e459f144c0194949c7675eefd5a766 Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:24:39 +0000 Subject: [PATCH 3/9] feat(appstore): keep N rotated audit log generations Replace single-step supervisor.log->.1 rotation with an N-generation shift (.1..N), discarding the oldest. Add Config.AuditLogMaxBackups (default 3); worst-case footprint is (backups+1) x AuditLogMaxBytes per app. Cover generation count and shift semantics with tests. --- plugin/appstore/rotation_test.go | 72 ++++++++++++++++++++++++++++++++ plugin/appstore/service.go | 12 ++++-- plugin/appstore/supervisor.go | 69 +++++++++++++++++++++++------- 3 files changed, 135 insertions(+), 18 deletions(-) create mode 100644 plugin/appstore/rotation_test.go diff --git a/plugin/appstore/rotation_test.go b/plugin/appstore/rotation_test.go new file mode 100644 index 0000000..cbfbc56 --- /dev/null +++ b/plugin/appstore/rotation_test.go @@ -0,0 +1,72 @@ +package appstore + +import ( + "fmt" + "os" + "path/filepath" + "testing" +) + +// TestAuditLogKeepsNGenerations drives repeated rotations and asserts the +// supervisor keeps exactly AuditLogMaxBackups rotated files +// (supervisor.log.1 .. .N), never more, and that .1 always holds the most +// recently rotated content. +func TestAuditLogKeepsNGenerations(t *testing.T) { + t.Parallel() + dir := t.TempDir() + app := &installedApp{Dir: dir, Manifest: parseDummyManifest(t, "io.test.app")} + const backups = 3 + sup := newSupervisor(Config{ + InstallRoot: dir, + AuditLogMaxBytes: 200, + AuditLogMaxBackups: backups, + }, Deps{}, newQuietLogger(t)) + + // Write far more than enough to trigger several rotations. + for i := 0; i < 200; i++ { + sup.writeAuditLine(app, auditEvent{Event: "spawn", PID: i, SHA256: "abc"}) + } + + // Exactly `backups` rotated generations must exist: .1 .. .N. + for i := 1; i <= backups; i++ { + p := filepath.Join(dir, fmt.Sprintf("%s.%d", supervisorLogName, i)) + if _, err := os.Stat(p); err != nil { + t.Errorf("expected rotated generation %s to exist: %v", p, err) + } + } + // .N+1 must NOT exist — the oldest is discarded. + overflow := filepath.Join(dir, fmt.Sprintf("%s.%d", supervisorLogName, backups+1)) + if _, err := os.Stat(overflow); !os.IsNotExist(err) { + t.Errorf("generation beyond backup count should not exist: %s (err=%v)", overflow, err) + } + // Active log still present. + if _, err := os.Stat(filepath.Join(dir, supervisorLogName)); err != nil { + t.Errorf("active log missing after rotations: %v", err) + } +} + +// TestRotateGenerationsShiftsContent verifies the shift semantics +// directly: after a rotation, what was in .1 moves to .2. +func TestRotateGenerationsShiftsContent(t *testing.T) { + t.Parallel() + dir := t.TempDir() + sup := newSupervisor(Config{InstallRoot: dir, AuditLogMaxBackups: 2}, Deps{}, newQuietLogger(t)) + + write := func(name, content string) { + if err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0o600); err != nil { + t.Fatal(err) + } + } + write(supervisorLogName, "ACTIVE") + write(supervisorLogName+".1", "GEN1") + + sup.rotateGenerations(dir, 2) + + // active → .1, old .1 → .2 + if b, _ := os.ReadFile(filepath.Join(dir, supervisorLogName+".1")); string(b) != "ACTIVE" { + t.Errorf(".1 = %q, want ACTIVE", b) + } + if b, _ := os.ReadFile(filepath.Join(dir, supervisorLogName+".2")); string(b) != "GEN1" { + t.Errorf(".2 = %q, want GEN1", b) + } +} diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go index 2a00466..34a2163 100644 --- a/plugin/appstore/service.go +++ b/plugin/appstore/service.go @@ -47,10 +47,16 @@ type Config struct { RescanInterval time.Duration // AuditLogMaxBytes is the per-app supervisor.log size threshold - // at which a single-step rotation fires (active → .1). Zero - // defaults to maxAuditLogSize (10MB). Tests set this low to - // exercise the rotation path without writing megabytes. + // at which rotation fires (active → .1). Zero defaults to + // maxAuditLogSize (10MB). Tests set this low to exercise the + // rotation path without writing megabytes. AuditLogMaxBytes int64 + + // AuditLogMaxBackups is the number of rotated generations the + // supervisor keeps (supervisor.log.1 .. .N). Zero defaults to + // defaultAuditLogBackups (3). Worst-case per-app footprint is + // (AuditLogMaxBackups+1) × AuditLogMaxBytes. + AuditLogMaxBackups int } // EmbeddedCatalogPubkey is the production trust anchor for the catalog. diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go index 2ab18a8..0864d86 100644 --- a/plugin/appstore/supervisor.go +++ b/plugin/appstore/supervisor.go @@ -28,13 +28,19 @@ import ( // `pilotctl appstore audit ` can read it without daemon glue. const supervisorLogName = "supervisor.log" -// supervisorLogRotated is the single-step rotation target. On reaching -// maxAuditLogSize, writeAuditLine moves supervisor.log here (overwriting -// any prior copy) and starts fresh. Single-step keeps the design simple: -// worst-case footprint is 2 × maxAuditLogSize per app, plenty of history -// for an incident, and no log.2/log.3 chain to manage. +// supervisorLogRotated is the first rotated generation (supervisor.log.1). +// On reaching the size threshold the active log shifts to .1, the prior +// .1 shifts to .2, and so on up to the configured backup count; the +// oldest generation beyond that is discarded. Worst-case footprint is +// (backups+1) × maxAuditLogSize per app. const supervisorLogRotated = "supervisor.log.1" +// defaultAuditLogBackups is the number of rotated generations kept +// (supervisor.log.1 .. .N) when Config.AuditLogMaxBackups is unset. +// Three generations + the active log keeps a useful incident window +// without unbounded disk growth. +const defaultAuditLogBackups = 3 + // maxAuditLogSize bounds each app's active audit log. A crash-looping // app emits ~5 lines per failed spawn cycle (verify-fail / exit / // spawn / spawn-fail / supervise-stop), each ~150B, so 10MB is @@ -80,10 +86,12 @@ func (s *supervisor) writeAuditLine(a *installedApp, ev auditEvent) { } } -// rotateAuditIfLarge renames supervisor.log → supervisor.log.1 when -// the active log has crossed the configured size threshold. The -// threshold defaults to maxAuditLogSize but can be lowered for tests -// via supervisor.cfg.AuditLogMaxBytes. Errors are logged but never +// rotateAuditIfLarge rotates the audit log when the active +// supervisor.log has crossed the configured size threshold, keeping up +// to N rotated generations (supervisor.log.1 .. .N). The threshold +// defaults to maxAuditLogSize and the generation count to +// defaultAuditLogBackups; both can be overridden via Config +// (AuditLogMaxBytes / AuditLogMaxBackups). Errors are logged but never // fatal — forensics best-effort, never blocks the lifecycle write. func (s *supervisor) rotateAuditIfLarge(appDir string) { max := int64(maxAuditLogSize) @@ -98,12 +106,43 @@ func (s *supervisor) rotateAuditIfLarge(appDir string) { if info.Size() < max { return } - rotated := filepath.Join(appDir, supervisorLogRotated) - // os.Rename replaces the destination atomically on Unix, so a - // concurrent reader of the rotated path will see either the old - // or the new one — never a partial file. - if err := os.Rename(active, rotated); err != nil { - s.logger.Printf("audit rotate %s → %s: %v", active, rotated, err) + keep := defaultAuditLogBackups + if s.cfg.AuditLogMaxBackups > 0 { + keep = s.cfg.AuditLogMaxBackups + } + s.rotateGenerations(appDir, keep) +} + +// rotateGenerations shifts the audit log generations down by one, +// keeping at most `keep` rotated copies: supervisor.log.(keep) is +// discarded, .(keep-1)→.keep, …, .1→.2, and the active +// supervisor.log→.1. Each os.Rename replaces its destination +// atomically on Unix, so a concurrent reader of any generation sees a +// whole file, never a partial one. Errors are logged, never fatal. +func (s *supervisor) rotateGenerations(appDir string, keep int) { + if keep < 1 { + keep = 1 + } + gen := func(i int) string { + if i == 0 { + return filepath.Join(appDir, supervisorLogName) + } + return filepath.Join(appDir, fmt.Sprintf("%s.%d", supervisorLogName, i)) + } + // Discard the generation that would fall off the end. + if err := os.Remove(gen(keep)); err != nil && !errors.Is(err, os.ErrNotExist) { + s.logger.Printf("audit rotate: remove %s: %v", gen(keep), err) + } + // Shift each surviving generation down one slot, oldest first so we + // never clobber a file we still need to move. + for i := keep - 1; i >= 0; i-- { + src, dst := gen(i), gen(i+1) + if _, err := os.Stat(src); errors.Is(err, os.ErrNotExist) { + continue // gap in the chain — nothing to move + } + if err := os.Rename(src, dst); err != nil { + s.logger.Printf("audit rotate %s → %s: %v", src, dst, err) + } } } From af5a48f4c972ad26f384c2b28a12add69da1068f Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:27:53 +0000 Subject: [PATCH 4/9] feat(appstore): cap spawned-app address space (RLIMIT_AS) on Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend applyChildResourceLimits to set RLIMIT_AS alongside RLIMIT_NOFILE for spawned children, configurable via Config.ChildMemoryLimitBytes (default 4 GiB — generous enough for Go/Node/Python virtual reservations, still a runaway guard). No-op on non-Linux, documented. Tests read the limits back via prlimit64 to confirm enforcement. --- plugin/appstore/resource_test.go | 15 ++++ plugin/appstore/rlimit_linux.go | 59 ++++++++++------ plugin/appstore/rlimit_linux_test.go | 70 +++++++++++++++++++ plugin/appstore/rlimit_other.go | 19 ++--- plugin/appstore/service.go | 8 +++ plugin/appstore/supervisor.go | 19 ++++- .../appstore/zz2_supervisor_lifecycle_test.go | 2 +- 7 files changed, 160 insertions(+), 32 deletions(-) create mode 100644 plugin/appstore/resource_test.go create mode 100644 plugin/appstore/rlimit_linux_test.go diff --git a/plugin/appstore/resource_test.go b/plugin/appstore/resource_test.go new file mode 100644 index 0000000..ac7f587 --- /dev/null +++ b/plugin/appstore/resource_test.go @@ -0,0 +1,15 @@ +package appstore + +import "testing" + +// TestChildAddressSpaceLimitResolution covers the default-vs-override +// resolution of the RLIMIT_AS cap (platform-independent). +func TestChildAddressSpaceLimitResolution(t *testing.T) { + t.Parallel() + if got := newSupervisor(Config{}, Deps{}, newQuietLogger(t)).childAddressSpaceLimit(); got != defaultChildAddressSpaceLimit { + t.Errorf("unset → %d, want default %d", got, defaultChildAddressSpaceLimit) + } + if got := newSupervisor(Config{ChildMemoryLimitBytes: 123 << 20}, Deps{}, newQuietLogger(t)).childAddressSpaceLimit(); got != 123<<20 { + t.Errorf("override → %d, want %d", got, 123<<20) + } +} diff --git a/plugin/appstore/rlimit_linux.go b/plugin/appstore/rlimit_linux.go index cb8d790..66100b9 100644 --- a/plugin/appstore/rlimit_linux.go +++ b/plugin/appstore/rlimit_linux.go @@ -27,29 +27,44 @@ type rlimit64 struct { Max uint64 } -// applyChildResourceLimits sets RLIMIT_NOFILE on a freshly-spawned -// child after exec. Best-effort: a failure logs but doesn't kill the -// spawn — the OS-wide ulimit still applies. We use prlimit (not -// setrlimit) because the parent (supervisor) doesn't want to bound -// its own fd count; we only want to bound the child's. +// applyChildResourceLimits sets resource limits on a freshly-spawned +// child after exec, via prlimit(2) targeting the child pid (so the +// supervisor's own limits are untouched). Best-effort: any failure logs +// but does not kill the spawn — the OS-wide ulimit still applies. +// +// - RLIMIT_NOFILE is always set to childFDLimit. +// - RLIMIT_AS (virtual address space) is set to addrSpaceLimit when it +// is non-zero. NOTE: RLIMIT_AS bounds *address space*, not RSS. +// Runtime-managed languages (Go, Node/V8, Python) reserve large +// virtual regions well above their working set, so the cap must be +// generous — a too-low value makes the runtime fail to mmap and the +// child crashes on startup. The supervisor's default +// (defaultChildAddressSpaceLimit) is chosen with that headroom. // // There is a tiny race: between cmd.Start and prlimit landing, the -// child can open more fds. For a wallet-class app starting from a -// clean state, that's a handful at most — well under the cap. -func applyChildResourceLimits(pid int, logger *log.Logger) { - want := rlimit64{Cur: childFDLimit, Max: childFDLimit} - // SYS_PRLIMIT64 takes (pid, resource, *new_limit, *old_limit). - // We don't care about the old limit; pass a nil out-pointer. - _, _, errno := syscall.Syscall6( - syscall.SYS_PRLIMIT64, - uintptr(pid), - uintptr(syscall.RLIMIT_NOFILE), - uintptr(unsafe.Pointer(&want)), - 0, 0, 0, - ) - if errno != 0 { - logger.Printf("prlimit pid=%d RLIMIT_NOFILE=%d: %v (proceeding; OS-wide ulimit still applies)", pid, childFDLimit, errno) - return +// child can allocate. For an app starting from a clean state that's a +// handful of fds / a small allocation — well under the caps. +func applyChildResourceLimits(pid int, addrSpaceLimit uint64, logger *log.Logger) { + setLimit := func(resource int, name string, val uint64) { + want := rlimit64{Cur: val, Max: val} + // SYS_PRLIMIT64 takes (pid, resource, *new_limit, *old_limit). + // We don't care about the old limit; pass a nil out-pointer. + _, _, errno := syscall.Syscall6( + syscall.SYS_PRLIMIT64, + uintptr(pid), + uintptr(resource), + uintptr(unsafe.Pointer(&want)), + 0, 0, 0, + ) + if errno != 0 { + logger.Printf("prlimit pid=%d %s=%d: %v (proceeding; OS-wide ulimit still applies)", pid, name, val, errno) + return + } + logger.Printf("prlimit pid=%d %s=%d ok", pid, name, val) + } + + setLimit(syscall.RLIMIT_NOFILE, "RLIMIT_NOFILE", childFDLimit) + if addrSpaceLimit > 0 { + setLimit(syscall.RLIMIT_AS, "RLIMIT_AS", addrSpaceLimit) } - logger.Printf("prlimit pid=%d RLIMIT_NOFILE=%d ok", pid, childFDLimit) } diff --git a/plugin/appstore/rlimit_linux_test.go b/plugin/appstore/rlimit_linux_test.go new file mode 100644 index 0000000..66de634 --- /dev/null +++ b/plugin/appstore/rlimit_linux_test.go @@ -0,0 +1,70 @@ +//go:build linux + +package appstore + +import ( + "os/exec" + "syscall" + "testing" + "unsafe" +) + +// getChildRlimit reads back a resource limit for pid via prlimit64(2). +func getChildRlimit(t *testing.T, pid, resource int) rlimit64 { + t.Helper() + var out rlimit64 + _, _, errno := syscall.Syscall6( + syscall.SYS_PRLIMIT64, + uintptr(pid), uintptr(resource), 0, + uintptr(unsafe.Pointer(&out)), 0, 0, + ) + if errno != 0 { + t.Fatalf("prlimit get pid=%d res=%d: %v", pid, resource, errno) + } + return out +} + +func startSleeper(t *testing.T) *exec.Cmd { + t.Helper() + cmd := exec.Command("sleep", "5") + if err := cmd.Start(); err != nil { + t.Skipf("cannot start sleep child: %v", err) + } + t.Cleanup(func() { + _ = cmd.Process.Kill() + _ = cmd.Wait() + }) + return cmd +} + +// TestApplyChildResourceLimits_SetsAddressSpace asserts RLIMIT_AS and +// RLIMIT_NOFILE are actually applied to the child by reading them back. +func TestApplyChildResourceLimits_SetsAddressSpace(t *testing.T) { + cmd := startSleeper(t) + pid := cmd.Process.Pid + const limit uint64 = 2 << 30 + applyChildResourceLimits(pid, limit, newQuietLogger(t)) + + if got := getChildRlimit(t, pid, syscall.RLIMIT_AS); got.Cur != limit { + t.Errorf("RLIMIT_AS Cur = %d, want %d", got.Cur, limit) + } + if got := getChildRlimit(t, pid, syscall.RLIMIT_NOFILE); got.Cur != childFDLimit { + t.Errorf("RLIMIT_NOFILE Cur = %d, want %d", got.Cur, childFDLimit) + } +} + +// TestApplyChildResourceLimits_ZeroSkipsAddressSpace asserts a zero +// addrSpaceLimit leaves RLIMIT_AS untouched while still applying NOFILE. +func TestApplyChildResourceLimits_ZeroSkipsAddressSpace(t *testing.T) { + cmd := startSleeper(t) + pid := cmd.Process.Pid + before := getChildRlimit(t, pid, syscall.RLIMIT_AS) + applyChildResourceLimits(pid, 0, newQuietLogger(t)) + after := getChildRlimit(t, pid, syscall.RLIMIT_AS) + if after.Cur != before.Cur { + t.Errorf("RLIMIT_AS changed despite zero limit: before=%d after=%d", before.Cur, after.Cur) + } + if got := getChildRlimit(t, pid, syscall.RLIMIT_NOFILE); got.Cur != childFDLimit { + t.Errorf("RLIMIT_NOFILE Cur = %d, want %d", got.Cur, childFDLimit) + } +} diff --git a/plugin/appstore/rlimit_other.go b/plugin/appstore/rlimit_other.go index ab9feef..54ad666 100644 --- a/plugin/appstore/rlimit_other.go +++ b/plugin/appstore/rlimit_other.go @@ -6,13 +6,16 @@ import "log" // applyChildResourceLimits is the non-Linux build's no-op. macOS's // equivalent (setrlimit) affects the calling process, not children; -// without a wrapper binary or pre-exec hook there's no portable way -// to bound the child's fd count from the supervisor. Documented as -// an RC1 known gap; production deployments on linux get the real -// limit via rlimit_linux.go's applyChildResourceLimits. -func applyChildResourceLimits(pid int, logger *log.Logger) { +// without a wrapper binary or pre-exec hook there's no portable way to +// bound the child's fd count or address space from the supervisor. +// Documented as an RC1 known gap; production deployments on linux get +// the real limits (RLIMIT_NOFILE + RLIMIT_AS) via rlimit_linux.go. +// +// The addrSpaceLimit parameter is accepted for signature parity with the +// Linux build and ignored here. +func applyChildResourceLimits(pid int, addrSpaceLimit uint64, logger *log.Logger) { // Single startup log line would be nicer than per-spawn, but - // inlining here keeps the supervisor call-site identical to - // the Linux build. Cheap log line at info level — easy to grep. - logger.Printf("resource limits not enforced for pid=%d on this platform (linux-only)", pid) + // inlining here keeps the supervisor call-site identical to the + // Linux build. Cheap log line at info level — easy to grep. + logger.Printf("resource limits not enforced for pid=%d on this platform (linux-only); requested addr-space cap=%d ignored", pid, addrSpaceLimit) } diff --git a/plugin/appstore/service.go b/plugin/appstore/service.go index 34a2163..a20f8b3 100644 --- a/plugin/appstore/service.go +++ b/plugin/appstore/service.go @@ -57,6 +57,14 @@ type Config struct { // defaultAuditLogBackups (3). Worst-case per-app footprint is // (AuditLogMaxBackups+1) × AuditLogMaxBytes. AuditLogMaxBackups int + + // ChildMemoryLimitBytes caps the virtual address space (RLIMIT_AS) + // of each spawned app process on Linux. Zero defaults to + // defaultChildAddressSpaceLimit (4 GiB). No-op on non-Linux + // platforms. Set generously — RLIMIT_AS bounds address space, not + // RSS, and runtime-managed languages reserve far more virtual + // memory than they use. + ChildMemoryLimitBytes uint64 } // EmbeddedCatalogPubkey is the production trust anchor for the catalog. diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go index 0864d86..c59ad3a 100644 --- a/plugin/appstore/supervisor.go +++ b/plugin/appstore/supervisor.go @@ -41,6 +41,14 @@ const supervisorLogRotated = "supervisor.log.1" // without unbounded disk growth. const defaultAuditLogBackups = 3 +// defaultChildAddressSpaceLimit is the RLIMIT_AS cap applied to spawned +// app processes when Config.ChildMemoryLimitBytes is unset. 4 GiB is +// generous enough for Go/Node/Python runtime virtual reservations (a +// trivial Go binary needs ~1 GiB of address space) while still bounding +// a runaway / hostile app well below host memory. Linux-only; see +// rlimit_linux.go. +const defaultChildAddressSpaceLimit uint64 = 4 << 30 + // maxAuditLogSize bounds each app's active audit log. A crash-looping // app emits ~5 lines per failed spawn cycle (verify-fail / exit / // spawn / spawn-fail / supervise-stop), each ~150B, so 10MB is @@ -676,6 +684,15 @@ func atoiOrZero(s string) int { return n } +// childAddressSpaceLimit resolves the RLIMIT_AS cap for spawned apps: +// the configured value, or defaultChildAddressSpaceLimit when unset. +func (s *supervisor) childAddressSpaceLimit() uint64 { + if s.cfg.ChildMemoryLimitBytes > 0 { + return s.cfg.ChildMemoryLimitBytes + } + return defaultChildAddressSpaceLimit +} + // nextBackoff doubles cur, capping at max. Shared by superviseOne's // crash-restart and verify-fail backoff ramps so both grow identically. func nextBackoff(cur, max time.Duration) time.Duration { @@ -859,7 +876,7 @@ func (s *supervisor) spawn(ctx context.Context, a *installedApp) int { // child. Best-effort: a failure logs but doesn't kill the spawn // (OS-wide ulimits still apply). Linux uses prlimit(2) for a // RLIMIT_NOFILE cap; other platforms log a "not enforced" line. - applyChildResourceLimits(cmd.Process.Pid, s.logger) + applyChildResourceLimits(cmd.Process.Pid, s.childAddressSpaceLimit(), s.logger) s.writeAuditLine(a, auditEvent{Event: "spawn", PID: cmd.Process.Pid, SHA256: a.Manifest.Binary.SHA256, BinaryAt: a.BinaryPath}) // Watch for the socket to appear; once it does, mark ready. diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go index 68e1c19..9050469 100644 --- a/plugin/appstore/zz2_supervisor_lifecycle_test.go +++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go @@ -565,7 +565,7 @@ func TestApplyChildResourceLimits_Smoke(t *testing.T) { t.Parallel() // Pass an invalid PID; the function is best-effort and must not panic. logger := newQuietLogger(t) - applyChildResourceLimits(0, logger) + applyChildResourceLimits(0, defaultChildAddressSpaceLimit, logger) } // TestService_AppsBeforeStart returns nil per the docstring. From 3c3a62838ed7435d7535744d5898e1586c4f9ce5 Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:31:16 +0000 Subject: [PATCH 5/9] feat(appstore): re-verify binary + reject symlink at spawn (TOCTOU) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add verifyAtSpawn, run immediately before execve in spawn(): Lstat symlink rejection plus a sha256 re-check against the pinned hash. Closes the time-of-check/time-of-use gap between scanInstalled (runs once at discovery) and the actual launch — a binary swapped for a symlink or different bytes after the scan is now refused. Tests cover both swap cases and the valid-binary pass. --- plugin/appstore/supervisor.go | 32 ++++++ plugin/appstore/toctou_test.go | 105 ++++++++++++++++++ .../appstore/zz2_supervisor_lifecycle_test.go | 21 +++- 3 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 plugin/appstore/toctou_test.go diff --git a/plugin/appstore/supervisor.go b/plugin/appstore/supervisor.go index c59ad3a..e80dfc7 100644 --- a/plugin/appstore/supervisor.go +++ b/plugin/appstore/supervisor.go @@ -826,12 +826,44 @@ func (s *supervisor) verifyBinary(a *installedApp) error { return nil } +// verifyAtSpawn re-runs the binary trust checks immediately before exec, +// closing the TOCTOU window between scanInstalled (which checks once, at +// discovery) and the actual launch. An attacker who swaps the resolved +// binary for a symlink (→ a host binary like /bin/sh) or for different +// bytes after the scan but before spawn is caught here, not after the +// hostile code has already run. +// +// Lstat (not Stat) so a symlink is seen as itself, not followed. The +// sha256 re-hash reuses verifyBinary. A tiny residual window remains +// between this check and execve — inherent to any check-then-exec — but +// it is orders of magnitude smaller than scan-to-spawn. +func (s *supervisor) verifyAtSpawn(a *installedApp) error { + fi, err := os.Lstat(a.BinaryPath) + if err != nil { + return fmt.Errorf("lstat binary: %w", err) + } + if fi.Mode()&os.ModeSymlink != 0 { + return fmt.Errorf("binary path %s is a symlink (refusing)", a.BinaryPath) + } + return s.verifyBinary(a) +} + // spawn launches the app's binary, blocks until it exits, returns the // exit code (or -1 on error). The caller is responsible for restart logic. // // Marks the app "ready" once its socket has appeared, so concurrent // Call() invocations can know when to dial. func (s *supervisor) spawn(ctx context.Context, a *installedApp) int { + // TOCTOU guard: re-verify the binary (no symlink, sha256 still + // matches) immediately before exec. The supervise loop already + // verified at the top of its cycle, but a swap could land in the + // window between that check and this exec — refuse to launch if so. + if err := s.verifyAtSpawn(a); err != nil { + s.logger.Printf("app=%s: spawn-time verify failed: %v — refusing to exec", a.Manifest.ID, err) + s.writeAuditLine(a, auditEvent{Event: "verify-fail", Reason: "spawn-time: " + err.Error(), SHA256: a.Manifest.Binary.SHA256, BinaryAt: a.BinaryPath}) + return -1 + } + // Drop a stale socket if a previous instance crashed without cleaning. if _, err := os.Stat(a.SocketPath); err == nil { _ = os.Remove(a.SocketPath) diff --git a/plugin/appstore/toctou_test.go b/plugin/appstore/toctou_test.go new file mode 100644 index 0000000..2541630 --- /dev/null +++ b/plugin/appstore/toctou_test.go @@ -0,0 +1,105 @@ +package appstore + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// appWithBinary writes content to //bin and returns an +// installedApp whose manifest pins the sha256 of that content. +func appWithBinary(t *testing.T, root, id string, content []byte) *installedApp { + t.Helper() + dir := filepath.Join(root, id) + if err := os.MkdirAll(dir, 0o700); err != nil { + t.Fatal(err) + } + binPath := filepath.Join(dir, "bin") + if err := os.WriteFile(binPath, content, 0o755); err != nil { + t.Fatal(err) + } + sum := sha256.Sum256(content) + m := parseDummyManifest(t, id) + m.Binary.SHA256 = hex.EncodeToString(sum[:]) + return &installedApp{ + Dir: dir, + BinaryPath: binPath, + SocketPath: filepath.Join(dir, "app.sock"), + Manifest: m, + } +} + +// auditContains reports whether the app's supervisor.log contains needle. +func auditContains(t *testing.T, a *installedApp, needle string) bool { + t.Helper() + b, _ := os.ReadFile(filepath.Join(a.Dir, supervisorLogName)) + return strings.Contains(string(b), needle) +} + +// TestSpawn_RefusesSymlinkSwap covers the TOCTOU case where the resolved +// binary is replaced by a symlink (e.g. → /bin/sh) after the scan but +// before exec: spawn must refuse and never launch. +func TestSpawn_RefusesSymlinkSwap(t *testing.T) { + t.Parallel() + root := t.TempDir() + a := appWithBinary(t, root, "io.toctou.symlink", []byte("#!/bin/true\n")) + sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t)) + + // Swap the real binary for a symlink to a host binary. + if err := os.Remove(a.BinaryPath); err != nil { + t.Fatal(err) + } + if err := os.Symlink("/bin/sh", a.BinaryPath); err != nil { + t.Fatal(err) + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + if code := sup.spawn(ctx, a); code != -1 { + t.Fatalf("spawn exit code = %d, want -1 (refused)", code) + } + if !auditContains(t, a, "spawn-time") || !auditContains(t, a, "symlink") { + t.Errorf("expected spawn-time symlink verify-fail audit line; log missing it") + } +} + +// TestSpawn_RefusesContentSwap covers the TOCTOU case where the binary +// bytes change (sha256 no longer matches the pinned hash) between scan +// and exec. +func TestSpawn_RefusesContentSwap(t *testing.T) { + t.Parallel() + root := t.TempDir() + a := appWithBinary(t, root, "io.toctou.content", []byte("original")) + sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t)) + + // Swap the bytes — pinned sha256 no longer matches. + if err := os.WriteFile(a.BinaryPath, []byte("tampered-payload"), 0o755); err != nil { + t.Fatal(err) + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + if code := sup.spawn(ctx, a); code != -1 { + t.Fatalf("spawn exit code = %d, want -1 (refused)", code) + } + if !auditContains(t, a, "spawn-time") || !auditContains(t, a, "sha256 mismatch") { + t.Errorf("expected spawn-time sha256 verify-fail audit line; log missing it") + } +} + +// TestVerifyAtSpawn_AcceptsGoodBinary confirms the guard passes for an +// untampered binary matching its pinned hash. +func TestVerifyAtSpawn_AcceptsGoodBinary(t *testing.T) { + t.Parallel() + root := t.TempDir() + a := appWithBinary(t, root, "io.toctou.ok", []byte("good-bytes")) + sup := newSupervisor(Config{InstallRoot: root}, Deps{}, newQuietLogger(t)) + if err := sup.verifyAtSpawn(a); err != nil { + t.Errorf("verifyAtSpawn rejected a valid binary: %v", err) + } +} diff --git a/plugin/appstore/zz2_supervisor_lifecycle_test.go b/plugin/appstore/zz2_supervisor_lifecycle_test.go index 9050469..929fc29 100644 --- a/plugin/appstore/zz2_supervisor_lifecycle_test.go +++ b/plugin/appstore/zz2_supervisor_lifecycle_test.go @@ -99,7 +99,7 @@ func TestSpawn_FastExitTriggersExitCode(t *testing.T) { if err := os.MkdirAll(appDir, 0o700); err != nil { t.Fatal(err) } - path, _ := fakeBinaryScript(t, appDir, "bin", 0, 0) + path, sum := fakeBinaryScript(t, appDir, "bin", 0, 0) a := &installedApp{ Dir: appDir, @@ -109,6 +109,7 @@ func TestSpawn_FastExitTriggersExitCode(t *testing.T) { IDPath: filepath.Join(appDir, "identity.json"), Manifest: parseDummyManifest(t, "io.spawn.fast"), } + a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) defer cancel() @@ -128,7 +129,7 @@ func TestSpawn_NonZeroExitPropagates(t *testing.T) { if err := os.MkdirAll(appDir, 0o700); err != nil { t.Fatal(err) } - path, _ := fakeBinaryScript(t, appDir, "bin", 42, 0) + path, sum := fakeBinaryScript(t, appDir, "bin", 42, 0) a := &installedApp{ Dir: appDir, @@ -138,6 +139,7 @@ func TestSpawn_NonZeroExitPropagates(t *testing.T) { IDPath: filepath.Join(appDir, "identity.json"), Manifest: parseDummyManifest(t, "io.spawn.exit42"), } + a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) defer cancel() @@ -157,14 +159,24 @@ func TestSpawn_StartFailure(t *testing.T) { if err := os.MkdirAll(appDir, 0o700); err != nil { t.Fatal(err) } + // A real, executable file whose pinned sha256 matches (so it passes + // the spawn-time TOCTOU re-verify) but whose interpreter is missing, + // so execve fails at cmd.Start — exercising the spawn-fail branch. + binPath := filepath.Join(appDir, "bin") + body := []byte("#!/nonexistent/interp-xyz-12345\n") + if err := os.WriteFile(binPath, body, 0o755); err != nil { + t.Fatal(err) + } + sum := sha256.Sum256(body) a := &installedApp{ Dir: appDir, - BinaryPath: filepath.Join(appDir, "does-not-exist"), + BinaryPath: binPath, SocketPath: filepath.Join(appDir, "app.sock"), DBPath: filepath.Join(appDir, "data.db"), IDPath: filepath.Join(appDir, "identity.json"), Manifest: parseDummyManifest(t, "io.spawn.nostart"), } + a.Manifest.Binary.SHA256 = hex.EncodeToString(sum[:]) sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() @@ -195,7 +207,7 @@ func TestSpawn_StaleSocketIsCleaned(t *testing.T) { if err := os.WriteFile(socketPath, []byte("stale"), 0o600); err != nil { t.Fatal(err) } - path, _ := fakeBinaryScript(t, appDir, "bin", 0, 0) + path, sum := fakeBinaryScript(t, appDir, "bin", 0, 0) a := &installedApp{ Dir: appDir, @@ -205,6 +217,7 @@ func TestSpawn_StaleSocketIsCleaned(t *testing.T) { IDPath: filepath.Join(appDir, "identity.json"), Manifest: parseDummyManifest(t, "io.spawn.stale"), } + a.Manifest.Binary.SHA256 = sum // pass the spawn-time TOCTOU re-verify sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) defer cancel() From ee7b9472fbb0470fb3e021b5d35f137cabeaee8a Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:33:53 +0000 Subject: [PATCH 6/9] feat(extend): rate-limit hook dispatch + cap dynamic registrations Add two DoS guards to pkg/extend: - Per-app token-bucket rate limiter on hook dispatch (Registry.Run), enabled via SetRateLimit; aborts with ErrRateLimited when an app's budget is spent. Off by default (back-compat); injectable clock for deterministic tests. - Cap of maxDynamicRegistrationsPerApp (32) on runtime hook registrations per app in DaemonHandler.Register, via Registry.CountForApp; refuses growth with ErrTooManyRegistrations. --- pkg/extend/extend.go | 47 +++++++++++++ pkg/extend/ratelimit.go | 60 +++++++++++++++++ pkg/extend/ratelimit_test.go | 126 +++++++++++++++++++++++++++++++++++ pkg/extend/runtime.go | 17 +++++ 4 files changed, 250 insertions(+) create mode 100644 pkg/extend/ratelimit.go create mode 100644 pkg/extend/ratelimit_test.go diff --git a/pkg/extend/extend.go b/pkg/extend/extend.go index a1eb8e5..0ee1c7a 100644 --- a/pkg/extend/extend.go +++ b/pkg/extend/extend.go @@ -115,8 +115,17 @@ type Registry struct { mu sync.RWMutex byPoint map[HookPoint][]Extension + + // limiter, when non-nil, rate-limits hook dispatch per app in Run. + // Nil = unlimited (default); enable via SetRateLimit. + limiter *rateLimiter } +// ErrRateLimited is returned by Run when an app's hook-invocation rate +// budget is exhausted. The chain aborts rather than dispatch into an app +// that's being called too aggressively. +var ErrRateLimited = errors.New("extend: hook rate limit exceeded") + // NewRegistry constructs an empty Registry. dispatch is required. func NewRegistry(dispatch Dispatcher) *Registry { if dispatch == nil { @@ -133,6 +142,38 @@ func NewRegistry(dispatch Dispatcher) *Registry { } } +// SetRateLimit enables per-app hook-dispatch rate limiting: each app may +// fire `burst` hook invocations instantly and `ratePerSec` sustained +// thereafter. Once an app's budget is exhausted, Run aborts that app's +// hook with ErrRateLimited. Call with a non-positive rate or burst to +// disable. Intended to be set once at daemon wire-up. +func (r *Registry) SetRateLimit(ratePerSec float64, burst int) { + r.mu.Lock() + defer r.mu.Unlock() + if ratePerSec <= 0 || burst <= 0 { + r.limiter = nil + return + } + r.limiter = newRateLimiter(ratePerSec, burst) +} + +// CountForApp returns the number of currently-registered extensions +// belonging to appID across all hook points. Used to bound how many +// dynamic registrations a single app may hold. +func (r *Registry) CountForApp(appID string) int { + r.mu.RLock() + defer r.mu.RUnlock() + n := 0 + for _, list := range r.byPoint { + for _, ext := range list { + if ext.AppID == appID { + n++ + } + } + } + return n +} + // Register adds one extension to the registry. Returns an error if // the primitive is shape-invalid, the method is empty, or a flag is // shaped wrong. Multiple apps may register for the same hook point; @@ -239,11 +280,17 @@ func (r *Registry) FlagsFor(p HookPoint) []FlagSpec { // the hook may have populated, then cleared. func (r *Registry) Run(ctx context.Context, p HookPoint, args HookArgs) (HookArgs, error) { hooks := r.HooksFor(p) + r.mu.RLock() + lim := r.limiter + r.mu.RUnlock() current := args for _, h := range hooks { if err := ctx.Err(); err != nil { return current, err } + if lim != nil && !lim.allow(h.AppID) { + return current, fmt.Errorf("extend %s: hook %s.%s: %w", p, h.AppID, h.Method, ErrRateLimited) + } next, err := r.dispatch(ctx, h.AppID, h.Method, cloneArgs(current)) if err != nil { return current, fmt.Errorf("extend %s: hook %s.%s: %w", p, h.AppID, h.Method, err) diff --git a/pkg/extend/ratelimit.go b/pkg/extend/ratelimit.go new file mode 100644 index 0000000..c2755c1 --- /dev/null +++ b/pkg/extend/ratelimit.go @@ -0,0 +1,60 @@ +package extend + +import ( + "sync" + "time" +) + +// rateLimiter is a per-key token bucket used to bound how often the +// registry dispatches into any single app's hooks. A misbehaving or +// hostile app whose hooks fire on every primitive cannot turn the hook +// chain into a DoS amplifier: once its bucket is empty, further hook +// invocations are refused until it refills. +type rateLimiter struct { + mu sync.Mutex + rate float64 // tokens added per second + burst float64 // bucket capacity + now func() time.Time + buckets map[string]*tokenBucket +} + +type tokenBucket struct { + tokens float64 + last time.Time +} + +// newRateLimiter builds a limiter allowing `burst` events instantly and +// `ratePerSec` sustained thereafter, per key. +func newRateLimiter(ratePerSec float64, burst int) *rateLimiter { + return &rateLimiter{ + rate: ratePerSec, + burst: float64(burst), + now: time.Now, + buckets: map[string]*tokenBucket{}, + } +} + +// allow reports whether one event for key may proceed now, consuming a +// token if so. Refills lazily based on elapsed wall-clock time. +func (rl *rateLimiter) allow(key string) bool { + rl.mu.Lock() + defer rl.mu.Unlock() + t := rl.now() + b := rl.buckets[key] + if b == nil { + b = &tokenBucket{tokens: rl.burst, last: t} + rl.buckets[key] = b + } + if elapsed := t.Sub(b.last).Seconds(); elapsed > 0 { + b.tokens += elapsed * rl.rate + if b.tokens > rl.burst { + b.tokens = rl.burst + } + b.last = t + } + if b.tokens >= 1 { + b.tokens-- + return true + } + return false +} diff --git a/pkg/extend/ratelimit_test.go b/pkg/extend/ratelimit_test.go new file mode 100644 index 0000000..181ecc1 --- /dev/null +++ b/pkg/extend/ratelimit_test.go @@ -0,0 +1,126 @@ +package extend + +import ( + "context" + "errors" + "fmt" + "testing" + "time" +) + +// noopDispatch is a Dispatcher that echoes args back with no error. +func noopDispatch(_ context.Context, _, _ string, a HookArgs) (HookArgs, error) { + return a, nil +} + +// TestRun_RateLimitsPerApp asserts that once an app's burst budget is +// spent, Run refuses further hook dispatch with ErrRateLimited, and that +// the budget refills as the (injected) clock advances. +func TestRun_RateLimitsPerApp(t *testing.T) { + t.Parallel() + r := NewRegistry(noopDispatch) + if err := r.Register(Extension{AppID: "io.spammer", Primitive: PreSendMessage, Method: "h"}); err != nil { + t.Fatal(err) + } + // 1 token/sec, burst 3. + r.SetRateLimit(1, 3) + + // Pin the clock so refill is deterministic. + base := time.Unix(1_000_000, 0) + now := base + r.mu.Lock() + r.limiter.now = func() time.Time { return now } + r.mu.Unlock() + + ctx := context.Background() + // First 3 calls consume the burst. + for i := 0; i < 3; i++ { + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil { + t.Fatalf("call %d should pass, got %v", i, err) + } + } + // 4th call (no time elapsed) is rate-limited. + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) { + t.Fatalf("4th call err = %v, want ErrRateLimited", err) + } + // Advance 1s → exactly one token refills → one call passes, next fails. + now = base.Add(time.Second) + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil { + t.Fatalf("post-refill call should pass, got %v", err) + } + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) { + t.Fatalf("call after refill drained err = %v, want ErrRateLimited", err) + } +} + +// TestRun_RateLimitIsPerApp confirms one app exhausting its budget does +// not block a different app's hooks. +func TestRun_RateLimitIsPerApp(t *testing.T) { + t.Parallel() + r := NewRegistry(noopDispatch) + _ = r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h", Order: 1}) + _ = r.Register(Extension{AppID: "b", Primitive: PostRecvMessage, Method: "h", Order: 1}) + r.SetRateLimit(1, 1) + now := time.Unix(2_000_000, 0) + r.mu.Lock() + r.limiter.now = func() time.Time { return now } + r.mu.Unlock() + + ctx := context.Background() + // Drain app "a". + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); err != nil { + t.Fatal(err) + } + if _, err := r.Run(ctx, PreSendMessage, HookArgs{}); !errors.Is(err, ErrRateLimited) { + t.Fatalf("app a should be limited, got %v", err) + } + // App "b" still has its own full budget. + if _, err := r.Run(ctx, PostRecvMessage, HookArgs{}); err != nil { + t.Fatalf("app b should not be limited, got %v", err) + } +} + +// TestRun_NoLimiterByDefault confirms back-compat: without SetRateLimit, +// many invocations all pass. +func TestRun_NoLimiterByDefault(t *testing.T) { + t.Parallel() + r := NewRegistry(noopDispatch) + _ = r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h"}) + for i := 0; i < 1000; i++ { + if _, err := r.Run(context.Background(), PreSendMessage, HookArgs{}); err != nil { + t.Fatalf("unlimited registry should never rate-limit, got %v at %d", err, i) + } + } +} + +// TestDaemonHandler_CapsDynamicRegistrations asserts an app cannot exceed +// maxDynamicRegistrationsPerApp dynamic hook registrations. +func TestDaemonHandler_CapsDynamicRegistrations(t *testing.T) { + t.Parallel() + reg := NewRegistry(noopDispatch) + h := NewDaemonHandler(reg, AllowAll) + + for i := 0; i < maxDynamicRegistrationsPerApp; i++ { + err := h.Register("io.greedy", Extension{ + Primitive: PreSendMessage, + Method: fmt.Sprintf("m%d", i), + }) + if err != nil { + t.Fatalf("registration %d should succeed, got %v", i, err) + } + } + // One past the cap must be refused. + err := h.Register("io.greedy", Extension{Primitive: PreSendMessage, Method: "overflow"}) + if !errors.Is(err, ErrTooManyRegistrations) { + t.Fatalf("over-cap registration err = %v, want ErrTooManyRegistrations", err) + } + // A different app is unaffected. + if err := h.Register("io.modest", Extension{Primitive: PreSendMessage, Method: "m"}); err != nil { + t.Fatalf("other app should still register, got %v", err) + } + // After unregistering one of greedy's hooks, it can register again. + reg.UnregisterOne("io.greedy", PreSendMessage, "m0") + if err := h.Register("io.greedy", Extension{Primitive: PreSendMessage, Method: "again"}); err != nil { + t.Fatalf("after freeing a slot, registration should succeed, got %v", err) + } +} diff --git a/pkg/extend/runtime.go b/pkg/extend/runtime.go index b077f91..8434d2f 100644 --- a/pkg/extend/runtime.go +++ b/pkg/extend/runtime.go @@ -36,6 +36,18 @@ var DenyAll Permission = PermissionFunc(func(string, HookPoint) bool { return fa // configured Permission rejects the request. var ErrPermissionDenied = errors.New("extend: permission denied") +// maxDynamicRegistrationsPerApp bounds how many hooks a single app may +// hold at once via the runtime-register IPC. Without a cap, an app +// permitted to register dynamically could register unboundedly and +// exhaust memory / slow every primitive's hook lookup — a DoS. The +// cap is generous (a well-behaved app registers a handful of hooks) +// while refusing pathological growth. +const maxDynamicRegistrationsPerApp = 32 + +// ErrTooManyRegistrations is returned by DaemonHandler.Register when an +// app already holds maxDynamicRegistrationsPerApp hooks. +var ErrTooManyRegistrations = errors.New("extend: too many dynamic registrations for app") + // DaemonHandler is the runtime-side IPC surface the daemon exposes to // installed apps so they can add/remove their own hooks dynamically // (within the bounds Permission allows). Apps call these methods via @@ -68,6 +80,11 @@ func (h *DaemonHandler) Register(appID string, ext Extension) error { if !h.perms.CanRegister(appID, ext.Primitive) { return fmt.Errorf("%w: %s cannot register %s", ErrPermissionDenied, appID, ext.Primitive) } + // Bound the number of hooks one app may hold to prevent unbounded + // dynamic registration (memory + per-primitive lookup-cost DoS). + if h.reg.CountForApp(appID) >= maxDynamicRegistrationsPerApp { + return fmt.Errorf("%w: %s holds %d (max %d)", ErrTooManyRegistrations, appID, h.reg.CountForApp(appID), maxDynamicRegistrationsPerApp) + } return h.reg.Register(ext) } From 6c8989db8a606c3067074af3b4fac582a3706edb Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 14:35:12 +0000 Subject: [PATCH 7/9] style: gofmt grants_test.go --- pkg/manifest/grants_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/manifest/grants_test.go b/pkg/manifest/grants_test.go index c90499c..5ccbd6d 100644 --- a/pkg/manifest/grants_test.go +++ b/pkg/manifest/grants_test.go @@ -33,13 +33,13 @@ func TestHasGrant(t *testing.T) { want bool } for _, c := range []tc{ - {"ipc.call", "io.pilot.wallet.pay", true}, // exact + {"ipc.call", "io.pilot.wallet.pay", true}, // exact {"ipc.call", "io.pilot.wallet.refund", false}, // exact mismatch - {"ipc.call", "io.pilot.notes.add", true}, // prefix.* - {"ipc.call", "io.pilot.notes.list", true}, // prefix.* - {"ipc.call", "io.pilot.notesX.add", false}, // prefix must end at dot - {"fs.read", "anything.at.all", true}, // blanket * - {"net.dial", "io.pilot.wallet.pay", false}, // cap mismatch + {"ipc.call", "io.pilot.notes.add", true}, // prefix.* + {"ipc.call", "io.pilot.notes.list", true}, // prefix.* + {"ipc.call", "io.pilot.notesX.add", false}, // prefix must end at dot + {"fs.read", "anything.at.all", true}, // blanket * + {"net.dial", "io.pilot.wallet.pay", false}, // cap mismatch } { if got := m.HasGrant(c.cap, c.target); got != c.want { t.Errorf("HasGrant(%q,%q) = %v, want %v", c.cap, c.target, got, c.want) From f1493456505f7f6ac5139276ba488d60eb6ed518 Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 15:04:26 +0000 Subject: [PATCH 8/9] docs(changelog): note broker + supervisor hardening for rc1 --- CHANGELOG.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5df6f64..b70e90d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,29 @@ First release candidate. Surface is end-to-end working for the Production deployment against an untrusted publisher requires the catalog-signing chain landed in a follow-up RC (see "Known gaps"). +### Security & hardening — broker + supervisor + +- **Broker authorization (deny-by-default).** Every brokered call is + gated before dialing an app socket: the method must be in the target's + `exposes` set, and cross-app `ipc.call` callers must hold a matching + grant (`.`, `.*`, or `*`). New `Service.CallFrom` and + `manifest.ExposesMethod` / `HasGrant`; errors `ErrMethodNotExposed`, + `ErrGrantMissing`. +- **TOCTOU re-verification at spawn.** The binary is re-checked + immediately before `exec` (symlink rejection + sha256), closing the gap + between install-scan and launch. +- **Exponential verify-fail backoff.** Verification failures retry with + capped exponential backoff (was a fixed 30s), consistent with crash-loop + handling. +- **Multi-generation audit log rotation.** `supervisor.log` rotates across + N generations (`AuditLogMaxBackups`, default 3) instead of a single step. +- **Address-space cap (Linux).** Spawned apps get `RLIMIT_AS` alongside + `RLIMIT_NOFILE`, configurable via `Config.ChildMemoryLimitBytes` + (default 4 GiB); no-op on non-Linux. +- **Extension DoS guards.** `pkg/extend` adds per-app hook-dispatch rate + limiting (`Registry.SetRateLimit`, `ErrRateLimited`) and a per-app cap on + dynamic registrations (`ErrTooManyRegistrations`). + ### Added — pilotctl `pilotctl appstore` is the operator surface for the app store. From 8fead51fbc28292832b961e48ff650435ab71bc9 Mon Sep 17 00:00:00 2001 From: Teodor Calin Date: Mon, 15 Jun 2026 15:18:57 +0000 Subject: [PATCH 9/9] test(appstore): cover Service.CallFrom, rate-limit disable, rotate floor Close the codecov patch gap: Service.CallFrom (started + not-started), SetRateLimit disable branch, and rotateGenerations keep<1 floor. --- pkg/extend/ratelimit_disable_test.go | 25 +++++++++++++++++ plugin/appstore/callfrom_service_test.go | 35 ++++++++++++++++++++++++ plugin/appstore/rotation_floor_test.go | 22 +++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 pkg/extend/ratelimit_disable_test.go create mode 100644 plugin/appstore/callfrom_service_test.go create mode 100644 plugin/appstore/rotation_floor_test.go diff --git a/pkg/extend/ratelimit_disable_test.go b/pkg/extend/ratelimit_disable_test.go new file mode 100644 index 0000000..fe584a3 --- /dev/null +++ b/pkg/extend/ratelimit_disable_test.go @@ -0,0 +1,25 @@ +package extend + +import ( + "context" + "testing" +) + +// TestSetRateLimit_DisableClearsLimiter covers the disable branch +// (non-positive rate/burst clears the limiter): a previously-enabled +// limiter is removed and Run stops rate-limiting. +func TestSetRateLimit_DisableClearsLimiter(t *testing.T) { + t.Parallel() + r := NewRegistry(noopDispatch) + if err := r.Register(Extension{AppID: "a", Primitive: PreSendMessage, Method: "h"}); err != nil { + t.Fatal(err) + } + r.SetRateLimit(1, 1) // enable, tiny budget + r.SetRateLimit(0, 0) // disable again + + for i := 0; i < 50; i++ { + if _, err := r.Run(context.Background(), PreSendMessage, HookArgs{}); err != nil { + t.Fatalf("disabled limiter must not block (call %d): %v", i, err) + } + } +} diff --git a/plugin/appstore/callfrom_service_test.go b/plugin/appstore/callfrom_service_test.go new file mode 100644 index 0000000..b7136a1 --- /dev/null +++ b/plugin/appstore/callfrom_service_test.go @@ -0,0 +1,35 @@ +package appstore + +import ( + "context" + "errors" + "testing" + "time" +) + +// TestService_CallFrom_NotStarted covers the nil-supervisor guard. +func TestService_CallFrom_NotStarted(t *testing.T) { + t.Parallel() + err := (&Service{}).CallFrom(context.Background(), "io.caller", "io.target", "m", nil, nil) + if err == nil { + t.Fatal("expected 'service not started' error") + } +} + +// TestService_CallFrom_DelegatesToSupervisor drives the started-service +// path: an unknown target returns ErrAppNotInstalled, proving CallFrom +// delegates to the supervisor gate. +func TestService_CallFrom_DelegatesToSupervisor(t *testing.T) { + t.Parallel() + s := NewService(Config{InstallRoot: t.TempDir()}) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := s.Start(ctx, Deps{}); err != nil { + t.Fatalf("Start: %v", err) + } + defer s.Stop(ctx) + + if err := s.CallFrom(ctx, "io.caller", "io.unknown", "m", nil, nil); !errors.Is(err, ErrAppNotInstalled) { + t.Fatalf("CallFrom unknown app: err = %v, want ErrAppNotInstalled", err) + } +} diff --git a/plugin/appstore/rotation_floor_test.go b/plugin/appstore/rotation_floor_test.go new file mode 100644 index 0000000..6379428 --- /dev/null +++ b/plugin/appstore/rotation_floor_test.go @@ -0,0 +1,22 @@ +package appstore + +import ( + "os" + "path/filepath" + "testing" +) + +// TestRotateGenerations_KeepFloorIsOne covers the keep<1 guard: a +// non-positive keep is floored to 1, so the active log still rotates to .1. +func TestRotateGenerations_KeepFloorIsOne(t *testing.T) { + t.Parallel() + dir := t.TempDir() + sup := newSupervisor(Config{InstallRoot: dir}, Deps{}, newQuietLogger(t)) + if err := os.WriteFile(filepath.Join(dir, supervisorLogName), []byte("X"), 0o600); err != nil { + t.Fatal(err) + } + sup.rotateGenerations(dir, 0) // keep<1 → floored to 1 + if _, err := os.Stat(filepath.Join(dir, supervisorLogName+".1")); err != nil { + t.Errorf("expected supervisor.log.1 after floored rotation: %v", err) + } +}