From 44cd67633c7eb4491036e27e0cfe1e174c8607bf Mon Sep 17 00:00:00 2001 From: stxkxs Date: Fri, 19 Jun 2026 20:06:22 -0700 Subject: [PATCH] =?UTF-8?q?feat(operators):=20reconcile=20per-session=20at?= =?UTF-8?q?tribution=20=E2=80=94=20session=20role=20+=20impersonate=20RBAC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The platform side of fab's per-session human attribution (nanohype/fab#30). A Platform opts in with spec.attribution; the operator then provisions the two resources fab's role-session entrypoint needs, per tenant, and tears them down on removal or deletion. ─── API (api/platform/v1alpha1) ────────────────────────────────────────── - PlatformSpec.Attribution (*AttributionSpec, optional): Operators []string (required, min 1) and SessionRoleMaxDurationSeconds (*int32, 900–43200, default 3600). nil = unattributed, the default. - PlatformStatus.SessionRoleArn carries the provisioned session role ARN. - Each operator string is reused verbatim as BOTH an allowed STS SourceIdentity and an impersonate resourceName, so the same identity binds the AWS and Kubernetes audit records (the contract fab documents in docs/attribution.md). ─── Session role (platform_session_iam.go) ─────────────────────────────── - ensureSessionRole mints the per-tenant session role, named with a `-session` suffix (same 64-char + FNV-1a hash scheme as the tenant role). Trust: only the tenant IRSA role may assume, and only while setting one of the Platform's operators as SourceIdentity — Action [sts:AssumeRole, sts:SetSourceIdentity] with a StringEquals condition on sts:SourceIdentity. Permissions: the tenant baseline policy (Bedrock invoke) and the same permissions boundary — never broad sts:AssumeRole. - Idempotent: GetRole→Create on miss; on hit it refreshes the trust policy (the operator list can change) via UpdateAssumeRolePolicy and converges the baseline attachment. 
- Kill-switch parity: when the Platform is suspended the session role's baseline is DETACHED, not attached — otherwise a suspended tenant could keep invoking Bedrock through the session role after its tenant role's baseline was pulled. - deleteIamRole/deleteSessionRole now share a detachAndDeleteRole helper. ─── Impersonate RBAC (platform_rbac.go) ────────────────────────────────── - ensureOperatorImpersonateRBAC creates a ClusterRole granting `impersonate` on exactly the named operator users (never `impersonate *`) and a ClusterRoleBinding to the tenant-runtime ServiceAccount, both named after the tenant namespace with an `-impersonate` suffix. fab's session kubeconfig authenticates with that SA token while impersonating the operator, so apiserver audit records the operator as impersonatedUser. - Cluster-scoped, so (like the tenant namespace) it's reaped through the finalizer, not OwnerReferences. ─── Wiring (platform_controller.go) ────────────────────────────────────── - Reconcile provisions the pair when spec.attribution is set (after the tenant IRSA role + SA exist), records status.SessionRoleArn, and tears the pair down when attribution is removed. Finalizer cleans up both (no-ops when never enabled). - RBAC markers added for clusterroles/clusterrolebindings and (for escalation prevention) impersonate on users — regenerated into config/rbac/role.yaml. ─── Tests / codegen ────────────────────────────────────────────────────── - 10 unit tests: session-role trust/baseline/duration/idempotency/suspend/delete (fakeIAM) and the impersonate RBAC create/update/delete (the controller-runtime fake client). go build, vet, golangci-lint, and the internal+api unit suites are green. - make generate + make manifests regenerated deepcopy, the CRD (config + Helm chart copy), the RBAC role, and the CRD reference doc. Draft: pairs with nanohype/fab#30 (also draft). The conformance/envtest suite was not run here (needs setup-envtest); the new logic is unit-covered. 
Co-authored-by: stxkxsbot <275011021+stxkxsbot@users.noreply.github.com> --- .../crds/platform.nanohype.dev_platforms.yaml | 42 ++++ docs/crd-reference/v1alpha1.md | 20 ++ .../api/platform/v1alpha1/platform_types.go | 39 ++++ .../v1alpha1/zz_generated.deepcopy.go | 30 +++ .../platform.nanohype.dev_platforms.yaml | 42 ++++ operators/config/rbac/role.yaml | 19 ++ .../controller/platform_controller.go | 45 ++++ operators/internal/controller/platform_iam.go | 11 +- .../controller/platform_iam_reconcile_test.go | 18 +- .../internal/controller/platform_rbac.go | 95 +++++++++ .../internal/controller/platform_rbac_test.go | 123 +++++++++++ .../controller/platform_session_iam.go | 195 ++++++++++++++++++ .../controller/platform_session_iam_test.go | 166 +++++++++++++++ 13 files changed, 837 insertions(+), 8 deletions(-) create mode 100644 operators/internal/controller/platform_rbac.go create mode 100644 operators/internal/controller/platform_rbac_test.go create mode 100644 operators/internal/controller/platform_session_iam.go create mode 100644 operators/internal/controller/platform_session_iam_test.go diff --git a/charts/operator/crds/platform.nanohype.dev_platforms.yaml b/charts/operator/crds/platform.nanohype.dev_platforms.yaml index 7770c9e..d7c3587 100644 --- a/charts/operator/crds/platform.nanohype.dev_platforms.yaml +++ b/charts/operator/crds/platform.nanohype.dev_platforms.yaml @@ -63,6 +63,43 @@ spec: hosting one or more AgentFleets, with its own budget, identity, and guardrails. properties: + attribution: + description: |- + Attribution opts the Platform into per-session human attribution. When + set, the operator provisions a session role — assumable by the tenant + IRSA role with the operator carried as STS SourceIdentity, scoped to the + tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + ClusterRole letting the tenant ServiceAccount impersonate the named + operators at the apiserver. 
fab's role-session entrypoint consumes both, + so an agent's AWS + Kubernetes actions attribute to a named human. + nil = unattributed (the default). + properties: + operators: + description: |- + Operators is the set of human identities (e.g. email addresses) a + session in this Platform may act as. Each value becomes both an allowed + STS SourceIdentity on the session role's trust policy and a resourceNames + entry on the impersonate ClusterRole, so the SAME string binds the AWS + and Kubernetes audit records. Use a canonical form (a lowercased email); + it must byte-match the operator's own RBAC subject name. + items: + type: string + minItems: 1 + type: array + sessionRoleMaxDurationSeconds: + default: 3600 + description: |- + SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because + the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + chained session at 3600s regardless of this value; larger values only + matter if the caller ever changes. Defaults to 3600. + format: int32 + maximum: 43200 + minimum: 900 + type: integer + required: + - operators + type: object budget: description: Budget references a BudgetPolicy CR in the same namespace. properties: @@ -230,6 +267,11 @@ spec: phase: description: 'Phase: Pending, Provisioning, Ready, Suspended, Failed.' type: string + sessionRoleArn: + description: |- + SessionRoleArn is the per-Platform attribution session role, created when + spec.attribution is set. Empty when attribution is off. + type: string suspendedAt: description: |- SuspendedAt is the timestamp at which the kill-switch fired. When diff --git a/docs/crd-reference/v1alpha1.md b/docs/crd-reference/v1alpha1.md index 6b047eb..cf61df8 100644 --- a/docs/crd-reference/v1alpha1.md +++ b/docs/crd-reference/v1alpha1.md @@ -621,6 +621,24 @@ Package v1alpha1 contains API Schema definitions for the platform v1alpha1 API g +#### AttributionSpec + + + +AttributionSpec configures per-session human attribution for a Platform. 
See +github.com/nanohype/fab docs/attribution.md for the consumer side. + + + +_Appears in:_ +- [PlatformSpec](#platformspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `operators` _string array_ | Operators is the set of human identities (e.g. email addresses) a
session in this Platform may act as. Each value becomes both an allowed
STS SourceIdentity on the session role's trust policy and a resourceNames
entry on the impersonate ClusterRole, so the SAME string binds the AWS
and Kubernetes audit records. Use a canonical form (a lowercased email);
it must byte-match the operator's own RBAC subject name. | | MinItems: 1
| +| `sessionRoleMaxDurationSeconds` _integer_ | SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because
the caller is the tenant IRSA role, AWS STS role chaining hard-caps a
chained session at 3600s regardless of this value; larger values only
matter if the caller ever changes. Defaults to 3600. | 3600 | Maximum: 43200
Minimum: 900
Optional: \{\}
| + + #### BudgetRef @@ -737,6 +755,7 @@ _Appears in:_ | `identity` _[IdentitySpec](#identityspec)_ | Identity controls how the IRSA role is named + which Bedrock models are
reachable. | | | | `compliance` _[ComplianceSpec](#compliancespec)_ | Compliance flags drive stricter defaults across the Platform. | | Optional: \{\}
| | `isolation` _string_ | Isolation: namespace (default) or vCluster (hard isolation). | namespace | Enum: [namespace vcluster]
Optional: \{\}
| +| `attribution` _[AttributionSpec](#attributionspec)_ | Attribution opts the Platform into per-session human attribution. When
set, the operator provisions a session role — assumable by the tenant
IRSA role with the operator carried as STS SourceIdentity, scoped to the
tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a
ClusterRole letting the tenant ServiceAccount impersonate the named
operators at the apiserver. fab's role-session entrypoint consumes both,
so an agent's AWS + Kubernetes actions attribute to a named human.
nil = unattributed (the default). | | Optional: \{\}
| #### PlatformStatus @@ -754,6 +773,7 @@ _Appears in:_ | --- | --- | --- | --- | | `phase` _string_ | Phase: Pending, Provisioning, Ready, Suspended, Failed. | | Optional: \{\}
| | `iamRoleArn` _string_ | IamRoleArn is the per-Platform IRSA role created by the controller. | | Optional: \{\}
| +| `sessionRoleArn` _string_ | SessionRoleArn is the per-Platform attribution session role, created when
spec.attribution is set. Empty when attribution is off. | | Optional: \{\}
| | `namespace` _string_ | Namespace is the tenant namespace the controller provisioned. | | Optional: \{\}
| | `observedGeneration` _integer_ | ObservedGeneration is the last spec.generation the controller reconciled. | | Optional: \{\}
| | `suspendedAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#time-v1-meta)_ | SuspendedAt is the timestamp at which the kill-switch fired. When
non-nil the operator stops reattaching the baseline IAM policy and
the AgentFleetReconciler scales fleets to zero. Resets to nil only
when ops clears the iam:TagRole 'platform.nanohype.dev/suspended'
marker on the tenant IRSA role. | | Optional: \{\}
| diff --git a/operators/api/platform/v1alpha1/platform_types.go b/operators/api/platform/v1alpha1/platform_types.go index e1213f5..9ee22ab 100644 --- a/operators/api/platform/v1alpha1/platform_types.go +++ b/operators/api/platform/v1alpha1/platform_types.go @@ -44,6 +44,40 @@ type PlatformSpec struct { // +kubebuilder:default=namespace // +optional Isolation string `json:"isolation,omitempty"` + + // Attribution opts the Platform into per-session human attribution. When + // set, the operator provisions a session role — assumable by the tenant + // IRSA role with the operator carried as STS SourceIdentity, scoped to the + // tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + // ClusterRole letting the tenant ServiceAccount impersonate the named + // operators at the apiserver. fab's role-session entrypoint consumes both, + // so an agent's AWS + Kubernetes actions attribute to a named human. + // nil = unattributed (the default). + // +optional + Attribution *AttributionSpec `json:"attribution,omitempty"` +} + +// AttributionSpec configures per-session human attribution for a Platform. See +// github.com/nanohype/fab docs/attribution.md for the consumer side. +type AttributionSpec struct { + // Operators is the set of human identities (e.g. email addresses) a + // session in this Platform may act as. Each value becomes both an allowed + // STS SourceIdentity on the session role's trust policy and a resourceNames + // entry on the impersonate ClusterRole, so the SAME string binds the AWS + // and Kubernetes audit records. Use a canonical form (a lowercased email); + // it must byte-match the operator's own RBAC subject name. + // +kubebuilder:validation:MinItems=1 + Operators []string `json:"operators"` + + // SessionRoleMaxDurationSeconds caps the assumed session lifetime. 
Because + // the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + // chained session at 3600s regardless of this value; larger values only + // matter if the caller ever changes. Defaults to 3600. + // +kubebuilder:validation:Minimum=900 + // +kubebuilder:validation:Maximum=43200 + // +kubebuilder:default=3600 + // +optional + SessionRoleMaxDurationSeconds *int32 `json:"sessionRoleMaxDurationSeconds,omitempty"` } // BudgetRef points at a BudgetPolicy by name. @@ -91,6 +125,11 @@ type PlatformStatus struct { // +optional IamRoleArn string `json:"iamRoleArn,omitempty"` + // SessionRoleArn is the per-Platform attribution session role, created when + // spec.attribution is set. Empty when attribution is off. + // +optional + SessionRoleArn string `json:"sessionRoleArn,omitempty"` + // Namespace is the tenant namespace the controller provisioned. // +optional Namespace string `json:"namespace,omitempty"` diff --git a/operators/api/platform/v1alpha1/zz_generated.deepcopy.go b/operators/api/platform/v1alpha1/zz_generated.deepcopy.go index 0e33577..43e7f69 100644 --- a/operators/api/platform/v1alpha1/zz_generated.deepcopy.go +++ b/operators/api/platform/v1alpha1/zz_generated.deepcopy.go @@ -25,6 +25,31 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AttributionSpec) DeepCopyInto(out *AttributionSpec) { + *out = *in + if in.Operators != nil { + in, out := &in.Operators, &out.Operators + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.SessionRoleMaxDurationSeconds != nil { + in, out := &in.SessionRoleMaxDurationSeconds, &out.SessionRoleMaxDurationSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AttributionSpec. 
+func (in *AttributionSpec) DeepCopy() *AttributionSpec { + if in == nil { + return nil + } + out := new(AttributionSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *BudgetRef) DeepCopyInto(out *BudgetRef) { *out = *in @@ -165,6 +190,11 @@ func (in *PlatformSpec) DeepCopyInto(out *PlatformSpec) { out.Budget = in.Budget in.Identity.DeepCopyInto(&out.Identity) out.Compliance = in.Compliance + if in.Attribution != nil { + in, out := &in.Attribution, &out.Attribution + *out = new(AttributionSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlatformSpec. diff --git a/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml b/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml index 7770c9e..d7c3587 100644 --- a/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml +++ b/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml @@ -63,6 +63,43 @@ spec: hosting one or more AgentFleets, with its own budget, identity, and guardrails. properties: + attribution: + description: |- + Attribution opts the Platform into per-session human attribution. When + set, the operator provisions a session role — assumable by the tenant + IRSA role with the operator carried as STS SourceIdentity, scoped to the + tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + ClusterRole letting the tenant ServiceAccount impersonate the named + operators at the apiserver. fab's role-session entrypoint consumes both, + so an agent's AWS + Kubernetes actions attribute to a named human. + nil = unattributed (the default). + properties: + operators: + description: |- + Operators is the set of human identities (e.g. email addresses) a + session in this Platform may act as. 
Each value becomes both an allowed + STS SourceIdentity on the session role's trust policy and a resourceNames + entry on the impersonate ClusterRole, so the SAME string binds the AWS + and Kubernetes audit records. Use a canonical form (a lowercased email); + it must byte-match the operator's own RBAC subject name. + items: + type: string + minItems: 1 + type: array + sessionRoleMaxDurationSeconds: + default: 3600 + description: |- + SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because + the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + chained session at 3600s regardless of this value; larger values only + matter if the caller ever changes. Defaults to 3600. + format: int32 + maximum: 43200 + minimum: 900 + type: integer + required: + - operators + type: object budget: description: Budget references a BudgetPolicy CR in the same namespace. properties: @@ -230,6 +267,11 @@ spec: phase: description: 'Phase: Pending, Provisioning, Ready, Suspended, Failed.' type: string + sessionRoleArn: + description: |- + SessionRoleArn is the per-Platform attribution session role, created when + spec.attribution is set. Empty when attribution is off. + type: string suspendedAt: description: |- SuspendedAt is the timestamp at which the kill-switch fired. 
When diff --git a/operators/config/rbac/role.yaml b/operators/config/rbac/role.yaml index c7cfb61..862a055 100644 --- a/operators/config/rbac/role.yaml +++ b/operators/config/rbac/role.yaml @@ -21,6 +21,12 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - users + verbs: + - impersonate - apiGroups: - agentgateway.dev resources: @@ -220,6 +226,19 @@ rules: - get - patch - update +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - resource.k8s.io resources: diff --git a/operators/internal/controller/platform_controller.go b/operators/internal/controller/platform_controller.go index 7ff9ae8..5f8cde7 100644 --- a/operators/internal/controller/platform_controller.go +++ b/operators/internal/controller/platform_controller.go @@ -69,6 +69,8 @@ type PlatformReconciler struct { // +kubebuilder:rbac:groups="",resources=namespaces;resourcequotas;limitranges,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=networking.k8s.io,resources=networkpolicies,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=argoproj.io,resources=appprojects,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=users,verbs=impersonate // Reconcile drives a Platform CR toward its desired state. func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -105,6 +107,15 @@ func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c logger.Error(err, "IAM role cleanup failed; will retry") return ctrl.Result{}, err } + // Attribution resources (no-ops when attribution was never enabled). 
+ if err := r.deleteSessionRole(ctx, platform, r.IAMCfg.Environment); err != nil { + logger.Error(err, "session role cleanup failed; will retry") + return ctrl.Result{}, err + } + if err := r.deleteOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "impersonate RBAC cleanup failed; will retry") + return ctrl.Result{}, err + } controllerutil.RemoveFinalizer(platform, finalizerName) if err := r.Update(ctx, platform); err != nil { return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) @@ -199,6 +210,40 @@ func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } + // Per-session human attribution (optional). Provision the session role + // (assumable by the tenant IRSA role with the operator as STS + // SourceIdentity) + the apiserver impersonate RBAC. Reconciles in both + // directions: removing spec.attribution tears the pair back down. The + // session role honors the kill-switch via the susp.Suspended flag (baseline + // detached when suspended, like the tenant role). + if platform.Spec.Attribution != nil { + if susp.RoleARN != "" { + sessionARN, err := r.ensureSessionRole(ctx, platform, susp.RoleARN, susp.Suspended, r.IAMCfg) + if err != nil { + logger.Error(err, "ensureSessionRole failed") + return ctrl.Result{}, err + } + if sessionARN != "" { + platform.Status.SessionRoleArn = sessionARN + } + } + if err := r.ensureOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "ensureOperatorImpersonateRBAC failed") + return ctrl.Result{}, err + } + } else if platform.Status.SessionRoleArn != "" { + // Attribution was enabled and is now removed — tear the pair down. 
+ if err := r.deleteSessionRole(ctx, platform, r.IAMCfg.Environment); err != nil { + logger.Error(err, "deleteSessionRole (attribution removed) failed") + return ctrl.Result{}, err + } + if err := r.deleteOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "deleteOperatorImpersonateRBAC (attribution removed) failed") + return ctrl.Result{}, err + } + platform.Status.SessionRoleArn = "" + } + if susp.Suspended { platform.Status.Phase = phaseSuspended if platform.Status.SuspendedAt == nil { diff --git a/operators/internal/controller/platform_iam.go b/operators/internal/controller/platform_iam.go index af48cdf..07c5ea0 100644 --- a/operators/internal/controller/platform_iam.go +++ b/operators/internal/controller/platform_iam.go @@ -317,12 +317,19 @@ func (r *PlatformReconciler) reconcileManagedPolicies(ctx context.Context, roleN } // deleteIamRole is the finalizer counterpart: detach all policies and -// delete the role. Tolerates NotFound so re-runs are safe. +// delete the tenant role. Tolerates NotFound so re-runs are safe. func (r *PlatformReconciler) deleteIamRole(ctx context.Context, p *platformv1alpha1.Platform, environment string) error { + return r.detachAndDeleteRole(ctx, tenantRoleName(environment, p)) +} + +// detachAndDeleteRole detaches every managed policy from a role and deletes +// it. Shared by the tenant-role and session-role finalizers. Tolerates +// NotFound at every step so re-runs (and roles that were never created) are +// safe no-ops. 
+func (r *PlatformReconciler) detachAndDeleteRole(ctx context.Context, name string) error { if r.IAM == nil { return nil } - name := tenantRoleName(environment, p) var marker *string for { listOut, err := r.IAM.ListAttachedRolePolicies(ctx, &iam.ListAttachedRolePoliciesInput{ diff --git a/operators/internal/controller/platform_iam_reconcile_test.go b/operators/internal/controller/platform_iam_reconcile_test.go index 1df8e71..5686ba2 100644 --- a/operators/internal/controller/platform_iam_reconcile_test.go +++ b/operators/internal/controller/platform_iam_reconcile_test.go @@ -40,11 +40,14 @@ type fakeIAM struct { roles map[string]*iamtypes.Role attached map[string]map[string]struct{} // roleName -> set of policy ARNs - listCalls int - attachCalls []iam.AttachRolePolicyInput - listReturnsErr error - attachReturnsErr map[string]error // policyARN -> err - pageBoundary int // if > 0, paginate ListAttached at this size + listCalls int + attachCalls []iam.AttachRolePolicyInput + createCalls []iam.CreateRoleInput + updateAssumeCalls []iam.UpdateAssumeRolePolicyInput + detachCalls []iam.DetachRolePolicyInput + listReturnsErr error + attachReturnsErr map[string]error // policyARN -> err + pageBoundary int // if > 0, paginate ListAttached at this size } func newFakeIAM() *fakeIAM { @@ -83,6 +86,7 @@ func (f *fakeIAM) attachmentsFor(roleName string) []string { } func (f *fakeIAM) CreateRole(_ context.Context, params *iam.CreateRoleInput, _ ...func(*iam.Options)) (*iam.CreateRoleOutput, error) { + f.createCalls = append(f.createCalls, *params) name := aws.ToString(params.RoleName) arn := "arn:aws:iam::123456789012:role/" + name f.seedRole(name, arn, params.Tags...) 
@@ -109,7 +113,8 @@ func (f *fakeIAM) TagRole(_ context.Context, _ *iam.TagRoleInput, _ ...func(*iam return &iam.TagRoleOutput{}, nil } -func (f *fakeIAM) UpdateAssumeRolePolicy(_ context.Context, _ *iam.UpdateAssumeRolePolicyInput, _ ...func(*iam.Options)) (*iam.UpdateAssumeRolePolicyOutput, error) { +func (f *fakeIAM) UpdateAssumeRolePolicy(_ context.Context, params *iam.UpdateAssumeRolePolicyInput, _ ...func(*iam.Options)) (*iam.UpdateAssumeRolePolicyOutput, error) { + f.updateAssumeCalls = append(f.updateAssumeCalls, *params) return &iam.UpdateAssumeRolePolicyOutput{}, nil } @@ -128,6 +133,7 @@ func (f *fakeIAM) AttachRolePolicy(_ context.Context, params *iam.AttachRolePoli } func (f *fakeIAM) DetachRolePolicy(_ context.Context, params *iam.DetachRolePolicyInput, _ ...func(*iam.Options)) (*iam.DetachRolePolicyOutput, error) { + f.detachCalls = append(f.detachCalls, *params) roleName := aws.ToString(params.RoleName) delete(f.attached[roleName], aws.ToString(params.PolicyArn)) return &iam.DetachRolePolicyOutput{}, nil diff --git a/operators/internal/controller/platform_rbac.go b/operators/internal/controller/platform_rbac.go new file mode 100644 index 0000000..0096edd --- /dev/null +++ b/operators/internal/controller/platform_rbac.go @@ -0,0 +1,95 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "fmt" + + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1" +) + +// impersonateResourceName is the cluster-scoped name shared by the ClusterRole +// and ClusterRoleBinding that let a Platform's tenant ServiceAccount +// impersonate its named operators. 
+// Keyed off the (already unique, length-safe)
+// tenant namespace so two Platforms never collide on a cluster-global name.
+func impersonateResourceName(p *platformv1alpha1.Platform) string {
+	return PlatformNamespace(p) + "-impersonate"
+}
+
+// ensureOperatorImpersonateRBAC provisions the apiserver half of attribution:
+// a ClusterRole granting `impersonate` on exactly the Platform's operator
+// users, bound to the tenant-runtime ServiceAccount. fab's session kubeconfig
+// authenticates with that SA's token while impersonating the operator, so the
+// apiserver audit log records impersonatedUser=<operator>.
+//
+// Scoped to the named users only (never `impersonate *`), so the SA can act as
+// the listed humans and no one else. An RBAC rule whose resourceNames is empty
+// matches every name, so an empty operator list is refused: any grant left
+// over from an earlier reconcile is removed and an error is returned instead
+// of writing an unscoped rule.
+//
+// Cluster-scoped resources can't be GC'd via an OwnerReference from the
+// namespaced Platform, so cleanup runs through deleteOperatorImpersonateRBAC
+// in the finalizer (same pattern as the tenant namespace).
+//
+// Returns nil without touching the cluster when spec.attribution is unset.
+func (r *PlatformReconciler) ensureOperatorImpersonateRBAC(ctx context.Context, p *platformv1alpha1.Platform) error {
+	if p.Spec.Attribution == nil {
+		return nil
+	}
+	name := impersonateResourceName(p)
+	operators := p.Spec.Attribution.Operators
+	if len(operators) == 0 {
+		// Fail closed: ResourceNames == [] would mean "impersonate any user".
+		// Drop whatever was granted before, then surface the misconfiguration.
+		if err := r.deleteOperatorImpersonateRBAC(ctx, p); err != nil {
+			return err
+		}
+		return fmt.Errorf("ensure impersonate ClusterRole %s: spec.attribution.operators is empty, refusing to grant unscoped impersonation", name)
+	}
+
+	cr := &rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: name}}
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, cr, func() error {
+		cr.Labels = labelsForPlatform(p)
+		// Rules are overwritten wholesale so a shrunk operator list revokes
+		// the removed users on the next reconcile.
+		cr.Rules = []rbacv1.PolicyRule{{
+			APIGroups:     []string{""},
+			Resources:     []string{"users"},
+			Verbs:         []string{"impersonate"},
+			ResourceNames: operators,
+		}}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("ensure impersonate ClusterRole %s: %w", name, err)
+	}
+
+	crb := &rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: name}}
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, crb, func() error {
+		crb.Labels = labelsForPlatform(p)
+		crb.RoleRef = rbacv1.RoleRef{
+			APIGroup: rbacv1.GroupName,
+			Kind:     "ClusterRole",
+			Name:     name,
+		}
+		crb.Subjects = []rbacv1.Subject{{
+			Kind:      rbacv1.ServiceAccountKind,
+			Name:      tenantSAName,
+			Namespace: PlatformNamespace(p),
+		}}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("ensure impersonate ClusterRoleBinding %s: %w", name, err)
+	}
+	return nil
+}
+
+// deleteOperatorImpersonateRBAC removes the impersonate ClusterRole +
+// ClusterRoleBinding. Tolerates NotFound so non-attribution Platforms and
+// re-runs are safe no-ops.
+func (r *PlatformReconciler) deleteOperatorImpersonateRBAC(ctx context.Context, p *platformv1alpha1.Platform) error {
+	name := impersonateResourceName(p)
+	// Binding first: it is what attaches the grant to the ServiceAccount.
+	crb := &rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: name}}
+	if err := r.Delete(ctx, crb); err != nil && !apierrors.IsNotFound(err) {
+		return fmt.Errorf("delete impersonate ClusterRoleBinding %s: %w", name, err)
+	}
+	cr := &rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: name}}
+	if err := r.Delete(ctx, cr); err != nil && !apierrors.IsNotFound(err) {
+		return fmt.Errorf("delete impersonate ClusterRole %s: %w", name, err)
+	}
+	return nil
+}
diff --git a/operators/internal/controller/platform_rbac_test.go b/operators/internal/controller/platform_rbac_test.go
new file mode 100644
index 0000000..a039364
--- /dev/null
+++ b/operators/internal/controller/platform_rbac_test.go
@@ -0,0 +1,123 @@
+/*
+Copyright 2026 stxkxs.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "testing" + + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func rbacTestClient(t *testing.T) client.Client { + t.Helper() + scheme := runtime.NewScheme() + if err := rbacv1.AddToScheme(scheme); err != nil { + t.Fatalf("add rbac scheme: %v", err) + } + return fake.NewClientBuilder().WithScheme(scheme).Build() +} + +func TestEnsureOperatorImpersonateRBAC(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com", "bob@acme.com"}, nil) + + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("ensureOperatorImpersonateRBAC: %v", err) + } + name := impersonateResourceName(p) + + var cr rbacv1.ClusterRole + if err := cl.Get(context.Background(), types.NamespacedName{Name: name}, &cr); err != nil { + t.Fatalf("get ClusterRole: %v", err) + } + if len(cr.Rules) != 1 { + t.Fatalf("rules: got %d want 1", len(cr.Rules)) + } + rule := cr.Rules[0] + if len(rule.Verbs) != 1 || rule.Verbs[0] != "impersonate" { + t.Errorf("verbs: got %v want [impersonate]", rule.Verbs) + } + if len(rule.Resources) != 1 || rule.Resources[0] != "users" { + t.Errorf("resources: got %v want [users]", rule.Resources) + } + wantOps := map[string]bool{"alice@acme.com": true, "bob@acme.com": true} + if len(rule.ResourceNames) != len(wantOps) { + t.Fatalf("resourceNames: got %v want %v", rule.ResourceNames, wantOps) + } + for _, op := range rule.ResourceNames { + if !wantOps[op] { + t.Errorf("unexpected resourceName %q (impersonation must be scoped to the named operators)", op) + } + } + + var crb rbacv1.ClusterRoleBinding + if err := 
cl.Get(context.Background(), types.NamespacedName{Name: name}, &crb); err != nil { + t.Fatalf("get ClusterRoleBinding: %v", err) + } + if crb.RoleRef.Name != name || crb.RoleRef.Kind != "ClusterRole" { + t.Errorf("roleRef: got %+v", crb.RoleRef) + } + if len(crb.Subjects) != 1 { + t.Fatalf("subjects: got %d want 1", len(crb.Subjects)) + } + sub := crb.Subjects[0] + if sub.Kind != "ServiceAccount" || sub.Name != tenantSAName || sub.Namespace != PlatformNamespace(p) { + t.Errorf("subject: got %+v want ServiceAccount %s/%s", sub, PlatformNamespace(p), tenantSAName) + } +} + +func TestEnsureOperatorImpersonateRBAC_UpdatesOperators(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("first ensure: %v", err) + } + + p.Spec.Attribution.Operators = []string{"carol@acme.com"} + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("second ensure: %v", err) + } + + var cr rbacv1.ClusterRole + if err := cl.Get(context.Background(), types.NamespacedName{Name: impersonateResourceName(p)}, &cr); err != nil { + t.Fatalf("get ClusterRole: %v", err) + } + got := cr.Rules[0].ResourceNames + if len(got) != 1 || got[0] != "carol@acme.com" { + t.Errorf("resourceNames after operator change: got %v want [carol@acme.com]", got) + } +} + +func TestDeleteOperatorImpersonateRBAC(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("ensure: %v", err) + } + + if err := r.deleteOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("delete: %v", err) + } + name := impersonateResourceName(p) + var cr rbacv1.ClusterRole + if err := 
cl.Get(context.Background(), types.NamespacedName{Name: name}, &cr); !apierrors.IsNotFound(err) { + t.Errorf("ClusterRole should be gone: err=%v", err) + } + // Deleting again is a tolerated no-op. + if err := r.deleteOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Errorf("second delete should be a no-op: %v", err) + } +} diff --git a/operators/internal/controller/platform_session_iam.go b/operators/internal/controller/platform_session_iam.go new file mode 100644 index 0000000..78b1d78 --- /dev/null +++ b/operators/internal/controller/platform_session_iam.go @@ -0,0 +1,195 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/iam" + iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" + + platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1" +) + +// defaultSessionRoleMaxDuration is the assumed-session lifetime when a +// Platform's spec.attribution.sessionRoleMaxDurationSeconds is unset. Matches +// the STS role-chaining ceiling: the caller is the pod's own IRSA-assumed +// tenant role, so AWS caps the chained session at 3600s regardless. +const defaultSessionRoleMaxDuration int32 = 3600 + +// sessionRoleName returns the attribution session role minted for a Platform: +// +// --session +// +// Same 64-char cap + FNV-1a hash-truncation scheme as tenantRoleName, so the +// two role names never collide and both stay within IAM's role-name limit. 
+func sessionRoleName(env string, p *platformv1alpha1.Platform) string {
+	const suffix = "-session"
+	const maxLen = 64
+	full := env + "-" + p.Name + suffix
+	if len(full) <= maxLen {
+		return full
+	}
+	prefix := env + "-"
+	// Bytes left for the truncated Platform name once the prefix, the "-"
+	// before the hash, the 8 hex digits and the suffix are reserved.
+	budget := maxLen - len(prefix) - len(suffix) - 1 - 8
+	if budget < 0 {
+		// The environment alone overflows the cap (47+ bytes). Keep none of
+		// the Platform name and return an over-long result, which the IAM
+		// call is expected to reject, instead of panicking on a negative
+		// slice bound.
+		budget = 0
+	}
+	h := fnv1a64(p.Name)
+	return fmt.Sprintf("%s%s-%08x%s", prefix, p.Name[:budget], h&0xffffffff, suffix)
+}
+
+// sessionRoleTrustPolicy builds the trust policy for the attribution session
+// role: only the tenant IRSA role may assume it, and only while setting an STS
+// SourceIdentity drawn from the Platform's operator list. sts:SetSourceIdentity
+// is granted alongside sts:AssumeRole so the caller can stamp the human, and
+// the sts:SourceIdentity condition pins the allowed values so the caller can't
+// assume the role under an arbitrary identity.
+//
+// Returns the policy document as a JSON string. Returns an error when
+// operators is empty: there would be nothing to pin sts:SourceIdentity to, and
+// a statement without the condition would let the tenant role assume the
+// session role under any identity.
+func sessionRoleTrustPolicy(tenantRoleARN string, operators []string) (string, error) {
+	if len(operators) == 0 {
+		return "", fmt.Errorf("session trust policy for %s: no operators to pin sts:SourceIdentity to", tenantRoleARN)
+	}
+	doc := map[string]any{
+		"Version": "2012-10-17",
+		"Statement": []map[string]any{{
+			"Effect":    "Allow",
+			"Principal": map[string]any{"AWS": tenantRoleARN},
+			"Action":    []string{"sts:AssumeRole", "sts:SetSourceIdentity"},
+			"Condition": map[string]any{
+				"StringEquals": map[string]any{"sts:SourceIdentity": operators},
+			},
+		}},
+	}
+	b, err := json.Marshal(doc)
+	if err != nil {
+		return "", fmt.Errorf("marshal session trust policy: %w", err)
+	}
+	return string(b), nil
+}
+
+// sessionRoleTags mirrors tenantRoleTags but marks the role's Component as
+// session-iam so cloudgov tagging + cost attribution tell the two roles apart.
+func sessionRoleTags(p *platformv1alpha1.Platform, cfg IAMConfig) []iamtypes.Tag {
+	tags := tenantRoleTags(p, cfg)
+	// Rewrite the Component tag in place; every other tag is kept as the
+	// tenant role has it.
+	for i := range tags {
+		if aws.ToString(tags[i].Key) == "Component" {
+			tags[i].Value = aws.String("session-iam")
+		}
+	}
+	return tags
+}
+
+// sessionRoleMaxDuration reads the per-Platform cap, defaulting to 3600.
+func sessionRoleMaxDuration(p *platformv1alpha1.Platform) int32 {
+	if attr := p.Spec.Attribution; attr != nil && attr.SessionRoleMaxDurationSeconds != nil {
+		return *attr.SessionRoleMaxDurationSeconds
+	}
+	return defaultSessionRoleMaxDuration
+}
+
+// ensureSessionRole provisions (or reconciles) the attribution session role for
+// a Platform with spec.attribution set, and returns its ARN. The role is
+// assumable only by the tenant IRSA role, only while carrying one of the
+// Platform's operators as STS SourceIdentity, and is limited to the tenant
+// baseline policy (Bedrock invoke) — never broad sts:AssumeRole.
+//
+// When suspended (kill-switch), the baseline is DETACHED rather than attached:
+// otherwise a suspended tenant could keep invoking Bedrock through the session
+// role even after its own tenant role's baseline was pulled.
+//
+// Idempotent: refreshes the trust policy on every reconcile (the operator list
+// can change) and converges the baseline attachment to the suspended state.
+// On an existing role both steps are always attempted, so a failed trust
+// refresh cannot keep the kill-switch detach from running (and a failed
+// baseline convergence cannot keep a revoked operator in the trust policy);
+// the first error is returned and the next reconcile retries.
+//
+// MaxSessionDuration, path, tags and the permissions boundary are only set at
+// creation; the existing-role path does not converge them.
+//
+// Returns ("", nil) when no IAM client is configured or spec.attribution is
+// unset. When an error occurs after the role is known to exist, the role ARN
+// is returned alongside the error.
+func (r *PlatformReconciler) ensureSessionRole(
+	ctx context.Context,
+	p *platformv1alpha1.Platform,
+	tenantRoleARN string,
+	suspended bool,
+	cfg IAMConfig,
+) (string, error) {
+	if r.IAM == nil || p.Spec.Attribution == nil {
+		return "", nil
+	}
+	name := sessionRoleName(cfg.Environment, p)
+	trust, err := sessionRoleTrustPolicy(tenantRoleARN, p.Spec.Attribution.Operators)
+	if err != nil {
+		return "", err
+	}
+
+	// Idempotency: GetRole first; if present, refresh trust + converge baseline.
+	getOut, getErr := r.IAM.GetRole(ctx, &iam.GetRoleInput{RoleName: aws.String(name)})
+	if getErr != nil && !isIAMNotFound(getErr) {
+		return "", fmt.Errorf("iam GetRole %s: %w", name, getErr)
+	}
+	if getErr == nil {
+		if getOut == nil || getOut.Role == nil {
+			// Success without a role is not a miss; don't fall through to
+			// CreateRole and don't wrap a nil error.
+			return "", fmt.Errorf("iam GetRole %s: response carried no role", name)
+		}
+		arn := aws.ToString(getOut.Role.Arn)
+		_, trustErr := r.IAM.UpdateAssumeRolePolicy(ctx, &iam.UpdateAssumeRolePolicyInput{
+			RoleName:       aws.String(name),
+			PolicyDocument: aws.String(trust),
+		})
+		// Converge the baseline even if the trust refresh failed: when
+		// suspended this is the kill-switch detach.
+		baselineErr := r.reconcileSessionBaseline(ctx, name, cfg.TenantBaselinePolicyARN, suspended)
+		if trustErr != nil {
+			return arn, fmt.Errorf("iam UpdateAssumeRolePolicy %s: %w", name, trustErr)
+		}
+		if baselineErr != nil {
+			return arn, baselineErr
+		}
+		return arn, nil
+	}
+
+	// GetRole reported NotFound: create the role.
+	path := cfg.TenantIAMPath
+	if path == "" {
+		path = "/eks-agent-platform/tenants/"
+	}
+	if !strings.HasSuffix(path, "/") {
+		path += "/"
+	}
+	createInput := &iam.CreateRoleInput{
+		RoleName:                 aws.String(name),
+		Path:                     aws.String(path),
+		AssumeRolePolicyDocument: aws.String(trust),
+		Description:              aws.String(fmt.Sprintf("Attribution session role for Platform %s (tenant %s)", p.Name, p.Spec.Tenant)),
+		MaxSessionDuration:       aws.Int32(sessionRoleMaxDuration(p)),
+		Tags:                     sessionRoleTags(p, cfg),
+	}
+	if cfg.TenantPermissionsBoundaryARN != "" {
+		createInput.PermissionsBoundary = aws.String(cfg.TenantPermissionsBoundaryARN)
+	}
+	createOut, err := r.IAM.CreateRole(ctx, createInput)
+	if err != nil {
+		return "", fmt.Errorf("iam CreateRole %s: %w", name, err)
+	}
+	arn := aws.ToString(createOut.Role.Arn)
+	if err := r.reconcileSessionBaseline(ctx, name, cfg.TenantBaselinePolicyARN, suspended); err != nil {
+		return arn, err
+	}
+	return arn, nil
+}
+
+// reconcileSessionBaseline converges the session role's baseline attachment:
+// attached when running, detached when suspended (kill-switch parity with the
+// tenant role). No-op when no baseline policy is configured (dev/test).
+func (r *PlatformReconciler) reconcileSessionBaseline(ctx context.Context, roleName, baselineARN string, suspended bool) error {
+	// No baseline configured: nothing to attach or detach.
+	if baselineARN == "" {
+		return nil
+	}
+	if suspended {
+		// Kill-switch path: only the baseline is detached here. NotFound is
+		// tolerated so a role that never had (or already lost) the attachment
+		// is a no-op.
+		if _, err := r.IAM.DetachRolePolicy(ctx, &iam.DetachRolePolicyInput{
+			RoleName:  aws.String(roleName),
+			PolicyArn: aws.String(baselineARN),
+		}); err != nil && !isIAMNotFound(err) {
+			return fmt.Errorf("iam DetachRolePolicy %s (suspend session role): %w", baselineARN, err)
+		}
+		return nil
+	}
+	// Running path: delegate to the shared managed-policy reconciler with the
+	// baseline and no extra policies.
+	// NOTE(review): presumably this attaches the baseline if missing — confirm
+	// against reconcileManagedPolicies.
+	return r.reconcileManagedPolicies(ctx, roleName, baselineARN, nil)
+}
+
+// deleteSessionRole is the finalizer counterpart: detach policies + delete the
+// session role. Tolerates NotFound so non-attribution Platforms (which never
+// had a session role) and re-runs are safe.
+func (r *PlatformReconciler) deleteSessionRole(ctx context.Context, p *platformv1alpha1.Platform, environment string) error {
+	// Same teardown helper as the tenant role, pointed at the session role name.
+	return r.detachAndDeleteRole(ctx, sessionRoleName(environment, p))
+}
diff --git a/operators/internal/controller/platform_session_iam_test.go b/operators/internal/controller/platform_session_iam_test.go
new file mode 100644
index 0000000..c253fe6
--- /dev/null
+++ b/operators/internal/controller/platform_session_iam_test.go
@@ -0,0 +1,166 @@
+/*
+Copyright 2026 stxkxs.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+*/
+
+package controller
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+
+	platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1"
+)
+
+// attributedPlatform builds a Platform with spec.attribution set. Shared with
+// the RBAC tests (same package).
+//
+//nolint:unparam // test helper: name/tenant are fixed across cases by design
+func attributedPlatform(name, tenant string, operators []string, maxDur *int32) *platformv1alpha1.Platform {
+	p := newPlatform(name, tenant)
+	p.Spec.Attribution = &platformv1alpha1.AttributionSpec{
+		Operators:                     operators,
+		SessionRoleMaxDurationSeconds: maxDur,
+	}
+	return p
+}
+
+// TestEnsureSessionRole_CreatesRoleWithTrustAndBaseline covers the create
+// path: one CreateRole call carrying the scoped trust policy and the default
+// max duration, followed by the baseline attachment.
+func TestEnsureSessionRole_CreatesRoleWithTrustAndBaseline(t *testing.T) {
+	const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline"
+	const tenantARN = "arn:aws:iam::123456789012:role/production-acme-tenant"
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"}
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com", "bob@acme.com"}, nil)
+
+	arn, err := r.ensureSessionRole(context.Background(), p, tenantARN, false, cfg)
+	if err != nil {
+		t.Fatalf("ensureSessionRole: %v", err)
+	}
+	if arn == "" {
+		t.Fatal("expected a session role ARN")
+	}
+	name := sessionRoleName(cfg.Environment, p)
+	if name != "production-acme-session" {
+		t.Fatalf("session role name: got %s want production-acme-session", name)
+	}
+	if len(f.createCalls) != 1 {
+		t.Fatalf("create calls: got %d want 1", len(f.createCalls))
+	}
+
+	// Trust: only the tenant role may assume, only while setting one of the
+	// operators as SourceIdentity, and NOT via web identity / broad assume.
+	trust := aws.ToString(f.createCalls[0].AssumeRolePolicyDocument)
+	for _, want := range []string{tenantARN, "sts:AssumeRole", "sts:SetSourceIdentity", "sts:SourceIdentity", "alice@acme.com", "bob@acme.com"} {
+		if !strings.Contains(trust, want) {
+			t.Errorf("trust policy missing %q:\n%s", want, trust)
+		}
+	}
+	if strings.Contains(trust, "AssumeRoleWithWebIdentity") {
+		t.Errorf("session role trust must not grant web-identity assume:\n%s", trust)
+	}
+	if got := aws.ToInt32(f.createCalls[0].MaxSessionDuration); got != 3600 {
+		t.Errorf("MaxSessionDuration: got %d want 3600", got)
+	}
+	if got := f.attachmentsFor(name); len(got) != 1 || got[0] != baseline {
+		t.Errorf("baseline attachment: got %v want [%s]", got, baseline)
+	}
+}
+
+// TestEnsureSessionRole_CustomMaxDuration checks that the per-Platform cap is
+// passed through to CreateRole.
+func TestEnsureSessionRole_CustomMaxDuration(t *testing.T) {
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{Environment: "production"}
+	dur := int32(7200)
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, &dur)
+
+	if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, cfg); err != nil {
+		t.Fatalf("ensureSessionRole: %v", err)
+	}
+	// Guard the index below so a missing CreateRole fails cleanly instead of
+	// panicking the test binary.
+	if len(f.createCalls) != 1 {
+		t.Fatalf("create calls: got %d want 1", len(f.createCalls))
+	}
+	if got := aws.ToInt32(f.createCalls[0].MaxSessionDuration); got != 7200 {
+		t.Errorf("MaxSessionDuration: got %d want 7200", got)
+	}
+}
+
+// TestEnsureSessionRole_IdempotentRefreshesTrust covers the existing-role
+// path: no CreateRole, one trust refresh, baseline converged.
+func TestEnsureSessionRole_IdempotentRefreshesTrust(t *testing.T) {
+	const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline"
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"}
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil)
+	name := sessionRoleName(cfg.Environment, p)
+	f.seedRole(name, "arn:aws:iam::123:role/"+name)
+
+	if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, cfg); err != nil {
+		t.Fatalf("ensureSessionRole: %v", err)
+	}
+	if len(f.createCalls) != 0 {
+		t.Errorf("create calls: got %d want 0 (role already existed)", len(f.createCalls))
+	}
+	if len(f.updateAssumeCalls) != 1 {
+		t.Fatalf("trust-refresh calls: got %d want 1", len(f.updateAssumeCalls))
+	}
+	if !strings.Contains(aws.ToString(f.updateAssumeCalls[0].PolicyDocument), "alice@acme.com") {
+		t.Errorf("refreshed trust should carry the operator")
+	}
+	if got := f.attachmentsFor(name); len(got) != 1 || got[0] != baseline {
+		t.Errorf("baseline attachment: got %v", got)
+	}
+}
+
+// TestEnsureSessionRole_SuspendedDetachesBaseline covers the kill-switch on an
+// existing role: the baseline is detached, and the trust policy is still
+// refreshed so operator changes land while suspended.
+func TestEnsureSessionRole_SuspendedDetachesBaseline(t *testing.T) {
+	const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline"
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"}
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil)
+	name := sessionRoleName(cfg.Environment, p)
+	f.seedRole(name, "arn:aws:iam::123:role/"+name)
+	f.seedAttachment(name, baseline)
+
+	if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", true, cfg); err != nil {
+		t.Fatalf("ensureSessionRole (suspended): %v", err)
+	}
+	if got := f.attachmentsFor(name); len(got) != 0 {
+		t.Errorf("baseline must be detached when suspended (kill-switch parity): got %v", got)
+	}
+	if len(f.updateAssumeCalls) != 1 {
+		t.Errorf("trust-refresh calls while suspended: got %d want 1", len(f.updateAssumeCalls))
+	}
+}
+
+// TestEnsureSessionRole_CreateWhileSuspendedSkipsBaseline covers the create
+// path under the kill-switch: the role is minted but never receives the
+// baseline.
+func TestEnsureSessionRole_CreateWhileSuspendedSkipsBaseline(t *testing.T) {
+	const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline"
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"}
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil)
+	name := sessionRoleName(cfg.Environment, p)
+
+	if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", true, cfg); err != nil {
+		t.Fatalf("ensureSessionRole (create while suspended): %v", err)
+	}
+	if len(f.createCalls) != 1 {
+		t.Fatalf("create calls: got %d want 1", len(f.createCalls))
+	}
+	if len(f.attachCalls) != 0 {
+		t.Errorf("attach calls while suspended: got %d want 0", len(f.attachCalls))
+	}
+	if got := f.attachmentsFor(name); len(got) != 0 {
+		t.Errorf("baseline must not be attached to a role created while suspended: got %v", got)
+	}
+}
+
+// TestEnsureSessionRole_NilAttributionNoop checks that a Platform without
+// spec.attribution yields no ARN, no error and no IAM writes.
+func TestEnsureSessionRole_NilAttributionNoop(t *testing.T) {
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	p := newPlatform("acme", "protohype") // no attribution
+
+	arn, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, IAMConfig{Environment: "production"})
+	if err != nil || arn != "" {
+		t.Fatalf("expected no-op; got arn=%q err=%v", arn, err)
+	}
+	if len(f.createCalls) != 0 {
+		t.Errorf("expected no create calls, got %d", len(f.createCalls))
+	}
+}
+
+// TestDeleteSessionRole checks that the session role is removed and that a
+// repeat delete is tolerated.
+func TestDeleteSessionRole(t *testing.T) {
+	f := newFakeIAM()
+	r := &PlatformReconciler{IAM: f}
+	cfg := IAMConfig{Environment: "production"}
+	p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil)
+	name := sessionRoleName(cfg.Environment, p)
+	f.seedRole(name, "arn:aws:iam::123:role/"+name)
+	f.seedAttachment(name, "arn:aws:iam::aws:policy/EksAgentBaseline")
+
+	if err := r.deleteSessionRole(context.Background(), p, cfg.Environment); err != nil {
+		t.Fatalf("deleteSessionRole: %v", err)
+	}
+	if _, ok := f.roles[name]; ok {
+		t.Errorf("session role should be deleted")
+	}
+	// Deleting a non-existent session role is a tolerated no-op.
+	if err := r.deleteSessionRole(context.Background(), p, cfg.Environment); err != nil {
+		t.Errorf("second delete should be a no-op: %v", err)
+	}
+}