diff --git a/charts/operator/crds/platform.nanohype.dev_platforms.yaml b/charts/operator/crds/platform.nanohype.dev_platforms.yaml index 7770c9e..d7c3587 100644 --- a/charts/operator/crds/platform.nanohype.dev_platforms.yaml +++ b/charts/operator/crds/platform.nanohype.dev_platforms.yaml @@ -63,6 +63,43 @@ spec: hosting one or more AgentFleets, with its own budget, identity, and guardrails. properties: + attribution: + description: |- + Attribution opts the Platform into per-session human attribution. When + set, the operator provisions a session role — assumable by the tenant + IRSA role with the operator carried as STS SourceIdentity, scoped to the + tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + ClusterRole letting the tenant ServiceAccount impersonate the named + operators at the apiserver. fab's role-session entrypoint consumes both, + so an agent's AWS + Kubernetes actions attribute to a named human. + nil = unattributed (the default). + properties: + operators: + description: |- + Operators is the set of human identities (e.g. email addresses) a + session in this Platform may act as. Each value becomes both an allowed + STS SourceIdentity on the session role's trust policy and a resourceNames + entry on the impersonate ClusterRole, so the SAME string binds the AWS + and Kubernetes audit records. Use a canonical form (a lowercased email); + it must byte-match the operator's own RBAC subject name. + items: + type: string + minItems: 1 + type: array + sessionRoleMaxDurationSeconds: + default: 3600 + description: |- + SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because + the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + chained session at 3600s regardless of this value; larger values only + matter if the caller ever changes. Defaults to 3600. + format: int32 + maximum: 43200 + minimum: 900 + type: integer + required: + - operators + type: object budget: description: Budget references a BudgetPolicy CR in the same namespace. properties: @@ -230,6 +267,11 @@ spec: phase: description: 'Phase: Pending, Provisioning, Ready, Suspended, Failed.' type: string + sessionRoleArn: + description: |- + SessionRoleArn is the per-Platform attribution session role, created when + spec.attribution is set. Empty when attribution is off. + type: string suspendedAt: description: |- SuspendedAt is the timestamp at which the kill-switch fired. When diff --git a/docs/crd-reference/v1alpha1.md b/docs/crd-reference/v1alpha1.md index 6b047eb..cf61df8 100644 --- a/docs/crd-reference/v1alpha1.md +++ b/docs/crd-reference/v1alpha1.md @@ -621,6 +621,24 @@ Package v1alpha1 contains API Schema definitions for the platform v1alpha1 API g +#### AttributionSpec + + + +AttributionSpec configures per-session human attribution for a Platform. See +github.com/nanohype/fab docs/attribution.md for the consumer side. + + + +_Appears in:_ +- [PlatformSpec](#platformspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `operators` _string array_ | Operators is the set of human identities (e.g. email addresses) a
session in this Platform may act as. Each value becomes both an allowed
STS SourceIdentity on the session role's trust policy and a resourceNames
entry on the impersonate ClusterRole, so the SAME string binds the AWS
and Kubernetes audit records. Use a canonical form (a lowercased email);
it must byte-match the operator's own RBAC subject name. | | MinItems: 1
| +| `sessionRoleMaxDurationSeconds` _integer_ | SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because
the caller is the tenant IRSA role, AWS STS role chaining hard-caps a
chained session at 3600s regardless of this value; larger values only
matter if the caller ever changes. Defaults to 3600. | 3600 | Maximum: 43200
Minimum: 900
Optional: \{\}
| + + #### BudgetRef @@ -737,6 +755,7 @@ _Appears in:_ | `identity` _[IdentitySpec](#identityspec)_ | Identity controls how the IRSA role is named + which Bedrock models are
reachable. | | | | `compliance` _[ComplianceSpec](#compliancespec)_ | Compliance flags drive stricter defaults across the Platform. | | Optional: \{\}
| | `isolation` _string_ | Isolation: namespace (default) or vCluster (hard isolation). | namespace | Enum: [namespace vcluster]
Optional: \{\}
| +| `attribution` _[AttributionSpec](#attributionspec)_ | Attribution opts the Platform into per-session human attribution. When
set, the operator provisions a session role — assumable by the tenant
IRSA role with the operator carried as STS SourceIdentity, scoped to the
tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a
ClusterRole letting the tenant ServiceAccount impersonate the named
operators at the apiserver. fab's role-session entrypoint consumes both,
so an agent's AWS + Kubernetes actions attribute to a named human.
nil = unattributed (the default). | | Optional: \{\}
| #### PlatformStatus @@ -754,6 +773,7 @@ _Appears in:_ | --- | --- | --- | --- | | `phase` _string_ | Phase: Pending, Provisioning, Ready, Suspended, Failed. | | Optional: \{\}
| | `iamRoleArn` _string_ | IamRoleArn is the per-Platform IRSA role created by the controller. | | Optional: \{\}
| +| `sessionRoleArn` _string_ | SessionRoleArn is the per-Platform attribution session role, created when
spec.attribution is set. Empty when attribution is off. | | Optional: \{\}
| | `namespace` _string_ | Namespace is the tenant namespace the controller provisioned. | | Optional: \{\}
| | `observedGeneration` _integer_ | ObservedGeneration is the last spec.generation the controller reconciled. | | Optional: \{\}
| | `suspendedAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#time-v1-meta)_ | SuspendedAt is the timestamp at which the kill-switch fired. When
non-nil the operator stops reattaching the baseline IAM policy and
the AgentFleetReconciler scales fleets to zero. Resets to nil only
when ops clears the iam:TagRole 'platform.nanohype.dev/suspended'
marker on the tenant IRSA role. | | Optional: \{\}
| diff --git a/operators/api/platform/v1alpha1/platform_types.go b/operators/api/platform/v1alpha1/platform_types.go index e1213f5..9ee22ab 100644 --- a/operators/api/platform/v1alpha1/platform_types.go +++ b/operators/api/platform/v1alpha1/platform_types.go @@ -44,6 +44,40 @@ type PlatformSpec struct { // +kubebuilder:default=namespace // +optional Isolation string `json:"isolation,omitempty"` + + // Attribution opts the Platform into per-session human attribution. When + // set, the operator provisions a session role — assumable by the tenant + // IRSA role with the operator carried as STS SourceIdentity, scoped to the + // tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + // ClusterRole letting the tenant ServiceAccount impersonate the named + // operators at the apiserver. fab's role-session entrypoint consumes both, + // so an agent's AWS + Kubernetes actions attribute to a named human. + // nil = unattributed (the default). + // +optional + Attribution *AttributionSpec `json:"attribution,omitempty"` +} + +// AttributionSpec configures per-session human attribution for a Platform. See +// github.com/nanohype/fab docs/attribution.md for the consumer side. +type AttributionSpec struct { + // Operators is the set of human identities (e.g. email addresses) a + // session in this Platform may act as. Each value becomes both an allowed + // STS SourceIdentity on the session role's trust policy and a resourceNames + // entry on the impersonate ClusterRole, so the SAME string binds the AWS + // and Kubernetes audit records. Use a canonical form (a lowercased email); + // it must byte-match the operator's own RBAC subject name. + // +kubebuilder:validation:MinItems=1 + Operators []string `json:"operators"` + + // SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because + // the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + // chained session at 3600s regardless of this value; larger values only + // matter if the caller ever changes. Defaults to 3600. + // +kubebuilder:validation:Minimum=900 + // +kubebuilder:validation:Maximum=43200 + // +kubebuilder:default=3600 + // +optional + SessionRoleMaxDurationSeconds *int32 `json:"sessionRoleMaxDurationSeconds,omitempty"` } // BudgetRef points at a BudgetPolicy by name. @@ -91,6 +125,11 @@ type PlatformStatus struct { // +optional IamRoleArn string `json:"iamRoleArn,omitempty"` + // SessionRoleArn is the per-Platform attribution session role, created when + // spec.attribution is set. Empty when attribution is off. + // +optional + SessionRoleArn string `json:"sessionRoleArn,omitempty"` + // Namespace is the tenant namespace the controller provisioned. // +optional Namespace string `json:"namespace,omitempty"` diff --git a/operators/api/platform/v1alpha1/zz_generated.deepcopy.go b/operators/api/platform/v1alpha1/zz_generated.deepcopy.go index 0e33577..43e7f69 100644 --- a/operators/api/platform/v1alpha1/zz_generated.deepcopy.go +++ b/operators/api/platform/v1alpha1/zz_generated.deepcopy.go @@ -25,6 +25,31 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AttributionSpec) DeepCopyInto(out *AttributionSpec) { + *out = *in + if in.Operators != nil { + in, out := &in.Operators, &out.Operators + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.SessionRoleMaxDurationSeconds != nil { + in, out := &in.SessionRoleMaxDurationSeconds, &out.SessionRoleMaxDurationSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AttributionSpec. +func (in *AttributionSpec) DeepCopy() *AttributionSpec { + if in == nil { + return nil + } + out := new(AttributionSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *BudgetRef) DeepCopyInto(out *BudgetRef) { *out = *in @@ -165,6 +190,11 @@ func (in *PlatformSpec) DeepCopyInto(out *PlatformSpec) { out.Budget = in.Budget in.Identity.DeepCopyInto(&out.Identity) out.Compliance = in.Compliance + if in.Attribution != nil { + in, out := &in.Attribution, &out.Attribution + *out = new(AttributionSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlatformSpec. diff --git a/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml b/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml index 7770c9e..d7c3587 100644 --- a/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml +++ b/operators/config/crd/bases/platform.nanohype.dev_platforms.yaml @@ -63,6 +63,43 @@ spec: hosting one or more AgentFleets, with its own budget, identity, and guardrails. properties: + attribution: + description: |- + Attribution opts the Platform into per-session human attribution. When + set, the operator provisions a session role — assumable by the tenant + IRSA role with the operator carried as STS SourceIdentity, scoped to the + tenant baseline (Bedrock invoke) and NOT broad sts:AssumeRole — plus a + ClusterRole letting the tenant ServiceAccount impersonate the named + operators at the apiserver. fab's role-session entrypoint consumes both, + so an agent's AWS + Kubernetes actions attribute to a named human. + nil = unattributed (the default). + properties: + operators: + description: |- + Operators is the set of human identities (e.g. email addresses) a + session in this Platform may act as. Each value becomes both an allowed + STS SourceIdentity on the session role's trust policy and a resourceNames + entry on the impersonate ClusterRole, so the SAME string binds the AWS + and Kubernetes audit records. Use a canonical form (a lowercased email); + it must byte-match the operator's own RBAC subject name. + items: + type: string + minItems: 1 + type: array + sessionRoleMaxDurationSeconds: + default: 3600 + description: |- + SessionRoleMaxDurationSeconds caps the assumed session lifetime. Because + the caller is the tenant IRSA role, AWS STS role chaining hard-caps a + chained session at 3600s regardless of this value; larger values only + matter if the caller ever changes. Defaults to 3600. + format: int32 + maximum: 43200 + minimum: 900 + type: integer + required: + - operators + type: object budget: description: Budget references a BudgetPolicy CR in the same namespace. properties: @@ -230,6 +267,11 @@ spec: phase: description: 'Phase: Pending, Provisioning, Ready, Suspended, Failed.' type: string + sessionRoleArn: + description: |- + SessionRoleArn is the per-Platform attribution session role, created when + spec.attribution is set. Empty when attribution is off. + type: string suspendedAt: description: |- SuspendedAt is the timestamp at which the kill-switch fired. When diff --git a/operators/config/rbac/role.yaml b/operators/config/rbac/role.yaml index c7cfb61..862a055 100644 --- a/operators/config/rbac/role.yaml +++ b/operators/config/rbac/role.yaml @@ -21,6 +21,12 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - users + verbs: + - impersonate - apiGroups: - agentgateway.dev resources: @@ -220,6 +226,19 @@ rules: - get - patch - update +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - resource.k8s.io resources: diff --git a/operators/internal/controller/platform_controller.go b/operators/internal/controller/platform_controller.go index 7ff9ae8..5f8cde7 100644 --- a/operators/internal/controller/platform_controller.go +++ b/operators/internal/controller/platform_controller.go @@ -69,6 +69,8 @@ type PlatformReconciler struct { // +kubebuilder:rbac:groups="",resources=namespaces;resourcequotas;limitranges,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=networking.k8s.io,resources=networkpolicies,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=argoproj.io,resources=appprojects,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=users,verbs=impersonate // Reconcile drives a Platform CR toward its desired state. func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -105,6 +107,15 @@ func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c logger.Error(err, "IAM role cleanup failed; will retry") return ctrl.Result{}, err } + // Attribution resources (no-ops when attribution was never enabled). + if err := r.deleteSessionRole(ctx, platform, r.IAMCfg.Environment); err != nil { + logger.Error(err, "session role cleanup failed; will retry") + return ctrl.Result{}, err + } + if err := r.deleteOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "impersonate RBAC cleanup failed; will retry") + return ctrl.Result{}, err + } controllerutil.RemoveFinalizer(platform, finalizerName) if err := r.Update(ctx, platform); err != nil { return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) @@ -199,6 +210,40 @@ func (r *PlatformReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } + // Per-session human attribution (optional). Provision the session role + // (assumable by the tenant IRSA role with the operator as STS + // SourceIdentity) + the apiserver impersonate RBAC. Reconciles in both + // directions: removing spec.attribution tears the pair back down. The + // session role honors the kill-switch via the susp.Suspended flag (baseline + // detached when suspended, like the tenant role). + if platform.Spec.Attribution != nil { + if susp.RoleARN != "" { + sessionARN, err := r.ensureSessionRole(ctx, platform, susp.RoleARN, susp.Suspended, r.IAMCfg) + if err != nil { + logger.Error(err, "ensureSessionRole failed") + return ctrl.Result{}, err + } + if sessionARN != "" { + platform.Status.SessionRoleArn = sessionARN + } + } + if err := r.ensureOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "ensureOperatorImpersonateRBAC failed") + return ctrl.Result{}, err + } + } else if platform.Status.SessionRoleArn != "" { + // Attribution was enabled and is now removed — tear the pair down. + if err := r.deleteSessionRole(ctx, platform, r.IAMCfg.Environment); err != nil { + logger.Error(err, "deleteSessionRole (attribution removed) failed") + return ctrl.Result{}, err + } + if err := r.deleteOperatorImpersonateRBAC(ctx, platform); err != nil { + logger.Error(err, "deleteOperatorImpersonateRBAC (attribution removed) failed") + return ctrl.Result{}, err + } + platform.Status.SessionRoleArn = "" + } + if susp.Suspended { platform.Status.Phase = phaseSuspended if platform.Status.SuspendedAt == nil { diff --git a/operators/internal/controller/platform_iam.go b/operators/internal/controller/platform_iam.go index af48cdf..07c5ea0 100644 --- a/operators/internal/controller/platform_iam.go +++ b/operators/internal/controller/platform_iam.go @@ -317,12 +317,19 @@ func (r *PlatformReconciler) reconcileManagedPolicies(ctx context.Context, roleN } // deleteIamRole is the finalizer counterpart: detach all policies and -// delete the role. Tolerates NotFound so re-runs are safe. +// delete the tenant role. Tolerates NotFound so re-runs are safe. func (r *PlatformReconciler) deleteIamRole(ctx context.Context, p *platformv1alpha1.Platform, environment string) error { + return r.detachAndDeleteRole(ctx, tenantRoleName(environment, p)) +} + +// detachAndDeleteRole detaches every managed policy from a role and deletes +// it. Shared by the tenant-role and session-role finalizers. Tolerates +// NotFound at every step so re-runs (and roles that were never created) are +// safe no-ops. +func (r *PlatformReconciler) detachAndDeleteRole(ctx context.Context, name string) error { if r.IAM == nil { return nil } - name := tenantRoleName(environment, p) var marker *string for { listOut, err := r.IAM.ListAttachedRolePolicies(ctx, &iam.ListAttachedRolePoliciesInput{ diff --git a/operators/internal/controller/platform_iam_reconcile_test.go b/operators/internal/controller/platform_iam_reconcile_test.go index 1df8e71..5686ba2 100644 --- a/operators/internal/controller/platform_iam_reconcile_test.go +++ b/operators/internal/controller/platform_iam_reconcile_test.go @@ -40,11 +40,14 @@ type fakeIAM struct { roles map[string]*iamtypes.Role attached map[string]map[string]struct{} // roleName -> set of policy ARNs - listCalls int - attachCalls []iam.AttachRolePolicyInput - listReturnsErr error - attachReturnsErr map[string]error // policyARN -> err - pageBoundary int // if > 0, paginate ListAttached at this size + listCalls int + attachCalls []iam.AttachRolePolicyInput + createCalls []iam.CreateRoleInput + updateAssumeCalls []iam.UpdateAssumeRolePolicyInput + detachCalls []iam.DetachRolePolicyInput + listReturnsErr error + attachReturnsErr map[string]error // policyARN -> err + pageBoundary int // if > 0, paginate ListAttached at this size } func newFakeIAM() *fakeIAM { @@ -83,6 +86,7 @@ func (f *fakeIAM) attachmentsFor(roleName string) []string { } func (f *fakeIAM) CreateRole(_ context.Context, params *iam.CreateRoleInput, _ ...func(*iam.Options)) (*iam.CreateRoleOutput, error) { + f.createCalls = append(f.createCalls, *params) name := aws.ToString(params.RoleName) arn := "arn:aws:iam::123456789012:role/" + name f.seedRole(name, arn, params.Tags...) @@ -109,7 +113,8 @@ func (f *fakeIAM) TagRole(_ context.Context, _ *iam.TagRoleInput, _ ...func(*iam return &iam.TagRoleOutput{}, nil } -func (f *fakeIAM) UpdateAssumeRolePolicy(_ context.Context, _ *iam.UpdateAssumeRolePolicyInput, _ ...func(*iam.Options)) (*iam.UpdateAssumeRolePolicyOutput, error) { +func (f *fakeIAM) UpdateAssumeRolePolicy(_ context.Context, params *iam.UpdateAssumeRolePolicyInput, _ ...func(*iam.Options)) (*iam.UpdateAssumeRolePolicyOutput, error) { + f.updateAssumeCalls = append(f.updateAssumeCalls, *params) return &iam.UpdateAssumeRolePolicyOutput{}, nil } @@ -128,6 +133,7 @@ func (f *fakeIAM) AttachRolePolicy(_ context.Context, params *iam.AttachRolePoli } func (f *fakeIAM) DetachRolePolicy(_ context.Context, params *iam.DetachRolePolicyInput, _ ...func(*iam.Options)) (*iam.DetachRolePolicyOutput, error) { + f.detachCalls = append(f.detachCalls, *params) roleName := aws.ToString(params.RoleName) delete(f.attached[roleName], aws.ToString(params.PolicyArn)) return &iam.DetachRolePolicyOutput{}, nil diff --git a/operators/internal/controller/platform_rbac.go b/operators/internal/controller/platform_rbac.go new file mode 100644 index 0000000..0096edd --- /dev/null +++ b/operators/internal/controller/platform_rbac.go @@ -0,0 +1,95 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "fmt" + + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1" +) + +// impersonateResourceName is the cluster-scoped name shared by the ClusterRole +// and ClusterRoleBinding that let a Platform's tenant ServiceAccount +// impersonate its named operators. Keyed off the (already unique, length-safe) +// tenant namespace so two Platforms never collide on a cluster-global name. +func impersonateResourceName(p *platformv1alpha1.Platform) string { + return PlatformNamespace(p) + "-impersonate" +} + +// ensureOperatorImpersonateRBAC provisions the apiserver half of attribution: +// a ClusterRole granting `impersonate` on exactly the Platform's operator +// users, bound to the tenant-runtime ServiceAccount. fab's session kubeconfig +// authenticates with that SA's token while impersonating the operator, so the +// apiserver audit log records impersonatedUser=. +// +// Scoped to the named users only (never `impersonate *`), so the SA can act as +// the listed humans and no one else. Cluster-scoped resources can't be GC'd via +// an OwnerReference from the namespaced Platform, so cleanup runs through +// deleteOperatorImpersonateRBAC in the finalizer (same pattern as the tenant +// namespace). +func (r *PlatformReconciler) ensureOperatorImpersonateRBAC(ctx context.Context, p *platformv1alpha1.Platform) error { + if p.Spec.Attribution == nil { + return nil + } + name := impersonateResourceName(p) + operators := p.Spec.Attribution.Operators + + cr := &rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: name}} + if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, cr, func() error { + cr.Labels = labelsForPlatform(p) + cr.Rules = []rbacv1.PolicyRule{{ + APIGroups: []string{""}, + Resources: []string{"users"}, + Verbs: []string{"impersonate"}, + ResourceNames: operators, + }} + return nil + }); err != nil { + return fmt.Errorf("ensure impersonate ClusterRole %s: %w", name, err) + } + + crb := &rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: name}} + if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, crb, func() error { + crb.Labels = labelsForPlatform(p) + crb.RoleRef = rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "ClusterRole", + Name: name, + } + crb.Subjects = []rbacv1.Subject{{ + Kind: rbacv1.ServiceAccountKind, + Name: tenantSAName, + Namespace: PlatformNamespace(p), + }} + return nil + }); err != nil { + return fmt.Errorf("ensure impersonate ClusterRoleBinding %s: %w", name, err) + } + return nil +} + +// deleteOperatorImpersonateRBAC removes the impersonate ClusterRole + +// ClusterRoleBinding. Tolerates NotFound so non-attribution Platforms and +// re-runs are safe no-ops. +func (r *PlatformReconciler) deleteOperatorImpersonateRBAC(ctx context.Context, p *platformv1alpha1.Platform) error { + name := impersonateResourceName(p) + crb := &rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: name}} + if err := r.Delete(ctx, crb); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete impersonate ClusterRoleBinding %s: %w", name, err) + } + cr := &rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: name}} + if err := r.Delete(ctx, cr); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete impersonate ClusterRole %s: %w", name, err) + } + return nil +} diff --git a/operators/internal/controller/platform_rbac_test.go b/operators/internal/controller/platform_rbac_test.go new file mode 100644 index 0000000..a039364 --- /dev/null +++ b/operators/internal/controller/platform_rbac_test.go @@ -0,0 +1,123 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "testing" + + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func rbacTestClient(t *testing.T) client.Client { + t.Helper() + scheme := runtime.NewScheme() + if err := rbacv1.AddToScheme(scheme); err != nil { + t.Fatalf("add rbac scheme: %v", err) + } + return fake.NewClientBuilder().WithScheme(scheme).Build() +} + +func TestEnsureOperatorImpersonateRBAC(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com", "bob@acme.com"}, nil) + + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("ensureOperatorImpersonateRBAC: %v", err) + } + name := impersonateResourceName(p) + + var cr rbacv1.ClusterRole + if err := cl.Get(context.Background(), types.NamespacedName{Name: name}, &cr); err != nil { + t.Fatalf("get ClusterRole: %v", err) + } + if len(cr.Rules) != 1 { + t.Fatalf("rules: got %d want 1", len(cr.Rules)) + } + rule := cr.Rules[0] + if len(rule.Verbs) != 1 || rule.Verbs[0] != "impersonate" { + t.Errorf("verbs: got %v want [impersonate]", rule.Verbs) + } + if len(rule.Resources) != 1 || rule.Resources[0] != "users" { + t.Errorf("resources: got %v want [users]", rule.Resources) + } + wantOps := map[string]bool{"alice@acme.com": true, "bob@acme.com": true} + if len(rule.ResourceNames) != len(wantOps) { + t.Fatalf("resourceNames: got %v want %v", rule.ResourceNames, wantOps) + } + for _, op := range rule.ResourceNames { + if !wantOps[op] { + t.Errorf("unexpected resourceName %q (impersonation must be scoped to the named operators)", op) + } + } + + var crb rbacv1.ClusterRoleBinding + if err := cl.Get(context.Background(), types.NamespacedName{Name: name}, &crb); err != nil { + t.Fatalf("get ClusterRoleBinding: %v", err) + } + if crb.RoleRef.Name != name || crb.RoleRef.Kind != "ClusterRole" { + t.Errorf("roleRef: got %+v", crb.RoleRef) + } + if len(crb.Subjects) != 1 { + t.Fatalf("subjects: got %d want 1", len(crb.Subjects)) + } + sub := crb.Subjects[0] + if sub.Kind != "ServiceAccount" || sub.Name != tenantSAName || sub.Namespace != PlatformNamespace(p) { + t.Errorf("subject: got %+v want ServiceAccount %s/%s", sub, PlatformNamespace(p), tenantSAName) + } +} + +func TestEnsureOperatorImpersonateRBAC_UpdatesOperators(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("first ensure: %v", err) + } + + p.Spec.Attribution.Operators = []string{"carol@acme.com"} + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("second ensure: %v", err) + } + + var cr rbacv1.ClusterRole + if err := cl.Get(context.Background(), types.NamespacedName{Name: impersonateResourceName(p)}, &cr); err != nil { + t.Fatalf("get ClusterRole: %v", err) + } + got := cr.Rules[0].ResourceNames + if len(got) != 1 || got[0] != "carol@acme.com" { + t.Errorf("resourceNames after operator change: got %v want [carol@acme.com]", got) + } +} + +func TestDeleteOperatorImpersonateRBAC(t *testing.T) { + cl := rbacTestClient(t) + r := &PlatformReconciler{Client: cl} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + if err := r.ensureOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("ensure: %v", err) + } + + if err := r.deleteOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Fatalf("delete: %v", err) + } + name := impersonateResourceName(p) + var cr rbacv1.ClusterRole + if err := cl.Get(context.Background(), types.NamespacedName{Name: name}, &cr); !apierrors.IsNotFound(err) { + t.Errorf("ClusterRole should be gone: err=%v", err) + } + // Deleting again is a tolerated no-op. + if err := r.deleteOperatorImpersonateRBAC(context.Background(), p); err != nil { + t.Errorf("second delete should be a no-op: %v", err) + } +} diff --git a/operators/internal/controller/platform_session_iam.go b/operators/internal/controller/platform_session_iam.go new file mode 100644 index 0000000..78b1d78 --- /dev/null +++ b/operators/internal/controller/platform_session_iam.go @@ -0,0 +1,195 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/iam" + iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" + + platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1" +) + +// defaultSessionRoleMaxDuration is the assumed-session lifetime when a +// Platform's spec.attribution.sessionRoleMaxDurationSeconds is unset. Matches +// the STS role-chaining ceiling: the caller is the pod's own IRSA-assumed +// tenant role, so AWS caps the chained session at 3600s regardless. +const defaultSessionRoleMaxDuration int32 = 3600 + +// sessionRoleName returns the attribution session role minted for a Platform: +// +// --session +// +// Same 64-char cap + FNV-1a hash-truncation scheme as tenantRoleName, so the +// two role names never collide and both stay within IAM's role-name limit. +func sessionRoleName(env string, p *platformv1alpha1.Platform) string { + const suffix = "-session" + const maxLen = 64 + full := env + "-" + p.Name + suffix + if len(full) <= maxLen { + return full + } + prefix := env + "-" + budget := maxLen - len(prefix) - len(suffix) - 1 - 8 + h := fnv1a64(p.Name) + return fmt.Sprintf("%s%s-%08x%s", prefix, p.Name[:budget], h&0xffffffff, suffix) +} + +// sessionRoleTrustPolicy builds the trust policy for the attribution session +// role: only the tenant IRSA role may assume it, and only while setting an STS +// SourceIdentity drawn from the Platform's operator list. sts:SetSourceIdentity +// is granted alongside sts:AssumeRole so the caller can stamp the human, and +// the sts:SourceIdentity condition pins the allowed values so the caller can't +// assume the role under an arbitrary identity. +func sessionRoleTrustPolicy(tenantRoleARN string, operators []string) (string, error) { + stmt := map[string]any{ + "Effect": "Allow", + "Principal": map[string]any{"AWS": tenantRoleARN}, + "Action": []string{"sts:AssumeRole", "sts:SetSourceIdentity"}, + } + if len(operators) > 0 { + stmt["Condition"] = map[string]any{ + "StringEquals": map[string]any{"sts:SourceIdentity": operators}, + } + } + doc := map[string]any{ + "Version": "2012-10-17", + "Statement": []map[string]any{stmt}, + } + b, err := json.Marshal(doc) + if err != nil { + return "", fmt.Errorf("marshal session trust policy: %w", err) + } + return string(b), nil +} + +// sessionRoleTags mirrors tenantRoleTags but marks the role's Component as +// session-iam so cloudgov tagging + cost attribution tell the two roles apart. +func sessionRoleTags(p *platformv1alpha1.Platform, cfg IAMConfig) []iamtypes.Tag { + tags := tenantRoleTags(p, cfg) + for i := range tags { + if aws.ToString(tags[i].Key) == "Component" { + tags[i].Value = aws.String("session-iam") + } + } + return tags +} + +// sessionRoleMaxDuration reads the per-Platform cap, defaulting to 3600. +func sessionRoleMaxDuration(p *platformv1alpha1.Platform) int32 { + if p.Spec.Attribution != nil && p.Spec.Attribution.SessionRoleMaxDurationSeconds != nil { + return *p.Spec.Attribution.SessionRoleMaxDurationSeconds + } + return defaultSessionRoleMaxDuration +} + +// ensureSessionRole provisions (or reconciles) the attribution session role for +// a Platform with spec.attribution set, and returns its ARN. The role is +// assumable only by the tenant IRSA role, only while carrying one of the +// Platform's operators as STS SourceIdentity, and is limited to the tenant +// baseline policy (Bedrock invoke) — never broad sts:AssumeRole. +// +// When suspended (kill-switch), the baseline is DETACHED rather than attached: +// otherwise a suspended tenant could keep invoking Bedrock through the session +// role even after its own tenant role's baseline was pulled. +// +// Idempotent: refreshes the trust policy on every reconcile (the operator list +// can change) and converges the baseline attachment to the suspended state. +func (r *PlatformReconciler) ensureSessionRole( + ctx context.Context, + p *platformv1alpha1.Platform, + tenantRoleARN string, + suspended bool, + cfg IAMConfig, +) (string, error) { + if r.IAM == nil || p.Spec.Attribution == nil { + return "", nil + } + name := sessionRoleName(cfg.Environment, p) + trust, err := sessionRoleTrustPolicy(tenantRoleARN, p.Spec.Attribution.Operators) + if err != nil { + return "", err + } + + // Idempotency: GetRole first; if present, refresh trust + converge baseline. + getOut, getErr := r.IAM.GetRole(ctx, &iam.GetRoleInput{RoleName: aws.String(name)}) + if getErr == nil && getOut != nil && getOut.Role != nil { + arn := aws.ToString(getOut.Role.Arn) + if _, err := r.IAM.UpdateAssumeRolePolicy(ctx, &iam.UpdateAssumeRolePolicyInput{ + RoleName: aws.String(name), + PolicyDocument: aws.String(trust), + }); err != nil { + return arn, fmt.Errorf("iam UpdateAssumeRolePolicy %s: %w", name, err) + } + if err := r.reconcileSessionBaseline(ctx, name, cfg.TenantBaselinePolicyARN, suspended); err != nil { + return arn, err + } + return arn, nil + } + if !isIAMNotFound(getErr) { + return "", fmt.Errorf("iam GetRole %s: %w", name, getErr) + } + + path := cfg.TenantIAMPath + if path == "" { + path = "/eks-agent-platform/tenants/" + } + if !strings.HasSuffix(path, "/") { + path += "/" + } + createInput := &iam.CreateRoleInput{ + RoleName: aws.String(name), + Path: aws.String(path), + AssumeRolePolicyDocument: aws.String(trust), + Description: aws.String(fmt.Sprintf("Attribution session role for Platform %s (tenant %s)", p.Name, p.Spec.Tenant)), + MaxSessionDuration: aws.Int32(sessionRoleMaxDuration(p)), + Tags: sessionRoleTags(p, cfg), + } + if cfg.TenantPermissionsBoundaryARN != "" { + createInput.PermissionsBoundary = aws.String(cfg.TenantPermissionsBoundaryARN) + } + createOut, err := r.IAM.CreateRole(ctx, createInput) + if err != nil { + return "", fmt.Errorf("iam CreateRole %s: %w", name, err) + } + arn := aws.ToString(createOut.Role.Arn) + if err := r.reconcileSessionBaseline(ctx, name, cfg.TenantBaselinePolicyARN, suspended); err != nil { + return arn, err + } + return arn, nil +} + +// reconcileSessionBaseline converges the session role's baseline attachment: +// attached when running, detached when suspended (kill-switch parity with the +// tenant role). No-op when no baseline policy is configured (dev/test). +func (r *PlatformReconciler) reconcileSessionBaseline(ctx context.Context, roleName, baselineARN string, suspended bool) error { + if baselineARN == "" { + return nil + } + if suspended { + if _, err := r.IAM.DetachRolePolicy(ctx, &iam.DetachRolePolicyInput{ + RoleName: aws.String(roleName), + PolicyArn: aws.String(baselineARN), + }); err != nil && !isIAMNotFound(err) { + return fmt.Errorf("iam DetachRolePolicy %s (suspend session role): %w", baselineARN, err) + } + return nil + } + return r.reconcileManagedPolicies(ctx, roleName, baselineARN, nil) +} + +// deleteSessionRole is the finalizer counterpart: detach policies + delete the +// session role. Tolerates NotFound so non-attribution Platforms (which never +// had a session role) and re-runs are safe. +func (r *PlatformReconciler) deleteSessionRole(ctx context.Context, p *platformv1alpha1.Platform, environment string) error { + return r.detachAndDeleteRole(ctx, sessionRoleName(environment, p)) +} diff --git a/operators/internal/controller/platform_session_iam_test.go b/operators/internal/controller/platform_session_iam_test.go new file mode 100644 index 0000000..c253fe6 --- /dev/null +++ b/operators/internal/controller/platform_session_iam_test.go @@ -0,0 +1,166 @@ +/* +Copyright 2026 stxkxs. + +Licensed under the Apache License, Version 2.0 (the "License"); +*/ + +package controller + +import ( + "context" + "strings" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + + platformv1alpha1 "github.com/nanohype/eks-agent-platform/operators/api/platform/v1alpha1" +) + +// attributedPlatform builds a Platform with spec.attribution set. Shared with +// the RBAC tests (same package). +// +//nolint:unparam // test helper: name/tenant are fixed across cases by design +func attributedPlatform(name, tenant string, operators []string, maxDur *int32) *platformv1alpha1.Platform { + p := newPlatform(name, tenant) + p.Spec.Attribution = &platformv1alpha1.AttributionSpec{ + Operators: operators, + SessionRoleMaxDurationSeconds: maxDur, + } + return p +} + +func TestEnsureSessionRole_CreatesRoleWithTrustAndBaseline(t *testing.T) { + const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline" + const tenantARN = "arn:aws:iam::123456789012:role/production-acme-tenant" + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com", "bob@acme.com"}, nil) + + arn, err := r.ensureSessionRole(context.Background(), p, tenantARN, false, cfg) + if err != nil { + t.Fatalf("ensureSessionRole: %v", err) + } + if arn == "" { + t.Fatal("expected a session role ARN") + } + name := sessionRoleName(cfg.Environment, p) + if name != "production-acme-session" { + t.Fatalf("session role name: got %s want production-acme-session", name) + } + if len(f.createCalls) != 1 { + t.Fatalf("create calls: got %d want 1", len(f.createCalls)) + } + + // Trust: only the tenant role may assume, only while setting one of the + // operators as SourceIdentity, and NOT via web identity / broad assume. + trust := aws.ToString(f.createCalls[0].AssumeRolePolicyDocument) + for _, want := range []string{tenantARN, "sts:AssumeRole", "sts:SetSourceIdentity", "sts:SourceIdentity", "alice@acme.com", "bob@acme.com"} { + if !strings.Contains(trust, want) { + t.Errorf("trust policy missing %q:\n%s", want, trust) + } + } + if strings.Contains(trust, "AssumeRoleWithWebIdentity") { + t.Errorf("session role trust must not grant web-identity assume:\n%s", trust) + } + if got := aws.ToInt32(f.createCalls[0].MaxSessionDuration); got != 3600 { + t.Errorf("MaxSessionDuration: got %d want 3600", got) + } + if got := f.attachmentsFor(name); len(got) != 1 || got[0] != baseline { + t.Errorf("baseline attachment: got %v want [%s]", got, baseline) + } +} + +func TestEnsureSessionRole_CustomMaxDuration(t *testing.T) { + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + cfg := IAMConfig{Environment: "production"} + dur := int32(7200) + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, &dur) + + if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, cfg); err != nil { + t.Fatalf("ensureSessionRole: %v", err) + } + if got := aws.ToInt32(f.createCalls[0].MaxSessionDuration); got != 7200 { + t.Errorf("MaxSessionDuration: got %d want 7200", got) + } +} + +func TestEnsureSessionRole_IdempotentRefreshesTrust(t *testing.T) { + const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline" + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + name := sessionRoleName(cfg.Environment, p) + f.seedRole(name, "arn:aws:iam::123:role/"+name) + + if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, cfg); err != nil { + t.Fatalf("ensureSessionRole: %v", err) + } + if len(f.createCalls) != 0 { + t.Errorf("create calls: got %d want 0 (role already existed)", len(f.createCalls)) + } + if len(f.updateAssumeCalls) != 1 { + t.Fatalf("trust-refresh calls: got %d want 1", len(f.updateAssumeCalls)) + } + if !strings.Contains(aws.ToString(f.updateAssumeCalls[0].PolicyDocument), "alice@acme.com") { + t.Errorf("refreshed trust should carry the operator") + } + if got := f.attachmentsFor(name); len(got) != 1 || got[0] != baseline { + t.Errorf("baseline attachment: got %v", got) + } +} + +func TestEnsureSessionRole_SuspendedDetachesBaseline(t *testing.T) { + const baseline = "arn:aws:iam::aws:policy/EksAgentBaseline" + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + cfg := IAMConfig{TenantBaselinePolicyARN: baseline, Environment: "production"} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + name := sessionRoleName(cfg.Environment, p) + f.seedRole(name, "arn:aws:iam::123:role/"+name) + f.seedAttachment(name, baseline) + + if _, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", true, cfg); err != nil { + t.Fatalf("ensureSessionRole (suspended): %v", err) + } + if got := f.attachmentsFor(name); len(got) != 0 { + t.Errorf("baseline must be detached when suspended (kill-switch parity): got %v", got) + } +} + +func TestEnsureSessionRole_NilAttributionNoop(t *testing.T) { + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + p := newPlatform("acme", "protohype") // no attribution + + arn, err := r.ensureSessionRole(context.Background(), p, "arn:aws:iam::1:role/tenant", false, IAMConfig{Environment: "production"}) + if err != nil || arn != "" { + t.Fatalf("expected no-op; got arn=%q err=%v", arn, err) + } + if len(f.createCalls) != 0 { + t.Errorf("expected no create calls, got %d", len(f.createCalls)) + } +} + +func TestDeleteSessionRole(t *testing.T) { + f := newFakeIAM() + r := &PlatformReconciler{IAM: f} + cfg := IAMConfig{Environment: "production"} + p := attributedPlatform("acme", "protohype", []string{"alice@acme.com"}, nil) + name := sessionRoleName(cfg.Environment, p) + f.seedRole(name, "arn:aws:iam::123:role/"+name) + f.seedAttachment(name, "arn:aws:iam::aws:policy/EksAgentBaseline") + + if err := r.deleteSessionRole(context.Background(), p, cfg.Environment); err != nil { + t.Fatalf("deleteSessionRole: %v", err) + } + if _, ok := f.roles[name]; ok { + t.Errorf("session role should be deleted") + } + // Deleting a non-existent session role is a tolerated no-op. + if err := r.deleteSessionRole(context.Background(), p, cfg.Environment); err != nil { + t.Errorf("second delete should be a no-op: %v", err) + } +}