From ee36691ec0b2ae02261aabaf267678788f40e258 Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 2 May 2026 21:42:23 +0200 Subject: [PATCH 1/4] wrapper: fix stale runner.ontai.dev RunnerConfig watch GVK PackExecution reconciler was watching RunnerConfig under the old runner.ontai.dev/v1alpha1 group, which no longer exists after Phase 2B migrated all cross-operator CRDs to infrastructure.ontai.dev (Decision G). The missing CRD caused controller-runtime to fail the EventSource watch every 10s, blocking the PE informer cache from syncing and preventing pack-deploy Jobs from being submitted after PE creation. Updated to InfrastructureRunnerConfig under infrastructure.ontai.dev/v1alpha1. INV-010, Decision G. --- internal/controller/packexecution_reconciler.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/controller/packexecution_reconciler.go b/internal/controller/packexecution_reconciler.go index 8052d91..29663f4 100644 --- a/internal/controller/packexecution_reconciler.go +++ b/internal/controller/packexecution_reconciler.go @@ -1037,9 +1037,9 @@ func (r *PackExecutionReconciler) SetupWithManager(mgr ctrl.Manager) error { }) rcObj := &unstructured.Unstructured{} rcObj.SetGroupVersionKind(schema.GroupVersionKind{ - Group: "runner.ontai.dev", + Group: "infrastructure.ontai.dev", Version: "v1alpha1", - Kind: "RunnerConfig", + Kind: "InfrastructureRunnerConfig", }) return ctrl.NewControllerManagedBy(mgr). For(&seamv1alpha1.InfrastructurePackExecution{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). From 1249de5fb436952662841e4a4d56b6f61878ef54 Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 2 May 2026 21:50:37 +0200 Subject: [PATCH 2/4] wrapper: update CODEBASE.md for session/18 RunnerConfig GVK fix and PE ownerRef gap --- CODEBASE.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CODEBASE.md b/CODEBASE.md index 1171024..866425d 100644 --- a/CODEBASE.md +++ b/CODEBASE.md @@ -27,6 +27,8 @@ Wrapper has NO own CRD type definitions. `api/v1alpha1/` contains only `.gitkeep | 4 | L343 | RBACProfile | `isRBACProfileProvisioned()` L755 -- checks `provisioned=true` on the pack's RBACProfile | | 5 | L378 | WrapperRunnerRBAC | `isWrapperRunnerRBACReady()` L849 -- SubjectAccessReview verifies wrapper-runner SA has required permissions | +**RunnerConfig EventSource watch** (L1038): The PE reconciler watches `InfrastructureRunnerConfig` in group `infrastructure.ontai.dev/v1alpha1` to re-trigger gate 0 when RunnerConfig capabilities are populated. The GVK must use the Phase-2B-migrated group and kind -- using the legacy `runner.ontai.dev/v1alpha1/RunnerConfig` GVK causes the informer cache to fail to sync and wrapper pods enter CrashLoopBackOff. + `gateRequeueInterval = 30 * time.Second` (L61). Failing a gate sets `ConditionTypePackExecutionPending=True` with `ReasonGatesClearing` and returns `RequeueAfter: gateRequeueInterval`. `RBACReadyChecker` type at L101: `func(ctx, *InfrastructurePackExecution) (bool, string, error)`. Production uses `isWrapperRunnerRBACReady`; test stub set via `r.RBACChecker` field (L107). @@ -90,3 +92,11 @@ PackExecution name: `{packName}-{targetCluster}`. PackInstance name: `{basePackN |---------|----------| | `test/unit/controller` | PackExecutionReconciler (all 6 gates, POR revision selection), ClusterPackReconciler (deletion, rollback) | | `test/e2e` | Stub files; all skip when `MGMT_KUBECONFIG` absent; skip reasons reference backlog item IDs | + +--- + +## 8. Sharp Edges + +**RunnerConfig EventSource GVK must be Phase-2B-migrated**: The PE reconciler's `Owns()` / EventSource watch for RunnerConfig must use `infrastructure.ontai.dev/v1alpha1/InfrastructureRunnerConfig`. Using the pre-Phase-2B group `runner.ontai.dev/v1alpha1/RunnerConfig` causes the controller-runtime informer cache to fail to sync -- no match for that kind in the CRD registry. Symptom: wrapper pods log `no matches for kind "RunnerConfig" in version "runner.ontai.dev/v1alpha1"` every 10 seconds and crash every ~2 minutes. + +**PackExecution has no ownerReference to ClusterPack**: PEs are created by `ClusterPackReconciler` but carry no `ownerReference` pointing back to the ClusterPack. If a PE is externally deleted (e.g., by conductor's `DriftSignalHandler`), the ClusterPack controller is not notified and will not recreate the PE automatically. To force PE recreation, annotate the ClusterPack with `reconcile.infrastructure.ontai.dev/force` or delete and recreate the ClusterPack. This is a design gap -- consider adding an ownerRef or a PE-watching EventSource to ClusterPackReconciler. From d58699307bb681c792fe169e9709ec00d4b3273d Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 2 May 2026 21:57:03 +0200 Subject: [PATCH 3/4] wrapper: PE-watching EventSource on ClusterPackReconciler to retrigger delivery on external PE deletion --- internal/controller/clusterpack_reconciler.go | 46 +++++++++++- test/unit/clusterpack_reconciler_test.go | 73 +++++++++++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/internal/controller/clusterpack_reconciler.go b/internal/controller/clusterpack_reconciler.go index 40c334e..f5ddd0d 100644 --- a/internal/controller/clusterpack_reconciler.go +++ b/internal/controller/clusterpack_reconciler.go @@ -455,6 +455,40 @@ func (r *ClusterPackReconciler) handleClusterPackDeletion(ctx context.Context, c return ctrl.Result{}, nil } +// MapPackExecutionToClusterPack maps a PackExecution delete event back to the +// ClusterPack that owns it. It also deletes the corresponding PackInstance so that +// Step I of Reconcile (which skips PE creation when a PackInstance at the current +// version already exists) does not suppress PE recreation after an external retrigger +// (e.g. conductor DriftSignalHandler). Mirrors the inverse logic in +// MapPackInstanceToClusterPack, which deletes the PE when a PackInstance is deleted. +func (r *ClusterPackReconciler) MapPackExecutionToClusterPack( + ctx context.Context, + obj client.Object, +) []reconcile.Request { + pe, ok := obj.(*seamcorev1alpha1.InfrastructurePackExecution) + if !ok { + return nil + } + cpName := pe.Spec.ClusterPackRef.Name + if cpName == "" { + return nil + } + + // Delete the PackInstance with the same name so that the ClusterPackReconciler's + // version guard (Step I) does not skip PE recreation. The PE name and PackInstance + // name share the same convention: {cpName}-{clusterName}. The deleted PE's own name + // is therefore exactly the PackInstance name we need to remove. + ns := pe.GetNamespace() + pi := &seamcorev1alpha1.InfrastructurePackInstance{} + if getErr := r.Client.Get(ctx, client.ObjectKey{Name: pe.GetName(), Namespace: ns}, pi); getErr == nil { + _ = r.Client.Delete(ctx, pi) + } + + return []reconcile.Request{ + {NamespacedName: types.NamespacedName{Name: cpName, Namespace: ns}}, + } +} + // containsString reports whether slice contains s. func containsString(slice []string, s string) bool { for _, v := range slice { @@ -486,8 +520,12 @@ func removeString(slice []string, s string) []string { // Watches PackInstance with a delete-only predicate. When a PackInstance is // deleted, the reconciler is notified so it can check whether redelivery is // needed and create a fresh PackExecution. +// +// Watches PackExecution with a delete-only predicate. When a PackExecution is +// externally deleted (e.g. by conductor's DriftSignalHandler to retrigger +// delivery), the reconciler is notified so it can create a fresh PackExecution. func (r *ClusterPackReconciler) SetupWithManager(mgr ctrl.Manager) error { - packInstanceDeletePredicate := predicate.Funcs{ + deleteOnlyPredicate := predicate.Funcs{ CreateFunc: func(_ event.CreateEvent) bool { return false }, UpdateFunc: func(_ event.UpdateEvent) bool { return false }, DeleteFunc: func(_ event.DeleteEvent) bool { return true }, @@ -501,7 +539,11 @@ func (r *ClusterPackReconciler) SetupWithManager(mgr ctrl.Manager) error { Owns(&batchv1.Job{}). Watches(&seamcorev1alpha1.InfrastructurePackInstance{}, handler.EnqueueRequestsFromMapFunc(r.MapPackInstanceToClusterPack), - builder.WithPredicates(packInstanceDeletePredicate), + builder.WithPredicates(deleteOnlyPredicate), + ). + Watches(&seamcorev1alpha1.InfrastructurePackExecution{}, + handler.EnqueueRequestsFromMapFunc(r.MapPackExecutionToClusterPack), + builder.WithPredicates(deleteOnlyPredicate), ). Complete(r) } diff --git a/test/unit/clusterpack_reconciler_test.go b/test/unit/clusterpack_reconciler_test.go index d946bb0..ab05e6b 100644 --- a/test/unit/clusterpack_reconciler_test.go +++ b/test/unit/clusterpack_reconciler_test.go @@ -268,6 +268,79 @@ func TestClusterPackReconciler_DeletionCascadesDriftSignal(t *testing.T) { } } +// TestClusterPackReconciler_PackExecutionDeletedRecreatesPE verifies that when a +// PackExecution is externally deleted (drift retrigger), MapPackExecutionToClusterPack +// deletes the PackInstance so the version guard in Step I does not suppress PE +// recreation, and that the subsequent Reconcile creates a fresh PackExecution. +func TestClusterPackReconciler_PackExecutionDeletedRecreatesPE(t *testing.T) { + s := newClusterPackScheme(t) + + clusterName := "ccs-dev" + tenantNS := "seam-tenant-" + clusterName + cpName := "nginx-ccs-dev" + version := "v4.9.0" + peName := cpName + "-" + clusterName + + // ClusterPack: signed and available, targets ccs-dev. + cp := newClusterPack(cpName, tenantNS, version) + cp.Spec.TargetClusters = []string{clusterName} + cp.Status.Signed = true + cp.Status.PackSignature = "base64sig==" + cp.Annotations = map[string]string{ + "ontai.dev/pack-signature": "base64sig==", + "infrastructure.ontai.dev/spec-checksum-snapshot": cp.Spec.Checksum + "|" + + cp.Spec.RegistryRef.URL + "|" + cp.Spec.RegistryRef.Digest + "|" + cp.Spec.Version, + } + + // PackInstance exists at current version — without the fix, this would suppress PE recreation. + pi := &seamcorev1alpha1.InfrastructurePackInstance{ + ObjectMeta: metav1.ObjectMeta{Name: peName, Namespace: tenantNS}, + Spec: seamcorev1alpha1.InfrastructurePackInstanceSpec{Version: version, ClusterPackRef: cpName}, + } + + // PackExecution is absent — it was externally deleted by DriftSignalHandler. + fakeClient := fake.NewClientBuilder().WithScheme(s). + WithObjects(cp, pi). + WithStatusSubresource(&seamcorev1alpha1.InfrastructureClusterPack{}). + Build() + r := &controller.ClusterPackReconciler{ + Client: fakeClient, + Scheme: s, + Recorder: clientevents.NewFakeRecorder(10), + } + + // Simulate the PE deletion watch firing: call the mapper with the deleted PE object. + deletedPE := &seamcorev1alpha1.InfrastructurePackExecution{ + ObjectMeta: metav1.ObjectMeta{Name: peName, Namespace: tenantNS}, + Spec: seamcorev1alpha1.InfrastructurePackExecutionSpec{ + ClusterPackRef: seamcorev1alpha1.InfrastructureClusterPackRef{Name: cpName, Version: version}, + TargetClusterRef: clusterName, + }, + } + requests := r.MapPackExecutionToClusterPack(context.Background(), deletedPE) + + if len(requests) != 1 { + t.Fatalf("expected 1 reconcile request, got %d", len(requests)) + } + if requests[0].Name != cpName || requests[0].Namespace != tenantNS { + t.Errorf("expected request for %s/%s, got %v", tenantNS, cpName, requests[0]) + } + + // Mapper must have deleted the PackInstance. + remainingPI := &seamcorev1alpha1.InfrastructurePackInstance{} + if err := fakeClient.Get(context.Background(), client.ObjectKey{Name: peName, Namespace: tenantNS}, remainingPI); !apierrors.IsNotFound(err) { + t.Errorf("expected PackInstance to be deleted by mapper; got err=%v", err) + } + + // Reconcile: PackInstance gone, PE gone → new PE must be created. + reconcileCP(t, r, cp) + + newPE := &seamcorev1alpha1.InfrastructurePackExecution{} + if err := fakeClient.Get(context.Background(), client.ObjectKey{Name: peName, Namespace: tenantNS}, newPE); err != nil { + t.Errorf("expected new PackExecution to be created after PE deletion; got err=%v", err) + } +} + // TestClusterPackReconciler_RevokedNoRequeue verifies that a revoked ClusterPack // stops reconciliation without requeue. func TestClusterPackReconciler_RevokedNoRequeue(t *testing.T) { From 2d76b053bbc77e3f9e7500ec0e1763fc5c96aeb6 Mon Sep 17 00:00:00 2001 From: ontave Date: Mon, 4 May 2026 16:36:47 +0200 Subject: [PATCH 4/4] docs: remove CODEBASE.md -- graphify is now source of truth Governor directive (session/21): CODEBASE.md eliminated from all repos. The graphify knowledge graph at ~/ontai/graphify-out/graph.json is the sole authoritative source for codebase understanding. See root CONTEXT.md and CLAUDE.md for the Graphify Source of Truth Protocol. --- CODEBASE.md | 102 ---------------------------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 CODEBASE.md diff --git a/CODEBASE.md b/CODEBASE.md deleted file mode 100644 index 866425d..0000000 --- a/CODEBASE.md +++ /dev/null @@ -1,102 +0,0 @@ -# wrapper: Codebase Reference - -## 1. Purpose - -Wrapper is the pack delivery engine for the ONT platform. It manages the lifecycle of pre-compiled OCI artifact deliveries (`InfrastructureClusterPack`) to target clusters: enforcing 6 delivery gates (gates 0-5) before submitting a `pack-deploy` Kueue Job, tracking delivered state via `InfrastructurePackInstance`, and managing drift visibility via `InfrastructurePackReceipt`. Wrapper does NOT compile packs (conductor/compiler), sign packs (conductor agent on management cluster), own RBAC governance (guardian), or manage cluster lifecycle (platform). It does not apply Helm or Kustomize at runtime. - -Wrapper has NO own CRD type definitions. `api/v1alpha1/` contains only `.gitkeep`. All types consumed by wrapper (InfrastructureClusterPack, InfrastructurePackExecution, InfrastructurePackInstance, InfrastructurePackReceipt, PackOperationResult, DriftSignal) are defined in seam-core (Decision G). - ---- - -## 2. Key Files and Locations - -### Controllers (`internal/controller/`) - -#### `packexecution_reconciler.go` - -`PackExecutionReconciler` (L74 comment block, `Reconcile()` L121). Manages the 6-gate delivery pipeline. - -**Gate check flow** (all gates at L175-417): - -| Gate | Line | Condition | Blocks on | -|------|------|-----------|-----------| -| 0 | L176 | ConductorReady | `isConductorReadyForCluster()` L799 -- checks RunnerConfig in `ont-system` has `status.capabilities` non-empty | -| 1 | L221 | Signature | `ClusterPack.status.Signed=true` | -| 2 | L289 | Revocation | ClusterPack conditions Revoked != True | -| 3 | L306 | PermissionSnapshot | `isPermissionSnapshotCurrent()` L716 -- reads PermissionSnapshot via unstructured (no cross-operator type import) | -| 4 | L343 | RBACProfile | `isRBACProfileProvisioned()` L755 -- checks `provisioned=true` on the pack's RBACProfile | -| 5 | L378 | WrapperRunnerRBAC | `isWrapperRunnerRBACReady()` L849 -- SubjectAccessReview verifies wrapper-runner SA has required permissions | - -**RunnerConfig EventSource watch** (L1038): The PE reconciler watches `InfrastructureRunnerConfig` in group `infrastructure.ontai.dev/v1alpha1` to re-trigger gate 0 when RunnerConfig capabilities are populated. The GVK must use the Phase-2B-migrated group and kind -- using the legacy `runner.ontai.dev/v1alpha1/RunnerConfig` GVK causes the informer cache to fail to sync and wrapper pods enter CrashLoopBackOff. - -`gateRequeueInterval = 30 * time.Second` (L61). Failing a gate sets `ConditionTypePackExecutionPending=True` with `ReasonGatesClearing` and returns `RequeueAfter: gateRequeueInterval`. - -`RBACReadyChecker` type at L101: `func(ctx, *InfrastructurePackExecution) (bool, string, error)`. Production uses `isWrapperRunnerRBACReady`; test stub set via `r.RBACChecker` field (L107). - -`findLatestPOR()` at L1162: lists all PackOperationResult CRs in namespace labeled with `packExecutionRef`, returns the one with highest `Spec.Revision`. Called at L466 to check completion status. - -#### `clusterpack_reconciler.go` - -`ClusterPackReconciler.Reconcile()` L67. Called on ClusterPack create/update. - -`handleClusterPackDeletion()` L393: three steps + step 2.5: -1. L396: List all PackInstances cluster-wide, delete those where `spec.clusterPackRef == cp.Name`. -2. L415: List all PackExecutions cluster-wide, delete those where `spec.clusterPackRef.name == cp.Name`. -3. Step 2.5 (L434): Delete DriftSignal named `drift-{cp.Name}` in `seam-tenant-{clusterName}` for each target cluster. -4. L449: Remove finalizer `clusterPackFinalizer` so API server can delete the ClusterPack object. - -`handleRollback()` L306: SSA-patches ClusterPack spec back to a previous version. Normal reconcile then creates PackExecution for the rolled-back version. - -PackExecution creation (L230): for each cluster in `spec.targetClusters`, creates one PackExecution in `seam-tenant-{cluster}`. Skips if PackInstance with current version already exists (L243). Skips if PackExecution already exists (L258). - ---- - -## 3. Primary Data Flows - -**Pack deploy path**: ClusterPack created --> `ClusterPackReconciler` creates PackExecution in `seam-tenant-{cluster}` --> `PackExecutionReconciler` runs 6-gate check --> all gates pass --> Kueue Job (`pack-deploy`, `conductor-execute:dev` image) submitted --> conductor execute-mode `executeSplitPath()` applies RBAC + cluster-scoped + workload OCI layers --> writes PackOperationResult --> `PackExecutionReconciler` reads POR via `findLatestPOR()` L1162 --> creates PackInstance on management cluster. - -**ClusterPack deletion path**: Finalizer prevents deletion --> `handleClusterPackDeletion()` L393 runs 3 steps (PackInstances, PackExecutions, DriftSignals) --> removes finalizer --> API server deletes ClusterPack object. Conductor `teardownOrphanedReceipt()` then cleans up deployed resources on the tenant cluster. - -**Pack rollback**: `spec.rollbackToRevision` set on ClusterPack --> `handleRollback()` L306 patches spec --> `clearRollbackField()` L378 clears the field --> normal reconcile creates new PackExecution for rolled-back version. - -**Single-active-revision (POR)**: `conductor/internal/persistence/operationresult_writer.go` writes POR with `Revision` incremented. Predecessor labeled `ontai.dev/superseded=true`, retained max 10. `findLatestPOR()` L1162 selects highest revision. - ---- - -## 4. PackExecution naming and supersession - -PackExecution name: `{packName}-{targetCluster}`. PackInstance name: `{basePackName}-{targetCluster}`. Same base name enables supersession: when a newer ClusterPack version arrives, the existing PackInstance is replaced in-place (same name, new content) rather than creating a new object. This is the upgrade path. - ---- - -## 5. Invariants - -| ID | Rule | Location | -|----|------|----------| -| CP-INV-010 | Kueue is not used for any operation in platform. Pack-deploy Jobs are the only Kueue Jobs in wrapper. | `packexecution_reconciler.go` | -| Decision G | Wrapper has no own CRD type definitions | `api/v1alpha1/.gitkeep` | - ---- - -## 6. Open Items - -**PLATFORM-BL-WRAPPER-RUNNER-RBAC-LIFECYCLE (platform)**: `ensureWrapperRunnerResources()` in `platform/internal/controller/taloscluster_helpers.go` creates wrapper-runner SA/Role/RoleBinding/ClusterRoleBinding at tenant onboarding. `handleTalosClusterDeletion()` does NOT delete `ClusterRoleBinding wrapper-runner-{cluster}`. This is a platform open item, not a wrapper open item. - -**CLUSTERPACK-BL-VERSION-CLEANUP (conductor)**: `DeployedResources` field exists in `InfrastructurePackReceiptSpec` at `seam-core/api/v1alpha1/packreceipt_types.go:74`. When PackInstance version N+1 replaces N, resources present in N's PackReceipt but absent from N+1's manifests are NOT cleaned up. Version-upgrade orphan diff is absent from `conductor/internal/agent/packinstance_pull_loop.go`. No schema addition needed; only implementation missing. - ---- - -## 7. Test Contract - -| Package | Coverage | -|---------|----------| -| `test/unit/controller` | PackExecutionReconciler (all 6 gates, POR revision selection), ClusterPackReconciler (deletion, rollback) | -| `test/e2e` | Stub files; all skip when `MGMT_KUBECONFIG` absent; skip reasons reference backlog item IDs | - ---- - -## 8. Sharp Edges - -**RunnerConfig EventSource GVK must be Phase-2B-migrated**: The PE reconciler's `Owns()` / EventSource watch for RunnerConfig must use `infrastructure.ontai.dev/v1alpha1/InfrastructureRunnerConfig`. Using the pre-Phase-2B group `runner.ontai.dev/v1alpha1/RunnerConfig` causes the controller-runtime informer cache to fail to sync -- no match for that kind in the CRD registry. Symptom: wrapper pods log `no matches for kind "RunnerConfig" in version "runner.ontai.dev/v1alpha1"` every 10 seconds and crash every ~2 minutes. - -**PackExecution has no ownerReference to ClusterPack**: PEs are created by `ClusterPackReconciler` but carry no `ownerReference` pointing back to the ClusterPack. If a PE is externally deleted (e.g., by conductor's `DriftSignalHandler`), the ClusterPack controller is not notified and will not recreate the PE automatically. To force PE recreation, annotate the ClusterPack with `reconcile.infrastructure.ontai.dev/force` or delete and recreate the ClusterPack. This is a design gap -- consider adding an ownerRef or a PE-watching EventSource to ClusterPackReconciler.