From 5881de1a7f6ffdc3e1ad74540969e26367242821 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 10:08:23 +0200 Subject: [PATCH 01/15] feat(federation): implement ADR-F6 stream connection pool (RECON-F6) Adds semaphore-bounded concurrent stream limit (D1), token-bucket admission rate limiter (D2), and Prometheus metrics (D4) to FederationServer. Env vars FEDERATION_MAX_CONCURRENT_STREAMS (default 50, range 1-1000) and FEDERATION_ADMISSION_RATE (default 5, must be >0) configure the pool at startup. ActiveStreamCount() exposes the current gauge value for health checks. Five unit tests cover: semaphore rejection at limit, concurrent admission up to limit, activeCount increment/decrement on connect/disconnect, and both env-var parser edge cases. --- go.mod | 4 +- internal/federation/metrics.go | 29 +++ internal/federation/server.go | 109 ++++++++++- internal/federation/server_pool_test.go | 181 ++++++++++++++++++ internal/kernel/agent.go | 6 +- .../federation/stream_integration_test.go | 8 +- .../unit/federation/federation_stream_test.go | 2 +- test/unit/federation/federation_tls_test.go | 8 +- 8 files changed, 327 insertions(+), 20 deletions(-) create mode 100644 internal/federation/metrics.go create mode 100644 internal/federation/server_pool_test.go diff --git a/go.mod b/go.mod index e88fae0..e766047 100644 --- a/go.mod +++ b/go.mod @@ -18,13 +18,14 @@ require ( github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 github.com/ontai-dev/conductor-sdk v0.0.0-00010101000000-000000000000 + github.com/ontai-dev/dispatcher v0.0.0-00010101000000-000000000000 github.com/ontai-dev/guardian v0.0.0-00010101000000-000000000000 github.com/ontai-dev/platform v0.0.0-00010101000000-000000000000 - github.com/ontai-dev/dispatcher v0.0.0-00010101000000-000000000000 github.com/ontai-dev/seam v0.0.0-00010101000000-000000000000 github.com/ontai-dev/seam-sdk v0.0.0-00010101000000-000000000000 github.com/prometheus/client_golang v1.23.2 
github.com/siderolabs/talos/pkg/machinery v1.12.6 + golang.org/x/time v0.14.0 google.golang.org/grpc v1.79.3 gopkg.in/yaml.v3 v3.0.1 helm.sh/helm/v3 v3.17.3 @@ -158,7 +159,6 @@ require ( golang.org/x/sys v0.41.0 // indirect golang.org/x/term v0.40.0 // indirect golang.org/x/text v0.34.0 // indirect - golang.org/x/time v0.14.0 // indirect golang.org/x/tools v0.41.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect diff --git a/internal/federation/metrics.go b/internal/federation/metrics.go new file mode 100644 index 0000000..33e11c2 --- /dev/null +++ b/internal/federation/metrics.go @@ -0,0 +1,29 @@ +package federation + +import ( + "github.com/prometheus/client_golang/prometheus" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // metricActiveStreams tracks the current number of live streams accepted by FederationServer. + // ADR-F6 D4. + metricActiveStreams = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "conductor_federation_stream_active_count", + Help: "Current number of live streams accepted by FederationServer.", + }) + + // metricReconnectsTotal counts reconnect events observed per tenant cluster. + // ADR-F6 D4. 
+ metricReconnectsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "conductor_federation_stream_reconnects_total", + Help: "Total number of stream reconnect events observed, labeled by cluster ID.", + }, + []string{"cluster_id"}, + ) +) + +func init() { + ctrlmetrics.Registry.MustRegister(metricActiveStreams, metricReconnectsTotal) +} diff --git a/internal/federation/server.go b/internal/federation/server.go index 6fa4daf..8f78ecb 100644 --- a/internal/federation/server.go +++ b/internal/federation/server.go @@ -7,8 +7,10 @@ import ( "fmt" "net" "sync" + "sync/atomic" "time" + "golang.org/x/time/rate" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" @@ -47,6 +49,19 @@ type clusterStatus struct { missedHeartbeats int } +// FederationServerOptions configures the stream admission limits for FederationServer. +// Zero values disable the corresponding limit. ADR-F6. +type FederationServerOptions struct { + // MaxConcurrentStreams is the maximum number of simultaneous active streams. + // When reached, new connections receive codes.ResourceExhausted. + // Must be in [1, 1000]; 0 means unlimited (no semaphore). Default: 50 via env. + MaxConcurrentStreams int + + // AdmissionRate is the token-bucket refill rate in tokens per second. + // Burst capacity is 2x this value. 0 means unlimited. Default: 5 via env. + AdmissionRate int +} + // FederationServer is the management-side federation gRPC server. // It listens on the federation port with mutual TLS, extracts cluster IDs // from client certificate SANs, and maintains the bidirectional stream with @@ -63,31 +78,82 @@ type FederationServer struct { mu sync.RWMutex // connectedClusters maps clusterID → stream status for heartbeat tracking. connectedClusters map[string]*clusterStatus + + // semaphore limits concurrent active streams. nil = unlimited. ADR-F6 D1. + semaphore chan struct{} + + // admissionLimiter rate-limits new stream accepts. nil = unlimited. 
ADR-F6 D2. + admissionLimiter *rate.Limiter + + // activeCount is the live stream count, kept in sync with the semaphore. ADR-F6 D4. + activeCount atomic.Int64 } // NewFederationServer constructs a FederationServer from certificate paths. // The server does not start until Start is called. // conductor-schema.md §18. -func NewFederationServer(caCertPath, serverCertPath, serverKeyPath string, kubeClient kubernetes.Interface) (*FederationServer, error) { +func NewFederationServer(caCertPath, serverCertPath, serverKeyPath string, kubeClient kubernetes.Interface, opts FederationServerOptions) (*FederationServer, error) { tlsCfg, err := BuildServerTLSConfig(caCertPath, serverCertPath, serverKeyPath) if err != nil { return nil, fmt.Errorf("federation server TLS config: %w", err) } - return &FederationServer{ - tlsCfg: tlsCfg, - kubeClient: kubeClient, - connectedClusters: make(map[string]*clusterStatus), - }, nil + return newFederationServer(tlsCfg, kubeClient, opts), nil } // NewFederationServerFromTLS constructs a FederationServer from an already-built // tls.Config. Used in tests to inject a test TLS config directly. 
-func NewFederationServerFromTLS(tlsCfg *tls.Config, kubeClient kubernetes.Interface) *FederationServer { - return &FederationServer{ +func NewFederationServerFromTLS(tlsCfg *tls.Config, kubeClient kubernetes.Interface, opts FederationServerOptions) *FederationServer { + return newFederationServer(tlsCfg, kubeClient, opts) +} + +func newFederationServer(tlsCfg *tls.Config, kubeClient kubernetes.Interface, opts FederationServerOptions) *FederationServer { + s := &FederationServer{ tlsCfg: tlsCfg, kubeClient: kubeClient, connectedClusters: make(map[string]*clusterStatus), } + if opts.MaxConcurrentStreams > 0 { + s.semaphore = make(chan struct{}, opts.MaxConcurrentStreams) + } + if opts.AdmissionRate > 0 { + s.admissionLimiter = rate.NewLimiter(rate.Limit(opts.AdmissionRate), 2*opts.AdmissionRate) + } + return s +} + +// ActiveStreamCount returns the number of currently active streams. ADR-F6 D4. +func (s *FederationServer) ActiveStreamCount() int64 { + return s.activeCount.Load() +} + +// ParseFederationMaxStreams parses FEDERATION_MAX_CONCURRENT_STREAMS env value. +// Valid range: [1, 1000]. Returns 50 (default) if empty, unparsable, or out of range. +// ADR-F6 D1. +func ParseFederationMaxStreams(v string) int { + if v == "" { + return 50 + } + var n int + if _, err := fmt.Sscanf(v, "%d", &n); err != nil || n <= 0 || n > 1000 { + fmt.Printf("federation server: invalid FEDERATION_MAX_CONCURRENT_STREAMS %q (must be 1-1000) — using default 50\n", v) + return 50 + } + return n +} + +// ParseFederationAdmissionRate parses FEDERATION_ADMISSION_RATE env value. +// Returns 5 (default) if empty, unparsable, or not positive; it never returns 0. +// ADR-F6 D2. 
+func ParseFederationAdmissionRate(v string) int { + if v == "" { + return 5 + } + var n int + if _, err := fmt.Sscanf(v, "%d", &n); err != nil || n <= 0 { + fmt.Printf("federation server: invalid FEDERATION_ADMISSION_RATE %q (must be >0) — using default 5\n", v) + return 5 + } + return n } // Start begins listening on addr and serves the federation gRPC stream with mutual TLS. @@ -138,12 +204,39 @@ func (s *FederationServer) ConnectedClusterIDs() []string { // federationStream handles a single bidirectional stream from a connected tenant. // It implements the grpc.ServerStream interface handler for the FederationService/Stream method. func (s *FederationServer) federationStream(stream grpc.ServerStream) error { + // D2: admission rate-limit check before semaphore acquisition. ADR-F6. + if s.admissionLimiter != nil && !s.admissionLimiter.Allow() { + return status.Errorf(codes.ResourceExhausted, "federation server: admission rate limit exceeded") + } + + // D1: semaphore -- reject when max concurrent streams reached. ADR-F6. + if s.semaphore != nil { + select { + case s.semaphore <- struct{}{}: + // slot acquired + default: + return status.Errorf(codes.ResourceExhausted, "federation server: max concurrent stream limit reached") + } + defer func() { <-s.semaphore }() + } + + // Track active count and update the Prometheus gauge. ADR-F6 D4. + s.activeCount.Add(1) + metricActiveStreams.Inc() + defer func() { + s.activeCount.Add(-1) + metricActiveStreams.Dec() + }() + // Extract cluster ID from the peer TLS certificate SAN. clusterID, err := s.clusterIDFromStream(stream) if err != nil { return status.Errorf(codes.Unauthenticated, "cluster ID extraction: %v", err) } + // Count this as a reconnect event (every stream accept = one connection). ADR-F6 D4. + metricReconnectsTotal.WithLabelValues(clusterID).Inc() + // Register this cluster as connected. 
cs := &clusterStatus{} s.mu.Lock() diff --git a/internal/federation/server_pool_test.go b/internal/federation/server_pool_test.go new file mode 100644 index 0000000..5cb56a8 --- /dev/null +++ b/internal/federation/server_pool_test.go @@ -0,0 +1,181 @@ +package federation + +import ( + "context" + "sync" + "testing" + "time" +) + +// blockingStream is a minimal grpc.ServerStream that blocks RecvMsg until +// released via the done channel. Used to hold a semaphore slot open. +type blockingStream struct { + ctx context.Context + done chan struct{} +} + +func newBlockingStream(ctx context.Context) *blockingStream { + return &blockingStream{ctx: ctx, done: make(chan struct{})} +} + +func (b *blockingStream) Context() context.Context { return b.ctx } +func (b *blockingStream) RecvMsg(m any) error { + select { + case <-b.done: + return nil + case <-b.ctx.Done(): + return b.ctx.Err() + } +} +func (b *blockingStream) SendMsg(m any) error { return nil } +func (b *blockingStream) SetHeader(md any) error { return nil } +func (b *blockingStream) SendHeader(md any) error { return nil } +func (b *blockingStream) SetTrailer(md any) {} + +// acquireSlot tests the semaphore directly, bypassing TLS cert extraction. +// Returns true if the slot was acquired (semaphore not full), false otherwise. +func acquireSlot(s *FederationServer) (release func(), ok bool) { + if s.semaphore == nil { + return func() {}, true + } + select { + case s.semaphore <- struct{}{}: + return func() { <-s.semaphore }, true + default: + return nil, false + } +} + +// TestFederationServer_RejectsWhenLimitReached verifies that a server with +// limit=2 rejects the third concurrent connection with RESOURCE_EXHAUSTED. +// ADR-F6 D1. +func TestFederationServer_RejectsWhenLimitReached(t *testing.T) { + opts := FederationServerOptions{MaxConcurrentStreams: 2} + s := newFederationServer(nil, nil, opts) + + // Acquire both slots. 
+ rel1, ok1 := acquireSlot(s) + if !ok1 { + t.Fatal("expected slot 1 to be acquired") + } + defer rel1() + rel2, ok2 := acquireSlot(s) + if !ok2 { + t.Fatal("expected slot 2 to be acquired") + } + defer rel2() + + // Third attempt must be rejected. + _, ok3 := acquireSlot(s) + if ok3 { + t.Error("expected slot 3 to be rejected (limit=2 reached)") + } +} + +// TestFederationServer_AdmitsUpToLimit verifies that a server with limit=2 +// admits exactly two concurrent streams and both are recorded as active. +// ADR-F6 D1. +func TestFederationServer_AdmitsUpToLimit(t *testing.T) { + opts := FederationServerOptions{MaxConcurrentStreams: 2} + s := newFederationServer(nil, nil, opts) + + var mu sync.Mutex + admitted := 0 + + var wg sync.WaitGroup + for i := 0; i < 2; i++ { + wg.Add(1) + go func() { + defer wg.Done() + rel, ok := acquireSlot(s) + if !ok { + return + } + defer rel() + mu.Lock() + admitted++ + mu.Unlock() + // Hold the slot briefly. + time.Sleep(10 * time.Millisecond) + }() + } + wg.Wait() + + if admitted != 2 { + t.Errorf("expected 2 admitted streams, got %d", admitted) + } +} + +// TestActiveStreamCount_DecreasesOnDisconnect verifies that ActiveStreamCount +// increments when a slot is acquired and decrements when it is released. +// ADR-F6 D4. +func TestActiveStreamCount_DecreasesOnDisconnect(t *testing.T) { + opts := FederationServerOptions{MaxConcurrentStreams: 5} + s := newFederationServer(nil, nil, opts) + + if n := s.ActiveStreamCount(); n != 0 { + t.Fatalf("expected ActiveStreamCount=0 before any stream, got %d", n) + } + + // Simulate what federationStream does: acquire semaphore + track activeCount. + rel, ok := acquireSlot(s) + if !ok { + t.Fatal("expected slot to be acquired") + } + s.activeCount.Add(1) + metricActiveStreams.Inc() + + if n := s.ActiveStreamCount(); n != 1 { + t.Errorf("expected ActiveStreamCount=1 after connect, got %d", n) + } + + // Simulate disconnect. 
+ s.activeCount.Add(-1) + metricActiveStreams.Dec() + rel() + + if n := s.ActiveStreamCount(); n != 0 { + t.Errorf("expected ActiveStreamCount=0 after disconnect, got %d", n) + } +} + +// TestParseFederationMaxStreams verifies the env var parser. ADR-F6 D1. +func TestParseFederationMaxStreams(t *testing.T) { + cases := []struct { + input string + want int + }{ + {"", 50}, + {"10", 10}, + {"1000", 1000}, + {"0", 50}, // out of range: default + {"1001", 50}, // out of range: default + {"bad", 50}, // invalid: default + {"-5", 50}, // negative: default + } + for _, tc := range cases { + if got := ParseFederationMaxStreams(tc.input); got != tc.want { + t.Errorf("ParseFederationMaxStreams(%q) = %d, want %d", tc.input, got, tc.want) + } + } +} + +// TestParseFederationAdmissionRate verifies the env var parser. ADR-F6 D2. +func TestParseFederationAdmissionRate(t *testing.T) { + cases := []struct { + input string + want int + }{ + {"", 5}, + {"10", 10}, + {"1", 1}, + {"0", 5}, // zero invalid: default + {"-1", 5}, // negative: default + {"bad", 5}, // invalid: default + } + for _, tc := range cases { + if got := ParseFederationAdmissionRate(tc.input); got != tc.want { + t.Errorf("ParseFederationAdmissionRate(%q) = %d, want %d", tc.input, got, tc.want) + } + } +} diff --git a/internal/kernel/agent.go b/internal/kernel/agent.go index 19d766a..b52f655 100644 --- a/internal/kernel/agent.go +++ b/internal/kernel/agent.go @@ -354,7 +354,11 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub if fedCACertPath != "" && fedServerCertPath != "" && fedServerKeyPath != "" { // Management Conductor: start the federation server. 
- fedServer, fedErr := federation.NewFederationServer(fedCACertPath, fedServerCertPath, fedServerKeyPath, nil) + fedOpts := federation.FederationServerOptions{ + MaxConcurrentStreams: federation.ParseFederationMaxStreams(os.Getenv("FEDERATION_MAX_CONCURRENT_STREAMS")), + AdmissionRate: federation.ParseFederationAdmissionRate(os.Getenv("FEDERATION_ADMISSION_RATE")), + } + fedServer, fedErr := federation.NewFederationServer(fedCACertPath, fedServerCertPath, fedServerKeyPath, nil, fedOpts) if fedErr != nil { return fmt.Errorf("conductor agent: build federation server: %w", fedErr) } diff --git a/test/integration/federation/stream_integration_test.go b/test/integration/federation/stream_integration_test.go index cb12e8b..4ffa890 100644 --- a/test/integration/federation/stream_integration_test.go +++ b/test/integration/federation/stream_integration_test.go @@ -209,7 +209,7 @@ func TestStream_HeartBeat_ServerRespondsWithACK(t *testing.T) { if err != nil { t.Fatalf("server TLS: %v", err) } - srv := federation.NewFederationServerFromTLS(serverTLS, nil) + srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{}) addr, _ := startStreamServer(t, srv) clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath) @@ -264,7 +264,7 @@ func TestStream_AuditEventBatch_ServerRespondsWithAck(t *testing.T) { t.Fatalf("server TLS: %v", err) } // kubeClient is nil — server skips ConfigMap creation but still ACKs. 
- srv := federation.NewFederationServerFromTLS(serverTLS, nil) + srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{}) addr, _ := startStreamServer(t, srv) clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath) @@ -327,7 +327,7 @@ func TestStream_ClusterID_ExtractedFromClientCert(t *testing.T) { if err != nil { t.Fatalf("server TLS: %v", err) } - srv := federation.NewFederationServerFromTLS(serverTLS, nil) + srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{}) addr, _ := startStreamServer(t, srv) clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath) @@ -374,7 +374,7 @@ func TestStream_WALReplay_OnReconnect(t *testing.T) { if err != nil { t.Fatalf("server TLS: %v", err) } - srv := federation.NewFederationServerFromTLS(serverTLS, nil) + srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{}) addr, _ := startStreamServer(t, srv) // Pre-populate WAL with 3 entries; ACK sequence 1. diff --git a/test/unit/federation/federation_stream_test.go b/test/unit/federation/federation_stream_test.go index 6ba1aec..2eac4f6 100644 --- a/test/unit/federation/federation_stream_test.go +++ b/test/unit/federation/federation_stream_test.go @@ -38,7 +38,7 @@ func setupStreamTest(t *testing.T) *streamTestEnv { serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) // Use a fake kubeClient in tests (nil — server skips ConfigMap creation). 
- srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } diff --git a/test/unit/federation/federation_tls_test.go b/test/unit/federation/federation_tls_test.go index 03d347f..2def2ca 100644 --- a/test/unit/federation/federation_tls_test.go +++ b/test/unit/federation/federation_tls_test.go @@ -286,7 +286,7 @@ func TestFederationServer_gRPC_AcceptsValidCert(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -334,7 +334,7 @@ func TestFederationServer_gRPC_RejectsNoCert(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -389,7 +389,7 @@ func TestFederationServer_gRPC_RejectsWrongCA(t *testing.T) { serverCertPEM, serverKeyPEM := serverCA.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, serverCA.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, 
serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -460,7 +460,7 @@ func TestFederationClient_ClusterIDExtraction(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } From 9870e9763f504b75258d82b3f32ff7eab8246366 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 10:34:59 +0200 Subject: [PATCH 02/15] fix(federation): remove unused blockingStream mock (lint unused) --- internal/federation/server_pool_test.go | 26 ------------------------- 1 file changed, 26 deletions(-) diff --git a/internal/federation/server_pool_test.go b/internal/federation/server_pool_test.go index 5cb56a8..8816c6a 100644 --- a/internal/federation/server_pool_test.go +++ b/internal/federation/server_pool_test.go @@ -1,37 +1,11 @@ package federation import ( - "context" "sync" "testing" "time" ) -// blockingStream is a minimal grpc.ServerStream that blocks RecvMsg until -// released via the done channel. Used to hold a semaphore slot open. 
-type blockingStream struct { - ctx context.Context - done chan struct{} -} - -func newBlockingStream(ctx context.Context) *blockingStream { - return &blockingStream{ctx: ctx, done: make(chan struct{})} -} - -func (b *blockingStream) Context() context.Context { return b.ctx } -func (b *blockingStream) RecvMsg(m any) error { - select { - case <-b.done: - return nil - case <-b.ctx.Done(): - return b.ctx.Err() - } -} -func (b *blockingStream) SendMsg(m any) error { return nil } -func (b *blockingStream) SetHeader(md any) error { return nil } -func (b *blockingStream) SendHeader(md any) error { return nil } -func (b *blockingStream) SetTrailer(md any) {} - // acquireSlot tests the semaphore directly, bypassing TLS cert extraction. // Returns true if the slot was acquired (semaphore not full), false otherwise. func acquireSlot(s *FederationServer) (release func(), ok bool) { From da977c0ea05ca771639a87872caa2c8ff44d50a9 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 10:51:50 +0200 Subject: [PATCH 03/15] feat(conductor): RECON-CMN1 PackSourceVersionLoop -- upstream Helm chart version drift detection --- internal/agent/pack_source_version_loop.go | 291 +++++++++++++++ .../agent/pack_source_version_loop_test.go | 330 ++++++++++++++++++ internal/kernel/agent.go | 22 +- test/e2e/pack_source_version_loop_test.go | 36 ++ 4 files changed, 678 insertions(+), 1 deletion(-) create mode 100644 internal/agent/pack_source_version_loop.go create mode 100644 internal/agent/pack_source_version_loop_test.go create mode 100644 test/e2e/pack_source_version_loop_test.go diff --git a/internal/agent/pack_source_version_loop.go b/internal/agent/pack_source_version_loop.go new file mode 100644 index 0000000..1c5cc3f --- /dev/null +++ b/internal/agent/pack_source_version_loop.go @@ -0,0 +1,291 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" + "sigs.k8s.io/yaml" +) + +// packSourceSignalPrefix is the DriftSignal name prefix for upstream version signals. +// One signal per PackDelivery, written in the same namespace as the PackDelivery. +const packSourceSignalPrefix = "drift-pack-source-" + +// helmIndexMaxBytes is the upper bound for index.yaml fetches. +const helmIndexMaxBytes = 8 * 1024 * 1024 // 8 MiB + +// helmIndex represents the relevant fields of a Helm chart repository index.yaml. +type helmIndex struct { + Entries map[string][]helmIndexEntry `json:"entries" yaml:"entries"` +} + +type helmIndexEntry struct { + Version string `json:"version" yaml:"version"` +} + +// PackSourceVersionLoop runs on conductor role=management. On each cycle it: +// 1. Lists all PackDeliveries in the management namespace (seam-system). +// 2. Filters those with a non-empty spec.chartURL (Helm-backed packs). +// 3. For each, derives the Helm repository base URL from spec.chartURL, fetches +// the repository index.yaml, and finds the latest version for spec.chartName. +// 4. If the latest version is newer than spec.chartVersion, emits an +// UpstreamVersionAvailable DriftSignal in the same namespace as the PackDelivery. +// 5. If no newer version is found, confirms any existing signal. +// +// RECON-CMN1. conductor-schema.md §7. +type PackSourceVersionLoop struct { + client dynamic.Interface + namespace string + httpClient *http.Client +} + +// NewPackSourceVersionLoop constructs a PackSourceVersionLoop for the given namespace. +func NewPackSourceVersionLoop(client dynamic.Interface, namespace string) *PackSourceVersionLoop { + return &PackSourceVersionLoop{ + client: client, + namespace: namespace, + httpClient: &http.Client{ + Timeout: 15 * time.Second, + }, + } +} + +// Run runs the loop until ctx is cancelled. Fires once immediately then repeats. 
+func (l *PackSourceVersionLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +// checkOnce performs one version check cycle across all Helm-backed PackDeliveries. +func (l *PackSourceVersionLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(clusterPackMgmtGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + fmt.Printf("pack source version loop: list PackDeliveries in %s: %v\n", l.namespace, err) + return + } + + for i := range list.Items { + l.checkPack(ctx, &list.Items[i]) + } +} + +// checkPack checks one PackDelivery for upstream version availability. +func (l *PackSourceVersionLoop) checkPack(ctx context.Context, pd *k8sunstructured.Unstructured) { + spec, _, _ := unstructuredNestedMap(pd.Object, "spec") + chartURL, _ := spec["chartURL"].(string) + chartName, _ := spec["chartName"].(string) + chartVersion, _ := spec["chartVersion"].(string) + packName := pd.GetName() + + if chartURL == "" || chartName == "" || chartVersion == "" { + return + } + + repoURL, err := helmRepoBaseURL(chartURL) + if err != nil { + fmt.Printf("pack source version loop: pack=%q derive repo URL from %q: %v\n", packName, chartURL, err) + return + } + + latest, err := l.fetchLatestHelmVersion(ctx, repoURL, chartName) + if err != nil { + fmt.Printf("pack source version loop: pack=%q fetch index from %s: %v\n", packName, repoURL, err) + return + } + if latest == "" { + return + } + + signalName := packSourceSignalPrefix + packName + + if latest == chartVersion { + l.confirmSignalIfPresent(ctx, signalName, l.namespace) + return + } + + driftReason := fmt.Sprintf("upstream version available: chart=%s current=%s latest=%s", chartName, chartVersion, latest) + l.emitVersionSignal(ctx, signalName, l.namespace, packName, chartName, 
chartVersion, latest, driftReason) +} + +// fetchLatestHelmVersion downloads {repoURL}/index.yaml and returns the first version +// listed for chartName (assumed newest). Returns "" if the chart is not found in the index. +func (l *PackSourceVersionLoop) fetchLatestHelmVersion(ctx context.Context, repoURL, chartName string) (string, error) { + indexURL := repoURL + "/index.yaml" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, indexURL, nil) + if err != nil { + return "", fmt.Errorf("build request for %s: %w", indexURL, err) + } + resp, err := l.httpClient.Do(req) + if err != nil { + return "", fmt.Errorf("GET %s: %w", indexURL, err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("GET %s: status %d", indexURL, resp.StatusCode) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, helmIndexMaxBytes)) + if err != nil { + return "", fmt.Errorf("read index body from %s: %w", indexURL, err) + } + + var idx helmIndex + if err := yaml.Unmarshal(body, &idx); err != nil { + return "", fmt.Errorf("parse index.yaml from %s: %w", indexURL, err) + } + + entries := idx.Entries[chartName] + if len(entries) == 0 { + return "", nil + } + // Helm index.yaml entries are sorted newest-first by convention. + return entries[0].Version, nil +} + +// emitVersionSignal writes or updates the UpstreamVersionAvailable DriftSignal. +// Creates it if absent, re-opens a confirmed signal as pending, or returns a queued signal to pending with its escalation counter incremented; otherwise it is a no-op. 
+func (l *PackSourceVersionLoop) emitVersionSignal(ctx context.Context, signalName, namespace, packName, chartName, currentVersion, latestVersion, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("pack source version loop: pack=%q get DriftSignal: %v\n", packName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "UpstreamVersionAvailable", + "driftLayer": "governance", + "correlationID": fmt.Sprintf("pack-source-%s-%d", packName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "seam.ontai.dev", + "kind": "PackDelivery", + "namespace": namespace, + "name": packName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("pack source version loop: pack=%q create DriftSignal: %v\n", packName, cErr) + } + fmt.Printf("pack source version loop: pack=%q upstream version available (current=%s latest=%s)\n", + packName, currentVersion, latestVersion) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + + if int32(counter) >= escalationThreshold { + return + } + + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("pack-source-%s-%d", 
packName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: pack=%q reset confirmed DriftSignal: %v\n", packName, pErr) + } + return + } + + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: pack=%q increment escalation counter: %v\n", packName, pErr) + } + } +} + +// confirmSignalIfPresent advances the DriftSignal to confirmed if it exists and is not +// already in a terminal state. +func (l *PackSourceVersionLoop) confirmSignalIfPresent(ctx context.Context, signalName, namespace string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "confirmed", + "correlationID": "", + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: confirm DriftSignal %s/%s: %v\n", namespace, signalName, pErr) + } +} + +// helmRepoBaseURL extracts the Helm repository base URL (scheme + host) from a chart URL. 
+// Helm chart repositories serve index.yaml at the root of the host. +// Example: "http://10.20.0.1:5000/charts/mychart-1.0.0.tgz" -> "http://10.20.0.1:5000" +func helmRepoBaseURL(chartURL string) (string, error) { + u, err := url.Parse(chartURL) + if err != nil { + return "", fmt.Errorf("parse chart URL %q: %w", chartURL, err) + } + if u.Scheme == "" || u.Host == "" { + return "", fmt.Errorf("chart URL %q missing scheme or host", chartURL) + } + return fmt.Sprintf("%s://%s", u.Scheme, u.Host), nil +} diff --git a/internal/agent/pack_source_version_loop_test.go b/internal/agent/pack_source_version_loop_test.go new file mode 100644 index 0000000..0c3bede --- /dev/null +++ b/internal/agent/pack_source_version_loop_test.go @@ -0,0 +1,330 @@ +package agent + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/dynamic/fake" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// helmIndexYAML is a minimal Helm chart repository index for testing. +const helmIndexYAML = ` +apiVersion: v1 +entries: + mychart: + - version: "2.0.0" + - version: "1.5.0" + - version: "1.0.0" + otherchart: + - version: "0.3.0" +` + +// helmIndexEmpty has no entries for the requested chart. +const helmIndexEmpty = ` +apiVersion: v1 +entries: {} +` + +// TestHelmRepoBaseURL verifies URL base extraction from chart URLs. 
+func TestHelmRepoBaseURL(t *testing.T) { + cases := []struct { + input string + want string + wantErr bool + }{ + {"http://10.20.0.1:5000/charts/mychart-1.0.0.tgz", "http://10.20.0.1:5000", false}, + {"http://10.20.0.1:5000/mychart-1.0.0.tgz", "http://10.20.0.1:5000", false}, + {"https://charts.example.com/charts/app-2.0.0.tgz", "https://charts.example.com", false}, + {"not-a-url", "", true}, + {"", "", true}, + } + for _, tc := range cases { + got, err := helmRepoBaseURL(tc.input) + if tc.wantErr { + if err == nil { + t.Errorf("helmRepoBaseURL(%q): expected error, got %q", tc.input, got) + } + continue + } + if err != nil { + t.Errorf("helmRepoBaseURL(%q): unexpected error: %v", tc.input, err) + continue + } + if got != tc.want { + t.Errorf("helmRepoBaseURL(%q) = %q, want %q", tc.input, got, tc.want) + } + } +} + +// TestFetchLatestHelmVersion verifies the Helm index fetch and parse logic. +func TestFetchLatestHelmVersion(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/index.yaml" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/x-yaml") + _, _ = w.Write([]byte(helmIndexYAML)) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + t.Run("KnownChart", func(t *testing.T) { + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "2.0.0" { + t.Errorf("fetchLatestHelmVersion(mychart) = %q, want %q", got, "2.0.0") + } + }) + + t.Run("UnknownChart", func(t *testing.T) { + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "notexist") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "" { + t.Errorf("fetchLatestHelmVersion(notexist) = %q, want empty", got) + } + }) +} + +// TestFetchLatestHelmVersion_HTTPError verifies error propagation on server 
errors. +func TestFetchLatestHelmVersion_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "internal server error", http.StatusInternalServerError) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + _, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err == nil { + t.Error("expected error on HTTP 500, got nil") + } +} + +// TestFetchLatestHelmVersion_EmptyIndex verifies that an empty index returns "". +func TestFetchLatestHelmVersion_EmptyIndex(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/x-yaml") + _, _ = w.Write([]byte(helmIndexEmpty)) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "" { + t.Errorf("fetchLatestHelmVersion on empty index = %q, want empty", got) + } +} + +// newFakePackDelivery builds an unstructured PackDelivery for testing. +func newFakePackDelivery(name, namespace, chartURL, chartName, chartVersion string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "PackDelivery", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "chartURL": chartURL, + "chartName": chartName, + "chartVersion": chartVersion, + }, + }, + } +} + +// newFakeDynamicClient builds a fake dynamic client pre-loaded with the given objects. +func newFakeDynamicClient(scheme *runtime.Scheme, objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClient(scheme, objs...) 
+} + +// TestCheckOnce_EmitsDriftSignalOnNewerVersion verifies that checkOnce creates a +// DriftSignal when the index reports a version newer than spec.chartVersion. +func TestCheckOnce_EmitsDriftSignalOnNewerVersion(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/index.yaml") { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + return + } + http.NotFound(w, r) + })) + defer srv.Close() + + scheme := runtime.NewScheme() + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-1.0.0.tgz", "mychart", "1.0.0") + + // Register GroupVersionResource for fake client. + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "mypack" + ds, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("expected DriftSignal to be created, got error: %v", err) + } + + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + state, _ := spec["state"].(string) + if state != "pending" { + t.Errorf("DriftSignal state = %q, want %q", state, "pending") + } + signalKind, _ := spec["signalKind"].(string) + if signalKind != "UpstreamVersionAvailable" { + t.Errorf("DriftSignal signalKind = %q, want UpstreamVersionAvailable", signalKind) + } +} + +// TestCheckOnce_NoSignalWhenVersionCurrent verifies that checkOnce does not create +// a DriftSignal when 
spec.chartVersion matches the latest version in the index. +func TestCheckOnce_NoSignalWhenVersionCurrent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + })) + defer srv.Close() + + scheme := runtime.NewScheme() + // Pack already at latest version. + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-2.0.0.tgz", "mychart", "2.0.0") + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "mypack" + _, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err == nil { + t.Error("expected no DriftSignal when version is current, but one was created") + } +} + +// TestCheckOnce_SkipsNonHelmPacks verifies that packs without chartURL are skipped. 
+func TestCheckOnce_SkipsNonHelmPacks(t *testing.T) { + scheme := runtime.NewScheme() + pd := newFakePackDelivery("rawpack", "seam-system", "", "", "") // no chartURL + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + + // Should complete without panicking or creating any DriftSignal. + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "rawpack" + _, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err == nil { + t.Error("expected no DriftSignal for non-Helm pack, but one was created") + } +} + +// TestCheckOnce_ConfirmsExistingSignalWhenVersionCurrent verifies that checkOnce +// advances an existing DriftSignal to confirmed when the pack is at the latest version. 
+func TestCheckOnce_ConfirmsExistingSignalWhenVersionCurrent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + })) + defer srv.Close() + + scheme := runtime.NewScheme() + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-2.0.0.tgz", "mychart", "2.0.0") + + signalName := packSourceSignalPrefix + "mypack" + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{ + "name": signalName, + "namespace": "seam-system", + }, + "spec": map[string]interface{}{ + "state": "queued", + "signalKind": "UpstreamVersionAvailable", + }, + }, + } + // Serialize and deserialize to ensure the raw JSON format that the fake client returns. + rawBytes, _ := json.Marshal(existingSignal.Object) + _ = json.Unmarshal(rawBytes, &existingSignal.Object) + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, existingSignal, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + ds, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("expected DriftSignal to exist: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + state, _ := spec["state"].(string) + if state != "confirmed" { + t.Errorf("DriftSignal state = %q, want confirmed", state) + } +} diff --git a/internal/kernel/agent.go 
b/internal/kernel/agent.go index b52f655..4a1efaf 100644 --- a/internal/kernel/agent.go +++ b/internal/kernel/agent.go @@ -333,6 +333,17 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub execCtx.ClusterRef) } + // PackSourceVersionLoop — role=management only. Polls Helm chart repository + // index.yaml for each Helm-backed PackDelivery in seam-system and emits an + // UpstreamVersionAvailable DriftSignal when a newer chart version is found. + // RECON-CMN1. + var packSourceVersionLoop *agent.PackSourceVersionLoop + if role == RoleManagement { + packSourceVersionLoop = agent.NewPackSourceVersionLoop(dynamicClient, ns) + fmt.Printf("conductor agent: cluster=%q pack source version loop enabled (management role)\n", + execCtx.ClusterRef) + } + if runtimeDriftHandler != nil { runtimeDriftHandler.WithOperatorContextWatcher(ocWatcher) } @@ -457,7 +468,7 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub "", // identity: resolved from hostname inside RunLeaderElection agent.LeaderCallbacks{ OnStartedLeading: func(leaderCtx context.Context) { - onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, dynamicClient) + onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, packSourceVersionLoop, dynamicClient) }, OnStoppedLeading: func() { fmt.Printf("conductor agent: cluster=%q lost leadership — entering standby\n", @@ -494,6 +505,7 @@ func 
onLeaderStart( runtimeDriftHandler *agent.RuntimeDriftHandler, ocWatcher *agent.OperatorContextWatcher, clusterNodeHealthLoop *agent.ClusterNodeHealthLoop, + packSourceVersionLoop *agent.PackSourceVersionLoop, dynamicClient dynamic.Interface, ) { // Publish capability manifest to RunnerConfig status with background retry. @@ -629,6 +641,14 @@ func onLeaderStart( go clusterNodeHealthLoop.Run(leaderCtx, reconcileInterval) } + // Start PackSourceVersionLoop (management cluster only). Polls Helm chart repository + // index.yaml for each Helm-backed PackDelivery in the management namespace and emits + // UpstreamVersionAvailable DriftSignals. RECON-CMN1. + const packVersionInterval = 6 * time.Hour + if packSourceVersionLoop != nil { + go packSourceVersionLoop.Run(leaderCtx, packVersionInterval) + } + // Mark InfrastructureTalosCluster Ready=True (tenant clusters only). // snapshotPullLoop non-nil indicates role=tenant. Conductor signals readiness // to management once leadership is established. guardian-schema.md §3. diff --git a/test/e2e/pack_source_version_loop_test.go b/test/e2e/pack_source_version_loop_test.go new file mode 100644 index 0000000..9e17bc8 --- /dev/null +++ b/test/e2e/pack_source_version_loop_test.go @@ -0,0 +1,36 @@ +package e2e_test + +// pack_source_version_loop_test.go -- live cluster verification that the +// PackSourceVersionLoop correctly detects and signals upstream Helm chart +// version availability for extension PackDeliveries on ccs-mgmt. +// +// Pre-conditions: +// - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational. +// - At least one Helm-backed PackDelivery deployed to seam-system (e.g., Dex). +// - Helm chart repository at 10.20.0.1:5000 serving index.yaml with a newer +// chart version than the one referenced by the PackDelivery. +// - Conductor agent role=management running with PackSourceVersionLoop enabled. 
+// +// What this test verifies (RECON-CMN1): +// - PackSourceVersionLoop detects the version gap within one poll interval. +// - UpstreamVersionAvailable DriftSignal created in seam-system for the pack. +// - DriftSignal spec.signalKind == "UpstreamVersionAvailable". +// - After updating the PackDelivery to the latest chart version, the loop +// confirms the DriftSignal (state=confirmed) within the next poll interval. + +import ( + "testing" +) + +// TestPackSourceVersionLoop_DetectsAndSignalsNewChartVersion verifies the full +// upstream version detection and DriftSignal lifecycle. RECON-CMN1. +func TestPackSourceVersionLoop_DetectsAndSignalsNewChartVersion(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and a Helm-backed PackDelivery in seam-system with an available newer chart version and RECON-K1 closed") +} + +// TestPackSourceVersionLoop_ConfirmsSignalAfterVersionUpdate verifies that the +// loop confirms an existing UpstreamVersionAvailable signal after the PackDelivery +// spec.chartVersion is updated to match the latest index version. RECON-CMN1. +func TestPackSourceVersionLoop_ConfirmsSignalAfterVersionUpdate(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and an existing UpstreamVersionAvailable DriftSignal on a Helm-backed PackDelivery and RECON-K1 closed") +} From db42291d441dfe3dcfa6cc4af29597d67faf77e8 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 14:03:41 +0200 Subject: [PATCH 04/15] feat(compiler): emit extensions-maximum PermissionSet in compiler enable (RECON-CMN2) Adds extensions-maximum as a second Layer 1 PermissionSet in guardian-permissionsets.yaml. Covers CRD groups for all ten ONT extension operator categories (EXT-1 through EXT-10): external-secrets.io, kyverno.io, aquasecurity.github.io, velero.io, cost.grafana.com, monitoring.coreos.com, apiextensions.crossplane.io, pkg.crossplane.io. Updates TestEnable_OnlyManagementMaximumPermissionSet -> TestEnable_BootstrapPermissionSetCount to expect 2 PermissionSet documents. 
Adds TestEnable_ExtensionsMaximumPermissionSet. --- cmd/compiler/compile_enable.go | 38 +++++++++++++++++++++---- cmd/compiler/compile_enable_test.go | 43 +++++++++++++++++++++++------ 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index 27a6ec8..ba11628 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -938,9 +938,12 @@ func writeBootstrapRBACPolicy(dir string) error { } // writeBootstrapPermissionSets writes guardian-permissionsets.yaml to dir. -// Emits ONLY management-maximum, the Layer 1 fleet ceiling (CS-INV-008). -// Per-operator PermissionSets are not emitted. All Seam operator RBACProfiles -// reference management-maximum directly. guardian-schema.md §6, §19. +// Emits two Layer 1 PermissionSets: +// - management-maximum: the fleet ceiling; all Seam operator RBACProfiles reference it. +// - extensions-maximum: the extension ceiling; covers CRDs for all ONT-managed extension +// operators (EXT-1 through EXT-10). RECON-CMN2. +// +// Per-operator PermissionSets are not emitted. guardian-schema.md §6, §19, CS-INV-008. func writeBootstrapPermissionSets(dir string) error { // rule builds a single permission rule map. rule := func(apiGroups, resources, verbs []string) map[string]interface{} { @@ -975,14 +978,37 @@ func writeBootstrapPermissionSets(dir string) error { rule([]string{"*"}, []string{"*"}, allVerbs), }, }, + { + // extensions-maximum: Layer 1 extension ceiling. Covers CRDs for all ten + // ONT-managed extension operator categories (EXT-1 through EXT-10). Extension + // RBACProfiles in seam-tenant-* namespaces reference cluster-maximum (the + // per-cluster copy) for permission enforcement; this PermissionSet declares the + // fleet-level CRD-group boundary for governance audits. RECON-CMN2. 
+ name: "extensions-maximum", + labels: map[string]string{ + "ontai.dev/managed-by": "compiler", + "ontai.dev/permission-set-type": "bootstrap", + "ontai.dev/policy-type": "management", + }, + description: "Extension permission ceiling -- CRDs for ONT-managed extension operators", + permissions: []map[string]interface{}{ + rule([]string{"external-secrets.io"}, []string{"externalsecrets", "secretstores", "clustersecretstores"}, allVerbs), + rule([]string{"kyverno.io"}, []string{"clusterpolicies", "policies", "policyreports", "clusterpolicyreports"}, allVerbs), + rule([]string{"aquasecurity.github.io"}, []string{"vulnerabilityreports", "configauditreports", "clustervulnerabilityreports"}, allVerbs), + rule([]string{"velero.io"}, []string{"backups", "backupstoragelocations", "restores", "schedules", "volumesnapshotlocations"}, allVerbs), + rule([]string{"cost.grafana.com"}, []string{"*"}, allVerbs), + rule([]string{"monitoring.coreos.com"}, []string{"servicemonitors", "prometheusrules", "podmonitors"}, allVerbs), + rule([]string{"apiextensions.crossplane.io", "pkg.crossplane.io"}, []string{"*"}, allVerbs), + }, + }, } var buf bytes.Buffer buf.WriteString("# Bootstrap PermissionSet CRs\n") buf.WriteString("# Generated by: compiler enable (phase 1 guardian-bootstrap)\n") - buf.WriteString("# management-maximum is the Layer 1 fleet ceiling (guardian-schema.md §19 Layer 1).\n") - buf.WriteString("# CS-INV-008: exactly one PermissionSet at Layer 1. All Seam operator RBACProfiles\n") - buf.WriteString("# reference management-maximum directly. No per-operator PermissionSets are emitted.\n") + buf.WriteString("# management-maximum: Layer 1 fleet ceiling (guardian-schema.md §19 Layer 1).\n") + buf.WriteString("# extensions-maximum: Layer 1 extension ceiling for ONT-managed extension operators.\n") + buf.WriteString("# CS-INV-008. 
No per-operator PermissionSets are emitted.\n") for _, s := range sets { spec := map[string]interface{}{ diff --git a/cmd/compiler/compile_enable_test.go b/cmd/compiler/compile_enable_test.go index fd605a5..15feef7 100644 --- a/cmd/compiler/compile_enable_test.go +++ b/cmd/compiler/compile_enable_test.go @@ -441,10 +441,10 @@ func TestEnable_ManagementMaximumHasPolicyTypeLabel(t *testing.T) { assertContainsStr(t, content, "ontai.dev/policy-type: management") } -// TestEnable_OnlyManagementMaximumPermissionSet verifies that guardian-permissionsets.yaml -// contains exactly one PermissionSet document (management-maximum) and that it is the -// wildcard Layer 1 ceiling. Per-operator PermissionSets must not be emitted. CS-INV-008. -func TestEnable_OnlyManagementMaximumPermissionSet(t *testing.T) { +// TestEnable_BootstrapPermissionSetCount verifies that guardian-permissionsets.yaml +// contains exactly two PermissionSet documents: management-maximum and extensions-maximum. +// Per-operator PermissionSets must not be emitted. CS-INV-008, RECON-CMN2. +func TestEnable_BootstrapPermissionSetCount(t *testing.T) { outDir := t.TempDir() if err := compileEnableBundle(outDir, "dev", defaultRegistry, "", false, "", "", "", "", ""); err != nil { t.Fatalf("compileEnableBundle error: %v", err) @@ -452,16 +452,43 @@ func TestEnable_OnlyManagementMaximumPermissionSet(t *testing.T) { content := readPhaseFile(t, outDir, "01-guardian-bootstrap", "guardian-permissionsets.yaml") - // Count PermissionSet documents. count := strings.Count(content, "kind: PermissionSet") - if count != 1 { - t.Errorf("expected exactly 1 PermissionSet document, got %d (CS-INV-008)", count) + if count != 2 { + t.Errorf("expected exactly 2 PermissionSet documents (management-maximum + extensions-maximum), got %d (CS-INV-008, RECON-CMN2)", count) } - // The sole document must be management-maximum. 
if !strings.Contains(content, "name: management-maximum") { t.Error("expected management-maximum PermissionSet document") } + if !strings.Contains(content, "name: extensions-maximum") { + t.Error("expected extensions-maximum PermissionSet document (RECON-CMN2)") + } +} + +// TestEnable_ExtensionsMaximumPermissionSet verifies that extensions-maximum covers all +// ten ONT extension operator CRD groups. RECON-CMN2. +func TestEnable_ExtensionsMaximumPermissionSet(t *testing.T) { + outDir := t.TempDir() + if err := compileEnableBundle(outDir, "dev", defaultRegistry, "", false, "", "", "", "", ""); err != nil { + t.Fatalf("compileEnableBundle error: %v", err) + } + + content := readPhaseFile(t, outDir, "01-guardian-bootstrap", "guardian-permissionsets.yaml") + + for _, group := range []string{ + "external-secrets.io", + "kyverno.io", + "aquasecurity.github.io", + "velero.io", + "cost.grafana.com", + "monitoring.coreos.com", + "apiextensions.crossplane.io", + "pkg.crossplane.io", + } { + if !strings.Contains(content, group) { + t.Errorf("extensions-maximum missing CRD group %q (RECON-CMN2)", group) + } + } } // TestEnable_RBACProfilesRefManagementPolicyAndMaximum verifies that all emitted From 82c1d64cd906451a541e11c21c03201319e797a8 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 15:06:21 +0200 Subject: [PATCH 05/15] feat(conductor): add ESOHealthLoop, PolicyReportDriftLoop, VulnerabilityDriftLoop, BackupHealthLoop Add 4 management-only extension drift loops: - ESOHealthLoop (RECON-K3): detects ExternalSecret sync failures (Ready=False / Synced=False), emits ExternalSecretSyncFailed DriftSignal. Skips when ESO CRDs not installed. - PolicyReportDriftLoop (RECON-L2): detects Kyverno fail results in ClusterPolicyReport and PolicyReport CRs, emits KyvernoPolicyViolation DriftSignal. Skips when Kyverno CRDs not installed. 
- VulnerabilityDriftLoop (RECON-M2): detects CRITICAL severity vulnerabilities in Trivy Operator VulnerabilityReport CRs, emits VulnerableImageDetected DriftSignal. Skips when Trivy CRDs not installed. - BackupHealthLoop (RECON-N2): detects Velero BSL unavailability and RPO breaches (last successful backup older than 25h), emits BackupStorageUnavailable and BackupRPOBreached DriftSignals. Skips when Velero CRDs not installed. All 4 loops: - Support AutonomyLevel=observe-only gate (log-only when restricted). - Have unit tests with fake dynamic client covering signal emit, confirm, and observe-only cases. - Have e2e stubs with skip reasons referencing RECON backlog IDs. - Are wired in kernel/agent.go (management role only, with ocWatcher). Also adds unstructuredNestedSlice helper in eso_health_loop.go (package-private, used by all 4 loops for status condition parsing). --- internal/agent/backup_health_loop.go | 291 ++++++++++++++++++ internal/agent/backup_health_loop_test.go | 194 ++++++++++++ internal/agent/eso_health_loop.go | 257 ++++++++++++++++ internal/agent/eso_health_loop_test.go | 237 ++++++++++++++ internal/agent/policy_report_drift_loop.go | 249 +++++++++++++++ .../agent/policy_report_drift_loop_test.go | 169 ++++++++++ internal/agent/vulnerability_drift_loop.go | 248 +++++++++++++++ .../agent/vulnerability_drift_loop_test.go | 162 ++++++++++ internal/kernel/agent.go | 86 +++++- test/e2e/backup_health_loop_test.go | 41 +++ test/e2e/eso_health_loop_test.go | 34 ++ test/e2e/policy_report_drift_loop_test.go | 34 ++ test/e2e/vulnerability_drift_loop_test.go | 34 ++ 13 files changed, 2035 insertions(+), 1 deletion(-) create mode 100644 internal/agent/backup_health_loop.go create mode 100644 internal/agent/backup_health_loop_test.go create mode 100644 internal/agent/eso_health_loop.go create mode 100644 internal/agent/eso_health_loop_test.go create mode 100644 internal/agent/policy_report_drift_loop.go create mode 100644 
internal/agent/policy_report_drift_loop_test.go create mode 100644 internal/agent/vulnerability_drift_loop.go create mode 100644 internal/agent/vulnerability_drift_loop_test.go create mode 100644 test/e2e/backup_health_loop_test.go create mode 100644 test/e2e/eso_health_loop_test.go create mode 100644 test/e2e/policy_report_drift_loop_test.go create mode 100644 test/e2e/vulnerability_drift_loop_test.go diff --git a/internal/agent/backup_health_loop.go b/internal/agent/backup_health_loop.go new file mode 100644 index 0000000..acc6d49 --- /dev/null +++ b/internal/agent/backup_health_loop.go @@ -0,0 +1,291 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// backupStorageLocationGVR is the GroupVersionResource for BackupStorageLocation CRs (Velero). +var backupStorageLocationGVR = schema.GroupVersionResource{ + Group: "velero.io", + Version: "v1", + Resource: "backupstoragelocations", +} + +// veleroBackupGVR is the GroupVersionResource for Backup CRs (Velero). +var veleroBackupGVR = schema.GroupVersionResource{ + Group: "velero.io", + Version: "v1", + Resource: "backups", +} + +const bslSignalPrefix = "drift-bsl-" +const backupRPOSignalPrefix = "drift-backup-rpo-" + +// defaultBackupRPO is the maximum age of a successful backup before a RPO breach signal is emitted. +const defaultBackupRPO = 25 * time.Hour + +// BackupHealthLoop runs on conductor role=management. On each cycle it: +// 1. Lists BackupStorageLocation CRs in the management namespace; emits BackupStorageUnavailable +// when status.phase is not Available. +// 2. 
Lists Backup CRs and finds the most recent successful backup; emits BackupRPOBreached +// when no successful backup is younger than defaultBackupRPO (25h, covering daily schedules +// with a 1-hour grace window). +// +// Skips cleanly when Velero CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-N2. +type BackupHealthLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string + rpo time.Duration +} + +// NewBackupHealthLoop constructs a BackupHealthLoop for the given namespace. +func NewBackupHealthLoop(client dynamic.Interface, namespace, clusterRef string) *BackupHealthLoop { + return &BackupHealthLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + rpo: defaultBackupRPO, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *BackupHealthLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *BackupHealthLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *BackupHealthLoop) checkOnce(ctx context.Context) { + l.checkBSLs(ctx) + l.checkBackupRPO(ctx) +} + +func (l *BackupHealthLoop) checkBSLs(ctx context.Context) { + list, err := l.client.Resource(backupStorageLocationGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("backup health loop: list BackupStorageLocations in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkBSL(ctx, &list.Items[i]) + } +} + +func (l *BackupHealthLoop) checkBSL(ctx context.Context, bsl *k8sunstructured.Unstructured) { + name := bsl.GetName() + signalName := bslSignalPrefix + name + + status, _, _ := unstructuredNestedMap(bsl.Object, "status") + phase, _ := status["phase"].(string) + + if phase == "Available" { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: bsl=%q phase=%q -- observe-only mode, no DriftSignal written\n", name, phase) + return + } + + driftReason := fmt.Sprintf("BackupStorageLocation unavailable: name=%s phase=%s", name, phase) + l.emitSignal(ctx, signalName, "BackupStorageUnavailable", name, "velero.io", "BackupStorageLocation", driftReason) +} + +func (l *BackupHealthLoop) checkBackupRPO(ctx context.Context) { + list, err := l.client.Resource(veleroBackupGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("backup health loop: list Backups in %s: %v\n", l.namespace, err) + return + } + + signalName := backupRPOSignalPrefix + "cluster" + latestSuccess := 
l.findLatestSuccessfulBackup(list.Items) + + if latestSuccess.IsZero() { + // No successful backup at all. + if len(list.Items) == 0 { + // No backups scheduled yet -- not a breach. + return + } + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: no successful backup found -- observe-only mode, no DriftSignal written\n") + return + } + driftReason := "BackupRPOBreached: no successful backup found" + l.emitSignal(ctx, signalName, "BackupRPOBreached", "cluster", "velero.io", "Backup", driftReason) + return + } + + age := time.Since(latestSuccess) + if age <= l.rpo { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: last successful backup age=%v exceeds RPO=%v -- observe-only mode, no DriftSignal written\n", age.Round(time.Minute), l.rpo) + return + } + + driftReason := fmt.Sprintf("BackupRPOBreached: last successful backup age=%v exceeds RPO=%v", age.Round(time.Minute), l.rpo) + l.emitSignal(ctx, signalName, "BackupRPOBreached", "cluster", "velero.io", "Backup", driftReason) +} + +// findLatestSuccessfulBackup returns the completion time of the most recent Completed backup. 
+func (l *BackupHealthLoop) findLatestSuccessfulBackup(items []k8sunstructured.Unstructured) time.Time { + var latest time.Time + for i := range items { + status, _, _ := unstructuredNestedMap(items[i].Object, "status") + phase, _ := status["phase"].(string) + if phase != "Completed" { + continue + } + completionStr, _ := status["completionTimestamp"].(string) + t, err := time.Parse(time.RFC3339, completionStr) + if err != nil { + continue + } + if t.After(latest) { + latest = t + } + } + return latest +} + +func (l *BackupHealthLoop) emitSignal(ctx context.Context, signalName, signalKind, resourceName, group, kind, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("backup health loop: get DriftSignal %s: %v\n", signalName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": signalKind, + "driftLayer": "infrastructure", + "correlationID": fmt.Sprintf("backup-%s-%d", resourceName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": group, + "kind": kind, + "namespace": l.namespace, + "name": resourceName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("backup health loop: create DriftSignal %s: %v\n", signalName, cErr) + } + fmt.Printf("backup health loop: %s -- DriftSignal written\n", driftReason) + return + } + + spec, _, _ := 
unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("backup-%s-%d", resourceName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("backup health loop: reset DriftSignal %s: %v\n", signalName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("backup health loop: increment escalation counter %s: %v\n", signalName, pErr) + } + } +} + +func (l *BackupHealthLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != 
nil { + fmt.Printf("backup health loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/backup_health_loop_test.go b/internal/agent/backup_health_loop_test.go new file mode 100644 index 0000000..ea9cda0 --- /dev/null +++ b/internal/agent/backup_health_loop_test.go @@ -0,0 +1,194 @@ +package agent + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var backupTestGVRs = map[schema.GroupVersionResource]string{ + backupStorageLocationGVR: "BackupStorageLocationList", + veleroBackupGVR: "BackupList", + driftSignalGVR: "DriftSignalList", +} + +func newBackupFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), backupTestGVRs, objs...) +} + +func fakeBSL(name, phase string) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "velero.io/v1", + "kind": "BackupStorageLocation", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "status": map[string]interface{}{"phase": phase}, + }} +} + +func fakeBackup(name, phase, completionTimestamp string) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "velero.io/v1", + "kind": "Backup", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "status": map[string]interface{}{"phase": phase, "completionTimestamp": completionTimestamp}, + }} +} + +func TestBackupHealthLoop_BSLAvailable_NoSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + backup := fakeBackup("daily-backup", "Completed", time.Now().Add(-1*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, backup) + l := 
NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal for available BSL with recent backup, got create on %s", a.GetResource().Resource) + } + } +} + +func TestBackupHealthLoop_BSLUnavailable_EmitsSignal(t *testing.T) { + bsl := fakeBSL("default", "Unavailable") + client := newBackupFakeClient(bsl) + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := bslSignalPrefix + "default" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected BackupStorageUnavailable DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "BackupStorageUnavailable" { + t.Errorf("signalKind = %q, want BackupStorageUnavailable", kind) + } +} + +func TestBackupHealthLoop_RPOBreached_EmitsSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + // Backup completed 30 hours ago -- exceeds defaultBackupRPO (25h). 
+ oldBackup := fakeBackup("old-backup", "Completed", time.Now().Add(-30*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, oldBackup) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := backupRPOSignalPrefix + "cluster" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected BackupRPOBreached DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "BackupRPOBreached" { + t.Errorf("signalKind = %q, want BackupRPOBreached", kind) + } +} + +func TestBackupHealthLoop_RecentBackup_NoRPOSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + recentBackup := fakeBackup("recent-backup", "Completed", time.Now().Add(-2*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, recentBackup) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal for recent backup, got create on %s", a.GetResource().Resource) + } + } +} + +func TestBackupHealthLoop_ObserveOnly_NoSignal(t *testing.T) { + bsl := fakeBSL("default", "Unavailable") + client := newBackupFakeClient(bsl) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestBackupHealthLoop_BSLConfirmedWhenAvailable(t 
*testing.T) { + bsl := fakeBSL("default", "Available") + backup := fakeBackup("daily", "Completed", time.Now().Add(-1*time.Hour).UTC().Format(time.RFC3339)) + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": bslSignalPrefix + "default", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newBackupFakeClient(bsl, backup, existingSignal) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when BSL returns to Available") + } +} + +func TestBackupHealthLoop_FindLatestSuccessfulBackup(t *testing.T) { + l := NewBackupHealthLoop(nil, "seam-system", "ccs-mgmt") + + older := time.Now().Add(-10 * time.Hour) + newer := time.Now().Add(-2 * time.Hour) + + items := []unstructured.Unstructured{ + *fakeBackup("b1", "Completed", older.UTC().Format(time.RFC3339)), + *fakeBackup("b2", "Failed", newer.UTC().Format(time.RFC3339)), + *fakeBackup("b3", "Completed", newer.UTC().Format(time.RFC3339)), + } + + result := l.findLatestSuccessfulBackup(items) + if result.IsZero() { + t.Fatal("expected a valid timestamp") + } + if result.Before(older) || result.Before(newer.Add(-time.Second)) { + t.Errorf("expected result close to newer time, got %v", result) + } +} + +func TestBackupHealthLoop_NoBackups_NoRPOSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + client := newBackupFakeClient(bsl) + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + spec := 
a.(interface{ GetObject() runtime.Object }).GetObject() + t.Errorf("unexpected DriftSignal create: %v", spec) + } + } +} diff --git a/internal/agent/eso_health_loop.go b/internal/agent/eso_health_loop.go new file mode 100644 index 0000000..24cc407 --- /dev/null +++ b/internal/agent/eso_health_loop.go @@ -0,0 +1,257 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// externalSecretGVR is the GroupVersionResource for ExternalSecret CRs (ESO v1beta1). +var externalSecretGVR = schema.GroupVersionResource{ + Group: "external-secrets.io", + Version: "v1beta1", + Resource: "externalsecrets", +} + +// esoSignalPrefix is the DriftSignal name prefix for ESO sync failure signals. +const esoSignalPrefix = "drift-eso-sync-" + +// ESOHealthLoop runs on conductor role=management. On each cycle it: +// 1. Lists ExternalSecret CRs across the management namespace (seam-system). +// 2. For each, inspects status.conditions for a Ready=False or Synced=False condition. +// 3. Emits an ExternalSecretSyncFailed DriftSignal when a sync error is detected. +// 4. Confirms any existing signal when the ExternalSecret reaches Ready=True. +// +// Skips cleanly when the external-secrets CRDs are not installed on the cluster. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-K3. +type ESOHealthLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewESOHealthLoop constructs an ESOHealthLoop for the given namespace. 
+func NewESOHealthLoop(client dynamic.Interface, namespace, clusterRef string) *ESOHealthLoop { + return &ESOHealthLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *ESOHealthLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. +func (l *ESOHealthLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *ESOHealthLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(externalSecretGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("eso health loop: list ExternalSecrets in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkESO(ctx, &list.Items[i]) + } +} + +func (l *ESOHealthLoop) checkESO(ctx context.Context, es *k8sunstructured.Unstructured) { + name := es.GetName() + signalName := esoSignalPrefix + name + + conditions, _, _ := unstructuredNestedSlice(es.Object, "status", "conditions") + syncFailed, reason := esoSyncFailed(conditions) + + if !syncFailed { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("eso health loop: eso=%q sync failed (%s) -- observe-only mode, no DriftSignal written\n", name, reason) + return + } + + driftReason := fmt.Sprintf("ExternalSecret sync failed: name=%s reason=%s", name, reason) + l.emitSignal(ctx, signalName, name, driftReason) +} + +// esoSyncFailed returns true when any condition indicates sync failure. 
+func esoSyncFailed(conditions []interface{}) (bool, string) { + for _, raw := range conditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + condType, _ := cond["type"].(string) + condStatus, _ := cond["status"].(string) + reason, _ := cond["reason"].(string) + if (condType == "Ready" || condType == "Synced") && condStatus == "False" { + if reason == "" { + reason = "unknown" + } + return true, reason + } + } + return false, "" +} + +func (l *ESOHealthLoop) emitSignal(ctx context.Context, signalName, esName, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("eso health loop: eso=%q get DriftSignal: %v\n", esName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "ExternalSecretSyncFailed", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("eso-%s-%d", esName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "external-secrets.io", + "kind": "ExternalSecret", + "namespace": l.namespace, + "name": esName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("eso health loop: eso=%q create DriftSignal: %v\n", esName, cErr) + } + fmt.Printf("eso health loop: eso=%q sync failed -- DriftSignal written\n", esName) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) 
+ counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("eso-%s-%d", esName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("eso health loop: eso=%q reset DriftSignal: %v\n", esName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("eso health loop: eso=%q increment escalation counter: %v\n", esName, pErr) + } + } +} + +func (l *ESOHealthLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("eso health loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + 
} +} + +// isNoCRDError returns true when the API server reports the CRD resource type is unknown. +// This occurs when an extension is not installed and its CRDs are absent. +func isNoCRDError(err error) bool { + if err == nil { + return false + } + msg := err.Error() + return strings.Contains(msg, "no matches for kind") || + strings.Contains(msg, "the server could not find the requested resource") || + k8serrors.IsNotFound(err) +} + +// unstructuredNestedSlice extracts a []interface{} from an unstructured map by field path. +func unstructuredNestedSlice(obj map[string]interface{}, fields ...string) ([]interface{}, bool, error) { + cur := obj + for _, f := range fields[:len(fields)-1] { + next, ok := cur[f].(map[string]interface{}) + if !ok { + return nil, false, nil + } + cur = next + } + last := fields[len(fields)-1] + val, ok := cur[last].([]interface{}) + return val, ok, nil +} diff --git a/internal/agent/eso_health_loop_test.go b/internal/agent/eso_health_loop_test.go new file mode 100644 index 0000000..88e32f6 --- /dev/null +++ b/internal/agent/eso_health_loop_test.go @@ -0,0 +1,237 @@ +package agent + +import ( + "context" + "fmt" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +// esoTestGVRs contains the list-kind mappings used by all ESO loop tests. +var esoTestGVRs = map[schema.GroupVersionResource]string{ + externalSecretGVR: "ExternalSecretList", + driftSignalGVR: "DriftSignalList", +} + +// newESOFakeClient builds a fake dynamic client pre-loaded with the given objects, +// registering ExternalSecret and DriftSignal list kinds. +func newESOFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), esoTestGVRs, objs...) 
+} + +func TestESOHealthLoop_HealthyESO_NoSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "my-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "True"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal created for healthy ESO, got create action") + } + } +} + +func TestESOHealthLoop_SyncFailed_EmitsSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "bad-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "False", "reason": "SecretSyncError"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal to be created for failed ESO sync") + } +} + +func TestESOHealthLoop_SyncFailed_ObserveOnly_NoSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "bad-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Synced", 
"status": "False", "reason": "VaultError"}, + }, + }, + }} + + client := newESOFakeClient(es) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal created under observe-only mode") + } + } +} + +func TestESOHealthLoop_ConfirmsSignalWhenHealthy(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "my-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "True"}, + }, + }, + }} + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": esoSignalPrefix + "my-secret", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newESOFakeClient(es, existingSignal) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed (patched) when ESO is healthy") + } +} + +func TestESOHealthLoop_SignalCreated_VerifyFields(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": 
map[string]interface{}{"name": "vault-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "False", "reason": "VaultUnreachable"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), esoSignalPrefix+"vault-secret", metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "ExternalSecretSyncFailed" { + t.Errorf("signalKind = %q, want ExternalSecretSyncFailed", kind) + } + if state, _ := spec["state"].(string); state != "pending" { + t.Errorf("state = %q, want pending", state) + } +} + +func TestESOSyncFailed_BothConditionTypes(t *testing.T) { + tests := []struct { + name string + condType string + status string + wantFail bool + }{ + {"ready false", "Ready", "False", true}, + {"synced false", "Synced", "False", true}, + {"ready true", "Ready", "True", false}, + {"synced true", "Synced", "True", false}, + {"other type false", "Connected", "False", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + conditions := []interface{}{ + map[string]interface{}{"type": tt.condType, "status": tt.status}, + } + failed, _ := esoSyncFailed(conditions) + if failed != tt.wantFail { + t.Errorf("esoSyncFailed: got %v want %v", failed, tt.wantFail) + } + }) + } +} + +func TestIsNoCRDError_DetectsKnownPatterns(t *testing.T) { + tests := []struct { + err error + want bool + }{ + {nil, false}, + {&esoTestError{"no matches for kind ExternalSecret"}, true}, + {&esoTestError{"the server could not find the requested resource"}, true}, + {fmt.Errorf("connection refused"), false}, + } + for _, tt := range tests { + got := 
isNoCRDError(tt.err) + if got != tt.want { + t.Errorf("isNoCRDError(%v) = %v, want %v", tt.err, got, tt.want) + } + } +} + +func TestESOHealthLoop_Run_StopsOnContextCancel(t *testing.T) { + client := newESOFakeClient() + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately so Run exits after first checkOnce returns + done := make(chan struct{}) + go func() { + l.Run(ctx, 100*time.Millisecond) + close(done) + }() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("ESOHealthLoop.Run did not stop after context cancel") + } +} + +// esoTestError is a minimal error type for CRD-not-installed test cases. +type esoTestError struct{ msg string } + +func (e *esoTestError) Error() string { return e.msg } diff --git a/internal/agent/policy_report_drift_loop.go b/internal/agent/policy_report_drift_loop.go new file mode 100644 index 0000000..b54d66a --- /dev/null +++ b/internal/agent/policy_report_drift_loop.go @@ -0,0 +1,249 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// clusterPolicyReportGVR is the GroupVersionResource for ClusterPolicyReport CRs (Kyverno). +var clusterPolicyReportGVR = schema.GroupVersionResource{ + Group: "wgpolicyk8s.io", + Version: "v1alpha2", + Resource: "clusterpolicyreports", +} + +// policyReportGVR is the GroupVersionResource for namespaced PolicyReport CRs (Kyverno). +var policyReportGVR = schema.GroupVersionResource{ + Group: "wgpolicyk8s.io", + Version: "v1alpha2", + Resource: "policyreports", +} + +// policyReportSignalPrefix is the DriftSignal name prefix for Kyverno policy violation signals. 
+const policyReportSignalPrefix = "drift-policy-" + +// PolicyReportDriftLoop runs on conductor role=management. On each cycle it: +// 1. Lists ClusterPolicyReport and PolicyReport CRs across the management namespace. +// 2. For each report with at least one fail result, emits a KyvernoPolicyViolation DriftSignal. +// 3. Confirms any existing signal when the report has no fail results. +// +// Skips cleanly when Kyverno CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-L2. +type PolicyReportDriftLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewPolicyReportDriftLoop constructs a PolicyReportDriftLoop for the given namespace. +func NewPolicyReportDriftLoop(client dynamic.Interface, namespace, clusterRef string) *PolicyReportDriftLoop { + return &PolicyReportDriftLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *PolicyReportDriftLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *PolicyReportDriftLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *PolicyReportDriftLoop) checkOnce(ctx context.Context) { + clusterList, err := l.client.Resource(clusterPolicyReportGVR).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("policy report drift loop: list ClusterPolicyReports: %v\n", err) + return + } + for i := range clusterList.Items { + l.checkReport(ctx, &clusterList.Items[i], true) + } + + nsList, err := l.client.Resource(policyReportGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("policy report drift loop: list PolicyReports in %s: %v\n", l.namespace, err) + return + } + for i := range nsList.Items { + l.checkReport(ctx, &nsList.Items[i], false) + } +} + +func (l *PolicyReportDriftLoop) checkReport(ctx context.Context, report *k8sunstructured.Unstructured, cluster bool) { + name := report.GetName() + prefix := "cluster-" + if !cluster { + prefix = "" + } + signalName := policyReportSignalPrefix + prefix + name + + failCount, policies := policyReportFailures(report.Object) + if failCount == 0 { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("policy report drift loop: report=%q has %d fail(s) -- observe-only mode, no DriftSignal written\n", name, failCount) + return + } + + driftReason := fmt.Sprintf("Kyverno policy violations: report=%s failCount=%d policies=%v", name, failCount, policies) + l.emitSignal(ctx, signalName, name, driftReason) +} + +// policyReportFailures counts fail results in a PolicyReport object and returns policy names. 
+func policyReportFailures(obj map[string]interface{}) (int, []string) { + results, _, _ := unstructuredNestedSlice(obj, "results") + var count int + var names []string + for _, raw := range results { + entry, ok := raw.(map[string]interface{}) + if !ok { + continue + } + result, _ := entry["result"].(string) + if result == "fail" { + count++ + if policy, ok := entry["policy"].(string); ok { + names = append(names, policy) + } + } + } + return count, names +} + +func (l *PolicyReportDriftLoop) emitSignal(ctx context.Context, signalName, reportName, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("policy report drift loop: report=%q get DriftSignal: %v\n", reportName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "KyvernoPolicyViolation", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("policy-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "wgpolicyk8s.io", + "kind": "PolicyReport", + "name": reportName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("policy report drift loop: report=%q create DriftSignal: %v\n", reportName, cErr) + } + fmt.Printf("policy report drift loop: report=%q Kyverno policy violations -- DriftSignal written\n", reportName) + return + } + + spec, _, _ := 
unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("policy-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: report=%q reset DriftSignal: %v\n", reportName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: report=%q increment escalation counter: %v\n", reportName, pErr) + } + } +} + +func (l *PolicyReportDriftLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, 
metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/policy_report_drift_loop_test.go b/internal/agent/policy_report_drift_loop_test.go new file mode 100644 index 0000000..4ec4a87 --- /dev/null +++ b/internal/agent/policy_report_drift_loop_test.go @@ -0,0 +1,169 @@ +package agent + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var policyTestGVRs = map[schema.GroupVersionResource]string{ + clusterPolicyReportGVR: "ClusterPolicyReportList", + policyReportGVR: "PolicyReportList", + driftSignalGVR: "DriftSignalList", +} + +func newPolicyFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), policyTestGVRs, objs...) 
+} + +func TestPolicyReportDriftLoop_NoViolations_NoSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "pass", "policy": "require-labels"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal for passing policy report") + } + } +} + +func TestPolicyReportDriftLoop_ClusterReportFail_EmitsSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "require-psa"}, + map[string]interface{}{"result": "pass", "policy": "require-labels"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal created for ClusterPolicyReport with fail results") + } +} + +func TestPolicyReportDriftLoop_SignalFields(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "my-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "no-privileged"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := 
NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := policyReportSignalPrefix + "cluster-" + "my-report" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "KyvernoPolicyViolation" { + t.Errorf("signalKind = %q, want KyvernoPolicyViolation", kind) + } +} + +func TestPolicyReportDriftLoop_ObserveOnly_NoSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "require-psa"}, + }, + }} + + client := newPolicyFakeClient(cr) + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestPolicyReportDriftLoop_ConfirmsSignalWhenClean(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "clean-report"}, + "results": []interface{}{}, + }} + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": 
map[string]interface{}{"name": policyReportSignalPrefix + "cluster-" + "clean-report", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newPolicyFakeClient(cr, existingSignal) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when report has no violations") + } +} + +func TestPolicyReportFailures_CountsCorrectly(t *testing.T) { + obj := map[string]interface{}{ + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "pol-a"}, + map[string]interface{}{"result": "pass", "policy": "pol-b"}, + map[string]interface{}{"result": "fail", "policy": "pol-c"}, + }, + } + count, policies := policyReportFailures(obj) + if count != 2 { + t.Errorf("failCount = %d, want 2", count) + } + if len(policies) != 2 { + t.Errorf("policies len = %d, want 2", len(policies)) + } +} diff --git a/internal/agent/vulnerability_drift_loop.go b/internal/agent/vulnerability_drift_loop.go new file mode 100644 index 0000000..4105a6e --- /dev/null +++ b/internal/agent/vulnerability_drift_loop.go @@ -0,0 +1,248 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// vulnerabilityReportGVR is the GroupVersionResource for VulnerabilityReport CRs (Trivy Operator). 
+var vulnerabilityReportGVR = schema.GroupVersionResource{ + Group: "aquasecurity.github.io", + Version: "v1alpha1", + Resource: "vulnerabilityreports", +} + +// vulnerabilitySignalPrefix is the DriftSignal name prefix for Trivy vulnerability signals. +const vulnerabilitySignalPrefix = "drift-vuln-" + +// criticalSeverity is the default minimum severity threshold for emitting a DriftSignal. +const criticalSeverity = "CRITICAL" + +// VulnerabilityDriftLoop runs on conductor role=management. On each cycle it: +// 1. Lists VulnerabilityReport CRs in the management namespace (seam-system). +// 2. For each report containing at least one vulnerability at or above CRITICAL severity, +// emits a VulnerableImageDetected DriftSignal. +// 3. Confirms any existing signal when the report has no CRITICAL vulnerabilities. +// +// Skips cleanly when Trivy Operator CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-M2. +type VulnerabilityDriftLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewVulnerabilityDriftLoop constructs a VulnerabilityDriftLoop for the given namespace. +func NewVulnerabilityDriftLoop(client dynamic.Interface, namespace, clusterRef string) *VulnerabilityDriftLoop { + return &VulnerabilityDriftLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *VulnerabilityDriftLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *VulnerabilityDriftLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *VulnerabilityDriftLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(vulnerabilityReportGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("vulnerability drift loop: list VulnerabilityReports in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkReport(ctx, &list.Items[i]) + } +} + +func (l *VulnerabilityDriftLoop) checkReport(ctx context.Context, report *k8sunstructured.Unstructured) { + name := report.GetName() + signalName := vulnerabilitySignalPrefix + name + + critCount, imageRef := vulnerabilityCriticalCount(report.Object) + if critCount == 0 { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("vulnerability drift loop: report=%q has %d CRITICAL vuln(s) -- observe-only mode, no DriftSignal written\n", name, critCount) + return + } + + driftReason := fmt.Sprintf("vulnerable image detected: report=%s image=%s criticalCount=%d", name, imageRef, critCount) + l.emitSignal(ctx, signalName, name, imageRef, driftReason) +} + +// vulnerabilityCriticalCount returns the count of CRITICAL severity vulnerabilities +// and the image reference from the VulnerabilityReport object. 
+func vulnerabilityCriticalCount(obj map[string]interface{}) (int, string) { + report, _, _ := unstructuredNestedMap(obj, "report") + imageRef := "" + if artifact, _, _ := unstructuredNestedMap(obj, "report", "artifact"); len(artifact) > 0 { + imageRef, _ = artifact["repository"].(string) + if tag, _ := artifact["tag"].(string); tag != "" { + imageRef = imageRef + ":" + tag + } + } + + summary, _, _ := unstructuredNestedMap(report, "summary") + if len(summary) == 0 { + // Fall back to scanning vulnerabilities slice directly. + return vulnerabilityCriticalCountFromSlice(obj, imageRef) + } + + critFloat, _ := summary["criticalCount"].(float64) + return int(critFloat), imageRef +} + +func vulnerabilityCriticalCountFromSlice(obj map[string]interface{}, imageRef string) (int, string) { + vulnerabilities, _, _ := unstructuredNestedSlice(obj, "report", "vulnerabilities") + count := 0 + for _, raw := range vulnerabilities { + v, ok := raw.(map[string]interface{}) + if !ok { + continue + } + severity, _ := v["severity"].(string) + if severity == criticalSeverity { + count++ + } + } + return count, imageRef +} + +func (l *VulnerabilityDriftLoop) emitSignal(ctx context.Context, signalName, reportName, imageRef, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("vulnerability drift loop: report=%q get DriftSignal: %v\n", reportName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "VulnerableImageDetected", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("vuln-%s-%d", reportName, time.Now().UnixNano()), + 
"observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "aquasecurity.github.io", + "kind": "VulnerabilityReport", + "namespace": l.namespace, + "name": reportName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("vulnerability drift loop: report=%q create DriftSignal: %v\n", reportName, cErr) + } + fmt.Printf("vulnerability drift loop: report=%q CRITICAL vuln(s) in %s -- DriftSignal written\n", reportName, imageRef) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("vuln-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: report=%q reset DriftSignal: %v\n", reportName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: report=%q increment escalation counter: %v\n", reportName, pErr) + } + } +} + 
+func (l *VulnerabilityDriftLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/vulnerability_drift_loop_test.go b/internal/agent/vulnerability_drift_loop_test.go new file mode 100644 index 0000000..c81a6c8 --- /dev/null +++ b/internal/agent/vulnerability_drift_loop_test.go @@ -0,0 +1,162 @@ +package agent + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var vulnTestGVRs = map[schema.GroupVersionResource]string{ + vulnerabilityReportGVR: "VulnerabilityReportList", + driftSignalGVR: "DriftSignalList", +} + +func newVulnFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), vulnTestGVRs, objs...) +} + +// fakeVulnReport builds a VulnerabilityReport with the given criticalCount in the summary. 
+func fakeVulnReport(name string, critCount int) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "aquasecurity.github.io/v1alpha1", + "kind": "VulnerabilityReport", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "report": map[string]interface{}{ + "artifact": map[string]interface{}{"repository": "nginx", "tag": "1.25.0"}, + "summary": map[string]interface{}{"criticalCount": float64(critCount)}, + }, + }} +} + +func TestVulnerabilityDriftLoop_NoCritical_NoSignal(t *testing.T) { + report := fakeVulnReport("safe-report", 0) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal for report with zero critical vulnerabilities") + } + } +} + +func TestVulnerabilityDriftLoop_CriticalFound_EmitsSignal(t *testing.T) { + report := fakeVulnReport("vuln-report", 3) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal created for report with critical vulnerabilities") + } +} + +func TestVulnerabilityDriftLoop_SignalFields(t *testing.T) { + report := fakeVulnReport("my-vuln-report", 2) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := vulnerabilitySignalPrefix + "my-vuln-report" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, 
_, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "VulnerableImageDetected" { + t.Errorf("signalKind = %q, want VulnerableImageDetected", kind) + } + if state, _ := spec["state"].(string); state != "pending" { + t.Errorf("state = %q, want pending", state) + } +} + +func TestVulnerabilityDriftLoop_ObserveOnly_NoSignal(t *testing.T) { + report := fakeVulnReport("vuln-report", 5) + client := newVulnFakeClient(report) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestVulnerabilityDriftLoop_ConfirmsSignalWhenClean(t *testing.T) { + report := fakeVulnReport("fixed-report", 0) + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": vulnerabilitySignalPrefix + "fixed-report", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newVulnFakeClient(report, existingSignal) + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when no critical vulns remain") + } +} + +func TestVulnerabilityCriticalCount_FromSummary(t *testing.T) { + obj := map[string]interface{}{ + "report": map[string]interface{}{ + "summary": 
map[string]interface{}{"criticalCount": float64(7)}, + }, + } + count, _ := vulnerabilityCriticalCount(obj) + if count != 7 { + t.Errorf("criticalCount = %d, want 7", count) + } +} + +func TestVulnerabilityCriticalCount_FromSlice(t *testing.T) { + obj := map[string]interface{}{ + "report": map[string]interface{}{ + "vulnerabilities": []interface{}{ + map[string]interface{}{"severity": "CRITICAL"}, + map[string]interface{}{"severity": "HIGH"}, + map[string]interface{}{"severity": "CRITICAL"}, + map[string]interface{}{"severity": "MEDIUM"}, + }, + }, + } + count, _ := vulnerabilityCriticalCount(obj) + if count != 2 { + t.Errorf("criticalCount from slice = %d, want 2", count) + } +} diff --git a/internal/kernel/agent.go b/internal/kernel/agent.go index 4a1efaf..dc7b2f3 100644 --- a/internal/kernel/agent.go +++ b/internal/kernel/agent.go @@ -344,12 +344,64 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub execCtx.ClusterRef) } + // ESOHealthLoop — role=management only. Polls ExternalSecret CRs in seam-system + // and emits ExternalSecretSyncFailed DriftSignals on sync errors. + // Skips cleanly when ESO CRDs are not installed. RECON-K3. + var esoHealthLoop *agent.ESOHealthLoop + if role == RoleManagement { + esoHealthLoop = agent.NewESOHealthLoop(dynamicClient, ns, execCtx.ClusterRef) + fmt.Printf("conductor agent: cluster=%q ESO health loop enabled (management role)\n", + execCtx.ClusterRef) + } + + // PolicyReportDriftLoop — role=management only. Polls Kyverno PolicyReport and + // ClusterPolicyReport CRs and emits KyvernoPolicyViolation DriftSignals on failures. + // Skips cleanly when Kyverno CRDs are not installed. RECON-L2. 
+ var policyReportDriftLoop *agent.PolicyReportDriftLoop + if role == RoleManagement { + policyReportDriftLoop = agent.NewPolicyReportDriftLoop(dynamicClient, ns, execCtx.ClusterRef) + fmt.Printf("conductor agent: cluster=%q policy report drift loop enabled (management role)\n", + execCtx.ClusterRef) + } + + // VulnerabilityDriftLoop — role=management only. Polls Trivy Operator + // VulnerabilityReport CRs and emits VulnerableImageDetected DriftSignals for + // CRITICAL severity findings. Skips cleanly when Trivy CRDs not installed. RECON-M2. + var vulnerabilityDriftLoop *agent.VulnerabilityDriftLoop + if role == RoleManagement { + vulnerabilityDriftLoop = agent.NewVulnerabilityDriftLoop(dynamicClient, ns, execCtx.ClusterRef) + fmt.Printf("conductor agent: cluster=%q vulnerability drift loop enabled (management role)\n", + execCtx.ClusterRef) + } + + // BackupHealthLoop — role=management only. Polls Velero BackupStorageLocation and + // Backup CRs; emits BackupStorageUnavailable and BackupRPOBreached DriftSignals. + // Skips cleanly when Velero CRDs are not installed. RECON-N2. 
+ var backupHealthLoop *agent.BackupHealthLoop + if role == RoleManagement { + backupHealthLoop = agent.NewBackupHealthLoop(dynamicClient, ns, execCtx.ClusterRef) + fmt.Printf("conductor agent: cluster=%q backup health loop enabled (management role)\n", + execCtx.ClusterRef) + } + if runtimeDriftHandler != nil { runtimeDriftHandler.WithOperatorContextWatcher(ocWatcher) } if packPodHealthLoop != nil { packPodHealthLoop.WithOperatorContextWatcher(ocWatcher) } + if esoHealthLoop != nil { + esoHealthLoop.WithOperatorContextWatcher(ocWatcher) + } + if policyReportDriftLoop != nil { + policyReportDriftLoop.WithOperatorContextWatcher(ocWatcher) + } + if vulnerabilityDriftLoop != nil { + vulnerabilityDriftLoop.WithOperatorContextWatcher(ocWatcher) + } + if backupHealthLoop != nil { + backupHealthLoop.WithOperatorContextWatcher(ocWatcher) + } // Phase 3b — Start the federation channel listener/client. // Management Conductor: start FederationServer when FEDERATION_CA_CERT_PATH, @@ -468,7 +520,7 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub "", // identity: resolved from hostname inside RunLeaderElection agent.LeaderCallbacks{ OnStartedLeading: func(leaderCtx context.Context) { - onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, packSourceVersionLoop, dynamicClient) + onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, packSourceVersionLoop, esoHealthLoop, 
policyReportDriftLoop, vulnerabilityDriftLoop, backupHealthLoop, dynamicClient) }, OnStoppedLeading: func() { fmt.Printf("conductor agent: cluster=%q lost leadership — entering standby\n", @@ -506,6 +558,10 @@ func onLeaderStart( ocWatcher *agent.OperatorContextWatcher, clusterNodeHealthLoop *agent.ClusterNodeHealthLoop, packSourceVersionLoop *agent.PackSourceVersionLoop, + esoHealthLoop *agent.ESOHealthLoop, + policyReportDriftLoop *agent.PolicyReportDriftLoop, + vulnerabilityDriftLoop *agent.VulnerabilityDriftLoop, + backupHealthLoop *agent.BackupHealthLoop, dynamicClient dynamic.Interface, ) { // Publish capability manifest to RunnerConfig status with background retry. @@ -649,6 +705,34 @@ func onLeaderStart( go packSourceVersionLoop.Run(leaderCtx, packVersionInterval) } + // Start ESOHealthLoop (management cluster only). Polls ExternalSecret CRs for sync + // failures and emits ExternalSecretSyncFailed DriftSignals. Skips when ESO CRDs absent. + // RECON-K3. + if esoHealthLoop != nil { + go esoHealthLoop.Run(leaderCtx, reconcileInterval) + } + + // Start PolicyReportDriftLoop (management cluster only). Polls Kyverno PolicyReport and + // ClusterPolicyReport CRs and emits KyvernoPolicyViolation DriftSignals. Skips when + // Kyverno CRDs absent. RECON-L2. + if policyReportDriftLoop != nil { + go policyReportDriftLoop.Run(leaderCtx, reconcileInterval) + } + + // Start VulnerabilityDriftLoop (management cluster only). Polls Trivy Operator + // VulnerabilityReport CRs and emits VulnerableImageDetected DriftSignals for CRITICAL + // findings. Skips when Trivy CRDs absent. RECON-M2. + if vulnerabilityDriftLoop != nil { + go vulnerabilityDriftLoop.Run(leaderCtx, reconcileInterval) + } + + // Start BackupHealthLoop (management cluster only). Polls Velero BackupStorageLocation + // and Backup CRs; emits BackupStorageUnavailable and BackupRPOBreached DriftSignals. + // Skips when Velero CRDs absent. RECON-N2. 
+ if backupHealthLoop != nil { + go backupHealthLoop.Run(leaderCtx, reconcileInterval) + } + // Mark InfrastructureTalosCluster Ready=True (tenant clusters only). // snapshotPullLoop non-nil indicates role=tenant. Conductor signals readiness // to management once leadership is established. guardian-schema.md §3. diff --git a/test/e2e/backup_health_loop_test.go b/test/e2e/backup_health_loop_test.go new file mode 100644 index 0000000..c8bf20c --- /dev/null +++ b/test/e2e/backup_health_loop_test.go @@ -0,0 +1,41 @@ +package e2e_test + +// backup_health_loop_test.go -- live cluster verification that BackupHealthLoop +// correctly detects Velero BSL unavailability and RPO breaches, emitting +// BackupStorageUnavailable and BackupRPOBreached DriftSignals. +// +// Pre-conditions: +// - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational. +// - Velero PackDelivery deployed to seam-system (RECON-N1 closed). +// - At least one BackupStorageLocation present in seam-system. +// - Conductor agent role=management running with BackupHealthLoop enabled. +// +// What this test verifies (RECON-N2): +// - BackupHealthLoop creates a BackupStorageUnavailable DriftSignal in seam-system +// when a BackupStorageLocation is not in phase=Available. +// - BackupHealthLoop creates a BackupRPOBreached DriftSignal when no successful +// backup is younger than 25 hours. +// - DriftSignal spec.signalKind == "BackupStorageUnavailable" or "BackupRPOBreached". +// - After the BSL returns to Available and a recent backup completes, the loop +// confirms both DriftSignals (state=confirmed). +// - Under AutonomyLevel=observe-only, no DriftSignal is created. 
+ +import ( + "testing" +) + +func TestBackupHealthLoop_BSLUnavailableEmitsDriftSignal(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and Velero deployed with a degraded BackupStorageLocation with RECON-N1 and RECON-N2 closed") +} + +func TestBackupHealthLoop_RPOBreachedEmitsDriftSignal(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and Velero deployed with no successful backup within RPO window with RECON-N1 and RECON-N2 closed") +} + +func TestBackupHealthLoop_ConfirmsSignalAfterRecovery(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and existing BackupStorageUnavailable or BackupRPOBreached DriftSignal with RECON-N2 closed") +} + +func TestBackupHealthLoop_ObserveOnly_NoSignalCreated(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-N2 closed") +} diff --git a/test/e2e/eso_health_loop_test.go b/test/e2e/eso_health_loop_test.go new file mode 100644 index 0000000..a702ea2 --- /dev/null +++ b/test/e2e/eso_health_loop_test.go @@ -0,0 +1,34 @@ +package e2e_test + +// eso_health_loop_test.go -- live cluster verification that ESOHealthLoop correctly +// detects ExternalSecret sync failures and emits ExternalSecretSyncFailed DriftSignals. +// +// Pre-conditions: +// - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational. +// - External Secrets Operator PackDelivery deployed to seam-system (RECON-K1 closed). +// - At least one ExternalSecret CR present in seam-system pointing to a secret store. +// - Conductor agent role=management running with ESOHealthLoop enabled. +// +// What this test verifies (RECON-K3): +// - ESOHealthLoop creates an ExternalSecretSyncFailed DriftSignal in seam-system +// when an ExternalSecret has a Ready=False or Synced=False condition. +// - DriftSignal spec.signalKind == "ExternalSecretSyncFailed". +// - After the ExternalSecret recovers (Ready=True), the loop confirms the +// DriftSignal (state=confirmed) within the next poll interval. 
+// - Under AutonomyLevel=observe-only, no DriftSignal is created. + +import ( + "testing" +) + +func TestESOHealthLoop_SyncFailedEmitsDriftSignal(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and ESO PackDelivery deployed to seam-system with RECON-K1 and RECON-K3 closed") +} + +func TestESOHealthLoop_ConfirmsSignalOnRecovery(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and an existing ExternalSecretSyncFailed DriftSignal with RECON-K1 and RECON-K3 closed") +} + +func TestESOHealthLoop_ObserveOnly_NoSignalCreated(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-K3 closed") +} diff --git a/test/e2e/policy_report_drift_loop_test.go b/test/e2e/policy_report_drift_loop_test.go new file mode 100644 index 0000000..dea3970 --- /dev/null +++ b/test/e2e/policy_report_drift_loop_test.go @@ -0,0 +1,34 @@ +package e2e_test + +// policy_report_drift_loop_test.go -- live cluster verification that PolicyReportDriftLoop +// correctly detects Kyverno policy violations and emits KyvernoPolicyViolation DriftSignals. +// +// Pre-conditions: +// - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational. +// - Kyverno PackDelivery deployed to seam-system (RECON-L1 closed). +// - At least one ClusterPolicy or Policy active; at least one ClusterPolicyReport present. +// - Conductor agent role=management running with PolicyReportDriftLoop enabled. +// +// What this test verifies (RECON-L2): +// - PolicyReportDriftLoop creates a KyvernoPolicyViolation DriftSignal in seam-system +// when a PolicyReport or ClusterPolicyReport contains at least one fail result. +// - DriftSignal spec.signalKind == "KyvernoPolicyViolation". +// - After the policy violation is remediated (fail result removed), the loop confirms +// the DriftSignal (state=confirmed). +// - Under AutonomyLevel=observe-only, no DriftSignal is created. 
+ +import ( + "testing" +) + +func TestPolicyReportDriftLoop_ViolationEmitsDriftSignal(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and Kyverno deployed with a failing ClusterPolicyReport with RECON-L1 and RECON-L2 closed") +} + +func TestPolicyReportDriftLoop_ConfirmsSignalAfterRemediation(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and an existing KyvernoPolicyViolation DriftSignal with RECON-L2 closed") +} + +func TestPolicyReportDriftLoop_ObserveOnly_NoSignalCreated(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-L2 closed") +} diff --git a/test/e2e/vulnerability_drift_loop_test.go b/test/e2e/vulnerability_drift_loop_test.go new file mode 100644 index 0000000..c6523b7 --- /dev/null +++ b/test/e2e/vulnerability_drift_loop_test.go @@ -0,0 +1,34 @@ +package e2e_test + +// vulnerability_drift_loop_test.go -- live cluster verification that VulnerabilityDriftLoop +// correctly detects CRITICAL vulnerabilities and emits VulnerableImageDetected DriftSignals. +// +// Pre-conditions: +// - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational. +// - Trivy Operator PackDelivery deployed to seam-system (RECON-M1 closed). +// - At least one VulnerabilityReport present in seam-system with scan results. +// - Conductor agent role=management running with VulnerabilityDriftLoop enabled. +// +// What this test verifies (RECON-M2): +// - VulnerabilityDriftLoop creates a VulnerableImageDetected DriftSignal in seam-system +// when a VulnerabilityReport contains at least one CRITICAL severity vulnerability. +// - DriftSignal spec.signalKind == "VulnerableImageDetected". +// - After the image is updated to a patched version (criticalCount drops to zero), +// the loop confirms the DriftSignal (state=confirmed). +// - Under AutonomyLevel=observe-only, no DriftSignal is created. 
+ +import ( + "testing" +) + +func TestVulnerabilityDriftLoop_CriticalVulnEmitsDriftSignal(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and Trivy Operator deployed with a VulnerabilityReport containing CRITICAL CVEs with RECON-M1 and RECON-M2 closed") +} + +func TestVulnerabilityDriftLoop_ConfirmsSignalAfterImageUpdate(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and an existing VulnerableImageDetected DriftSignal with RECON-M2 closed") +} + +func TestVulnerabilityDriftLoop_ObserveOnly_NoSignalCreated(t *testing.T) { + t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-M2 closed") +} From a670071c6a00a44566277977992228e1269bee21 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 21:55:35 +0200 Subject: [PATCH 06/15] fix(agent): correct stale API group and missing ForceConflicts in pack-deploy Two bugs discovered during ccs-mgmt management cluster bootstrap: 1. capability_publisher.go: runnerConfigGVR used the pre-refactor API group infrastructure.ontai.dev/infrastructurerunnerconfigs instead of seam.ontai.dev/runnerconfigs. Conductor could not publish capabilities to RunnerConfig, blocking all PackExecution dispatch. 2. wrapper.go: applyParsedManifest and ensureNamespaces did not set Force: true on server-side apply PatchOptions. Phase C kubectl apply took field ownership on Namespace resources; conductor-exec pack-deploy Jobs then failed with field ownership conflicts on every re-apply. Both fixes are required for PackDelivery to succeed on clusters that were bootstrapped with Phase C raw kubectl apply. 
--- internal/agent/capability_publisher.go | 10 +++++----- internal/capability/wrapper.go | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/internal/agent/capability_publisher.go b/internal/agent/capability_publisher.go index 5e5a6fa..af9d236 100644 --- a/internal/agent/capability_publisher.go +++ b/internal/agent/capability_publisher.go @@ -35,11 +35,11 @@ const capabilityWatchInterval = 15 * time.Second const runnerConfigMissingDriftThreshold = 5 // runnerConfigGVR is the GroupVersionResource for RunnerConfig CRs. -// API group infrastructure.ontai.dev, schema version v1alpha1. conductor-schema.md §5. +// API group seam.ontai.dev, schema version v1alpha1. conductor-schema.md §5. var runnerConfigGVR = schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Resource: "infrastructurerunnerconfigs", + Resource: "runnerconfigs", } // CapabilityPublisher writes the Conductor capability manifest to the RunnerConfig @@ -69,7 +69,7 @@ func (p *CapabilityPublisher) emitRunnerConfigMissingSignal(ctx context.Context, now := time.Now().UTC().Format(time.RFC3339) obj := map[string]interface{}{ - "apiVersion": "infrastructure.ontai.dev/v1alpha1", + "apiVersion": "seam.ontai.dev/v1alpha1", "kind": "DriftSignal", "metadata": map[string]interface{}{ "name": signalName, @@ -81,7 +81,7 @@ func (p *CapabilityPublisher) emitRunnerConfigMissingSignal(ctx context.Context, "observedAt": now, "driftReason": "RunnerConfig not found in ont-system -- cluster-state drift", "affectedCRRef": map[string]interface{}{ - "group": "infrastructure.ontai.dev", + "group": "seam.ontai.dev", "kind": "RunnerConfig", "name": clusterRef, }, diff --git a/internal/capability/wrapper.go b/internal/capability/wrapper.go index cf2ee6b..1065268 100644 --- a/internal/capability/wrapper.go +++ b/internal/capability/wrapper.go @@ -1019,9 +1019,10 @@ func ensureNamespaces(ctx context.Context, dynClient dynamic.Interface, manifest 
for ns := range needed { nsJSON := []byte(fmt.Sprintf( `{"apiVersion":"v1","kind":"Namespace","metadata":{"name":%q}}`, ns)) + forceNS := true _, err := dynClient.Resource(namespaceGVR).Patch( ctx, ns, types.ApplyPatchType, nsJSON, - metav1.PatchOptions{FieldManager: "conductor-pack-deploy"}, + metav1.PatchOptions{FieldManager: "conductor-pack-deploy", Force: &forceNS}, ) if err != nil { return created, fmt.Errorf("pre-create namespace %q: %w", ns, err) @@ -1103,18 +1104,23 @@ func stageForKind(kind string) string { // --------------------------------------------------------------------------- // applyParsedManifest applies m to the cluster via server-side apply. +// Force is set to true so conductor-pack-deploy takes field ownership from any +// prior manager (e.g. kubectl) without returning a conflict error. func applyParsedManifest(ctx context.Context, dynClient dynamic.Interface, m parsedManifest) error { gvr := gvrFromAPIVersionKind(m.apiVersion, m.kind) + force := true if m.namespace != "" { _, err := dynClient.Resource(gvr).Namespace(m.namespace). Patch(ctx, m.name, types.ApplyPatchType, m.jsonData, metav1.PatchOptions{ FieldManager: "conductor-pack-deploy", + Force: &force, }) return err } _, err := dynClient.Resource(gvr). Patch(ctx, m.name, types.ApplyPatchType, m.jsonData, metav1.PatchOptions{ FieldManager: "conductor-pack-deploy", + Force: &force, }) return err } From 8de1cfe3a1fd06a73b7b6330859f6a64133235c7 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 22:56:20 +0200 Subject: [PATCH 07/15] feat(compiler): add LINEAGE_CNPG_URI env var to seam Deployment in enable phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sources the URI from guardian-db-app secret so LineageController can archive LineageRecords to CNPG on root declaration deletion. seam-schema.md §4. 
--- cmd/compiler/compile_enable.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index ba11628..5d510c2 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -2439,6 +2439,22 @@ func buildOperatorDeployment(op operatorSpec) appsv1.Deployment { }) } + // seam carries LINEAGE_CNPG_URI so the LineageController can archive + // LineageRecords to CNPG on root declaration deletion. The URI is sourced from + // the guardian-db-app Secret which CNPG generates for the app user. + // seam-schema.md §4 CNPG Lineage Archival. INV-016. + if op.Name == "seam" { + env = append(env, corev1.EnvVar{ + Name: "LINEAGE_CNPG_URI", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "guardian-db-app"}, + Key: "uri", + }, + }, + }) + } + // Platform carries CONDUCTOR_REGISTRY so it can construct Conductor executor Job // image references without hardcoding the registry. conductor-schema.md §15. if op.ConductorRegistry != "" { From c01a26b5e3101197a159104ecc4ec3154bda0e33 Mon Sep 17 00:00:00 2001 From: ontave Date: Fri, 29 May 2026 23:11:48 +0200 Subject: [PATCH 08/15] feat(compiler): emit seam-root-declaration-principal MutatingWebhookConfiguration Adds writeSeamDeclaringPrincipalWebhook() to phase 3 output. The generated MutatingWebhookConfiguration intercepts CREATE for talosclusters and packdeliveries and routes to the seam webhook server at /mutate-root-declaration-declaring-principal. This wires the declaring-principal stamping into cluster bootstrap so every new root declaration carries the requesting principal identity, enabling LineageController to populate spec.rootBinding.declaringPrincipal with a real actor rather than system:unknown. Also removes stale seam.ontai.dev_runnerconfigs.yaml from conductor config/crd; RunnerConfig is owned by seam per Decision 13. 
--- cmd/compiler/compile_enable.go | 87 +++++ config/crd/seam.ontai.dev_runnerconfigs.yaml | 323 ------------------- 2 files changed, 87 insertions(+), 323 deletions(-) delete mode 100644 config/crd/seam.ontai.dev_runnerconfigs.yaml diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index 5d510c2..995ec56 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -1679,6 +1679,7 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { "platform-dispatcher-deployments.yaml", "platform-dispatcher-metrics-services.yaml", "seam-service.yaml", + "seam-declaring-principal-webhook.yaml", "seam-lineage-webhooks.yaml", } @@ -1690,6 +1691,8 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { "provisioned=true (kubectl get rbacprofiles -n seam-system). " + "These operators must be operational before Conductor's RBACProfile " + "can be provisioned in phase 4. " + + "Verify seam MutatingWebhookConfiguration is registered: " + + "kubectl get mutatingwebhookconfigurations | grep seam-root-declaration. " + "Verify seam ValidatingWebhookConfigurations are registered: " + "kubectl get validatingwebhookconfigurations | grep seam-lineage.", ApplyOrder: files, @@ -1753,6 +1756,14 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { return err } + // seam-declaring-principal-webhook.yaml — MutatingWebhookConfiguration that stamps + // infrastructure.ontai.dev/declaring-principal on TalosCluster and PackDelivery at + // CREATE time. Required for LineageController to populate declaringPrincipal on + // LineageRecord with the actual requesting principal. + if err := writeSeamDeclaringPrincipalWebhook(dir, seamNamespace); err != nil { + return err + } + // seam-lineage-webhooks.yaml — three ValidatingWebhookConfigurations for LineageRecord // governance: immutability (Decision 1), authorship gate (Decision 3), domainRef (Decision 2). 
if err := writeSeamWebhooks(dir); err != nil { @@ -1762,6 +1773,82 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { return nil } +// writeSeamDeclaringPrincipalWebhook writes seam-declaring-principal-webhook.yaml to dir. +// Emits a MutatingWebhookConfiguration that intercepts CREATE for TalosCluster and +// PackDelivery and stamps infrastructure.ontai.dev/declaring-principal with the +// requesting user's identity from AdmissionReview.UserInfo.Username. +// +// Without this webhook, the LineageController falls back to "system:unknown" for +// declaringPrincipal on every LineageRecord, making ownership tracing impossible. +// +// caBundle injected by cert-manager CA injector via cert-manager.io/inject-ca-from. +func writeSeamDeclaringPrincipalWebhook(dir, seamNamespace string) error { + injectAnnotation := seamNamespace + "/seam-webhook-cert" + + mwc := map[string]interface{}{ + "apiVersion": "admissionregistration.k8s.io/v1", + "kind": "MutatingWebhookConfiguration", + "metadata": map[string]interface{}{ + "name": "seam-root-declaration-principal", + "annotations": map[string]string{ + "ontai.dev/managed-by": "compiler", + "cert-manager.io/inject-ca-from": injectAnnotation, + }, + }, + "webhooks": []map[string]interface{}{ + { + "name": "mutate-root-declaration-declaring-principal.seam.ontai.dev", + "admissionReviewVersions": []string{"v1"}, + "sideEffects": "None", + "failurePolicy": "Fail", + "rules": []map[string]interface{}{ + { + "apiGroups": []string{"seam.ontai.dev"}, + "apiVersions": []string{"v1alpha1"}, + "operations": []string{"CREATE"}, + "resources": []string{"talosclusters", "packdeliveries"}, + "scope": "Namespaced", + }, + }, + "namespaceSelector": map[string]interface{}{ + "matchExpressions": []map[string]interface{}{ + { + "key": "seam.ontai.dev/webhook-mode", + "operator": "NotIn", + "values": []string{"exempt"}, + }, + }, + }, + "clientConfig": map[string]interface{}{ + "service": map[string]interface{}{ + 
"name": "seam", + "namespace": seamNamespace, + "path": "/mutate-root-declaration-declaring-principal", + "port": 443, + }, + }, + }, + }, + } + + data, err := yaml.Marshal(mwc) + if err != nil { + return fmt.Errorf("marshal seam declaring-principal MutatingWebhookConfiguration: %w", err) + } + + var buf bytes.Buffer + buf.WriteString("# seam Root Declaration Declaring Principal MutatingWebhookConfiguration\n") + buf.WriteString("# Generated by: compiler enable (phase 3 platform-dispatcher)\n") + buf.WriteString("# Stamps infrastructure.ontai.dev/declaring-principal on TalosCluster and\n") + buf.WriteString("# PackDelivery at CREATE time from AdmissionReview.UserInfo.Username.\n") + buf.WriteString("# Required for LineageController declaringPrincipal traceability.\n") + buf.WriteString("# caBundle injected by cert-manager CA injector.\n") + buf.WriteString("---\n") + buf.Write(data) + + return os.WriteFile(filepath.Join(dir, "seam-declaring-principal-webhook.yaml"), buf.Bytes(), 0644) +} + // writePlatformExecutorRoleFile emits a Role and RoleBinding in ont-system for the // platform-executor SA. Conductor executor Jobs run as this SA and write // InfrastructureTalosClusterOperationResult CRs to POD_NAMESPACE (ont-system). 
diff --git a/config/crd/seam.ontai.dev_runnerconfigs.yaml b/config/crd/seam.ontai.dev_runnerconfigs.yaml deleted file mode 100644 index 094bf6e..0000000 --- a/config/crd/seam.ontai.dev_runnerconfigs.yaml +++ /dev/null @@ -1,323 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.16.1 - name: runnerconfigs.seam.ontai.dev -spec: - group: seam.ontai.dev - names: - kind: RunnerConfig - listKind: RunnerConfigList - plural: runnerconfigs - shortNames: - - rc - singular: runnerconfig - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .spec.clusterRef - name: Cluster - type: string - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: |- - RunnerConfig is the seam-core CRD for Conductor agent runtime configuration. - Owned by seam-core; authored exclusively by the platform operator. INV-009. - conductor-schema.md. MIGRATION-3.8. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - RunnerConfigSpec is the operator-generated operational contract for a - specific cluster. Generated at runtime by platform using the runner shared library. - Never human-authored. INV-009, INV-010. 
conductor-schema.md. - properties: - clusterRef: - description: ClusterRef is the name of the TalosCluster this RunnerConfig - is authoritative for. - type: string - maintenanceTargetNodes: - description: MaintenanceTargetNodes is the list of node names that - are the subject of the operation. - items: - type: string - type: array - operationalHistory: - description: OperationalHistory is an append-only record of completed - RunnerConfig executions. - items: - description: |- - RunnerOperationalHistoryEntry is a single append-only audit record describing one - configuration change applied to this RunnerConfig. Never truncated. - properties: - appliedAt: - description: AppliedAt is the time this change was applied. - format: date-time - type: string - appliedBy: - description: AppliedBy identifies who applied the change. - type: string - concern: - description: Concern identifies what aspect of configuration - changed. - type: string - newValue: - description: NewValue is the value after the change. - type: string - previousValue: - description: PreviousValue is the value before the change. Empty - for initial entries. - type: string - required: - - appliedAt - - appliedBy - - concern - - newValue - type: object - type: array - operatorLeaderNode: - description: OperatorLeaderNode is the node hosting the leader pod - of the initiating operator. - type: string - phases: - description: Phases is the ordered list of operational phases for - this cluster's Conductor lifecycle. - items: - description: RunnerPhaseConfig carries per-phase parameters for - the runner's execution context. - properties: - name: - description: Name identifies the phase. - type: string - parameters: - additionalProperties: - type: string - description: Parameters holds phase-specific key-value configuration. - type: object - required: - - name - type: object - type: array - runnerImage: - description: |- - RunnerImage is the fully qualified container image reference for the Conductor agent. 
- Tag convention: v{talosVersion}-r{revision} stable, dev/dev-rc{N} development. INV-011. - type: string - selfOperation: - description: SelfOperation is true when the Job's execution cluster - and the target cluster are the same. - type: boolean - steps: - description: Steps is the ordered list of execution steps across all - phases. - items: - description: RunnerConfigStep declares one step in a multi-step - operation intent. - properties: - capability: - description: Capability is the named Conductor capability to - invoke for this step. - type: string - dependsOn: - description: DependsOn is the name of a prior step that must - complete before this step begins. - type: string - haltOnFailure: - description: |- - HaltOnFailure controls sequencer behaviour when this step fails. - When true, failure terminates the RunnerConfig with no further steps executing. - type: boolean - name: - description: Name is the unique identifier for this step within - the RunnerConfig. - type: string - parameters: - additionalProperties: - type: string - description: Parameters is the input parameter map passed to - the capability at Job materialisation time. - type: object - required: - - capability - - name - type: object - type: array - required: - - clusterRef - - runnerImage - type: object - status: - description: |- - RunnerConfigStatus is written exclusively by the Conductor agent leader. - CR-INV-006. - properties: - agentLeader: - description: AgentLeader is the pod name of the current Conductor - agent leader. - type: string - agentVersion: - description: AgentVersion is the version string of the Conductor agent - binary currently running. - type: string - capabilities: - description: |- - Capabilities is the self-declared capability manifest emitted by the Conductor agent on startup. - CR-INV-005. - items: - description: RunnerCapabilityEntry is one capability declared by - the Conductor agent on startup. 
- properties: - description: - description: Description is a human-readable description of - what this capability does. - type: string - name: - description: Name is the capability name (e.g., pack-deploy, - talos-upgrade). - type: string - version: - description: Version is the capability version declared by the - agent. - type: string - required: - - name - - version - type: object - type: array - conditions: - description: Conditions is the standard Kubernetes condition list - for this RunnerConfig. - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. 
- maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - failedStep: - description: |- - FailedStep is the name of the first step that reached the Failed phase. - Present only when Phase="Failed". conductor-schema.md §17. - type: string - phase: - description: |- - Phase is the terminal execution phase written by Conductor execute mode. - "Completed" means all steps succeeded. "Failed" means at least one step failed. - Empty means execution is in progress. Platform operators watch this field to - detect terminal conditions without scanning StepResults. conductor-schema.md §17. - type: string - stepResults: - description: StepResults is the ordered list of step result records - written by Conductor execute mode. - items: - description: RunnerConfigStepResult is the status record for one - step. - properties: - completedAt: - description: CompletedAt is the time this step finished execution. - format: date-time - type: string - message: - description: Message is additional context about the step outcome. - type: string - name: - description: Name matches the Name field of the corresponding - RunnerConfigStep in spec. - type: string - startedAt: - description: StartedAt is the time this step began execution. - format: date-time - type: string - status: - allOf: - - enum: - - Succeeded - - Failed - - Skipped - - enum: - - Succeeded - - Failed - - Skipped - description: Status is the terminal status of this step execution. 
- type: string - required: - - name - - status - type: object - type: array - type: object - type: object - served: true - storage: true - subresources: - status: {} From c82a49873f11ee076b0cc5096e3b0ce6c35039ee Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 30 May 2026 04:54:10 +0200 Subject: [PATCH 09/15] feat(conductor): implement watchdog capabilities and wire Job submission Add 4 watchdog remediation capability handlers (pod-restart, resource-patch, force-volume-detach, credential-refresh) triggered by RuntimeDrift DriftSignals. Replace the placeholder Kueue Job submission in RuntimeDriftHandler with real Job construction: capability selection from failureReason, execute image from RunnerConfig.spec.runnerImage, kubeconfig Secret mount, Kueue watchdog-queue LocalQueue admission in ont-system. Add watchdog-queue LocalQueue to compiler enable phase 05 output. 12 capability handler tests and 3 compiler tests cover the new functionality. --- cmd/compiler/compile_enable.go | 47 ++- internal/agent/runtime_drift_handler.go | 165 +++++++- internal/agent/runtime_drift_handler_test.go | 95 +++++ internal/capability/stubs.go | 7 + internal/capability/watchdog.go | 355 ++++++++++++++++++ internal/capability/watchdog_test.go | 322 ++++++++++++++++ .../unit/compiler/wrapper_runner_rbac_test.go | 47 +++ 7 files changed, 1029 insertions(+), 9 deletions(-) create mode 100644 internal/capability/watchdog.go create mode 100644 internal/capability/watchdog_test.go diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index 995ec56..d72c175 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -2062,10 +2062,10 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa files := []string{ "leaderelection.yaml", } - // pack-deploy-queue.yaml and dispatcher-runner.yaml require Kueue and seam-tenant-{name} - // namespaces, which exist only on the management cluster (INV-003). 
+ // pack-deploy-queue.yaml, watchdog-queue.yaml, and dispatcher-runner.yaml require Kueue and + // seam-tenant-{name} namespaces, which exist only on the management cluster (INV-003). if clusterName != "" && clusterRole != "tenant" { - files = append(files, "pack-deploy-queue.yaml", "dispatcher-runner.yaml") + files = append(files, "pack-deploy-queue.yaml", "watchdog-queue.yaml", "dispatcher-runner.yaml") } meta := phaseMeta{ @@ -2091,6 +2091,11 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa if err := writePackDeployQueueYAML(dir, clusterName); err != nil { return err } + // watchdog-queue.yaml — Kueue LocalQueue in ont-system for watchdog Jobs. + // conductor-schema.md §6 RuntimeDrift remediation. + if err := writeWatchdogQueueYAML(dir); err != nil { + return err + } // dispatcher-runner.yaml — SA, Role, RoleBinding for pack-deploy Job identity. // guardian-schema.md §6, INV-004. if err := writeDispatcherRunnerRBACYAML(dir, clusterName); err != nil { @@ -2844,6 +2849,42 @@ func writePackDeployQueueYAML(dir, clusterName string) error { return os.WriteFile(filepath.Join(dir, "pack-deploy-queue.yaml"), buf.Bytes(), 0644) } +// writeWatchdogQueueYAML emits a Kueue LocalQueue named watchdog-queue in ont-system +// referencing ClusterQueue seam-pack-deploy. The LocalQueue gates watchdog Job admission +// for RuntimeDrift remediation capabilities submitted by the conductor agent. +// conductor-schema.md §6 RuntimeDrift remediation. 
+func writeWatchdogQueueYAML(dir string) error { + lq := map[string]interface{}{ + "apiVersion": "kueue.x-k8s.io/v1beta1", + "kind": "LocalQueue", + "metadata": map[string]interface{}{ + "name": "watchdog-queue", + "namespace": "ont-system", + "labels": map[string]interface{}{ + "ontai.dev/managed-by": "compiler", + }, + }, + "spec": map[string]interface{}{ + "clusterQueue": "seam-pack-deploy", + }, + } + + data, err := yaml.Marshal(lq) + if err != nil { + return fmt.Errorf("marshal watchdog-queue LocalQueue: %w", err) + } + + var buf bytes.Buffer + buf.WriteString("# Kueue LocalQueue — watchdog-queue in ont-system\n") + buf.WriteString("# References ClusterQueue seam-pack-deploy.\n") + buf.WriteString("# Required for watchdog Job admission for RuntimeDrift remediation.\n") + buf.WriteString("# Generated by: compiler enable (phase 05 post-bootstrap)\n") + buf.WriteString("# conductor-schema.md §6.\n") + buf.WriteString("---\n") + buf.Write(data) + return os.WriteFile(filepath.Join(dir, "watchdog-queue.yaml"), buf.Bytes(), 0644) +} + // writeDispatcherRunnerRBACYAML emits ServiceAccount, Role, and RoleBinding for the // dispatcher-runner identity in seam-tenant-{clusterName}. The Role is annotated with // ontai.dev/rbac-owner=guardian per INV-004. diff --git a/internal/agent/runtime_drift_handler.go b/internal/agent/runtime_drift_handler.go index 260fe28..e159108 100644 --- a/internal/agent/runtime_drift_handler.go +++ b/internal/agent/runtime_drift_handler.go @@ -13,6 +13,8 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" + + "github.com/ontai-dev/conductor-sdk/runnerlib" ) // packLogGVR is the GroupVersionResource for PackLog CRs (dispatcher). @@ -212,14 +214,21 @@ func (h *RuntimeDriftHandler) reconcileRuntimeDrift( h.markApprovalActed(ctx, approval.GetName(), packInstalledNS) } - // 4. Submit remediation Job (Job scheduling via Kueue placeholder). 
- // The actual Kueue Job submission is handled by the remediation capability - // executor. Here we increment the attempt count in PackLog and advance the - // signal to state=queued. + // 4. Submit a Kueue watchdog Job that runs the appropriate remediation + // capability against the target cluster. The capability is chosen from + // the failureReason via watchdogCapabilityForFailureReason. + capability := watchdogCapabilityForFailureReason(failureReason) + executeImage := h.resolveExecuteImage(ctx, clusterName) + jobErr := h.submitWatchdogJob(ctx, clusterName, packInstalledName, capability, failureReason, executeImage) + if jobErr != nil { + fmt.Printf("runtime drift handler: cluster=%q signal=%q Job submit failed: %v\n", + clusterName, signalName, jobErr) + return + } h.incrementPackLogAttempts(ctx, packLogName, packInstalledNS, failureReason, currentAttempts+1) h.advanceSignalState(ctx, tenantNS, signalName, "queued") - fmt.Printf("runtime drift handler: cluster=%q signal=%q remediation attempt %d submitted\n", - clusterName, signalName, currentAttempts+1) + fmt.Printf("runtime drift handler: cluster=%q signal=%q capability=%q attempt %d queued\n", + clusterName, signalName, capability, currentAttempts+1) return } @@ -535,3 +544,147 @@ func (h *RuntimeDriftHandler) markApprovalActed(ctx context.Context, approvalNam namespace, approvalName, pErr) } } + +// watchdogCapabilityForFailureReason maps a DriftSignal failureReason string to the +// appropriate watchdog capability name. Defaults to pod-restart for unknown reasons. +func watchdogCapabilityForFailureReason(failureReason string) string { + switch failureReason { + case "OOMKilled": + return runnerlib.CapabilityResourcePatch + case "ImagePullBackOff", "ErrImagePull": + return runnerlib.CapabilityCredentialRefresh + case "FailedMount", "MultiAttachError": + return runnerlib.CapabilityForceVolumeDetach + default: + // CrashLoopBackOff and all other reasons. 
+ return runnerlib.CapabilityPodRestart + } +} + +// watchdogExecuteImageFallback is used when the RunnerConfig cannot be read. +const watchdogExecuteImageFallback = "10.20.0.1:5000/ontai-dev/conductor-execute:dev" + +// resolveExecuteImage reads spec.runnerImage from the RunnerConfig for clusterRef. +// Returns the fallback image when the RunnerConfig is absent or the field is empty. +func (h *RuntimeDriftHandler) resolveExecuteImage(ctx context.Context, clusterRef string) string { + rc, err := h.client.Resource(runnerConfigGVR).Namespace(h.namespace).Get(ctx, clusterRef, metav1.GetOptions{}) + if err != nil { + return watchdogExecuteImageFallback + } + img, _, _ := unstructured.NestedString(rc.Object, "spec", "runnerImage") + if img == "" { + return watchdogExecuteImageFallback + } + return img +} + +// watchdogJobGVR is the GroupVersionResource for batch/v1 Jobs. +var watchdogJobGVR = schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"} + +// submitWatchdogJob creates a Kueue-admitted batch/v1 Job in h.namespace that runs +// the given watchdog capability against the target cluster. The Job mounts the +// tenant kubeconfig Secret so the capability executor can reach the tenant cluster. +// conductor-schema.md §6, wrapper-schema.md §9. 
+func (h *RuntimeDriftHandler) submitWatchdogJob( + ctx context.Context, + clusterRef, packInstalledName, capability, failureReason, executeImage string, +) error { + jobName := fmt.Sprintf("watchdog-%s-%s-%d", sanitizeLabel(capability), sanitizeLabel(clusterRef), time.Now().Unix()) + ttl := int64(600) + completions := int64(1) + backoffLimit := int64(0) + falseVal := false + trueVal := true + + job := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": map[string]interface{}{ + "name": jobName, + "namespace": h.namespace, + "labels": map[string]interface{}{ + "kueue.x-k8s.io/queue-name": "watchdog-queue", + "ontai.dev/watchdog-capability": capability, + "ontai.dev/cluster-ref": clusterRef, + }, + }, + "spec": map[string]interface{}{ + "ttlSecondsAfterFinished": ttl, + "completions": completions, + "backoffLimit": backoffLimit, + "template": map[string]interface{}{ + "spec": map[string]interface{}{ + "serviceAccountName": "conductor", + "restartPolicy": "Never", + "securityContext": map[string]interface{}{ + "runAsNonRoot": trueVal, + "seccompProfile": map[string]interface{}{ + "type": "RuntimeDefault", + }, + }, + "volumes": []interface{}{ + map[string]interface{}{ + "name": "kubeconfig", + "secret": map[string]interface{}{ + "secretName": "seam-mc-" + clusterRef + "-kubeconfig", + }, + }, + }, + "containers": []interface{}{ + map[string]interface{}{ + "name": "conductor", + "image": executeImage, + "imagePullPolicy": "Always", + "env": []interface{}{ + map[string]interface{}{"name": "CAPABILITY", "value": capability}, + map[string]interface{}{"name": "CLUSTER_REF", "value": clusterRef}, + map[string]interface{}{"name": "POD_NAMESPACE", "value": h.namespace}, + map[string]interface{}{"name": "PACK_INSTALLED_NAME", "value": packInstalledName}, + map[string]interface{}{"name": "FAILURE_REASON", "value": failureReason}, + }, + "volumeMounts": []interface{}{ + map[string]interface{}{ + "name": 
"kubeconfig", + "mountPath": "/var/run/secrets/kubeconfig", + "subPath": "value", + "readOnly": trueVal, + }, + }, + "securityContext": map[string]interface{}{ + "allowPrivilegeEscalation": falseVal, + "capabilities": map[string]interface{}{ + "drop": []interface{}{"ALL"}, + }, + "runAsNonRoot": trueVal, + "seccompProfile": map[string]interface{}{ + "type": "RuntimeDefault", + }, + }, + }, + }, + }, + }, + }, + }, + } + + _, err := h.client.Resource(watchdogJobGVR).Namespace(h.namespace).Create(ctx, job, metav1.CreateOptions{}) + return err +} + +// sanitizeLabel trims characters that are not valid in Kubernetes label values or +// Job name components (alphanumeric plus hyphen and dot, max 63 chars per segment). +// Used to build the Job name from capability and clusterRef strings. +func sanitizeLabel(s string) string { + out := make([]byte, 0, len(s)) + for i := 0; i < len(s) && len(out) < 30; i++ { + c := s[i] + if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' { + out = append(out, c) + } else if c >= 'A' && c <= 'Z' { + out = append(out, c+32) // to lower + } + } + return string(out) +} diff --git a/internal/agent/runtime_drift_handler_test.go b/internal/agent/runtime_drift_handler_test.go index 05dc505..a5440bb 100644 --- a/internal/agent/runtime_drift_handler_test.go +++ b/internal/agent/runtime_drift_handler_test.go @@ -160,3 +160,98 @@ func TestRuntimeDriftHandler_SkipsGovernanceDrift(t *testing.T) { // If this panics, the guard is missing and the test fails. _ = h } + +// TestWatchdogCapabilityForFailureReason verifies that each known failure reason +// maps to the expected watchdog capability and that unknown reasons fall through +// to pod-restart. 
+func TestWatchdogCapabilityForFailureReason(t *testing.T) { + cases := []struct { + reason string + want string + }{ + {"OOMKilled", "resource-patch"}, + {"ImagePullBackOff", "credential-refresh"}, + {"ErrImagePull", "credential-refresh"}, + {"FailedMount", "force-volume-detach"}, + {"MultiAttachError", "force-volume-detach"}, + {"CrashLoopBackOff", "pod-restart"}, + {"Unknown", "pod-restart"}, + {"", "pod-restart"}, + } + for _, tc := range cases { + got := watchdogCapabilityForFailureReason(tc.reason) + if got != tc.want { + t.Errorf("watchdogCapabilityForFailureReason(%q) = %q, want %q", tc.reason, got, tc.want) + } + } +} + +// setupJobScheme builds a fake scheme with types needed to verify Job creation. +func setupJobScheme() *runtime.Scheme { + s := setupApprovalScheme() + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "batch", Version: "v1", Kind: "Job", + }, &unstructured.Unstructured{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "batch", Version: "v1", Kind: "JobList", + }, &unstructured.UnstructuredList{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", + }, &unstructured.Unstructured{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfigList", + }, &unstructured.UnstructuredList{}) + return s +} + +// TestSubmitWatchdogJob_CreatesJobInOntSystem verifies that submitWatchdogJob +// creates a batch/v1 Job in the ont-system namespace with the expected Kueue +// queue label and env vars. 
+func TestSubmitWatchdogJob_CreatesJobInOntSystem(t *testing.T) { + client := fake.NewSimpleDynamicClient(setupJobScheme()) + h := NewRuntimeDriftHandler(client, "ont-system") + + err := h.submitWatchdogJob(context.Background(), + "ccs-dev", "nginx", "pod-restart", "CrashLoopBackOff", "10.20.0.1:5000/ontai-dev/conductor-execute:dev") + if err != nil { + t.Fatalf("submitWatchdogJob returned unexpected error: %v", err) + } + + jobGVR := schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"} + list, listErr := client.Resource(jobGVR).Namespace("ont-system").List(context.Background(), metav1.ListOptions{}) + if listErr != nil { + t.Fatalf("list Jobs: %v", listErr) + } + if len(list.Items) != 1 { + t.Fatalf("expected 1 Job, got %d", len(list.Items)) + } + job := list.Items[0] + + // Verify Kueue queue label. + labels := job.GetLabels() + if queueName := labels["kueue.x-k8s.io/queue-name"]; queueName != "watchdog-queue" { + t.Errorf("expected queue label watchdog-queue, got %q", queueName) + } + // Verify namespace. + if job.GetNamespace() != "ont-system" { + t.Errorf("expected namespace ont-system, got %q", job.GetNamespace()) + } + // Verify CAPABILITY env var. 
+ containers, _, _ := unstructured.NestedSlice(job.Object, "spec", "template", "spec", "containers") + if len(containers) == 0 { + t.Fatal("expected at least 1 container in Job spec") + } + container, _ := containers[0].(map[string]interface{}) + envVars, _, _ := unstructured.NestedSlice(container, "env") + found := false + for _, envRaw := range envVars { + env, _ := envRaw.(map[string]interface{}) + if env["name"] == "CAPABILITY" && env["value"] == "pod-restart" { + found = true + break + } + } + if !found { + t.Errorf("CAPABILITY=pod-restart env var not found in Job container; env: %v", envVars) + } +} diff --git a/internal/capability/stubs.go b/internal/capability/stubs.go index 826a229..2d31a06 100644 --- a/internal/capability/stubs.go +++ b/internal/capability/stubs.go @@ -42,6 +42,13 @@ func RegisterAll(reg *Registry) { // Guardian capabilities -- RBAC plane. reg.Register(runnerlib.CapabilityRBACProvision, &rbacProvisionHandler{}) + // Watchdog capabilities -- runtime failure remediation. Triggered by + // RuntimeDrift DriftSignals on the management cluster. conductor-schema.md §6. + reg.Register(runnerlib.CapabilityPodRestart, &podRestartHandler{}) + reg.Register(runnerlib.CapabilityResourcePatch, &resourcePatchHandler{}) + reg.Register(runnerlib.CapabilityForceVolumeDetach, &forceVolumeDetachHandler{}) + reg.Register(runnerlib.CapabilityCredentialRefresh, &credentialRefreshHandler{}) + // Note: CapabilityPackCompile is NOT registered here. pack-compile is a // Compiler compile-mode invocation only -- it never runs as a Conductor Job. // Registering it here would be a schema violation. conductor-schema.md §6. diff --git a/internal/capability/watchdog.go b/internal/capability/watchdog.go new file mode 100644 index 0000000..2fab1fe --- /dev/null +++ b/internal/capability/watchdog.go @@ -0,0 +1,355 @@ +// Package capability — Conductor Watchdog remediation capability implementations. +// pod-restart, resource-patch, force-volume-detach, credential-refresh. 
+// Triggered by RuntimeDrift DriftSignals on the management cluster. +// conductor-schema.md §6, conductor-sdk runnerlib/constants.go. +package capability + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + + "github.com/ontai-dev/conductor-sdk/runnerlib" +) + +// packNameLabel is the label key used to scope watchdog operations to a specific pack. +// Set by the pack-deploy handler on pod template specs at deploy time. +const packNameLabel = "seam.ontai.dev/pack-name" + +// volumeAttachmentGVR is the GroupVersionResource for storage.k8s.io/v1 VolumeAttachment. +var volumeAttachmentGVR = schema.GroupVersionResource{ + Group: "storage.k8s.io", + Version: "v1", + Resource: "volumeattachments", +} + +// podGVR is the GroupVersionResource for core/v1 Pod. +var podGVR = schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} + +// podNamespaceAnnotation is stamped on Jobs by the execute runner so watchdog handlers +// can find the namespace where the failing pod lives. +const podNamespaceAnnotation = "conductor.ontai.dev/pod-namespace" + +// --------------------------------------------------------------------------- +// pod-restart +// --------------------------------------------------------------------------- + +// podRestartHandler implements the pod-restart named capability. +// Deletes all pods bearing the seam.ontai.dev/pack-name={PackInstalledName} label +// so the ReplicaSet controller recreates them. Used for CrashLoopBackOff failures. +// conductor-schema.md §6, conductor-sdk CapabilityPodRestart. 
+type podRestartHandler struct{} + +func (h *podRestartHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { + now := time.Now().UTC() + + if params.TenantKubeClient == nil { + return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ValidationFailure, + "pod-restart requires TenantKubeClient"), nil + } + if params.PackInstalledName == "" { + return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ValidationFailure, + "pod-restart requires PackInstalledName"), nil + } + + selector := packNameLabel + "=" + params.PackInstalledName + podList, err := params.TenantKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ExecutionFailure, + fmt.Sprintf("list pods with label %s on cluster %s: %v", selector, params.ClusterRef, err)), nil + } + + stepStart := time.Now().UTC() + deleted := 0 + for _, pod := range podList.Items { + if err := params.TenantKubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { + return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ExecutionFailure, + fmt.Sprintf("delete pod %s/%s on cluster %s: %v", pod.Namespace, pod.Name, params.ClusterRef, err)), nil + } + deleted++ + params.Log().Info("pod-restart: deleted pod", "cluster", params.ClusterRef, "pod", pod.Namespace+"/"+pod.Name) + } + + return runnerlib.OperationResultSpec{ + Capability: runnerlib.CapabilityPodRestart, + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Artifacts: []runnerlib.ArtifactRef{}, + Steps: []runnerlib.StepResult{{ + Name: "delete-pods", + Status: runnerlib.ResultSucceeded, + StartedAt: stepStart, + CompletedAt: time.Now().UTC(), + Message: fmt.Sprintf("%d pod(s) deleted from pack %s on cluster %s", deleted, params.PackInstalledName, params.ClusterRef), + }}, + }, nil +} + +// 
--------------------------------------------------------------------------- +// resource-patch +// --------------------------------------------------------------------------- + +// resourcePatchHandler implements the resource-patch named capability. +// Triggers a rollout restart on all Deployments and StatefulSets bearing the +// seam.ontai.dev/pack-name label. The rollout annotation forces pods to be +// recreated with updated scheduler placement, resolving OOMKilled failures. +// conductor-schema.md §6, conductor-sdk CapabilityResourcePatch. +type resourcePatchHandler struct{} + +func (h *resourcePatchHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { + now := time.Now().UTC() + + if params.TenantDynamicClient == nil { + return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ValidationFailure, + "resource-patch requires TenantDynamicClient"), nil + } + if params.PackInstalledName == "" { + return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ValidationFailure, + "resource-patch requires PackInstalledName"), nil + } + + selector := packNameLabel + "=" + params.PackInstalledName + restartTS := time.Now().UTC().Format(time.RFC3339) + + // Patch both Deployments and StatefulSets that belong to this pack. + gvrs := []schema.GroupVersionResource{deploymentGVR, statefulSetGVR} + stepStart := time.Now().UTC() + patched := 0 + + for _, gvr := range gvrs { + list, err := params.TenantDynamicClient.Resource(gvr).Namespace("").List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ExecutionFailure, + fmt.Sprintf("list %s with label %s on cluster %s: %v", gvr.Resource, selector, params.ClusterRef, err)), nil + } + for _, item := range list.Items { + // Inject restart annotation on spec.template.metadata.annotations. 
+ patch := fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`, + restartTS, + ) + _, err := params.TenantDynamicClient.Resource(gvr).Namespace(item.GetNamespace()).Patch( + ctx, item.GetName(), types.MergePatchType, []byte(patch), metav1.PatchOptions{}, + ) + if err != nil { + return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ExecutionFailure, + fmt.Sprintf("patch %s %s/%s on cluster %s: %v", gvr.Resource, item.GetNamespace(), item.GetName(), params.ClusterRef, err)), nil + } + patched++ + params.Log().Info("resource-patch: rollout restart triggered", + "cluster", params.ClusterRef, "resource", gvr.Resource, "name", item.GetNamespace()+"/"+item.GetName()) + } + } + + return runnerlib.OperationResultSpec{ + Capability: runnerlib.CapabilityResourcePatch, + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Artifacts: []runnerlib.ArtifactRef{}, + Steps: []runnerlib.StepResult{{ + Name: "rollout-restart", + Status: runnerlib.ResultSucceeded, + StartedAt: stepStart, + CompletedAt: time.Now().UTC(), + Message: fmt.Sprintf("%d workload(s) patched for rollout restart on cluster %s", patched, params.ClusterRef), + }}, + }, nil +} + +// --------------------------------------------------------------------------- +// force-volume-detach +// --------------------------------------------------------------------------- + +// forceVolumeDetachHandler implements the force-volume-detach named capability. +// Deletes VolumeAttachment objects for PVCs belonging to the pack so the kubelet +// can re-attach the volumes on a healthy node. Used for FailedMount and +// MultiAttachError failures. conductor-schema.md §6, conductor-sdk CapabilityForceVolumeDetach. 
+type forceVolumeDetachHandler struct{} + +func (h *forceVolumeDetachHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { + now := time.Now().UTC() + + if params.TenantDynamicClient == nil { + return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ValidationFailure, + "force-volume-detach requires TenantDynamicClient"), nil + } + if params.PackInstalledName == "" { + return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ValidationFailure, + "force-volume-detach requires PackInstalledName"), nil + } + + selector := packNameLabel + "=" + params.PackInstalledName + + // List PVCs with the pack label to find which PVs to detach. + pvcList, err := params.TenantDynamicClient.Resource(pvcGVR).Namespace("").List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure, + fmt.Sprintf("list PVCs with label %s on cluster %s: %v", selector, params.ClusterRef, err)), nil + } + + // Build the set of PV names bound to these PVCs. + pvNames := make(map[string]struct{}, len(pvcList.Items)) + for _, pvc := range pvcList.Items { + spec, _, _ := unstructuredNestedMap(pvc.Object, "spec") + if pvName, _ := spec["volumeName"].(string); pvName != "" { + pvNames[pvName] = struct{}{} + } + } + + if len(pvNames) == 0 { + return runnerlib.OperationResultSpec{ + Capability: runnerlib.CapabilityForceVolumeDetach, + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Artifacts: []runnerlib.ArtifactRef{}, + Steps: []runnerlib.StepResult{{ + Name: "delete-volume-attachments", + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Message: fmt.Sprintf("no PVs bound to pack %s on cluster %s", params.PackInstalledName, params.ClusterRef), + }}, + }, nil + } + + // List all VolumeAttachments and delete those referencing our PVs. 
+ stepStart := time.Now().UTC() + vaList, err := params.TenantDynamicClient.Resource(volumeAttachmentGVR).List(ctx, metav1.ListOptions{}) + if err != nil { + return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure, + fmt.Sprintf("list VolumeAttachments on cluster %s: %v", params.ClusterRef, err)), nil + } + + deleted := 0 + for _, va := range vaList.Items { + spec, _, _ := unstructuredNestedMap(va.Object, "spec") + pvRef, _ := spec["source"].(map[string]interface{}) + pvName, _ := pvRef["persistentVolumeName"].(string) + if _, ok := pvNames[pvName]; !ok { + continue + } + if err := params.TenantDynamicClient.Resource(volumeAttachmentGVR).Delete(ctx, va.GetName(), metav1.DeleteOptions{}); err != nil { + return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure, + fmt.Sprintf("delete VolumeAttachment %s on cluster %s: %v", va.GetName(), params.ClusterRef, err)), nil + } + deleted++ + params.Log().Info("force-volume-detach: deleted VolumeAttachment", + "cluster", params.ClusterRef, "volumeAttachment", va.GetName(), "pv", pvName) + } + + return runnerlib.OperationResultSpec{ + Capability: runnerlib.CapabilityForceVolumeDetach, + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Artifacts: []runnerlib.ArtifactRef{}, + Steps: []runnerlib.StepResult{{ + Name: "delete-volume-attachments", + Status: runnerlib.ResultSucceeded, + StartedAt: stepStart, + CompletedAt: time.Now().UTC(), + Message: fmt.Sprintf("%d VolumeAttachment(s) deleted for pack %s on cluster %s", deleted, params.PackInstalledName, params.ClusterRef), + }}, + }, nil +} + +// --------------------------------------------------------------------------- +// credential-refresh +// --------------------------------------------------------------------------- + +// credentialRefreshHandler implements the credential-refresh named capability. 
+// Deletes pods bearing the seam.ontai.dev/pack-name label so the kubelet +// retries the image pull with up-to-date imagePullSecret credentials. +// Intended for ImagePullBackOff failures where the imagePullSecret has been +// refreshed out-of-band (e.g., by a secret rotation operator). +// conductor-schema.md §6, conductor-sdk CapabilityCredentialRefresh. +type credentialRefreshHandler struct{} + +func (h *credentialRefreshHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { + now := time.Now().UTC() + + if params.TenantKubeClient == nil { + return failureResult(runnerlib.CapabilityCredentialRefresh, now, runnerlib.ValidationFailure, + "credential-refresh requires TenantKubeClient"), nil + } + if params.PackInstalledName == "" { + return failureResult(runnerlib.CapabilityCredentialRefresh, now, runnerlib.ValidationFailure, + "credential-refresh requires PackInstalledName"), nil + } + + selector := packNameLabel + "=" + params.PackInstalledName + podList, err := params.TenantKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return failureResult(runnerlib.CapabilityCredentialRefresh, now, runnerlib.ExecutionFailure, + fmt.Sprintf("list pods with label %s on cluster %s: %v", selector, params.ClusterRef, err)), nil + } + + // Filter to pods in ImagePullBackOff or ErrImagePull state. 
+ stepStart := time.Now().UTC() + deleted := 0 + for _, pod := range podList.Items { + if !hasPullFailure(pod) { + continue + } + if err := params.TenantKubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { + return failureResult(runnerlib.CapabilityCredentialRefresh, now, runnerlib.ExecutionFailure, + fmt.Sprintf("delete pod %s/%s on cluster %s: %v", pod.Namespace, pod.Name, params.ClusterRef, err)), nil + } + deleted++ + params.Log().Info("credential-refresh: deleted pod for image pull retry", + "cluster", params.ClusterRef, "pod", pod.Namespace+"/"+pod.Name) + } + + return runnerlib.OperationResultSpec{ + Capability: runnerlib.CapabilityCredentialRefresh, + Status: runnerlib.ResultSucceeded, + StartedAt: now, + CompletedAt: time.Now().UTC(), + Artifacts: []runnerlib.ArtifactRef{}, + Steps: []runnerlib.StepResult{{ + Name: "delete-image-pull-failed-pods", + Status: runnerlib.ResultSucceeded, + StartedAt: stepStart, + CompletedAt: time.Now().UTC(), + Message: fmt.Sprintf("%d pod(s) deleted for image pull retry on cluster %s", deleted, params.ClusterRef), + }}, + }, nil +} + +// hasPullFailure returns true when any container in the pod is in ImagePullBackOff +// or ErrImagePull waiting state, indicating the pod needs a fresh pull attempt. 
+func hasPullFailure(pod corev1.Pod) bool { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil { + switch cs.State.Waiting.Reason { + case "ImagePullBackOff", "ErrImagePull": + return true + } + } + } + for _, ics := range pod.Status.InitContainerStatuses { + if ics.State.Waiting != nil { + switch ics.State.Waiting.Reason { + case "ImagePullBackOff", "ErrImagePull": + return true + } + } + } + return false +} diff --git a/internal/capability/watchdog_test.go b/internal/capability/watchdog_test.go new file mode 100644 index 0000000..66ecf98 --- /dev/null +++ b/internal/capability/watchdog_test.go @@ -0,0 +1,322 @@ +package capability + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + dynfake "k8s.io/client-go/dynamic/fake" + kubefake "k8s.io/client-go/kubernetes/fake" + + "github.com/ontai-dev/conductor-sdk/runnerlib" +) + +// setupWatchdogScheme returns a runtime.Scheme with enough type registrations for +// watchdog handler tests. Only the GVKs exercised in these tests need to be present. 
+func setupWatchdogScheme() *runtime.Scheme { + s := runtime.NewScheme() + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "storage.k8s.io", Version: "v1", Kind: "VolumeAttachment"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "storage.k8s.io", Version: "v1", Kind: "VolumeAttachmentList"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "", Version: "v1", Kind: "PersistentVolumeClaim"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "", Version: "v1", Kind: "PersistentVolumeClaimList"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "Deployment"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "DeploymentList"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "StatefulSet"}, &runtime.Unknown{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "StatefulSetList"}, &runtime.Unknown{}) + return s +} + +// TestPodRestartHandler_NilClient verifies that a nil TenantKubeClient returns +// a ValidationFailure without panicking. +func TestPodRestartHandler_NilClient(t *testing.T) { + h := &podRestartHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityPodRestart, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestPodRestartHandler_NilPackInstalledName verifies that a missing PackInstalledName +// returns a ValidationFailure. 
+func TestPodRestartHandler_NilPackInstalledName(t *testing.T) { + h := &podRestartHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityPodRestart, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantKubeClient: kubefake.NewSimpleClientset(), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestPodRestartHandler_DeletesPods verifies that Execute deletes pods bearing +// the pack-name label on the tenant cluster. +func TestPodRestartHandler_DeletesPods(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-abc", + Namespace: "default", + Labels: map[string]string{packNameLabel: "nginx"}, + }, + } + client := kubefake.NewSimpleClientset(pod) + h := &podRestartHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityPodRestart, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantKubeClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded, got %q; reason %+v", result.Status, result.FailureReason) + } + // Verify pod was deleted. + _, getErr := client.CoreV1().Pods("default").Get(context.Background(), "nginx-abc", metav1.GetOptions{}) + if getErr == nil { + t.Error("expected pod to be deleted but it still exists") + } +} + +// TestResourcePatchHandler_NilClient verifies that a nil TenantDynamicClient returns +// a ValidationFailure without panicking. 
+func TestResourcePatchHandler_NilClient(t *testing.T) { + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestResourcePatchHandler_NilPackInstalledName verifies that a missing PackInstalledName +// returns a ValidationFailure. +func TestResourcePatchHandler_NilPackInstalledName(t *testing.T) { + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantDynamicClient: dynfake.NewSimpleDynamicClient(setupWatchdogScheme()), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestResourcePatchHandler_EmptyCluster verifies that no Deployments with the pack +// label results in a Succeeded result (no-op is valid). 
+func TestResourcePatchHandler_EmptyCluster(t *testing.T) { + client := dynfake.NewSimpleDynamicClient(setupWatchdogScheme()) + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantDynamicClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded for empty cluster, got %q; reason %+v", result.Status, result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NilClient verifies that a nil TenantDynamicClient +// returns a ValidationFailure without panicking. +func TestForceVolumeDetachHandler_NilClient(t *testing.T) { + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NilPackInstalledName verifies that a missing +// PackInstalledName returns a ValidationFailure. 
+func TestForceVolumeDetachHandler_NilPackInstalledName(t *testing.T) { + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantDynamicClient: dynfake.NewSimpleDynamicClient(setupWatchdogScheme()), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NoPVCs verifies that when no PVCs match the pack +// label the result is Succeeded (no-op is valid). +func TestForceVolumeDetachHandler_NoPVCs(t *testing.T) { + client := dynfake.NewSimpleDynamicClient(setupWatchdogScheme()) + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantDynamicClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded for no PVCs, got %q; reason %+v", result.Status, result.FailureReason) + } +} + +// TestCredentialRefreshHandler_NilClient verifies that a nil TenantKubeClient +// returns a ValidationFailure without panicking. 
+func TestCredentialRefreshHandler_NilClient(t *testing.T) { + h := &credentialRefreshHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityCredentialRefresh, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestCredentialRefreshHandler_NilPackInstalledName verifies that a missing +// PackInstalledName returns a ValidationFailure. +func TestCredentialRefreshHandler_NilPackInstalledName(t *testing.T) { + h := &credentialRefreshHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityCredentialRefresh, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantKubeClient: kubefake.NewSimpleClientset(), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestCredentialRefreshHandler_DeletesPullFailedPods verifies that pods in +// ImagePullBackOff state are deleted and pods in other states are skipped. 
+func TestCredentialRefreshHandler_DeletesPullFailedPods(t *testing.T) { + pullFailPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-pull-fail", + Namespace: "default", + Labels: map[string]string{packNameLabel: "nginx"}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{{ + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}, + }, + }}, + }, + } + runningPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-running", + Namespace: "default", + Labels: map[string]string{packNameLabel: "nginx"}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{{ + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}, + }, + } + client := kubefake.NewSimpleClientset(pullFailPod, runningPod) + h := &credentialRefreshHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityCredentialRefresh, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantKubeClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded, got %q; reason %+v", result.Status, result.FailureReason) + } + // pull-fail pod must be deleted. + _, getErr := client.CoreV1().Pods("default").Get(context.Background(), "nginx-pull-fail", metav1.GetOptions{}) + if getErr == nil { + t.Error("expected pull-fail pod to be deleted but it still exists") + } + // running pod must be preserved. 
+ _, getErr = client.CoreV1().Pods("default").Get(context.Background(), "nginx-running", metav1.GetOptions{}) + if getErr != nil { + t.Errorf("expected running pod to be preserved but got: %v", getErr) + } +} diff --git a/test/unit/compiler/wrapper_runner_rbac_test.go b/test/unit/compiler/wrapper_runner_rbac_test.go index 651298d..fc11c5b 100644 --- a/test/unit/compiler/wrapper_runner_rbac_test.go +++ b/test/unit/compiler/wrapper_runner_rbac_test.go @@ -203,3 +203,50 @@ func TestDispatcherRunnerRole_NotGeneratedWithoutClusterName(t *testing.T) { t.Errorf("dispatcher-runner.yaml was generated without --cluster-name; must not be present") } } + +// TestWatchdogQueueYAML_EmittedInPostBootstrap verifies that watchdog-queue.yaml is +// generated in 05-post-bootstrap when --cluster-name is provided. +func TestWatchdogQueueYAML_EmittedInPostBootstrap(t *testing.T) { + bin := buildCompiler(t) + out := t.TempDir() + cmd := exec.Command(bin, "enable", "--cluster-name", "ccs-mgmt", "--output", out) + if output, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("compiler enable failed: %v\n%s", err, output) + } + + path := filepath.Join(out, "05-post-bootstrap", "watchdog-queue.yaml") + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("watchdog-queue.yaml not generated: %v", err) + } + content := string(raw) + + if !strings.Contains(content, "watchdog-queue") { + t.Error("watchdog-queue.yaml does not contain 'watchdog-queue' name") + } + if !strings.Contains(content, "ont-system") { + t.Error("watchdog-queue.yaml is not scoped to ont-system namespace") + } + if !strings.Contains(content, "seam-pack-deploy") { + t.Error("watchdog-queue.yaml does not reference ClusterQueue seam-pack-deploy") + } + if !strings.Contains(content, "LocalQueue") { + t.Error("watchdog-queue.yaml is not a LocalQueue kind") + } +} + +// TestWatchdogQueueYAML_NotGeneratedWithoutClusterName verifies that watchdog-queue.yaml +// is NOT generated when --cluster-name is absent. 
+func TestWatchdogQueueYAML_NotGeneratedWithoutClusterName(t *testing.T) { + bin := buildCompiler(t) + out := t.TempDir() + cmd := exec.Command(bin, "enable", "--output", out) + if output, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("compiler enable failed: %v\n%s", err, output) + } + + path := filepath.Join(out, "05-post-bootstrap", "watchdog-queue.yaml") + if _, err := os.Stat(path); err == nil { + t.Errorf("watchdog-queue.yaml was generated without --cluster-name; must not be present") + } +} From 7295045fb085803e494433bf1c0042825049e4c2 Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 30 May 2026 15:27:01 +0200 Subject: [PATCH 10/15] feat(conductor): PLT-BUG-3-ARCH single-node targeting via MC_NODE_IP When MC_NODE_IP env var is set, machineconfig-sync capability targets only that node IP rather than all nodes from talosconfig. Supports machineconfig.yaml data key (compiler per-node secret format) as fallback alongside gzip-compressed machineconfig key. --- .../capability/platform_machineconfig_sync.go | 41 ++++++-- .../platform_machineconfig_sync_test.go | 93 +++++++++++++++++++ 2 files changed, 127 insertions(+), 7 deletions(-) diff --git a/internal/capability/platform_machineconfig_sync.go b/internal/capability/platform_machineconfig_sync.go index 640d79c..5a5976f 100644 --- a/internal/capability/platform_machineconfig_sync.go +++ b/internal/capability/platform_machineconfig_sync.go @@ -35,6 +35,17 @@ const machineConfigCompressionLabel = "platform.ontai.dev/compression" // Must match envMCNodeClass in platform/internal/controller/machineconfigsync_reconciler.go. const envMCSyncNodeClass = "MC_NODE_CLASS" +// envMCSyncNodeIP is the env var key injected when MachineConfigSync.spec.nodeRef is set. +// When present, the capability applies the machineconfig to only this specific node IP. +// Must match envMCNodeIP in platform/internal/controller/machineconfigsync_reconciler.go. +// PLT-BUG-3-ARCH. 
+const envMCSyncNodeIP = "MC_NODE_IP" + +// machineConfigSyncDataKeyYAML is the fallback Secret data key for compiler-generated +// per-node secrets. Mirrors MachineConfigDataKeyYAML in platform machineconfig_labels.go. +// PLT-BUG-3-ARCH. +const machineConfigSyncDataKeyYAML = "machineconfig.yaml" + // machineConfigSyncSecretNamespace returns the namespace that holds the source-of-truth Secret. func machineConfigSyncSecretNamespace(clusterRef string) string { return "seam-tenant-" + clusterRef @@ -72,6 +83,10 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa "machineconfig-sync: MC_NODE_CLASS env var is required but not set"), nil } + // MC_NODE_IP is set by the MachineConfigSync reconciler when spec.nodeRef is + // non-empty. When present, apply to only this specific node. PLT-BUG-3-ARCH. + nodeIP := os.Getenv(envMCSyncNodeIP) + clusterRef := params.ClusterRef secretNS := machineConfigSyncSecretNamespace(clusterRef) secretName := machineConfigSyncSecretName(clusterRef, nodeClass) @@ -83,14 +98,21 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa fmt.Sprintf("get MachineConfig Secret %s/%s: %v", secretNS, secretName, err)), nil } + // Try the primary data key first; fall back to the compiler-generated key. PLT-BUG-3-ARCH. mcBytes := secret.Data[machineConfigSyncDataKey] + usingYAMLKey := false + if len(mcBytes) == 0 { + mcBytes = secret.Data[machineConfigSyncDataKeyYAML] + usingYAMLKey = true + } if len(mcBytes) == 0 { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - fmt.Sprintf("MachineConfig Secret %s/%s has no data key %q", secretNS, secretName, machineConfigSyncDataKey)), nil + fmt.Sprintf("MachineConfig Secret %s/%s has no data key %q or %q", secretNS, secretName, machineConfigSyncDataKey, machineConfigSyncDataKeyYAML)), nil } // Decompress if the secret was stored with gzip compression. RECON-F5. 
- if secret.Labels[machineConfigCompressionLabel] == "gzip" { + // Compiler-generated secrets are not gzip-compressed (usingYAMLKey == true). + if !usingYAMLKey && secret.Labels[machineConfigCompressionLabel] == "gzip" { decompressed, dcErr := decompressMachineConfig(mcBytes) if dcErr != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, @@ -116,9 +138,12 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa fmt.Sprintf("merged machineconfig is not valid YAML: %v", err)), nil } - // Enumerate nodes from talosconfig; fall back to single-context when absent. + // When MC_NODE_IP is set (PLT-BUG-3-ARCH per-node MCS), apply to only that + // specific node. Skip talosconfig endpoint enumeration. var nodeIPs []string - if params.TalosconfigPath != "" { + if nodeIP != "" { + nodeIPs = []string{nodeIP} + } else if params.TalosconfigPath != "" { ips, epErr := EndpointsFromTalosconfig(params.TalosconfigPath) if epErr != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, @@ -127,9 +152,11 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa nodeIPs = ips } - // singleNodeClass is true when nodeClass is already a per-node class (e.g. "node-cp1"). - // In that mode, the base secret IS the per-node config; skip per-node patch lookup. - singleNodeClass := strings.HasPrefix(nodeClass, "node-") + // singleNodeClass is true when: + // - nodeClass starts with "node-" (per-node class secret), or + // - MC_NODE_IP is set (per-node targeting, compiler secret is already per-node). + // In single-node mode, the secret IS the complete per-node config; skip patch lookup. 
+ singleNodeClass := strings.HasPrefix(nodeClass, "node-") || nodeIP != "" var steps []runnerlib.StepResult if len(nodeIPs) > 0 { diff --git a/internal/capability/platform_machineconfig_sync_test.go b/internal/capability/platform_machineconfig_sync_test.go index 16ada2b..c5802ad 100644 --- a/internal/capability/platform_machineconfig_sync_test.go +++ b/internal/capability/platform_machineconfig_sync_test.go @@ -401,6 +401,99 @@ func TestMachineConfigSyncHandler_SingleNodeClass(t *testing.T) { } } +// TestMachineConfigSyncHandler_NodeIPTargetsSingleNode verifies that when MC_NODE_IP +// is set, the capability applies the machineconfig to only that one node and skips +// talosconfig endpoint enumeration. PLT-BUG-3-ARCH. +func TestMachineConfigSyncHandler_NodeIPTargetsSingleNode(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") + + content := []byte("version: v1alpha1\nmachine:\n type: controlplane\n") + secret := mcSyncTestSecret("ccs-dev", "cp1", content) + // Provide a talosconfig with multiple nodes -- only the MC_NODE_IP node should be targeted. + talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12", "10.20.0.13"}) + + kubeClient := fake.NewSimpleClientset(secret) + talosClient := &stubApplyTalosClient{} + + handler := &machineConfigSyncHandler{} + result, err := handler.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityMachineConfigSync, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TalosClient: talosClient, + KubeClient: kubeClient, + TalosconfigPath: talosconfigPath, + }, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) + } + // Must apply to exactly 1 node (not 3). 
+ if len(talosClient.applied) != 1 { + t.Fatalf("expected 1 ApplyConfiguration call (MC_NODE_IP single-target), got %d", len(talosClient.applied)) + } + if len(result.Steps) != 1 { + t.Errorf("expected 1 step result, got %d", len(result.Steps)) + } + if !containsString(result.Steps[0].Message, "10.20.0.11") { + t.Errorf("step message must reference nodeIP 10.20.0.11, got %q", result.Steps[0].Message) + } +} + +// TestMachineConfigSyncHandler_YAMLKeyFallback verifies that when the machineconfig +// Secret uses the compiler-generated "machineconfig.yaml" key (no "machineconfig" key), +// the capability still reads and applies the config. PLT-BUG-3-ARCH. +func TestMachineConfigSyncHandler_YAMLKeyFallback(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") + + content := []byte("version: v1alpha1\nmachine:\n type: controlplane\n") + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "seam-mc-ccs-dev-cp1", + Namespace: "seam-tenant-ccs-dev", + Labels: map[string]string{ + "ontai.dev/managed-by": "compiler", + "ontai.dev/cluster": "ccs-dev", + }, + }, + Data: map[string][]byte{ + // Only machineconfig.yaml key present -- no "machineconfig" key. 
+ "machineconfig.yaml": content, + }, + } + kubeClient := fake.NewSimpleClientset(secret) + talosClient := &stubApplyTalosClient{} + + handler := &machineConfigSyncHandler{} + result, err := handler.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityMachineConfigSync, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TalosClient: talosClient, + KubeClient: kubeClient, + }, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Fatalf("expected ResultSucceeded with yaml key fallback, got %q; reason: %v", result.Status, result.FailureReason) + } + if len(talosClient.applied) != 1 { + t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) + } + if !containsString(string(talosClient.applied[0]), ontControlledLabel) { + t.Errorf("ONT controlled label must be injected even with yaml key fallback") + } +} + // TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase verifies that when no // per-node patch secret exists, the base class config is applied unchanged. RECON-A8. func TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase(t *testing.T) { From 551a07b2a3b03869f8aed974653df7fbeb2073d2 Mon Sep 17 00:00:00 2001 From: ontave Date: Sat, 30 May 2026 21:31:12 +0200 Subject: [PATCH 11/15] fix(conductor): PLT-BUG-4 PackReceipt kind, CG-8 MGMT_KUBECONFIG_PATH, CG-9 dispatcher-runner for tenant, pre-existing API group in test PLT-BUG-4: packinstance_pull_loop.go used pre-migration kind name InfrastructurePackReceipt; API server requires PackReceipt. CG-8: buildOperatorDeployment now adds MGMT_KUBECONFIG_PATH env var and conductor-mgmt-kubeconfig volume to tenant conductor Deployments. Without this env var the gate in agent.go silently disables every drift loop that requires management cluster access (TalosVersionDrift, KubernetesVersionDrift, PackReceiptDrift, PackPodHealth, OperatorContext loops). 
CG-9: writePhase5PostBootstrap now generates pack-deploy-queue.yaml and dispatcher-runner.yaml for all cluster roles including tenant. watchdog-queue.yaml remains management-cluster-only. The enable script applies these files to ccs-mgmt (where seam-tenant-ccs-dev resources live), not to ccs-dev. Test: capability_publisher_test.go updated from pre-migration infrastructure.ontai.dev/ infrastructurerunnerconfigs to current seam.ontai.dev/runnerconfigs. All unit tests pass. --- cmd/compiler/compile_enable.go | 60 ++++++++++++++++---- internal/agent/packinstance_pull_loop.go | 2 +- test/unit/agent/capability_publisher_test.go | 18 +++--- 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index d72c175..14c8b2b 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -2062,10 +2062,19 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa files := []string{ "leaderelection.yaml", } - // pack-deploy-queue.yaml, watchdog-queue.yaml, and dispatcher-runner.yaml require Kueue and - // seam-tenant-{name} namespaces, which exist only on the management cluster (INV-003). + if clusterName != "" { + // pack-deploy-queue.yaml and dispatcher-runner.yaml are required for both + // management and tenant cluster enable bundles. These resources live in + // seam-tenant-{clusterName} on the management cluster and must be applied to + // the management cluster (not the tenant cluster). The enable script is + // responsible for routing these files to the correct kubectl context. + files = append(files, "pack-deploy-queue.yaml", "dispatcher-runner.yaml") + } if clusterName != "" && clusterRole != "tenant" { - files = append(files, "pack-deploy-queue.yaml", "watchdog-queue.yaml", "dispatcher-runner.yaml") + // watchdog-queue.yaml is management-cluster-only: the conductor watchdog + // submits remediation Jobs in ont-system, which only exists on ccs-mgmt. 
+ // conductor-schema.md §6 RuntimeDrift remediation. + files = append(files, "watchdog-queue.yaml") } meta := phaseMeta{ @@ -2084,23 +2093,26 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa return err } - // Kueue and seam-tenant-{name} resources are management-cluster-only (INV-003). - if clusterName != "" && clusterRole != "tenant" { - // pack-deploy-queue.yaml — Kueue LocalQueue in seam-tenant-{clusterName}. - // wrapper-schema.md §9 pack delivery chain. + if clusterName != "" { + // pack-deploy-queue.yaml — Kueue LocalQueue in seam-tenant-{clusterName} on the + // management cluster. Required for pack-deploy Job admission for any cluster. + // dispatcher-schema.md §9, conductor-schema.md §5 (execute mode). if err := writePackDeployQueueYAML(dir, clusterName); err != nil { return err } + // dispatcher-runner.yaml — SA, Role, RoleBinding for the pack-deploy Job identity + // in seam-tenant-{clusterName} on the management cluster. Required for both + // management and tenant cluster PackExecution RBAC gates. INV-004. + if err := writeDispatcherRunnerRBACYAML(dir, clusterName); err != nil { + return err + } + } + if clusterName != "" && clusterRole != "tenant" { // watchdog-queue.yaml — Kueue LocalQueue in ont-system for watchdog Jobs. // conductor-schema.md §6 RuntimeDrift remediation. if err := writeWatchdogQueueYAML(dir); err != nil { return err } - // dispatcher-runner.yaml — SA, Role, RoleBinding for pack-deploy Job identity. - // guardian-schema.md §6, INV-004. - if err := writeDispatcherRunnerRBACYAML(dir, clusterName); err != nil { - return err - } } return nil @@ -2518,6 +2530,30 @@ func buildOperatorDeployment(op operatorSpec) appsv1.Deployment { }) } + // Tenant conductor mounts the management cluster kubeconfig so all target-cluster + // drift loops (TalosVersionDriftLoop, KubernetesVersionDriftLoop, PackPodHealthLoop, + // PackReceiptDriftLoop, etc.) can write DriftSignals and read PackInstalled on + // ccs-mgmt. 
Without MGMT_KUBECONFIG_PATH the gate in agent.go silently disables + // every loop that requires management cluster access. conductor-schema.md §15. + if op.Name == "conductor" && op.Role == "tenant" { + env = append(env, + corev1.EnvVar{Name: "MGMT_KUBECONFIG_PATH", Value: "/etc/conductor/mgmt/kubeconfig"}, + ) + volumes = append(volumes, corev1.Volume{ + Name: "conductor-mgmt-kubeconfig", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: "conductor-mgmt-kubeconfig", + }, + }, + }) + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: "conductor-mgmt-kubeconfig", + MountPath: "/etc/conductor/mgmt", + ReadOnly: true, + }) + } + // Platform, Dispatcher, and seam carry OPERATOR_NAMESPACE so their webhook // servers and controllers can resolve their own namespace without downward API // duplication. OPERATOR_NAMESPACE is also required by Guardian admission hooks diff --git a/internal/agent/packinstance_pull_loop.go b/internal/agent/packinstance_pull_loop.go index c841c7d..1f62fb2 100644 --- a/internal/agent/packinstance_pull_loop.go +++ b/internal/agent/packinstance_pull_loop.go @@ -356,7 +356,7 @@ func (l *PackInstancePullLoop) upsertPackReceipt( receipt := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "seam.ontai.dev/v1alpha1", - "kind": "InfrastructurePackReceipt", + "kind": "PackReceipt", "metadata": map[string]interface{}{ "name": receiptName, "namespace": l.namespace, diff --git a/test/unit/agent/capability_publisher_test.go b/test/unit/agent/capability_publisher_test.go index a4b2f55..accc6d6 100644 --- a/test/unit/agent/capability_publisher_test.go +++ b/test/unit/agent/capability_publisher_test.go @@ -18,9 +18,9 @@ import ( ) var runnerConfigGVR = schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", - Version: "v1alpha1", - Resource: "infrastructurerunnerconfigs", + Group: "seam.ontai.dev", + Version: "v1alpha1", + Resource: "runnerconfigs", } // makeRunnerConfig 
constructs an Unstructured RunnerConfig with optional capabilities @@ -28,7 +28,7 @@ var runnerConfigGVR = schema.GroupVersionResource{ func makeRunnerConfig(name, namespace string, hasCaps bool) *unstructured.Unstructured { obj := &unstructured.Unstructured{ Object: map[string]interface{}{ - "apiVersion": "infrastructure.ontai.dev/v1alpha1", + "apiVersion": "seam.ontai.dev/v1alpha1", "kind": "RunnerConfig", "metadata": map[string]interface{}{ "name": name, @@ -55,15 +55,15 @@ func newFakeDynamicClient(scheme *runtime.Scheme) *dynamicfake.FakeDynamicClient // Register the RunnerConfig GVR in the RESTMapper by adding it to the scheme. // dynamicfake uses the scheme to resolve GVKs; we add a dummy unstructured type. gvk := schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", } scheme.AddKnownTypeWithName(gvk, &runtime.Unknown{}) gvkList := schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Kind: "InfrastructureRunnerConfigList", + Kind: "RunnerConfigList", } scheme.AddKnownTypeWithName(gvkList, &runtime.Unknown{}) _ = meta.NewDefaultRESTMapper(nil) @@ -137,11 +137,11 @@ func TestCapabilityPublisher_ConstructsWithoutPanic(t *testing.T) { // fake tracker knows the list kind mapping. 
func newAllFakeDynamicClient(scheme *runtime.Scheme) *dynamicfake.FakeDynamicClient { scheme.AddKnownTypeWithName(schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", }, &unstructured.Unstructured{}) _ = meta.NewDefaultRESTMapper(nil) return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, - map[schema.GroupVersionResource]string{runnerConfigGVR: "InfrastructureRunnerConfigList"}, + map[schema.GroupVersionResource]string{runnerConfigGVR: "RunnerConfigList"}, ) } From f86e4b69faad667b9f25f222ae52d6bfeb60fef6 Mon Sep 17 00:00:00 2001 From: ontave Date: Mon, 1 Jun 2026 07:33:44 +0200 Subject: [PATCH 12/15] feat(conductor): MachineConfig CRD migration -- Phases 3a/3b/4a/4b Compiler now generates MachineConfig CRs (platform.ontai.dev/v1alpha1) instead of Secrets. Adds addnode subcommand for post-bootstrap node template generation. machineconfig-sync reads MachineConfig CRs via DynamicClient. Upgrade capability uses powercycle reboot and orders nodes ascending by spec.order from MachineConfig CRs. 
- compiler bootstrap: buildMachineConfigCR replaces buildMachineConfigSecret; MachineConfig CR YAML output with typed fields (role, order, nodeIP, nodeHostname, clusterRef) and unstructured spec.machine/spec.cluster - compiler addnode: new subcommand generates MachineConfig CR template with --existing-cr cloning or skeleton placeholder output - machineconfig-sync: DynamicClient fetch of MachineConfig CR, reconstructMachineConfigYAML splits machine/cluster sections; machineConfigSyncDataKey constant extracted to shared file - platform_upgrade: nodesFromMachineConfigCRs lists and sorts CRs ascending by spec.order; falls back to TalosClient.Nodes(); RebootPowercycle replaces Reboot for post-stage node cycling - TalosNodeClient: RebootPowercycle added to interface; all stubs updated - 24 new tests across addnode, machineconfig-sync, and upgrade packages --- cmd/compiler/addnode.go | 210 +++++++ cmd/compiler/addnode_test.go | 191 +++++++ cmd/compiler/compile.go | 429 +++++++------- .../compile_bootstrap_features_test.go | 23 +- cmd/compiler/compile_bootstrap_import_test.go | 133 +++-- cmd/compiler/compile_bootstrap_test.go | 27 +- cmd/compiler/main.go | 3 + go.mod | 2 +- internal/capability/adapters.go | 6 + internal/capability/clients.go | 5 + internal/capability/platform_cluster_test.go | 3 +- .../platform_machineconfig_constants.go | 9 + .../capability/platform_machineconfig_sync.go | 180 ++---- .../platform_machineconfig_sync_test.go | 537 ++++++++---------- .../capability/platform_machineconfig_test.go | 6 +- .../capability/platform_node_scaleup_test.go | 3 +- internal/capability/platform_postop_test.go | 3 +- internal/capability/platform_upgrade.go | 54 +- internal/capability/platform_upgrade_test.go | 140 ++++- test/unit/capability/platform_test.go | 5 + 20 files changed, 1230 insertions(+), 739 deletions(-) create mode 100644 cmd/compiler/addnode.go create mode 100644 cmd/compiler/addnode_test.go create mode 100644 
internal/capability/platform_machineconfig_constants.go diff --git a/cmd/compiler/addnode.go b/cmd/compiler/addnode.go new file mode 100644 index 0000000..98d885c --- /dev/null +++ b/cmd/compiler/addnode.go @@ -0,0 +1,210 @@ +// addnode.go implements the compiler addnode subcommand for generating a new +// MachineConfig CR for a node being added to an existing cluster. +// +// Usage: +// +// compiler addnode --cluster --hostname --ip --role +// [--order ] [--existing-cr ] --output +// +// When --existing-cr is given, the machine and cluster config sections are +// copied from the specified MachineConfig CR with identity fields overridden. +// When absent, a skeleton CR is emitted with empty machine and cluster stubs. +// +// conductor-schema.md §9. platform-schema.md §9. +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" + + corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/yaml" + + platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" +) + +// addnodeHelp is the authored per-subcommand help for 'compiler addnode'. +const addnodeHelp = `Usage: compiler addnode --cluster --hostname --ip --role --output + [--order ] [--existing-cr ] + +Generate a MachineConfig CR for a node being added to an existing cluster. +The output CR is placed in the --output directory as seam-mc-{cluster}-{hostname}.yaml. + +Flags: + --cluster Cluster name (matches the TalosCluster CR name and seam-tenant-{cluster} namespace). + --hostname Node hostname. The cluster-name prefix is stripped automatically if present, + so both "cp4" and "ccs-dev-cp4" produce seam-mc-{cluster}-cp4. + --ip Node IP address reachable on Talos API port 50000. + --role Node role: controlplane or worker. (init is reserved for compiler bootstrap.) + --order Upgrade sequence order (default: 1). init=0, controlplane=1..N, worker=N+1..M. 
+ --existing-cr Path to an existing MachineConfig CR YAML. When provided, spec.machine and + spec.cluster are copied from the existing CR and identity fields are overridden + with the flags above. Use to clone an existing node config for a new peer. + --output Output directory for the generated MachineConfig CR YAML (required). + +When --existing-cr is absent, a skeleton CR is emitted with empty machine and cluster sections. +Populate those sections with the Talos v1alpha1 machineconfig content before applying. + +Compile-only: compiler addnode never applies resources. Human review and GitOps apply required. +` + +// compileAddNode generates a MachineConfig CR for a node being added to an existing cluster. +// clusterName is the TalosCluster name. hostname may include the cluster-name prefix -- +// it is stripped before constructing the CR name. role must be "controlplane" or "worker" +// (init is managed exclusively by compiler bootstrap). order is the upgrade sequence position. +// existingCRPath, when non-empty, is read to copy spec.machine and spec.cluster. output is the +// directory receiving seam-mc-{cluster}-{bareHostname}.yaml. +func compileAddNode(clusterName, hostname, ip, role string, order int32, existingCRPath, output string) error { + if clusterName == "" { + return fmt.Errorf("--cluster is required") + } + if hostname == "" { + return fmt.Errorf("--hostname is required") + } + if ip == "" { + return fmt.Errorf("--ip is required") + } + switch role { + case "controlplane", "worker": + case "init": + return fmt.Errorf("role=init is reserved for compiler bootstrap; use controlplane or worker") + default: + return fmt.Errorf("--role must be controlplane or worker, got %q", role) + } + if output == "" { + return fmt.Errorf("--output is required") + } + + // Strip cluster-name prefix from hostname so seam-mc-{cluster}-{hostname} is not doubled. 
+ bareHostname := stripClusterPrefix(clusterName, hostname) + + mcRole := platformv1alpha1.MachineConfigRoleControlPlane + if role == "worker" { + mcRole = platformv1alpha1.MachineConfigRoleWorker + } + + var machineJSON, clusterJSON *apiextensionsv1.JSON + if existingCRPath != "" { + m, c, err := loadMachineClusterFromCR(existingCRPath) + if err != nil { + return fmt.Errorf("read existing CR %q: %w", existingCRPath, err) + } + machineJSON = m + clusterJSON = c + } + + crName := "seam-mc-" + clusterName + "-" + bareHostname + mc := platformv1alpha1.MachineConfig{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "platform.ontai.dev/v1alpha1", + Kind: "MachineConfig", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: crName, + Namespace: "seam-tenant-" + clusterName, + Labels: map[string]string{ + "ontai.dev/cluster": clusterName, + "ontai.dev/node": hostname, + "ontai.dev/node-role": role, + "ontai.dev/managed-by": "compiler", + }, + }, + Spec: platformv1alpha1.MachineConfigSpec{ + Role: mcRole, + Order: order, + ClusterRef: corev1.LocalObjectReference{ + Name: clusterName, + }, + NodeIP: ip, + NodeHostname: bareHostname, + Machine: machineJSON, + Cluster: clusterJSON, + }, + } + + data, err := yaml.Marshal(mc) + if err != nil { + return fmt.Errorf("marshal MachineConfig CR: %w", err) + } + + var header string + if existingCRPath == "" { + header = "# MachineConfig CR skeleton generated by compiler addnode.\n" + + "# Populate spec.machine and spec.cluster with the Talos v1alpha1\n" + + "# machineconfig sections for this node before applying.\n" + + "# Refer to: https://www.talos.dev/latest/reference/configuration/\n" + } + + if err := os.MkdirAll(output, 0755); err != nil { + return fmt.Errorf("create output directory %q: %w", output, err) + } + outPath := filepath.Join(output, crName+".yaml") + if err := os.WriteFile(outPath, []byte(header+string(data)), 0644); err != nil { + return fmt.Errorf("write MachineConfig CR %q: %w", outPath, err) + } + return nil +} + +// 
stripClusterPrefix strips the "{clusterName}-" prefix from hostname if present. +// e.g. stripClusterPrefix("ccs-dev", "ccs-dev-cp4") → "cp4" +// +// stripClusterPrefix("ccs-dev", "cp4") → "cp4" +func stripClusterPrefix(clusterName, hostname string) string { + prefix := clusterName + "-" + if len(hostname) > len(prefix) && hostname[:len(prefix)] == prefix { + return hostname[len(prefix):] + } + return hostname +} + +// loadMachineClusterFromCR reads a MachineConfig CR YAML file and returns the +// spec.machine and spec.cluster sections. Used by addnode to clone the config +// body from an existing peer node. +func loadMachineClusterFromCR(path string) (*apiextensionsv1.JSON, *apiextensionsv1.JSON, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, nil, fmt.Errorf("read file: %w", err) + } + + var cr struct { + Spec struct { + Machine *apiextensionsv1.JSON `json:"machine" yaml:"machine"` + Cluster *apiextensionsv1.JSON `json:"cluster" yaml:"cluster"` + } `json:"spec" yaml:"spec"` + } + if err := yaml.Unmarshal(data, &cr); err != nil { + return nil, nil, fmt.Errorf("parse MachineConfig CR: %w", err) + } + return cr.Spec.Machine, cr.Spec.Cluster, nil +} + +// runAddNodeSubcommand parses addnode-specific flags and calls compileAddNode. 
+func runAddNodeSubcommand(args []string) { + fs := flag.NewFlagSet("addnode", flag.ExitOnError) + cluster := fs.String("cluster", "", "Cluster name (required)") + hostname := fs.String("hostname", "", "Node hostname (required)") + ip := fs.String("ip", "", "Node IP address (required)") + role := fs.String("role", "", "Node role: controlplane or worker (required)") + order := fs.Int("order", 1, "Upgrade sequence order (default: 1)") + existingCR := fs.String("existing-cr", "", "Path to existing MachineConfig CR to clone machine/cluster sections from") + output := fs.String("output", "", "Output directory (required)") + + fs.Usage = func() { + fmt.Fprint(os.Stderr, addnodeHelp) + fs.PrintDefaults() + } + + if err := fs.Parse(args); err != nil { + fmt.Fprintf(os.Stderr, "compiler addnode: flag error: %v\n", err) + os.Exit(1) + } + + if err := compileAddNode(*cluster, *hostname, *ip, *role, int32(*order), *existingCR, *output); err != nil { + fmt.Fprintf(os.Stderr, "compiler addnode: %v\n", err) + os.Exit(1) + } +} diff --git a/cmd/compiler/addnode_test.go b/cmd/compiler/addnode_test.go new file mode 100644 index 0000000..9fdfefe --- /dev/null +++ b/cmd/compiler/addnode_test.go @@ -0,0 +1,191 @@ +// addnode_test.go tests the compiler addnode subcommand. +// Covers skeleton generation (no --existing-cr) and CR cloning +// (--existing-cr copies machine/cluster sections, overrides identity fields). +// All tests are fully offline -- no cluster connectivity. +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "sigs.k8s.io/yaml" +) + +// TestAddNode_SkeletonOutput verifies that addnode without --existing-cr produces +// a valid MachineConfig CR YAML with correct metadata and an empty machine/cluster section. 
+func TestAddNode_SkeletonOutput(t *testing.T) { + outDir := t.TempDir() + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "controlplane", 3, "", outDir) + if err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + + outPath := filepath.Join(outDir, "seam-mc-ccs-dev-cp4.yaml") + data, err := os.ReadFile(outPath) + if err != nil { + t.Fatalf("output file not found: %v", err) + } + content := string(data) + + assertContainsStr(t, content, "apiVersion: platform.ontai.dev/v1alpha1") + assertContainsStr(t, content, "kind: MachineConfig") + assertContainsStr(t, content, "name: seam-mc-ccs-dev-cp4") + assertContainsStr(t, content, "namespace: seam-tenant-ccs-dev") + assertContainsStr(t, content, "role: controlplane") + assertContainsStr(t, content, "nodeIP: 10.20.0.14") + assertContainsStr(t, content, "nodeHostname: cp4") + assertContainsStr(t, content, "ontai.dev/cluster: ccs-dev") + // Skeleton header comment must be present. + assertContainsStr(t, content, "# MachineConfig CR skeleton generated by compiler addnode.") +} + +// TestAddNode_SkeletonStripsClusterPrefix verifies that a hostname already containing +// the cluster-name prefix is not doubled in the output CR name. +func TestAddNode_SkeletonStripsClusterPrefix(t *testing.T) { + outDir := t.TempDir() + // hostname includes cluster prefix -- should produce seam-mc-ccs-dev-cp4, not seam-mc-ccs-dev-ccs-dev-cp4. 
+ err := compileAddNode("ccs-dev", "ccs-dev-cp4", "10.20.0.14", "controlplane", 3, "", outDir) + if err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + + outPath := filepath.Join(outDir, "seam-mc-ccs-dev-cp4.yaml") + if _, err := os.Stat(outPath); err != nil { + t.Errorf("expected output file seam-mc-ccs-dev-cp4.yaml not found: %v", err) + } + data, _ := os.ReadFile(outPath) + if strings.Contains(string(data), "ccs-dev-ccs-dev") { + t.Errorf("hostname prefix was doubled in output: %s", string(data)) + } +} + +// TestAddNode_OrderField verifies that the spec.order field is written correctly. +func TestAddNode_OrderField(t *testing.T) { + outDir := t.TempDir() + if err := compileAddNode("ccs-dev", "wk1", "10.20.0.20", "worker", 5, "", outDir); err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + data, _ := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-dev-wk1.yaml")) + assertContainsStr(t, string(data), "order: 5") + assertContainsStr(t, string(data), "role: worker") +} + +// TestAddNode_InitRoleFails verifies that role=init is rejected (reserved for bootstrap). +func TestAddNode_InitRoleFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp1", "10.20.0.11", "init", 0, "", t.TempDir()) + if err == nil { + t.Error("expected error for role=init; got nil") + } + if !strings.Contains(err.Error(), "init") { + t.Errorf("error %q should mention 'init'", err.Error()) + } +} + +// TestAddNode_InvalidRoleFails verifies that an unknown role is rejected. +func TestAddNode_InvalidRoleFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "not-a-role", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for unknown role; got nil") + } +} + +// TestAddNode_MissingClusterFails verifies that an empty --cluster is rejected. 
+func TestAddNode_MissingClusterFails(t *testing.T) { + err := compileAddNode("", "cp4", "10.20.0.14", "controlplane", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for missing cluster; got nil") + } +} + +// TestAddNode_MissingIPFails verifies that an empty --ip is rejected. +func TestAddNode_MissingIPFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "", "controlplane", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for missing ip; got nil") + } +} + +// TestAddNode_ExistingCR_CopiesMachineCluster verifies that when --existing-cr is +// provided, spec.machine and spec.cluster are copied from the existing CR and all +// identity fields are overridden with the supplied flags. +func TestAddNode_ExistingCR_CopiesMachineCluster(t *testing.T) { + // Build a bootstrap output to get a real MachineConfig CR as the source. + bootstrapDir := t.TempDir() + inputPath := writeInputFile(t, bootstrapInputYAML) + if err := compileBootstrap(inputPath, bootstrapDir, "", ""); err != nil { + t.Fatalf("compileBootstrap error: %v", err) + } + + // Use node1's MachineConfig CR as the existing-cr template. + existingCRPath := filepath.Join(bootstrapDir, "seam-mc-ccs-mgmt-node1.yaml") + if _, err := os.Stat(existingCRPath); err != nil { + t.Fatalf("existing CR not found: %v", err) + } + + outDir := t.TempDir() + err := compileAddNode("ccs-mgmt", "node4", "10.20.0.14", "controlplane", 3, existingCRPath, outDir) + if err != nil { + t.Fatalf("compileAddNode with existing-cr error: %v", err) + } + + data, err := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node4.yaml")) + if err != nil { + t.Fatalf("output file not found: %v", err) + } + content := string(data) + + // Identity fields must be overridden. 
+ assertContainsStr(t, content, "name: seam-mc-ccs-mgmt-node4") + assertContainsStr(t, content, "namespace: seam-tenant-ccs-mgmt") + assertContainsStr(t, content, "nodeIP: 10.20.0.14") + assertContainsStr(t, content, "nodeHostname: node4") + assertContainsStr(t, content, "role: controlplane") + assertContainsStr(t, content, "order: 3") + + // Machine/cluster sections must be present (copied from source CR). + var cr map[string]interface{} + if err := yaml.Unmarshal(data, &cr); err != nil { + t.Fatalf("parse output CR: %v", err) + } + spec, _ := cr["spec"].(map[string]interface{}) + if spec == nil { + t.Fatal("output CR has no spec") + } + if spec["machine"] == nil { + t.Error("spec.machine should be populated from existing CR") + } + if spec["cluster"] == nil { + t.Error("spec.cluster should be populated from existing CR") + } + + // Skeleton comment must NOT appear (this is a cloned CR, not a skeleton). + if strings.Contains(content, "skeleton generated by compiler addnode") { + t.Error("cloned CR must not contain skeleton header comment") + } +} + +// TestAddNode_ExistingCR_MissingFileFails verifies that a missing --existing-cr path +// returns a descriptive error. +func TestAddNode_ExistingCR_MissingFileFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "controlplane", 1, "/nonexistent/cr.yaml", t.TempDir()) + if err == nil { + t.Error("expected error for missing existing-cr path; got nil") + } +} + +// TestAddNode_NamingConvention verifies the seam-mc-{cluster}-{hostname} naming +// convention for the output file. platform-schema.md §9. 
+func TestAddNode_NamingConvention(t *testing.T) { + outDir := t.TempDir() + if err := compileAddNode("my-cluster", "worker99", "10.10.0.99", "worker", 10, "", outDir); err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + expectedFile := filepath.Join(outDir, "seam-mc-my-cluster-worker99.yaml") + if _, err := os.Stat(expectedFile); err != nil { + t.Errorf("expected output file seam-mc-my-cluster-worker99.yaml not found: %v", err) + } +} + +// assertContainsStr is defined in compile_bootstrap_test.go. diff --git a/cmd/compiler/compile.go b/cmd/compiler/compile.go index 4324193..de5cb96 100644 --- a/cmd/compiler/compile.go +++ b/cmd/compiler/compile.go @@ -6,6 +6,7 @@ package main import ( "context" + "encoding/json" "fmt" "log/slog" "os" @@ -14,7 +15,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/yaml" @@ -71,17 +72,20 @@ func ciliumPrerequisitesPatch() string { // the provided registry mirrors into machine.registries.mirrors. The http:// prefix // on endpoints is preserved exactly — no TLS config is added. func buildRegistryMirrorsPatch(mirrors []RegistryMirror) (string, error) { + // json: tags are required: sigs.k8s.io/yaml marshals via encoding/json, so + // without json: tags the field names default to PascalCase (Machine, Registries) + // which won't merge into the lowercase Talos machineconfig keys. 
type mirrorSpec struct { - Endpoints []string `yaml:"endpoints"` + Endpoints []string `json:"endpoints" yaml:"endpoints"` } type registriesSpec struct { - Mirrors map[string]mirrorSpec `yaml:"mirrors"` + Mirrors map[string]mirrorSpec `json:"mirrors" yaml:"mirrors"` } type machineSpec struct { - Registries registriesSpec `yaml:"registries"` + Registries registriesSpec `json:"registries" yaml:"registries"` } type patchSpec struct { - Machine machineSpec `yaml:"machine"` + Machine machineSpec `json:"machine" yaml:"machine"` } mirrorMap := make(map[string]mirrorSpec, len(mirrors)) @@ -171,9 +175,9 @@ func extractCAFromMachineConfig(machineConfigBytes []byte) (*secrets.Bundle, err } // BootstrapNode declares a single Talos node for management cluster bootstrap. -// Each node maps to one Talos machine configuration and one Kubernetes Secret. +// Each node maps to one Talos machine configuration and one MachineConfig CR. type BootstrapNode struct { - // Hostname is the node's hostname. Used as the node name in Secret naming + // Hostname is the node's hostname. Used as the bare hostname in naming // convention seam-mc-{cluster}-{hostname}. platform-schema.md §9. Hostname string `yaml:"hostname"` @@ -184,6 +188,11 @@ type BootstrapNode struct { // "controlplane" (additional control plane nodes), or "worker". // Exactly one node must have role "init". Role string `yaml:"role"` + + // MAC is the node's primary NIC MAC address. Informational only -- + // not used by the compiler or operator. Stored for admin reference. + // +optional + MAC string `yaml:"mac,omitempty"` } // BootstrapSection holds management cluster bootstrap configuration. @@ -744,25 +753,14 @@ func validateBootstrapInput(b *BootstrapSection) error { // compileBootstrap implements the bootstrap subcommand. 
// -// Reads a ClusterInput spec (with a bootstrap section declaring node IPs, roles, -// and Talos version) and produces three output artifacts in --output: -// - seam-mc-{cluster}-{hostname}.yaml — Kubernetes Secret YAML per node -// containing the Talos machine configuration. platform-schema.md §9. -// - {cluster-name}.yaml — TalosCluster CR with mode=bootstrap, capi.enabled=false. -// - bootstrap-sequence.yaml — documents the apply order. +// Bootstrap mode: generates MachineConfig CRs (one per node), namespace manifest, +// TalosCluster CR, and bootstrap-sequence.yaml. Uses Talos machinery for PKI. // -// kubeconfigPath is the optional path to a kubeconfig file, used only when -// in.ImportExistingCluster=true. Pass empty string to use the standard resolution -// chain (KUBECONFIG env → ~/.kube/config). +// Import mode: generates namespace manifest, talosconfig Secret (if resolvable), +// TalosCluster CR, and bootstrap-sequence.yaml. MachineConfig CRs are NOT +// generated -- admin provides them via compiler addnode or hand-authored. CP-INV-004. // -// When importExistingCluster=true, Compiler connects to the cluster Kubernetes API -// via kubeconfig, reads the init-node machine config Secret from seam-system, parses -// it, and derives the secrets bundle from existing CA material so new configs are -// signed with the same PKI. Fails fast if the kubeconfig is unreachable or the -// Secret or its machineconfig.yaml field is missing. -// -// Uses the Talos machinery library to generate machine configurations. -// No cluster connection is required in the default (fresh PKI) path. +// kubeconfigPath is not used; retained for CLI flag compatibility. // conductor-schema.md §9. 
func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) error { in, err := readClusterInput(input) @@ -797,6 +795,11 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err controlPlaneEndpoint = ep } + tcMode := platformv1alpha1.TalosClusterModeBootstrap + if in.Mode == "import" || in.ImportExistingCluster { + tcMode = platformv1alpha1.TalosClusterModeImport + } + // Resolve kubernetesVersion: explicit > support matrix. kubernetesVersion := b.KubernetesVersion if kubernetesVersion == "" { @@ -808,8 +811,10 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } // Resolve installDisk: explicit > extracted from machineConfigPaths > default. + // Only needed for bootstrap mode (generate.NewInput); skip extraction in import mode + // to avoid requiring machineConfigPaths files that admin provides separately. installDisk := b.InstallDisk - if installDisk == "" { + if installDisk == "" && tcMode == platformv1alpha1.TalosClusterModeBootstrap { extracted, err := extractFromInitNode(in.MachineConfigPaths, b.Nodes, func(mcBytes []byte) (string, error) { return extractInstallDiskFromMachineConfig(mcBytes), nil @@ -843,106 +848,6 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } } - // Resolve the secrets bundle. When importExistingCluster=true, extract PKI from - // an existing cluster. Two paths are available: - // - // machineConfigPaths non-empty — local file path (pre-Seam clusters): - // Read the init node entry from the map, load the raw machine config file, - // and extract CA material via extractCAFromMachineConfig. - // - // machineConfigPaths absent — Kubernetes API path (Seam clusters): - // Connect to the cluster API via kubeconfig, read the seam-mc-{cluster}-{init} - // Secret from seam-system, extract machineconfig.yaml, and extract CA material. - // - // Both paths share extractCAFromMachineConfig for the final CA extraction step. 
- var secretsBundle *secrets.Bundle - if in.Mode == "import" || in.ImportExistingCluster { - // Find the init node hostname (guaranteed present by validateBootstrapInput). - var initHostname string - for _, n := range b.Nodes { - if n.Role == "init" { - initHostname = n.Hostname - break - } - } - - if len(in.MachineConfigPaths) > 0 { - // Local file path: read CA from user-provided machine config file. - // Only the init node entry is required; the same bundle is used for all nodes. - mcPath, ok := in.MachineConfigPaths[initHostname] - if !ok { - return fmt.Errorf("importExistingCluster: machineConfigPaths is non-empty but init node %q has no entry", initHostname) - } - mcBytes, err := os.ReadFile(mcPath) - if err != nil { - return fmt.Errorf("importExistingCluster: read machineconfig for init node %q from %q: %w", initHostname, mcPath, err) - } - secretsBundle, err = extractCAFromMachineConfig(mcBytes) - if err != nil { - return fmt.Errorf("importExistingCluster: extract CA from local file %q: %w", mcPath, err) - } - } else { - // Kubernetes API path: read CA from seam-mc Secret in seam-system. - resolvedKubeconfig := resolveKubeconfigPath(kubeconfigPath) - k8sClient, err := buildK8sClient(resolvedKubeconfig) - if err != nil { - return fmt.Errorf("importExistingCluster: connect to cluster via kubeconfig %q: %w", resolvedKubeconfig, err) - } - - // Strip cluster-name prefix from hostname: Talos node names carry the - // cluster prefix (e.g. "ccs-mgmt-cp1" for cluster "ccs-mgmt"), so the - // Secret name would double the prefix without this strip. C-32. - hostname := strings.TrimPrefix(initHostname, in.Name+"-") - secretName := "seam-mc-" + in.Name + "-" + hostname - mcSecret, err := k8sClient.CoreV1().Secrets("seam-system").Get( - context.Background(), secretName, metav1.GetOptions{}, - ) - if err != nil { - if apierrors.IsNotFound(err) { - // seam-mc Secret absent — cluster was not bootstrapped via Seam. 
- // Fall through to the talosconfig-only path: emit only the - // talosconfig Secret and TalosCluster CR. No machineconfig - // generation, no PKI extraction. C-32 Bug 2. - return compileImportTalosconfigSecret(in, output, talosconfigPath) - } - return fmt.Errorf("importExistingCluster: read secret %q from seam-system: %w", secretName, err) - } - - mcBytes, ok := mcSecret.Data["machineconfig.yaml"] - if !ok { - return fmt.Errorf("importExistingCluster: secret %q is missing machineconfig.yaml field", secretName) - } - - secretsBundle, err = extractCAFromMachineConfig(mcBytes) - if err != nil { - return fmt.Errorf("importExistingCluster: extract CA from secret %q: %w", secretName, err) - } - } - } else { - secretsBundle, err = secrets.NewBundle( - secrets.NewFixedClock(time.Now()), - versionContract, - ) - if err != nil { - return fmt.Errorf("generate secrets bundle: %w", err) - } - } - - // Build the generate input with cluster-wide settings. - genInput, err := generate.NewInput( - in.Name, - controlPlaneEndpoint, - kubernetesVersion, - generate.WithVersionContract(versionContract), - generate.WithSecretsBundle(secretsBundle), - generate.WithInstallDisk(installDisk), - generate.WithInstallImage(installerImage), - generate.WithEndpointList(cpIPs), - ) - if err != nil { - return fmt.Errorf("build generate input: %w", err) - } - if err := os.MkdirAll(output, 0755); err != nil { return fmt.Errorf("create output directory: %w", err) } @@ -952,111 +857,135 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err ns = "seam-system" } - // Build the ordered patch list: - // 1. CiliumPrerequisites (built-in, applied first) - // 2. RegistryMirrors (injected next) - // 3. 
User Patches (applied last, in order) - var patches []string - if in.CiliumPrerequisites { - patches = append(patches, ciliumPrerequisitesPatch()) - } - if len(in.RegistryMirrors) > 0 { - mirrorPatch, err := buildRegistryMirrorsPatch(in.RegistryMirrors) + // MachineConfig CRs are generated for bootstrap mode only. + // Import mode: admin provides MachineConfig CRs (via compiler addnode or hand-authored). + // platform-schema.md §9, CP-INV-004. + var crNames []string + if tcMode == platformv1alpha1.TalosClusterModeBootstrap { + secretsBundle, err := secrets.NewBundle( + secrets.NewFixedClock(time.Now()), + versionContract, + ) if err != nil { - return fmt.Errorf("build registry mirrors patch: %w", err) + return fmt.Errorf("generate secrets bundle: %w", err) } - patches = append(patches, mirrorPatch) - } - patches = append(patches, in.Patches...) - // Generate machine configuration for each node and write as a Secret. - var secretNames []string - for _, node := range b.Nodes { - machineType, err := nodeRoleToMachineType(node.Role) + genInput, err := generate.NewInput( + in.Name, + controlPlaneEndpoint, + kubernetesVersion, + generate.WithVersionContract(versionContract), + generate.WithSecretsBundle(secretsBundle), + generate.WithInstallDisk(installDisk), + generate.WithInstallImage(installerImage), + generate.WithEndpointList(cpIPs), + ) if err != nil { - return fmt.Errorf("node %q: %w", node.Hostname, err) + return fmt.Errorf("build generate input: %w", err) } - cfg, err := genInput.Config(machineType) - if err != nil { - return fmt.Errorf("generate config for node %q: %w", node.Hostname, err) + // Build the ordered patch list: + // 1. CiliumPrerequisites (built-in, applied first) + // 2. RegistryMirrors (injected next) + // 3. 
User Patches (applied last, in order) + var patches []string + if in.CiliumPrerequisites { + patches = append(patches, ciliumPrerequisitesPatch()) } + if len(in.RegistryMirrors) > 0 { + mirrorPatch, err := buildRegistryMirrorsPatch(in.RegistryMirrors) + if err != nil { + return fmt.Errorf("build registry mirrors patch: %w", err) + } + patches = append(patches, mirrorPatch) + } + patches = append(patches, in.Patches...) - cfgBytes, err := cfg.Bytes() - if err != nil { - return fmt.Errorf("marshal config for node %q: %w", node.Hostname, err) + // Pre-compute upgrade order for each node. + // init=0, controlplane nodes=1..N in declaration order, workers=N+1..M. + cpIdx := int32(0) + workerIdx := int32(0) + cpCount := int32(0) + for _, n := range b.Nodes { + if n.Role == "controlplane" { + cpCount++ + } + } + nodeOrder := make(map[string]int32, len(b.Nodes)) + for _, n := range b.Nodes { + switch n.Role { + case "init": + nodeOrder[n.Hostname] = 0 + case "controlplane": + cpIdx++ + nodeOrder[n.Hostname] = cpIdx + case "worker": + nodeOrder[n.Hostname] = cpCount + 1 + workerIdx + workerIdx++ + } } - // Apply all patches in order (CiliumPrerequisites → RegistryMirrors → user Patches). - for i, patch := range patches { - cfgBytes, err = applyYAMLPatch(cfgBytes, patch) + for _, node := range b.Nodes { + machineType, err := nodeRoleToMachineType(node.Role) if err != nil { - return fmt.Errorf("apply patch %d to node %q: %w", i, node.Hostname, err) + return fmt.Errorf("node %q: %w", node.Hostname, err) } - } - // Strip cluster-name prefix from hostname before constructing the secret - // name so the prefix is not doubled (e.g. ccs-mgmt-cp1 → cp1). C-32. - // Machine config secrets always live in seam-tenant-{cluster}, not in the - // TalosCluster CR namespace (seam-system). Platform reads them from there. 
- bareHostname := strings.TrimPrefix(node.Hostname, in.Name+"-") - secretName := "seam-mc-" + in.Name + "-" + bareHostname - secret := corev1.Secret{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "v1", - Kind: "Secret", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: secretName, - Namespace: "seam-tenant-" + in.Name, - Labels: map[string]string{ - "ontai.dev/cluster": in.Name, - "ontai.dev/node": node.Hostname, - "ontai.dev/node-role": node.Role, - "ontai.dev/managed-by": "compiler", - }, - }, - Type: corev1.SecretTypeOpaque, - StringData: map[string]string{ - "machineconfig.yaml": string(cfgBytes), - }, - } + cfg, err := genInput.Config(machineType) + if err != nil { + return fmt.Errorf("generate config for node %q: %w", node.Hostname, err) + } - if err := writeCRYAML(output, secretName, secret); err != nil { - return fmt.Errorf("write machineconfig secret for node %q: %w", node.Hostname, err) + cfgBytes, err := cfg.Bytes() + if err != nil { + return fmt.Errorf("marshal config for node %q: %w", node.Hostname, err) + } + + // Apply all patches in order (CiliumPrerequisites → RegistryMirrors → user Patches). + for i, patch := range patches { + cfgBytes, err = applyYAMLPatch(cfgBytes, patch) + if err != nil { + return fmt.Errorf("apply patch %d to node %q: %w", i, node.Hostname, err) + } + } + + // Strip cluster-name prefix from hostname before constructing the CR + // name so the prefix is not doubled (e.g. ccs-mgmt-cp1 → cp1). C-32. + // MachineConfig CRs always live in seam-tenant-{cluster}. 
+ bareHostname := strings.TrimPrefix(node.Hostname, in.Name+"-") + mc, err := buildMachineConfigCR(node, cfgBytes, in.Name, bareHostname, nodeOrder[node.Hostname]) + if err != nil { + return fmt.Errorf("build MachineConfig CR for node %q: %w", node.Hostname, err) + } + crName := mc.Name + if err := writeCRYAML(output, crName, mc); err != nil { + return fmt.Errorf("write MachineConfig CR for node %q: %w", node.Hostname, err) + } + crNames = append(crNames, crName+".yaml") } - secretNames = append(secretNames, secretName+".yaml") } - // C-35: When importExistingCluster=true, also emit the talosconfig Secret so - // Platform can generate the kubeconfig via ensureKubeconfigSecret. Applies to - // both the machineConfigPaths path (local file PKI) and the Kubernetes API path - // (Seam clusters). Failure is a warning -- the operator can apply manually. - // Also emit the seam-tenant namespace manifest so the admin can apply it before - // the Secrets (which live in seam-tenant-{cluster}). platform-schema.md §9. - if in.Mode == "import" || in.ImportExistingCluster { - nsFile, err := writeSeamTenantNamespaceManifest(in.Name, output) - if err != nil { - return err - } - secretNames = append([]string{nsFile}, secretNames...) + // Namespace manifest: always emitted. MachineConfig CRs (bootstrap) and + // talosconfig Secrets (import) both live in seam-tenant-{cluster}. + nsFile, err := writeSeamTenantNamespaceManifest(in.Name, output) + if err != nil { + return err + } + allResources := append([]string{nsFile}, crNames...) + + // Import mode: also emit the talosconfig Secret so Platform can generate the + // kubeconfig via ensureKubeconfigSecret. Failure is a warning -- the operator + // can apply manually. 
+ if tcMode == platformv1alpha1.TalosClusterModeImport { if tcfgFile, err := writeTalosconfigSecret(in, talosconfigPath, output); err != nil { return err } else if tcfgFile != "" { - secretNames = append(secretNames, tcfgFile) + allResources = append(allResources, tcfgFile) } } - // Fix 1: importExistingCluster=true always emits mode=import. The - // machineConfigPaths field only controls where PKI is read from, not the - // cluster lifecycle mode. A re-imported cluster is always mode=import. - tcMode := platformv1alpha1.TalosClusterModeBootstrap - if in.Mode == "import" || in.ImportExistingCluster { - tcMode = platformv1alpha1.TalosClusterModeImport - } - // Produce TalosCluster CR. ontai.dev/owns-runnerconfig signals Platform to add - // a finalizer and clean up the RunnerConfig in ont-system on deletion. Bug 3. + // a finalizer and clean up the RunnerConfig in ont-system on deletion. // // Role is set when: (a) import path -- clusterRole defaults empty to management; // (b) bootstrap path with explicit role field (e.g. role: tenant in fixture). @@ -1093,7 +1022,79 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } // Produce bootstrap-sequence.yaml documenting the apply order. - return writeBootstrapSequence(output, in.Name, secretNames, tcMode) + return writeBootstrapSequence(output, in.Name, allResources, tcMode) +} + +// buildMachineConfigCR converts a generated Talos machine config YAML into a +// MachineConfig CR. The machine and cluster top-level sections are stored as +// unstructured JSON in spec.machine and spec.cluster respectively so the CR +// remains Talos-version-agnostic. +// +// bareHostname must be the hostname with any cluster-name prefix stripped +// (e.g. "cp1" for node "ccs-mgmt-cp1" in cluster "ccs-mgmt"). 
+func buildMachineConfigCR(node BootstrapNode, cfgBytes []byte, clusterName, bareHostname string, order int32) (platformv1alpha1.MachineConfig, error) { + var rawMap map[string]interface{} + if err := yaml.Unmarshal(cfgBytes, &rawMap); err != nil { + return platformv1alpha1.MachineConfig{}, fmt.Errorf("parse machineconfig for node %q: %w", node.Hostname, err) + } + + toJSON := func(key string) (*apiextensionsv1.JSON, error) { + v, ok := rawMap[key] + if !ok || v == nil { + return nil, nil + } + b, err := json.Marshal(v) + if err != nil { + return nil, fmt.Errorf("json-encode %q section: %w", key, err) + } + return &apiextensionsv1.JSON{Raw: b}, nil + } + + machineJSON, err := toJSON("machine") + if err != nil { + return platformv1alpha1.MachineConfig{}, err + } + clusterJSON, err := toJSON("cluster") + if err != nil { + return platformv1alpha1.MachineConfig{}, err + } + + role := platformv1alpha1.MachineConfigRoleControlPlane + switch node.Role { + case "init": + role = platformv1alpha1.MachineConfigRoleInit + case "worker": + role = platformv1alpha1.MachineConfigRoleWorker + } + + crName := "seam-mc-" + clusterName + "-" + bareHostname + return platformv1alpha1.MachineConfig{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "platform.ontai.dev/v1alpha1", + Kind: "MachineConfig", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: crName, + Namespace: "seam-tenant-" + clusterName, + Labels: map[string]string{ + "ontai.dev/cluster": clusterName, + "ontai.dev/node": node.Hostname, + "ontai.dev/node-role": node.Role, + "ontai.dev/managed-by": "compiler", + }, + }, + Spec: platformv1alpha1.MachineConfigSpec{ + Role: role, + Order: order, + ClusterRef: corev1.LocalObjectReference{ + Name: clusterName, + }, + NodeIP: node.IP, + NodeHostname: bareHostname, + Machine: machineJSON, + Cluster: clusterJSON, + }, + }, nil } // nodeRoleToMachineType converts a bootstrap node role to the Talos machine.Type. 
@@ -1193,18 +1194,18 @@ type BootstrapSequence struct { // // C-36: previously used kind: BootstrapSequence (not a valid CRD). platform-schema.md §9. func writeBootstrapSequence(output, clusterName string, secretFiles []string, mode platformv1alpha1.TalosClusterMode) error { - step1Desc := "Apply Talos machineconfig Secrets — one per node. " + - "Apply ALL before the TalosCluster CR." + step1Desc := "Apply seam-tenant namespace manifest AND MachineConfig CRs (one per node) " + + "in seam-tenant-" + clusterName + ". Apply ALL before the TalosCluster CR." step2Desc := "Apply TalosCluster CR with mode=bootstrap and capi.enabled=false. " + "Platform's TalosClusterReconciler watches this CR and submits the bootstrap Conductor Job." if mode == platformv1alpha1.TalosClusterModeImport { - step1Desc = "Apply ALL Secrets: machineconfig Secrets (one per node) AND the talosconfig Secret " + - "(seam-mc-" + clusterName + "-talosconfig.yaml). " + - "The talosconfig Secret is required for Platform to generate the kubeconfig. " + - "Apply ALL before TalosCluster CR." + step1Desc = "Apply seam-tenant namespace manifest AND the talosconfig Secret " + + "(seam-mc-" + clusterName + "-talosconfig.yaml) in seam-tenant-" + clusterName + ". " + + "Admin must separately apply MachineConfig CRs (via compiler addnode or hand-authored) " + + "before applying the TalosCluster CR." step2Desc = "Apply TalosCluster CR with mode=import. " + - "Apply AFTER all Secrets in step 1 are present in the cluster — " + + "Apply AFTER namespace, talosconfig Secret, and MachineConfig CRs are present — " + "Platform reads the talosconfig Secret during TalosCluster reconciliation " + "to generate and store the cluster kubeconfig." 
} diff --git a/cmd/compiler/compile_bootstrap_features_test.go b/cmd/compiler/compile_bootstrap_features_test.go index a57f129..1a79946 100644 --- a/cmd/compiler/compile_bootstrap_features_test.go +++ b/cmd/compiler/compile_bootstrap_features_test.go @@ -673,10 +673,11 @@ bootstrap: // ── ImportExistingCluster ───────────────────────────────────────────────────── -// TestBootstrap_ImportExistingCluster_MissingKubeconfigReturnsError verifies that -// importExistingCluster: true with a non-existent kubeconfig path returns an error -// rather than silently generating fresh PKI material. -func TestBootstrap_ImportExistingCluster_MissingKubeconfigReturnsError(t *testing.T) { +// TestBootstrap_ImportExistingCluster_Succeeds verifies that importExistingCluster:true +// with all required fields present succeeds. The kubeconfig API fallback path was +// removed when the MachineConfig CRD migration eliminated PKI extraction from +// the cluster. import mode no longer connects to any external API. +func TestBootstrap_ImportExistingCluster_Succeeds(t *testing.T) { input := ` name: test-cluster namespace: seam-system @@ -696,10 +697,16 @@ bootstrap: role: init ` inputPath := writeInputFile(t, input) - // Pass a kubeconfig path that does not exist — connection must fail with an error. - err := compileBootstrap(inputPath, t.TempDir(), "/nonexistent/kubeconfig.yaml", "") - if err == nil { - t.Fatal("expected error for missing kubeconfig; got nil") + outDir := t.TempDir() + if err := compileBootstrap(inputPath, outDir, "", ""); err != nil { + t.Fatalf("expected importExistingCluster=true to succeed; got: %v", err) + } + // importExistingCluster=true → tcMode=Import → no MachineConfig CRs emitted. 
+ if _, err := os.Stat(filepath.Join(outDir, "seam-mc-test-cluster-cp1.yaml")); err == nil { + t.Error("import mode must not emit MachineConfig CRs") + } + if _, err := os.Stat(filepath.Join(outDir, "test-cluster.yaml")); err != nil { + t.Errorf("TalosCluster CR not found: %v", err) } } diff --git a/cmd/compiler/compile_bootstrap_import_test.go b/cmd/compiler/compile_bootstrap_import_test.go index 7eec389..9e0bef5 100644 --- a/cmd/compiler/compile_bootstrap_import_test.go +++ b/cmd/compiler/compile_bootstrap_import_test.go @@ -14,12 +14,17 @@ import ( ) // generateMachineConfigFile produces a valid Talos init-node machine config YAML -// file for use in import-path tests. It runs compileBootstrap with fresh PKI to -// generate a seam-mc Secret, extracts the machineconfig.yaml field, writes it to -// a temp file, and returns the path. +// file for use in import-path tests (machineConfigPaths). It runs compileBootstrap +// with fresh PKI to generate a MachineConfig CR, extracts spec.machine and +// spec.cluster, reconstructs a Talos machineconfig YAML, and writes it to a temp +// file. This file is used as a machineConfigPaths entry for endpoint/disk extraction. func generateMachineConfigFile(t *testing.T, clusterName, hostname string) string { t.Helper() + // Strip the cluster-name prefix from hostname to match the CR name. + // compileBootstrap uses TrimPrefix(hostname, clusterName+"-") for the bare name. + bareHostname := strings.TrimPrefix(hostname, clusterName+"-") + input := fmt.Sprintf(` name: %s namespace: seam-system @@ -45,31 +50,44 @@ bootstrap: t.Fatalf("generateMachineConfigFile: compileBootstrap failed: %v", err) } - // Read the Secret YAML produced for the init node. - secretPath := filepath.Join(outDir, fmt.Sprintf("seam-mc-%s-%s.yaml", clusterName, hostname)) - secretData, err := os.ReadFile(secretPath) + // Read the MachineConfig CR YAML produced for the init node. 
+ crPath := filepath.Join(outDir, fmt.Sprintf("seam-mc-%s-%s.yaml", clusterName, bareHostname)) + crData, err := os.ReadFile(crPath) if err != nil { - t.Fatalf("generateMachineConfigFile: read secret YAML: %v", err) + t.Fatalf("generateMachineConfigFile: read MachineConfig CR YAML: %v", err) } - // Extract machineconfig.yaml from the Secret's stringData field. - var secretObj struct { - StringData map[string]string `yaml:"stringData"` + // Parse spec.machine and spec.cluster from the CR, then reconstruct the + // full Talos machineconfig YAML (used by extractEndpointFromMachineConfig, + // extractInstallDiskFromMachineConfig, and extractCAFromMachineConfig). + var crObj struct { + Spec struct { + Machine interface{} `yaml:"machine"` + Cluster interface{} `yaml:"cluster"` + } `yaml:"spec"` } - if err := yaml.Unmarshal(secretData, &secretObj); err != nil { - t.Fatalf("generateMachineConfigFile: parse secret YAML: %v", err) + if err := yaml.Unmarshal(crData, &crObj); err != nil { + t.Fatalf("generateMachineConfigFile: parse MachineConfig CR: %v", err) } - mcYAML, ok := secretObj.StringData["machineconfig.yaml"] - if !ok { - t.Fatal("generateMachineConfigFile: secret missing machineconfig.yaml field") + if crObj.Spec.Machine == nil { + t.Fatal("generateMachineConfigFile: spec.machine is nil in generated CR") } - // Write the raw machine config YAML to a dedicated temp file. + reconstructed := map[string]interface{}{ + "machine": crObj.Spec.Machine, + "cluster": crObj.Spec.Cluster, + } + mcYAML, err := yaml.Marshal(reconstructed) + if err != nil { + t.Fatalf("generateMachineConfigFile: marshal reconstructed machineconfig: %v", err) + } + + // Write the reconstructed machineconfig YAML to a temp file. 
f, err := os.CreateTemp(t.TempDir(), "mc-*.yaml") if err != nil { t.Fatalf("generateMachineConfigFile: create temp file: %v", err) } - if _, err := f.WriteString(mcYAML); err != nil { + if _, err := f.Write(mcYAML); err != nil { t.Fatalf("generateMachineConfigFile: write machine config: %v", err) } f.Close() @@ -80,11 +98,11 @@ bootstrap: // TestBootstrap_ImportExistingCluster_LocalFilePath verifies that when // importExistingCluster=true and machineConfigPaths is non-empty, Compiler -// reads CA material from the local machine config file and successfully generates -// all output artifacts (machine config Secrets, TalosCluster CR, bootstrap-sequence). -// This path is used for clusters bootstrapped before Seam. +// successfully generates all output artifacts for import mode. +// Import mode emits: namespace manifest, TalosCluster CR, bootstrap-sequence. +// MachineConfig CRs are NOT emitted for import mode -- admin provides them +// (via compiler addnode or hand-authored). CP-INV-004. func TestBootstrap_ImportExistingCluster_LocalFilePath(t *testing.T) { - // Generate a real init-node machine config file from a fresh PKI bundle. mcPath := generateMachineConfigFile(t, "import-cluster", "cp1") input := fmt.Sprintf(` @@ -118,11 +136,9 @@ bootstrap: t.Fatalf("compileBootstrap (local file path) error: %v", err) } - // All expected output files must be present. + // Import mode output: namespace + TalosCluster + bootstrap-sequence only. for _, name := range []string{ "seam-tenant-namespace.yaml", - "seam-mc-import-cluster-cp1.yaml", - "seam-mc-import-cluster-wk1.yaml", "import-cluster.yaml", "bootstrap-sequence.yaml", } { @@ -130,11 +146,23 @@ bootstrap: t.Errorf("expected output file %q not found: %v", name, err) } } + + // MachineConfig CRs are admin-provided; compiler must not generate them. 
+ for _, name := range []string{ + "seam-mc-import-cluster-cp1.yaml", + "seam-mc-import-cluster-wk1.yaml", + } { + if _, err := os.Stat(filepath.Join(outDir, name)); err == nil { + t.Errorf("import mode must not generate MachineConfig CR %q; admin provides these", name) + } + } } // TestBootstrap_ImportExistingCluster_LocalFileMissingReturnsError verifies that -// when machineConfigPaths is non-empty but the referenced file does not exist, -// Compiler returns an error rather than panicking or silently producing output. +// when machineConfigPaths references a nonexistent file and controlPlaneEndpoint +// is absent (forcing endpoint extraction from the file), Compiler returns an error. +// When endpoint and disk are explicit, the file is not read; omitting them forces +// the compiler to attempt to read the file. func TestBootstrap_ImportExistingCluster_LocalFileMissingReturnsError(t *testing.T) { input := ` name: import-cluster @@ -147,7 +175,6 @@ importExistingCluster: true machineConfigPaths: cp1: /nonexistent/machineconfig.yaml bootstrap: - controlPlaneEndpoint: "https://10.0.0.10:6443" talosVersion: "v1.7.0" kubernetesVersion: "1.30.0" installDisk: "/dev/sda" @@ -164,9 +191,9 @@ bootstrap: } // TestBootstrap_ImportExistingCluster_InitNodeAbsentFromMapReturnsError verifies -// that when machineConfigPaths is non-empty but the init node hostname is absent -// from the map, Compiler returns an error. The init node entry is required for -// CA extraction; omitting it is a configuration error. +// that when machineConfigPaths is non-empty but contains no entry for any +// control-plane node, and controlPlaneEndpoint is absent (requiring extraction), +// Compiler returns an error about the missing endpoint. 
func TestBootstrap_ImportExistingCluster_InitNodeAbsentFromMapReturnsError(t *testing.T) { input := ` name: import-cluster @@ -179,7 +206,6 @@ importExistingCluster: true machineConfigPaths: worker1: /some/path/worker.yaml bootstrap: - controlPlaneEndpoint: "https://10.0.0.10:6443" talosVersion: "v1.7.0" kubernetesVersion: "1.30.0" installDisk: "/dev/sda" @@ -191,10 +217,10 @@ bootstrap: inputPath := writeInputFile(t, input) err := compileBootstrap(inputPath, t.TempDir(), "", "") if err == nil { - t.Fatal("expected error when init node hostname absent from machineConfigPaths; got nil") + t.Fatal("expected error when no control-plane node in machineConfigPaths and endpoint absent; got nil") } - if !containsStr(err.Error(), "cp1") { - t.Errorf("error message should mention the missing hostname %q; got: %v", "cp1", err) + if !containsStr(err.Error(), "controlPlaneEndpoint") { + t.Errorf("error should mention controlPlaneEndpoint; got: %v", err) } } @@ -247,11 +273,11 @@ bootstrap: assertContainsStr(t, content, "ontai.dev/cluster: my-cluster") } -// TestBootstrap_BootstrapMode_DoesNotEmitSeamTenantNamespaceManifest verifies that -// compileBootstrap in mode=bootstrap (importExistingCluster=false) does NOT emit -// seam-tenant-namespace.yaml. Platform creates the namespace for bootstrap/CAPI clusters. -// Governor ruling 2026-04-21. -func TestBootstrap_BootstrapMode_DoesNotEmitSeamTenantNamespaceManifest(t *testing.T) { +// TestBootstrap_BootstrapMode_EmitsSeamTenantNamespaceManifest verifies that +// compileBootstrap in mode=bootstrap emits seam-tenant-namespace.yaml. +// Compiler creates namespaces for all modes; platform no longer creates +// seam-tenant-{cluster} namespaces. CP-INV-004 amended 2026-05-31. 
+func TestBootstrap_BootstrapMode_EmitsSeamTenantNamespaceManifest(t *testing.T) { input := ` name: fresh-cluster namespace: seam-system @@ -277,9 +303,11 @@ bootstrap: } nsPath := filepath.Join(outDir, "seam-tenant-namespace.yaml") - if _, err := os.Stat(nsPath); err == nil { - t.Error("seam-tenant-namespace.yaml must not be emitted for mode=bootstrap") + nsData, err := os.ReadFile(nsPath) + if err != nil { + t.Fatalf("seam-tenant-namespace.yaml must be emitted for mode=bootstrap: %v", err) } + assertContainsStr(t, string(nsData), "name: seam-tenant-fresh-cluster") } // TestBootstrap_ImportMode_NamespaceNameIsSeamTenantNotTenant verifies that the @@ -328,11 +356,12 @@ bootstrap: // ── Kubernetes API fallback (machineConfigPaths absent) ─────────────────────── -// TestBootstrap_ImportExistingCluster_KubeconfigFallback verifies that when -// importExistingCluster=true and machineConfigPaths is absent, Compiler falls -// back to the Kubernetes API path and returns an error when the kubeconfig -// is unreachable. This is the existing Seam-cluster import path. -func TestBootstrap_ImportExistingCluster_KubeconfigFallback(t *testing.T) { +// TestBootstrap_ImportMode_NoMachineConfigPaths_Succeeds verifies that import mode +// with no machineConfigPaths (and explicit endpoint and disk) succeeds and emits +// namespace + TalosCluster + bootstrap-sequence. The kubeconfig API fallback was +// removed when the MachineConfig CRD migration eliminated PKI extraction. +// Admin provides MachineConfig CRs separately. CP-INV-004. +func TestBootstrap_ImportMode_NoMachineConfigPaths_Succeeds(t *testing.T) { input := ` name: import-cluster namespace: seam-system @@ -352,10 +381,14 @@ bootstrap: role: init ` inputPath := writeInputFile(t, input) - // Pass a non-existent kubeconfig — the API path must fail with an error. 
- err := compileBootstrap(inputPath, t.TempDir(), "/nonexistent/kubeconfig.yaml", "") - if err == nil { - t.Fatal("expected error for missing kubeconfig in API fallback path; got nil") + outDir := t.TempDir() + if err := compileBootstrap(inputPath, outDir, "", ""); err != nil { + t.Fatalf("expected import mode to succeed without machineConfigPaths; got: %v", err) + } + for _, name := range []string{"seam-tenant-namespace.yaml", "import-cluster.yaml", "bootstrap-sequence.yaml"} { + if _, err := os.Stat(filepath.Join(outDir, name)); err != nil { + t.Errorf("expected output file %q not found: %v", name, err) + } } } diff --git a/cmd/compiler/compile_bootstrap_test.go b/cmd/compiler/compile_bootstrap_test.go index 35444aa..618fd2c 100644 --- a/cmd/compiler/compile_bootstrap_test.go +++ b/cmd/compiler/compile_bootstrap_test.go @@ -52,8 +52,9 @@ func TestBootstrap_ProducesExpectedOutputFiles(t *testing.T) { t.Fatalf("compileBootstrap error: %v", err) } - // Expect 3 node Secrets + 1 TalosCluster + 1 bootstrap-sequence. + // Expect: namespace manifest + 3 MachineConfig CRs + TalosCluster CR + bootstrap-sequence. expectedFiles := []string{ + "seam-tenant-namespace.yaml", "seam-mc-ccs-mgmt-node1.yaml", "seam-mc-ccs-mgmt-node2.yaml", "seam-mc-ccs-mgmt-node3.yaml", @@ -68,10 +69,11 @@ func TestBootstrap_ProducesExpectedOutputFiles(t *testing.T) { } } -// TestBootstrap_SecretHasCorrectStructure verifies that the generated machineconfig -// Secret for the init node has the required Kubernetes Secret fields. +// TestBootstrap_MachineConfigCRHasCorrectStructure verifies that the generated +// MachineConfig CR for the init node has the required fields. // platform-schema.md §9: naming convention seam-mc-{cluster}-{hostname}. -func TestBootstrap_SecretHasCorrectStructure(t *testing.T) { +// Phase 3a: MachineConfig CRD replaces machineconfig Secrets for bootstrap output. 
+func TestBootstrap_MachineConfigCRHasCorrectStructure(t *testing.T) { outDir := t.TempDir() inputPath := writeInputFile(t, bootstrapInputYAML) @@ -81,15 +83,17 @@ func TestBootstrap_SecretHasCorrectStructure(t *testing.T) { data, err := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node1.yaml")) if err != nil { - t.Fatalf("read Secret YAML: %v", err) + t.Fatalf("read MachineConfig CR YAML: %v", err) } content := string(data) - assertContainsStr(t, content, "apiVersion: v1") - assertContainsStr(t, content, "kind: Secret") + assertContainsStr(t, content, "apiVersion: platform.ontai.dev/v1alpha1") + assertContainsStr(t, content, "kind: MachineConfig") assertContainsStr(t, content, "name: seam-mc-ccs-mgmt-node1") assertContainsStr(t, content, "namespace: seam-tenant-ccs-mgmt") - assertContainsStr(t, content, "machineconfig.yaml:") + assertContainsStr(t, content, "role: init") + assertContainsStr(t, content, "nodeHostname: node1") + assertContainsStr(t, content, "nodeIP: 10.20.0.11") assertContainsStr(t, content, "ontai.dev/cluster: ccs-mgmt") } @@ -316,12 +320,9 @@ bootstrap: t.Fatalf("compileBootstrap error: %v", err) } - // The Secret YAML should contain the default installer image reference. + // The MachineConfig CR's spec.machine should contain the default installer image. data, _ := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node1.yaml")) - content := string(data) - assertContainsStr(t, content, "machineconfig.yaml:") - // machineconfig.yaml should contain the default installer image. - assertContainsStr(t, content, "ghcr.io/siderolabs/installer:v1.7.0") + assertContainsStr(t, string(data), "ghcr.io/siderolabs/installer:v1.7.0") } // WS2 — Bootstrap malformed input validation tests. 
diff --git a/cmd/compiler/main.go b/cmd/compiler/main.go index 93aa5d7..0acec76 100644 --- a/cmd/compiler/main.go +++ b/cmd/compiler/main.go @@ -42,6 +42,8 @@ func main() { runComponentSubcommand(os.Args[2:]) case "maintenance": runMaintenanceSubcommand(os.Args[2:]) + case "addnode": + runAddNodeSubcommand(os.Args[2:]) case "scaffold": runScaffoldSubcommand(os.Args[2:]) case "domain": @@ -198,6 +200,7 @@ func printUsageTo(w *os.File) { fmt.Fprintln(w, " packbuild Compile a PackBuild spec into a ClusterPack CR") fmt.Fprintln(w, " maintenance Compile a MaintenanceBundle CR with pre-resolved scheduling context") fmt.Fprintln(w, " component Produce RBACProfile CR YAML from the embedded catalog or a descriptor") + fmt.Fprintln(w, " addnode Generate a MachineConfig CR for a node being added to an existing cluster") fmt.Fprintln(w, " scaffold Generate a seam-domain operator scaffold pre-wired with seam-sdk") fmt.Fprintln(w, " domain Reserved — not yet implemented") fmt.Fprintln(w, "") diff --git a/go.mod b/go.mod index e766047..13818d3 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,7 @@ require ( gopkg.in/yaml.v3 v3.0.1 helm.sh/helm/v3 v3.17.3 k8s.io/api v0.35.3 + k8s.io/apiextensions-apiserver v0.35.0 k8s.io/apimachinery v0.35.3 k8s.io/client-go v0.35.3 sigs.k8s.io/controller-runtime v0.23.3 @@ -166,7 +167,6 @@ require ( google.golang.org/protobuf v1.36.10 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect diff --git a/internal/capability/adapters.go b/internal/capability/adapters.go index 9abd046..d327824 100644 --- a/internal/capability/adapters.go +++ b/internal/capability/adapters.go @@ -287,6 +287,12 @@ func (a *TalosClientAdapter) Reboot(ctx context.Context) error { return a.inner.Reboot(ctx) } +// 
RebootPowercycle reboots the node in hardware powercycle mode (power off then on). +// Required after Talos upgrade staging so that BIOS/UEFI re-initialises cleanly. +func (a *TalosClientAdapter) RebootPowercycle(ctx context.Context) error { + return a.inner.Reboot(ctx, talos_client.WithPowerCycle) +} + // Reset performs a factory reset of the node. reboot is always false; // the caller controls any subsequent reboot via a separate Reboot capability. func (a *TalosClientAdapter) Reset(ctx context.Context, graceful bool) error { diff --git a/internal/capability/clients.go b/internal/capability/clients.go index a993866..d432ae5 100644 --- a/internal/capability/clients.go +++ b/internal/capability/clients.go @@ -30,6 +30,11 @@ type TalosNodeClient interface { // Reboot reboots the node. Reboot(ctx context.Context) error + // RebootPowercycle reboots the node using hardware powercycle mode (power off then + // power on). Required for Talos upgrade to ensure BIOS/UEFI re-initialises cleanly. + // Distinct from Reboot (OS-level restart) to allow test stubs to record the mode. + RebootPowercycle(ctx context.Context) error + // Reset performs a factory reset of the node. graceful=true drains workloads first. 
Reset(ctx context.Context, graceful bool) error diff --git a/internal/capability/platform_cluster_test.go b/internal/capability/platform_cluster_test.go index 2062d1b..423c872 100644 --- a/internal/capability/platform_cluster_test.go +++ b/internal/capability/platform_cluster_test.go @@ -28,7 +28,8 @@ type stubBootstrapTalosClient struct { func (s *stubBootstrapTalosClient) Bootstrap(_ context.Context) error { return nil } func (s *stubBootstrapTalosClient) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubBootstrapTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubBootstrapTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubBootstrapTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubBootstrapTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubBootstrapTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubBootstrapTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubBootstrapTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_machineconfig_constants.go b/internal/capability/platform_machineconfig_constants.go new file mode 100644 index 0000000..0a3b2a8 --- /dev/null +++ b/internal/capability/platform_machineconfig_constants.go @@ -0,0 +1,9 @@ +package capability + +// platform_machineconfig_constants.go -- shared constants for machineconfig Secret keys. +// Used by reenrollment and scale-up capabilities that still manage the legacy per-node +// Secret model until those capabilities are migrated to MachineConfig CRs. + +// machineConfigSyncDataKey is the primary Secret data key that holds the raw Talos +// machineconfig YAML. Used by node-reenrollment and node-scale-up capabilities. 
+const machineConfigSyncDataKey = "machineconfig" diff --git a/internal/capability/platform_machineconfig_sync.go b/internal/capability/platform_machineconfig_sync.go index 5a5976f..9684811 100644 --- a/internal/capability/platform_machineconfig_sync.go +++ b/internal/capability/platform_machineconfig_sync.go @@ -2,9 +2,10 @@ package capability // platform_machineconfig_sync.go -- machineconfig-sync named capability. // -// Reads the canonical machineconfig from the source-of-truth Secret in -// seam-tenant-{clusterRef}, injects the ONT node label, and applies the config -// to each node in the target cluster via the Talos machine API. +// Reads the canonical machineconfig from the source-of-truth MachineConfig CR in +// seam-tenant-{clusterRef}, reconstructs full Talos YAML from spec.machine and +// spec.cluster, injects the ONT node label, and applies the config to the target +// node via the Talos machine API. // // Named Conductor capability: machineconfig-sync. // conductor-schema.md §6, platform-schema.md §15, RECON-A5. @@ -13,23 +14,25 @@ package capability // in execute mode. Never imported or called from agent mode. import ( - "bytes" - "compress/gzip" "context" "fmt" - "io" "os" - "strings" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" sigsyaml "sigs.k8s.io/yaml" "github.com/ontai-dev/conductor-sdk/runnerlib" ) -// machineConfigCompressionLabel mirrors LabelMachineConfigCompression in platform. -const machineConfigCompressionLabel = "platform.ontai.dev/compression" +// machineConfigGVR is the GroupVersionResource for MachineConfig CRs. +// platform.ontai.dev/v1alpha1/machineconfigs -- platform-schema.md §9. +var machineConfigGVR = schema.GroupVersionResource{ + Group: "platform.ontai.dev", + Version: "v1alpha1", + Resource: "machineconfigs", +} // envMCSyncNodeClass is the env var key injected by MachineConfigSyncReconciler. 
// Must match envMCNodeClass in platform/internal/controller/machineconfigsync_reconciler.go. @@ -41,19 +44,14 @@ const envMCSyncNodeClass = "MC_NODE_CLASS" // PLT-BUG-3-ARCH. const envMCSyncNodeIP = "MC_NODE_IP" -// machineConfigSyncDataKeyYAML is the fallback Secret data key for compiler-generated -// per-node secrets. Mirrors MachineConfigDataKeyYAML in platform machineconfig_labels.go. -// PLT-BUG-3-ARCH. -const machineConfigSyncDataKeyYAML = "machineconfig.yaml" - -// machineConfigSyncSecretNamespace returns the namespace that holds the source-of-truth Secret. -func machineConfigSyncSecretNamespace(clusterRef string) string { +// machineConfigSyncCRNamespace returns the namespace holding the MachineConfig CR. +func machineConfigSyncCRNamespace(clusterRef string) string { return "seam-tenant-" + clusterRef } -// machineConfigSyncSecretName returns the canonical Secret name for a given cluster and class. -// Mirrors MachineConfigSecretName in platform/internal/controller/machineconfig_labels.go. -func machineConfigSyncSecretName(clusterRef, nodeClass string) string { +// machineConfigSyncCRName returns the MachineConfig CR name for a given cluster and nodeClass. +// Mirrors MachineConfigCRName in platform/internal/controller/machineconfig_labels.go. +func machineConfigSyncCRName(clusterRef, nodeClass string) string { return "seam-mc-" + clusterRef + "-" + nodeClass } @@ -62,19 +60,15 @@ func machineConfigSyncSecretName(clusterRef, nodeClass string) string { // Mirrors MachineConfigNodeLabel in platform/internal/controller/machineconfig_labels.go. const ontControlledLabel = "ont.platform.dev/controlled" -// machineConfigSyncDataKey is the Secret data key that holds the raw Talos machineconfig YAML. -// Mirrors MachineConfigDataKey in platform/internal/controller/machineconfig_labels.go. -const machineConfigSyncDataKey = "machineconfig" - // machineConfigSyncHandler implements the machineconfig-sync named capability. 
type machineConfigSyncHandler struct{} func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { now := time.Now().UTC() - if params.TalosClient == nil || params.KubeClient == nil { + if params.TalosClient == nil || params.DynamicClient == nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - "machineconfig-sync requires TalosClient and KubeClient"), nil + "machineconfig-sync requires TalosClient and DynamicClient"), nil } nodeClass := os.Getenv(envMCSyncNodeClass) @@ -88,37 +82,25 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa nodeIP := os.Getenv(envMCSyncNodeIP) clusterRef := params.ClusterRef - secretNS := machineConfigSyncSecretNamespace(clusterRef) - secretName := machineConfigSyncSecretName(clusterRef, nodeClass) + crNS := machineConfigSyncCRNamespace(clusterRef) + crName := machineConfigSyncCRName(clusterRef, nodeClass) - // Read the source-of-truth machineconfig Secret. - secret, err := params.KubeClient.CoreV1().Secrets(secretNS).Get(ctx, secretName, metav1.GetOptions{}) + // Read the source-of-truth MachineConfig CR via the management cluster DynamicClient. + // MachineConfig CRs live in seam-tenant-{clusterRef} on the management cluster. + mcObj, err := params.DynamicClient.Resource(machineConfigGVR).Namespace(crNS). + Get(ctx, crName, metav1.GetOptions{}) if err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, - fmt.Sprintf("get MachineConfig Secret %s/%s: %v", secretNS, secretName, err)), nil + fmt.Sprintf("get MachineConfig CR %s/%s: %v", crNS, crName, err)), nil } - // Try the primary data key first; fall back to the compiler-generated key. PLT-BUG-3-ARCH. 
- mcBytes := secret.Data[machineConfigSyncDataKey] - usingYAMLKey := false - if len(mcBytes) == 0 { - mcBytes = secret.Data[machineConfigSyncDataKeyYAML] - usingYAMLKey = true - } - if len(mcBytes) == 0 { + // Reconstruct full Talos YAML from spec.machine and spec.cluster sections. + // Both sections are stored as unstructured JSON in the CR; unmarshal and + // marshal back to produce a valid Talos v1alpha1 machineconfig YAML. + mcBytes, err := reconstructMachineConfigYAML(mcObj.Object) + if err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - fmt.Sprintf("MachineConfig Secret %s/%s has no data key %q or %q", secretNS, secretName, machineConfigSyncDataKey, machineConfigSyncDataKeyYAML)), nil - } - - // Decompress if the secret was stored with gzip compression. RECON-F5. - // Compiler-generated secrets are not gzip-compressed (usingYAMLKey == true). - if !usingYAMLKey && secret.Labels[machineConfigCompressionLabel] == "gzip" { - decompressed, dcErr := decompressMachineConfig(mcBytes) - if dcErr != nil { - return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - fmt.Sprintf("decompress machineconfig from %s/%s: %v", secretNS, secretName, dcErr)), nil - } - mcBytes = decompressed + fmt.Sprintf("reconstruct machineconfig YAML from CR %s/%s: %v", crNS, crName, err)), nil } // Inject the ONT controlled node label into the machineconfig. @@ -152,30 +134,20 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa nodeIPs = ips } - // singleNodeClass is true when: - // - nodeClass starts with "node-" (per-node class secret), or - // - MC_NODE_IP is set (per-node targeting, compiler secret is already per-node). - // In single-node mode, the secret IS the complete per-node config; skip patch lookup. 
- singleNodeClass := strings.HasPrefix(nodeClass, "node-") || nodeIP != "" - var steps []runnerlib.StepResult if len(nodeIPs) > 0 { - for _, nodeIP := range nodeIPs { - nodeConfig := modifiedConfig - if !singleNodeClass { - nodeConfig = perNodePatchConfig(ctx, params, secretNS, clusterRef, nodeIP, modifiedConfig) - } + for _, ip := range nodeIPs { stepStart := time.Now().UTC() - if err := params.TalosClient.ApplyConfiguration(NodeContext(ctx, nodeIP), nodeConfig, "no-reboot"); err != nil { + if err := params.TalosClient.ApplyConfiguration(NodeContext(ctx, ip), modifiedConfig, "no-reboot"); err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, - fmt.Sprintf("ApplyConfiguration on %s: %v", nodeIP, err)), nil + fmt.Sprintf("ApplyConfiguration on %s: %v", ip, err)), nil } steps = append(steps, runnerlib.StepResult{ - Name: "machineconfig-sync-" + nodeIP, + Name: "machineconfig-sync-" + ip, Status: runnerlib.ResultSucceeded, StartedAt: stepStart, CompletedAt: time.Now().UTC(), - Message: fmt.Sprintf("machineconfig applied to %s (nodeClass=%s)", nodeIP, nodeClass), + Message: fmt.Sprintf("machineconfig applied to %s (nodeClass=%s)", ip, nodeClass), }) } } else { @@ -202,67 +174,29 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa }, nil } -// decompressMachineConfig gunzips gzip-compressed machineconfig bytes. RECON-F5. -func decompressMachineConfig(compressed []byte) ([]byte, error) { - r, err := gzip.NewReader(bytes.NewReader(compressed)) - if err != nil { - return nil, fmt.Errorf("gzip.NewReader: %w", err) - } - defer r.Close() - out, err := io.ReadAll(r) - if err != nil { - return nil, fmt.Errorf("read decompressed: %w", err) - } - return out, nil -} - -// perNodePatchConfig looks up any per-node patch secret for the Kubernetes node -// whose InternalIP matches nodeIP, then merges it with baseConfig. 
The ONT controlled -// label is always re-injected after merging so it cannot be overridden by a patch. -// Returns baseConfig unchanged when no per-node secret exists or any step fails. RECON-A8. -func perNodePatchConfig(ctx context.Context, params ExecuteParams, secretNS, clusterRef, nodeIP string, baseConfig []byte) []byte { - nodeList, err := params.KubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return baseConfig - } - - var hostname string - for i := range nodeList.Items { - node := &nodeList.Items[i] - for _, addr := range node.Status.Addresses { - if string(addr.Type) == "InternalIP" && addr.Address == nodeIP { - hostname = node.Name - break - } - } - if hostname != "" { - break - } +// reconstructMachineConfigYAML builds a Talos v1alpha1 machineconfig YAML document +// from a MachineConfig CR's unstructured object. The spec.machine and spec.cluster +// sections are extracted and merged into a single top-level map. +// +// Returns an error when neither section is present (empty CR is not applicable). 
+func reconstructMachineConfigYAML(obj map[string]interface{}) ([]byte, error) { + spec, _ := obj["spec"].(map[string]interface{}) + if spec == nil { + return nil, fmt.Errorf("MachineConfig CR has no spec") } - if hostname == "" { - return baseConfig + combined := map[string]interface{}{} + if m := spec["machine"]; m != nil { + combined["machine"] = m } - - patchSecretName := machineConfigSyncSecretName(clusterRef, "node-"+hostname) - patchSecret, pErr := params.KubeClient.CoreV1().Secrets(secretNS).Get(ctx, patchSecretName, metav1.GetOptions{}) - if pErr != nil { - return baseConfig + if c := spec["cluster"]; c != nil { + combined["cluster"] = c } - patchBytes := patchSecret.Data[machineConfigSyncDataKey] - if len(patchBytes) == 0 { - return baseConfig + if len(combined) == 0 { + return nil, fmt.Errorf("MachineConfig CR spec has neither machine nor cluster section") } - - merged, mergeErr := mergeYAMLPatch(baseConfig, patchBytes) - if mergeErr != nil { - return baseConfig - } - - // Re-inject the ONT controlled label: it must never be overridden by a per-node patch. 
- labelPatch := []byte(fmt.Sprintf(`{"machine":{"nodeLabels":{%q:"true"}}}`, ontControlledLabel)) - result, lErr := mergeYAMLPatch(merged, labelPatch) - if lErr != nil { - return merged + data, err := sigsyaml.Marshal(combined) + if err != nil { + return nil, fmt.Errorf("marshal reconstructed machineconfig: %w", err) } - return result + return data, nil } diff --git a/internal/capability/platform_machineconfig_sync_test.go b/internal/capability/platform_machineconfig_sync_test.go index c5802ad..87af176 100644 --- a/internal/capability/platform_machineconfig_sync_test.go +++ b/internal/capability/platform_machineconfig_sync_test.go @@ -1,25 +1,24 @@ package capability import ( - "bytes" - "compress/gzip" "context" - "fmt" "io" "os" "testing" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + fakedyn "k8s.io/client-go/dynamic/fake" "github.com/ontai-dev/conductor-sdk/runnerlib" ) // stubApplyTalosClient records ApplyConfiguration calls and exposes applied bytes. 
type stubApplyTalosClient struct { - applied [][]byte - applyErr error + applied [][]byte + applyErr error } func (s *stubApplyTalosClient) Bootstrap(_ context.Context) error { return nil } @@ -28,45 +27,106 @@ func (s *stubApplyTalosClient) ApplyConfiguration(_ context.Context, cfg []byte, return s.applyErr } func (s *stubApplyTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubApplyTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubApplyTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubApplyTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } -func (s *stubApplyTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } -func (s *stubApplyTalosClient) EtcdDefragment(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) GetMachineConfig(_ context.Context) ([]byte, error) { return nil, nil } -func (s *stubApplyTalosClient) Kubeconfig(_ context.Context) ([]byte, error) { return nil, nil } -func (s *stubApplyTalosClient) Nodes() []string { return nil } -func (s *stubApplyTalosClient) Rollback(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) WipeDisk(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) Health(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) Close() error { return nil } - -// mcSyncTestSecret builds a Kubernetes Secret for the machineconfig-sync capability tests. 
-func mcSyncTestSecret(clusterRef, nodeClass string, content []byte) *corev1.Secret { - return &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "seam-mc-" + clusterRef + "-" + nodeClass, - Namespace: "seam-tenant-" + clusterRef, - }, - Data: map[string][]byte{ - "machineconfig": content, - }, +func (s *stubApplyTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } +func (s *stubApplyTalosClient) EtcdDefragment(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) GetMachineConfig(_ context.Context) ([]byte, error) { return nil, nil } +func (s *stubApplyTalosClient) Kubeconfig(_ context.Context) ([]byte, error) { return nil, nil } +func (s *stubApplyTalosClient) Nodes() []string { return nil } +func (s *stubApplyTalosClient) Rollback(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) WipeDisk(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Health(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Close() error { return nil } + +// buildMCSyncScheme returns a runtime.Scheme with MachineConfig and MachineConfigList registered. +func buildMCSyncScheme() *runtime.Scheme { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + return scheme +} + +// buildMCSyncDynClient returns a fake DynamicClient with one MachineConfig CR seeded. +// machineSection and clusterSection are the spec.machine and spec.cluster content +// stored as unstructured Go maps (matching the CR's unstructured representation). 
+func buildMCSyncDynClient(clusterRef, nodeClass string, machineSection, clusterSection map[string]interface{}) *fakedyn.FakeDynamicClient { + cr := &unstructured.Unstructured{} + cr.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + cr.SetName(machineConfigSyncCRName(clusterRef, nodeClass)) + cr.SetNamespace(machineConfigSyncCRNamespace(clusterRef)) + + spec := map[string]interface{}{ + "role": "controlplane", + "order": int64(1), + "clusterRef": map[string]interface{}{"name": clusterRef}, + "nodeIP": "10.20.0.11", + "nodeHostname": nodeClass, + } + if machineSection != nil { + spec["machine"] = machineSection + } + if clusterSection != nil { + spec["cluster"] = clusterSection + } + cr.Object["spec"] = spec + + return fakedyn.NewSimpleDynamicClient(buildMCSyncScheme(), cr) +} + +// writeFakeTalosconfig writes a minimal talosconfig YAML to a temp file and returns +// its path. The config uses ctx.nodes so EndpointsFromTalosconfig returns nodeIPs directly. +func writeFakeTalosconfig(t *testing.T, nodeIPs []string) string { + t.Helper() + var nodesYAML string + for _, ip := range nodeIPs { + nodesYAML += " - " + ip + "\n" } + content := "context: default\ncontexts:\n default:\n endpoints: []\n nodes:\n" + nodesYAML + f, err := os.CreateTemp(t.TempDir(), "talosconfig-*.yaml") + if err != nil { + t.Fatalf("create temp talosconfig: %v", err) + } + if _, err := f.WriteString(content); err != nil { + t.Fatalf("write talosconfig: %v", err) + } + _ = f.Close() + return f.Name() +} + +// containsString is a simple string containment check for test use only. 
+func containsString(s, sub string) bool { + return len(s) >= len(sub) && (s == sub || len(sub) == 0 || + func() bool { + for i := 0; i <= len(s)-len(sub); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false + }()) } // TestMachineConfigSyncHandler_MissingEnvVar verifies that a ValidationFailure is // returned when MC_NODE_CLASS is absent from the environment. func TestMachineConfigSyncHandler_MissingEnvVar(t *testing.T) { - // Ensure MC_NODE_CLASS is not set. t.Setenv(envMCSyncNodeClass, "") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", + Capability: runnerlib.CapabilityMachineConfigSync, + ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: &stubApplyTalosClient{}, - KubeClient: fake.NewSimpleClientset(), + TalosClient: &stubApplyTalosClient{}, + DynamicClient: buildMCSyncDynClient("ccs-mgmt", "", nil, nil), }, }) if err != nil { @@ -81,9 +141,9 @@ func TestMachineConfigSyncHandler_MissingEnvVar(t *testing.T) { } // TestMachineConfigSyncHandler_NilClients verifies that a ValidationFailure is -// returned when TalosClient or KubeClient is nil. +// returned when TalosClient or DynamicClient is nil. func TestMachineConfigSyncHandler_NilClients(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") + t.Setenv(envMCSyncNodeClass, "cp1") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ @@ -101,18 +161,18 @@ func TestMachineConfigSyncHandler_NilClients(t *testing.T) { } } -// TestMachineConfigSyncHandler_SecretNotFound verifies that a ExecutionFailure is -// returned when the machineconfig Secret is absent from the cluster. 
-func TestMachineConfigSyncHandler_SecretNotFound(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") +// TestMachineConfigSyncHandler_CRNotFound verifies that an ExecutionFailure is +// returned when the MachineConfig CR is absent from the management cluster. +func TestMachineConfigSyncHandler_CRNotFound(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: &stubApplyTalosClient{}, - KubeClient: fake.NewSimpleClientset(), // no secret + TalosClient: &stubApplyTalosClient{}, + DynamicClient: fakedyn.NewSimpleDynamicClient(buildMCSyncScheme()), // no CR seeded }, }) if err != nil { @@ -121,16 +181,24 @@ func TestMachineConfigSyncHandler_SecretNotFound(t *testing.T) { if result.Status != runnerlib.ResultFailed { t.Errorf("expected ResultFailed, got %q", result.Status) } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ExecutionFailure { + t.Errorf("expected ExecutionFailure for missing CR, got %v", result.FailureReason) + } } // TestMachineConfigSyncHandler_AppliesAndInjectsLabel verifies that the handler -// applies the machineconfig to the Talos node and injects the ONT controlled label. +// reads the MachineConfig CR, reconstructs Talos YAML, applies it, and injects +// the ONT controlled node label. 
func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") - machineConfigContent := []byte("machine:\n type: controlplane\n nodeLabels: {}\n") - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", machineConfigContent) - kubeClient := fake.NewSimpleClientset(secret) + machineSection := map[string]interface{}{ + "type": "controlplane", + "nodeLabels": map[string]interface{}{}, + } + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, nil) talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} @@ -138,8 +206,8 @@ func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, + TalosClient: talosClient, + DynamicClient: dynClient, }, }) if err != nil { @@ -152,79 +220,32 @@ func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { } } - // Verify ApplyConfiguration was called once. if len(talosClient.applied) != 1 { t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) } - - // Verify the applied config contains the ONT label. applied := string(talosClient.applied[0]) if !containsString(applied, ontControlledLabel) { t.Errorf("applied config does not contain node label %q:\n%s", ontControlledLabel, applied) } - if len(result.Steps) != 1 { t.Errorf("expected 1 step result, got %d", len(result.Steps)) } } -// containsString is a simple string containment check for test use only. 
-func containsString(s, sub string) bool { - return len(s) >= len(sub) && (s == sub || len(sub) == 0 || - func() bool { - for i := 0; i <= len(s)-len(sub); i++ { - if s[i:i+len(sub)] == sub { - return true - } - } - return false - }()) -} +// TestMachineConfigSyncHandler_ReconstructsBothSections verifies that spec.machine +// and spec.cluster are both present in the reconstructed Talos YAML. +func TestMachineConfigSyncHandler_ReconstructsBothSections(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") -// writeFakeTalosconfig writes a minimal talosconfig YAML to a temp file and returns -// its path. The config uses ctx.nodes so EndpointsFromTalosconfig returns nodeIPs directly. -func writeFakeTalosconfig(t *testing.T, nodeIPs []string) string { - t.Helper() - var nodesYAML string - for _, ip := range nodeIPs { - nodesYAML += fmt.Sprintf(" - %s\n", ip) - } - content := fmt.Sprintf("context: default\ncontexts:\n default:\n endpoints: []\n nodes:\n%s", nodesYAML) - f, err := os.CreateTemp(t.TempDir(), "talosconfig-*.yaml") - if err != nil { - t.Fatalf("create temp talosconfig: %v", err) - } - if _, err := f.WriteString(content); err != nil { - t.Fatalf("write talosconfig: %v", err) + machineSection := map[string]interface{}{ + "type": "controlplane", } - _ = f.Close() - return f.Name() -} - -// TestMachineConfigSyncHandler_DecompressesGzipSecret verifies that when the machineconfig -// secret is gzip-compressed (compression label present), the capability decompresses it -// before applying. RECON-F5. 
-func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - rawContent := []byte("machine:\n type: controlplane\n") - var buf bytes.Buffer - w := mustGzipWriter(t, &buf) - _, _ = w.Write(rawContent) - _ = w.Close() - compressed := buf.Bytes() - - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "seam-mc-ccs-mgmt-controlplane", - Namespace: "seam-tenant-ccs-mgmt", - Labels: map[string]string{ - "platform.ontai.dev/compression": "gzip", - }, - }, - Data: map[string][]byte{"machineconfig": compressed}, + clusterSection := map[string]interface{}{ + "clusterName": "ccs-mgmt", } - kubeClient := fake.NewSimpleClientset(secret) + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, clusterSection) talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} @@ -232,8 +253,8 @@ func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, + TalosClient: talosClient, + DynamicClient: dynClient, }, }) if err != nil { @@ -243,57 +264,38 @@ func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) } applied := string(talosClient.applied[0]) - if !containsString(applied, "type: controlplane") { - t.Errorf("decompressed content not present in applied config:\n%s", applied) + if !containsString(applied, "machine:") { + t.Errorf("applied config missing machine section:\n%s", applied) } -} - -// mustGzipWriter returns a gzip.Writer writing to w. Fatals if creation fails. 
-func mustGzipWriter(t *testing.T, w *bytes.Buffer) *gzip.Writer { - t.Helper() - gw := gzip.NewWriter(w) - return gw -} - -// buildMCSyncNode returns a minimal Kubernetes Node object with the given name and InternalIP. -func buildMCSyncNode(name, ip string) *corev1.Node { - return &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: name}, - Status: corev1.NodeStatus{ - Addresses: []corev1.NodeAddress{ - {Type: corev1.NodeInternalIP, Address: ip}, - }, - }, + if !containsString(applied, "cluster:") { + t.Errorf("applied config missing cluster section:\n%s", applied) } } -// TestMachineConfigSyncHandler_PerNodePatchMerged verifies that when a per-node patch -// secret exists for a node, its content is merged into the base class config. RECON-A8. -func TestMachineConfigSyncHandler_PerNodePatchMerged(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - patchContent := []byte("machine:\n network:\n hostname: cp1\n") +// TestMachineConfigSyncHandler_NodeIPTargetsSingleNode verifies that when MC_NODE_IP +// is set, the capability applies to only that one node and skips talosconfig enumeration. +// PLT-BUG-3-ARCH. +func TestMachineConfigSyncHandler_NodeIPTargetsSingleNode(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - patchSecret := mcSyncTestSecret("ccs-mgmt", "node-cp1", patchContent) - node := buildMCSyncNode("cp1", "10.20.0.2") - kubeClient := fake.NewSimpleClientset(secret, patchSecret, node) + machineSection := map[string]interface{}{"type": "controlplane"} + dynClient := buildMCSyncDynClient("ccs-dev", "cp1", machineSection, nil) + // Provide a talosconfig with multiple nodes -- only MC_NODE_IP should be targeted. 
+ talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12", "10.20.0.13"}) talosClient := &stubApplyTalosClient{} - // Provide a fake talosconfig so the handler enumerates nodeIPs. - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.2"}) - handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", + ClusterRef: "ccs-dev", ExecuteClients: ExecuteClients{ TalosClient: talosClient, - KubeClient: kubeClient, + DynamicClient: dynClient, TalosconfigPath: talosconfigPath, }, }) @@ -303,38 +305,28 @@ func TestMachineConfigSyncHandler_PerNodePatchMerged(t *testing.T) { if result.Status != runnerlib.ResultSucceeded { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } + // Must apply to exactly 1 node (not 3). if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + t.Fatalf("expected 1 ApplyConfiguration call (MC_NODE_IP single-target), got %d", len(talosClient.applied)) } - applied := string(talosClient.applied[0]) - if !containsString(applied, "hostname: cp1") { - t.Errorf("per-node patch hostname not merged into applied config:\n%s", applied) + if len(result.Steps) != 1 { + t.Errorf("expected 1 step result, got %d", len(result.Steps)) } - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT controlled label missing from merged config:\n%s", applied) + if !containsString(result.Steps[0].Message, "10.20.0.11") { + t.Errorf("step message must reference nodeIP 10.20.0.11, got %q", result.Steps[0].Message) } } -// TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel verifies that a per-node -// patch cannot override the ontControlledLabel (protected field). RECON-A8. 
-func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - // Patch explicitly tries to remove/override the ONT label. - patchContent := []byte(`machine: - nodeLabels: - ont.platform.dev/controlled: "false" - custom-key: custom-val -`) - - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - patchSecret := mcSyncTestSecret("ccs-mgmt", "node-cp2", patchContent) - node := buildMCSyncNode("cp2", "10.20.0.3") - kubeClient := fake.NewSimpleClientset(secret, patchSecret, node) - talosClient := &stubApplyTalosClient{} +// TestMachineConfigSyncHandler_TalosconfigMultipleNodes verifies that when MC_NODE_IP +// is not set but a talosconfig with multiple nodes is provided, the capability applies +// to all enumerated nodes. +func TestMachineConfigSyncHandler_TalosconfigMultipleNodes(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.3"}) + machineSection := map[string]interface{}{"type": "controlplane"} + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, nil) + talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12"}) + talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ @@ -342,7 +334,7 @@ func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ TalosClient: talosClient, - KubeClient: kubeClient, + DynamicClient: dynClient, TalosconfigPath: talosconfigPath, }, }) @@ -352,183 +344,98 @@ func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { if result.Status != runnerlib.ResultSucceeded { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } - if len(talosClient.applied) != 1 { - 
t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) - } - applied := string(talosClient.applied[0]) - // ONT label must be "true" (re-injected after merge). - if !containsString(applied, `ont.platform.dev/controlled: "true"`) { - t.Errorf("ONT controlled label not protected; applied config:\n%s", applied) - } - // Per-node patch content should also be present. - if !containsString(applied, "custom-key") { - t.Errorf("per-node patch custom label missing from merged config:\n%s", applied) - } -} - -// TestMachineConfigSyncHandler_SingleNodeClass verifies that when nodeClass starts with -// "node-", no additional per-node patch lookup is performed and the base config is applied -// directly. RECON-A8. -func TestMachineConfigSyncHandler_SingleNodeClass(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "node-cp1") - - nodeContent := []byte("machine:\n type: controlplane\n network:\n hostname: cp1\n") - secret := mcSyncTestSecret("ccs-mgmt", "node-cp1", nodeContent) - kubeClient := fake.NewSimpleClientset(secret) - talosClient := &stubApplyTalosClient{} - - handler := &machineConfigSyncHandler{} - result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", - ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, - }, - }) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) - } - if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) - } - applied := string(talosClient.applied[0]) - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT controlled label missing in single-node-class apply:\n%s", applied) + if len(talosClient.applied) != 2 { + t.Errorf("expected 2 ApplyConfiguration calls (one per node), got %d", 
len(talosClient.applied)) } } -// TestMachineConfigSyncHandler_NodeIPTargetsSingleNode verifies that when MC_NODE_IP -// is set, the capability applies the machineconfig to only that one node and skips -// talosconfig endpoint enumeration. PLT-BUG-3-ARCH. -func TestMachineConfigSyncHandler_NodeIPTargetsSingleNode(t *testing.T) { +// TestMachineConfigSyncHandler_CREmptySpecFails verifies that a ValidationFailure +// is returned when the MachineConfig CR spec has neither machine nor cluster section. +func TestMachineConfigSyncHandler_CREmptySpecFails(t *testing.T) { t.Setenv(envMCSyncNodeClass, "cp1") t.Setenv(envMCSyncNodeIP, "10.20.0.11") defer t.Setenv(envMCSyncNodeIP, "") - content := []byte("version: v1alpha1\nmachine:\n type: controlplane\n") - secret := mcSyncTestSecret("ccs-dev", "cp1", content) - // Provide a talosconfig with multiple nodes -- only the MC_NODE_IP node should be targeted. - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12", "10.20.0.13"}) - - kubeClient := fake.NewSimpleClientset(secret) - talosClient := &stubApplyTalosClient{} + // Seed a CR with no machine/cluster sections. 
+ cr := &unstructured.Unstructured{} + cr.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + cr.SetName("seam-mc-ccs-mgmt-cp1") + cr.SetNamespace("seam-tenant-ccs-mgmt") + cr.Object["spec"] = map[string]interface{}{ + "role": "controlplane", + "order": int64(1), + } + dynClient := fakedyn.NewSimpleDynamicClient(buildMCSyncScheme(), cr) handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-dev", + ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, - TalosconfigPath: talosconfigPath, + TalosClient: &stubApplyTalosClient{}, + DynamicClient: dynClient, }, }) if err != nil { t.Fatalf("unexpected error: %v", err) } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) - } - // Must apply to exactly 1 node (not 3). 
- if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 ApplyConfiguration call (MC_NODE_IP single-target), got %d", len(talosClient.applied)) - } - if len(result.Steps) != 1 { - t.Errorf("expected 1 step result, got %d", len(result.Steps)) + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed for empty CR spec, got %q", result.Status) } - if !containsString(result.Steps[0].Message, "10.20.0.11") { - t.Errorf("step message must reference nodeIP 10.20.0.11, got %q", result.Steps[0].Message) + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure for empty CR spec, got %v", result.FailureReason) } } -// TestMachineConfigSyncHandler_YAMLKeyFallback verifies that when the machineconfig -// Secret uses the compiler-generated "machineconfig.yaml" key (no "machineconfig" key), -// the capability still reads and applies the config. PLT-BUG-3-ARCH. -func TestMachineConfigSyncHandler_YAMLKeyFallback(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "cp1") - t.Setenv(envMCSyncNodeIP, "10.20.0.11") - defer t.Setenv(envMCSyncNodeIP, "") - - content := []byte("version: v1alpha1\nmachine:\n type: controlplane\n") - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "seam-mc-ccs-dev-cp1", - Namespace: "seam-tenant-ccs-dev", - Labels: map[string]string{ - "ontai.dev/managed-by": "compiler", - "ontai.dev/cluster": "ccs-dev", +// TestReconstructMachineConfigYAML_BothSections verifies YAML reconstruction from a CR +// with both machine and cluster sections. +func TestReconstructMachineConfigYAML_BothSections(t *testing.T) { + obj := map[string]interface{}{ + "spec": map[string]interface{}{ + "machine": map[string]interface{}{ + "type": "controlplane", + }, + "cluster": map[string]interface{}{ + "clusterName": "ccs-mgmt", }, - }, - Data: map[string][]byte{ - // Only machineconfig.yaml key present -- no "machineconfig" key. 
- "machineconfig.yaml": content, }, } - kubeClient := fake.NewSimpleClientset(secret) - talosClient := &stubApplyTalosClient{} - - handler := &machineConfigSyncHandler{} - result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-dev", - ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, - }, - }) + out, err := reconstructMachineConfigYAML(obj) if err != nil { t.Fatalf("unexpected error: %v", err) } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded with yaml key fallback, got %q; reason: %v", result.Status, result.FailureReason) + s := string(out) + if !containsString(s, "machine:") { + t.Errorf("output missing machine section:\n%s", s) } - if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) - } - if !containsString(string(talosClient.applied[0]), ontControlledLabel) { - t.Errorf("ONT controlled label must be injected even with yaml key fallback") + if !containsString(s, "cluster:") { + t.Errorf("output missing cluster section:\n%s", s) } } -// TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase verifies that when no -// per-node patch secret exists, the base class config is applied unchanged. RECON-A8. -func TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - node := buildMCSyncNode("cp3", "10.20.0.4") - // No per-node patch secret in the fake client. - kubeClient := fake.NewSimpleClientset(secret, node) - talosClient := &stubApplyTalosClient{} - - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.4"}) +// TestReconstructMachineConfigYAML_MissingSpec verifies an error when spec is absent. 
+func TestReconstructMachineConfigYAML_MissingSpec(t *testing.T) { + _, err := reconstructMachineConfigYAML(map[string]interface{}{}) + if err == nil { + t.Error("expected error for missing spec, got nil") + } +} - handler := &machineConfigSyncHandler{} - result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", - ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, - TalosconfigPath: talosconfigPath, +// TestReconstructMachineConfigYAML_EmptySections verifies an error when neither +// machine nor cluster is present in spec. +func TestReconstructMachineConfigYAML_EmptySections(t *testing.T) { + obj := map[string]interface{}{ + "spec": map[string]interface{}{ + "role": "controlplane", }, - }) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) - } - if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) } - applied := string(talosClient.applied[0]) - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT label missing in fallback apply:\n%s", applied) + _, err := reconstructMachineConfigYAML(obj) + if err == nil { + t.Error("expected error for spec with no machine/cluster, got nil") } } + +// metav1 import is used via metav1.GetOptions in the dynamic client calls. 
+var _ = metav1.GetOptions{} diff --git a/internal/capability/platform_machineconfig_test.go b/internal/capability/platform_machineconfig_test.go index 66b2719..3c717d8 100644 --- a/internal/capability/platform_machineconfig_test.go +++ b/internal/capability/platform_machineconfig_test.go @@ -23,7 +23,8 @@ type stubTalosClientMC struct { func (s *stubTalosClientMC) Bootstrap(_ context.Context) error { return nil } func (s *stubTalosClientMC) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubTalosClientMC) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubTalosClientMC) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientMC) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientMC) RebootPowercycle(_ context.Context) error { return nil } func (s *stubTalosClientMC) Reset(_ context.Context, _ bool) error { return nil } func (s *stubTalosClientMC) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubTalosClientMC) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } @@ -302,7 +303,8 @@ func (s *stubTalosClientRestore) ApplyConfiguration(_ context.Context, _ []byte, return s.applyErr } func (s *stubTalosClientRestore) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubTalosClientRestore) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientRestore) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientRestore) RebootPowercycle(_ context.Context) error { return nil } func (s *stubTalosClientRestore) Reset(_ context.Context, _ bool) error { return nil } func (s *stubTalosClientRestore) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubTalosClientRestore) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_node_scaleup_test.go b/internal/capability/platform_node_scaleup_test.go 
index cb1cabf..7e3d02e 100644 --- a/internal/capability/platform_node_scaleup_test.go +++ b/internal/capability/platform_node_scaleup_test.go @@ -37,7 +37,8 @@ func (s *stubEnrollTalosClient) ApplyConfiguration(_ context.Context, cfg []byte return s.applyErr } func (s *stubEnrollTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubEnrollTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubEnrollTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubEnrollTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubEnrollTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubEnrollTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubEnrollTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_postop_test.go b/internal/capability/platform_postop_test.go index 332a003..26432ce 100644 --- a/internal/capability/platform_postop_test.go +++ b/internal/capability/platform_postop_test.go @@ -18,7 +18,8 @@ type mockTalosPostOp struct { func (m *mockTalosPostOp) Bootstrap(ctx context.Context) error { return nil } func (m *mockTalosPostOp) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (m *mockTalosPostOp) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (m *mockTalosPostOp) Reboot(_ context.Context) error { return nil } +func (m *mockTalosPostOp) Reboot(_ context.Context) error { return nil } +func (m *mockTalosPostOp) RebootPowercycle(_ context.Context) error { return nil } func (m *mockTalosPostOp) Reset(_ context.Context, _ bool) error { return nil } func (m *mockTalosPostOp) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (m *mockTalosPostOp) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_upgrade.go 
b/internal/capability/platform_upgrade.go index 9a94507..fb3cd0a 100644 --- a/internal/capability/platform_upgrade.go +++ b/internal/capability/platform_upgrade.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "log/slog" + "sort" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -116,6 +117,40 @@ func clearUpgradeProgress(ctx context.Context, dynClient dynamic.Interface, ns, } } +// nodesFromMachineConfigCRs lists all MachineConfig CRs in seam-tenant-{clusterRef}, +// sorts them by spec.order ascending (lower ordinal upgrades first), and returns the +// ordered slice of nodeIP strings. Returns nil when no CRs are found or on list error +// so the caller can fall back to talosconfig endpoint enumeration. +func nodesFromMachineConfigCRs(ctx context.Context, dynClient dynamic.Interface, ns string) []string { + crList, err := dynClient.Resource(machineConfigGVR).Namespace(ns).List(ctx, metav1.ListOptions{}) + if err != nil || len(crList.Items) == 0 { + return nil + } + type nodeEntry struct { + order int64 + nodeIP string + } + entries := make([]nodeEntry, 0, len(crList.Items)) + for _, item := range crList.Items { + spec, _ := item.Object["spec"].(map[string]interface{}) + if spec == nil { + continue + } + ip, _, _ := unstructuredString(spec, "nodeIP") + if ip == "" { + continue + } + order, _ := spec["order"].(int64) + entries = append(entries, nodeEntry{order: order, nodeIP: ip}) + } + sort.Slice(entries, func(i, j int) bool { return entries[i].order < entries[j].order }) + ips := make([]string, len(entries)) + for i, e := range entries { + ips[i] = e.nodeIP + } + return ips +} + // talosUpgradeHandler implements the talos-upgrade named capability. 
// Performs a rolling sequential upgrade of all nodes: each node is upgraded // with stage=false (immediate reboot), then we wait for it to return healthy @@ -130,13 +165,18 @@ func (h *talosUpgradeHandler) Execute(ctx context.Context, params ExecuteParams) "talos-upgrade requires TalosClient and DynamicClient"), nil } - nodes := params.TalosClient.Nodes() + ns := tenantNamespace(params.ClusterRef) + + // Use MachineConfig CRs sorted by spec.order as the canonical node iteration order. + // Falls back to talosconfig endpoint enumeration when no CRs are present. + nodes := nodesFromMachineConfigCRs(ctx, params.DynamicClient, ns) + if len(nodes) == 0 { + nodes = params.TalosClient.Nodes() + } if len(nodes) == 0 { return failureResult(runnerlib.CapabilityTalosUpgrade, now, runnerlib.ValidationFailure, - "talos-upgrade: no nodes available from talosconfig"), nil + "talos-upgrade: no nodes available from MachineConfig CRs or talosconfig"), nil } - - ns := tenantNamespace(params.ClusterRef) crList, err := params.DynamicClient.Resource(upgradePolicyGVR).Namespace(ns). 
List(ctx, metav1.ListOptions{}) if err != nil { @@ -220,11 +260,11 @@ func (h *talosUpgradeHandler) Execute(ctx context.Context, params ExecuteParams) fmt.Sprintf("stage upgrade node %s to %s: %v", nodeIP, upgradeImage, uErr)), nil } - if rErr := params.TalosClient.Reboot(nodeCtx); rErr != nil { - slog.Info("talos-upgrade: forced reboot failed", + if rErr := params.TalosClient.RebootPowercycle(nodeCtx); rErr != nil { + slog.Info("talos-upgrade: powercycle reboot failed", slog.String("node", nodeIP), slog.String("error", rErr.Error())) return failureResult(runnerlib.CapabilityTalosUpgrade, now, runnerlib.ExecutionFailure, - fmt.Sprintf("reboot node %s after staging upgrade to %s: %v", nodeIP, upgradeImage, rErr)), nil + fmt.Sprintf("powercycle reboot node %s after staging upgrade to %s: %v", nodeIP, upgradeImage, rErr)), nil } slog.Info("talos-upgrade: upgrade staged and reboot forced, waiting for node reboot", diff --git a/internal/capability/platform_upgrade_test.go b/internal/capability/platform_upgrade_test.go index c3b227a..c6d7be3 100644 --- a/internal/capability/platform_upgrade_test.go +++ b/internal/capability/platform_upgrade_test.go @@ -2,6 +2,7 @@ package capability import ( "context" + "fmt" "io" "testing" @@ -11,20 +12,25 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" fakedyn "k8s.io/client-go/dynamic/fake" "k8s.io/client-go/kubernetes/fake" + + "github.com/ontai-dev/conductor-sdk/runnerlib" ) // stubUpgradeTalosClient is a TalosNodeClient stub for upgrade tests. // Health returns healthErr on every call (nil = node is healthy). +// powercycleCalled records whether RebootPowercycle was called (for Phase 4b test assertions). 
type stubUpgradeTalosClient struct { - nodes []string - healthErr error - upgradeErr error + nodes []string + healthErr error + upgradeErr error + powercycleCalled bool } func (s *stubUpgradeTalosClient) Bootstrap(_ context.Context) error { return nil } func (s *stubUpgradeTalosClient) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubUpgradeTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return s.upgradeErr } func (s *stubUpgradeTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubUpgradeTalosClient) RebootPowercycle(_ context.Context) error { s.powercycleCalled = true; return nil } func (s *stubUpgradeTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubUpgradeTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubUpgradeTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } @@ -270,3 +276,131 @@ func TestWaitForNodeReboot_KubeNotReadyReturnsError(t *testing.T) { t.Error("expected error when Kubernetes node remains NotReady, got nil") } } + +// ── nodesFromMachineConfigCRs ──────────────────────────────────────────────── + +// buildUpgradeDynClientWithMachineConfigs returns a fake DynamicClient containing +// both an UpgradePolicy CR and a set of MachineConfig CRs for order-based iteration tests. +// nodes is a slice of (nodeIP, order) pairs; the function creates one MachineConfig CR per entry. 
+func buildUpgradeDynClientWithMachineConfigs(clusterRef, policyName, upgradeType, targetVersion string, nodes []struct{ ip string; order int64 }) *fakedyn.FakeDynamicClient { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicy", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicyList", + }, &unstructured.UnstructuredList{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + + ns := "seam-tenant-" + clusterRef + + policy := &unstructured.Unstructured{} + policy.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicy", + }) + policy.SetName(policyName) + policy.SetNamespace(ns) + specVersionKey := "targetTalosVersion" + if upgradeType == "kubernetes" { + specVersionKey = "targetKubernetesVersion" + } + policy.Object["spec"] = map[string]interface{}{ + "upgradeType": upgradeType, + specVersionKey: targetVersion, + } + + objs := []runtime.Object{policy} + for i, n := range nodes { + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + mc.SetName(fmt.Sprintf("seam-mc-%s-node%d", clusterRef, i)) + mc.SetNamespace(ns) + mc.Object["spec"] = map[string]interface{}{ + "nodeIP": n.ip, + "order": n.order, + } + objs = append(objs, mc) + } + return fakedyn.NewSimpleDynamicClient(scheme, objs...) 
+} + +// TestNodesFromMachineConfigCRs_SortsAscendingByOrder verifies that +// nodesFromMachineConfigCRs returns nodeIPs sorted by spec.order ascending. +// Phase 4b: upgrade order is driven by MachineConfig CR spec.order. +func TestNodesFromMachineConfigCRs_SortsAscendingByOrder(t *testing.T) { + // Intentionally seed CRs out of order: 2, 0, 1 -- expect 0, 1, 2 back. + dyn := buildUpgradeDynClientWithMachineConfigs("ccs-dev", "up-mc", "talos", "v1.10.0", []struct{ ip string; order int64 }{ + {ip: "10.20.0.12", order: 2}, + {ip: "10.20.0.10", order: 0}, + {ip: "10.20.0.11", order: 1}, + }) + ns := "seam-tenant-ccs-dev" + + got := nodesFromMachineConfigCRs(context.Background(), dyn, ns) + want := []string{"10.20.0.10", "10.20.0.11", "10.20.0.12"} + if len(got) != len(want) { + t.Fatalf("got %d nodes, want %d: %v", len(got), len(want), got) + } + for i, ip := range want { + if got[i] != ip { + t.Errorf("node[%d]: got %q, want %q", i, got[i], ip) + } + } +} + +// TestNodesFromMachineConfigCRs_EmptyWhenNoCRs verifies that nodesFromMachineConfigCRs +// returns nil when no MachineConfig CRs exist, so the caller can fall back. +func TestNodesFromMachineConfigCRs_EmptyWhenNoCRs(t *testing.T) { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + dyn := fakedyn.NewSimpleDynamicClient(scheme) + + got := nodesFromMachineConfigCRs(context.Background(), dyn, "seam-tenant-ccs-dev") + if len(got) != 0 { + t.Errorf("expected nil/empty, got %v", got) + } +} + +// TestTalosUpgrade_PowercycleCalledAfterStage verifies that after staging the Talos +// upgrade, the handler calls RebootPowercycle (not plain Reboot) on each node. 
+// Phase 4b: hardware powercycle ensures clean BIOS/UEFI re-initialisation post-upgrade. +func TestTalosUpgrade_PowercycleCalledAfterStage(t *testing.T) { + defer setRebootTimeouts()() + + talos := &stubUpgradeTalosClient{ + nodes: []string{"10.20.0.10"}, + } + dyn := buildUpgradeDynClientWithMachineConfigs("ccs-dev", "up-pow", "talos", "v1.10.0", []struct{ ip string; order int64 }{ + {ip: "10.20.0.10", order: 0}, + }) + + handler := &talosUpgradeHandler{} + result, err := handler.Execute(context.Background(), ExecuteParams{ + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TalosClient: talos, + DynamicClient: dyn, + }, + }) + if err != nil { + t.Fatalf("Execute error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded, got %q: %v", result.Status, result.Steps) + } + if !talos.powercycleCalled { + t.Error("expected RebootPowercycle to be called; it was not") + } +} diff --git a/test/unit/capability/platform_test.go b/test/unit/capability/platform_test.go index 45dc17e..dca2dd7 100644 --- a/test/unit/capability/platform_test.go +++ b/test/unit/capability/platform_test.go @@ -36,6 +36,7 @@ var platformKindToResource = map[string]string{ "PKIRotation": "pkirotations", "ClusterReset": "clusterresets", "HardeningProfile": "hardeningprofiles", + "MachineConfig": "machineconfigs", } // seamKindToResource maps seam.ontai.dev Kind names to GVR resources. 
@@ -160,6 +161,10 @@ func (s *stubTalosClient) Reboot(_ context.Context) error { s.rebootCalled = true return s.rebootErr } +func (s *stubTalosClient) RebootPowercycle(_ context.Context) error { + s.rebootCalled = true + return s.rebootErr +} func (s *stubTalosClient) Reset(_ context.Context, _ bool) error { s.resetCalled = true return s.resetErr From 46cd143acd02b813228e0ad6ba948718c3fcd194 Mon Sep 17 00:00:00 2001 From: ontave Date: Mon, 1 Jun 2026 08:45:03 +0200 Subject: [PATCH 13/15] fix(conductor): machineconfig-sync reconstruct adds version/debug/persist header fields Talos v1alpha1 machineconfig requires top-level version, debug, persist fields. Omitting them caused 'this config change can't be applied in immediate mode' when Talos diffed the incoming config against the running one. --- internal/capability/platform_machineconfig_sync.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/capability/platform_machineconfig_sync.go b/internal/capability/platform_machineconfig_sync.go index 9684811..fb54c9c 100644 --- a/internal/capability/platform_machineconfig_sync.go +++ b/internal/capability/platform_machineconfig_sync.go @@ -184,14 +184,18 @@ func reconstructMachineConfigYAML(obj map[string]interface{}) ([]byte, error) { if spec == nil { return nil, fmt.Errorf("MachineConfig CR has no spec") } - combined := map[string]interface{}{} + combined := map[string]interface{}{ + "version": "v1alpha1", + "debug": false, + "persist": true, + } if m := spec["machine"]; m != nil { combined["machine"] = m } if c := spec["cluster"]; c != nil { combined["cluster"] = c } - if len(combined) == 0 { + if spec["machine"] == nil && spec["cluster"] == nil { return nil, fmt.Errorf("MachineConfig CR spec has neither machine nor cluster section") } data, err := sigsyaml.Marshal(combined) From c34e656b42ea1c5087a071dfc731006b855f9552 Mon Sep 17 00:00:00 2001 From: ontave Date: Mon, 1 Jun 2026 09:25:36 +0200 Subject: [PATCH 14/15] fix(conductor): 
SeamMembership ont-system principal + NodeRegistrationDrift auto-resolution identity.go: correct PrincipalRef from seam-system to ont-system (conductor runs in ont-system; seam-system caused DomainIdentityMismatch / Validated=False on SeamMembership seam-conductor). cluster_node_health_loop.go: add resolveNodeRegistrationDrift() -- after each checkNodeRegistration pass, patches any DriftSignal of kind NodeRegistrationDrift to state=resolved for nodes that now carry the ont.platform.dev/controlled=true label. No DriftSignalController exists in seam; signals must be resolved by the component that detected them once the condition clears. Tests: assertion on ont-system principalRef; three unit tests for resolve (label present, label absent, already resolved). --- internal/agent/cluster_node_health_loop.go | 60 ++++++++++ .../agent/cluster_node_health_loop_test.go | 106 ++++++++++++++++++ internal/identity/identity.go | 2 +- internal/identity/identity_test.go | 4 + 4 files changed, 171 insertions(+), 1 deletion(-) diff --git a/internal/agent/cluster_node_health_loop.go b/internal/agent/cluster_node_health_loop.go index 21640ec..ea669e5 100644 --- a/internal/agent/cluster_node_health_loop.go +++ b/internal/agent/cluster_node_health_loop.go @@ -220,6 +220,8 @@ func (l *ClusterNodeHealthLoop) checkOnce(ctx context.Context) { // Detect nodes missing the ONT enrollment label. RECON-C2. l.checkNodeRegistration(ctx, nodes) + // Resolve any NodeRegistrationDrift signals for nodes that now have the label. + l.resolveNodeRegistrationDrift(ctx, nodes) // Check CPU/memory utilisation against the CapacitySaturation threshold. RECON-C6. l.checkCapacitySaturation(ctx, nodes) @@ -823,6 +825,64 @@ func (l *ClusterNodeHealthLoop) checkNodeRegistration(ctx context.Context, nodes } } +// resolveNodeRegistrationDrift patches NodeRegistrationDrift DriftSignals to +// state=resolved when the affected node now carries ont.platform.dev/controlled=true.
+// Called on every checkOnce cycle after checkNodeRegistration. RECON-C2. +func (l *ClusterNodeHealthLoop) resolveNodeRegistrationDrift(ctx context.Context, nodes []map[string]interface{}) { + ns := "seam-tenant-" + l.clusterRef + + // Build a set of node names that now have the controlled label. + controlled := make(map[string]bool, len(nodes)) + for _, n := range nodes { + meta, _ := n["metadata"].(map[string]interface{}) + if meta == nil { + continue + } + name, _ := meta["name"].(string) + if name == "" { + continue + } + labels, _ := meta["labels"].(map[string]interface{}) + if v, ok := labels["ont.platform.dev/controlled"]; ok && v == "true" { + controlled[name] = true + } + } + + // List all DriftSignals of kind NodeRegistrationDrift in the tenant namespace. + list, err := l.localClient.Resource(driftSignalGVR).Namespace(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return + } + + patchBytes := []byte(`{"spec":{"state":"resolved"}}`) + for _, item := range list.Items { + spec, _ := item.Object["spec"].(map[string]interface{}) + if spec == nil { + continue + } + if spec["signalKind"] != "NodeRegistrationDrift" { + continue + } + if spec["state"] == "resolved" { + continue + } + affectedCRRef, _ := spec["affectedCRRef"].(map[string]interface{}) + nodeName, _ := affectedCRRef["name"].(string) + if nodeName == "" || !controlled[nodeName] { + continue + } + if _, err := l.localClient.Resource(driftSignalGVR).Namespace(ns).Patch( + ctx, item.GetName(), types.MergePatchType, patchBytes, metav1.PatchOptions{}, + ); err != nil { + fmt.Printf("cluster node health loop: cluster=%q resolveNodeRegistrationDrift patch %s: %v\n", + l.clusterRef, item.GetName(), err) + continue + } + fmt.Printf("cluster node health loop: cluster=%q node=%q NodeRegistrationDrift DriftSignal %s resolved\n", + l.clusterRef, nodeName, item.GetName()) + } +} + // checkCapacitySaturation queries the metrics-server NodeMetrics API and compares // CPU and memory usage to each node's 
allocatable capacity. Nodes above // capacitySaturationThresholdPct for capacitySaturationConsecutiveRequired diff --git a/internal/agent/cluster_node_health_loop_test.go b/internal/agent/cluster_node_health_loop_test.go index dfeffa2..e9c58ae 100644 --- a/internal/agent/cluster_node_health_loop_test.go +++ b/internal/agent/cluster_node_health_loop_test.go @@ -902,3 +902,109 @@ func TestEmitTier3DriftSignal_CreatesSignalWithLossScope(t *testing.T) { t.Errorf("lossScope.severity = %v, want quorum-at-risk", ls["severity"]) } } + +// --------------------------------------------------------------------------- +// resolveNodeRegistrationDrift: auto-resolution when controlled label present +// --------------------------------------------------------------------------- + +func TestResolveNodeRegistrationDrift_ResolvesWhenLabelPresent(t *testing.T) { + // Create a DriftSignal in "pending" state for a node that now has the label. + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "pending", + "driftLayer": "infrastructure", + "correlationID": "node-reg-ccs-mgmt-cp1", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "ont-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + // Node now has the controlled label. 
+ nodeControlled := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ + "node-role.kubernetes.io/control-plane": "", + "ont.platform.dev/controlled": "true", + }) + + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeControlled.Object}) + + list, err := dynClient.Resource(driftSignalGVR).Namespace("seam-tenant-ccs-mgmt").List( + context.Background(), metav1.ListOptions{}, + ) + if err != nil { + t.Fatalf("list DriftSignals: %v", err) + } + if len(list.Items) == 0 { + t.Fatal("DriftSignal was deleted; expected it to be patched to resolved") + } + specBytes, _ := json.Marshal(list.Items[0].Object["spec"]) + var spec map[string]interface{} + _ = json.Unmarshal(specBytes, &spec) + if spec["state"] != "resolved" { + t.Errorf("state = %q, want resolved", spec["state"]) + } +} + +func TestResolveNodeRegistrationDrift_SkipsWhenLabelAbsent(t *testing.T) { + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "pending", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "ont-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + // Node still lacks the controlled label. 
+ nodeUncontrolled := makeNode("cp1", "10.20.0.2", "True") + + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeUncontrolled.Object}) + + list, _ := dynClient.Resource(driftSignalGVR).Namespace("seam-tenant-ccs-mgmt").List( + context.Background(), metav1.ListOptions{}, + ) + specBytes, _ := json.Marshal(list.Items[0].Object["spec"]) + var spec map[string]interface{} + _ = json.Unmarshal(specBytes, &spec) + if spec["state"] == "resolved" { + t.Error("state was resolved but label was absent; should not have been patched") + } +} + +func TestResolveNodeRegistrationDrift_SkipsAlreadyResolved(t *testing.T) { + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "resolved", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "ont-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + nodeControlled := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ + "ont.platform.dev/controlled": "true", + }) + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + // Should be a no-op -- already resolved. 
+ loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeControlled.Object}) +} diff --git a/internal/identity/identity.go b/internal/identity/identity.go index 1a5caff..3c84ab0 100644 --- a/internal/identity/identity.go +++ b/internal/identity/identity.go @@ -54,7 +54,7 @@ func EnsureSeamMembership(ctx context.Context, c client.Client) error { Spec: seamv1alpha1.SeamMembershipSpec{ AppIdentityRef: id.OperatorName(), DomainIdentityRef: id.OperatorName(), - PrincipalRef: "system:serviceaccount:seam-system:" + id.OperatorName(), + PrincipalRef: "system:serviceaccount:ont-system:" + id.OperatorName(), Tier: "infrastructure", }, } diff --git a/internal/identity/identity_test.go b/internal/identity/identity_test.go index 81dbf8f..02e4144 100644 --- a/internal/identity/identity_test.go +++ b/internal/identity/identity_test.go @@ -92,6 +92,10 @@ func TestEnsureSeamMembership_Creates(t *testing.T) { if sm.Spec.Tier != "infrastructure" { t.Errorf("Tier = %q, want %q", sm.Spec.Tier, "infrastructure") } + wantPrincipal := "system:serviceaccount:ont-system:conductor" + if sm.Spec.PrincipalRef != wantPrincipal { + t.Errorf("PrincipalRef = %q, want %q (conductor runs in ont-system, not seam-system)", sm.Spec.PrincipalRef, wantPrincipal) + } } func TestEnsureSeamMembership_Idempotent(t *testing.T) { From 95fa3eaa8e3f3abb9054b5c66266afdcd017ede9 Mon Sep 17 00:00:00 2001 From: ontave Date: Mon, 1 Jun 2026 13:17:18 +0200 Subject: [PATCH 15/15] fix(conductor): use seam-system for all TalosCluster lookups in health loop TalosCluster CRs live in seam-system, not in the operator namespace (ont-system). All health loop files were using l.namespace (ont-system) for TalosCluster patches, causing "not found" errors that silently dropped: NodeHealthSummary annotation/condition, HumanInterventionRequired status patches, DiskPressure condition, endpoint drift condition, etcd health annotation, PKI expiry reads, capacity saturation condition. 
Fix: import namespaces package and use namespaces.SeamSystem in all five files (cluster_node_health_loop, cluster_pki_expiry, cluster_disk_pressure, cluster_endpoint_drift, cluster_etcd_health). Update test fixtures accordingly (makeTalosCluster / makeTalosClusterWithPKIExpiry namespace arg: seam-system). --- internal/agent/cluster_disk_pressure.go | 4 +- internal/agent/cluster_endpoint_drift.go | 6 ++- internal/agent/cluster_etcd_health.go | 4 +- internal/agent/cluster_node_health_loop.go | 17 ++++---- ...uster_node_health_loop_maintenance_test.go | 18 ++++---- .../agent/cluster_node_health_loop_test.go | 42 +++++++++---------- internal/agent/cluster_pki_expiry.go | 4 +- internal/agent/cluster_pki_expiry_test.go | 14 +++---- 8 files changed, 60 insertions(+), 49 deletions(-) diff --git a/internal/agent/cluster_disk_pressure.go b/internal/agent/cluster_disk_pressure.go index e69870a..5dbe48b 100644 --- a/internal/agent/cluster_disk_pressure.go +++ b/internal/agent/cluster_disk_pressure.go @@ -8,6 +8,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // diskPressureWarnThreshold is the number of consecutive check cycles with DiskPressure=True @@ -114,7 +116,7 @@ func (l *ClusterNodeHealthLoop) setTalosClusterDiskPressure(ctx context.Context, fmt.Printf("disk pressure: cluster=%q marshal condition patch: %v\n", l.clusterRef, err) return } - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("disk pressure: cluster=%q patch DiskPressure condition: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_endpoint_drift.go b/internal/agent/cluster_endpoint_drift.go index 295fa85..42f9235 100644 --- a/internal/agent/cluster_endpoint_drift.go +++
b/internal/agent/cluster_endpoint_drift.go @@ -12,6 +12,8 @@ import ( k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" "gopkg.in/yaml.v3" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // endpointDriftConsecutiveRequired is the number of consecutive checkOnce cycles with @@ -164,7 +166,7 @@ func (l *ClusterNodeHealthLoop) setHumanInterventionRequired(ctx context.Context fmt.Printf("cluster endpoint drift: cluster=%q marshal HumanInterventionRequired patch: %v\n", l.clusterRef, err) return } - _, err = l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + _, err = l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, patchBytes, metav1.PatchOptions{}, "status", ) if err != nil { @@ -197,7 +199,7 @@ func (l *ClusterNodeHealthLoop) emitEndpointDriftSignal(ctx context.Context, old "affectedCRRef": map[string]interface{}{ "group": "seam.ontai.dev", "kind": "TalosCluster", - "namespace": l.namespace, + "namespace": namespaces.SeamSystem, "name": l.clusterRef, }, "driftReason": msg, diff --git a/internal/agent/cluster_etcd_health.go b/internal/agent/cluster_etcd_health.go index e8ad733..cb896ba 100644 --- a/internal/agent/cluster_etcd_health.go +++ b/internal/agent/cluster_etcd_health.go @@ -10,6 +10,8 @@ import ( k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // etcdDegradedThreshold is the number of consecutive cycles a member must be @@ -205,7 +207,7 @@ func (l *ClusterNodeHealthLoop) writeEtcdHealthAnnotation(ctx context.Context, m fmt.Printf("etcd health: cluster=%q marshal annotation: %v\n", l.clusterRef, err) return } - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := 
l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, ); err != nil { fmt.Printf("etcd health: cluster=%q write etcd annotation: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_node_health_loop.go b/internal/agent/cluster_node_health_loop.go index ea669e5..18901d1 100644 --- a/internal/agent/cluster_node_health_loop.go +++ b/internal/agent/cluster_node_health_loop.go @@ -18,6 +18,8 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" sigsyaml "sigs.k8s.io/yaml" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // NodeHealthState classifies a node's health based on Kubernetes node conditions. @@ -443,17 +445,18 @@ func (l *ClusterNodeHealthLoop) writeTalosClusterHealthStatus(ctx context.Contex return fmt.Errorf("marshal TalosCluster health patch: %w", err) } // Metadata patch via merge-patch. - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + tcNS := namespaces.SeamSystem + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(tcNS).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, ); err != nil { - return fmt.Errorf("patch TalosCluster %s/%s metadata: %w", l.namespace, l.clusterRef, err) + return fmt.Errorf("patch TalosCluster %s/%s metadata: %w", tcNS, l.clusterRef, err) } // Status subresource patch. 
statusData, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(tcNS).Patch( ctx, l.clusterRef, types.MergePatchType, statusData, metav1.PatchOptions{}, "status", ); err != nil { - return fmt.Errorf("patch TalosCluster %s/%s status: %w", l.namespace, l.clusterRef, err) + return fmt.Errorf("patch TalosCluster %s/%s status: %w", tcNS, l.clusterRef, err) } return nil } @@ -561,7 +564,7 @@ func (l *ClusterNodeHealthLoop) emitHumanInterventionRequired(ctx context.Contex }, } data, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("cluster node health loop: cluster=%q set HumanInterventionRequired: %v\n", l.clusterRef, err) @@ -612,7 +615,7 @@ func (l *ClusterNodeHealthLoop) emitTier3DriftSignal( "affectedCRRef": map[string]interface{}{ "group": "seam.ontai.dev", "kind": "TalosCluster", - "namespace": l.namespace, + "namespace": namespaces.SeamSystem, "name": l.clusterRef, }, "driftReason": msg, @@ -1023,7 +1026,7 @@ func (l *ClusterNodeHealthLoop) writeCapacitySaturationCondition(ctx context.Con }, } data, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("cluster node health loop: cluster=%q writeCapacitySaturationCondition: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_node_health_loop_maintenance_test.go b/internal/agent/cluster_node_health_loop_maintenance_test.go index 54aa5c9..7fd1e55 100644 --- 
a/internal/agent/cluster_node_health_loop_maintenance_test.go +++ b/internal/agent/cluster_node_health_loop_maintenance_test.go @@ -24,7 +24,7 @@ func TestTwoPhase_UnreachableNodeWithMaintenancePortOpen_ClassifiedAsMaintenance defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -50,7 +50,7 @@ func TestTwoPhase_UnreachableNodeWithPortClosed_RemainsUnreachable(t *testing.T) defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -77,7 +77,7 @@ func TestTwoPhase_ReadyNodeSkipsProbe(t *testing.T) { defer func() { probeMaintenancePortFn = old }() node := makeNode("cp1", "10.20.0.2", "True") // Ready - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -97,7 +97,7 @@ func TestTwoPhase_MaintenanceMode_DoesNotIncrementConsecutiveBad(t *testing.T) { defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := 
buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -129,7 +129,7 @@ func TestTriggerReenrollment_WithPerNodeSecret_DelegatedLevel_CreatesNodeOperati const ns = "seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") // Per-node machineconfig secret. @@ -166,7 +166,7 @@ func TestTriggerReenrollment_WithPerNodeSecret_DelegatedLevel_CreatesNodeOperati func TestTriggerReenrollment_NoSecret_SetsHumanInterventionRequired(t *testing.T) { const clusterRef = "ccs-mgmt" - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") // No machineconfig secrets. dynClient := buildHealthFakeClient(tc, ocObj) @@ -182,7 +182,7 @@ func TestTriggerReenrollment_NoSecret_SetsHumanInterventionRequired(t *testing.T talosClusterGVRTest := schema.GroupVersionResource{ Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "talosclusters", } - tc2, err := dynClient.Resource(talosClusterGVRTest).Namespace("ont-system").Get( + tc2, err := dynClient.Resource(talosClusterGVRTest).Namespace("seam-system").Get( context.Background(), clusterRef, metav1.GetOptions{}) if err != nil { t.Fatalf("get TalosCluster: %v", err) @@ -211,7 +211,7 @@ func TestTriggerReenrollment_LowAutonomyLevel_BlockedFromCreatingNodeOperation(t const ns = "seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") mcSecret := &unstructured.Unstructured{} @@ -245,7 +245,7 @@ func TestTriggerReenrollment_Idempotent_SecondCheckDoesNotDuplicateCR(t *testing const ns = 
"seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") mcSecret := &unstructured.Unstructured{} diff --git a/internal/agent/cluster_node_health_loop_test.go b/internal/agent/cluster_node_health_loop_test.go index e9c58ae..40ee8bd 100644 --- a/internal/agent/cluster_node_health_loop_test.go +++ b/internal/agent/cluster_node_health_loop_test.go @@ -243,7 +243,7 @@ func TestConsecutiveFailureTracking_ResetsOnReady(t *testing.T) { func TestConsecutiveFailureTracking_Increments(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -270,7 +270,7 @@ func TestConsecutiveFailureTracking_Increments(t *testing.T) { func TestTier1Reboot_BlockedByObserveOnly(t *testing.T) { // Create a node that has been Degraded 3 times consecutively. 
node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) @@ -301,7 +301,7 @@ func TestTier1Reboot_BlockedByObserveOnly(t *testing.T) { func TestTier1Reboot_AllowedByDelegated(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) @@ -330,7 +330,7 @@ func TestTier1Reboot_AllowedByDelegated(t *testing.T) { func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { node1 := makeNode("cp1", "10.20.0.2", "False") node2 := makeNode("cp2", "10.20.0.3", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node1, node2, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -340,7 +340,7 @@ func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { {name: "cp2", ip: "10.20.0.3", state: NodeHealthStateDegraded, consecutiveBad: 1}, }, false) - tc2, err := dynClient.Resource(talosClusterGVR).Namespace("ont-system").Get( + tc2, err := dynClient.Resource(talosClusterGVR).Namespace("seam-system").Get( context.Background(), "ccs-mgmt", metav1.GetOptions{}, "status", ) if err != nil { @@ -369,14 +369,14 @@ func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { func TestNodeHealthSummaryAnnotation_Content(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, 
"ccs-mgmt", "ont-system", nil) loop.checkOnce(context.Background()) // Verify the TalosCluster was patched with an annotation. - updated, err := dynClient.Resource(talosClusterGVR).Namespace("ont-system").Get( + updated, err := dynClient.Resource(talosClusterGVR).Namespace("seam-system").Get( context.Background(), "ccs-mgmt", metav1.GetOptions{}, ) if err != nil { @@ -403,7 +403,7 @@ func TestNodeHealthSummaryAnnotation_Content(t *testing.T) { func TestClusterNodeHealthLoop_RunCancelsCleanly(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -449,7 +449,7 @@ func TestCheckNodeRegistration_LabeledNode_NoDriftSignal(t *testing.T) { node := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ "ont.platform.dev/controlled": "true", }) - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -468,7 +468,7 @@ func TestCheckNodeRegistration_LabeledNode_NoDriftSignal(t *testing.T) { func TestCheckNodeRegistration_UnlabeledNode_CreatesDriftSignal(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") // no labels at all - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -495,7 +495,7 @@ func TestCheckNodeRegistration_UnlabeledNode_CreatesDriftSignal(t *testing.T) { func TestCheckNodeRegistration_DuplicateSignal_NotCreated(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", 
"seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -525,7 +525,7 @@ func TestCheckNodeRegistration_DuplicateSignal_NotCreated(t *testing.T) { // metrics-server returns an empty list (no NodeMetrics objects), no saturation is detected. func TestCheckCapacitySaturation_NoMetricsObjects_NoSaturation(t *testing.T) { node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -565,7 +565,7 @@ func deleteNodeMetrics(t *testing.T, dynClient *dynamicfake.FakeDynamicClient, n func TestCheckCapacitySaturation_BelowThreshold_NeverSaturates(t *testing.T) { // 4 CPUs allocatable; 200m usage = 5%. 8Gi allocatable; 400Mi usage = ~5%. node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "200m", "400Mi") @@ -590,7 +590,7 @@ func TestCheckCapacitySaturation_BelowThreshold_NeverSaturates(t *testing.T) { func TestCheckCapacitySaturation_ConsecutiveRequired_SetsCondition(t *testing.T) { // 4 CPUs allocatable; 3800m usage = 95% (above 85% threshold). node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "3800m", "400Mi") // CPU: 95%, mem: ~5% @@ -618,7 +618,7 @@ func TestCheckCapacitySaturation_ConsecutiveRequired_SetsCondition(t *testing.T) // the consecutive counter resets to 0 when utilisation drops below the threshold. 
func TestCheckCapacitySaturation_ConsecutiveResetsOnBelowThreshold(t *testing.T) { node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "3800m", "400Mi") // CPU: 95% @@ -723,7 +723,7 @@ func TestFleetHealthDispatcher_ContextCancellation(t *testing.T) { func TestCheckNodeRegistration_DriftLayerIsInfrastructure(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -830,7 +830,7 @@ func TestCheckNodeRegistration_LossScopePopulated(t *testing.T) { node := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ "node-role.kubernetes.io/control-plane": "", }) - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -863,7 +863,7 @@ func TestCheckNodeRegistration_LossScopePopulated(t *testing.T) { } func TestEmitTier3DriftSignal_CreatesSignalWithLossScope(t *testing.T) { - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -923,7 +923,7 @@ func TestResolveNodeRegistrationDrift_ResolvesWhenLabelPresent(t *testing.T) { }, }, } - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc, existingSignal) // Node now has the controlled label. 
@@ -965,7 +965,7 @@ func TestResolveNodeRegistrationDrift_SkipsWhenLabelAbsent(t *testing.T) { }, }, } - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc, existingSignal) // Node still lacks the controlled label. @@ -998,7 +998,7 @@ func TestResolveNodeRegistrationDrift_SkipsAlreadyResolved(t *testing.T) { }, }, } - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc, existingSignal) nodeControlled := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ diff --git a/internal/agent/cluster_pki_expiry.go b/internal/agent/cluster_pki_expiry.go index 6d787a5..034ac92 100644 --- a/internal/agent/cluster_pki_expiry.go +++ b/internal/agent/cluster_pki_expiry.go @@ -13,6 +13,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // pkiRotationGVR is the GroupVersionResource for PKIRotation CRs. @@ -35,7 +37,7 @@ const pkiExpiryActionThreshold = 7 // approaching, either auto-creates a PKIRotation CR (if AutonomyLevel >= delegated) // or sets HumanInterventionRequired=True on the TalosCluster. RECON-C1. func (l *ClusterNodeHealthLoop) checkPKIExpiry(ctx context.Context) { - tc, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace). + tc, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem). 
Get(ctx, l.clusterRef, metav1.GetOptions{}) if err != nil { return diff --git a/internal/agent/cluster_pki_expiry_test.go b/internal/agent/cluster_pki_expiry_test.go index e250868..50ee56c 100644 --- a/internal/agent/cluster_pki_expiry_test.go +++ b/internal/agent/cluster_pki_expiry_test.go @@ -22,7 +22,7 @@ func makeTalosClusterWithPKIExpiry(name, namespace, expiryRFC3339 string) *unstr // pkiExpiryDate field, checkPKIExpiry exits early without creating any PKIRotation CR. // RECON-C1. func TestCheckPKIExpiry_NoActionWhenNoExpiryDate(t *testing.T) { - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -44,7 +44,7 @@ func TestCheckPKIExpiry_NoActionWhenNoExpiryDate(t *testing.T) { // expiry 60 days in the future does not trigger any action. RECON-C1. func TestCheckPKIExpiry_NoActionWhenFarFromExpiry(t *testing.T) { expiry := time.Now().UTC().Add(60 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -67,7 +67,7 @@ func TestCheckPKIExpiry_NoActionWhenFarFromExpiry(t *testing.T) { // RECON-C1. 
func TestCheckPKIExpiry_LogsOnlyWhenWithin30Days(t *testing.T) { expiry := time.Now().UTC().Add(15 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -89,7 +89,7 @@ func TestCheckPKIExpiry_LogsOnlyWhenWithin30Days(t *testing.T) { // expires in 5 days and AutonomyLevel=delegated, a PKIRotation CR is created. RECON-C1. func TestCheckPKIExpiry_CreatesRotationWhenWithin7DaysAndDelegated(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -118,7 +118,7 @@ func TestCheckPKIExpiry_CreatesRotationWhenWithin7DaysAndDelegated(t *testing.T) // autonomy level also triggers PKIRotation creation within the 7-day window. RECON-C1. func TestCheckPKIExpiry_CreatesRotationWhenFullDelegation(t *testing.T) { expiry := time.Now().UTC().Add(3 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelFullDelegation, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -144,7 +144,7 @@ func TestCheckPKIExpiry_CreatesRotationWhenFullDelegation(t *testing.T) { // RECON-C1. 
func TestCheckPKIExpiry_NoRotationWhenLowAutonomy(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -170,7 +170,7 @@ func TestCheckPKIExpiry_NoRotationWhenLowAutonomy(t *testing.T) { // RECON-C1. func TestCheckPKIExpiry_IdempotentWhenCRAlreadyPending(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(tc, ocObj)