diff --git a/cmd/compiler/addnode.go b/cmd/compiler/addnode.go new file mode 100644 index 0000000..98d885c --- /dev/null +++ b/cmd/compiler/addnode.go @@ -0,0 +1,210 @@ +// addnode.go implements the compiler addnode subcommand for generating a new +// MachineConfig CR for a node being added to an existing cluster. +// +// Usage: +// +// compiler addnode --cluster --hostname --ip --role +// [--order ] [--existing-cr ] --output +// +// When --existing-cr is given, the machine and cluster config sections are +// copied from the specified MachineConfig CR with identity fields overridden. +// When absent, a skeleton CR is emitted with empty machine and cluster stubs. +// +// conductor-schema.md §9. platform-schema.md §9. +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" + + corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/yaml" + + platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" +) + +// addnodeHelp is the authored per-subcommand help for 'compiler addnode'. +const addnodeHelp = `Usage: compiler addnode --cluster --hostname --ip --role --output + [--order ] [--existing-cr ] + +Generate a MachineConfig CR for a node being added to an existing cluster. +The output CR is placed in the --output directory as seam-mc-{cluster}-{hostname}.yaml. + +Flags: + --cluster Cluster name (matches the TalosCluster CR name and seam-tenant-{cluster} namespace). + --hostname Node hostname. The cluster-name prefix is stripped automatically if present, + so both "cp4" and "ccs-dev-cp4" produce seam-mc-{cluster}-cp4. + --ip Node IP address reachable on Talos API port 50000. + --role Node role: controlplane or worker. (init is reserved for compiler bootstrap.) + --order Upgrade sequence order (default: 1). init=0, controlplane=1..N, worker=N+1..M. + --existing-cr Path to an existing MachineConfig CR YAML. 
When provided, spec.machine and + spec.cluster are copied from the existing CR and identity fields are overridden + with the flags above. Use to clone an existing node config for a new peer. + --output Output directory for the generated MachineConfig CR YAML (required). + +When --existing-cr is absent, a skeleton CR is emitted with empty machine and cluster sections. +Populate those sections with the Talos v1alpha1 machineconfig content before applying. + +Compile-only: compiler addnode never applies resources. Human review and GitOps apply required. +`

// compileAddNode writes a MachineConfig CR for a node joining an existing
// cluster and returns an error if validation, reading, or writing fails.
//
// Arguments are checked in this order: clusterName, hostname and ip must be
// non-empty; role must be "controlplane" or "worker" ("init" is rejected with
// its own message); output must be non-empty.
//
// hostname may carry the "{clusterName}-" prefix; stripClusterPrefix removes it
// before the CR name seam-mc-{clusterName}-{bareHostname} is built, while the
// ontai.dev/node label keeps hostname exactly as supplied. order is stored in
// spec.order. When existingCRPath is non-empty, spec.machine and spec.cluster
// come from loadMachineClusterFromCR; otherwise both stay unset and a skeleton
// header comment is prepended to the YAML. The result is written to
// {output}/{crName}.yaml, creating the output directory if needed.
func compileAddNode(clusterName, hostname, ip, role string, order int32, existingCRPath, output string) error {
	// Required identity flags, reported in a fixed order.
	switch {
	case clusterName == "":
		return fmt.Errorf("--cluster is required")
	case hostname == "":
		return fmt.Errorf("--hostname is required")
	case ip == "":
		return fmt.Errorf("--ip is required")
	}

	// Validate the role and map it to the CR role value in one pass.
	mcRole := platformv1alpha1.MachineConfigRoleControlPlane
	switch role {
	case "controlplane":
		// already the default above
	case "worker":
		mcRole = platformv1alpha1.MachineConfigRoleWorker
	case "init":
		return fmt.Errorf("role=init is reserved for compiler bootstrap; use controlplane or worker")
	default:
		return fmt.Errorf("--role must be controlplane or worker, got %q", role)
	}

	if output == "" {
		return fmt.Errorf("--output is required")
	}

	cloning := existingCRPath != ""

	// Clone mode: take the config body from the peer's CR.
	var machineJSON, clusterJSON *apiextensionsv1.JSON
	if cloning {
		var err error
		machineJSON, clusterJSON, err = loadMachineClusterFromCR(existingCRPath)
		if err != nil {
			return fmt.Errorf("read existing CR %q: %w", existingCRPath, err)
		}
	}

	// The bare hostname keeps the cluster name from appearing twice in crName.
	bareHostname := stripClusterPrefix(clusterName, hostname)
	crName := "seam-mc-" + clusterName + "-" + bareHostname

	meta := metav1.ObjectMeta{
		Name:      crName,
		Namespace: "seam-tenant-" + clusterName,
		Labels: map[string]string{
			"ontai.dev/cluster":    clusterName,
			"ontai.dev/node":       hostname,
			"ontai.dev/node-role":  role,
			"ontai.dev/managed-by": "compiler",
		},
	}
	spec := platformv1alpha1.MachineConfigSpec{
		Role:         mcRole,
		Order:        order,
		ClusterRef:   corev1.LocalObjectReference{Name: clusterName},
		NodeIP:       ip,
		NodeHostname: bareHostname,
		Machine:      machineJSON,
		Cluster:      clusterJSON,
	}
	mc := platformv1alpha1.MachineConfig{
		TypeMeta: metav1.TypeMeta{
			APIVersion: "platform.ontai.dev/v1alpha1",
			Kind:       "MachineConfig",
		},
		ObjectMeta: meta,
		Spec:       spec,
	}

	data, err := yaml.Marshal(mc)
	if err != nil {
		return fmt.Errorf("marshal MachineConfig CR: %w", err)
	}

	// Only skeleton output carries the "fill me in" header.
	const skeletonHeader = "# MachineConfig CR skeleton generated by compiler addnode.\n" +
		"# Populate spec.machine and spec.cluster with the Talos v1alpha1\n" +
		"# machineconfig sections for this node before applying.\n" +
		"# Refer to: https://www.talos.dev/latest/reference/configuration/\n"
	content := data
	if !cloning {
		content = append([]byte(skeletonHeader), data...)
	}

	if err := os.MkdirAll(output, 0755); err != nil {
		return fmt.Errorf("create output directory %q: %w", output, err)
	}
	outPath := filepath.Join(output, crName+".yaml")
	if err := os.WriteFile(outPath, content, 0644); err != nil {
		return fmt.Errorf("write MachineConfig CR %q: %w", outPath, err)
	}
	return nil
}

//
stripClusterPrefix strips the "{clusterName}-" prefix from hostname if present. +// e.g. stripClusterPrefix("ccs-dev", "ccs-dev-cp4") → "cp4" +// +// stripClusterPrefix("ccs-dev", "cp4") → "cp4" +func stripClusterPrefix(clusterName, hostname string) string { + prefix := clusterName + "-" + if len(hostname) > len(prefix) && hostname[:len(prefix)] == prefix { + return hostname[len(prefix):] + } + return hostname +} + +// loadMachineClusterFromCR reads a MachineConfig CR YAML file and returns the +// spec.machine and spec.cluster sections. Used by addnode to clone the config +// body from an existing peer node. +func loadMachineClusterFromCR(path string) (*apiextensionsv1.JSON, *apiextensionsv1.JSON, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, nil, fmt.Errorf("read file: %w", err) + } + + var cr struct { + Spec struct { + Machine *apiextensionsv1.JSON `json:"machine" yaml:"machine"` + Cluster *apiextensionsv1.JSON `json:"cluster" yaml:"cluster"` + } `json:"spec" yaml:"spec"` + } + if err := yaml.Unmarshal(data, &cr); err != nil { + return nil, nil, fmt.Errorf("parse MachineConfig CR: %w", err) + } + return cr.Spec.Machine, cr.Spec.Cluster, nil +} + +// runAddNodeSubcommand parses addnode-specific flags and calls compileAddNode. 
+func runAddNodeSubcommand(args []string) { + fs := flag.NewFlagSet("addnode", flag.ExitOnError) + cluster := fs.String("cluster", "", "Cluster name (required)") + hostname := fs.String("hostname", "", "Node hostname (required)") + ip := fs.String("ip", "", "Node IP address (required)") + role := fs.String("role", "", "Node role: controlplane or worker (required)") + order := fs.Int("order", 1, "Upgrade sequence order (default: 1)") + existingCR := fs.String("existing-cr", "", "Path to existing MachineConfig CR to clone machine/cluster sections from") + output := fs.String("output", "", "Output directory (required)") + + fs.Usage = func() { + fmt.Fprint(os.Stderr, addnodeHelp) + fs.PrintDefaults() + } + + if err := fs.Parse(args); err != nil { + fmt.Fprintf(os.Stderr, "compiler addnode: flag error: %v\n", err) + os.Exit(1) + } + + if err := compileAddNode(*cluster, *hostname, *ip, *role, int32(*order), *existingCR, *output); err != nil { + fmt.Fprintf(os.Stderr, "compiler addnode: %v\n", err) + os.Exit(1) + } +} diff --git a/cmd/compiler/addnode_test.go b/cmd/compiler/addnode_test.go new file mode 100644 index 0000000..9fdfefe --- /dev/null +++ b/cmd/compiler/addnode_test.go @@ -0,0 +1,191 @@ +// addnode_test.go tests the compiler addnode subcommand. +// Covers skeleton generation (no --existing-cr) and CR cloning +// (--existing-cr copies machine/cluster sections, overrides identity fields). +// All tests are fully offline -- no cluster connectivity. +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "sigs.k8s.io/yaml" +) + +// TestAddNode_SkeletonOutput verifies that addnode without --existing-cr produces +// a valid MachineConfig CR YAML with correct metadata and an empty machine/cluster section. 
+func TestAddNode_SkeletonOutput(t *testing.T) { + outDir := t.TempDir() + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "controlplane", 3, "", outDir) + if err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + + outPath := filepath.Join(outDir, "seam-mc-ccs-dev-cp4.yaml") + data, err := os.ReadFile(outPath) + if err != nil { + t.Fatalf("output file not found: %v", err) + } + content := string(data) + + assertContainsStr(t, content, "apiVersion: platform.ontai.dev/v1alpha1") + assertContainsStr(t, content, "kind: MachineConfig") + assertContainsStr(t, content, "name: seam-mc-ccs-dev-cp4") + assertContainsStr(t, content, "namespace: seam-tenant-ccs-dev") + assertContainsStr(t, content, "role: controlplane") + assertContainsStr(t, content, "nodeIP: 10.20.0.14") + assertContainsStr(t, content, "nodeHostname: cp4") + assertContainsStr(t, content, "ontai.dev/cluster: ccs-dev") + // Skeleton header comment must be present. + assertContainsStr(t, content, "# MachineConfig CR skeleton generated by compiler addnode.") +} + +// TestAddNode_SkeletonStripsClusterPrefix verifies that a hostname already containing +// the cluster-name prefix is not doubled in the output CR name. +func TestAddNode_SkeletonStripsClusterPrefix(t *testing.T) { + outDir := t.TempDir() + // hostname includes cluster prefix -- should produce seam-mc-ccs-dev-cp4, not seam-mc-ccs-dev-ccs-dev-cp4. 
+ err := compileAddNode("ccs-dev", "ccs-dev-cp4", "10.20.0.14", "controlplane", 3, "", outDir) + if err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + + outPath := filepath.Join(outDir, "seam-mc-ccs-dev-cp4.yaml") + if _, err := os.Stat(outPath); err != nil { + t.Errorf("expected output file seam-mc-ccs-dev-cp4.yaml not found: %v", err) + } + data, _ := os.ReadFile(outPath) + if strings.Contains(string(data), "ccs-dev-ccs-dev") { + t.Errorf("hostname prefix was doubled in output: %s", string(data)) + } +} + +// TestAddNode_OrderField verifies that the spec.order field is written correctly. +func TestAddNode_OrderField(t *testing.T) { + outDir := t.TempDir() + if err := compileAddNode("ccs-dev", "wk1", "10.20.0.20", "worker", 5, "", outDir); err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + data, _ := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-dev-wk1.yaml")) + assertContainsStr(t, string(data), "order: 5") + assertContainsStr(t, string(data), "role: worker") +} + +// TestAddNode_InitRoleFails verifies that role=init is rejected (reserved for bootstrap). +func TestAddNode_InitRoleFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp1", "10.20.0.11", "init", 0, "", t.TempDir()) + if err == nil { + t.Error("expected error for role=init; got nil") + } + if !strings.Contains(err.Error(), "init") { + t.Errorf("error %q should mention 'init'", err.Error()) + } +} + +// TestAddNode_InvalidRoleFails verifies that an unknown role is rejected. +func TestAddNode_InvalidRoleFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "not-a-role", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for unknown role; got nil") + } +} + +// TestAddNode_MissingClusterFails verifies that an empty --cluster is rejected. 
+func TestAddNode_MissingClusterFails(t *testing.T) { + err := compileAddNode("", "cp4", "10.20.0.14", "controlplane", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for missing cluster; got nil") + } +} + +// TestAddNode_MissingIPFails verifies that an empty --ip is rejected. +func TestAddNode_MissingIPFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "", "controlplane", 1, "", t.TempDir()) + if err == nil { + t.Error("expected error for missing ip; got nil") + } +} + +// TestAddNode_ExistingCR_CopiesMachineCluster verifies that when --existing-cr is +// provided, spec.machine and spec.cluster are copied from the existing CR and all +// identity fields are overridden with the supplied flags. +func TestAddNode_ExistingCR_CopiesMachineCluster(t *testing.T) { + // Build a bootstrap output to get a real MachineConfig CR as the source. + bootstrapDir := t.TempDir() + inputPath := writeInputFile(t, bootstrapInputYAML) + if err := compileBootstrap(inputPath, bootstrapDir, "", ""); err != nil { + t.Fatalf("compileBootstrap error: %v", err) + } + + // Use node1's MachineConfig CR as the existing-cr template. + existingCRPath := filepath.Join(bootstrapDir, "seam-mc-ccs-mgmt-node1.yaml") + if _, err := os.Stat(existingCRPath); err != nil { + t.Fatalf("existing CR not found: %v", err) + } + + outDir := t.TempDir() + err := compileAddNode("ccs-mgmt", "node4", "10.20.0.14", "controlplane", 3, existingCRPath, outDir) + if err != nil { + t.Fatalf("compileAddNode with existing-cr error: %v", err) + } + + data, err := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node4.yaml")) + if err != nil { + t.Fatalf("output file not found: %v", err) + } + content := string(data) + + // Identity fields must be overridden. 
+ assertContainsStr(t, content, "name: seam-mc-ccs-mgmt-node4") + assertContainsStr(t, content, "namespace: seam-tenant-ccs-mgmt") + assertContainsStr(t, content, "nodeIP: 10.20.0.14") + assertContainsStr(t, content, "nodeHostname: node4") + assertContainsStr(t, content, "role: controlplane") + assertContainsStr(t, content, "order: 3") + + // Machine/cluster sections must be present (copied from source CR). + var cr map[string]interface{} + if err := yaml.Unmarshal(data, &cr); err != nil { + t.Fatalf("parse output CR: %v", err) + } + spec, _ := cr["spec"].(map[string]interface{}) + if spec == nil { + t.Fatal("output CR has no spec") + } + if spec["machine"] == nil { + t.Error("spec.machine should be populated from existing CR") + } + if spec["cluster"] == nil { + t.Error("spec.cluster should be populated from existing CR") + } + + // Skeleton comment must NOT appear (this is a cloned CR, not a skeleton). + if strings.Contains(content, "skeleton generated by compiler addnode") { + t.Error("cloned CR must not contain skeleton header comment") + } +} + +// TestAddNode_ExistingCR_MissingFileFails verifies that a missing --existing-cr path +// returns a descriptive error. +func TestAddNode_ExistingCR_MissingFileFails(t *testing.T) { + err := compileAddNode("ccs-dev", "cp4", "10.20.0.14", "controlplane", 1, "/nonexistent/cr.yaml", t.TempDir()) + if err == nil { + t.Error("expected error for missing existing-cr path; got nil") + } +} + +// TestAddNode_NamingConvention verifies the seam-mc-{cluster}-{hostname} naming +// convention for the output file. platform-schema.md §9. 
+func TestAddNode_NamingConvention(t *testing.T) { + outDir := t.TempDir() + if err := compileAddNode("my-cluster", "worker99", "10.10.0.99", "worker", 10, "", outDir); err != nil { + t.Fatalf("compileAddNode error: %v", err) + } + expectedFile := filepath.Join(outDir, "seam-mc-my-cluster-worker99.yaml") + if _, err := os.Stat(expectedFile); err != nil { + t.Errorf("expected output file seam-mc-my-cluster-worker99.yaml not found: %v", err) + } +} + +// assertContainsStr is defined in compile_bootstrap_test.go. diff --git a/cmd/compiler/compile.go b/cmd/compiler/compile.go index 4324193..de5cb96 100644 --- a/cmd/compiler/compile.go +++ b/cmd/compiler/compile.go @@ -6,6 +6,7 @@ package main import ( "context" + "encoding/json" "fmt" "log/slog" "os" @@ -14,7 +15,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/yaml" @@ -71,17 +72,20 @@ func ciliumPrerequisitesPatch() string { // the provided registry mirrors into machine.registries.mirrors. The http:// prefix // on endpoints is preserved exactly — no TLS config is added. func buildRegistryMirrorsPatch(mirrors []RegistryMirror) (string, error) { + // json: tags are required: sigs.k8s.io/yaml marshals via encoding/json, so + // without json: tags the field names default to PascalCase (Machine, Registries) + // which won't merge into the lowercase Talos machineconfig keys. 
type mirrorSpec struct { - Endpoints []string `yaml:"endpoints"` + Endpoints []string `json:"endpoints" yaml:"endpoints"` } type registriesSpec struct { - Mirrors map[string]mirrorSpec `yaml:"mirrors"` + Mirrors map[string]mirrorSpec `json:"mirrors" yaml:"mirrors"` } type machineSpec struct { - Registries registriesSpec `yaml:"registries"` + Registries registriesSpec `json:"registries" yaml:"registries"` } type patchSpec struct { - Machine machineSpec `yaml:"machine"` + Machine machineSpec `json:"machine" yaml:"machine"` } mirrorMap := make(map[string]mirrorSpec, len(mirrors)) @@ -171,9 +175,9 @@ func extractCAFromMachineConfig(machineConfigBytes []byte) (*secrets.Bundle, err } // BootstrapNode declares a single Talos node for management cluster bootstrap. -// Each node maps to one Talos machine configuration and one Kubernetes Secret. +// Each node maps to one Talos machine configuration and one MachineConfig CR. type BootstrapNode struct { - // Hostname is the node's hostname. Used as the node name in Secret naming + // Hostname is the node's hostname. Used as the bare hostname in naming // convention seam-mc-{cluster}-{hostname}. platform-schema.md §9. Hostname string `yaml:"hostname"` @@ -184,6 +188,11 @@ type BootstrapNode struct { // "controlplane" (additional control plane nodes), or "worker". // Exactly one node must have role "init". Role string `yaml:"role"` + + // MAC is the node's primary NIC MAC address. Informational only -- + // not used by the compiler or operator. Stored for admin reference. + // +optional + MAC string `yaml:"mac,omitempty"` } // BootstrapSection holds management cluster bootstrap configuration. @@ -744,25 +753,14 @@ func validateBootstrapInput(b *BootstrapSection) error { // compileBootstrap implements the bootstrap subcommand. 
// -// Reads a ClusterInput spec (with a bootstrap section declaring node IPs, roles, -// and Talos version) and produces three output artifacts in --output: -// - seam-mc-{cluster}-{hostname}.yaml — Kubernetes Secret YAML per node -// containing the Talos machine configuration. platform-schema.md §9. -// - {cluster-name}.yaml — TalosCluster CR with mode=bootstrap, capi.enabled=false. -// - bootstrap-sequence.yaml — documents the apply order. +// Bootstrap mode: generates MachineConfig CRs (one per node), namespace manifest, +// TalosCluster CR, and bootstrap-sequence.yaml. Uses Talos machinery for PKI. // -// kubeconfigPath is the optional path to a kubeconfig file, used only when -// in.ImportExistingCluster=true. Pass empty string to use the standard resolution -// chain (KUBECONFIG env → ~/.kube/config). +// Import mode: generates namespace manifest, talosconfig Secret (if resolvable), +// TalosCluster CR, and bootstrap-sequence.yaml. MachineConfig CRs are NOT +// generated -- admin provides them via compiler addnode or hand-authored. CP-INV-004. // -// When importExistingCluster=true, Compiler connects to the cluster Kubernetes API -// via kubeconfig, reads the init-node machine config Secret from seam-system, parses -// it, and derives the secrets bundle from existing CA material so new configs are -// signed with the same PKI. Fails fast if the kubeconfig is unreachable or the -// Secret or its machineconfig.yaml field is missing. -// -// Uses the Talos machinery library to generate machine configurations. -// No cluster connection is required in the default (fresh PKI) path. +// kubeconfigPath is not used; retained for CLI flag compatibility. // conductor-schema.md §9. 
func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) error { in, err := readClusterInput(input) @@ -797,6 +795,11 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err controlPlaneEndpoint = ep } + tcMode := platformv1alpha1.TalosClusterModeBootstrap + if in.Mode == "import" || in.ImportExistingCluster { + tcMode = platformv1alpha1.TalosClusterModeImport + } + // Resolve kubernetesVersion: explicit > support matrix. kubernetesVersion := b.KubernetesVersion if kubernetesVersion == "" { @@ -808,8 +811,10 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } // Resolve installDisk: explicit > extracted from machineConfigPaths > default. + // Only needed for bootstrap mode (generate.NewInput); skip extraction in import mode + // to avoid requiring machineConfigPaths files that admin provides separately. installDisk := b.InstallDisk - if installDisk == "" { + if installDisk == "" && tcMode == platformv1alpha1.TalosClusterModeBootstrap { extracted, err := extractFromInitNode(in.MachineConfigPaths, b.Nodes, func(mcBytes []byte) (string, error) { return extractInstallDiskFromMachineConfig(mcBytes), nil @@ -843,106 +848,6 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } } - // Resolve the secrets bundle. When importExistingCluster=true, extract PKI from - // an existing cluster. Two paths are available: - // - // machineConfigPaths non-empty — local file path (pre-Seam clusters): - // Read the init node entry from the map, load the raw machine config file, - // and extract CA material via extractCAFromMachineConfig. - // - // machineConfigPaths absent — Kubernetes API path (Seam clusters): - // Connect to the cluster API via kubeconfig, read the seam-mc-{cluster}-{init} - // Secret from seam-system, extract machineconfig.yaml, and extract CA material. - // - // Both paths share extractCAFromMachineConfig for the final CA extraction step. 
- var secretsBundle *secrets.Bundle - if in.Mode == "import" || in.ImportExistingCluster { - // Find the init node hostname (guaranteed present by validateBootstrapInput). - var initHostname string - for _, n := range b.Nodes { - if n.Role == "init" { - initHostname = n.Hostname - break - } - } - - if len(in.MachineConfigPaths) > 0 { - // Local file path: read CA from user-provided machine config file. - // Only the init node entry is required; the same bundle is used for all nodes. - mcPath, ok := in.MachineConfigPaths[initHostname] - if !ok { - return fmt.Errorf("importExistingCluster: machineConfigPaths is non-empty but init node %q has no entry", initHostname) - } - mcBytes, err := os.ReadFile(mcPath) - if err != nil { - return fmt.Errorf("importExistingCluster: read machineconfig for init node %q from %q: %w", initHostname, mcPath, err) - } - secretsBundle, err = extractCAFromMachineConfig(mcBytes) - if err != nil { - return fmt.Errorf("importExistingCluster: extract CA from local file %q: %w", mcPath, err) - } - } else { - // Kubernetes API path: read CA from seam-mc Secret in seam-system. - resolvedKubeconfig := resolveKubeconfigPath(kubeconfigPath) - k8sClient, err := buildK8sClient(resolvedKubeconfig) - if err != nil { - return fmt.Errorf("importExistingCluster: connect to cluster via kubeconfig %q: %w", resolvedKubeconfig, err) - } - - // Strip cluster-name prefix from hostname: Talos node names carry the - // cluster prefix (e.g. "ccs-mgmt-cp1" for cluster "ccs-mgmt"), so the - // Secret name would double the prefix without this strip. C-32. - hostname := strings.TrimPrefix(initHostname, in.Name+"-") - secretName := "seam-mc-" + in.Name + "-" + hostname - mcSecret, err := k8sClient.CoreV1().Secrets("seam-system").Get( - context.Background(), secretName, metav1.GetOptions{}, - ) - if err != nil { - if apierrors.IsNotFound(err) { - // seam-mc Secret absent — cluster was not bootstrapped via Seam. 
- // Fall through to the talosconfig-only path: emit only the - // talosconfig Secret and TalosCluster CR. No machineconfig - // generation, no PKI extraction. C-32 Bug 2. - return compileImportTalosconfigSecret(in, output, talosconfigPath) - } - return fmt.Errorf("importExistingCluster: read secret %q from seam-system: %w", secretName, err) - } - - mcBytes, ok := mcSecret.Data["machineconfig.yaml"] - if !ok { - return fmt.Errorf("importExistingCluster: secret %q is missing machineconfig.yaml field", secretName) - } - - secretsBundle, err = extractCAFromMachineConfig(mcBytes) - if err != nil { - return fmt.Errorf("importExistingCluster: extract CA from secret %q: %w", secretName, err) - } - } - } else { - secretsBundle, err = secrets.NewBundle( - secrets.NewFixedClock(time.Now()), - versionContract, - ) - if err != nil { - return fmt.Errorf("generate secrets bundle: %w", err) - } - } - - // Build the generate input with cluster-wide settings. - genInput, err := generate.NewInput( - in.Name, - controlPlaneEndpoint, - kubernetesVersion, - generate.WithVersionContract(versionContract), - generate.WithSecretsBundle(secretsBundle), - generate.WithInstallDisk(installDisk), - generate.WithInstallImage(installerImage), - generate.WithEndpointList(cpIPs), - ) - if err != nil { - return fmt.Errorf("build generate input: %w", err) - } - if err := os.MkdirAll(output, 0755); err != nil { return fmt.Errorf("create output directory: %w", err) } @@ -952,111 +857,135 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err ns = "seam-system" } - // Build the ordered patch list: - // 1. CiliumPrerequisites (built-in, applied first) - // 2. RegistryMirrors (injected next) - // 3. 
User Patches (applied last, in order) - var patches []string - if in.CiliumPrerequisites { - patches = append(patches, ciliumPrerequisitesPatch()) - } - if len(in.RegistryMirrors) > 0 { - mirrorPatch, err := buildRegistryMirrorsPatch(in.RegistryMirrors) + // MachineConfig CRs are generated for bootstrap mode only. + // Import mode: admin provides MachineConfig CRs (via compiler addnode or hand-authored). + // platform-schema.md §9, CP-INV-004. + var crNames []string + if tcMode == platformv1alpha1.TalosClusterModeBootstrap { + secretsBundle, err := secrets.NewBundle( + secrets.NewFixedClock(time.Now()), + versionContract, + ) if err != nil { - return fmt.Errorf("build registry mirrors patch: %w", err) + return fmt.Errorf("generate secrets bundle: %w", err) } - patches = append(patches, mirrorPatch) - } - patches = append(patches, in.Patches...) - // Generate machine configuration for each node and write as a Secret. - var secretNames []string - for _, node := range b.Nodes { - machineType, err := nodeRoleToMachineType(node.Role) + genInput, err := generate.NewInput( + in.Name, + controlPlaneEndpoint, + kubernetesVersion, + generate.WithVersionContract(versionContract), + generate.WithSecretsBundle(secretsBundle), + generate.WithInstallDisk(installDisk), + generate.WithInstallImage(installerImage), + generate.WithEndpointList(cpIPs), + ) if err != nil { - return fmt.Errorf("node %q: %w", node.Hostname, err) + return fmt.Errorf("build generate input: %w", err) } - cfg, err := genInput.Config(machineType) - if err != nil { - return fmt.Errorf("generate config for node %q: %w", node.Hostname, err) + // Build the ordered patch list: + // 1. CiliumPrerequisites (built-in, applied first) + // 2. RegistryMirrors (injected next) + // 3. 
User Patches (applied last, in order) + var patches []string + if in.CiliumPrerequisites { + patches = append(patches, ciliumPrerequisitesPatch()) } + if len(in.RegistryMirrors) > 0 { + mirrorPatch, err := buildRegistryMirrorsPatch(in.RegistryMirrors) + if err != nil { + return fmt.Errorf("build registry mirrors patch: %w", err) + } + patches = append(patches, mirrorPatch) + } + patches = append(patches, in.Patches...) - cfgBytes, err := cfg.Bytes() - if err != nil { - return fmt.Errorf("marshal config for node %q: %w", node.Hostname, err) + // Pre-compute upgrade order for each node. + // init=0, controlplane nodes=1..N in declaration order, workers=N+1..M. + cpIdx := int32(0) + workerIdx := int32(0) + cpCount := int32(0) + for _, n := range b.Nodes { + if n.Role == "controlplane" { + cpCount++ + } + } + nodeOrder := make(map[string]int32, len(b.Nodes)) + for _, n := range b.Nodes { + switch n.Role { + case "init": + nodeOrder[n.Hostname] = 0 + case "controlplane": + cpIdx++ + nodeOrder[n.Hostname] = cpIdx + case "worker": + nodeOrder[n.Hostname] = cpCount + 1 + workerIdx + workerIdx++ + } } - // Apply all patches in order (CiliumPrerequisites → RegistryMirrors → user Patches). - for i, patch := range patches { - cfgBytes, err = applyYAMLPatch(cfgBytes, patch) + for _, node := range b.Nodes { + machineType, err := nodeRoleToMachineType(node.Role) if err != nil { - return fmt.Errorf("apply patch %d to node %q: %w", i, node.Hostname, err) + return fmt.Errorf("node %q: %w", node.Hostname, err) } - } - // Strip cluster-name prefix from hostname before constructing the secret - // name so the prefix is not doubled (e.g. ccs-mgmt-cp1 → cp1). C-32. - // Machine config secrets always live in seam-tenant-{cluster}, not in the - // TalosCluster CR namespace (seam-system). Platform reads them from there. 
- bareHostname := strings.TrimPrefix(node.Hostname, in.Name+"-") - secretName := "seam-mc-" + in.Name + "-" + bareHostname - secret := corev1.Secret{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "v1", - Kind: "Secret", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: secretName, - Namespace: "seam-tenant-" + in.Name, - Labels: map[string]string{ - "ontai.dev/cluster": in.Name, - "ontai.dev/node": node.Hostname, - "ontai.dev/node-role": node.Role, - "ontai.dev/managed-by": "compiler", - }, - }, - Type: corev1.SecretTypeOpaque, - StringData: map[string]string{ - "machineconfig.yaml": string(cfgBytes), - }, - } + cfg, err := genInput.Config(machineType) + if err != nil { + return fmt.Errorf("generate config for node %q: %w", node.Hostname, err) + } - if err := writeCRYAML(output, secretName, secret); err != nil { - return fmt.Errorf("write machineconfig secret for node %q: %w", node.Hostname, err) + cfgBytes, err := cfg.Bytes() + if err != nil { + return fmt.Errorf("marshal config for node %q: %w", node.Hostname, err) + } + + // Apply all patches in order (CiliumPrerequisites → RegistryMirrors → user Patches). + for i, patch := range patches { + cfgBytes, err = applyYAMLPatch(cfgBytes, patch) + if err != nil { + return fmt.Errorf("apply patch %d to node %q: %w", i, node.Hostname, err) + } + } + + // Strip cluster-name prefix from hostname before constructing the CR + // name so the prefix is not doubled (e.g. ccs-mgmt-cp1 → cp1). C-32. + // MachineConfig CRs always live in seam-tenant-{cluster}. 
+ bareHostname := strings.TrimPrefix(node.Hostname, in.Name+"-") + mc, err := buildMachineConfigCR(node, cfgBytes, in.Name, bareHostname, nodeOrder[node.Hostname]) + if err != nil { + return fmt.Errorf("build MachineConfig CR for node %q: %w", node.Hostname, err) + } + crName := mc.Name + if err := writeCRYAML(output, crName, mc); err != nil { + return fmt.Errorf("write MachineConfig CR for node %q: %w", node.Hostname, err) + } + crNames = append(crNames, crName+".yaml") } - secretNames = append(secretNames, secretName+".yaml") } - // C-35: When importExistingCluster=true, also emit the talosconfig Secret so - // Platform can generate the kubeconfig via ensureKubeconfigSecret. Applies to - // both the machineConfigPaths path (local file PKI) and the Kubernetes API path - // (Seam clusters). Failure is a warning -- the operator can apply manually. - // Also emit the seam-tenant namespace manifest so the admin can apply it before - // the Secrets (which live in seam-tenant-{cluster}). platform-schema.md §9. - if in.Mode == "import" || in.ImportExistingCluster { - nsFile, err := writeSeamTenantNamespaceManifest(in.Name, output) - if err != nil { - return err - } - secretNames = append([]string{nsFile}, secretNames...) + // Namespace manifest: always emitted. MachineConfig CRs (bootstrap) and + // talosconfig Secrets (import) both live in seam-tenant-{cluster}. + nsFile, err := writeSeamTenantNamespaceManifest(in.Name, output) + if err != nil { + return err + } + allResources := append([]string{nsFile}, crNames...) + + // Import mode: also emit the talosconfig Secret so Platform can generate the + // kubeconfig via ensureKubeconfigSecret. Failure is a warning -- the operator + // can apply manually. 
+ if tcMode == platformv1alpha1.TalosClusterModeImport { if tcfgFile, err := writeTalosconfigSecret(in, talosconfigPath, output); err != nil { return err } else if tcfgFile != "" { - secretNames = append(secretNames, tcfgFile) + allResources = append(allResources, tcfgFile) } } - // Fix 1: importExistingCluster=true always emits mode=import. The - // machineConfigPaths field only controls where PKI is read from, not the - // cluster lifecycle mode. A re-imported cluster is always mode=import. - tcMode := platformv1alpha1.TalosClusterModeBootstrap - if in.Mode == "import" || in.ImportExistingCluster { - tcMode = platformv1alpha1.TalosClusterModeImport - } - // Produce TalosCluster CR. ontai.dev/owns-runnerconfig signals Platform to add - // a finalizer and clean up the RunnerConfig in ont-system on deletion. Bug 3. + // a finalizer and clean up the RunnerConfig in ont-system on deletion. // // Role is set when: (a) import path -- clusterRole defaults empty to management; // (b) bootstrap path with explicit role field (e.g. role: tenant in fixture). @@ -1093,7 +1022,79 @@ func compileBootstrap(input, output, kubeconfigPath, talosconfigPath string) err } // Produce bootstrap-sequence.yaml documenting the apply order. - return writeBootstrapSequence(output, in.Name, secretNames, tcMode) + return writeBootstrapSequence(output, in.Name, allResources, tcMode) +} + +// buildMachineConfigCR converts a generated Talos machine config YAML into a +// MachineConfig CR. The machine and cluster top-level sections are stored as +// unstructured JSON in spec.machine and spec.cluster respectively so the CR +// remains Talos-version-agnostic. +// +// bareHostname must be the hostname with any cluster-name prefix stripped +// (e.g. "cp1" for node "ccs-mgmt-cp1" in cluster "ccs-mgmt"). 
+func buildMachineConfigCR(node BootstrapNode, cfgBytes []byte, clusterName, bareHostname string, order int32) (platformv1alpha1.MachineConfig, error) { + var rawMap map[string]interface{} + if err := yaml.Unmarshal(cfgBytes, &rawMap); err != nil { + return platformv1alpha1.MachineConfig{}, fmt.Errorf("parse machineconfig for node %q: %w", node.Hostname, err) + } + + toJSON := func(key string) (*apiextensionsv1.JSON, error) { + v, ok := rawMap[key] + if !ok || v == nil { + return nil, nil + } + b, err := json.Marshal(v) + if err != nil { + return nil, fmt.Errorf("json-encode %q section: %w", key, err) + } + return &apiextensionsv1.JSON{Raw: b}, nil + } + + machineJSON, err := toJSON("machine") + if err != nil { + return platformv1alpha1.MachineConfig{}, err + } + clusterJSON, err := toJSON("cluster") + if err != nil { + return platformv1alpha1.MachineConfig{}, err + } + + role := platformv1alpha1.MachineConfigRoleControlPlane + switch node.Role { + case "init": + role = platformv1alpha1.MachineConfigRoleInit + case "worker": + role = platformv1alpha1.MachineConfigRoleWorker + } + + crName := "seam-mc-" + clusterName + "-" + bareHostname + return platformv1alpha1.MachineConfig{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "platform.ontai.dev/v1alpha1", + Kind: "MachineConfig", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: crName, + Namespace: "seam-tenant-" + clusterName, + Labels: map[string]string{ + "ontai.dev/cluster": clusterName, + "ontai.dev/node": node.Hostname, + "ontai.dev/node-role": node.Role, + "ontai.dev/managed-by": "compiler", + }, + }, + Spec: platformv1alpha1.MachineConfigSpec{ + Role: role, + Order: order, + ClusterRef: corev1.LocalObjectReference{ + Name: clusterName, + }, + NodeIP: node.IP, + NodeHostname: bareHostname, + Machine: machineJSON, + Cluster: clusterJSON, + }, + }, nil } // nodeRoleToMachineType converts a bootstrap node role to the Talos machine.Type. 
@@ -1193,18 +1194,18 @@ type BootstrapSequence struct { // // C-36: previously used kind: BootstrapSequence (not a valid CRD). platform-schema.md §9. func writeBootstrapSequence(output, clusterName string, secretFiles []string, mode platformv1alpha1.TalosClusterMode) error { - step1Desc := "Apply Talos machineconfig Secrets — one per node. " + - "Apply ALL before the TalosCluster CR." + step1Desc := "Apply seam-tenant namespace manifest AND MachineConfig CRs (one per node) " + + "in seam-tenant-" + clusterName + ". Apply ALL before the TalosCluster CR." step2Desc := "Apply TalosCluster CR with mode=bootstrap and capi.enabled=false. " + "Platform's TalosClusterReconciler watches this CR and submits the bootstrap Conductor Job." if mode == platformv1alpha1.TalosClusterModeImport { - step1Desc = "Apply ALL Secrets: machineconfig Secrets (one per node) AND the talosconfig Secret " + - "(seam-mc-" + clusterName + "-talosconfig.yaml). " + - "The talosconfig Secret is required for Platform to generate the kubeconfig. " + - "Apply ALL before TalosCluster CR." + step1Desc = "Apply seam-tenant namespace manifest AND the talosconfig Secret " + + "(seam-mc-" + clusterName + "-talosconfig.yaml) in seam-tenant-" + clusterName + ". " + + "Admin must separately apply MachineConfig CRs (via compiler addnode or hand-authored) " + + "before applying the TalosCluster CR." step2Desc = "Apply TalosCluster CR with mode=import. " + - "Apply AFTER all Secrets in step 1 are present in the cluster — " + + "Apply AFTER namespace, talosconfig Secret, and MachineConfig CRs are present — " + "Platform reads the talosconfig Secret during TalosCluster reconciliation " + "to generate and store the cluster kubeconfig." 
} diff --git a/cmd/compiler/compile_bootstrap_features_test.go b/cmd/compiler/compile_bootstrap_features_test.go index a57f129..1a79946 100644 --- a/cmd/compiler/compile_bootstrap_features_test.go +++ b/cmd/compiler/compile_bootstrap_features_test.go @@ -673,10 +673,11 @@ bootstrap: // ── ImportExistingCluster ───────────────────────────────────────────────────── -// TestBootstrap_ImportExistingCluster_MissingKubeconfigReturnsError verifies that -// importExistingCluster: true with a non-existent kubeconfig path returns an error -// rather than silently generating fresh PKI material. -func TestBootstrap_ImportExistingCluster_MissingKubeconfigReturnsError(t *testing.T) { +// TestBootstrap_ImportExistingCluster_Succeeds verifies that importExistingCluster:true +// with all required fields present succeeds. The kubeconfig API fallback path was +// removed when the MachineConfig CRD migration eliminated PKI extraction from +// the cluster. import mode no longer connects to any external API. +func TestBootstrap_ImportExistingCluster_Succeeds(t *testing.T) { input := ` name: test-cluster namespace: seam-system @@ -696,10 +697,16 @@ bootstrap: role: init ` inputPath := writeInputFile(t, input) - // Pass a kubeconfig path that does not exist — connection must fail with an error. - err := compileBootstrap(inputPath, t.TempDir(), "/nonexistent/kubeconfig.yaml", "") - if err == nil { - t.Fatal("expected error for missing kubeconfig; got nil") + outDir := t.TempDir() + if err := compileBootstrap(inputPath, outDir, "", ""); err != nil { + t.Fatalf("expected importExistingCluster=true to succeed; got: %v", err) + } + // importExistingCluster=true → tcMode=Import → no MachineConfig CRs emitted. 
+ if _, err := os.Stat(filepath.Join(outDir, "seam-mc-test-cluster-cp1.yaml")); err == nil { + t.Error("import mode must not emit MachineConfig CRs") + } + if _, err := os.Stat(filepath.Join(outDir, "test-cluster.yaml")); err != nil { + t.Errorf("TalosCluster CR not found: %v", err) } } diff --git a/cmd/compiler/compile_bootstrap_import_test.go b/cmd/compiler/compile_bootstrap_import_test.go index 7eec389..9e0bef5 100644 --- a/cmd/compiler/compile_bootstrap_import_test.go +++ b/cmd/compiler/compile_bootstrap_import_test.go @@ -14,12 +14,17 @@ import ( ) // generateMachineConfigFile produces a valid Talos init-node machine config YAML -// file for use in import-path tests. It runs compileBootstrap with fresh PKI to -// generate a seam-mc Secret, extracts the machineconfig.yaml field, writes it to -// a temp file, and returns the path. +// file for use in import-path tests (machineConfigPaths). It runs compileBootstrap +// with fresh PKI to generate a MachineConfig CR, extracts spec.machine and +// spec.cluster, reconstructs a Talos machineconfig YAML, and writes it to a temp +// file. This file is used as a machineConfigPaths entry for endpoint/disk extraction. func generateMachineConfigFile(t *testing.T, clusterName, hostname string) string { t.Helper() + // Strip the cluster-name prefix from hostname to match the CR name. + // compileBootstrap uses TrimPrefix(hostname, clusterName+"-") for the bare name. + bareHostname := strings.TrimPrefix(hostname, clusterName+"-") + input := fmt.Sprintf(` name: %s namespace: seam-system @@ -45,31 +50,44 @@ bootstrap: t.Fatalf("generateMachineConfigFile: compileBootstrap failed: %v", err) } - // Read the Secret YAML produced for the init node. - secretPath := filepath.Join(outDir, fmt.Sprintf("seam-mc-%s-%s.yaml", clusterName, hostname)) - secretData, err := os.ReadFile(secretPath) + // Read the MachineConfig CR YAML produced for the init node. 
+ crPath := filepath.Join(outDir, fmt.Sprintf("seam-mc-%s-%s.yaml", clusterName, bareHostname)) + crData, err := os.ReadFile(crPath) if err != nil { - t.Fatalf("generateMachineConfigFile: read secret YAML: %v", err) + t.Fatalf("generateMachineConfigFile: read MachineConfig CR YAML: %v", err) } - // Extract machineconfig.yaml from the Secret's stringData field. - var secretObj struct { - StringData map[string]string `yaml:"stringData"` + // Parse spec.machine and spec.cluster from the CR, then reconstruct the + // full Talos machineconfig YAML (used by extractEndpointFromMachineConfig, + // extractInstallDiskFromMachineConfig, and extractCAFromMachineConfig). + var crObj struct { + Spec struct { + Machine interface{} `yaml:"machine"` + Cluster interface{} `yaml:"cluster"` + } `yaml:"spec"` } - if err := yaml.Unmarshal(secretData, &secretObj); err != nil { - t.Fatalf("generateMachineConfigFile: parse secret YAML: %v", err) + if err := yaml.Unmarshal(crData, &crObj); err != nil { + t.Fatalf("generateMachineConfigFile: parse MachineConfig CR: %v", err) } - mcYAML, ok := secretObj.StringData["machineconfig.yaml"] - if !ok { - t.Fatal("generateMachineConfigFile: secret missing machineconfig.yaml field") + if crObj.Spec.Machine == nil { + t.Fatal("generateMachineConfigFile: spec.machine is nil in generated CR") } - // Write the raw machine config YAML to a dedicated temp file. + reconstructed := map[string]interface{}{ + "machine": crObj.Spec.Machine, + "cluster": crObj.Spec.Cluster, + } + mcYAML, err := yaml.Marshal(reconstructed) + if err != nil { + t.Fatalf("generateMachineConfigFile: marshal reconstructed machineconfig: %v", err) + } + + // Write the reconstructed machineconfig YAML to a temp file. 
f, err := os.CreateTemp(t.TempDir(), "mc-*.yaml") if err != nil { t.Fatalf("generateMachineConfigFile: create temp file: %v", err) } - if _, err := f.WriteString(mcYAML); err != nil { + if _, err := f.Write(mcYAML); err != nil { t.Fatalf("generateMachineConfigFile: write machine config: %v", err) } f.Close() @@ -80,11 +98,11 @@ bootstrap: // TestBootstrap_ImportExistingCluster_LocalFilePath verifies that when // importExistingCluster=true and machineConfigPaths is non-empty, Compiler -// reads CA material from the local machine config file and successfully generates -// all output artifacts (machine config Secrets, TalosCluster CR, bootstrap-sequence). -// This path is used for clusters bootstrapped before Seam. +// successfully generates all output artifacts for import mode. +// Import mode emits: namespace manifest, TalosCluster CR, bootstrap-sequence. +// MachineConfig CRs are NOT emitted for import mode -- admin provides them +// (via compiler addnode or hand-authored). CP-INV-004. func TestBootstrap_ImportExistingCluster_LocalFilePath(t *testing.T) { - // Generate a real init-node machine config file from a fresh PKI bundle. mcPath := generateMachineConfigFile(t, "import-cluster", "cp1") input := fmt.Sprintf(` @@ -118,11 +136,9 @@ bootstrap: t.Fatalf("compileBootstrap (local file path) error: %v", err) } - // All expected output files must be present. + // Import mode output: namespace + TalosCluster + bootstrap-sequence only. for _, name := range []string{ "seam-tenant-namespace.yaml", - "seam-mc-import-cluster-cp1.yaml", - "seam-mc-import-cluster-wk1.yaml", "import-cluster.yaml", "bootstrap-sequence.yaml", } { @@ -130,11 +146,23 @@ bootstrap: t.Errorf("expected output file %q not found: %v", name, err) } } + + // MachineConfig CRs are admin-provided; compiler must not generate them. 
+ for _, name := range []string{ + "seam-mc-import-cluster-cp1.yaml", + "seam-mc-import-cluster-wk1.yaml", + } { + if _, err := os.Stat(filepath.Join(outDir, name)); err == nil { + t.Errorf("import mode must not generate MachineConfig CR %q; admin provides these", name) + } + } } // TestBootstrap_ImportExistingCluster_LocalFileMissingReturnsError verifies that -// when machineConfigPaths is non-empty but the referenced file does not exist, -// Compiler returns an error rather than panicking or silently producing output. +// when machineConfigPaths references a nonexistent file and controlPlaneEndpoint +// is absent (forcing endpoint extraction from the file), Compiler returns an error. +// When endpoint and disk are explicit, the file is not read; omitting them forces +// the compiler to attempt to read the file. func TestBootstrap_ImportExistingCluster_LocalFileMissingReturnsError(t *testing.T) { input := ` name: import-cluster @@ -147,7 +175,6 @@ importExistingCluster: true machineConfigPaths: cp1: /nonexistent/machineconfig.yaml bootstrap: - controlPlaneEndpoint: "https://10.0.0.10:6443" talosVersion: "v1.7.0" kubernetesVersion: "1.30.0" installDisk: "/dev/sda" @@ -164,9 +191,9 @@ bootstrap: } // TestBootstrap_ImportExistingCluster_InitNodeAbsentFromMapReturnsError verifies -// that when machineConfigPaths is non-empty but the init node hostname is absent -// from the map, Compiler returns an error. The init node entry is required for -// CA extraction; omitting it is a configuration error. +// that when machineConfigPaths is non-empty but contains no entry for any +// control-plane node, and controlPlaneEndpoint is absent (requiring extraction), +// Compiler returns an error about the missing endpoint. 
func TestBootstrap_ImportExistingCluster_InitNodeAbsentFromMapReturnsError(t *testing.T) { input := ` name: import-cluster @@ -179,7 +206,6 @@ importExistingCluster: true machineConfigPaths: worker1: /some/path/worker.yaml bootstrap: - controlPlaneEndpoint: "https://10.0.0.10:6443" talosVersion: "v1.7.0" kubernetesVersion: "1.30.0" installDisk: "/dev/sda" @@ -191,10 +217,10 @@ bootstrap: inputPath := writeInputFile(t, input) err := compileBootstrap(inputPath, t.TempDir(), "", "") if err == nil { - t.Fatal("expected error when init node hostname absent from machineConfigPaths; got nil") + t.Fatal("expected error when no control-plane node in machineConfigPaths and endpoint absent; got nil") } - if !containsStr(err.Error(), "cp1") { - t.Errorf("error message should mention the missing hostname %q; got: %v", "cp1", err) + if !containsStr(err.Error(), "controlPlaneEndpoint") { + t.Errorf("error should mention controlPlaneEndpoint; got: %v", err) } } @@ -247,11 +273,11 @@ bootstrap: assertContainsStr(t, content, "ontai.dev/cluster: my-cluster") } -// TestBootstrap_BootstrapMode_DoesNotEmitSeamTenantNamespaceManifest verifies that -// compileBootstrap in mode=bootstrap (importExistingCluster=false) does NOT emit -// seam-tenant-namespace.yaml. Platform creates the namespace for bootstrap/CAPI clusters. -// Governor ruling 2026-04-21. -func TestBootstrap_BootstrapMode_DoesNotEmitSeamTenantNamespaceManifest(t *testing.T) { +// TestBootstrap_BootstrapMode_EmitsSeamTenantNamespaceManifest verifies that +// compileBootstrap in mode=bootstrap emits seam-tenant-namespace.yaml. +// Compiler creates namespaces for all modes; platform no longer creates +// seam-tenant-{cluster} namespaces. CP-INV-004 amended 2026-05-31. 
+func TestBootstrap_BootstrapMode_EmitsSeamTenantNamespaceManifest(t *testing.T) { input := ` name: fresh-cluster namespace: seam-system @@ -277,9 +303,11 @@ bootstrap: } nsPath := filepath.Join(outDir, "seam-tenant-namespace.yaml") - if _, err := os.Stat(nsPath); err == nil { - t.Error("seam-tenant-namespace.yaml must not be emitted for mode=bootstrap") + nsData, err := os.ReadFile(nsPath) + if err != nil { + t.Fatalf("seam-tenant-namespace.yaml must be emitted for mode=bootstrap: %v", err) } + assertContainsStr(t, string(nsData), "name: seam-tenant-fresh-cluster") } // TestBootstrap_ImportMode_NamespaceNameIsSeamTenantNotTenant verifies that the @@ -328,11 +356,12 @@ bootstrap: // ── Kubernetes API fallback (machineConfigPaths absent) ─────────────────────── -// TestBootstrap_ImportExistingCluster_KubeconfigFallback verifies that when -// importExistingCluster=true and machineConfigPaths is absent, Compiler falls -// back to the Kubernetes API path and returns an error when the kubeconfig -// is unreachable. This is the existing Seam-cluster import path. -func TestBootstrap_ImportExistingCluster_KubeconfigFallback(t *testing.T) { +// TestBootstrap_ImportMode_NoMachineConfigPaths_Succeeds verifies that import mode +// with no machineConfigPaths (and explicit endpoint and disk) succeeds and emits +// namespace + TalosCluster + bootstrap-sequence. The kubeconfig API fallback was +// removed when the MachineConfig CRD migration eliminated PKI extraction. +// Admin provides MachineConfig CRs separately. CP-INV-004. +func TestBootstrap_ImportMode_NoMachineConfigPaths_Succeeds(t *testing.T) { input := ` name: import-cluster namespace: seam-system @@ -352,10 +381,14 @@ bootstrap: role: init ` inputPath := writeInputFile(t, input) - // Pass a non-existent kubeconfig — the API path must fail with an error. 
- err := compileBootstrap(inputPath, t.TempDir(), "/nonexistent/kubeconfig.yaml", "") - if err == nil { - t.Fatal("expected error for missing kubeconfig in API fallback path; got nil") + outDir := t.TempDir() + if err := compileBootstrap(inputPath, outDir, "", ""); err != nil { + t.Fatalf("expected import mode to succeed without machineConfigPaths; got: %v", err) + } + for _, name := range []string{"seam-tenant-namespace.yaml", "import-cluster.yaml", "bootstrap-sequence.yaml"} { + if _, err := os.Stat(filepath.Join(outDir, name)); err != nil { + t.Errorf("expected output file %q not found: %v", name, err) + } } } diff --git a/cmd/compiler/compile_bootstrap_test.go b/cmd/compiler/compile_bootstrap_test.go index 35444aa..618fd2c 100644 --- a/cmd/compiler/compile_bootstrap_test.go +++ b/cmd/compiler/compile_bootstrap_test.go @@ -52,8 +52,9 @@ func TestBootstrap_ProducesExpectedOutputFiles(t *testing.T) { t.Fatalf("compileBootstrap error: %v", err) } - // Expect 3 node Secrets + 1 TalosCluster + 1 bootstrap-sequence. + // Expect: namespace manifest + 3 MachineConfig CRs + TalosCluster CR + bootstrap-sequence. expectedFiles := []string{ + "seam-tenant-namespace.yaml", "seam-mc-ccs-mgmt-node1.yaml", "seam-mc-ccs-mgmt-node2.yaml", "seam-mc-ccs-mgmt-node3.yaml", @@ -68,10 +69,11 @@ func TestBootstrap_ProducesExpectedOutputFiles(t *testing.T) { } } -// TestBootstrap_SecretHasCorrectStructure verifies that the generated machineconfig -// Secret for the init node has the required Kubernetes Secret fields. +// TestBootstrap_MachineConfigCRHasCorrectStructure verifies that the generated +// MachineConfig CR for the init node has the required fields. // platform-schema.md §9: naming convention seam-mc-{cluster}-{hostname}. -func TestBootstrap_SecretHasCorrectStructure(t *testing.T) { +// Phase 3a: MachineConfig CRD replaces machineconfig Secrets for bootstrap output. 
+func TestBootstrap_MachineConfigCRHasCorrectStructure(t *testing.T) { outDir := t.TempDir() inputPath := writeInputFile(t, bootstrapInputYAML) @@ -81,15 +83,17 @@ func TestBootstrap_SecretHasCorrectStructure(t *testing.T) { data, err := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node1.yaml")) if err != nil { - t.Fatalf("read Secret YAML: %v", err) + t.Fatalf("read MachineConfig CR YAML: %v", err) } content := string(data) - assertContainsStr(t, content, "apiVersion: v1") - assertContainsStr(t, content, "kind: Secret") + assertContainsStr(t, content, "apiVersion: platform.ontai.dev/v1alpha1") + assertContainsStr(t, content, "kind: MachineConfig") assertContainsStr(t, content, "name: seam-mc-ccs-mgmt-node1") assertContainsStr(t, content, "namespace: seam-tenant-ccs-mgmt") - assertContainsStr(t, content, "machineconfig.yaml:") + assertContainsStr(t, content, "role: init") + assertContainsStr(t, content, "nodeHostname: node1") + assertContainsStr(t, content, "nodeIP: 10.20.0.11") assertContainsStr(t, content, "ontai.dev/cluster: ccs-mgmt") } @@ -316,12 +320,9 @@ bootstrap: t.Fatalf("compileBootstrap error: %v", err) } - // The Secret YAML should contain the default installer image reference. + // The MachineConfig CR's spec.machine should contain the default installer image. data, _ := os.ReadFile(filepath.Join(outDir, "seam-mc-ccs-mgmt-node1.yaml")) - content := string(data) - assertContainsStr(t, content, "machineconfig.yaml:") - // machineconfig.yaml should contain the default installer image. - assertContainsStr(t, content, "ghcr.io/siderolabs/installer:v1.7.0") + assertContainsStr(t, string(data), "ghcr.io/siderolabs/installer:v1.7.0") } // WS2 — Bootstrap malformed input validation tests. 
diff --git a/cmd/compiler/compile_enable.go b/cmd/compiler/compile_enable.go index 27a6ec8..14c8b2b 100644 --- a/cmd/compiler/compile_enable.go +++ b/cmd/compiler/compile_enable.go @@ -938,9 +938,12 @@ func writeBootstrapRBACPolicy(dir string) error { } // writeBootstrapPermissionSets writes guardian-permissionsets.yaml to dir. -// Emits ONLY management-maximum, the Layer 1 fleet ceiling (CS-INV-008). -// Per-operator PermissionSets are not emitted. All Seam operator RBACProfiles -// reference management-maximum directly. guardian-schema.md §6, §19. +// Emits two Layer 1 PermissionSets: +// - management-maximum: the fleet ceiling; all Seam operator RBACProfiles reference it. +// - extensions-maximum: the extension ceiling; covers CRDs for all ONT-managed extension +// operators (EXT-1 through EXT-10). RECON-CMN2. +// +// Per-operator PermissionSets are not emitted. guardian-schema.md §6, §19, CS-INV-008. func writeBootstrapPermissionSets(dir string) error { // rule builds a single permission rule map. rule := func(apiGroups, resources, verbs []string) map[string]interface{} { @@ -975,14 +978,37 @@ func writeBootstrapPermissionSets(dir string) error { rule([]string{"*"}, []string{"*"}, allVerbs), }, }, + { + // extensions-maximum: Layer 1 extension ceiling. Covers CRDs for all ten + // ONT-managed extension operator categories (EXT-1 through EXT-10). Extension + // RBACProfiles in seam-tenant-* namespaces reference cluster-maximum (the + // per-cluster copy) for permission enforcement; this PermissionSet declares the + // fleet-level CRD-group boundary for governance audits. RECON-CMN2. 
+ name: "extensions-maximum", + labels: map[string]string{ + "ontai.dev/managed-by": "compiler", + "ontai.dev/permission-set-type": "bootstrap", + "ontai.dev/policy-type": "management", + }, + description: "Extension permission ceiling -- CRDs for ONT-managed extension operators", + permissions: []map[string]interface{}{ + rule([]string{"external-secrets.io"}, []string{"externalsecrets", "secretstores", "clustersecretstores"}, allVerbs), + rule([]string{"kyverno.io"}, []string{"clusterpolicies", "policies", "policyreports", "clusterpolicyreports"}, allVerbs), + rule([]string{"aquasecurity.github.io"}, []string{"vulnerabilityreports", "configauditreports", "clustervulnerabilityreports"}, allVerbs), + rule([]string{"velero.io"}, []string{"backups", "backupstoragelocations", "restores", "schedules", "volumesnapshotlocations"}, allVerbs), + rule([]string{"cost.grafana.com"}, []string{"*"}, allVerbs), + rule([]string{"monitoring.coreos.com"}, []string{"servicemonitors", "prometheusrules", "podmonitors"}, allVerbs), + rule([]string{"apiextensions.crossplane.io", "pkg.crossplane.io"}, []string{"*"}, allVerbs), + }, + }, } var buf bytes.Buffer buf.WriteString("# Bootstrap PermissionSet CRs\n") buf.WriteString("# Generated by: compiler enable (phase 1 guardian-bootstrap)\n") - buf.WriteString("# management-maximum is the Layer 1 fleet ceiling (guardian-schema.md §19 Layer 1).\n") - buf.WriteString("# CS-INV-008: exactly one PermissionSet at Layer 1. All Seam operator RBACProfiles\n") - buf.WriteString("# reference management-maximum directly. No per-operator PermissionSets are emitted.\n") + buf.WriteString("# management-maximum: Layer 1 fleet ceiling (guardian-schema.md §19 Layer 1).\n") + buf.WriteString("# extensions-maximum: Layer 1 extension ceiling for ONT-managed extension operators.\n") + buf.WriteString("# CS-INV-008. 
No per-operator PermissionSets are emitted.\n") for _, s := range sets { spec := map[string]interface{}{ @@ -1653,6 +1679,7 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { "platform-dispatcher-deployments.yaml", "platform-dispatcher-metrics-services.yaml", "seam-service.yaml", + "seam-declaring-principal-webhook.yaml", "seam-lineage-webhooks.yaml", } @@ -1664,6 +1691,8 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { "provisioned=true (kubectl get rbacprofiles -n seam-system). " + "These operators must be operational before Conductor's RBACProfile " + "can be provisioned in phase 4. " + + "Verify seam MutatingWebhookConfiguration is registered: " + + "kubectl get mutatingwebhookconfigurations | grep seam-root-declaration. " + "Verify seam ValidatingWebhookConfigurations are registered: " + "kubectl get validatingwebhookconfigurations | grep seam-lineage.", ApplyOrder: files, @@ -1727,6 +1756,14 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { return err } + // seam-declaring-principal-webhook.yaml — MutatingWebhookConfiguration that stamps + // infrastructure.ontai.dev/declaring-principal on TalosCluster and PackDelivery at + // CREATE time. Required for LineageController to populate declaringPrincipal on + // LineageRecord with the actual requesting principal. + if err := writeSeamDeclaringPrincipalWebhook(dir, seamNamespace); err != nil { + return err + } + // seam-lineage-webhooks.yaml — three ValidatingWebhookConfigurations for LineageRecord // governance: immutability (Decision 1), authorship gate (Decision 3), domainRef (Decision 2). if err := writeSeamWebhooks(dir); err != nil { @@ -1736,6 +1773,82 @@ func writePhase3PlatformDispatcher(output string, ops []operatorSpec) error { return nil } +// writeSeamDeclaringPrincipalWebhook writes seam-declaring-principal-webhook.yaml to dir. 
+// Emits a MutatingWebhookConfiguration that intercepts CREATE for TalosCluster and +// PackDelivery and stamps infrastructure.ontai.dev/declaring-principal with the +// requesting user's identity from AdmissionReview.UserInfo.Username. +// +// Without this webhook, the LineageController falls back to "system:unknown" for +// declaringPrincipal on every LineageRecord, making ownership tracing impossible. +// +// caBundle injected by cert-manager CA injector via cert-manager.io/inject-ca-from. +func writeSeamDeclaringPrincipalWebhook(dir, seamNamespace string) error { + injectAnnotation := seamNamespace + "/seam-webhook-cert" + + mwc := map[string]interface{}{ + "apiVersion": "admissionregistration.k8s.io/v1", + "kind": "MutatingWebhookConfiguration", + "metadata": map[string]interface{}{ + "name": "seam-root-declaration-principal", + "annotations": map[string]string{ + "ontai.dev/managed-by": "compiler", + "cert-manager.io/inject-ca-from": injectAnnotation, + }, + }, + "webhooks": []map[string]interface{}{ + { + "name": "mutate-root-declaration-declaring-principal.seam.ontai.dev", + "admissionReviewVersions": []string{"v1"}, + "sideEffects": "None", + "failurePolicy": "Fail", + "rules": []map[string]interface{}{ + { + "apiGroups": []string{"seam.ontai.dev"}, + "apiVersions": []string{"v1alpha1"}, + "operations": []string{"CREATE"}, + "resources": []string{"talosclusters", "packdeliveries"}, + "scope": "Namespaced", + }, + }, + "namespaceSelector": map[string]interface{}{ + "matchExpressions": []map[string]interface{}{ + { + "key": "seam.ontai.dev/webhook-mode", + "operator": "NotIn", + "values": []string{"exempt"}, + }, + }, + }, + "clientConfig": map[string]interface{}{ + "service": map[string]interface{}{ + "name": "seam", + "namespace": seamNamespace, + "path": "/mutate-root-declaration-declaring-principal", + "port": 443, + }, + }, + }, + }, + } + + data, err := yaml.Marshal(mwc) + if err != nil { + return fmt.Errorf("marshal seam declaring-principal 
MutatingWebhookConfiguration: %w", err) + } + + var buf bytes.Buffer + buf.WriteString("# seam Root Declaration Declaring Principal MutatingWebhookConfiguration\n") + buf.WriteString("# Generated by: compiler enable (phase 3 platform-dispatcher)\n") + buf.WriteString("# Stamps infrastructure.ontai.dev/declaring-principal on TalosCluster and\n") + buf.WriteString("# PackDelivery at CREATE time from AdmissionReview.UserInfo.Username.\n") + buf.WriteString("# Required for LineageController declaringPrincipal traceability.\n") + buf.WriteString("# caBundle injected by cert-manager CA injector.\n") + buf.WriteString("---\n") + buf.Write(data) + + return os.WriteFile(filepath.Join(dir, "seam-declaring-principal-webhook.yaml"), buf.Bytes(), 0644) +} + // writePlatformExecutorRoleFile emits a Role and RoleBinding in ont-system for the // platform-executor SA. Conductor executor Jobs run as this SA and write // InfrastructureTalosClusterOperationResult CRs to POD_NAMESPACE (ont-system). @@ -1949,11 +2062,20 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa files := []string{ "leaderelection.yaml", } - // pack-deploy-queue.yaml and dispatcher-runner.yaml require Kueue and seam-tenant-{name} - // namespaces, which exist only on the management cluster (INV-003). - if clusterName != "" && clusterRole != "tenant" { + if clusterName != "" { + // pack-deploy-queue.yaml and dispatcher-runner.yaml are required for both + // management and tenant cluster enable bundles. These resources live in + // seam-tenant-{clusterName} on the management cluster and must be applied to + // the management cluster (not the tenant cluster). The enable script is + // responsible for routing these files to the correct kubectl context. 
files = append(files, "pack-deploy-queue.yaml", "dispatcher-runner.yaml") } + if clusterName != "" && clusterRole != "tenant" { + // watchdog-queue.yaml is management-cluster-only: the conductor watchdog + // submits remediation Jobs in ont-system, which only exists on ccs-mgmt. + // conductor-schema.md §6 RuntimeDrift remediation. + files = append(files, "watchdog-queue.yaml") + } meta := phaseMeta{ Phase: "post-bootstrap", @@ -1971,19 +2093,27 @@ func writePhase5PostBootstrap(output string, operators []operatorSpec, clusterNa return err } - // Kueue and seam-tenant-{name} resources are management-cluster-only (INV-003). - if clusterName != "" && clusterRole != "tenant" { - // pack-deploy-queue.yaml — Kueue LocalQueue in seam-tenant-{clusterName}. - // wrapper-schema.md §9 pack delivery chain. + if clusterName != "" { + // pack-deploy-queue.yaml — Kueue LocalQueue in seam-tenant-{clusterName} on the + // management cluster. Required for pack-deploy Job admission for any cluster. + // dispatcher-schema.md §9, conductor-schema.md §5 (execute mode). if err := writePackDeployQueueYAML(dir, clusterName); err != nil { return err } - // dispatcher-runner.yaml — SA, Role, RoleBinding for pack-deploy Job identity. - // guardian-schema.md §6, INV-004. + // dispatcher-runner.yaml — SA, Role, RoleBinding for the pack-deploy Job identity + // in seam-tenant-{clusterName} on the management cluster. Required for both + // management and tenant cluster PackExecution RBAC gates. INV-004. if err := writeDispatcherRunnerRBACYAML(dir, clusterName); err != nil { return err } } + if clusterName != "" && clusterRole != "tenant" { + // watchdog-queue.yaml — Kueue LocalQueue in ont-system for watchdog Jobs. + // conductor-schema.md §6 RuntimeDrift remediation. 
+ if err := writeWatchdogQueueYAML(dir); err != nil { + return err + } + } return nil } @@ -2400,6 +2530,30 @@ func buildOperatorDeployment(op operatorSpec) appsv1.Deployment { }) } + // Tenant conductor mounts the management cluster kubeconfig so all target-cluster + // drift loops (TalosVersionDriftLoop, KubernetesVersionDriftLoop, PackPodHealthLoop, + // PackReceiptDriftLoop, etc.) can write DriftSignals and read PackInstalled on + // ccs-mgmt. Without MGMT_KUBECONFIG_PATH the gate in agent.go silently disables + // every loop that requires management cluster access. conductor-schema.md §15. + if op.Name == "conductor" && op.Role == "tenant" { + env = append(env, + corev1.EnvVar{Name: "MGMT_KUBECONFIG_PATH", Value: "/etc/conductor/mgmt/kubeconfig"}, + ) + volumes = append(volumes, corev1.Volume{ + Name: "conductor-mgmt-kubeconfig", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: "conductor-mgmt-kubeconfig", + }, + }, + }) + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: "conductor-mgmt-kubeconfig", + MountPath: "/etc/conductor/mgmt", + ReadOnly: true, + }) + } + // Platform, Dispatcher, and seam carry OPERATOR_NAMESPACE so their webhook // servers and controllers can resolve their own namespace without downward API // duplication. OPERATOR_NAMESPACE is also required by Guardian admission hooks @@ -2413,6 +2567,22 @@ func buildOperatorDeployment(op operatorSpec) appsv1.Deployment { }) } + // seam carries LINEAGE_CNPG_URI so the LineageController can archive + // LineageRecords to CNPG on root declaration deletion. The URI is sourced from + // the guardian-db-app Secret which CNPG generates for the app user. + // seam-schema.md §4 CNPG Lineage Archival. INV-016. 
+ if op.Name == "seam" { + env = append(env, corev1.EnvVar{ + Name: "LINEAGE_CNPG_URI", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "guardian-db-app"}, + Key: "uri", + }, + }, + }) + } + // Platform carries CONDUCTOR_REGISTRY so it can construct Conductor executor Job // image references without hardcoding the registry. conductor-schema.md §15. if op.ConductorRegistry != "" { @@ -2715,6 +2885,42 @@ func writePackDeployQueueYAML(dir, clusterName string) error { return os.WriteFile(filepath.Join(dir, "pack-deploy-queue.yaml"), buf.Bytes(), 0644) } +// writeWatchdogQueueYAML emits a Kueue LocalQueue named watchdog-queue in ont-system +// referencing ClusterQueue seam-pack-deploy. The LocalQueue gates watchdog Job admission +// for RuntimeDrift remediation capabilities submitted by the conductor agent. +// conductor-schema.md §6 RuntimeDrift remediation. +func writeWatchdogQueueYAML(dir string) error { + lq := map[string]interface{}{ + "apiVersion": "kueue.x-k8s.io/v1beta1", + "kind": "LocalQueue", + "metadata": map[string]interface{}{ + "name": "watchdog-queue", + "namespace": "ont-system", + "labels": map[string]interface{}{ + "ontai.dev/managed-by": "compiler", + }, + }, + "spec": map[string]interface{}{ + "clusterQueue": "seam-pack-deploy", + }, + } + + data, err := yaml.Marshal(lq) + if err != nil { + return fmt.Errorf("marshal watchdog-queue LocalQueue: %w", err) + } + + var buf bytes.Buffer + buf.WriteString("# Kueue LocalQueue — watchdog-queue in ont-system\n") + buf.WriteString("# References ClusterQueue seam-pack-deploy.\n") + buf.WriteString("# Required for watchdog Job admission for RuntimeDrift remediation.\n") + buf.WriteString("# Generated by: compiler enable (phase 05 post-bootstrap)\n") + buf.WriteString("# conductor-schema.md §6.\n") + buf.WriteString("---\n") + buf.Write(data) + return os.WriteFile(filepath.Join(dir, "watchdog-queue.yaml"), buf.Bytes(), 0644) 
+} + // writeDispatcherRunnerRBACYAML emits ServiceAccount, Role, and RoleBinding for the // dispatcher-runner identity in seam-tenant-{clusterName}. The Role is annotated with // ontai.dev/rbac-owner=guardian per INV-004. diff --git a/cmd/compiler/compile_enable_test.go b/cmd/compiler/compile_enable_test.go index fd605a5..15feef7 100644 --- a/cmd/compiler/compile_enable_test.go +++ b/cmd/compiler/compile_enable_test.go @@ -441,10 +441,10 @@ func TestEnable_ManagementMaximumHasPolicyTypeLabel(t *testing.T) { assertContainsStr(t, content, "ontai.dev/policy-type: management") } -// TestEnable_OnlyManagementMaximumPermissionSet verifies that guardian-permissionsets.yaml -// contains exactly one PermissionSet document (management-maximum) and that it is the -// wildcard Layer 1 ceiling. Per-operator PermissionSets must not be emitted. CS-INV-008. -func TestEnable_OnlyManagementMaximumPermissionSet(t *testing.T) { +// TestEnable_BootstrapPermissionSetCount verifies that guardian-permissionsets.yaml +// contains exactly two PermissionSet documents: management-maximum and extensions-maximum. +// Per-operator PermissionSets must not be emitted. CS-INV-008, RECON-CMN2. +func TestEnable_BootstrapPermissionSetCount(t *testing.T) { outDir := t.TempDir() if err := compileEnableBundle(outDir, "dev", defaultRegistry, "", false, "", "", "", "", ""); err != nil { t.Fatalf("compileEnableBundle error: %v", err) @@ -452,16 +452,43 @@ func TestEnable_OnlyManagementMaximumPermissionSet(t *testing.T) { content := readPhaseFile(t, outDir, "01-guardian-bootstrap", "guardian-permissionsets.yaml") - // Count PermissionSet documents. 
count := strings.Count(content, "kind: PermissionSet") - if count != 1 { - t.Errorf("expected exactly 1 PermissionSet document, got %d (CS-INV-008)", count) + if count != 2 { + t.Errorf("expected exactly 2 PermissionSet documents (management-maximum + extensions-maximum), got %d (CS-INV-008, RECON-CMN2)", count) } - // The sole document must be management-maximum. if !strings.Contains(content, "name: management-maximum") { t.Error("expected management-maximum PermissionSet document") } + if !strings.Contains(content, "name: extensions-maximum") { + t.Error("expected extensions-maximum PermissionSet document (RECON-CMN2)") + } +} + +// TestEnable_ExtensionsMaximumPermissionSet verifies that extensions-maximum covers all +// ten ONT extension operator CRD groups. RECON-CMN2. +func TestEnable_ExtensionsMaximumPermissionSet(t *testing.T) { + outDir := t.TempDir() + if err := compileEnableBundle(outDir, "dev", defaultRegistry, "", false, "", "", "", "", ""); err != nil { + t.Fatalf("compileEnableBundle error: %v", err) + } + + content := readPhaseFile(t, outDir, "01-guardian-bootstrap", "guardian-permissionsets.yaml") + + for _, group := range []string{ + "external-secrets.io", + "kyverno.io", + "aquasecurity.github.io", + "velero.io", + "cost.grafana.com", + "monitoring.coreos.com", + "apiextensions.crossplane.io", + "pkg.crossplane.io", + } { + if !strings.Contains(content, group) { + t.Errorf("extensions-maximum missing CRD group %q (RECON-CMN2)", group) + } + } } // TestEnable_RBACProfilesRefManagementPolicyAndMaximum verifies that all emitted diff --git a/cmd/compiler/main.go b/cmd/compiler/main.go index 93aa5d7..0acec76 100644 --- a/cmd/compiler/main.go +++ b/cmd/compiler/main.go @@ -42,6 +42,8 @@ func main() { runComponentSubcommand(os.Args[2:]) case "maintenance": runMaintenanceSubcommand(os.Args[2:]) + case "addnode": + runAddNodeSubcommand(os.Args[2:]) case "scaffold": runScaffoldSubcommand(os.Args[2:]) case "domain": @@ -198,6 +200,7 @@ func printUsageTo(w 
*os.File) { fmt.Fprintln(w, " packbuild Compile a PackBuild spec into a ClusterPack CR") fmt.Fprintln(w, " maintenance Compile a MaintenanceBundle CR with pre-resolved scheduling context") fmt.Fprintln(w, " component Produce RBACProfile CR YAML from the embedded catalog or a descriptor") + fmt.Fprintln(w, " addnode Generate a MachineConfig CR for a node being added to an existing cluster") fmt.Fprintln(w, " scaffold Generate a seam-domain operator scaffold pre-wired with seam-sdk") fmt.Fprintln(w, " domain Reserved — not yet implemented") fmt.Fprintln(w, "") diff --git a/config/crd/seam.ontai.dev_runnerconfigs.yaml b/config/crd/seam.ontai.dev_runnerconfigs.yaml deleted file mode 100644 index 094bf6e..0000000 --- a/config/crd/seam.ontai.dev_runnerconfigs.yaml +++ /dev/null @@ -1,323 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.16.1 - name: runnerconfigs.seam.ontai.dev -spec: - group: seam.ontai.dev - names: - kind: RunnerConfig - listKind: RunnerConfigList - plural: runnerconfigs - shortNames: - - rc - singular: runnerconfig - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .spec.clusterRef - name: Cluster - type: string - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: |- - RunnerConfig is the seam-core CRD for Conductor agent runtime configuration. - Owned by seam-core; authored exclusively by the platform operator. INV-009. - conductor-schema.md. MIGRATION-3.8. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - RunnerConfigSpec is the operator-generated operational contract for a - specific cluster. Generated at runtime by platform using the runner shared library. - Never human-authored. INV-009, INV-010. conductor-schema.md. - properties: - clusterRef: - description: ClusterRef is the name of the TalosCluster this RunnerConfig - is authoritative for. - type: string - maintenanceTargetNodes: - description: MaintenanceTargetNodes is the list of node names that - are the subject of the operation. - items: - type: string - type: array - operationalHistory: - description: OperationalHistory is an append-only record of completed - RunnerConfig executions. - items: - description: |- - RunnerOperationalHistoryEntry is a single append-only audit record describing one - configuration change applied to this RunnerConfig. Never truncated. - properties: - appliedAt: - description: AppliedAt is the time this change was applied. - format: date-time - type: string - appliedBy: - description: AppliedBy identifies who applied the change. - type: string - concern: - description: Concern identifies what aspect of configuration - changed. - type: string - newValue: - description: NewValue is the value after the change. - type: string - previousValue: - description: PreviousValue is the value before the change. Empty - for initial entries. 
- type: string - required: - - appliedAt - - appliedBy - - concern - - newValue - type: object - type: array - operatorLeaderNode: - description: OperatorLeaderNode is the node hosting the leader pod - of the initiating operator. - type: string - phases: - description: Phases is the ordered list of operational phases for - this cluster's Conductor lifecycle. - items: - description: RunnerPhaseConfig carries per-phase parameters for - the runner's execution context. - properties: - name: - description: Name identifies the phase. - type: string - parameters: - additionalProperties: - type: string - description: Parameters holds phase-specific key-value configuration. - type: object - required: - - name - type: object - type: array - runnerImage: - description: |- - RunnerImage is the fully qualified container image reference for the Conductor agent. - Tag convention: v{talosVersion}-r{revision} stable, dev/dev-rc{N} development. INV-011. - type: string - selfOperation: - description: SelfOperation is true when the Job's execution cluster - and the target cluster are the same. - type: boolean - steps: - description: Steps is the ordered list of execution steps across all - phases. - items: - description: RunnerConfigStep declares one step in a multi-step - operation intent. - properties: - capability: - description: Capability is the named Conductor capability to - invoke for this step. - type: string - dependsOn: - description: DependsOn is the name of a prior step that must - complete before this step begins. - type: string - haltOnFailure: - description: |- - HaltOnFailure controls sequencer behaviour when this step fails. - When true, failure terminates the RunnerConfig with no further steps executing. - type: boolean - name: - description: Name is the unique identifier for this step within - the RunnerConfig. 
- type: string - parameters: - additionalProperties: - type: string - description: Parameters is the input parameter map passed to - the capability at Job materialisation time. - type: object - required: - - capability - - name - type: object - type: array - required: - - clusterRef - - runnerImage - type: object - status: - description: |- - RunnerConfigStatus is written exclusively by the Conductor agent leader. - CR-INV-006. - properties: - agentLeader: - description: AgentLeader is the pod name of the current Conductor - agent leader. - type: string - agentVersion: - description: AgentVersion is the version string of the Conductor agent - binary currently running. - type: string - capabilities: - description: |- - Capabilities is the self-declared capability manifest emitted by the Conductor agent on startup. - CR-INV-005. - items: - description: RunnerCapabilityEntry is one capability declared by - the Conductor agent on startup. - properties: - description: - description: Description is a human-readable description of - what this capability does. - type: string - name: - description: Name is the capability name (e.g., pack-deploy, - talos-upgrade). - type: string - version: - description: Version is the capability version declared by the - agent. - type: string - required: - - name - - version - type: object - type: array - conditions: - description: Conditions is the standard Kubernetes condition list - for this RunnerConfig. - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. 
- This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - failedStep: - description: |- - FailedStep is the name of the first step that reached the Failed phase. - Present only when Phase="Failed". conductor-schema.md §17. - type: string - phase: - description: |- - Phase is the terminal execution phase written by Conductor execute mode. - "Completed" means all steps succeeded. "Failed" means at least one step failed. - Empty means execution is in progress. Platform operators watch this field to - detect terminal conditions without scanning StepResults. conductor-schema.md §17. 
- type: string - stepResults: - description: StepResults is the ordered list of step result records - written by Conductor execute mode. - items: - description: RunnerConfigStepResult is the status record for one - step. - properties: - completedAt: - description: CompletedAt is the time this step finished execution. - format: date-time - type: string - message: - description: Message is additional context about the step outcome. - type: string - name: - description: Name matches the Name field of the corresponding - RunnerConfigStep in spec. - type: string - startedAt: - description: StartedAt is the time this step began execution. - format: date-time - type: string - status: - allOf: - - enum: - - Succeeded - - Failed - - Skipped - - enum: - - Succeeded - - Failed - - Skipped - description: Status is the terminal status of this step execution. - type: string - required: - - name - - status - type: object - type: array - type: object - type: object - served: true - storage: true - subresources: - status: {} diff --git a/go.mod b/go.mod index e88fae0..13818d3 100644 --- a/go.mod +++ b/go.mod @@ -18,17 +18,19 @@ require ( github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 github.com/ontai-dev/conductor-sdk v0.0.0-00010101000000-000000000000 + github.com/ontai-dev/dispatcher v0.0.0-00010101000000-000000000000 github.com/ontai-dev/guardian v0.0.0-00010101000000-000000000000 github.com/ontai-dev/platform v0.0.0-00010101000000-000000000000 - github.com/ontai-dev/dispatcher v0.0.0-00010101000000-000000000000 github.com/ontai-dev/seam v0.0.0-00010101000000-000000000000 github.com/ontai-dev/seam-sdk v0.0.0-00010101000000-000000000000 github.com/prometheus/client_golang v1.23.2 github.com/siderolabs/talos/pkg/machinery v1.12.6 + golang.org/x/time v0.14.0 google.golang.org/grpc v1.79.3 gopkg.in/yaml.v3 v3.0.1 helm.sh/helm/v3 v3.17.3 k8s.io/api v0.35.3 + k8s.io/apiextensions-apiserver v0.35.0 k8s.io/apimachinery v0.35.3 k8s.io/client-go v0.35.3 
sigs.k8s.io/controller-runtime v0.23.3 @@ -158,7 +160,6 @@ require ( golang.org/x/sys v0.41.0 // indirect golang.org/x/term v0.40.0 // indirect golang.org/x/text v0.34.0 // indirect - golang.org/x/time v0.14.0 // indirect golang.org/x/tools v0.41.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect @@ -166,7 +167,6 @@ require ( google.golang.org/protobuf v1.36.10 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect diff --git a/internal/agent/backup_health_loop.go b/internal/agent/backup_health_loop.go new file mode 100644 index 0000000..acc6d49 --- /dev/null +++ b/internal/agent/backup_health_loop.go @@ -0,0 +1,291 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// backupStorageLocationGVR is the GroupVersionResource for BackupStorageLocation CRs (Velero). +var backupStorageLocationGVR = schema.GroupVersionResource{ + Group: "velero.io", + Version: "v1", + Resource: "backupstoragelocations", +} + +// veleroBackupGVR is the GroupVersionResource for Backup CRs (Velero). +var veleroBackupGVR = schema.GroupVersionResource{ + Group: "velero.io", + Version: "v1", + Resource: "backups", +} + +const bslSignalPrefix = "drift-bsl-" +const backupRPOSignalPrefix = "drift-backup-rpo-" + +// defaultBackupRPO is the maximum age of a successful backup before a RPO breach signal is emitted. 
+const defaultBackupRPO = 25 * time.Hour + +// BackupHealthLoop runs on conductor role=management. On each cycle it: +// 1. Lists BackupStorageLocation CRs in the management namespace; emits BackupStorageUnavailable +// when status.phase is not Available. +// 2. Lists Backup CRs and finds the most recent successful backup; emits BackupRPOBreached +// when no successful backup is younger than defaultBackupRPO (25h, covering daily schedules +// with a 1-hour grace window). +// +// Skips cleanly when Velero CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-N2. +type BackupHealthLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string + rpo time.Duration +} + +// NewBackupHealthLoop constructs a BackupHealthLoop for the given namespace. +func NewBackupHealthLoop(client dynamic.Interface, namespace, clusterRef string) *BackupHealthLoop { + return &BackupHealthLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + rpo: defaultBackupRPO, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *BackupHealthLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *BackupHealthLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *BackupHealthLoop) checkOnce(ctx context.Context) { + l.checkBSLs(ctx) + l.checkBackupRPO(ctx) +} + +func (l *BackupHealthLoop) checkBSLs(ctx context.Context) { + list, err := l.client.Resource(backupStorageLocationGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("backup health loop: list BackupStorageLocations in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkBSL(ctx, &list.Items[i]) + } +} + +func (l *BackupHealthLoop) checkBSL(ctx context.Context, bsl *k8sunstructured.Unstructured) { + name := bsl.GetName() + signalName := bslSignalPrefix + name + + status, _, _ := unstructuredNestedMap(bsl.Object, "status") + phase, _ := status["phase"].(string) + + if phase == "Available" { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: bsl=%q phase=%q -- observe-only mode, no DriftSignal written\n", name, phase) + return + } + + driftReason := fmt.Sprintf("BackupStorageLocation unavailable: name=%s phase=%s", name, phase) + l.emitSignal(ctx, signalName, "BackupStorageUnavailable", name, "velero.io", "BackupStorageLocation", driftReason) +} + +func (l *BackupHealthLoop) checkBackupRPO(ctx context.Context) { + list, err := l.client.Resource(veleroBackupGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("backup health loop: list Backups in %s: %v\n", l.namespace, err) + return + } + + signalName := backupRPOSignalPrefix + "cluster" + latestSuccess := 
l.findLatestSuccessfulBackup(list.Items) + + if latestSuccess.IsZero() { + // No successful backup at all. + if len(list.Items) == 0 { + // No backups scheduled yet -- not a breach. + return + } + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: no successful backup found -- observe-only mode, no DriftSignal written\n") + return + } + driftReason := "BackupRPOBreached: no successful backup found" + l.emitSignal(ctx, signalName, "BackupRPOBreached", "cluster", "velero.io", "Backup", driftReason) + return + } + + age := time.Since(latestSuccess) + if age <= l.rpo { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("backup health loop: last successful backup age=%v exceeds RPO=%v -- observe-only mode, no DriftSignal written\n", age.Round(time.Minute), l.rpo) + return + } + + driftReason := fmt.Sprintf("BackupRPOBreached: last successful backup age=%v exceeds RPO=%v", age.Round(time.Minute), l.rpo) + l.emitSignal(ctx, signalName, "BackupRPOBreached", "cluster", "velero.io", "Backup", driftReason) +} + +// findLatestSuccessfulBackup returns the completion time of the most recent Completed backup. 
+func (l *BackupHealthLoop) findLatestSuccessfulBackup(items []k8sunstructured.Unstructured) time.Time { + var latest time.Time + for i := range items { + status, _, _ := unstructuredNestedMap(items[i].Object, "status") + phase, _ := status["phase"].(string) + if phase != "Completed" { + continue + } + completionStr, _ := status["completionTimestamp"].(string) + t, err := time.Parse(time.RFC3339, completionStr) + if err != nil { + continue + } + if t.After(latest) { + latest = t + } + } + return latest +} + +func (l *BackupHealthLoop) emitSignal(ctx context.Context, signalName, signalKind, resourceName, group, kind, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("backup health loop: get DriftSignal %s: %v\n", signalName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": signalKind, + "driftLayer": "infrastructure", + "correlationID": fmt.Sprintf("backup-%s-%d", resourceName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": group, + "kind": kind, + "namespace": l.namespace, + "name": resourceName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("backup health loop: create DriftSignal %s: %v\n", signalName, cErr) + } + fmt.Printf("backup health loop: %s -- DriftSignal written\n", driftReason) + return + } + + spec, _, _ := 
unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("backup-%s-%d", resourceName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("backup health loop: reset DriftSignal %s: %v\n", signalName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("backup health loop: increment escalation counter %s: %v\n", signalName, pErr) + } + } +} + +func (l *BackupHealthLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != 
nil { + fmt.Printf("backup health loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/backup_health_loop_test.go b/internal/agent/backup_health_loop_test.go new file mode 100644 index 0000000..ea9cda0 --- /dev/null +++ b/internal/agent/backup_health_loop_test.go @@ -0,0 +1,194 @@ +package agent + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var backupTestGVRs = map[schema.GroupVersionResource]string{ + backupStorageLocationGVR: "BackupStorageLocationList", + veleroBackupGVR: "BackupList", + driftSignalGVR: "DriftSignalList", +} + +func newBackupFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), backupTestGVRs, objs...) +} + +func fakeBSL(name, phase string) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "velero.io/v1", + "kind": "BackupStorageLocation", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "status": map[string]interface{}{"phase": phase}, + }} +} + +func fakeBackup(name, phase, completionTimestamp string) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "velero.io/v1", + "kind": "Backup", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "status": map[string]interface{}{"phase": phase, "completionTimestamp": completionTimestamp}, + }} +} + +func TestBackupHealthLoop_BSLAvailable_NoSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + backup := fakeBackup("daily-backup", "Completed", time.Now().Add(-1*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, backup) + l := 
NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal for available BSL with recent backup, got create on %s", a.GetResource().Resource) + } + } +} + +func TestBackupHealthLoop_BSLUnavailable_EmitsSignal(t *testing.T) { + bsl := fakeBSL("default", "Unavailable") + client := newBackupFakeClient(bsl) + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := bslSignalPrefix + "default" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected BackupStorageUnavailable DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "BackupStorageUnavailable" { + t.Errorf("signalKind = %q, want BackupStorageUnavailable", kind) + } +} + +func TestBackupHealthLoop_RPOBreached_EmitsSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + // Backup completed 30 hours ago -- exceeds defaultBackupRPO (25h). 
+ oldBackup := fakeBackup("old-backup", "Completed", time.Now().Add(-30*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, oldBackup) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := backupRPOSignalPrefix + "cluster" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected BackupRPOBreached DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "BackupRPOBreached" { + t.Errorf("signalKind = %q, want BackupRPOBreached", kind) + } +} + +func TestBackupHealthLoop_RecentBackup_NoRPOSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + recentBackup := fakeBackup("recent-backup", "Completed", time.Now().Add(-2*time.Hour).UTC().Format(time.RFC3339)) + + client := newBackupFakeClient(bsl, recentBackup) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal for recent backup, got create on %s", a.GetResource().Resource) + } + } +} + +func TestBackupHealthLoop_ObserveOnly_NoSignal(t *testing.T) { + bsl := fakeBSL("default", "Unavailable") + client := newBackupFakeClient(bsl) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestBackupHealthLoop_BSLConfirmedWhenAvailable(t 
*testing.T) { + bsl := fakeBSL("default", "Available") + backup := fakeBackup("daily", "Completed", time.Now().Add(-1*time.Hour).UTC().Format(time.RFC3339)) + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": bslSignalPrefix + "default", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newBackupFakeClient(bsl, backup, existingSignal) + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when BSL returns to Available") + } +} + +func TestBackupHealthLoop_FindLatestSuccessfulBackup(t *testing.T) { + l := NewBackupHealthLoop(nil, "seam-system", "ccs-mgmt") + + older := time.Now().Add(-10 * time.Hour) + newer := time.Now().Add(-2 * time.Hour) + + items := []unstructured.Unstructured{ + *fakeBackup("b1", "Completed", older.UTC().Format(time.RFC3339)), + *fakeBackup("b2", "Failed", newer.UTC().Format(time.RFC3339)), + *fakeBackup("b3", "Completed", newer.UTC().Format(time.RFC3339)), + } + + result := l.findLatestSuccessfulBackup(items) + if result.IsZero() { + t.Fatal("expected a valid timestamp") + } + if result.Before(older) || result.Before(newer.Add(-time.Second)) { + t.Errorf("expected result close to newer time, got %v", result) + } +} + +func TestBackupHealthLoop_NoBackups_NoRPOSignal(t *testing.T) { + bsl := fakeBSL("default", "Available") + client := newBackupFakeClient(bsl) + + l := NewBackupHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + spec := 
a.(interface{ GetObject() runtime.Object }).GetObject() + t.Errorf("unexpected DriftSignal create: %v", spec) + } + } +} diff --git a/internal/agent/capability_publisher.go b/internal/agent/capability_publisher.go index 5e5a6fa..af9d236 100644 --- a/internal/agent/capability_publisher.go +++ b/internal/agent/capability_publisher.go @@ -35,11 +35,11 @@ const capabilityWatchInterval = 15 * time.Second const runnerConfigMissingDriftThreshold = 5 // runnerConfigGVR is the GroupVersionResource for RunnerConfig CRs. -// API group infrastructure.ontai.dev, schema version v1alpha1. conductor-schema.md §5. +// API group seam.ontai.dev, schema version v1alpha1. conductor-schema.md §5. var runnerConfigGVR = schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Resource: "infrastructurerunnerconfigs", + Resource: "runnerconfigs", } // CapabilityPublisher writes the Conductor capability manifest to the RunnerConfig @@ -69,7 +69,7 @@ func (p *CapabilityPublisher) emitRunnerConfigMissingSignal(ctx context.Context, now := time.Now().UTC().Format(time.RFC3339) obj := map[string]interface{}{ - "apiVersion": "infrastructure.ontai.dev/v1alpha1", + "apiVersion": "seam.ontai.dev/v1alpha1", "kind": "DriftSignal", "metadata": map[string]interface{}{ "name": signalName, @@ -81,7 +81,7 @@ func (p *CapabilityPublisher) emitRunnerConfigMissingSignal(ctx context.Context, "observedAt": now, "driftReason": "RunnerConfig not found in ont-system -- cluster-state drift", "affectedCRRef": map[string]interface{}{ - "group": "infrastructure.ontai.dev", + "group": "seam.ontai.dev", "kind": "RunnerConfig", "name": clusterRef, }, diff --git a/internal/agent/cluster_disk_pressure.go b/internal/agent/cluster_disk_pressure.go index e69870a..5dbe48b 100644 --- a/internal/agent/cluster_disk_pressure.go +++ b/internal/agent/cluster_disk_pressure.go @@ -8,6 +8,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" 
+ + "github.com/ontai-dev/seam/pkg/namespaces" ) // diskPressureWarnThreshold is the number of consecutive check cycles with DiskPressure=True @@ -114,7 +116,7 @@ func (l *ClusterNodeHealthLoop) setTalosClusterDiskPressure(ctx context.Context, fmt.Printf("disk pressure: cluster=%q marshal condition patch: %v\n", l.clusterRef, err) return } - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("disk pressure: cluster=%q patch DiskPressure condition: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_endpoint_drift.go b/internal/agent/cluster_endpoint_drift.go index 295fa85..42f9235 100644 --- a/internal/agent/cluster_endpoint_drift.go +++ b/internal/agent/cluster_endpoint_drift.go @@ -12,6 +12,8 @@ import ( k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" "gopkg.in/yaml.v3" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // endpointDriftConsecutiveRequired is the number of consecutive checkOnce cycles with @@ -164,7 +166,7 @@ func (l *ClusterNodeHealthLoop) setHumanInterventionRequired(ctx context.Context fmt.Printf("cluster endpoint drift: cluster=%q marshal HumanInterventionRequired patch: %v\n", l.clusterRef, err) return } - _, err = l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + _, err = l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, patchBytes, metav1.PatchOptions{}, "status", ) if err != nil { @@ -197,7 +199,7 @@ func (l *ClusterNodeHealthLoop) emitEndpointDriftSignal(ctx context.Context, old "affectedCRRef": map[string]interface{}{ "group": "seam.ontai.dev", "kind": "TalosCluster", - "namespace": l.namespace, + "namespace": namespaces.SeamSystem, "name": l.clusterRef, }, 
"driftReason": msg, diff --git a/internal/agent/cluster_etcd_health.go b/internal/agent/cluster_etcd_health.go index e8ad733..cb896ba 100644 --- a/internal/agent/cluster_etcd_health.go +++ b/internal/agent/cluster_etcd_health.go @@ -10,6 +10,8 @@ import ( k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // etcdDegradedThreshold is the number of consecutive cycles a member must be @@ -205,7 +207,7 @@ func (l *ClusterNodeHealthLoop) writeEtcdHealthAnnotation(ctx context.Context, m fmt.Printf("etcd health: cluster=%q marshal annotation: %v\n", l.clusterRef, err) return } - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, ); err != nil { fmt.Printf("etcd health: cluster=%q write etcd annotation: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_node_health_loop.go b/internal/agent/cluster_node_health_loop.go index 21640ec..18901d1 100644 --- a/internal/agent/cluster_node_health_loop.go +++ b/internal/agent/cluster_node_health_loop.go @@ -18,6 +18,8 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" sigsyaml "sigs.k8s.io/yaml" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // NodeHealthState classifies a node's health based on Kubernetes node conditions. @@ -220,6 +222,8 @@ func (l *ClusterNodeHealthLoop) checkOnce(ctx context.Context) { // Detect nodes missing the ONT enrollment label. RECON-C2. l.checkNodeRegistration(ctx, nodes) + // Resolve any NodeRegistrationDrift signals for nodes that now have the label. + l.resolveNodeRegistrationDrift(ctx, nodes) // Check CPU/memory utilisation against the CapacitySaturation threshold. RECON-C6. 
l.checkCapacitySaturation(ctx, nodes) @@ -441,17 +445,18 @@ func (l *ClusterNodeHealthLoop) writeTalosClusterHealthStatus(ctx context.Contex return fmt.Errorf("marshal TalosCluster health patch: %w", err) } // Metadata patch via merge-patch. - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + tcNS := namespaces.SeamSystem + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(tcNS).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, ); err != nil { - return fmt.Errorf("patch TalosCluster %s/%s metadata: %w", l.namespace, l.clusterRef, err) + return fmt.Errorf("patch TalosCluster %s/%s metadata: %w", tcNS, l.clusterRef, err) } // Status subresource patch. statusData, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(tcNS).Patch( ctx, l.clusterRef, types.MergePatchType, statusData, metav1.PatchOptions{}, "status", ); err != nil { - return fmt.Errorf("patch TalosCluster %s/%s status: %w", l.namespace, l.clusterRef, err) + return fmt.Errorf("patch TalosCluster %s/%s status: %w", tcNS, l.clusterRef, err) } return nil } @@ -559,7 +564,7 @@ func (l *ClusterNodeHealthLoop) emitHumanInterventionRequired(ctx context.Contex }, } data, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("cluster node health loop: cluster=%q set HumanInterventionRequired: %v\n", l.clusterRef, err) @@ -610,7 +615,7 @@ func (l *ClusterNodeHealthLoop) emitTier3DriftSignal( "affectedCRRef": map[string]interface{}{ "group": "seam.ontai.dev", "kind": "TalosCluster", - "namespace": l.namespace, + "namespace": namespaces.SeamSystem, "name": l.clusterRef, 
}, "driftReason": msg, @@ -823,6 +828,64 @@ func (l *ClusterNodeHealthLoop) checkNodeRegistration(ctx context.Context, nodes } } +// resolveNodeRegistrationDrift patches NodeRegistrationDrift DriftSignals to +// state=resolved when the affected node now carries ont.platform.dev/controlled=true. +// Called on every checkOnce cycle after checkNodeRegistration. RECON-C2. +func (l *ClusterNodeHealthLoop) resolveNodeRegistrationDrift(ctx context.Context, nodes []map[string]interface{}) { + ns := "seam-tenant-" + l.clusterRef + + // Build a set of node names that now have the controlled label. + controlled := make(map[string]bool, len(nodes)) + for _, n := range nodes { + meta, _ := n["metadata"].(map[string]interface{}) + if meta == nil { + continue + } + name, _ := meta["name"].(string) + if name == "" { + continue + } + labels, _ := meta["labels"].(map[string]interface{}) + if v, ok := labels["ont.platform.dev/controlled"]; ok && v == "true" { + controlled[name] = true + } + } + + // List all DriftSignals of kind NodeRegistrationDrift in the tenant namespace. 
+ list, err := l.localClient.Resource(driftSignalGVR).Namespace(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return + } + + patchBytes := []byte(`{"spec":{"state":"resolved"}}`) + for _, item := range list.Items { + spec, _ := item.Object["spec"].(map[string]interface{}) + if spec == nil { + continue + } + if spec["signalKind"] != "NodeRegistrationDrift" { + continue + } + if spec["state"] == "resolved" { + continue + } + affectedCRRef, _ := spec["affectedCRRef"].(map[string]interface{}) + nodeName, _ := affectedCRRef["name"].(string) + if nodeName == "" || !controlled[nodeName] { + continue + } + if _, err := l.localClient.Resource(driftSignalGVR).Namespace(ns).Patch( + ctx, item.GetName(), types.MergePatchType, patchBytes, metav1.PatchOptions{}, + ); err != nil { + fmt.Printf("cluster node health loop: cluster=%q resolveNodeRegistrationDrift patch %s: %v\n", + l.clusterRef, item.GetName(), err) + continue + } + fmt.Printf("cluster node health loop: cluster=%q node=%q NodeRegistrationDrift DriftSignal %s resolved\n", + l.clusterRef, nodeName, item.GetName()) + } +} + // checkCapacitySaturation queries the metrics-server NodeMetrics API and compares // CPU and memory usage to each node's allocatable capacity. 
Nodes above // capacitySaturationThresholdPct for capacitySaturationConsecutiveRequired @@ -963,7 +1026,7 @@ func (l *ClusterNodeHealthLoop) writeCapacitySaturationCondition(ctx context.Con }, } data, _ := json.Marshal(patch) - if _, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace).Patch( + if _, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem).Patch( ctx, l.clusterRef, types.MergePatchType, data, metav1.PatchOptions{}, "status", ); err != nil { fmt.Printf("cluster node health loop: cluster=%q writeCapacitySaturationCondition: %v\n", l.clusterRef, err) diff --git a/internal/agent/cluster_node_health_loop_maintenance_test.go b/internal/agent/cluster_node_health_loop_maintenance_test.go index 54aa5c9..7fd1e55 100644 --- a/internal/agent/cluster_node_health_loop_maintenance_test.go +++ b/internal/agent/cluster_node_health_loop_maintenance_test.go @@ -24,7 +24,7 @@ func TestTwoPhase_UnreachableNodeWithMaintenancePortOpen_ClassifiedAsMaintenance defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -50,7 +50,7 @@ func TestTwoPhase_UnreachableNodeWithPortClosed_RemainsUnreachable(t *testing.T) defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -77,7 +77,7 @@ func TestTwoPhase_ReadyNodeSkipsProbe(t *testing.T) { defer func() { 
probeMaintenancePortFn = old }() node := makeNode("cp1", "10.20.0.2", "True") // Ready - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -97,7 +97,7 @@ func TestTwoPhase_MaintenanceMode_DoesNotIncrementConsecutiveBad(t *testing.T) { defer func() { probeMaintenancePortFn = old }() node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) ocWatcher := NewOperatorContextWatcher(dynClient, "ont-system") @@ -129,7 +129,7 @@ func TestTriggerReenrollment_WithPerNodeSecret_DelegatedLevel_CreatesNodeOperati const ns = "seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") // Per-node machineconfig secret. @@ -166,7 +166,7 @@ func TestTriggerReenrollment_WithPerNodeSecret_DelegatedLevel_CreatesNodeOperati func TestTriggerReenrollment_NoSecret_SetsHumanInterventionRequired(t *testing.T) { const clusterRef = "ccs-mgmt" - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") // No machineconfig secrets. 
dynClient := buildHealthFakeClient(tc, ocObj) @@ -182,7 +182,7 @@ func TestTriggerReenrollment_NoSecret_SetsHumanInterventionRequired(t *testing.T talosClusterGVRTest := schema.GroupVersionResource{ Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "talosclusters", } - tc2, err := dynClient.Resource(talosClusterGVRTest).Namespace("ont-system").Get( + tc2, err := dynClient.Resource(talosClusterGVRTest).Namespace("seam-system").Get( context.Background(), clusterRef, metav1.GetOptions{}) if err != nil { t.Fatalf("get TalosCluster: %v", err) @@ -211,7 +211,7 @@ func TestTriggerReenrollment_LowAutonomyLevel_BlockedFromCreatingNodeOperation(t const ns = "seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") mcSecret := &unstructured.Unstructured{} @@ -245,7 +245,7 @@ func TestTriggerReenrollment_Idempotent_SecondCheckDoesNotDuplicateCR(t *testing const ns = "seam-tenant-ccs-mgmt" node := makeUnreachableNode("cp2", "10.20.0.3") - tc := makeTalosCluster(clusterRef, "ont-system") + tc := makeTalosCluster(clusterRef, "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") mcSecret := &unstructured.Unstructured{} diff --git a/internal/agent/cluster_node_health_loop_test.go b/internal/agent/cluster_node_health_loop_test.go index dfeffa2..40ee8bd 100644 --- a/internal/agent/cluster_node_health_loop_test.go +++ b/internal/agent/cluster_node_health_loop_test.go @@ -243,7 +243,7 @@ func TestConsecutiveFailureTracking_ResetsOnReady(t *testing.T) { func TestConsecutiveFailureTracking_Increments(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, 
"ccs-mgmt", "ont-system", nil) @@ -270,7 +270,7 @@ func TestConsecutiveFailureTracking_Increments(t *testing.T) { func TestTier1Reboot_BlockedByObserveOnly(t *testing.T) { // Create a node that has been Degraded 3 times consecutively. node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) @@ -301,7 +301,7 @@ func TestTier1Reboot_BlockedByObserveOnly(t *testing.T) { func TestTier1Reboot_AllowedByDelegated(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(node, tc, ocObj) @@ -330,7 +330,7 @@ func TestTier1Reboot_AllowedByDelegated(t *testing.T) { func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { node1 := makeNode("cp1", "10.20.0.2", "False") node2 := makeNode("cp2", "10.20.0.3", "False") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node1, node2, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -340,7 +340,7 @@ func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { {name: "cp2", ip: "10.20.0.3", state: NodeHealthStateDegraded, consecutiveBad: 1}, }, false) - tc2, err := dynClient.Resource(talosClusterGVR).Namespace("ont-system").Get( + tc2, err := dynClient.Resource(talosClusterGVR).Namespace("seam-system").Get( context.Background(), "ccs-mgmt", metav1.GetOptions{}, "status", ) if err != nil { @@ -369,14 +369,14 @@ func TestTier3_MultipleNodesDegraded_SetsHumanIntervention(t *testing.T) { func TestNodeHealthSummaryAnnotation_Content(t *testing.T) { node := 
makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) loop.checkOnce(context.Background()) // Verify the TalosCluster was patched with an annotation. - updated, err := dynClient.Resource(talosClusterGVR).Namespace("ont-system").Get( + updated, err := dynClient.Resource(talosClusterGVR).Namespace("seam-system").Get( context.Background(), "ccs-mgmt", metav1.GetOptions{}, ) if err != nil { @@ -403,7 +403,7 @@ func TestNodeHealthSummaryAnnotation_Content(t *testing.T) { func TestClusterNodeHealthLoop_RunCancelsCleanly(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -449,7 +449,7 @@ func TestCheckNodeRegistration_LabeledNode_NoDriftSignal(t *testing.T) { node := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ "ont.platform.dev/controlled": "true", }) - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -468,7 +468,7 @@ func TestCheckNodeRegistration_LabeledNode_NoDriftSignal(t *testing.T) { func TestCheckNodeRegistration_UnlabeledNode_CreatesDriftSignal(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") // no labels at all - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -495,7 +495,7 @@ func TestCheckNodeRegistration_UnlabeledNode_CreatesDriftSignal(t 
*testing.T) { func TestCheckNodeRegistration_DuplicateSignal_NotCreated(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -525,7 +525,7 @@ func TestCheckNodeRegistration_DuplicateSignal_NotCreated(t *testing.T) { // metrics-server returns an empty list (no NodeMetrics objects), no saturation is detected. func TestCheckCapacitySaturation_NoMetricsObjects_NoSaturation(t *testing.T) { node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -565,7 +565,7 @@ func deleteNodeMetrics(t *testing.T, dynClient *dynamicfake.FakeDynamicClient, n func TestCheckCapacitySaturation_BelowThreshold_NeverSaturates(t *testing.T) { // 4 CPUs allocatable; 200m usage = 5%. 8Gi allocatable; 400Mi usage = ~5%. node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "200m", "400Mi") @@ -590,7 +590,7 @@ func TestCheckCapacitySaturation_BelowThreshold_NeverSaturates(t *testing.T) { func TestCheckCapacitySaturation_ConsecutiveRequired_SetsCondition(t *testing.T) { // 4 CPUs allocatable; 3800m usage = 95% (above 85% threshold). 
node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "3800m", "400Mi") // CPU: 95%, mem: ~5% @@ -618,7 +618,7 @@ func TestCheckCapacitySaturation_ConsecutiveRequired_SetsCondition(t *testing.T) // the consecutive counter resets to 0 when utilisation drops below the threshold. func TestCheckCapacitySaturation_ConsecutiveResetsOnBelowThreshold(t *testing.T) { node := makeNodeWithAllocatable("cp1", "10.20.0.2", "True", "4", "8Gi") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) injectNodeMetrics(t, dynClient, "cp1", "3800m", "400Mi") // CPU: 95% @@ -723,7 +723,7 @@ func TestFleetHealthDispatcher_ContextCancellation(t *testing.T) { func TestCheckNodeRegistration_DriftLayerIsInfrastructure(t *testing.T) { node := makeNode("cp1", "10.20.0.2", "True") - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -830,7 +830,7 @@ func TestCheckNodeRegistration_LossScopePopulated(t *testing.T) { node := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ "node-role.kubernetes.io/control-plane": "", }) - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(node, tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -863,7 +863,7 @@ func TestCheckNodeRegistration_LossScopePopulated(t *testing.T) { } func TestEmitTier3DriftSignal_CreatesSignalWithLossScope(t *testing.T) { - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := 
buildHealthFakeClient(tc) loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) @@ -902,3 +902,109 @@ func TestEmitTier3DriftSignal_CreatesSignalWithLossScope(t *testing.T) { t.Errorf("lossScope.severity = %v, want quorum-at-risk", ls["severity"]) } } + +// --------------------------------------------------------------------------- +// resolveNodeRegistrationDrift: auto-resolution when controlled label present +// --------------------------------------------------------------------------- + +func TestResolveNodeRegistrationDrift_ResolvesWhenLabelPresent(t *testing.T) { + // Create a DriftSignal in "pending" state for a node that now has the label. + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "pending", + "driftLayer": "infrastructure", + "correlationID": "node-reg-ccs-mgmt-cp1", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "seam-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + // Node now has the controlled label. 
+ nodeControlled := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ + "node-role.kubernetes.io/control-plane": "", + "ont.platform.dev/controlled": "true", + }) + + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeControlled.Object}) + + list, err := dynClient.Resource(driftSignalGVR).Namespace("seam-tenant-ccs-mgmt").List( + context.Background(), metav1.ListOptions{}, + ) + if err != nil { + t.Fatalf("list DriftSignals: %v", err) + } + if len(list.Items) == 0 { + t.Fatal("DriftSignal was deleted; expected it to be patched to resolved") + } + specBytes, _ := json.Marshal(list.Items[0].Object["spec"]) + var spec map[string]interface{} + _ = json.Unmarshal(specBytes, &spec) + if spec["state"] != "resolved" { + t.Errorf("state = %q, want resolved", spec["state"]) + } +} + +func TestResolveNodeRegistrationDrift_SkipsWhenLabelAbsent(t *testing.T) { + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "pending", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "seam-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + // Node still lacks the controlled label. 
+ nodeUncontrolled := makeNode("cp1", "10.20.0.2", "True") + + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeUncontrolled.Object}) + + list, _ := dynClient.Resource(driftSignalGVR).Namespace("seam-tenant-ccs-mgmt").List( + context.Background(), metav1.ListOptions{}, + ) + specBytes, _ := json.Marshal(list.Items[0].Object["spec"]) + var spec map[string]interface{} + _ = json.Unmarshal(specBytes, &spec) + if spec["state"] == "resolved" { + t.Error("state was resolved but label was absent; should not have been patched") + } +} + +func TestResolveNodeRegistrationDrift_SkipsAlreadyResolved(t *testing.T) { + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": "node-reg-drift-cp1", "namespace": "seam-tenant-ccs-mgmt"}, + "spec": map[string]interface{}{ + "signalKind": "NodeRegistrationDrift", + "state": "resolved", + "affectedCRRef": map[string]interface{}{"group": "", "kind": "Node", "name": "cp1"}, + }, + }, + } + tc := makeTalosCluster("ccs-mgmt", "seam-system") + dynClient := buildHealthFakeClient(tc, existingSignal) + + nodeControlled := makeNodeWithLabels("cp1", "10.20.0.2", "True", map[string]interface{}{ + "ont.platform.dev/controlled": "true", + }) + loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", nil) + // Should be a no-op -- already resolved. 
+ loop.resolveNodeRegistrationDrift(context.Background(), []map[string]interface{}{nodeControlled.Object}) +} diff --git a/internal/agent/cluster_pki_expiry.go b/internal/agent/cluster_pki_expiry.go index 6d787a5..034ac92 100644 --- a/internal/agent/cluster_pki_expiry.go +++ b/internal/agent/cluster_pki_expiry.go @@ -13,6 +13,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/ontai-dev/seam/pkg/namespaces" ) // pkiRotationGVR is the GroupVersionResource for PKIRotation CRs. @@ -35,7 +37,7 @@ const pkiExpiryActionThreshold = 7 // approaching, either auto-creates a PKIRotation CR (if AutonomyLevel >= delegated) // or sets HumanInterventionRequired=True on the TalosCluster. RECON-C1. func (l *ClusterNodeHealthLoop) checkPKIExpiry(ctx context.Context) { - tc, err := l.localClient.Resource(talosClusterGVR).Namespace(l.namespace). + tc, err := l.localClient.Resource(talosClusterGVR).Namespace(namespaces.SeamSystem). Get(ctx, l.clusterRef, metav1.GetOptions{}) if err != nil { return diff --git a/internal/agent/cluster_pki_expiry_test.go b/internal/agent/cluster_pki_expiry_test.go index e250868..50ee56c 100644 --- a/internal/agent/cluster_pki_expiry_test.go +++ b/internal/agent/cluster_pki_expiry_test.go @@ -22,7 +22,7 @@ func makeTalosClusterWithPKIExpiry(name, namespace, expiryRFC3339 string) *unstr // pkiExpiryDate field, checkPKIExpiry exits early without creating any PKIRotation CR. // RECON-C1. 
func TestCheckPKIExpiry_NoActionWhenNoExpiryDate(t *testing.T) { - tc := makeTalosCluster("ccs-mgmt", "ont-system") + tc := makeTalosCluster("ccs-mgmt", "seam-system") dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -44,7 +44,7 @@ func TestCheckPKIExpiry_NoActionWhenNoExpiryDate(t *testing.T) { // expiry 60 days in the future does not trigger any action. RECON-C1. func TestCheckPKIExpiry_NoActionWhenFarFromExpiry(t *testing.T) { expiry := time.Now().UTC().Add(60 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -67,7 +67,7 @@ func TestCheckPKIExpiry_NoActionWhenFarFromExpiry(t *testing.T) { // RECON-C1. func TestCheckPKIExpiry_LogsOnlyWhenWithin30Days(t *testing.T) { expiry := time.Now().UTC().Add(15 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) dynClient := buildHealthFakeClient(tc) w := NewOperatorContextWatcher(dynClient, "ont-system") loop := NewClusterNodeHealthLoop(dynClient, "ccs-mgmt", "ont-system", w) @@ -89,7 +89,7 @@ func TestCheckPKIExpiry_LogsOnlyWhenWithin30Days(t *testing.T) { // expires in 5 days and AutonomyLevel=delegated, a PKIRotation CR is created. RECON-C1. 
func TestCheckPKIExpiry_CreatesRotationWhenWithin7DaysAndDelegated(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -118,7 +118,7 @@ func TestCheckPKIExpiry_CreatesRotationWhenWithin7DaysAndDelegated(t *testing.T) // autonomy level also triggers PKIRotation creation within the 7-day window. RECON-C1. func TestCheckPKIExpiry_CreatesRotationWhenFullDelegation(t *testing.T) { expiry := time.Now().UTC().Add(3 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelFullDelegation, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -144,7 +144,7 @@ func TestCheckPKIExpiry_CreatesRotationWhenFullDelegation(t *testing.T) { // RECON-C1. func TestCheckPKIExpiry_NoRotationWhenLowAutonomy(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelObserveOnly, "normal") dynClient := buildHealthFakeClient(tc, ocObj) @@ -170,7 +170,7 @@ func TestCheckPKIExpiry_NoRotationWhenLowAutonomy(t *testing.T) { // RECON-C1. 
func TestCheckPKIExpiry_IdempotentWhenCRAlreadyPending(t *testing.T) { expiry := time.Now().UTC().Add(5 * 24 * time.Hour).Format(time.RFC3339) - tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "ont-system", expiry) + tc := makeTalosClusterWithPKIExpiry("ccs-mgmt", "seam-system", expiry) ocObj := makeOperatorContext("ont-system", AutonomyLevelDelegated, "normal") dynClient := buildHealthFakeClient(tc, ocObj) diff --git a/internal/agent/eso_health_loop.go b/internal/agent/eso_health_loop.go new file mode 100644 index 0000000..24cc407 --- /dev/null +++ b/internal/agent/eso_health_loop.go @@ -0,0 +1,257 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// externalSecretGVR is the GroupVersionResource for ExternalSecret CRs (ESO v1beta1). +var externalSecretGVR = schema.GroupVersionResource{ + Group: "external-secrets.io", + Version: "v1beta1", + Resource: "externalsecrets", +} + +// esoSignalPrefix is the DriftSignal name prefix for ESO sync failure signals. +const esoSignalPrefix = "drift-eso-sync-" + +// ESOHealthLoop runs on conductor role=management. On each cycle it: +// 1. Lists ExternalSecret CRs across the management namespace (seam-system). +// 2. For each, inspects status.conditions for a Ready=False or Synced=False condition. +// 3. Emits an ExternalSecretSyncFailed DriftSignal when a sync error is detected. +// 4. Confirms any existing signal when the ExternalSecret reaches Ready=True. +// +// Skips cleanly when the external-secrets CRDs are not installed on the cluster. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-K3. 
+type ESOHealthLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewESOHealthLoop constructs an ESOHealthLoop for the given namespace. +func NewESOHealthLoop(client dynamic.Interface, namespace, clusterRef string) *ESOHealthLoop { + return &ESOHealthLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *ESOHealthLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. +func (l *ESOHealthLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *ESOHealthLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(externalSecretGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("eso health loop: list ExternalSecrets in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkESO(ctx, &list.Items[i]) + } +} + +func (l *ESOHealthLoop) checkESO(ctx context.Context, es *k8sunstructured.Unstructured) { + name := es.GetName() + signalName := esoSignalPrefix + name + + conditions, _, _ := unstructuredNestedSlice(es.Object, "status", "conditions") + syncFailed, reason := esoSyncFailed(conditions) + + if !syncFailed { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("eso health loop: eso=%q sync failed (%s) -- observe-only mode, no DriftSignal written\n", name, reason) + return + } + + driftReason := fmt.Sprintf("ExternalSecret sync failed: name=%s 
reason=%s", name, reason) + l.emitSignal(ctx, signalName, name, driftReason) +} + +// esoSyncFailed returns true when any condition indicates sync failure. +func esoSyncFailed(conditions []interface{}) (bool, string) { + for _, raw := range conditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + condType, _ := cond["type"].(string) + condStatus, _ := cond["status"].(string) + reason, _ := cond["reason"].(string) + if (condType == "Ready" || condType == "Synced") && condStatus == "False" { + if reason == "" { + reason = "unknown" + } + return true, reason + } + } + return false, "" +} + +func (l *ESOHealthLoop) emitSignal(ctx context.Context, signalName, esName, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("eso health loop: eso=%q get DriftSignal: %v\n", esName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "ExternalSecretSyncFailed", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("eso-%s-%d", esName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "external-secrets.io", + "kind": "ExternalSecret", + "namespace": l.namespace, + "name": esName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("eso health loop: eso=%q create DriftSignal: %v\n", esName, cErr) + } + fmt.Printf("eso health loop: eso=%q sync 
failed -- DriftSignal written\n", esName) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("eso-%s-%d", esName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("eso health loop: eso=%q reset DriftSignal: %v\n", esName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("eso health loop: eso=%q increment escalation counter: %v\n", esName, pErr) + } + } +} + +func (l *ESOHealthLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, 
// unstructuredNestedSlice walks obj along fields and returns the []interface{}
// stored at the end of that path.
//
// Every field except the last must hold a map[string]interface{}; the last
// must hold a []interface{}. The boolean result reports whether such a slice
// was found. When the path is empty, an intermediate field is missing or has
// another type, or the final value is not a slice, the result is
// (nil, false, nil). The error result is always nil.
func unstructuredNestedSlice(obj map[string]interface{}, fields ...string) ([]interface{}, bool, error) {
	// An empty path addresses nothing; without this guard the slicing below
	// would panic on fields[:len(fields)-1].
	if len(fields) == 0 {
		return nil, false, nil
	}

	cur := obj
	for _, f := range fields[:len(fields)-1] {
		next, ok := cur[f].(map[string]interface{})
		if !ok {
			return nil, false, nil
		}
		cur = next
	}

	val, ok := cur[fields[len(fields)-1]].([]interface{})
	return val, ok, nil
}
+func newESOFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), esoTestGVRs, objs...) +} + +func TestESOHealthLoop_HealthyESO_NoSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "my-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "True"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Errorf("expected no DriftSignal created for healthy ESO, got create action") + } + } +} + +func TestESOHealthLoop_SyncFailed_EmitsSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "bad-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "False", "reason": "SecretSyncError"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal to be created for failed ESO sync") + } +} + +func TestESOHealthLoop_SyncFailed_ObserveOnly_NoSignal(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": 
map[string]interface{}{"name": "bad-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Synced", "status": "False", "reason": "VaultError"}, + }, + }, + }} + + client := newESOFakeClient(es) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal created under observe-only mode") + } + } +} + +func TestESOHealthLoop_ConfirmsSignalWhenHealthy(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "my-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "True"}, + }, + }, + }} + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": esoSignalPrefix + "my-secret", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newESOFakeClient(es, existingSignal) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed (patched) when ESO is healthy") + } +} + +func 
TestESOHealthLoop_SignalCreated_VerifyFields(t *testing.T) { + es := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "external-secrets.io/v1beta1", + "kind": "ExternalSecret", + "metadata": map[string]interface{}{"name": "vault-secret", "namespace": "seam-system"}, + "status": map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"type": "Ready", "status": "False", "reason": "VaultUnreachable"}, + }, + }, + }} + + client := newESOFakeClient(es) + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), esoSignalPrefix+"vault-secret", metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "ExternalSecretSyncFailed" { + t.Errorf("signalKind = %q, want ExternalSecretSyncFailed", kind) + } + if state, _ := spec["state"].(string); state != "pending" { + t.Errorf("state = %q, want pending", state) + } +} + +func TestESOSyncFailed_BothConditionTypes(t *testing.T) { + tests := []struct { + name string + condType string + status string + wantFail bool + }{ + {"ready false", "Ready", "False", true}, + {"synced false", "Synced", "False", true}, + {"ready true", "Ready", "True", false}, + {"synced true", "Synced", "True", false}, + {"other type false", "Connected", "False", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + conditions := []interface{}{ + map[string]interface{}{"type": tt.condType, "status": tt.status}, + } + failed, _ := esoSyncFailed(conditions) + if failed != tt.wantFail { + t.Errorf("esoSyncFailed: got %v want %v", failed, tt.wantFail) + } + }) + } +} + +func TestIsNoCRDError_DetectsKnownPatterns(t *testing.T) { + tests := []struct { + err error + want bool + }{ + {nil, false}, + {&esoTestError{"no 
matches for kind ExternalSecret"}, true}, + {&esoTestError{"the server could not find the requested resource"}, true}, + {fmt.Errorf("connection refused"), false}, + } + for _, tt := range tests { + got := isNoCRDError(tt.err) + if got != tt.want { + t.Errorf("isNoCRDError(%v) = %v, want %v", tt.err, got, tt.want) + } + } +} + +func TestESOHealthLoop_Run_StopsOnContextCancel(t *testing.T) { + client := newESOFakeClient() + l := NewESOHealthLoop(client, "seam-system", "ccs-mgmt") + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately so Run exits after first checkOnce returns + done := make(chan struct{}) + go func() { + l.Run(ctx, 100*time.Millisecond) + close(done) + }() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("ESOHealthLoop.Run did not stop after context cancel") + } +} + +// esoTestError is a minimal error type for CRD-not-installed test cases. +type esoTestError struct{ msg string } + +func (e *esoTestError) Error() string { return e.msg } diff --git a/internal/agent/pack_source_version_loop.go b/internal/agent/pack_source_version_loop.go new file mode 100644 index 0000000..1c5cc3f --- /dev/null +++ b/internal/agent/pack_source_version_loop.go @@ -0,0 +1,291 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" + "sigs.k8s.io/yaml" +) + +// packSourceSignalPrefix is the DriftSignal name prefix for upstream version signals. +// One signal per PackDelivery, written in the same namespace as the PackDelivery. +const packSourceSignalPrefix = "drift-pack-source-" + +// helmIndexMaxBytes is the upper bound for index.yaml fetches. 
+const helmIndexMaxBytes = 8 * 1024 * 1024 // 8 MiB + +// helmIndex represents the relevant fields of a Helm chart repository index.yaml. +type helmIndex struct { + Entries map[string][]helmIndexEntry `json:"entries" yaml:"entries"` +} + +type helmIndexEntry struct { + Version string `json:"version" yaml:"version"` +} + +// PackSourceVersionLoop runs on conductor role=management. On each cycle it: +// 1. Lists all PackDeliveries in the management namespace (seam-system). +// 2. Filters those with a non-empty spec.chartURL (Helm-backed packs). +// 3. For each, derives the Helm repository base URL from spec.chartURL, fetches +// the repository index.yaml, and finds the latest version for spec.chartName. +// 4. If the latest version is newer than spec.chartVersion, emits an +// UpstreamVersionAvailable DriftSignal in the same namespace as the PackDelivery. +// 5. If no newer version is found, confirms any existing signal. +// +// RECON-CMN1. conductor-schema.md §7. +type PackSourceVersionLoop struct { + client dynamic.Interface + namespace string + httpClient *http.Client +} + +// NewPackSourceVersionLoop constructs a PackSourceVersionLoop for the given namespace. +func NewPackSourceVersionLoop(client dynamic.Interface, namespace string) *PackSourceVersionLoop { + return &PackSourceVersionLoop{ + client: client, + namespace: namespace, + httpClient: &http.Client{ + Timeout: 15 * time.Second, + }, + } +} + +// Run runs the loop until ctx is cancelled. Fires once immediately then repeats. +func (l *PackSourceVersionLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +// checkOnce performs one version check cycle across all Helm-backed PackDeliveries. 
+func (l *PackSourceVersionLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(clusterPackMgmtGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + fmt.Printf("pack source version loop: list PackDeliveries in %s: %v\n", l.namespace, err) + return + } + + for i := range list.Items { + l.checkPack(ctx, &list.Items[i]) + } +} + +// checkPack checks one PackDelivery for upstream version availability. +func (l *PackSourceVersionLoop) checkPack(ctx context.Context, pd *k8sunstructured.Unstructured) { + spec, _, _ := unstructuredNestedMap(pd.Object, "spec") + chartURL, _ := spec["chartURL"].(string) + chartName, _ := spec["chartName"].(string) + chartVersion, _ := spec["chartVersion"].(string) + packName := pd.GetName() + + if chartURL == "" || chartName == "" || chartVersion == "" { + return + } + + repoURL, err := helmRepoBaseURL(chartURL) + if err != nil { + fmt.Printf("pack source version loop: pack=%q derive repo URL from %q: %v\n", packName, chartURL, err) + return + } + + latest, err := l.fetchLatestHelmVersion(ctx, repoURL, chartName) + if err != nil { + fmt.Printf("pack source version loop: pack=%q fetch index from %s: %v\n", packName, repoURL, err) + return + } + if latest == "" { + return + } + + signalName := packSourceSignalPrefix + packName + + if latest == chartVersion { + l.confirmSignalIfPresent(ctx, signalName, l.namespace) + return + } + + driftReason := fmt.Sprintf("upstream version available: chart=%s current=%s latest=%s", chartName, chartVersion, latest) + l.emitVersionSignal(ctx, signalName, l.namespace, packName, chartName, chartVersion, latest, driftReason) +} + +// fetchLatestHelmVersion downloads {repoURL}/index.yaml and returns the newest version +// listed for chartName. Returns "" if the chart is not found in the index. 
+func (l *PackSourceVersionLoop) fetchLatestHelmVersion(ctx context.Context, repoURL, chartName string) (string, error) { + indexURL := repoURL + "/index.yaml" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, indexURL, nil) + if err != nil { + return "", fmt.Errorf("build request for %s: %w", indexURL, err) + } + resp, err := l.httpClient.Do(req) + if err != nil { + return "", fmt.Errorf("GET %s: %w", indexURL, err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("GET %s: status %d", indexURL, resp.StatusCode) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, helmIndexMaxBytes)) + if err != nil { + return "", fmt.Errorf("read index body from %s: %w", indexURL, err) + } + + var idx helmIndex + if err := yaml.Unmarshal(body, &idx); err != nil { + return "", fmt.Errorf("parse index.yaml from %s: %w", indexURL, err) + } + + entries := idx.Entries[chartName] + if len(entries) == 0 { + return "", nil + } + // Helm index.yaml entries are sorted newest-first by convention. + return entries[0].Version, nil +} + +// emitVersionSignal writes or updates the UpstreamVersionAvailable DriftSignal. +// Idempotent: creates if absent, increments counter if present. 
+func (l *PackSourceVersionLoop) emitVersionSignal(ctx context.Context, signalName, namespace, packName, chartName, currentVersion, latestVersion, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("pack source version loop: pack=%q get DriftSignal: %v\n", packName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "UpstreamVersionAvailable", + "driftLayer": "governance", + "correlationID": fmt.Sprintf("pack-source-%s-%d", packName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "seam.ontai.dev", + "kind": "PackDelivery", + "namespace": namespace, + "name": packName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("pack source version loop: pack=%q create DriftSignal: %v\n", packName, cErr) + } + fmt.Printf("pack source version loop: pack=%q upstream version available (current=%s latest=%s)\n", + packName, currentVersion, latestVersion) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + + if int32(counter) >= escalationThreshold { + return + } + + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("pack-source-%s-%d", 
packName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: pack=%q reset confirmed DriftSignal: %v\n", packName, pErr) + } + return + } + + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: pack=%q increment escalation counter: %v\n", packName, pErr) + } + } +} + +// confirmSignalIfPresent advances the DriftSignal to confirmed if it exists and is not +// already in a terminal state. +func (l *PackSourceVersionLoop) confirmSignalIfPresent(ctx context.Context, signalName, namespace string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "confirmed", + "correlationID": "", + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("pack source version loop: confirm DriftSignal %s/%s: %v\n", namespace, signalName, pErr) + } +} + +// helmRepoBaseURL extracts the Helm repository base URL (scheme + host) from a chart URL. 
// helmRepoBaseURL reduces a chart URL to "scheme://host", discarding the path,
// query and fragment. The host keeps its port when one is present.
//
// Example: "http://10.20.0.1:5000/charts/mychart-1.0.0.tgz" -> "http://10.20.0.1:5000"
//
// An error is returned when chartURL cannot be parsed or lacks a scheme or a
// host. Because the whole path is dropped, the result is always the root of
// the host.
func helmRepoBaseURL(chartURL string) (string, error) {
	parsed, parseErr := url.Parse(chartURL)
	switch {
	case parseErr != nil:
		return "", fmt.Errorf("parse chart URL %q: %w", chartURL, parseErr)
	case parsed.Scheme == "", parsed.Host == "":
		return "", fmt.Errorf("chart URL %q missing scheme or host", chartURL)
	}
	return parsed.Scheme + "://" + parsed.Host, nil
}
+func TestHelmRepoBaseURL(t *testing.T) { + cases := []struct { + input string + want string + wantErr bool + }{ + {"http://10.20.0.1:5000/charts/mychart-1.0.0.tgz", "http://10.20.0.1:5000", false}, + {"http://10.20.0.1:5000/mychart-1.0.0.tgz", "http://10.20.0.1:5000", false}, + {"https://charts.example.com/charts/app-2.0.0.tgz", "https://charts.example.com", false}, + {"not-a-url", "", true}, + {"", "", true}, + } + for _, tc := range cases { + got, err := helmRepoBaseURL(tc.input) + if tc.wantErr { + if err == nil { + t.Errorf("helmRepoBaseURL(%q): expected error, got %q", tc.input, got) + } + continue + } + if err != nil { + t.Errorf("helmRepoBaseURL(%q): unexpected error: %v", tc.input, err) + continue + } + if got != tc.want { + t.Errorf("helmRepoBaseURL(%q) = %q, want %q", tc.input, got, tc.want) + } + } +} + +// TestFetchLatestHelmVersion verifies the Helm index fetch and parse logic. +func TestFetchLatestHelmVersion(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/index.yaml" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/x-yaml") + _, _ = w.Write([]byte(helmIndexYAML)) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + t.Run("KnownChart", func(t *testing.T) { + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "2.0.0" { + t.Errorf("fetchLatestHelmVersion(mychart) = %q, want %q", got, "2.0.0") + } + }) + + t.Run("UnknownChart", func(t *testing.T) { + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "notexist") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "" { + t.Errorf("fetchLatestHelmVersion(notexist) = %q, want empty", got) + } + }) +} + +// TestFetchLatestHelmVersion_HTTPError verifies error propagation on server 
errors. +func TestFetchLatestHelmVersion_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "internal server error", http.StatusInternalServerError) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + _, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err == nil { + t.Error("expected error on HTTP 500, got nil") + } +} + +// TestFetchLatestHelmVersion_EmptyIndex verifies that an empty index returns "". +func TestFetchLatestHelmVersion_EmptyIndex(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/x-yaml") + _, _ = w.Write([]byte(helmIndexEmpty)) + })) + defer srv.Close() + + loop := NewPackSourceVersionLoop(nil, "seam-system") + loop.httpClient = srv.Client() + + got, err := loop.fetchLatestHelmVersion(context.Background(), srv.URL, "mychart") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "" { + t.Errorf("fetchLatestHelmVersion on empty index = %q, want empty", got) + } +} + +// newFakePackDelivery builds an unstructured PackDelivery for testing. +func newFakePackDelivery(name, namespace, chartURL, chartName, chartVersion string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "PackDelivery", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "chartURL": chartURL, + "chartName": chartName, + "chartVersion": chartVersion, + }, + }, + } +} + +// newFakeDynamicClient builds a fake dynamic client pre-loaded with the given objects. +func newFakeDynamicClient(scheme *runtime.Scheme, objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClient(scheme, objs...) 
+} + +// TestCheckOnce_EmitsDriftSignalOnNewerVersion verifies that checkOnce creates a +// DriftSignal when the index reports a version newer than spec.chartVersion. +func TestCheckOnce_EmitsDriftSignalOnNewerVersion(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/index.yaml") { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + return + } + http.NotFound(w, r) + })) + defer srv.Close() + + scheme := runtime.NewScheme() + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-1.0.0.tgz", "mychart", "1.0.0") + + // Register GroupVersionResource for fake client. + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "mypack" + ds, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("expected DriftSignal to be created, got error: %v", err) + } + + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + state, _ := spec["state"].(string) + if state != "pending" { + t.Errorf("DriftSignal state = %q, want %q", state, "pending") + } + signalKind, _ := spec["signalKind"].(string) + if signalKind != "UpstreamVersionAvailable" { + t.Errorf("DriftSignal signalKind = %q, want UpstreamVersionAvailable", signalKind) + } +} + +// TestCheckOnce_NoSignalWhenVersionCurrent verifies that checkOnce does not create +// a DriftSignal when 
spec.chartVersion matches the latest version in the index. +func TestCheckOnce_NoSignalWhenVersionCurrent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + })) + defer srv.Close() + + scheme := runtime.NewScheme() + // Pack already at latest version. + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-2.0.0.tgz", "mychart", "2.0.0") + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "mypack" + _, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err == nil { + t.Error("expected no DriftSignal when version is current, but one was created") + } +} + +// TestCheckOnce_SkipsNonHelmPacks verifies that packs without chartURL are skipped. 
+func TestCheckOnce_SkipsNonHelmPacks(t *testing.T) { + scheme := runtime.NewScheme() + pd := newFakePackDelivery("rawpack", "seam-system", "", "", "") // no chartURL + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + + // Should complete without panicking or creating any DriftSignal. + loop.checkOnce(context.Background()) + + signalName := packSourceSignalPrefix + "rawpack" + _, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err == nil { + t.Error("expected no DriftSignal for non-Helm pack, but one was created") + } +} + +// TestCheckOnce_ConfirmsExistingSignalWhenVersionCurrent verifies that checkOnce +// advances an existing DriftSignal to confirmed when the pack is at the latest version. 
+func TestCheckOnce_ConfirmsExistingSignalWhenVersionCurrent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(helmIndexYAML)) // latest: 2.0.0 + })) + defer srv.Close() + + scheme := runtime.NewScheme() + pd := newFakePackDelivery("mypack", "seam-system", srv.URL+"/charts/mychart-2.0.0.tgz", "mychart", "2.0.0") + + signalName := packSourceSignalPrefix + "mypack" + existingSignal := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{ + "name": signalName, + "namespace": "seam-system", + }, + "spec": map[string]interface{}{ + "state": "queued", + "signalKind": "UpstreamVersionAvailable", + }, + }, + } + // Serialize and deserialize to ensure the raw JSON format that the fake client returns. + rawBytes, _ := json.Marshal(existingSignal.Object) + _ = json.Unmarshal(rawBytes, &existingSignal.Object) + + pdGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "packdeliveries"} + dsGVR := schema.GroupVersionResource{Group: "seam.ontai.dev", Version: "v1alpha1", Resource: "driftsignals"} + + client := fake.NewSimpleDynamicClientWithCustomListKinds(scheme, + map[schema.GroupVersionResource]string{ + pdGVR: "PackDeliveryList", + dsGVR: "DriftSignalList", + }, + pd, existingSignal, + ) + + loop := NewPackSourceVersionLoop(client, "seam-system") + loop.httpClient = srv.Client() + + loop.checkOnce(context.Background()) + + ds, err := client.Resource(dsGVR).Namespace("seam-system").Get(context.Background(), signalName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("expected DriftSignal to exist: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + state, _ := spec["state"].(string) + if state != "confirmed" { + t.Errorf("DriftSignal state = %q, want confirmed", state) + } +} diff --git 
a/internal/agent/packinstance_pull_loop.go b/internal/agent/packinstance_pull_loop.go index c841c7d..1f62fb2 100644 --- a/internal/agent/packinstance_pull_loop.go +++ b/internal/agent/packinstance_pull_loop.go @@ -356,7 +356,7 @@ func (l *PackInstancePullLoop) upsertPackReceipt( receipt := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "seam.ontai.dev/v1alpha1", - "kind": "InfrastructurePackReceipt", + "kind": "PackReceipt", "metadata": map[string]interface{}{ "name": receiptName, "namespace": l.namespace, diff --git a/internal/agent/policy_report_drift_loop.go b/internal/agent/policy_report_drift_loop.go new file mode 100644 index 0000000..b54d66a --- /dev/null +++ b/internal/agent/policy_report_drift_loop.go @@ -0,0 +1,249 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// clusterPolicyReportGVR is the GroupVersionResource for ClusterPolicyReport CRs (Kyverno). +var clusterPolicyReportGVR = schema.GroupVersionResource{ + Group: "wgpolicyk8s.io", + Version: "v1alpha2", + Resource: "clusterpolicyreports", +} + +// policyReportGVR is the GroupVersionResource for namespaced PolicyReport CRs (Kyverno). +var policyReportGVR = schema.GroupVersionResource{ + Group: "wgpolicyk8s.io", + Version: "v1alpha2", + Resource: "policyreports", +} + +// policyReportSignalPrefix is the DriftSignal name prefix for Kyverno policy violation signals. +const policyReportSignalPrefix = "drift-policy-" + +// PolicyReportDriftLoop runs on conductor role=management. On each cycle it: +// 1. Lists ClusterPolicyReport and PolicyReport CRs across the management namespace. +// 2. 
For each report with at least one fail result, emits a KyvernoPolicyViolation DriftSignal. +// 3. Confirms any existing signal when the report has no fail results. +// +// Skips cleanly when Kyverno CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-L2. +type PolicyReportDriftLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewPolicyReportDriftLoop constructs a PolicyReportDriftLoop for the given namespace. +func NewPolicyReportDriftLoop(client dynamic.Interface, namespace, clusterRef string) *PolicyReportDriftLoop { + return &PolicyReportDriftLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *PolicyReportDriftLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *PolicyReportDriftLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *PolicyReportDriftLoop) checkOnce(ctx context.Context) { + clusterList, err := l.client.Resource(clusterPolicyReportGVR).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("policy report drift loop: list ClusterPolicyReports: %v\n", err) + return + } + for i := range clusterList.Items { + l.checkReport(ctx, &clusterList.Items[i], true) + } + + nsList, err := l.client.Resource(policyReportGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("policy report drift loop: list PolicyReports in %s: %v\n", l.namespace, err) + return + } + for i := range nsList.Items { + l.checkReport(ctx, &nsList.Items[i], false) + } +} + +func (l *PolicyReportDriftLoop) checkReport(ctx context.Context, report *k8sunstructured.Unstructured, cluster bool) { + name := report.GetName() + prefix := "cluster-" + if !cluster { + prefix = "" + } + signalName := policyReportSignalPrefix + prefix + name + + failCount, policies := policyReportFailures(report.Object) + if failCount == 0 { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("policy report drift loop: report=%q has %d fail(s) -- observe-only mode, no DriftSignal written\n", name, failCount) + return + } + + driftReason := fmt.Sprintf("Kyverno policy violations: report=%s failCount=%d policies=%v", name, failCount, policies) + l.emitSignal(ctx, signalName, name, driftReason) +} + +// policyReportFailures counts fail results in a PolicyReport object and returns policy names. 
+func policyReportFailures(obj map[string]interface{}) (int, []string) { + results, _, _ := unstructuredNestedSlice(obj, "results") + var count int + var names []string + for _, raw := range results { + entry, ok := raw.(map[string]interface{}) + if !ok { + continue + } + result, _ := entry["result"].(string) + if result == "fail" { + count++ + if policy, ok := entry["policy"].(string); ok { + names = append(names, policy) + } + } + } + return count, names +} + +func (l *PolicyReportDriftLoop) emitSignal(ctx context.Context, signalName, reportName, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("policy report drift loop: report=%q get DriftSignal: %v\n", reportName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "KyvernoPolicyViolation", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("policy-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "wgpolicyk8s.io", + "kind": "PolicyReport", + "name": reportName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("policy report drift loop: report=%q create DriftSignal: %v\n", reportName, cErr) + } + fmt.Printf("policy report drift loop: report=%q Kyverno policy violations -- DriftSignal written\n", reportName) + return + } + + spec, _, _ := 
unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("policy-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: report=%q reset DriftSignal: %v\n", reportName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: report=%q increment escalation counter: %v\n", reportName, pErr) + } + } +} + +func (l *PolicyReportDriftLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, 
metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("policy report drift loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/policy_report_drift_loop_test.go b/internal/agent/policy_report_drift_loop_test.go new file mode 100644 index 0000000..4ec4a87 --- /dev/null +++ b/internal/agent/policy_report_drift_loop_test.go @@ -0,0 +1,169 @@ +package agent + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var policyTestGVRs = map[schema.GroupVersionResource]string{ + clusterPolicyReportGVR: "ClusterPolicyReportList", + policyReportGVR: "PolicyReportList", + driftSignalGVR: "DriftSignalList", +} + +func newPolicyFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), policyTestGVRs, objs...) 
+} + +func TestPolicyReportDriftLoop_NoViolations_NoSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "pass", "policy": "require-labels"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal for passing policy report") + } + } +} + +func TestPolicyReportDriftLoop_ClusterReportFail_EmitsSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "require-psa"}, + map[string]interface{}{"result": "pass", "policy": "require-labels"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal created for ClusterPolicyReport with fail results") + } +} + +func TestPolicyReportDriftLoop_SignalFields(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "my-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "no-privileged"}, + }, + }} + + client := newPolicyFakeClient(cr) + l := 
NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := policyReportSignalPrefix + "cluster-" + "my-report" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, _, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "KyvernoPolicyViolation" { + t.Errorf("signalKind = %q, want KyvernoPolicyViolation", kind) + } +} + +func TestPolicyReportDriftLoop_ObserveOnly_NoSignal(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "cluster-report"}, + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "require-psa"}, + }, + }} + + client := newPolicyFakeClient(cr) + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestPolicyReportDriftLoop_ConfirmsSignalWhenClean(t *testing.T) { + cr := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "wgpolicyk8s.io/v1alpha2", + "kind": "ClusterPolicyReport", + "metadata": map[string]interface{}{"name": "clean-report"}, + "results": []interface{}{}, + }} + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": 
map[string]interface{}{"name": policyReportSignalPrefix + "cluster-" + "clean-report", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newPolicyFakeClient(cr, existingSignal) + l := NewPolicyReportDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when report has no violations") + } +} + +func TestPolicyReportFailures_CountsCorrectly(t *testing.T) { + obj := map[string]interface{}{ + "results": []interface{}{ + map[string]interface{}{"result": "fail", "policy": "pol-a"}, + map[string]interface{}{"result": "pass", "policy": "pol-b"}, + map[string]interface{}{"result": "fail", "policy": "pol-c"}, + }, + } + count, policies := policyReportFailures(obj) + if count != 2 { + t.Errorf("failCount = %d, want 2", count) + } + if len(policies) != 2 { + t.Errorf("policies len = %d, want 2", len(policies)) + } +} diff --git a/internal/agent/runtime_drift_handler.go b/internal/agent/runtime_drift_handler.go index 260fe28..e159108 100644 --- a/internal/agent/runtime_drift_handler.go +++ b/internal/agent/runtime_drift_handler.go @@ -13,6 +13,8 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" + + "github.com/ontai-dev/conductor-sdk/runnerlib" ) // packLogGVR is the GroupVersionResource for PackLog CRs (dispatcher). @@ -212,14 +214,21 @@ func (h *RuntimeDriftHandler) reconcileRuntimeDrift( h.markApprovalActed(ctx, approval.GetName(), packInstalledNS) } - // 4. Submit remediation Job (Job scheduling via Kueue placeholder). - // The actual Kueue Job submission is handled by the remediation capability - // executor. Here we increment the attempt count in PackLog and advance the - // signal to state=queued. + // 4. 
Submit a Kueue watchdog Job that runs the appropriate remediation + // capability against the target cluster. The capability is chosen from + // the failureReason via watchdogCapabilityForFailureReason. + capability := watchdogCapabilityForFailureReason(failureReason) + executeImage := h.resolveExecuteImage(ctx, clusterName) + jobErr := h.submitWatchdogJob(ctx, clusterName, packInstalledName, capability, failureReason, executeImage) + if jobErr != nil { + fmt.Printf("runtime drift handler: cluster=%q signal=%q Job submit failed: %v\n", + clusterName, signalName, jobErr) + return + } h.incrementPackLogAttempts(ctx, packLogName, packInstalledNS, failureReason, currentAttempts+1) h.advanceSignalState(ctx, tenantNS, signalName, "queued") - fmt.Printf("runtime drift handler: cluster=%q signal=%q remediation attempt %d submitted\n", - clusterName, signalName, currentAttempts+1) + fmt.Printf("runtime drift handler: cluster=%q signal=%q capability=%q attempt %d queued\n", + clusterName, signalName, capability, currentAttempts+1) return } @@ -535,3 +544,147 @@ func (h *RuntimeDriftHandler) markApprovalActed(ctx context.Context, approvalNam namespace, approvalName, pErr) } } + +// watchdogCapabilityForFailureReason maps a DriftSignal failureReason string to the +// appropriate watchdog capability name. Defaults to pod-restart for unknown reasons. +func watchdogCapabilityForFailureReason(failureReason string) string { + switch failureReason { + case "OOMKilled": + return runnerlib.CapabilityResourcePatch + case "ImagePullBackOff", "ErrImagePull": + return runnerlib.CapabilityCredentialRefresh + case "FailedMount", "MultiAttachError": + return runnerlib.CapabilityForceVolumeDetach + default: + // CrashLoopBackOff and all other reasons. + return runnerlib.CapabilityPodRestart + } +} + +// watchdogExecuteImageFallback is used when the RunnerConfig cannot be read. 
+const watchdogExecuteImageFallback = "10.20.0.1:5000/ontai-dev/conductor-execute:dev" + +// resolveExecuteImage reads spec.runnerImage from the RunnerConfig for clusterRef. +// Returns the fallback image when the RunnerConfig is absent or the field is empty. +func (h *RuntimeDriftHandler) resolveExecuteImage(ctx context.Context, clusterRef string) string { + rc, err := h.client.Resource(runnerConfigGVR).Namespace(h.namespace).Get(ctx, clusterRef, metav1.GetOptions{}) + if err != nil { + return watchdogExecuteImageFallback + } + img, _, _ := unstructured.NestedString(rc.Object, "spec", "runnerImage") + if img == "" { + return watchdogExecuteImageFallback + } + return img +} + +// watchdogJobGVR is the GroupVersionResource for batch/v1 Jobs. +var watchdogJobGVR = schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"} + +// submitWatchdogJob creates a Kueue-admitted batch/v1 Job in h.namespace that runs +// the given watchdog capability against the target cluster. The Job mounts the +// tenant kubeconfig Secret so the capability executor can reach the tenant cluster. +// conductor-schema.md §6, wrapper-schema.md §9. 
+func (h *RuntimeDriftHandler) submitWatchdogJob( + ctx context.Context, + clusterRef, packInstalledName, capability, failureReason, executeImage string, +) error { + jobName := fmt.Sprintf("watchdog-%s-%s-%d", sanitizeLabel(capability), sanitizeLabel(clusterRef), time.Now().Unix()) + ttl := int64(600) + completions := int64(1) + backoffLimit := int64(0) + falseVal := false + trueVal := true + + job := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": map[string]interface{}{ + "name": jobName, + "namespace": h.namespace, + "labels": map[string]interface{}{ + "kueue.x-k8s.io/queue-name": "watchdog-queue", + "ontai.dev/watchdog-capability": capability, + "ontai.dev/cluster-ref": clusterRef, + }, + }, + "spec": map[string]interface{}{ + "ttlSecondsAfterFinished": ttl, + "completions": completions, + "backoffLimit": backoffLimit, + "template": map[string]interface{}{ + "spec": map[string]interface{}{ + "serviceAccountName": "conductor", + "restartPolicy": "Never", + "securityContext": map[string]interface{}{ + "runAsNonRoot": trueVal, + "seccompProfile": map[string]interface{}{ + "type": "RuntimeDefault", + }, + }, + "volumes": []interface{}{ + map[string]interface{}{ + "name": "kubeconfig", + "secret": map[string]interface{}{ + "secretName": "seam-mc-" + clusterRef + "-kubeconfig", + }, + }, + }, + "containers": []interface{}{ + map[string]interface{}{ + "name": "conductor", + "image": executeImage, + "imagePullPolicy": "Always", + "env": []interface{}{ + map[string]interface{}{"name": "CAPABILITY", "value": capability}, + map[string]interface{}{"name": "CLUSTER_REF", "value": clusterRef}, + map[string]interface{}{"name": "POD_NAMESPACE", "value": h.namespace}, + map[string]interface{}{"name": "PACK_INSTALLED_NAME", "value": packInstalledName}, + map[string]interface{}{"name": "FAILURE_REASON", "value": failureReason}, + }, + "volumeMounts": []interface{}{ + map[string]interface{}{ + "name": 
"kubeconfig", + "mountPath": "/var/run/secrets/kubeconfig", + "subPath": "value", + "readOnly": trueVal, + }, + }, + "securityContext": map[string]interface{}{ + "allowPrivilegeEscalation": falseVal, + "capabilities": map[string]interface{}{ + "drop": []interface{}{"ALL"}, + }, + "runAsNonRoot": trueVal, + "seccompProfile": map[string]interface{}{ + "type": "RuntimeDefault", + }, + }, + }, + }, + }, + }, + }, + }, + } + + _, err := h.client.Resource(watchdogJobGVR).Namespace(h.namespace).Create(ctx, job, metav1.CreateOptions{}) + return err +} + +// sanitizeLabel trims characters that are not valid in Kubernetes label values or +// Job name components (alphanumeric plus hyphen and dot, max 63 chars per segment). +// Used to build the Job name from capability and clusterRef strings. +func sanitizeLabel(s string) string { + out := make([]byte, 0, len(s)) + for i := 0; i < len(s) && len(out) < 30; i++ { + c := s[i] + if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' { + out = append(out, c) + } else if c >= 'A' && c <= 'Z' { + out = append(out, c+32) // to lower + } + } + return string(out) +} diff --git a/internal/agent/runtime_drift_handler_test.go b/internal/agent/runtime_drift_handler_test.go index 05dc505..a5440bb 100644 --- a/internal/agent/runtime_drift_handler_test.go +++ b/internal/agent/runtime_drift_handler_test.go @@ -160,3 +160,98 @@ func TestRuntimeDriftHandler_SkipsGovernanceDrift(t *testing.T) { // If this panics, the guard is missing and the test fails. _ = h } + +// TestWatchdogCapabilityForFailureReason verifies that each known failure reason +// maps to the expected watchdog capability and that unknown reasons fall through +// to pod-restart. 
+func TestWatchdogCapabilityForFailureReason(t *testing.T) { + cases := []struct { + reason string + want string + }{ + {"OOMKilled", "resource-patch"}, + {"ImagePullBackOff", "credential-refresh"}, + {"ErrImagePull", "credential-refresh"}, + {"FailedMount", "force-volume-detach"}, + {"MultiAttachError", "force-volume-detach"}, + {"CrashLoopBackOff", "pod-restart"}, + {"Unknown", "pod-restart"}, + {"", "pod-restart"}, + } + for _, tc := range cases { + got := watchdogCapabilityForFailureReason(tc.reason) + if got != tc.want { + t.Errorf("watchdogCapabilityForFailureReason(%q) = %q, want %q", tc.reason, got, tc.want) + } + } +} + +// setupJobScheme builds a fake scheme with types needed to verify Job creation. +func setupJobScheme() *runtime.Scheme { + s := setupApprovalScheme() + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "batch", Version: "v1", Kind: "Job", + }, &unstructured.Unstructured{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "batch", Version: "v1", Kind: "JobList", + }, &unstructured.UnstructuredList{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", + }, &unstructured.Unstructured{}) + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfigList", + }, &unstructured.UnstructuredList{}) + return s +} + +// TestSubmitWatchdogJob_CreatesJobInOntSystem verifies that submitWatchdogJob +// creates a batch/v1 Job in the ont-system namespace with the expected Kueue +// queue label and env vars. 
+func TestSubmitWatchdogJob_CreatesJobInOntSystem(t *testing.T) { + client := fake.NewSimpleDynamicClient(setupJobScheme()) + h := NewRuntimeDriftHandler(client, "ont-system") + + err := h.submitWatchdogJob(context.Background(), + "ccs-dev", "nginx", "pod-restart", "CrashLoopBackOff", "10.20.0.1:5000/ontai-dev/conductor-execute:dev") + if err != nil { + t.Fatalf("submitWatchdogJob returned unexpected error: %v", err) + } + + jobGVR := schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"} + list, listErr := client.Resource(jobGVR).Namespace("ont-system").List(context.Background(), metav1.ListOptions{}) + if listErr != nil { + t.Fatalf("list Jobs: %v", listErr) + } + if len(list.Items) != 1 { + t.Fatalf("expected 1 Job, got %d", len(list.Items)) + } + job := list.Items[0] + + // Verify Kueue queue label. + labels := job.GetLabels() + if queueName := labels["kueue.x-k8s.io/queue-name"]; queueName != "watchdog-queue" { + t.Errorf("expected queue label watchdog-queue, got %q", queueName) + } + // Verify namespace. + if job.GetNamespace() != "ont-system" { + t.Errorf("expected namespace ont-system, got %q", job.GetNamespace()) + } + // Verify CAPABILITY env var. 
+ containers, _, _ := unstructured.NestedSlice(job.Object, "spec", "template", "spec", "containers") + if len(containers) == 0 { + t.Fatal("expected at least 1 container in Job spec") + } + container, _ := containers[0].(map[string]interface{}) + envVars, _, _ := unstructured.NestedSlice(container, "env") + found := false + for _, envRaw := range envVars { + env, _ := envRaw.(map[string]interface{}) + if env["name"] == "CAPABILITY" && env["value"] == "pod-restart" { + found = true + break + } + } + if !found { + t.Errorf("CAPABILITY=pod-restart env var not found in Job container; env: %v", envVars) + } +} diff --git a/internal/agent/vulnerability_drift_loop.go b/internal/agent/vulnerability_drift_loop.go new file mode 100644 index 0000000..4105a6e --- /dev/null +++ b/internal/agent/vulnerability_drift_loop.go @@ -0,0 +1,248 @@ +package agent + +import ( + "context" + "encoding/json" + "fmt" + "time" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sunstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/dynamic" +) + +// vulnerabilityReportGVR is the GroupVersionResource for VulnerabilityReport CRs (Trivy Operator). +var vulnerabilityReportGVR = schema.GroupVersionResource{ + Group: "aquasecurity.github.io", + Version: "v1alpha1", + Resource: "vulnerabilityreports", +} + +// vulnerabilitySignalPrefix is the DriftSignal name prefix for Trivy vulnerability signals. +const vulnerabilitySignalPrefix = "drift-vuln-" + +// criticalSeverity is the default minimum severity threshold for emitting a DriftSignal. +const criticalSeverity = "CRITICAL" + +// VulnerabilityDriftLoop runs on conductor role=management. On each cycle it: +// 1. Lists VulnerabilityReport CRs in the management namespace (seam-system). +// 2. 
For each report containing at least one vulnerability at or above CRITICAL severity, +// emits a VulnerableImageDetected DriftSignal. +// 3. Confirms any existing signal when the report has no CRITICAL vulnerabilities. +// +// Skips cleanly when Trivy Operator CRDs are not installed. +// AutonomyLevel=observe-only: logs only, no DriftSignal written. +// RECON-M2. +type VulnerabilityDriftLoop struct { + client dynamic.Interface + namespace string + ocWatcher *OperatorContextWatcher + clusterRef string +} + +// NewVulnerabilityDriftLoop constructs a VulnerabilityDriftLoop for the given namespace. +func NewVulnerabilityDriftLoop(client dynamic.Interface, namespace, clusterRef string) *VulnerabilityDriftLoop { + return &VulnerabilityDriftLoop{ + client: client, + namespace: namespace, + clusterRef: clusterRef, + } +} + +// WithOperatorContextWatcher sets the OperatorContextWatcher for autonomy-level gating. +func (l *VulnerabilityDriftLoop) WithOperatorContextWatcher(w *OperatorContextWatcher) { + l.ocWatcher = w +} + +// Run runs the loop until ctx is cancelled. 
+func (l *VulnerabilityDriftLoop) Run(ctx context.Context, interval time.Duration) { + l.checkOnce(ctx) + if ctx.Err() != nil { + return + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + l.checkOnce(ctx) + } + } +} + +func (l *VulnerabilityDriftLoop) checkOnce(ctx context.Context) { + list, err := l.client.Resource(vulnerabilityReportGVR).Namespace(l.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if isNoCRDError(err) { + return + } + fmt.Printf("vulnerability drift loop: list VulnerabilityReports in %s: %v\n", l.namespace, err) + return + } + for i := range list.Items { + l.checkReport(ctx, &list.Items[i]) + } +} + +func (l *VulnerabilityDriftLoop) checkReport(ctx context.Context, report *k8sunstructured.Unstructured) { + name := report.GetName() + signalName := vulnerabilitySignalPrefix + name + + critCount, imageRef := vulnerabilityCriticalCount(report.Object) + if critCount == 0 { + l.confirmSignalIfPresent(ctx, signalName) + return + } + + if l.ocWatcher != nil && !l.ocWatcher.IsAutonomousActionsAllowedFor(l.clusterRef, "management") { + fmt.Printf("vulnerability drift loop: report=%q has %d CRITICAL vuln(s) -- observe-only mode, no DriftSignal written\n", name, critCount) + return + } + + driftReason := fmt.Sprintf("vulnerable image detected: report=%s image=%s criticalCount=%d", name, imageRef, critCount) + l.emitSignal(ctx, signalName, name, imageRef, driftReason) +} + +// vulnerabilityCriticalCount returns the count of CRITICAL severity vulnerabilities +// and the image reference from the VulnerabilityReport object. 
+func vulnerabilityCriticalCount(obj map[string]interface{}) (int, string) { + report, _, _ := unstructuredNestedMap(obj, "report") + imageRef := "" + if artifact, _, _ := unstructuredNestedMap(obj, "report", "artifact"); len(artifact) > 0 { + imageRef, _ = artifact["repository"].(string) + if tag, _ := artifact["tag"].(string); tag != "" { + imageRef = imageRef + ":" + tag + } + } + + summary, _, _ := unstructuredNestedMap(report, "summary") + if len(summary) == 0 { + // Fall back to scanning vulnerabilities slice directly. + return vulnerabilityCriticalCountFromSlice(obj, imageRef) + } + + critFloat, _ := summary["criticalCount"].(float64) + return int(critFloat), imageRef +} + +func vulnerabilityCriticalCountFromSlice(obj map[string]interface{}, imageRef string) (int, string) { + vulnerabilities, _, _ := unstructuredNestedSlice(obj, "report", "vulnerabilities") + count := 0 + for _, raw := range vulnerabilities { + v, ok := raw.(map[string]interface{}) + if !ok { + continue + } + severity, _ := v["severity"].(string) + if severity == criticalSeverity { + count++ + } + } + return count, imageRef +} + +func (l *VulnerabilityDriftLoop) emitSignal(ctx context.Context, signalName, reportName, imageRef, driftReason string) { + now := time.Now().UTC().Format(time.RFC3339) + + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil && !k8serrors.IsNotFound(err) { + fmt.Printf("vulnerability drift loop: report=%q get DriftSignal: %v\n", reportName, err) + return + } + + if k8serrors.IsNotFound(err) { + obj := map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": signalName, "namespace": l.namespace}, + "spec": map[string]interface{}{ + "state": "pending", + "signalKind": "VulnerableImageDetected", + "driftLayer": "kubernetes", + "correlationID": fmt.Sprintf("vuln-%s-%d", reportName, time.Now().UnixNano()), + 
"observedAt": now, + "driftReason": driftReason, + "affectedCRRef": map[string]interface{}{ + "group": "aquasecurity.github.io", + "kind": "VulnerabilityReport", + "namespace": l.namespace, + "name": reportName, + }, + "escalationCounter": int64(0), + }, + } + if _, cErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Create( + ctx, &k8sunstructured.Unstructured{Object: obj}, metav1.CreateOptions{}, + ); cErr != nil { + fmt.Printf("vulnerability drift loop: report=%q create DriftSignal: %v\n", reportName, cErr) + } + fmt.Printf("vulnerability drift loop: report=%q CRITICAL vuln(s) in %s -- DriftSignal written\n", reportName, imageRef) + return + } + + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + counter, _ := spec["escalationCounter"].(int64) + if int32(counter) >= escalationThreshold { + return + } + if state == "confirmed" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "correlationID": fmt.Sprintf("vuln-%s-%d", reportName, time.Now().UnixNano()), + "observedAt": now, + "escalationCounter": int64(0), + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: report=%q reset DriftSignal: %v\n", reportName, pErr) + } + return + } + if state == "queued" { + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "state": "pending", + "driftReason": driftReason, + "escalationCounter": counter + 1, + }, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: report=%q increment escalation counter: %v\n", reportName, pErr) + } + } +} + 
+func (l *VulnerabilityDriftLoop) confirmSignalIfPresent(ctx context.Context, signalName string) { + existing, err := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Get(ctx, signalName, metav1.GetOptions{}) + if err != nil { + return + } + spec, _, _ := unstructuredNestedMap(existing.Object, "spec") + state, _ := spec["state"].(string) + if state == "confirmed" || state == "" { + return + } + patch := map[string]interface{}{ + "spec": map[string]interface{}{"state": "confirmed", "correlationID": ""}, + } + data, _ := json.Marshal(patch) + if _, pErr := l.client.Resource(driftSignalGVR).Namespace(l.namespace).Patch( + ctx, signalName, types.MergePatchType, data, metav1.PatchOptions{}, + ); pErr != nil { + fmt.Printf("vulnerability drift loop: confirm DriftSignal %s/%s: %v\n", l.namespace, signalName, pErr) + } +} diff --git a/internal/agent/vulnerability_drift_loop_test.go b/internal/agent/vulnerability_drift_loop_test.go new file mode 100644 index 0000000..c81a6c8 --- /dev/null +++ b/internal/agent/vulnerability_drift_loop_test.go @@ -0,0 +1,162 @@ +package agent + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" +) + +var vulnTestGVRs = map[schema.GroupVersionResource]string{ + vulnerabilityReportGVR: "VulnerabilityReportList", + driftSignalGVR: "DriftSignalList", +} + +func newVulnFakeClient(objs ...runtime.Object) *fake.FakeDynamicClient { + return fake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), vulnTestGVRs, objs...) +} + +// fakeVulnReport builds a VulnerabilityReport with the given criticalCount in the summary. 
+func fakeVulnReport(name string, critCount int) *unstructured.Unstructured { + return &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "aquasecurity.github.io/v1alpha1", + "kind": "VulnerabilityReport", + "metadata": map[string]interface{}{"name": name, "namespace": "seam-system"}, + "report": map[string]interface{}{ + "artifact": map[string]interface{}{"repository": "nginx", "tag": "1.25.0"}, + "summary": map[string]interface{}{"criticalCount": float64(critCount)}, + }, + }} +} + +func TestVulnerabilityDriftLoop_NoCritical_NoSignal(t *testing.T) { + report := fakeVulnReport("safe-report", 0) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal for report with zero critical vulnerabilities") + } + } +} + +func TestVulnerabilityDriftLoop_CriticalFound_EmitsSignal(t *testing.T) { + report := fakeVulnReport("vuln-report", 3) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var created bool + for _, a := range client.Actions() { + if a.GetVerb() == "create" && a.GetResource().Resource == "driftsignals" { + created = true + } + } + if !created { + t.Error("expected DriftSignal created for report with critical vulnerabilities") + } +} + +func TestVulnerabilityDriftLoop_SignalFields(t *testing.T) { + report := fakeVulnReport("my-vuln-report", 2) + client := newVulnFakeClient(report) + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + signalName := vulnerabilitySignalPrefix + "my-vuln-report" + ds, err := client.Resource(driftSignalGVR).Namespace("seam-system").Get( + context.Background(), signalName, metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("expected DriftSignal: %v", err) + } + spec, 
_, _ := unstructuredNestedMap(ds.Object, "spec") + if kind, _ := spec["signalKind"].(string); kind != "VulnerableImageDetected" { + t.Errorf("signalKind = %q, want VulnerableImageDetected", kind) + } + if state, _ := spec["state"].(string); state != "pending" { + t.Errorf("state = %q, want pending", state) + } +} + +func TestVulnerabilityDriftLoop_ObserveOnly_NoSignal(t *testing.T) { + report := fakeVulnReport("vuln-report", 5) + client := newVulnFakeClient(report) + + ocWatcher := NewOperatorContextWatcher(client, "ont-system") + ocWatcher.mu.Lock() + ocWatcher.resolved["ccs-mgmt"] = resolvedContext{autonomyLevel: AutonomyLevelObserveOnly, mode: "normal"} + ocWatcher.mu.Unlock() + + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.WithOperatorContextWatcher(ocWatcher) + l.checkOnce(context.Background()) + + for _, a := range client.Actions() { + if a.GetVerb() == "create" { + t.Error("expected no DriftSignal under observe-only mode") + } + } +} + +func TestVulnerabilityDriftLoop_ConfirmsSignalWhenClean(t *testing.T) { + report := fakeVulnReport("fixed-report", 0) + existingSignal := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "DriftSignal", + "metadata": map[string]interface{}{"name": vulnerabilitySignalPrefix + "fixed-report", "namespace": "seam-system"}, + "spec": map[string]interface{}{"state": "queued"}, + }} + + client := newVulnFakeClient(report, existingSignal) + l := NewVulnerabilityDriftLoop(client, "seam-system", "ccs-mgmt") + l.checkOnce(context.Background()) + + var patched bool + for _, a := range client.Actions() { + if a.GetVerb() == "patch" && a.GetResource().Resource == "driftsignals" { + patched = true + } + } + if !patched { + t.Error("expected DriftSignal to be confirmed when no critical vulns remain") + } +} + +func TestVulnerabilityCriticalCount_FromSummary(t *testing.T) { + obj := map[string]interface{}{ + "report": map[string]interface{}{ + "summary": 
map[string]interface{}{"criticalCount": float64(7)}, + }, + } + count, _ := vulnerabilityCriticalCount(obj) + if count != 7 { + t.Errorf("criticalCount = %d, want 7", count) + } +} + +func TestVulnerabilityCriticalCount_FromSlice(t *testing.T) { + obj := map[string]interface{}{ + "report": map[string]interface{}{ + "vulnerabilities": []interface{}{ + map[string]interface{}{"severity": "CRITICAL"}, + map[string]interface{}{"severity": "HIGH"}, + map[string]interface{}{"severity": "CRITICAL"}, + map[string]interface{}{"severity": "MEDIUM"}, + }, + }, + } + count, _ := vulnerabilityCriticalCount(obj) + if count != 2 { + t.Errorf("criticalCount from slice = %d, want 2", count) + } +} diff --git a/internal/capability/adapters.go b/internal/capability/adapters.go index 9abd046..d327824 100644 --- a/internal/capability/adapters.go +++ b/internal/capability/adapters.go @@ -287,6 +287,12 @@ func (a *TalosClientAdapter) Reboot(ctx context.Context) error { return a.inner.Reboot(ctx) } +// RebootPowercycle reboots the node in hardware powercycle mode (power off then on). +// Required after Talos upgrade staging so that BIOS/UEFI re-initialises cleanly. +func (a *TalosClientAdapter) RebootPowercycle(ctx context.Context) error { + return a.inner.Reboot(ctx, talos_client.WithPowerCycle) +} + // Reset performs a factory reset of the node. reboot is always false; // the caller controls any subsequent reboot via a separate Reboot capability. func (a *TalosClientAdapter) Reset(ctx context.Context, graceful bool) error { diff --git a/internal/capability/clients.go b/internal/capability/clients.go index a993866..d432ae5 100644 --- a/internal/capability/clients.go +++ b/internal/capability/clients.go @@ -30,6 +30,11 @@ type TalosNodeClient interface { // Reboot reboots the node. Reboot(ctx context.Context) error + // RebootPowercycle reboots the node using hardware powercycle mode (power off then + // power on). Required for Talos upgrade to ensure BIOS/UEFI re-initialises cleanly. 
+ // Distinct from Reboot (OS-level restart) to allow test stubs to record the mode. + RebootPowercycle(ctx context.Context) error + // Reset performs a factory reset of the node. graceful=true drains workloads first. Reset(ctx context.Context, graceful bool) error diff --git a/internal/capability/platform_cluster_test.go b/internal/capability/platform_cluster_test.go index 2062d1b..423c872 100644 --- a/internal/capability/platform_cluster_test.go +++ b/internal/capability/platform_cluster_test.go @@ -28,7 +28,8 @@ type stubBootstrapTalosClient struct { func (s *stubBootstrapTalosClient) Bootstrap(_ context.Context) error { return nil } func (s *stubBootstrapTalosClient) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubBootstrapTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubBootstrapTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubBootstrapTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubBootstrapTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubBootstrapTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubBootstrapTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubBootstrapTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_machineconfig_constants.go b/internal/capability/platform_machineconfig_constants.go new file mode 100644 index 0000000..0a3b2a8 --- /dev/null +++ b/internal/capability/platform_machineconfig_constants.go @@ -0,0 +1,9 @@ +package capability + +// platform_machineconfig_constants.go -- shared constants for machineconfig Secret keys. +// Used by reenrollment and scale-up capabilities that still manage the legacy per-node +// Secret model until those capabilities are migrated to MachineConfig CRs. 
+ +// machineConfigSyncDataKey is the primary Secret data key that holds the raw Talos +// machineconfig YAML. Used by node-reenrollment and node-scale-up capabilities. +const machineConfigSyncDataKey = "machineconfig" diff --git a/internal/capability/platform_machineconfig_sync.go b/internal/capability/platform_machineconfig_sync.go index 640d79c..fb54c9c 100644 --- a/internal/capability/platform_machineconfig_sync.go +++ b/internal/capability/platform_machineconfig_sync.go @@ -2,9 +2,10 @@ package capability // platform_machineconfig_sync.go -- machineconfig-sync named capability. // -// Reads the canonical machineconfig from the source-of-truth Secret in -// seam-tenant-{clusterRef}, injects the ONT node label, and applies the config -// to each node in the target cluster via the Talos machine API. +// Reads the canonical machineconfig from the source-of-truth MachineConfig CR in +// seam-tenant-{clusterRef}, reconstructs full Talos YAML from spec.machine and +// spec.cluster, injects the ONT node label, and applies the config to the target +// node via the Talos machine API. // // Named Conductor capability: machineconfig-sync. // conductor-schema.md §6, platform-schema.md §15, RECON-A5. @@ -13,36 +14,44 @@ package capability // in execute mode. Never imported or called from agent mode. import ( - "bytes" - "compress/gzip" "context" "fmt" - "io" "os" - "strings" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" sigsyaml "sigs.k8s.io/yaml" "github.com/ontai-dev/conductor-sdk/runnerlib" ) -// machineConfigCompressionLabel mirrors LabelMachineConfigCompression in platform. -const machineConfigCompressionLabel = "platform.ontai.dev/compression" +// machineConfigGVR is the GroupVersionResource for MachineConfig CRs. +// platform.ontai.dev/v1alpha1/machineconfigs -- platform-schema.md §9. 
+var machineConfigGVR = schema.GroupVersionResource{ + Group: "platform.ontai.dev", + Version: "v1alpha1", + Resource: "machineconfigs", +} // envMCSyncNodeClass is the env var key injected by MachineConfigSyncReconciler. // Must match envMCNodeClass in platform/internal/controller/machineconfigsync_reconciler.go. const envMCSyncNodeClass = "MC_NODE_CLASS" -// machineConfigSyncSecretNamespace returns the namespace that holds the source-of-truth Secret. -func machineConfigSyncSecretNamespace(clusterRef string) string { +// envMCSyncNodeIP is the env var key injected when MachineConfigSync.spec.nodeRef is set. +// When present, the capability applies the machineconfig to only this specific node IP. +// Must match envMCNodeIP in platform/internal/controller/machineconfigsync_reconciler.go. +// PLT-BUG-3-ARCH. +const envMCSyncNodeIP = "MC_NODE_IP" + +// machineConfigSyncCRNamespace returns the namespace holding the MachineConfig CR. +func machineConfigSyncCRNamespace(clusterRef string) string { return "seam-tenant-" + clusterRef } -// machineConfigSyncSecretName returns the canonical Secret name for a given cluster and class. -// Mirrors MachineConfigSecretName in platform/internal/controller/machineconfig_labels.go. -func machineConfigSyncSecretName(clusterRef, nodeClass string) string { +// machineConfigSyncCRName returns the MachineConfig CR name for a given cluster and nodeClass. +// Mirrors MachineConfigCRName in platform/internal/controller/machineconfig_labels.go. +func machineConfigSyncCRName(clusterRef, nodeClass string) string { return "seam-mc-" + clusterRef + "-" + nodeClass } @@ -51,19 +60,15 @@ func machineConfigSyncSecretName(clusterRef, nodeClass string) string { // Mirrors MachineConfigNodeLabel in platform/internal/controller/machineconfig_labels.go. const ontControlledLabel = "ont.platform.dev/controlled" -// machineConfigSyncDataKey is the Secret data key that holds the raw Talos machineconfig YAML. 
-// Mirrors MachineConfigDataKey in platform/internal/controller/machineconfig_labels.go. -const machineConfigSyncDataKey = "machineconfig" - // machineConfigSyncHandler implements the machineconfig-sync named capability. type machineConfigSyncHandler struct{} func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) { now := time.Now().UTC() - if params.TalosClient == nil || params.KubeClient == nil { + if params.TalosClient == nil || params.DynamicClient == nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - "machineconfig-sync requires TalosClient and KubeClient"), nil + "machineconfig-sync requires TalosClient and DynamicClient"), nil } nodeClass := os.Getenv(envMCSyncNodeClass) @@ -72,31 +77,30 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa "machineconfig-sync: MC_NODE_CLASS env var is required but not set"), nil } + // MC_NODE_IP is set by the MachineConfigSync reconciler when spec.nodeRef is + // non-empty. When present, apply to only this specific node. PLT-BUG-3-ARCH. + nodeIP := os.Getenv(envMCSyncNodeIP) + clusterRef := params.ClusterRef - secretNS := machineConfigSyncSecretNamespace(clusterRef) - secretName := machineConfigSyncSecretName(clusterRef, nodeClass) + crNS := machineConfigSyncCRNamespace(clusterRef) + crName := machineConfigSyncCRName(clusterRef, nodeClass) - // Read the source-of-truth machineconfig Secret. - secret, err := params.KubeClient.CoreV1().Secrets(secretNS).Get(ctx, secretName, metav1.GetOptions{}) + // Read the source-of-truth MachineConfig CR via the management cluster DynamicClient. + // MachineConfig CRs live in seam-tenant-{clusterRef} on the management cluster. + mcObj, err := params.DynamicClient.Resource(machineConfigGVR).Namespace(crNS). 
+ Get(ctx, crName, metav1.GetOptions{}) if err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, - fmt.Sprintf("get MachineConfig Secret %s/%s: %v", secretNS, secretName, err)), nil + fmt.Sprintf("get MachineConfig CR %s/%s: %v", crNS, crName, err)), nil } - mcBytes := secret.Data[machineConfigSyncDataKey] - if len(mcBytes) == 0 { + // Reconstruct full Talos YAML from spec.machine and spec.cluster sections. + // Both sections are stored as unstructured JSON in the CR; unmarshal and + // marshal back to produce a valid Talos v1alpha1 machineconfig YAML. + mcBytes, err := reconstructMachineConfigYAML(mcObj.Object) + if err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - fmt.Sprintf("MachineConfig Secret %s/%s has no data key %q", secretNS, secretName, machineConfigSyncDataKey)), nil - } - - // Decompress if the secret was stored with gzip compression. RECON-F5. - if secret.Labels[machineConfigCompressionLabel] == "gzip" { - decompressed, dcErr := decompressMachineConfig(mcBytes) - if dcErr != nil { - return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ValidationFailure, - fmt.Sprintf("decompress machineconfig from %s/%s: %v", secretNS, secretName, dcErr)), nil - } - mcBytes = decompressed + fmt.Sprintf("reconstruct machineconfig YAML from CR %s/%s: %v", crNS, crName, err)), nil } // Inject the ONT controlled node label into the machineconfig. @@ -116,9 +120,12 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa fmt.Sprintf("merged machineconfig is not valid YAML: %v", err)), nil } - // Enumerate nodes from talosconfig; fall back to single-context when absent. + // When MC_NODE_IP is set (PLT-BUG-3-ARCH per-node MCS), apply to only that + // specific node. Skip talosconfig endpoint enumeration. 
var nodeIPs []string - if params.TalosconfigPath != "" { + if nodeIP != "" { + nodeIPs = []string{nodeIP} + } else if params.TalosconfigPath != "" { ips, epErr := EndpointsFromTalosconfig(params.TalosconfigPath) if epErr != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, @@ -127,28 +134,20 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa nodeIPs = ips } - // singleNodeClass is true when nodeClass is already a per-node class (e.g. "node-cp1"). - // In that mode, the base secret IS the per-node config; skip per-node patch lookup. - singleNodeClass := strings.HasPrefix(nodeClass, "node-") - var steps []runnerlib.StepResult if len(nodeIPs) > 0 { - for _, nodeIP := range nodeIPs { - nodeConfig := modifiedConfig - if !singleNodeClass { - nodeConfig = perNodePatchConfig(ctx, params, secretNS, clusterRef, nodeIP, modifiedConfig) - } + for _, ip := range nodeIPs { stepStart := time.Now().UTC() - if err := params.TalosClient.ApplyConfiguration(NodeContext(ctx, nodeIP), nodeConfig, "no-reboot"); err != nil { + if err := params.TalosClient.ApplyConfiguration(NodeContext(ctx, ip), modifiedConfig, "no-reboot"); err != nil { return failureResult(runnerlib.CapabilityMachineConfigSync, now, runnerlib.ExecutionFailure, - fmt.Sprintf("ApplyConfiguration on %s: %v", nodeIP, err)), nil + fmt.Sprintf("ApplyConfiguration on %s: %v", ip, err)), nil } steps = append(steps, runnerlib.StepResult{ - Name: "machineconfig-sync-" + nodeIP, + Name: "machineconfig-sync-" + ip, Status: runnerlib.ResultSucceeded, StartedAt: stepStart, CompletedAt: time.Now().UTC(), - Message: fmt.Sprintf("machineconfig applied to %s (nodeClass=%s)", nodeIP, nodeClass), + Message: fmt.Sprintf("machineconfig applied to %s (nodeClass=%s)", ip, nodeClass), }) } } else { @@ -175,67 +174,33 @@ func (h *machineConfigSyncHandler) Execute(ctx context.Context, params ExecutePa }, nil } -// decompressMachineConfig gunzips gzip-compressed 
machineconfig bytes. RECON-F5. -func decompressMachineConfig(compressed []byte) ([]byte, error) { - r, err := gzip.NewReader(bytes.NewReader(compressed)) - if err != nil { - return nil, fmt.Errorf("gzip.NewReader: %w", err) - } - defer r.Close() - out, err := io.ReadAll(r) - if err != nil { - return nil, fmt.Errorf("read decompressed: %w", err) - } - return out, nil -} - -// perNodePatchConfig looks up any per-node patch secret for the Kubernetes node -// whose InternalIP matches nodeIP, then merges it with baseConfig. The ONT controlled -// label is always re-injected after merging so it cannot be overridden by a patch. -// Returns baseConfig unchanged when no per-node secret exists or any step fails. RECON-A8. -func perNodePatchConfig(ctx context.Context, params ExecuteParams, secretNS, clusterRef, nodeIP string, baseConfig []byte) []byte { - nodeList, err := params.KubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return baseConfig - } - - var hostname string - for i := range nodeList.Items { - node := &nodeList.Items[i] - for _, addr := range node.Status.Addresses { - if string(addr.Type) == "InternalIP" && addr.Address == nodeIP { - hostname = node.Name - break - } - } - if hostname != "" { - break - } +// reconstructMachineConfigYAML builds a Talos v1alpha1 machineconfig YAML document +// from a MachineConfig CR's unstructured object. The spec.machine and spec.cluster +// sections are extracted and merged into a single top-level map. +// +// Returns an error when neither section is present (empty CR is not applicable). 
+func reconstructMachineConfigYAML(obj map[string]interface{}) ([]byte, error) { + spec, _ := obj["spec"].(map[string]interface{}) + if spec == nil { + return nil, fmt.Errorf("MachineConfig CR has no spec") } - if hostname == "" { - return baseConfig + combined := map[string]interface{}{ + "version": "v1alpha1", + "debug": false, + "persist": true, } - - patchSecretName := machineConfigSyncSecretName(clusterRef, "node-"+hostname) - patchSecret, pErr := params.KubeClient.CoreV1().Secrets(secretNS).Get(ctx, patchSecretName, metav1.GetOptions{}) - if pErr != nil { - return baseConfig + if m := spec["machine"]; m != nil { + combined["machine"] = m } - patchBytes := patchSecret.Data[machineConfigSyncDataKey] - if len(patchBytes) == 0 { - return baseConfig + if c := spec["cluster"]; c != nil { + combined["cluster"] = c } - - merged, mergeErr := mergeYAMLPatch(baseConfig, patchBytes) - if mergeErr != nil { - return baseConfig + if spec["machine"] == nil && spec["cluster"] == nil { + return nil, fmt.Errorf("MachineConfig CR spec has neither machine nor cluster section") } - - // Re-inject the ONT controlled label: it must never be overridden by a per-node patch. 
- labelPatch := []byte(fmt.Sprintf(`{"machine":{"nodeLabels":{%q:"true"}}}`, ontControlledLabel)) - result, lErr := mergeYAMLPatch(merged, labelPatch) - if lErr != nil { - return merged + data, err := sigsyaml.Marshal(combined) + if err != nil { + return nil, fmt.Errorf("marshal reconstructed machineconfig: %w", err) } - return result + return data, nil } diff --git a/internal/capability/platform_machineconfig_sync_test.go b/internal/capability/platform_machineconfig_sync_test.go index 16ada2b..87af176 100644 --- a/internal/capability/platform_machineconfig_sync_test.go +++ b/internal/capability/platform_machineconfig_sync_test.go @@ -1,25 +1,24 @@ package capability import ( - "bytes" - "compress/gzip" "context" - "fmt" "io" "os" "testing" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + fakedyn "k8s.io/client-go/dynamic/fake" "github.com/ontai-dev/conductor-sdk/runnerlib" ) // stubApplyTalosClient records ApplyConfiguration calls and exposes applied bytes. 
type stubApplyTalosClient struct { - applied [][]byte - applyErr error + applied [][]byte + applyErr error } func (s *stubApplyTalosClient) Bootstrap(_ context.Context) error { return nil } @@ -28,45 +27,106 @@ func (s *stubApplyTalosClient) ApplyConfiguration(_ context.Context, cfg []byte, return s.applyErr } func (s *stubApplyTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubApplyTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubApplyTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubApplyTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } -func (s *stubApplyTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } -func (s *stubApplyTalosClient) EtcdDefragment(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) GetMachineConfig(_ context.Context) ([]byte, error) { return nil, nil } -func (s *stubApplyTalosClient) Kubeconfig(_ context.Context) ([]byte, error) { return nil, nil } -func (s *stubApplyTalosClient) Nodes() []string { return nil } -func (s *stubApplyTalosClient) Rollback(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) WipeDisk(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) Health(_ context.Context) error { return nil } -func (s *stubApplyTalosClient) Close() error { return nil } - -// mcSyncTestSecret builds a Kubernetes Secret for the machineconfig-sync capability tests. 
-func mcSyncTestSecret(clusterRef, nodeClass string, content []byte) *corev1.Secret { - return &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "seam-mc-" + clusterRef + "-" + nodeClass, - Namespace: "seam-tenant-" + clusterRef, - }, - Data: map[string][]byte{ - "machineconfig": content, - }, +func (s *stubApplyTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } +func (s *stubApplyTalosClient) EtcdDefragment(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) GetMachineConfig(_ context.Context) ([]byte, error) { return nil, nil } +func (s *stubApplyTalosClient) Kubeconfig(_ context.Context) ([]byte, error) { return nil, nil } +func (s *stubApplyTalosClient) Nodes() []string { return nil } +func (s *stubApplyTalosClient) Rollback(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) WipeDisk(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Health(_ context.Context) error { return nil } +func (s *stubApplyTalosClient) Close() error { return nil } + +// buildMCSyncScheme returns a runtime.Scheme with MachineConfig and MachineConfigList registered. +func buildMCSyncScheme() *runtime.Scheme { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + return scheme +} + +// buildMCSyncDynClient returns a fake DynamicClient with one MachineConfig CR seeded. +// machineSection and clusterSection are the spec.machine and spec.cluster content +// stored as unstructured Go maps (matching the CR's unstructured representation). 
+func buildMCSyncDynClient(clusterRef, nodeClass string, machineSection, clusterSection map[string]interface{}) *fakedyn.FakeDynamicClient { + cr := &unstructured.Unstructured{} + cr.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + cr.SetName(machineConfigSyncCRName(clusterRef, nodeClass)) + cr.SetNamespace(machineConfigSyncCRNamespace(clusterRef)) + + spec := map[string]interface{}{ + "role": "controlplane", + "order": int64(1), + "clusterRef": map[string]interface{}{"name": clusterRef}, + "nodeIP": "10.20.0.11", + "nodeHostname": nodeClass, + } + if machineSection != nil { + spec["machine"] = machineSection + } + if clusterSection != nil { + spec["cluster"] = clusterSection + } + cr.Object["spec"] = spec + + return fakedyn.NewSimpleDynamicClient(buildMCSyncScheme(), cr) +} + +// writeFakeTalosconfig writes a minimal talosconfig YAML to a temp file and returns +// its path. The config uses ctx.nodes so EndpointsFromTalosconfig returns nodeIPs directly. +func writeFakeTalosconfig(t *testing.T, nodeIPs []string) string { + t.Helper() + var nodesYAML string + for _, ip := range nodeIPs { + nodesYAML += " - " + ip + "\n" } + content := "context: default\ncontexts:\n default:\n endpoints: []\n nodes:\n" + nodesYAML + f, err := os.CreateTemp(t.TempDir(), "talosconfig-*.yaml") + if err != nil { + t.Fatalf("create temp talosconfig: %v", err) + } + if _, err := f.WriteString(content); err != nil { + t.Fatalf("write talosconfig: %v", err) + } + _ = f.Close() + return f.Name() +} + +// containsString is a simple string containment check for test use only. 
+func containsString(s, sub string) bool { + return len(s) >= len(sub) && (s == sub || len(sub) == 0 || + func() bool { + for i := 0; i <= len(s)-len(sub); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false + }()) } // TestMachineConfigSyncHandler_MissingEnvVar verifies that a ValidationFailure is // returned when MC_NODE_CLASS is absent from the environment. func TestMachineConfigSyncHandler_MissingEnvVar(t *testing.T) { - // Ensure MC_NODE_CLASS is not set. t.Setenv(envMCSyncNodeClass, "") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", + Capability: runnerlib.CapabilityMachineConfigSync, + ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: &stubApplyTalosClient{}, - KubeClient: fake.NewSimpleClientset(), + TalosClient: &stubApplyTalosClient{}, + DynamicClient: buildMCSyncDynClient("ccs-mgmt", "", nil, nil), }, }) if err != nil { @@ -81,9 +141,9 @@ func TestMachineConfigSyncHandler_MissingEnvVar(t *testing.T) { } // TestMachineConfigSyncHandler_NilClients verifies that a ValidationFailure is -// returned when TalosClient or KubeClient is nil. +// returned when TalosClient or DynamicClient is nil. func TestMachineConfigSyncHandler_NilClients(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") + t.Setenv(envMCSyncNodeClass, "cp1") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ @@ -101,18 +161,18 @@ func TestMachineConfigSyncHandler_NilClients(t *testing.T) { } } -// TestMachineConfigSyncHandler_SecretNotFound verifies that a ExecutionFailure is -// returned when the machineconfig Secret is absent from the cluster. 
-func TestMachineConfigSyncHandler_SecretNotFound(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") +// TestMachineConfigSyncHandler_CRNotFound verifies that an ExecutionFailure is +// returned when the MachineConfig CR is absent from the management cluster. +func TestMachineConfigSyncHandler_CRNotFound(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: &stubApplyTalosClient{}, - KubeClient: fake.NewSimpleClientset(), // no secret + TalosClient: &stubApplyTalosClient{}, + DynamicClient: fakedyn.NewSimpleDynamicClient(buildMCSyncScheme()), // no CR seeded }, }) if err != nil { @@ -121,16 +181,24 @@ func TestMachineConfigSyncHandler_SecretNotFound(t *testing.T) { if result.Status != runnerlib.ResultFailed { t.Errorf("expected ResultFailed, got %q", result.Status) } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ExecutionFailure { + t.Errorf("expected ExecutionFailure for missing CR, got %v", result.FailureReason) + } } // TestMachineConfigSyncHandler_AppliesAndInjectsLabel verifies that the handler -// applies the machineconfig to the Talos node and injects the ONT controlled label. +// reads the MachineConfig CR, reconstructs Talos YAML, applies it, and injects +// the ONT controlled node label. 
func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") - machineConfigContent := []byte("machine:\n type: controlplane\n nodeLabels: {}\n") - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", machineConfigContent) - kubeClient := fake.NewSimpleClientset(secret) + machineSection := map[string]interface{}{ + "type": "controlplane", + "nodeLabels": map[string]interface{}{}, + } + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, nil) talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} @@ -138,8 +206,8 @@ func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, + TalosClient: talosClient, + DynamicClient: dynClient, }, }) if err != nil { @@ -152,79 +220,32 @@ func TestMachineConfigSyncHandler_AppliesAndInjectsLabel(t *testing.T) { } } - // Verify ApplyConfiguration was called once. if len(talosClient.applied) != 1 { t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) } - - // Verify the applied config contains the ONT label. applied := string(talosClient.applied[0]) if !containsString(applied, ontControlledLabel) { t.Errorf("applied config does not contain node label %q:\n%s", ontControlledLabel, applied) } - if len(result.Steps) != 1 { t.Errorf("expected 1 step result, got %d", len(result.Steps)) } } -// containsString is a simple string containment check for test use only. 
-func containsString(s, sub string) bool { - return len(s) >= len(sub) && (s == sub || len(sub) == 0 || - func() bool { - for i := 0; i <= len(s)-len(sub); i++ { - if s[i:i+len(sub)] == sub { - return true - } - } - return false - }()) -} +// TestMachineConfigSyncHandler_ReconstructsBothSections verifies that spec.machine +// and spec.cluster are both present in the reconstructed Talos YAML. +func TestMachineConfigSyncHandler_ReconstructsBothSections(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") -// writeFakeTalosconfig writes a minimal talosconfig YAML to a temp file and returns -// its path. The config uses ctx.nodes so EndpointsFromTalosconfig returns nodeIPs directly. -func writeFakeTalosconfig(t *testing.T, nodeIPs []string) string { - t.Helper() - var nodesYAML string - for _, ip := range nodeIPs { - nodesYAML += fmt.Sprintf(" - %s\n", ip) - } - content := fmt.Sprintf("context: default\ncontexts:\n default:\n endpoints: []\n nodes:\n%s", nodesYAML) - f, err := os.CreateTemp(t.TempDir(), "talosconfig-*.yaml") - if err != nil { - t.Fatalf("create temp talosconfig: %v", err) + machineSection := map[string]interface{}{ + "type": "controlplane", } - if _, err := f.WriteString(content); err != nil { - t.Fatalf("write talosconfig: %v", err) + clusterSection := map[string]interface{}{ + "clusterName": "ccs-mgmt", } - _ = f.Close() - return f.Name() -} - -// TestMachineConfigSyncHandler_DecompressesGzipSecret verifies that when the machineconfig -// secret is gzip-compressed (compression label present), the capability decompresses it -// before applying. RECON-F5. 
-func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - rawContent := []byte("machine:\n type: controlplane\n") - var buf bytes.Buffer - w := mustGzipWriter(t, &buf) - _, _ = w.Write(rawContent) - _ = w.Close() - compressed := buf.Bytes() - - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "seam-mc-ccs-mgmt-controlplane", - Namespace: "seam-tenant-ccs-mgmt", - Labels: map[string]string{ - "platform.ontai.dev/compression": "gzip", - }, - }, - Data: map[string][]byte{"machineconfig": compressed}, - } - kubeClient := fake.NewSimpleClientset(secret) + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, clusterSection) talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} @@ -232,8 +253,8 @@ func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, + TalosClient: talosClient, + DynamicClient: dynClient, }, }) if err != nil { @@ -243,57 +264,38 @@ func TestMachineConfigSyncHandler_DecompressesGzipSecret(t *testing.T) { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + t.Fatalf("expected 1 ApplyConfiguration call, got %d", len(talosClient.applied)) } applied := string(talosClient.applied[0]) - if !containsString(applied, "type: controlplane") { - t.Errorf("decompressed content not present in applied config:\n%s", applied) + if !containsString(applied, "machine:") { + t.Errorf("applied config missing machine section:\n%s", applied) } -} - -// mustGzipWriter returns a gzip.Writer writing to w. Fatals if creation fails. 
-func mustGzipWriter(t *testing.T, w *bytes.Buffer) *gzip.Writer { - t.Helper() - gw := gzip.NewWriter(w) - return gw -} - -// buildMCSyncNode returns a minimal Kubernetes Node object with the given name and InternalIP. -func buildMCSyncNode(name, ip string) *corev1.Node { - return &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: name}, - Status: corev1.NodeStatus{ - Addresses: []corev1.NodeAddress{ - {Type: corev1.NodeInternalIP, Address: ip}, - }, - }, + if !containsString(applied, "cluster:") { + t.Errorf("applied config missing cluster section:\n%s", applied) } } -// TestMachineConfigSyncHandler_PerNodePatchMerged verifies that when a per-node patch -// secret exists for a node, its content is merged into the base class config. RECON-A8. -func TestMachineConfigSyncHandler_PerNodePatchMerged(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - patchContent := []byte("machine:\n network:\n hostname: cp1\n") - - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - patchSecret := mcSyncTestSecret("ccs-mgmt", "node-cp1", patchContent) - node := buildMCSyncNode("cp1", "10.20.0.2") - kubeClient := fake.NewSimpleClientset(secret, patchSecret, node) +// TestMachineConfigSyncHandler_NodeIPTargetsSingleNode verifies that when MC_NODE_IP +// is set, the capability applies to only that one node and skips talosconfig enumeration. +// PLT-BUG-3-ARCH. +func TestMachineConfigSyncHandler_NodeIPTargetsSingleNode(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") + + machineSection := map[string]interface{}{"type": "controlplane"} + dynClient := buildMCSyncDynClient("ccs-dev", "cp1", machineSection, nil) + // Provide a talosconfig with multiple nodes -- only MC_NODE_IP should be targeted. 
+ talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12", "10.20.0.13"}) talosClient := &stubApplyTalosClient{} - // Provide a fake talosconfig so the handler enumerates nodeIPs. - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.2"}) - handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", + ClusterRef: "ccs-dev", ExecuteClients: ExecuteClients{ TalosClient: talosClient, - KubeClient: kubeClient, + DynamicClient: dynClient, TalosconfigPath: talosconfigPath, }, }) @@ -303,38 +305,28 @@ func TestMachineConfigSyncHandler_PerNodePatchMerged(t *testing.T) { if result.Status != runnerlib.ResultSucceeded { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } + // Must apply to exactly 1 node (not 3). if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + t.Fatalf("expected 1 ApplyConfiguration call (MC_NODE_IP single-target), got %d", len(talosClient.applied)) } - applied := string(talosClient.applied[0]) - if !containsString(applied, "hostname: cp1") { - t.Errorf("per-node patch hostname not merged into applied config:\n%s", applied) + if len(result.Steps) != 1 { + t.Errorf("expected 1 step result, got %d", len(result.Steps)) } - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT controlled label missing from merged config:\n%s", applied) + if !containsString(result.Steps[0].Message, "10.20.0.11") { + t.Errorf("step message must reference nodeIP 10.20.0.11, got %q", result.Steps[0].Message) } } -// TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel verifies that a per-node -// patch cannot override the ontControlledLabel (protected field). RECON-A8. 
-func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - // Patch explicitly tries to remove/override the ONT label. - patchContent := []byte(`machine: - nodeLabels: - ont.platform.dev/controlled: "false" - custom-key: custom-val -`) - - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - patchSecret := mcSyncTestSecret("ccs-mgmt", "node-cp2", patchContent) - node := buildMCSyncNode("cp2", "10.20.0.3") - kubeClient := fake.NewSimpleClientset(secret, patchSecret, node) - talosClient := &stubApplyTalosClient{} +// TestMachineConfigSyncHandler_TalosconfigMultipleNodes verifies that when MC_NODE_IP +// is not set but a talosconfig with multiple nodes is provided, the capability applies +// to all enumerated nodes. +func TestMachineConfigSyncHandler_TalosconfigMultipleNodes(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.3"}) + machineSection := map[string]interface{}{"type": "controlplane"} + dynClient := buildMCSyncDynClient("ccs-mgmt", "cp1", machineSection, nil) + talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.11", "10.20.0.12"}) + talosClient := &stubApplyTalosClient{} handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ @@ -342,7 +334,7 @@ func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ TalosClient: talosClient, - KubeClient: kubeClient, + DynamicClient: dynClient, TalosconfigPath: talosconfigPath, }, }) @@ -352,90 +344,98 @@ func TestMachineConfigSyncHandler_PerNodePatchPreservesONTLabel(t *testing.T) { if result.Status != runnerlib.ResultSucceeded { t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) } - if len(talosClient.applied) != 1 { - 
t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) - } - applied := string(talosClient.applied[0]) - // ONT label must be "true" (re-injected after merge). - if !containsString(applied, `ont.platform.dev/controlled: "true"`) { - t.Errorf("ONT controlled label not protected; applied config:\n%s", applied) - } - // Per-node patch content should also be present. - if !containsString(applied, "custom-key") { - t.Errorf("per-node patch custom label missing from merged config:\n%s", applied) + if len(talosClient.applied) != 2 { + t.Errorf("expected 2 ApplyConfiguration calls (one per node), got %d", len(talosClient.applied)) } } -// TestMachineConfigSyncHandler_SingleNodeClass verifies that when nodeClass starts with -// "node-", no additional per-node patch lookup is performed and the base config is applied -// directly. RECON-A8. -func TestMachineConfigSyncHandler_SingleNodeClass(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "node-cp1") - - nodeContent := []byte("machine:\n type: controlplane\n network:\n hostname: cp1\n") - secret := mcSyncTestSecret("ccs-mgmt", "node-cp1", nodeContent) - kubeClient := fake.NewSimpleClientset(secret) - talosClient := &stubApplyTalosClient{} +// TestMachineConfigSyncHandler_CREmptySpecFails verifies that a ValidationFailure +// is returned when the MachineConfig CR spec has neither machine nor cluster section. +func TestMachineConfigSyncHandler_CREmptySpecFails(t *testing.T) { + t.Setenv(envMCSyncNodeClass, "cp1") + t.Setenv(envMCSyncNodeIP, "10.20.0.11") + defer t.Setenv(envMCSyncNodeIP, "") + + // Seed a CR with no machine/cluster sections. 
+ cr := &unstructured.Unstructured{} + cr.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + cr.SetName("seam-mc-ccs-mgmt-cp1") + cr.SetNamespace("seam-tenant-ccs-mgmt") + cr.Object["spec"] = map[string]interface{}{ + "role": "controlplane", + "order": int64(1), + } + dynClient := fakedyn.NewSimpleDynamicClient(buildMCSyncScheme(), cr) handler := &machineConfigSyncHandler{} result, err := handler.Execute(context.Background(), ExecuteParams{ Capability: runnerlib.CapabilityMachineConfigSync, ClusterRef: "ccs-mgmt", ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, + TalosClient: &stubApplyTalosClient{}, + DynamicClient: dynClient, }, }) if err != nil { t.Fatalf("unexpected error: %v", err) } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) - } - if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed for empty CR spec, got %q", result.Status) } - applied := string(talosClient.applied[0]) - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT controlled label missing in single-node-class apply:\n%s", applied) + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure for empty CR spec, got %v", result.FailureReason) } } -// TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase verifies that when no -// per-node patch secret exists, the base class config is applied unchanged. RECON-A8. 
-func TestMachineConfigSyncHandler_NoPatchSecretFallsBackToBase(t *testing.T) { - t.Setenv(envMCSyncNodeClass, "controlplane") - - baseContent := []byte("machine:\n type: controlplane\n") - secret := mcSyncTestSecret("ccs-mgmt", "controlplane", baseContent) - node := buildMCSyncNode("cp3", "10.20.0.4") - // No per-node patch secret in the fake client. - kubeClient := fake.NewSimpleClientset(secret, node) - talosClient := &stubApplyTalosClient{} - - talosconfigPath := writeFakeTalosconfig(t, []string{"10.20.0.4"}) - - handler := &machineConfigSyncHandler{} - result, err := handler.Execute(context.Background(), ExecuteParams{ - Capability: runnerlib.CapabilityMachineConfigSync, - ClusterRef: "ccs-mgmt", - ExecuteClients: ExecuteClients{ - TalosClient: talosClient, - KubeClient: kubeClient, - TalosconfigPath: talosconfigPath, +// TestReconstructMachineConfigYAML_BothSections verifies YAML reconstruction from a CR +// with both machine and cluster sections. +func TestReconstructMachineConfigYAML_BothSections(t *testing.T) { + obj := map[string]interface{}{ + "spec": map[string]interface{}{ + "machine": map[string]interface{}{ + "type": "controlplane", + }, + "cluster": map[string]interface{}{ + "clusterName": "ccs-mgmt", + }, }, - }) + } + out, err := reconstructMachineConfigYAML(obj) if err != nil { t.Fatalf("unexpected error: %v", err) } - if result.Status != runnerlib.ResultSucceeded { - t.Fatalf("expected ResultSucceeded, got %q; reason: %v", result.Status, result.FailureReason) + s := string(out) + if !containsString(s, "machine:") { + t.Errorf("output missing machine section:\n%s", s) } - if len(talosClient.applied) != 1 { - t.Fatalf("expected 1 apply call, got %d", len(talosClient.applied)) + if !containsString(s, "cluster:") { + t.Errorf("output missing cluster section:\n%s", s) } - applied := string(talosClient.applied[0]) - if !containsString(applied, ontControlledLabel) { - t.Errorf("ONT label missing in fallback apply:\n%s", applied) +} + +// 
TestReconstructMachineConfigYAML_MissingSpec verifies an error when spec is absent. +func TestReconstructMachineConfigYAML_MissingSpec(t *testing.T) { + _, err := reconstructMachineConfigYAML(map[string]interface{}{}) + if err == nil { + t.Error("expected error for missing spec, got nil") + } +} + +// TestReconstructMachineConfigYAML_EmptySections verifies an error when neither +// machine nor cluster is present in spec. +func TestReconstructMachineConfigYAML_EmptySections(t *testing.T) { + obj := map[string]interface{}{ + "spec": map[string]interface{}{ + "role": "controlplane", + }, + } + _, err := reconstructMachineConfigYAML(obj) + if err == nil { + t.Error("expected error for spec with no machine/cluster, got nil") } } + +// metav1 import is used via metav1.GetOptions in the dynamic client calls. +var _ = metav1.GetOptions{} diff --git a/internal/capability/platform_machineconfig_test.go b/internal/capability/platform_machineconfig_test.go index 66b2719..3c717d8 100644 --- a/internal/capability/platform_machineconfig_test.go +++ b/internal/capability/platform_machineconfig_test.go @@ -23,7 +23,8 @@ type stubTalosClientMC struct { func (s *stubTalosClientMC) Bootstrap(_ context.Context) error { return nil } func (s *stubTalosClientMC) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubTalosClientMC) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubTalosClientMC) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientMC) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientMC) RebootPowercycle(_ context.Context) error { return nil } func (s *stubTalosClientMC) Reset(_ context.Context, _ bool) error { return nil } func (s *stubTalosClientMC) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubTalosClientMC) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } @@ -302,7 +303,8 @@ func (s *stubTalosClientRestore) 
ApplyConfiguration(_ context.Context, _ []byte, return s.applyErr } func (s *stubTalosClientRestore) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubTalosClientRestore) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientRestore) Reboot(_ context.Context) error { return nil } +func (s *stubTalosClientRestore) RebootPowercycle(_ context.Context) error { return nil } func (s *stubTalosClientRestore) Reset(_ context.Context, _ bool) error { return nil } func (s *stubTalosClientRestore) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubTalosClientRestore) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_node_scaleup_test.go b/internal/capability/platform_node_scaleup_test.go index cb1cabf..7e3d02e 100644 --- a/internal/capability/platform_node_scaleup_test.go +++ b/internal/capability/platform_node_scaleup_test.go @@ -37,7 +37,8 @@ func (s *stubEnrollTalosClient) ApplyConfiguration(_ context.Context, cfg []byte return s.applyErr } func (s *stubEnrollTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (s *stubEnrollTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubEnrollTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubEnrollTalosClient) RebootPowercycle(_ context.Context) error { return nil } func (s *stubEnrollTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubEnrollTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubEnrollTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_postop_test.go b/internal/capability/platform_postop_test.go index 332a003..26432ce 100644 --- a/internal/capability/platform_postop_test.go +++ b/internal/capability/platform_postop_test.go @@ -18,7 +18,8 @@ type mockTalosPostOp struct { func (m 
*mockTalosPostOp) Bootstrap(ctx context.Context) error { return nil } func (m *mockTalosPostOp) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (m *mockTalosPostOp) Upgrade(_ context.Context, _ string, _ bool) error { return nil } -func (m *mockTalosPostOp) Reboot(_ context.Context) error { return nil } +func (m *mockTalosPostOp) Reboot(_ context.Context) error { return nil } +func (m *mockTalosPostOp) RebootPowercycle(_ context.Context) error { return nil } func (m *mockTalosPostOp) Reset(_ context.Context, _ bool) error { return nil } func (m *mockTalosPostOp) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (m *mockTalosPostOp) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } diff --git a/internal/capability/platform_upgrade.go b/internal/capability/platform_upgrade.go index 9a94507..fb3cd0a 100644 --- a/internal/capability/platform_upgrade.go +++ b/internal/capability/platform_upgrade.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "log/slog" + "sort" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -116,6 +117,40 @@ func clearUpgradeProgress(ctx context.Context, dynClient dynamic.Interface, ns, } } +// nodesFromMachineConfigCRs lists all MachineConfig CRs in seam-tenant-{clusterRef}, +// sorts them by spec.order ascending (lower ordinal upgrades first), and returns the +// ordered slice of nodeIP strings. Returns nil when no CRs are found or on list error +// so the caller can fall back to talosconfig endpoint enumeration. 
+func nodesFromMachineConfigCRs(ctx context.Context, dynClient dynamic.Interface, ns string) []string { + crList, err := dynClient.Resource(machineConfigGVR).Namespace(ns).List(ctx, metav1.ListOptions{}) + if err != nil || len(crList.Items) == 0 { + return nil + } + type nodeEntry struct { + order int64 + nodeIP string + } + entries := make([]nodeEntry, 0, len(crList.Items)) + for _, item := range crList.Items { + spec, _ := item.Object["spec"].(map[string]interface{}) + if spec == nil { + continue + } + ip, _, _ := unstructuredString(spec, "nodeIP") + if ip == "" { + continue + } + order, _ := spec["order"].(int64) + entries = append(entries, nodeEntry{order: order, nodeIP: ip}) + } + sort.Slice(entries, func(i, j int) bool { return entries[i].order < entries[j].order }) + ips := make([]string, len(entries)) + for i, e := range entries { + ips[i] = e.nodeIP + } + return ips +} + // talosUpgradeHandler implements the talos-upgrade named capability. // Performs a rolling sequential upgrade of all nodes: each node is upgraded // with stage=false (immediate reboot), then we wait for it to return healthy @@ -130,13 +165,18 @@ func (h *talosUpgradeHandler) Execute(ctx context.Context, params ExecuteParams) "talos-upgrade requires TalosClient and DynamicClient"), nil } - nodes := params.TalosClient.Nodes() + ns := tenantNamespace(params.ClusterRef) + + // Use MachineConfig CRs sorted by spec.order as the canonical node iteration order. + // Falls back to talosconfig endpoint enumeration when no CRs are present. 
+ nodes := nodesFromMachineConfigCRs(ctx, params.DynamicClient, ns) + if len(nodes) == 0 { + nodes = params.TalosClient.Nodes() + } if len(nodes) == 0 { return failureResult(runnerlib.CapabilityTalosUpgrade, now, runnerlib.ValidationFailure, - "talos-upgrade: no nodes available from talosconfig"), nil + "talos-upgrade: no nodes available from MachineConfig CRs or talosconfig"), nil } - - ns := tenantNamespace(params.ClusterRef) crList, err := params.DynamicClient.Resource(upgradePolicyGVR).Namespace(ns). List(ctx, metav1.ListOptions{}) if err != nil { @@ -220,11 +260,11 @@ func (h *talosUpgradeHandler) Execute(ctx context.Context, params ExecuteParams) fmt.Sprintf("stage upgrade node %s to %s: %v", nodeIP, upgradeImage, uErr)), nil } - if rErr := params.TalosClient.Reboot(nodeCtx); rErr != nil { - slog.Info("talos-upgrade: forced reboot failed", + if rErr := params.TalosClient.RebootPowercycle(nodeCtx); rErr != nil { + slog.Info("talos-upgrade: powercycle reboot failed", slog.String("node", nodeIP), slog.String("error", rErr.Error())) return failureResult(runnerlib.CapabilityTalosUpgrade, now, runnerlib.ExecutionFailure, - fmt.Sprintf("reboot node %s after staging upgrade to %s: %v", nodeIP, upgradeImage, rErr)), nil + fmt.Sprintf("powercycle reboot node %s after staging upgrade to %s: %v", nodeIP, upgradeImage, rErr)), nil } slog.Info("talos-upgrade: upgrade staged and reboot forced, waiting for node reboot", diff --git a/internal/capability/platform_upgrade_test.go b/internal/capability/platform_upgrade_test.go index c3b227a..c6d7be3 100644 --- a/internal/capability/platform_upgrade_test.go +++ b/internal/capability/platform_upgrade_test.go @@ -2,6 +2,7 @@ package capability import ( "context" + "fmt" "io" "testing" @@ -11,20 +12,25 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" fakedyn "k8s.io/client-go/dynamic/fake" "k8s.io/client-go/kubernetes/fake" + + "github.com/ontai-dev/conductor-sdk/runnerlib" ) // stubUpgradeTalosClient is a TalosNodeClient stub 
for upgrade tests. // Health returns healthErr on every call (nil = node is healthy). +// powercycleCalled records whether RebootPowercycle was called (for Phase 4b test assertions). type stubUpgradeTalosClient struct { - nodes []string - healthErr error - upgradeErr error + nodes []string + healthErr error + upgradeErr error + powercycleCalled bool } func (s *stubUpgradeTalosClient) Bootstrap(_ context.Context) error { return nil } func (s *stubUpgradeTalosClient) ApplyConfiguration(_ context.Context, _ []byte, _ string) error { return nil } func (s *stubUpgradeTalosClient) Upgrade(_ context.Context, _ string, _ bool) error { return s.upgradeErr } func (s *stubUpgradeTalosClient) Reboot(_ context.Context) error { return nil } +func (s *stubUpgradeTalosClient) RebootPowercycle(_ context.Context) error { s.powercycleCalled = true; return nil } func (s *stubUpgradeTalosClient) Reset(_ context.Context, _ bool) error { return nil } func (s *stubUpgradeTalosClient) EtcdSnapshot(_ context.Context, _ io.Writer) error { return nil } func (s *stubUpgradeTalosClient) EtcdRecover(_ context.Context, _ io.Reader) error { return nil } @@ -270,3 +276,131 @@ func TestWaitForNodeReboot_KubeNotReadyReturnsError(t *testing.T) { t.Error("expected error when Kubernetes node remains NotReady, got nil") } } + +// ── nodesFromMachineConfigCRs ──────────────────────────────────────────────── + +// buildUpgradeDynClientWithMachineConfigs returns a fake DynamicClient containing +// both an UpgradePolicy CR and a set of MachineConfig CRs for order-based iteration tests. +// nodes is a slice of (nodeIP, order) pairs; the function creates one MachineConfig CR per entry. 
+func buildUpgradeDynClientWithMachineConfigs(clusterRef, policyName, upgradeType, targetVersion string, nodes []struct{ ip string; order int64 }) *fakedyn.FakeDynamicClient { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicy", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicyList", + }, &unstructured.UnstructuredList{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + + ns := "seam-tenant-" + clusterRef + + policy := &unstructured.Unstructured{} + policy.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "UpgradePolicy", + }) + policy.SetName(policyName) + policy.SetNamespace(ns) + specVersionKey := "targetTalosVersion" + if upgradeType == "kubernetes" { + specVersionKey = "targetKubernetesVersion" + } + policy.Object["spec"] = map[string]interface{}{ + "upgradeType": upgradeType, + specVersionKey: targetVersion, + } + + objs := []runtime.Object{policy} + for i, n := range nodes { + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }) + mc.SetName(fmt.Sprintf("seam-mc-%s-node%d", clusterRef, i)) + mc.SetNamespace(ns) + mc.Object["spec"] = map[string]interface{}{ + "nodeIP": n.ip, + "order": n.order, + } + objs = append(objs, mc) + } + return fakedyn.NewSimpleDynamicClient(scheme, objs...) 
+} + +// TestNodesFromMachineConfigCRs_SortsAscendingByOrder verifies that +// nodesFromMachineConfigCRs returns nodeIPs sorted by spec.order ascending. +// Phase 4b: upgrade order is driven by MachineConfig CR spec.order. +func TestNodesFromMachineConfigCRs_SortsAscendingByOrder(t *testing.T) { + // Intentionally seed CRs out of order: 2, 0, 1 -- expect 0, 1, 2 back. + dyn := buildUpgradeDynClientWithMachineConfigs("ccs-dev", "up-mc", "talos", "v1.10.0", []struct{ ip string; order int64 }{ + {ip: "10.20.0.12", order: 2}, + {ip: "10.20.0.10", order: 0}, + {ip: "10.20.0.11", order: 1}, + }) + ns := "seam-tenant-ccs-dev" + + got := nodesFromMachineConfigCRs(context.Background(), dyn, ns) + want := []string{"10.20.0.10", "10.20.0.11", "10.20.0.12"} + if len(got) != len(want) { + t.Fatalf("got %d nodes, want %d: %v", len(got), len(want), got) + } + for i, ip := range want { + if got[i] != ip { + t.Errorf("node[%d]: got %q, want %q", i, got[i], ip) + } + } +} + +// TestNodesFromMachineConfigCRs_EmptyWhenNoCRs verifies that nodesFromMachineConfigCRs +// returns nil when no MachineConfig CRs exist, so the caller can fall back. +func TestNodesFromMachineConfigCRs_EmptyWhenNoCRs(t *testing.T) { + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfig", + }, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "platform.ontai.dev", Version: "v1alpha1", Kind: "MachineConfigList", + }, &unstructured.UnstructuredList{}) + dyn := fakedyn.NewSimpleDynamicClient(scheme) + + got := nodesFromMachineConfigCRs(context.Background(), dyn, "seam-tenant-ccs-dev") + if len(got) != 0 { + t.Errorf("expected nil/empty, got %v", got) + } +} + +// TestTalosUpgrade_PowercycleCalledAfterStage verifies that after staging the Talos +// upgrade, the handler calls RebootPowercycle (not plain Reboot) on each node. 
+// Phase 4b: hardware powercycle ensures clean BIOS/UEFI re-initialisation post-upgrade. +func TestTalosUpgrade_PowercycleCalledAfterStage(t *testing.T) { + defer setRebootTimeouts()() + + talos := &stubUpgradeTalosClient{ + nodes: []string{"10.20.0.10"}, + } + dyn := buildUpgradeDynClientWithMachineConfigs("ccs-dev", "up-pow", "talos", "v1.10.0", []struct{ ip string; order int64 }{ + {ip: "10.20.0.10", order: 0}, + }) + + handler := &talosUpgradeHandler{} + result, err := handler.Execute(context.Background(), ExecuteParams{ + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TalosClient: talos, + DynamicClient: dyn, + }, + }) + if err != nil { + t.Fatalf("Execute error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded, got %q: %v", result.Status, result.Steps) + } + if !talos.powercycleCalled { + t.Error("expected RebootPowercycle to be called; it was not") + } +} diff --git a/internal/capability/stubs.go b/internal/capability/stubs.go index 826a229..2d31a06 100644 --- a/internal/capability/stubs.go +++ b/internal/capability/stubs.go @@ -42,6 +42,13 @@ func RegisterAll(reg *Registry) { // Guardian capabilities -- RBAC plane. reg.Register(runnerlib.CapabilityRBACProvision, &rbacProvisionHandler{}) + // Watchdog capabilities -- runtime failure remediation. Triggered by + // RuntimeDrift DriftSignals on the management cluster. conductor-schema.md §6. + reg.Register(runnerlib.CapabilityPodRestart, &podRestartHandler{}) + reg.Register(runnerlib.CapabilityResourcePatch, &resourcePatchHandler{}) + reg.Register(runnerlib.CapabilityForceVolumeDetach, &forceVolumeDetachHandler{}) + reg.Register(runnerlib.CapabilityCredentialRefresh, &credentialRefreshHandler{}) + // Note: CapabilityPackCompile is NOT registered here. pack-compile is a // Compiler compile-mode invocation only -- it never runs as a Conductor Job. // Registering it here would be a schema violation. conductor-schema.md §6. 
diff --git a/internal/capability/watchdog.go b/internal/capability/watchdog.go new file mode 100644 index 0000000..2fab1fe --- /dev/null +++ b/internal/capability/watchdog.go @@ -0,0 +1,355 @@ +// Package capability — Conductor Watchdog remediation capability implementations. +// pod-restart, resource-patch, force-volume-detach, credential-refresh. +// Triggered by RuntimeDrift DriftSignals on the management cluster. +// conductor-schema.md §6, conductor-sdk runnerlib/constants.go. +package capability + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + + "github.com/ontai-dev/conductor-sdk/runnerlib" +) + +// packNameLabel is the label key used to scope watchdog operations to a specific pack. +// Set by the pack-deploy handler on pod template specs at deploy time. +const packNameLabel = "seam.ontai.dev/pack-name" + +// volumeAttachmentGVR is the GroupVersionResource for storage.k8s.io/v1 VolumeAttachment. +var volumeAttachmentGVR = schema.GroupVersionResource{ + Group: "storage.k8s.io", + Version: "v1", + Resource: "volumeattachments", +} + +// podGVR is the GroupVersionResource for core/v1 Pod. +var podGVR = schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} + +// podNamespaceAnnotation is stamped on Jobs by the execute runner so watchdog handlers +// can find the namespace where the failing pod lives. +const podNamespaceAnnotation = "conductor.ontai.dev/pod-namespace" + +// --------------------------------------------------------------------------- +// pod-restart +// --------------------------------------------------------------------------- + +// podRestartHandler implements the pod-restart named capability. +// Deletes all pods bearing the seam.ontai.dev/pack-name={PackInstalledName} label +// so the ReplicaSet controller recreates them. Used for CrashLoopBackOff failures. 
+// conductor-schema.md §6, conductor-sdk CapabilityPodRestart.
+type podRestartHandler struct{}
+
+// Execute deletes every pod, in any namespace, whose seam.ontai.dev/pack-name
+// label equals params.PackInstalledName.
+//
+// It needs params.TenantKubeClient and a non-empty params.PackInstalledName;
+// when either is missing a ValidationFailure result is returned. A failed list
+// or delete call yields an ExecutionFailure result, and the first delete error
+// stops the run. The returned error is always nil: failures are reported
+// through the result. Zero matching pods is a success.
+func (h *podRestartHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) {
+	now := time.Now().UTC()
+
+	if params.TenantKubeClient == nil {
+		return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ValidationFailure,
+			"pod-restart requires TenantKubeClient"), nil
+	}
+	if params.PackInstalledName == "" {
+		return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ValidationFailure,
+			"pod-restart requires PackInstalledName"), nil
+	}
+
+	// An empty namespace lists across all namespaces.
+	selector := packNameLabel + "=" + params.PackInstalledName
+	podList, err := params.TenantKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{
+		LabelSelector: selector,
+	})
+	if err != nil {
+		return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ExecutionFailure,
+			fmt.Sprintf("list pods with label %s on cluster %s: %v", selector, params.ClusterRef, err)), nil
+	}
+
+	stepStart := time.Now().UTC()
+	deleted := 0
+	// Index into the slice so the Pod struct is not copied on every iteration.
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+		if err := params.TenantKubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
+			return failureResult(runnerlib.CapabilityPodRestart, now, runnerlib.ExecutionFailure,
+				fmt.Sprintf("delete pod %s/%s on cluster %s: %v", pod.Namespace, pod.Name, params.ClusterRef, err)), nil
+		}
+		deleted++
+		params.Log().Info("pod-restart: deleted pod", "cluster", params.ClusterRef, "pod", pod.Namespace+"/"+pod.Name)
+	}
+
+	return runnerlib.OperationResultSpec{
+		Capability:  runnerlib.CapabilityPodRestart,
+		Status:      runnerlib.ResultSucceeded,
+		StartedAt:   now,
+		CompletedAt: time.Now().UTC(),
+		Artifacts:   []runnerlib.ArtifactRef{},
+		Steps: []runnerlib.StepResult{{
+			Name:        "delete-pods",
+			Status:      runnerlib.ResultSucceeded,
+			StartedAt:   stepStart,
+			CompletedAt: time.Now().UTC(),
+			Message:     fmt.Sprintf("%d pod(s) deleted from pack %s on cluster %s", deleted, params.PackInstalledName, params.ClusterRef),
+		}},
+	}, nil
+}
+
+// ---------------------------------------------------------------------------
+// resource-patch
+// ---------------------------------------------------------------------------
+
+// resourcePatchHandler implements the resource-patch named capability.
+// Triggers a rollout restart on all Deployments and StatefulSets bearing the
+// seam.ontai.dev/pack-name label. The rollout annotation forces pods to be
+// recreated with updated scheduler placement, resolving OOMKilled failures.
+// conductor-schema.md §6, conductor-sdk CapabilityResourcePatch.
+type resourcePatchHandler struct{}
+
+// Execute merge-patches the kubectl.kubernetes.io/restartedAt pod-template
+// annotation onto every Deployment and StatefulSet, in any namespace, whose
+// seam.ontai.dev/pack-name label equals params.PackInstalledName.
+//
+// It needs params.TenantDynamicClient and a non-empty params.PackInstalledName;
+// when either is missing a ValidationFailure result is returned. A failed list
+// or patch call yields an ExecutionFailure result, and the first patch error
+// stops the run. The returned error is always nil. Zero matching workloads is
+// a success.
+func (h *resourcePatchHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) {
+	now := time.Now().UTC()
+
+	if params.TenantDynamicClient == nil {
+		return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ValidationFailure,
+			"resource-patch requires TenantDynamicClient"), nil
+	}
+	if params.PackInstalledName == "" {
+		return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ValidationFailure,
+			"resource-patch requires PackInstalledName"), nil
+	}
+
+	selector := packNameLabel + "=" + params.PackInstalledName
+
+	// The payload sets the restart annotation on spec.template.metadata.annotations.
+	// It is the same for every workload, so it is built once here rather than
+	// once per patched object.
+	patch := []byte(fmt.Sprintf(
+		`{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
+		time.Now().UTC().Format(time.RFC3339),
+	))
+
+	// Patch both Deployments and StatefulSets that belong to this pack.
+	gvrs := []schema.GroupVersionResource{deploymentGVR, statefulSetGVR}
+	stepStart := time.Now().UTC()
+	patched := 0
+
+	for _, gvr := range gvrs {
+		list, err := params.TenantDynamicClient.Resource(gvr).Namespace("").List(ctx, metav1.ListOptions{
+			LabelSelector: selector,
+		})
+		if err != nil {
+			return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ExecutionFailure,
+				fmt.Sprintf("list %s with label %s on cluster %s: %v", gvr.Resource, selector, params.ClusterRef, err)), nil
+		}
+		for i := range list.Items {
+			item := &list.Items[i]
+			_, err := params.TenantDynamicClient.Resource(gvr).Namespace(item.GetNamespace()).Patch(
+				ctx, item.GetName(), types.MergePatchType, patch, metav1.PatchOptions{},
+			)
+			if err != nil {
+				return failureResult(runnerlib.CapabilityResourcePatch, now, runnerlib.ExecutionFailure,
+					fmt.Sprintf("patch %s %s/%s on cluster %s: %v", gvr.Resource, item.GetNamespace(), item.GetName(), params.ClusterRef, err)), nil
+			}
+			patched++
+			params.Log().Info("resource-patch: rollout restart triggered",
+				"cluster", params.ClusterRef, "resource", gvr.Resource, "name", item.GetNamespace()+"/"+item.GetName())
+		}
+	}
+
+	return runnerlib.OperationResultSpec{
+		Capability:  runnerlib.CapabilityResourcePatch,
+		Status:      runnerlib.ResultSucceeded,
+		StartedAt:   now,
+		CompletedAt: time.Now().UTC(),
+		Artifacts:   []runnerlib.ArtifactRef{},
+		Steps: []runnerlib.StepResult{{
+			Name:        "rollout-restart",
+			Status:      runnerlib.ResultSucceeded,
+			StartedAt:   stepStart,
+			CompletedAt: time.Now().UTC(),
+			Message:     fmt.Sprintf("%d workload(s) patched for rollout restart on cluster %s", patched, params.ClusterRef),
+		}},
+	}, nil
+}
+
+// ---------------------------------------------------------------------------
+// force-volume-detach
+// ---------------------------------------------------------------------------
+
+// forceVolumeDetachHandler implements the force-volume-detach named capability.
+// Deletes VolumeAttachment objects for PVCs belonging to the pack so the kubelet
+// can re-attach the volumes on a healthy node. Used for FailedMount and
+// MultiAttachError failures. conductor-schema.md §6, conductor-sdk CapabilityForceVolumeDetach.
+type forceVolumeDetachHandler struct{}
+
+// Execute resolves the pack's labelled PVCs to their bound PV names and then
+// deletes every VolumeAttachment whose spec.source.persistentVolumeName is one
+// of those PVs.
+//
+// Missing TenantDynamicClient or PackInstalledName produces a ValidationFailure
+// result; a failed list or delete call produces an ExecutionFailure result. The
+// returned error is always nil. Having no bound PVs is reported as success.
+func (h *forceVolumeDetachHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) {
+	now := time.Now().UTC()
+
+	if params.TenantDynamicClient == nil {
+		return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ValidationFailure,
+			"force-volume-detach requires TenantDynamicClient"), nil
+	}
+	if params.PackInstalledName == "" {
+		return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ValidationFailure,
+			"force-volume-detach requires PackInstalledName"), nil
+	}
+
+	// succeeded builds the single-step success result used by both success exits.
+	succeeded := func(stepStartedAt time.Time, message string) (runnerlib.OperationResultSpec, error) {
+		return runnerlib.OperationResultSpec{
+			Capability:  runnerlib.CapabilityForceVolumeDetach,
+			Status:      runnerlib.ResultSucceeded,
+			StartedAt:   now,
+			CompletedAt: time.Now().UTC(),
+			Artifacts:   []runnerlib.ArtifactRef{},
+			Steps: []runnerlib.StepResult{{
+				Name:        "delete-volume-attachments",
+				Status:      runnerlib.ResultSucceeded,
+				StartedAt:   stepStartedAt,
+				CompletedAt: time.Now().UTC(),
+				Message:     message,
+			}},
+		}, nil
+	}
+
+	labelSelector := packNameLabel + "=" + params.PackInstalledName
+
+	// Step 1: find the pack's PVCs across all namespaces.
+	claims, err := params.TenantDynamicClient.Resource(pvcGVR).Namespace("").List(ctx, metav1.ListOptions{
+		LabelSelector: labelSelector,
+	})
+	if err != nil {
+		return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure,
+			fmt.Sprintf("list PVCs with label %s on cluster %s: %v", labelSelector, params.ClusterRef, err)), nil
+	}
+
+	// Step 2: collect the names of the PVs those claims are bound to. Claims
+	// with no spec.volumeName contribute nothing.
+	boundPVs := make(map[string]struct{}, len(claims.Items))
+	for _, claim := range claims.Items {
+		claimSpec, _, _ := unstructuredNestedMap(claim.Object, "spec")
+		name, _ := claimSpec["volumeName"].(string)
+		if name == "" {
+			continue
+		}
+		boundPVs[name] = struct{}{}
+	}
+
+	if len(boundPVs) == 0 {
+		return succeeded(now,
+			fmt.Sprintf("no PVs bound to pack %s on cluster %s", params.PackInstalledName, params.ClusterRef))
+	}
+
+	// Step 3: list all VolumeAttachments and delete the ones that reference a
+	// collected PV.
+	stepStart := time.Now().UTC()
+	attachments, err := params.TenantDynamicClient.Resource(volumeAttachmentGVR).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure,
+			fmt.Sprintf("list VolumeAttachments on cluster %s: %v", params.ClusterRef, err)), nil
+	}
+
+	removed := 0
+	for _, attachment := range attachments.Items {
+		attachmentSpec, _, _ := unstructuredNestedMap(attachment.Object, "spec")
+		source, _ := attachmentSpec["source"].(map[string]interface{})
+		pvName, _ := source["persistentVolumeName"].(string)
+		if _, wanted := boundPVs[pvName]; wanted {
+			delErr := params.TenantDynamicClient.Resource(volumeAttachmentGVR).Delete(ctx, attachment.GetName(), metav1.DeleteOptions{})
+			if delErr != nil {
+				return failureResult(runnerlib.CapabilityForceVolumeDetach, now, runnerlib.ExecutionFailure,
+					fmt.Sprintf("delete VolumeAttachment %s on cluster %s: %v", attachment.GetName(), params.ClusterRef, delErr)), nil
+			}
+			removed++
+			params.Log().Info("force-volume-detach: deleted VolumeAttachment",
+				"cluster", params.ClusterRef, "volumeAttachment", attachment.GetName(), "pv", pvName)
+		}
+	}
+
+	return succeeded(stepStart,
+		fmt.Sprintf("%d VolumeAttachment(s) deleted for pack %s on cluster %s", removed, params.PackInstalledName, params.ClusterRef))
+}
+
+// ---------------------------------------------------------------------------
+// credential-refresh
+// ---------------------------------------------------------------------------
+
+// credentialRefreshHandler implements the credential-refresh named capability.
+// Deletes the pods bearing the seam.ontai.dev/pack-name label that are waiting
+// in ImagePullBackOff or ErrImagePull, so the kubelet retries the image pull
+// with up-to-date imagePullSecret credentials. Pods in any other state are
+// left alone. Intended for cases where the imagePullSecret has been refreshed
+// out-of-band (e.g., by a secret rotation operator).
+// conductor-schema.md §6, conductor-sdk CapabilityCredentialRefresh.
+type credentialRefreshHandler struct{}
+
+// Execute lists the pack's pods across all namespaces and deletes each one for
+// which hasPullFailure reports true.
+//
+// Missing TenantKubeClient or PackInstalledName produces a ValidationFailure
+// result; a failed list or delete call produces an ExecutionFailure result. The
+// returned error is always nil. Deleting nothing is a success.
+func (h *credentialRefreshHandler) Execute(ctx context.Context, params ExecuteParams) (runnerlib.OperationResultSpec, error) {
+	startedAt := time.Now().UTC()
+
+	// Preconditions, checked in the same order as the other watchdog handlers.
+	switch {
+	case params.TenantKubeClient == nil:
+		return failureResult(runnerlib.CapabilityCredentialRefresh, startedAt, runnerlib.ValidationFailure,
+			"credential-refresh requires TenantKubeClient"), nil
+	case params.PackInstalledName == "":
+		return failureResult(runnerlib.CapabilityCredentialRefresh, startedAt, runnerlib.ValidationFailure,
+			"credential-refresh requires PackInstalledName"), nil
+	}
+
+	labelSelector := packNameLabel + "=" + params.PackInstalledName
+	listOpts := metav1.ListOptions{LabelSelector: labelSelector}
+	candidates, err := params.TenantKubeClient.CoreV1().Pods("").List(ctx, listOpts)
+	if err != nil {
+		return failureResult(runnerlib.CapabilityCredentialRefresh, startedAt, runnerlib.ExecutionFailure,
+			fmt.Sprintf("list pods with label %s on cluster %s: %v", labelSelector, params.ClusterRef, err)), nil
+	}
+
+	// Only pods stuck on an image pull are deleted; everything else is kept.
+	stepStartedAt := time.Now().UTC()
+	removed := 0
+	for _, candidate := range candidates.Items {
+		if hasPullFailure(candidate) {
+			delErr := params.TenantKubeClient.CoreV1().Pods(candidate.Namespace).Delete(ctx, candidate.Name, metav1.DeleteOptions{})
+			if delErr != nil {
+				return failureResult(runnerlib.CapabilityCredentialRefresh, startedAt, runnerlib.ExecutionFailure,
+					fmt.Sprintf("delete pod %s/%s on cluster %s: %v", candidate.Namespace, candidate.Name, params.ClusterRef, delErr)), nil
+			}
+			removed++
+			params.Log().Info("credential-refresh: deleted pod for image pull retry",
+				"cluster", params.ClusterRef, "pod", candidate.Namespace+"/"+candidate.Name)
+		}
+	}
+
+	result := runnerlib.OperationResultSpec{
+		Capability:  runnerlib.CapabilityCredentialRefresh,
+		Status:      runnerlib.ResultSucceeded,
+		StartedAt:   startedAt,
+		CompletedAt: time.Now().UTC(),
+		Artifacts:   []runnerlib.ArtifactRef{},
+		Steps: []runnerlib.StepResult{{
+			Name:        "delete-image-pull-failed-pods",
+			Status:      runnerlib.ResultSucceeded,
+			StartedAt:   stepStartedAt,
+			CompletedAt: time.Now().UTC(),
+			Message:     fmt.Sprintf("%d pod(s) deleted for image pull retry on cluster %s", removed, params.ClusterRef),
+		}},
+	}
+	return result, nil
+}
+
+// hasPullFailure returns true when any container in the pod is in ImagePullBackOff
+// or ErrImagePull waiting state, indicating the pod needs a fresh pull attempt.
+func hasPullFailure(pod corev1.Pod) bool {
+	// stuckOnPull reports whether any status in the slice is waiting with an
+	// image-pull failure reason.
+	stuckOnPull := func(statuses []corev1.ContainerStatus) bool {
+		for i := range statuses {
+			waiting := statuses[i].State.Waiting
+			if waiting == nil {
+				continue
+			}
+			if waiting.Reason == "ImagePullBackOff" || waiting.Reason == "ErrImagePull" {
+				return true
+			}
+		}
+		return false
+	}
+	// Regular containers are inspected first, then init containers.
+	return stuckOnPull(pod.Status.ContainerStatuses) || stuckOnPull(pod.Status.InitContainerStatuses)
+}
diff --git a/internal/capability/watchdog_test.go b/internal/capability/watchdog_test.go
new file mode 100644
index 0000000..66ecf98
--- /dev/null
+++ b/internal/capability/watchdog_test.go
@@ -0,0 +1,322 @@
+package capability
+
+import (
+	"context"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	dynfake "k8s.io/client-go/dynamic/fake"
+	kubefake "k8s.io/client-go/kubernetes/fake"
+
+	"github.com/ontai-dev/conductor-sdk/runnerlib"
+)
+
+// setupWatchdogScheme returns a runtime.Scheme with enough type registrations for
+// watchdog handler tests. Only the GVKs exercised in these tests need to be present.
+func setupWatchdogScheme() *runtime.Scheme {
+	scheme := runtime.NewScheme()
+	// Every resource is registered as a v1 kind followed immediately by its
+	// "<Kind>List" companion, each backed by runtime.Unknown.
+	resources := []struct {
+		group string
+		kind  string
+	}{
+		{group: "storage.k8s.io", kind: "VolumeAttachment"},
+		{group: "", kind: "PersistentVolumeClaim"},
+		{group: "apps", kind: "Deployment"},
+		{group: "apps", kind: "StatefulSet"},
+	}
+	for _, res := range resources {
+		for _, kind := range []string{res.kind, res.kind + "List"} {
+			gvk := schema.GroupVersionKind{Group: res.group, Version: "v1", Kind: kind}
+			scheme.AddKnownTypeWithName(gvk, &runtime.Unknown{})
+		}
+	}
+	return scheme
+}
+
+// TestPodRestartHandler_NilClient verifies that a nil TenantKubeClient returns
+// a ValidationFailure without panicking.
+func TestPodRestartHandler_NilClient(t *testing.T) {
+	h := &podRestartHandler{}
+	result, err := h.Execute(context.Background(), ExecuteParams{
+		Capability:        runnerlib.CapabilityPodRestart,
+		PackInstalledName: "nginx",
+		ClusterRef:        "ccs-dev",
+	})
+	if err != nil {
+		t.Fatalf("Execute returned unexpected error: %v", err)
+	}
+	if result.Status != runnerlib.ResultFailed {
+		t.Errorf("expected ResultFailed, got %q", result.Status)
+	}
+	if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure {
+		t.Errorf("expected ValidationFailure, got %+v", result.FailureReason)
+	}
+}
+
+// TestPodRestartHandler_NilPackInstalledName verifies that a missing PackInstalledName
+// returns a ValidationFailure.
+func TestPodRestartHandler_NilPackInstalledName(t *testing.T) { + h := &podRestartHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityPodRestart, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantKubeClient: kubefake.NewSimpleClientset(), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestPodRestartHandler_DeletesPods verifies that Execute deletes pods bearing +// the pack-name label on the tenant cluster. +func TestPodRestartHandler_DeletesPods(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-abc", + Namespace: "default", + Labels: map[string]string{packNameLabel: "nginx"}, + }, + } + client := kubefake.NewSimpleClientset(pod) + h := &podRestartHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityPodRestart, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantKubeClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded, got %q; reason %+v", result.Status, result.FailureReason) + } + // Verify pod was deleted. + _, getErr := client.CoreV1().Pods("default").Get(context.Background(), "nginx-abc", metav1.GetOptions{}) + if getErr == nil { + t.Error("expected pod to be deleted but it still exists") + } +} + +// TestResourcePatchHandler_NilClient verifies that a nil TenantDynamicClient returns +// a ValidationFailure without panicking. 
+func TestResourcePatchHandler_NilClient(t *testing.T) { + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestResourcePatchHandler_NilPackInstalledName verifies that a missing PackInstalledName +// returns a ValidationFailure. +func TestResourcePatchHandler_NilPackInstalledName(t *testing.T) { + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantDynamicClient: dynfake.NewSimpleDynamicClient(setupWatchdogScheme()), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestResourcePatchHandler_EmptyCluster verifies that no Deployments with the pack +// label results in a Succeeded result (no-op is valid). 
+func TestResourcePatchHandler_EmptyCluster(t *testing.T) { + client := dynfake.NewSimpleDynamicClient(setupWatchdogScheme()) + h := &resourcePatchHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityResourcePatch, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantDynamicClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded for empty cluster, got %q; reason %+v", result.Status, result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NilClient verifies that a nil TenantDynamicClient +// returns a ValidationFailure without panicking. +func TestForceVolumeDetachHandler_NilClient(t *testing.T) { + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NilPackInstalledName verifies that a missing +// PackInstalledName returns a ValidationFailure. 
+func TestForceVolumeDetachHandler_NilPackInstalledName(t *testing.T) { + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantDynamicClient: dynfake.NewSimpleDynamicClient(setupWatchdogScheme()), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestForceVolumeDetachHandler_NoPVCs verifies that when no PVCs match the pack +// label the result is Succeeded (no-op is valid). +func TestForceVolumeDetachHandler_NoPVCs(t *testing.T) { + client := dynfake.NewSimpleDynamicClient(setupWatchdogScheme()) + h := &forceVolumeDetachHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityForceVolumeDetach, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{TenantDynamicClient: client}, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultSucceeded { + t.Errorf("expected ResultSucceeded for no PVCs, got %q; reason %+v", result.Status, result.FailureReason) + } +} + +// TestCredentialRefreshHandler_NilClient verifies that a nil TenantKubeClient +// returns a ValidationFailure without panicking. 
+func TestCredentialRefreshHandler_NilClient(t *testing.T) { + h := &credentialRefreshHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityCredentialRefresh, + PackInstalledName: "nginx", + ClusterRef: "ccs-dev", + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestCredentialRefreshHandler_NilPackInstalledName verifies that a missing +// PackInstalledName returns a ValidationFailure. +func TestCredentialRefreshHandler_NilPackInstalledName(t *testing.T) { + h := &credentialRefreshHandler{} + result, err := h.Execute(context.Background(), ExecuteParams{ + Capability: runnerlib.CapabilityCredentialRefresh, + ClusterRef: "ccs-dev", + ExecuteClients: ExecuteClients{ + TenantKubeClient: kubefake.NewSimpleClientset(), + }, + }) + if err != nil { + t.Fatalf("Execute returned unexpected error: %v", err) + } + if result.Status != runnerlib.ResultFailed { + t.Errorf("expected ResultFailed, got %q", result.Status) + } + if result.FailureReason == nil || result.FailureReason.Category != runnerlib.ValidationFailure { + t.Errorf("expected ValidationFailure, got %+v", result.FailureReason) + } +} + +// TestCredentialRefreshHandler_DeletesPullFailedPods verifies that pods in +// ImagePullBackOff state are deleted and pods in other states are skipped. 
+func TestCredentialRefreshHandler_DeletesPullFailedPods(t *testing.T) {
+	pullFailPod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "nginx-pull-fail",
+			Namespace: "default",
+			Labels:    map[string]string{packNameLabel: "nginx"},
+		},
+		Status: corev1.PodStatus{
+			ContainerStatuses: []corev1.ContainerStatus{{
+				State: corev1.ContainerState{
+					Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"},
+				},
+			}},
+		},
+	}
+	runningPod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "nginx-running",
+			Namespace: "default",
+			Labels:    map[string]string{packNameLabel: "nginx"},
+		},
+		Status: corev1.PodStatus{
+			ContainerStatuses: []corev1.ContainerStatus{{
+				State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+			}},
+		},
+	}
+	client := kubefake.NewSimpleClientset(pullFailPod, runningPod)
+	h := &credentialRefreshHandler{}
+	result, err := h.Execute(context.Background(), ExecuteParams{
+		Capability:        runnerlib.CapabilityCredentialRefresh,
+		PackInstalledName: "nginx",
+		ClusterRef:        "ccs-dev",
+		ExecuteClients:    ExecuteClients{TenantKubeClient: client},
+	})
+	if err != nil {
+		t.Fatalf("Execute returned unexpected error: %v", err)
+	}
+	if result.Status != runnerlib.ResultSucceeded {
+		t.Errorf("expected ResultSucceeded, got %q; reason %+v", result.Status, result.FailureReason)
+	}
+	// pull-fail pod must be deleted.
+	_, getErr := client.CoreV1().Pods("default").Get(context.Background(), "nginx-pull-fail", metav1.GetOptions{})
+	if getErr == nil {
+		t.Error("expected pull-fail pod to be deleted but it still exists")
+	}
+	// running pod must be preserved.
+	_, getErr = client.CoreV1().Pods("default").Get(context.Background(), "nginx-running", metav1.GetOptions{})
+	if getErr != nil {
+		t.Errorf("expected running pod to be preserved but got: %v", getErr)
+	}
+}
diff --git a/internal/capability/wrapper.go b/internal/capability/wrapper.go
index cf2ee6b..1065268 100644
--- a/internal/capability/wrapper.go
+++ b/internal/capability/wrapper.go
@@ -1019,9 +1019,10 @@ func ensureNamespaces(ctx context.Context, dynClient dynamic.Interface, manifest
 	for ns := range needed {
 		nsJSON := []byte(fmt.Sprintf(
 			`{"apiVersion":"v1","kind":"Namespace","metadata":{"name":%q}}`, ns))
+		forceNS := true
 		_, err := dynClient.Resource(namespaceGVR).Patch(
 			ctx, ns, types.ApplyPatchType, nsJSON,
-			metav1.PatchOptions{FieldManager: "conductor-pack-deploy"},
+			metav1.PatchOptions{FieldManager: "conductor-pack-deploy", Force: &forceNS},
 		)
 		if err != nil {
 			return created, fmt.Errorf("pre-create namespace %q: %w", ns, err)
@@ -1103,18 +1104,23 @@ func stageForKind(kind string) string {
 // ---------------------------------------------------------------------------
 
 // applyParsedManifest applies m to the cluster via server-side apply.
+// Force is set to true so conductor-pack-deploy takes field ownership from any
+// prior manager (e.g. kubectl) without returning a conflict error.
 func applyParsedManifest(ctx context.Context, dynClient dynamic.Interface, m parsedManifest) error {
 	gvr := gvrFromAPIVersionKind(m.apiVersion, m.kind)
+	force := true
 	if m.namespace != "" {
 		_, err := dynClient.Resource(gvr).Namespace(m.namespace).
 			Patch(ctx, m.name, types.ApplyPatchType, m.jsonData, metav1.PatchOptions{
 				FieldManager: "conductor-pack-deploy",
+				Force:        &force,
 			})
 		return err
 	}
 	_, err := dynClient.Resource(gvr).
 		Patch(ctx, m.name, types.ApplyPatchType, m.jsonData, metav1.PatchOptions{
 			FieldManager: "conductor-pack-deploy",
+			Force:        &force,
 		})
 	return err
 }
diff --git a/internal/federation/metrics.go b/internal/federation/metrics.go
new file mode 100644
index 0000000..33e11c2
--- /dev/null
+++ b/internal/federation/metrics.go
@@ -0,0 +1,29 @@
+package federation
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+var (
+	// metricActiveStreams tracks the current number of live streams accepted by FederationServer.
+	// ADR-F6 D4.
+	metricActiveStreams = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "conductor_federation_stream_active_count",
+		Help: "Current number of live streams accepted by FederationServer.",
+	})
+
+	// metricReconnectsTotal counts accepted streams per tenant cluster (initial connects included).
+	// ADR-F6 D4.
+	metricReconnectsTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "conductor_federation_stream_reconnects_total",
+			Help: "Total number of stream reconnect events observed, labeled by cluster ID.",
+		},
+		[]string{"cluster_id"},
+	)
+)
+
+func init() {
+	ctrlmetrics.Registry.MustRegister(metricActiveStreams, metricReconnectsTotal)
+}
diff --git a/internal/federation/server.go b/internal/federation/server.go
index 6fa4daf..8f78ecb 100644
--- a/internal/federation/server.go
+++ b/internal/federation/server.go
@@ -7,8 +7,10 @@ import (
 	"fmt"
 	"net"
 	"sync"
+	"sync/atomic"
 	"time"
 
+	"golang.org/x/time/rate"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/credentials"
@@ -47,6 +49,19 @@ type clusterStatus struct {
 	missedHeartbeats int
 }
 
+// FederationServerOptions configures the stream admission limits for FederationServer.
+// Values <= 0 disable the corresponding limit. ADR-F6.
+type FederationServerOptions struct {
+	// MaxConcurrentStreams is the maximum number of simultaneous active streams.
+	// When reached, new connections receive codes.ResourceExhausted.
+	// The env parser accepts [1, 1000]; <= 0 means unlimited (no semaphore). Default: 50 via env.
+	MaxConcurrentStreams int
+
+	// AdmissionRate is the token-bucket refill rate in tokens per second.
+	// Burst capacity is 2x this value. <= 0 means unlimited. Default: 5 via env.
+	AdmissionRate int
+}
+
 // FederationServer is the management-side federation gRPC server.
 // It listens on the federation port with mutual TLS, extracts cluster IDs
 // from client certificate SANs, and maintains the bidirectional stream with
@@ -63,31 +78,82 @@ type FederationServer struct {
 	mu sync.RWMutex
 	// connectedClusters maps clusterID → stream status for heartbeat tracking.
 	connectedClusters map[string]*clusterStatus
+
+	// semaphore limits concurrent active streams. nil = unlimited. ADR-F6 D1.
+	semaphore chan struct{}
+
+	// admissionLimiter rate-limits new stream accepts. nil = unlimited. ADR-F6 D2.
+	admissionLimiter *rate.Limiter
+
+	// activeCount is the live stream count, kept in sync with the semaphore. ADR-F6 D4.
+	activeCount atomic.Int64
 }
 
 // NewFederationServer constructs a FederationServer from certificate paths.
 // The server does not start until Start is called.
 // conductor-schema.md §18.
-func NewFederationServer(caCertPath, serverCertPath, serverKeyPath string, kubeClient kubernetes.Interface) (*FederationServer, error) {
+func NewFederationServer(caCertPath, serverCertPath, serverKeyPath string, kubeClient kubernetes.Interface, opts FederationServerOptions) (*FederationServer, error) {
 	tlsCfg, err := BuildServerTLSConfig(caCertPath, serverCertPath, serverKeyPath)
 	if err != nil {
 		return nil, fmt.Errorf("federation server TLS config: %w", err)
 	}
-	return &FederationServer{
-		tlsCfg:            tlsCfg,
-		kubeClient:        kubeClient,
-		connectedClusters: make(map[string]*clusterStatus),
-	}, nil
+	return newFederationServer(tlsCfg, kubeClient, opts), nil
 }
 
 // NewFederationServerFromTLS constructs a FederationServer from an already-built
 // tls.Config. Used in tests to inject a test TLS config directly.
-func NewFederationServerFromTLS(tlsCfg *tls.Config, kubeClient kubernetes.Interface) *FederationServer {
-	return &FederationServer{
+func NewFederationServerFromTLS(tlsCfg *tls.Config, kubeClient kubernetes.Interface, opts FederationServerOptions) *FederationServer {
+	return newFederationServer(tlsCfg, kubeClient, opts)
+}
+
+func newFederationServer(tlsCfg *tls.Config, kubeClient kubernetes.Interface, opts FederationServerOptions) *FederationServer {
+	s := &FederationServer{
 		tlsCfg:            tlsCfg,
 		kubeClient:        kubeClient,
 		connectedClusters: make(map[string]*clusterStatus),
 	}
+	if opts.MaxConcurrentStreams > 0 {
+		s.semaphore = make(chan struct{}, opts.MaxConcurrentStreams)
+	}
+	if opts.AdmissionRate > 0 {
+		s.admissionLimiter = rate.NewLimiter(rate.Limit(opts.AdmissionRate), 2*opts.AdmissionRate)
+	}
+	return s
+}
+
+// ActiveStreamCount returns the number of currently active streams. ADR-F6 D4.
+func (s *FederationServer) ActiveStreamCount() int64 {
+	return s.activeCount.Load()
+}
+
+// ParseFederationMaxStreams parses FEDERATION_MAX_CONCURRENT_STREAMS env value.
+// Valid range: [1, 1000]. Returns 50 (default) if empty or on invalid input.
+// ADR-F6 D1.
+func ParseFederationMaxStreams(v string) int {
+	if v == "" {
+		return 50
+	}
+	var n int
+	if _, err := fmt.Sscanf(v, "%d", &n); err != nil || n <= 0 || n > 1000 {
+		fmt.Printf("federation server: invalid FEDERATION_MAX_CONCURRENT_STREAMS %q (must be 1-1000) — using default 50\n", v)
+		return 50
+	}
+	return n
+}
+
+// ParseFederationAdmissionRate parses FEDERATION_ADMISSION_RATE env value.
+// Returns 5 (default) if empty or on invalid input (non-numeric or <= 0).
+// ADR-F6 D2. NOTE(review): Sscanf ignores trailing text, so "10abc" parses as 10 — confirm acceptable.
+func ParseFederationAdmissionRate(v string) int {
+	if v == "" {
+		return 5
+	}
+	var n int
+	if _, err := fmt.Sscanf(v, "%d", &n); err != nil || n <= 0 {
+		fmt.Printf("federation server: invalid FEDERATION_ADMISSION_RATE %q (must be >0) — using default 5\n", v)
+		return 5
+	}
+	return n
 }
 
 // Start begins listening on addr and serves the federation gRPC stream with mutual TLS.
@@ -138,12 +204,39 @@ func (s *FederationServer) ConnectedClusterIDs() []string {
 // federationStream handles a single bidirectional stream from a connected tenant.
 // It implements the grpc.ServerStream interface handler for the FederationService/Stream method.
 func (s *FederationServer) federationStream(stream grpc.ServerStream) error {
+	// D2: admission rate-limit check before semaphore acquisition. ADR-F6.
+	if s.admissionLimiter != nil && !s.admissionLimiter.Allow() {
+		return status.Errorf(codes.ResourceExhausted, "federation server: admission rate limit exceeded")
+	}
+
+	// D1: semaphore -- reject when max concurrent streams reached. ADR-F6.
+	if s.semaphore != nil {
+		select {
+		case s.semaphore <- struct{}{}:
+			// slot acquired
+		default:
+			return status.Errorf(codes.ResourceExhausted, "federation server: max concurrent stream limit reached")
+		}
+		defer func() { <-s.semaphore }()
+	}
+
+	// Track active count and update the Prometheus gauge. ADR-F6 D4.
+	s.activeCount.Add(1)
+	metricActiveStreams.Inc()
+	defer func() {
+		s.activeCount.Add(-1)
+		metricActiveStreams.Dec()
+	}()
+
 	// Extract cluster ID from the peer TLS certificate SAN.
 	clusterID, err := s.clusterIDFromStream(stream)
 	if err != nil {
 		return status.Errorf(codes.Unauthenticated, "cluster ID extraction: %v", err)
 	}
 
+	// Count this as a reconnect event (every stream accept = one connection). ADR-F6 D4.
+	metricReconnectsTotal.WithLabelValues(clusterID).Inc()
+
 	// Register this cluster as connected.
 	cs := &clusterStatus{}
 	s.mu.Lock()
diff --git a/internal/federation/server_pool_test.go b/internal/federation/server_pool_test.go
new file mode 100644
index 0000000..8816c6a
--- /dev/null
+++ b/internal/federation/server_pool_test.go
@@ -0,0 +1,155 @@
+package federation
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// acquireSlot is a test helper that takes a semaphore slot directly, bypassing TLS cert extraction.
+// It returns (release, true) on success or when no semaphore is set, and (nil, false) when full.
+func acquireSlot(s *FederationServer) (release func(), ok bool) {
+	if s.semaphore == nil {
+		return func() {}, true
+	}
+	select {
+	case s.semaphore <- struct{}{}:
+		return func() { <-s.semaphore }, true
+	default:
+		return nil, false
+	}
+}
+
+// TestFederationServer_RejectsWhenLimitReached verifies that a server with
+// limit=2 rejects the third concurrent connection with RESOURCE_EXHAUSTED.
+// ADR-F6 D1.
+func TestFederationServer_RejectsWhenLimitReached(t *testing.T) {
+	opts := FederationServerOptions{MaxConcurrentStreams: 2}
+	s := newFederationServer(nil, nil, opts)
+
+	// Acquire both slots.
+	rel1, ok1 := acquireSlot(s)
+	if !ok1 {
+		t.Fatal("expected slot 1 to be acquired")
+	}
+	defer rel1()
+	rel2, ok2 := acquireSlot(s)
+	if !ok2 {
+		t.Fatal("expected slot 2 to be acquired")
+	}
+	defer rel2()
+
+	// Third attempt must be rejected.
+	_, ok3 := acquireSlot(s)
+	if ok3 {
+		t.Error("expected slot 3 to be rejected (limit=2 reached)")
+	}
+}
+
+// TestFederationServer_AdmitsUpToLimit verifies that a server with limit=2
+// admits exactly two concurrent streams and both are recorded as active.
+// ADR-F6 D1.
+func TestFederationServer_AdmitsUpToLimit(t *testing.T) {
+	opts := FederationServerOptions{MaxConcurrentStreams: 2}
+	s := newFederationServer(nil, nil, opts)
+
+	var mu sync.Mutex
+	admitted := 0
+
+	var wg sync.WaitGroup
+	for i := 0; i < 2; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			rel, ok := acquireSlot(s)
+			if !ok {
+				return
+			}
+			defer rel()
+			mu.Lock()
+			admitted++
+			mu.Unlock()
+			// Hold the slot briefly.
+			time.Sleep(10 * time.Millisecond)
+		}()
+	}
+	wg.Wait()
+
+	if admitted != 2 {
+		t.Errorf("expected 2 admitted streams, got %d", admitted)
+	}
+}
+
+// TestActiveStreamCount_DecreasesOnDisconnect verifies that ActiveStreamCount
+// follows activeCount as the test mirrors federationStream's connect and disconnect steps.
+// ADR-F6 D4.
+func TestActiveStreamCount_DecreasesOnDisconnect(t *testing.T) {
+	opts := FederationServerOptions{MaxConcurrentStreams: 5}
+	s := newFederationServer(nil, nil, opts)
+
+	if n := s.ActiveStreamCount(); n != 0 {
+		t.Fatalf("expected ActiveStreamCount=0 before any stream, got %d", n)
+	}
+
+	// Simulate what federationStream does: acquire semaphore + track activeCount.
+	rel, ok := acquireSlot(s)
+	if !ok {
+		t.Fatal("expected slot to be acquired")
+	}
+	s.activeCount.Add(1)
+	metricActiveStreams.Inc()
+
+	if n := s.ActiveStreamCount(); n != 1 {
+		t.Errorf("expected ActiveStreamCount=1 after connect, got %d", n)
+	}
+
+	// Simulate disconnect.
+	s.activeCount.Add(-1)
+	metricActiveStreams.Dec()
+	rel()
+
+	if n := s.ActiveStreamCount(); n != 0 {
+		t.Errorf("expected ActiveStreamCount=0 after disconnect, got %d", n)
+	}
+}
+
+// TestParseFederationMaxStreams verifies the env var parser. ADR-F6 D1.
+func TestParseFederationMaxStreams(t *testing.T) {
+	cases := []struct {
+		input string
+		want  int
+	}{
+		{"", 50},
+		{"10", 10},
+		{"1000", 1000},
+		{"0", 50},    // out of range: default
+		{"1001", 50}, // out of range: default
+		{"bad", 50},  // invalid: default
+		{"-5", 50},   // negative: default
+	}
+	for _, tc := range cases {
+		if got := ParseFederationMaxStreams(tc.input); got != tc.want {
+			t.Errorf("ParseFederationMaxStreams(%q) = %d, want %d", tc.input, got, tc.want)
+		}
+	}
+}
+
+// TestParseFederationAdmissionRate verifies the parser falls back to 5 on empty, non-positive, or non-numeric input. ADR-F6 D2.
+func TestParseFederationAdmissionRate(t *testing.T) {
+	cases := []struct {
+		input string
+		want  int
+	}{
+		{"", 5},
+		{"10", 10},
+		{"1", 1},
+		{"0", 5},   // zero invalid: default
+		{"-1", 5},  // negative: default
+		{"bad", 5}, // invalid: default
+	}
+	for _, tc := range cases {
+		if got := ParseFederationAdmissionRate(tc.input); got != tc.want {
+			t.Errorf("ParseFederationAdmissionRate(%q) = %d, want %d", tc.input, got, tc.want)
+		}
+	}
+}
diff --git a/internal/identity/identity.go b/internal/identity/identity.go
index 1a5caff..3c84ab0 100644
--- a/internal/identity/identity.go
+++ b/internal/identity/identity.go
@@ -54,7 +54,7 @@ func EnsureSeamMembership(ctx context.Context, c client.Client) error {
 		Spec: seamv1alpha1.SeamMembershipSpec{
 			AppIdentityRef:    id.OperatorName(),
 			DomainIdentityRef: id.OperatorName(),
-			PrincipalRef:      "system:serviceaccount:seam-system:" + id.OperatorName(),
+			PrincipalRef:      "system:serviceaccount:ont-system:" + id.OperatorName(),
 			Tier:              "infrastructure",
 		},
 	}
diff --git a/internal/identity/identity_test.go b/internal/identity/identity_test.go
index 81dbf8f..02e4144 100644
--- a/internal/identity/identity_test.go
+++ b/internal/identity/identity_test.go
@@ -92,6 +92,10 @@ func TestEnsureSeamMembership_Creates(t *testing.T) {
 	if sm.Spec.Tier != "infrastructure" {
 		t.Errorf("Tier = %q, want %q", sm.Spec.Tier, "infrastructure")
 	}
+	wantPrincipal := "system:serviceaccount:ont-system:conductor"
+	if sm.Spec.PrincipalRef != wantPrincipal {
+		t.Errorf("PrincipalRef = %q, want %q (conductor runs in ont-system, not seam-system)", sm.Spec.PrincipalRef, wantPrincipal)
+	}
 }
 
 func TestEnsureSeamMembership_Idempotent(t *testing.T) {
diff --git a/internal/kernel/agent.go b/internal/kernel/agent.go
index 19d766a..dc7b2f3 100644
--- a/internal/kernel/agent.go
+++ b/internal/kernel/agent.go
@@ -333,12 +333,75 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub
 			execCtx.ClusterRef)
 	}
 
+	// PackSourceVersionLoop — role=management only. Polls Helm chart repository
+	// index.yaml for each Helm-backed PackDelivery in seam-system and emits an
+	// UpstreamVersionAvailable DriftSignal when a newer chart version is found.
+	// RECON-CMN1.
+	var packSourceVersionLoop *agent.PackSourceVersionLoop
+	if role == RoleManagement {
+		packSourceVersionLoop = agent.NewPackSourceVersionLoop(dynamicClient, ns)
+		fmt.Printf("conductor agent: cluster=%q pack source version loop enabled (management role)\n",
+			execCtx.ClusterRef)
+	}
+
+	// ESOHealthLoop — role=management only. Polls ExternalSecret CRs in seam-system
+	// and emits ExternalSecretSyncFailed DriftSignals on sync errors.
+	// Skips cleanly when ESO CRDs are not installed. RECON-K3.
+	var esoHealthLoop *agent.ESOHealthLoop
+	if role == RoleManagement {
+		esoHealthLoop = agent.NewESOHealthLoop(dynamicClient, ns, execCtx.ClusterRef)
+		fmt.Printf("conductor agent: cluster=%q ESO health loop enabled (management role)\n",
+			execCtx.ClusterRef)
+	}
+
+	// PolicyReportDriftLoop — role=management only. Polls Kyverno PolicyReport and
+	// ClusterPolicyReport CRs and emits KyvernoPolicyViolation DriftSignals on failures.
+	// Skips cleanly when Kyverno CRDs are not installed. RECON-L2.
+	var policyReportDriftLoop *agent.PolicyReportDriftLoop
+	if role == RoleManagement {
+		policyReportDriftLoop = agent.NewPolicyReportDriftLoop(dynamicClient, ns, execCtx.ClusterRef)
+		fmt.Printf("conductor agent: cluster=%q policy report drift loop enabled (management role)\n",
+			execCtx.ClusterRef)
+	}
+
+	// VulnerabilityDriftLoop — role=management only. Polls Trivy Operator
+	// VulnerabilityReport CRs and emits VulnerableImageDetected DriftSignals for
+	// CRITICAL severity findings. Skips cleanly when Trivy CRDs are not installed. RECON-M2.
+	var vulnerabilityDriftLoop *agent.VulnerabilityDriftLoop
+	if role == RoleManagement {
+		vulnerabilityDriftLoop = agent.NewVulnerabilityDriftLoop(dynamicClient, ns, execCtx.ClusterRef)
+		fmt.Printf("conductor agent: cluster=%q vulnerability drift loop enabled (management role)\n",
+			execCtx.ClusterRef)
+	}
+
+	// BackupHealthLoop — role=management only. Polls Velero BackupStorageLocation and
+	// Backup CRs; emits BackupStorageUnavailable and BackupRPOBreached DriftSignals.
+	// Skips cleanly when Velero CRDs are not installed. RECON-N2.
+	var backupHealthLoop *agent.BackupHealthLoop
+	if role == RoleManagement {
+		backupHealthLoop = agent.NewBackupHealthLoop(dynamicClient, ns, execCtx.ClusterRef)
+		fmt.Printf("conductor agent: cluster=%q backup health loop enabled (management role)\n",
+			execCtx.ClusterRef)
+	}
+
 	if runtimeDriftHandler != nil {
 		runtimeDriftHandler.WithOperatorContextWatcher(ocWatcher)
 	}
 	if packPodHealthLoop != nil {
 		packPodHealthLoop.WithOperatorContextWatcher(ocWatcher)
 	}
+	if esoHealthLoop != nil {
+		esoHealthLoop.WithOperatorContextWatcher(ocWatcher)
+	}
+	if policyReportDriftLoop != nil {
+		policyReportDriftLoop.WithOperatorContextWatcher(ocWatcher)
+	}
+	if vulnerabilityDriftLoop != nil {
+		vulnerabilityDriftLoop.WithOperatorContextWatcher(ocWatcher)
+	}
+	if backupHealthLoop != nil {
+		backupHealthLoop.WithOperatorContextWatcher(ocWatcher)
+	}
 
 	// Phase 3b — Start the federation channel listener/client.
 	// Management Conductor: start FederationServer when FEDERATION_CA_CERT_PATH,
@@ -354,7 +417,11 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub
 
 	if fedCACertPath != "" && fedServerCertPath != "" && fedServerKeyPath != "" {
 		// Management Conductor: start the federation server.
-		fedServer, fedErr := federation.NewFederationServer(fedCACertPath, fedServerCertPath, fedServerKeyPath, nil)
+		fedOpts := federation.FederationServerOptions{
+			MaxConcurrentStreams: federation.ParseFederationMaxStreams(os.Getenv("FEDERATION_MAX_CONCURRENT_STREAMS")),
+			AdmissionRate:        federation.ParseFederationAdmissionRate(os.Getenv("FEDERATION_ADMISSION_RATE")),
+		}
+		fedServer, fedErr := federation.NewFederationServer(fedCACertPath, fedServerCertPath, fedServerKeyPath, nil, fedOpts)
 		if fedErr != nil {
 			return fmt.Errorf("conductor agent: build federation server: %w", fedErr)
 		}
@@ -453,7 +520,7 @@ func RunAgent(goCtx context.Context, execCtx config.ExecutionContext, client kub
 		"", // identity: resolved from hostname inside RunLeaderElection
 		agent.LeaderCallbacks{
 			OnStartedLeading: func(leaderCtx context.Context) {
-				onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, dynamicClient)
+				onLeaderStart(leaderCtx, execCtx.ClusterRef, ns, manifest, publisher, reconciler, signingLoop, snapshotPullLoop, packInstancePullLoop, packReceiptDriftLoop, rbacProfilePullLoop, rbacPolicyPullLoop, driftSignalHandler, talosVersionDriftLoop, kubernetesVersionDriftLoop, packPodHealthLoop, runtimeDriftHandler, ocWatcher, clusterNodeHealthLoop, packSourceVersionLoop, esoHealthLoop, policyReportDriftLoop, vulnerabilityDriftLoop, backupHealthLoop, dynamicClient)
 			},
 			OnStoppedLeading: func() {
 				fmt.Printf("conductor agent: cluster=%q lost leadership — entering standby\n",
@@ -490,6 +557,11 @@ func onLeaderStart(
 	runtimeDriftHandler *agent.RuntimeDriftHandler,
 	ocWatcher *agent.OperatorContextWatcher,
 	clusterNodeHealthLoop *agent.ClusterNodeHealthLoop,
+	packSourceVersionLoop *agent.PackSourceVersionLoop,
+	esoHealthLoop *agent.ESOHealthLoop,
+	policyReportDriftLoop *agent.PolicyReportDriftLoop,
+	vulnerabilityDriftLoop *agent.VulnerabilityDriftLoop,
+	backupHealthLoop *agent.BackupHealthLoop,
 	dynamicClient dynamic.Interface,
 ) {
 	// Publish capability manifest to RunnerConfig status with background retry.
@@ -625,6 +697,42 @@ func onLeaderStart(
 		go clusterNodeHealthLoop.Run(leaderCtx, reconcileInterval)
 	}
 
+	// Start PackSourceVersionLoop (management cluster only). Polls Helm chart repository
+	// index.yaml for each Helm-backed PackDelivery in the management namespace and emits
+	// UpstreamVersionAvailable DriftSignals every packVersionInterval (6h). RECON-CMN1.
+	const packVersionInterval = 6 * time.Hour
+	if packSourceVersionLoop != nil {
+		go packSourceVersionLoop.Run(leaderCtx, packVersionInterval)
+	}
+
+	// Start ESOHealthLoop (management cluster only). Polls ExternalSecret CRs for sync
+	// failures and emits ExternalSecretSyncFailed DriftSignals. Skips when ESO CRDs absent.
+	// RECON-K3.
+	if esoHealthLoop != nil {
+		go esoHealthLoop.Run(leaderCtx, reconcileInterval)
+	}
+
+	// Start PolicyReportDriftLoop (management cluster only). Polls Kyverno PolicyReport and
+	// ClusterPolicyReport CRs and emits KyvernoPolicyViolation DriftSignals. Skips when
+	// Kyverno CRDs absent. RECON-L2.
+	if policyReportDriftLoop != nil {
+		go policyReportDriftLoop.Run(leaderCtx, reconcileInterval)
+	}
+
+	// Start VulnerabilityDriftLoop (management cluster only). Polls Trivy Operator
+	// VulnerabilityReport CRs and emits VulnerableImageDetected DriftSignals for CRITICAL
+	// findings. Skips when Trivy CRDs absent. RECON-M2.
+	if vulnerabilityDriftLoop != nil {
+		go vulnerabilityDriftLoop.Run(leaderCtx, reconcileInterval)
+	}
+
+	// Start BackupHealthLoop (management cluster only). Polls Velero BackupStorageLocation
+	// and Backup CRs; emits BackupStorageUnavailable and BackupRPOBreached DriftSignals.
+	// Skips when Velero CRDs absent. RECON-N2.
+	if backupHealthLoop != nil {
+		go backupHealthLoop.Run(leaderCtx, reconcileInterval)
+	}
+
 	// Mark InfrastructureTalosCluster Ready=True (tenant clusters only).
 	// snapshotPullLoop non-nil indicates role=tenant. Conductor signals readiness
 	// to management once leadership is established. guardian-schema.md §3.
diff --git a/test/e2e/backup_health_loop_test.go b/test/e2e/backup_health_loop_test.go
new file mode 100644
index 0000000..c8bf20c
--- /dev/null
+++ b/test/e2e/backup_health_loop_test.go
@@ -0,0 +1,41 @@
+package e2e_test
+
+// backup_health_loop_test.go -- live cluster verification that BackupHealthLoop
+// correctly detects Velero BSL unavailability and RPO breaches, emitting
+// BackupStorageUnavailable and BackupRPOBreached DriftSignals.
+//
+// Pre-conditions:
+//   - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational.
+//   - Velero PackDelivery deployed to seam-system (RECON-N1 closed).
+//   - At least one BackupStorageLocation present in seam-system.
+//   - Conductor agent role=management running with BackupHealthLoop enabled.
+//
+// What these tests are meant to verify once un-skipped (RECON-N2):
+//   - BackupHealthLoop creates a BackupStorageUnavailable DriftSignal in seam-system
+//     when a BackupStorageLocation is not in phase=Available.
+//   - BackupHealthLoop creates a BackupRPOBreached DriftSignal when no successful
+//     backup is younger than 25 hours.
+//   - DriftSignal spec.signalKind == "BackupStorageUnavailable" or "BackupRPOBreached".
+//   - After the BSL returns to Available and a recent backup completes, the loop
+//     confirms both DriftSignals (state=confirmed).
+//   - Under AutonomyLevel=observe-only, no DriftSignal is created.
+
+import (
+	"testing"
+)
+
+func TestBackupHealthLoop_BSLUnavailableEmitsDriftSignal(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and Velero deployed with a degraded BackupStorageLocation with RECON-N1 and RECON-N2 closed")
+}
+
+func TestBackupHealthLoop_RPOBreachedEmitsDriftSignal(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and Velero deployed with no successful backup within RPO window with RECON-N1 and RECON-N2 closed")
+}
+
+func TestBackupHealthLoop_ConfirmsSignalAfterRecovery(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and existing BackupStorageUnavailable or BackupRPOBreached DriftSignal with RECON-N2 closed")
+}
+
+func TestBackupHealthLoop_ObserveOnly_NoSignalCreated(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-N2 closed")
+}
diff --git a/test/e2e/eso_health_loop_test.go b/test/e2e/eso_health_loop_test.go
new file mode 100644
index 0000000..a702ea2
--- /dev/null
+++ b/test/e2e/eso_health_loop_test.go
@@ -0,0 +1,34 @@
+package e2e_test
+
+// eso_health_loop_test.go -- live cluster verification that ESOHealthLoop correctly
+// detects ExternalSecret sync failures and emits ExternalSecretSyncFailed DriftSignals.
+//
+// Pre-conditions:
+//   - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational.
+//   - External Secrets Operator PackDelivery deployed to seam-system (RECON-K1 closed).
+//   - At least one ExternalSecret CR present in seam-system pointing to a secret store.
+//   - Conductor agent role=management running with ESOHealthLoop enabled.
+//
+// What these tests are meant to verify once un-skipped (RECON-K3):
+//   - ESOHealthLoop creates an ExternalSecretSyncFailed DriftSignal in seam-system
+//     when an ExternalSecret has a Ready=False or Synced=False condition.
+//   - DriftSignal spec.signalKind == "ExternalSecretSyncFailed".
+//   - After the ExternalSecret recovers (Ready=True), the loop confirms the
+//     DriftSignal (state=confirmed) within the next poll interval.
+//   - Under AutonomyLevel=observe-only, no DriftSignal is created.
+
+import (
+	"testing"
+)
+
+func TestESOHealthLoop_SyncFailedEmitsDriftSignal(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and ESO PackDelivery deployed to seam-system with RECON-K1 and RECON-K3 closed")
+}
+
+func TestESOHealthLoop_ConfirmsSignalOnRecovery(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and an existing ExternalSecretSyncFailed DriftSignal with RECON-K1 and RECON-K3 closed")
+}
+
+func TestESOHealthLoop_ObserveOnly_NoSignalCreated(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-K3 closed")
+}
diff --git a/test/e2e/pack_source_version_loop_test.go b/test/e2e/pack_source_version_loop_test.go
new file mode 100644
index 0000000..9e17bc8
--- /dev/null
+++ b/test/e2e/pack_source_version_loop_test.go
@@ -0,0 +1,36 @@
+package e2e_test
+
+// pack_source_version_loop_test.go -- live cluster verification that the
+// PackSourceVersionLoop correctly detects and signals upstream Helm chart
+// version availability for extension PackDeliveries on ccs-mgmt.
+//
+// Pre-conditions:
+//   - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational.
+//   - At least one Helm-backed PackDelivery deployed to seam-system (e.g., Dex).
+//   - Helm chart repository at 10.20.0.1:5000 serving index.yaml with a newer
+//     chart version than the one referenced by the PackDelivery.
+//   - Conductor agent role=management running with PackSourceVersionLoop enabled.
+//
+// What these tests are meant to verify once un-skipped (RECON-CMN1):
+//   - PackSourceVersionLoop detects the version gap within one poll interval.
+//   - UpstreamVersionAvailable DriftSignal created in seam-system for the pack.
+//   - DriftSignal spec.signalKind == "UpstreamVersionAvailable".
+//   - After updating the PackDelivery to the latest chart version, the loop
+//     confirms the DriftSignal (state=confirmed) within the next poll interval.
+
+import (
+	"testing"
+)
+
+// TestPackSourceVersionLoop_DetectsAndSignalsNewChartVersion verifies the full
+// upstream version detection and DriftSignal lifecycle. RECON-CMN1. NOTE(review): skip message cites RECON-K1 — confirm.
+func TestPackSourceVersionLoop_DetectsAndSignalsNewChartVersion(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and a Helm-backed PackDelivery in seam-system with an available newer chart version and RECON-K1 closed")
+}
+
+// TestPackSourceVersionLoop_ConfirmsSignalAfterVersionUpdate verifies that the
+// loop confirms an existing UpstreamVersionAvailable signal after the PackDelivery
+// spec.chartVersion is updated to match the latest index version. RECON-CMN1.
+func TestPackSourceVersionLoop_ConfirmsSignalAfterVersionUpdate(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and an existing UpstreamVersionAvailable DriftSignal on a Helm-backed PackDelivery and RECON-K1 closed")
+}
diff --git a/test/e2e/policy_report_drift_loop_test.go b/test/e2e/policy_report_drift_loop_test.go
new file mode 100644
index 0000000..dea3970
--- /dev/null
+++ b/test/e2e/policy_report_drift_loop_test.go
@@ -0,0 +1,34 @@
+package e2e_test
+
+// policy_report_drift_loop_test.go -- live cluster verification that PolicyReportDriftLoop
+// correctly detects Kyverno policy violations and emits KyvernoPolicyViolation DriftSignals.
+//
+// Pre-conditions:
+//   - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational.
+//   - Kyverno PackDelivery deployed to seam-system (RECON-L1 closed).
+//   - At least one ClusterPolicy or Policy active; at least one ClusterPolicyReport present.
+//   - Conductor agent role=management running with PolicyReportDriftLoop enabled.
+//
+// What these tests are meant to verify once un-skipped (RECON-L2):
+//   - PolicyReportDriftLoop creates a KyvernoPolicyViolation DriftSignal in seam-system
+//     when a PolicyReport or ClusterPolicyReport contains at least one fail result.
+//   - DriftSignal spec.signalKind == "KyvernoPolicyViolation".
+//   - After the policy violation is remediated (fail result removed), the loop confirms
+//     the DriftSignal (state=confirmed).
+//   - Under AutonomyLevel=observe-only, no DriftSignal is created.
+
+import (
+	"testing"
+)
+
+func TestPolicyReportDriftLoop_ViolationEmitsDriftSignal(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and Kyverno deployed with a failing ClusterPolicyReport with RECON-L1 and RECON-L2 closed")
+}
+
+func TestPolicyReportDriftLoop_ConfirmsSignalAfterRemediation(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and an existing KyvernoPolicyViolation DriftSignal with RECON-L2 closed")
+}
+
+func TestPolicyReportDriftLoop_ObserveOnly_NoSignalCreated(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-L2 closed")
+}
diff --git a/test/e2e/vulnerability_drift_loop_test.go b/test/e2e/vulnerability_drift_loop_test.go
new file mode 100644
index 0000000..c6523b7
--- /dev/null
+++ b/test/e2e/vulnerability_drift_loop_test.go
@@ -0,0 +1,34 @@
+package e2e_test
+
+// vulnerability_drift_loop_test.go -- live cluster verification that VulnerabilityDriftLoop
+// correctly detects CRITICAL vulnerabilities and emits VulnerableImageDetected DriftSignals.
+//
+// Pre-conditions:
+//   - MGMT_KUBECONFIG set; ccs-mgmt fully onboarded with Guardian operational.
+//   - Trivy Operator PackDelivery deployed to seam-system (RECON-M1 closed).
+//   - At least one VulnerabilityReport present in seam-system with scan results.
+//   - Conductor agent role=management running with VulnerabilityDriftLoop enabled.
+//
+// What these tests are meant to verify once un-skipped (RECON-M2):
+//   - VulnerabilityDriftLoop creates a VulnerableImageDetected DriftSignal in seam-system
+//     when a VulnerabilityReport contains at least one CRITICAL severity vulnerability.
+//   - DriftSignal spec.signalKind == "VulnerableImageDetected".
+//   - After the image is updated to a patched version (criticalCount drops to zero),
+//     the loop confirms the DriftSignal (state=confirmed).
+//   - Under AutonomyLevel=observe-only, no DriftSignal is created.
+
+import (
+	"testing"
+)
+
+func TestVulnerabilityDriftLoop_CriticalVulnEmitsDriftSignal(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and Trivy Operator deployed with a VulnerabilityReport containing CRITICAL CVEs with RECON-M1 and RECON-M2 closed")
+}
+
+func TestVulnerabilityDriftLoop_ConfirmsSignalAfterImageUpdate(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and an existing VulnerableImageDetected DriftSignal with RECON-M2 closed")
+}
+
+func TestVulnerabilityDriftLoop_ObserveOnly_NoSignalCreated(t *testing.T) {
+	t.Skip("requires MGMT_KUBECONFIG and OperatorContext with autonomyLevel=observe-only with RECON-M2 closed")
+}
diff --git a/test/integration/federation/stream_integration_test.go b/test/integration/federation/stream_integration_test.go
index cb12e8b..4ffa890 100644
--- a/test/integration/federation/stream_integration_test.go
+++ b/test/integration/federation/stream_integration_test.go
@@ -209,7 +209,7 @@ func TestStream_HeartBeat_ServerRespondsWithACK(t *testing.T) {
 	if err != nil {
 		t.Fatalf("server TLS: %v", err)
 	}
-	srv := federation.NewFederationServerFromTLS(serverTLS, nil)
+	srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{})
 	addr, _ := startStreamServer(t, srv)
 
 	clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath)
@@ -264,7 +264,7 @@ func TestStream_AuditEventBatch_ServerRespondsWithAck(t *testing.T) {
 		t.Fatalf("server TLS: %v", err)
 	}
 	// kubeClient is nil — server skips ConfigMap creation but still ACKs.
-	srv := federation.NewFederationServerFromTLS(serverTLS, nil)
+	srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{})
 	addr, _ := startStreamServer(t, srv)
 
 	clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath)
@@ -327,7 +327,7 @@ func TestStream_ClusterID_ExtractedFromClientCert(t *testing.T) {
 	if err != nil {
 		t.Fatalf("server TLS: %v", err)
 	}
-	srv := federation.NewFederationServerFromTLS(serverTLS, nil)
+	srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{})
 	addr, _ := startStreamServer(t, srv)
 
 	clientTLS, err := federation.BuildClientTLSConfig(caPath, clientCertPath, clientKeyPath)
@@ -374,7 +374,7 @@ func TestStream_WALReplay_OnReconnect(t *testing.T) {
 	if err != nil {
 		t.Fatalf("server TLS: %v", err)
 	}
-	srv := federation.NewFederationServerFromTLS(serverTLS, nil)
+	srv := federation.NewFederationServerFromTLS(serverTLS, nil, federation.FederationServerOptions{})
 	addr, _ := startStreamServer(t, srv)
 
 	// Pre-populate WAL with 3 entries; ACK sequence 1.
diff --git a/test/unit/agent/capability_publisher_test.go b/test/unit/agent/capability_publisher_test.go index a4b2f55..accc6d6 100644 --- a/test/unit/agent/capability_publisher_test.go +++ b/test/unit/agent/capability_publisher_test.go @@ -18,9 +18,9 @@ import ( ) var runnerConfigGVR = schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", - Version: "v1alpha1", - Resource: "infrastructurerunnerconfigs", + Group: "seam.ontai.dev", + Version: "v1alpha1", + Resource: "runnerconfigs", } // makeRunnerConfig constructs an Unstructured RunnerConfig with optional capabilities @@ -28,7 +28,7 @@ var runnerConfigGVR = schema.GroupVersionResource{ func makeRunnerConfig(name, namespace string, hasCaps bool) *unstructured.Unstructured { obj := &unstructured.Unstructured{ Object: map[string]interface{}{ - "apiVersion": "infrastructure.ontai.dev/v1alpha1", + "apiVersion": "seam.ontai.dev/v1alpha1", "kind": "RunnerConfig", "metadata": map[string]interface{}{ "name": name, @@ -55,15 +55,15 @@ func newFakeDynamicClient(scheme *runtime.Scheme) *dynamicfake.FakeDynamicClient // Register the RunnerConfig GVR in the RESTMapper by adding it to the scheme. // dynamicfake uses the scheme to resolve GVKs; we add a dummy unstructured type. gvk := schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", } scheme.AddKnownTypeWithName(gvk, &runtime.Unknown{}) gvkList := schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Kind: "InfrastructureRunnerConfigList", + Kind: "RunnerConfigList", } scheme.AddKnownTypeWithName(gvkList, &runtime.Unknown{}) _ = meta.NewDefaultRESTMapper(nil) @@ -137,11 +137,11 @@ func TestCapabilityPublisher_ConstructsWithoutPanic(t *testing.T) { // fake tracker knows the list kind mapping. 
func newAllFakeDynamicClient(scheme *runtime.Scheme) *dynamicfake.FakeDynamicClient { scheme.AddKnownTypeWithName(schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", + Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "RunnerConfig", }, &unstructured.Unstructured{}) _ = meta.NewDefaultRESTMapper(nil) return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, - map[schema.GroupVersionResource]string{runnerConfigGVR: "InfrastructureRunnerConfigList"}, + map[schema.GroupVersionResource]string{runnerConfigGVR: "RunnerConfigList"}, ) } diff --git a/test/unit/capability/platform_test.go b/test/unit/capability/platform_test.go index 45dc17e..dca2dd7 100644 --- a/test/unit/capability/platform_test.go +++ b/test/unit/capability/platform_test.go @@ -36,6 +36,7 @@ var platformKindToResource = map[string]string{ "PKIRotation": "pkirotations", "ClusterReset": "clusterresets", "HardeningProfile": "hardeningprofiles", + "MachineConfig": "machineconfigs", } // seamKindToResource maps seam.ontai.dev Kind names to GVR resources. 
@@ -160,6 +161,10 @@ func (s *stubTalosClient) Reboot(_ context.Context) error { s.rebootCalled = true return s.rebootErr } +func (s *stubTalosClient) RebootPowercycle(_ context.Context) error { + s.rebootCalled = true + return s.rebootErr +} func (s *stubTalosClient) Reset(_ context.Context, _ bool) error { s.resetCalled = true return s.resetErr diff --git a/test/unit/compiler/wrapper_runner_rbac_test.go b/test/unit/compiler/wrapper_runner_rbac_test.go index 651298d..fc11c5b 100644 --- a/test/unit/compiler/wrapper_runner_rbac_test.go +++ b/test/unit/compiler/wrapper_runner_rbac_test.go @@ -203,3 +203,50 @@ func TestDispatcherRunnerRole_NotGeneratedWithoutClusterName(t *testing.T) { t.Errorf("dispatcher-runner.yaml was generated without --cluster-name; must not be present") } } + +// TestWatchdogQueueYAML_EmittedInPostBootstrap verifies that watchdog-queue.yaml is +// generated in 05-post-bootstrap when --cluster-name is provided. +func TestWatchdogQueueYAML_EmittedInPostBootstrap(t *testing.T) { + bin := buildCompiler(t) + out := t.TempDir() + cmd := exec.Command(bin, "enable", "--cluster-name", "ccs-mgmt", "--output", out) + if output, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("compiler enable failed: %v\n%s", err, output) + } + + path := filepath.Join(out, "05-post-bootstrap", "watchdog-queue.yaml") + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("watchdog-queue.yaml not generated: %v", err) + } + content := string(raw) + + if !strings.Contains(content, "watchdog-queue") { + t.Error("watchdog-queue.yaml does not contain 'watchdog-queue' name") + } + if !strings.Contains(content, "ont-system") { + t.Error("watchdog-queue.yaml is not scoped to ont-system namespace") + } + if !strings.Contains(content, "seam-pack-deploy") { + t.Error("watchdog-queue.yaml does not reference ClusterQueue seam-pack-deploy") + } + if !strings.Contains(content, "LocalQueue") { + t.Error("watchdog-queue.yaml is not a LocalQueue kind") + } +} + +// 
TestWatchdogQueueYAML_NotGeneratedWithoutClusterName verifies that watchdog-queue.yaml +// is NOT generated when --cluster-name is absent. +func TestWatchdogQueueYAML_NotGeneratedWithoutClusterName(t *testing.T) { + bin := buildCompiler(t) + out := t.TempDir() + cmd := exec.Command(bin, "enable", "--output", out) + if output, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("compiler enable failed: %v\n%s", err, output) + } + + path := filepath.Join(out, "05-post-bootstrap", "watchdog-queue.yaml") + if _, err := os.Stat(path); err == nil { + t.Errorf("watchdog-queue.yaml was generated without --cluster-name; must not be present") + } +} diff --git a/test/unit/federation/federation_stream_test.go b/test/unit/federation/federation_stream_test.go index 6ba1aec..2eac4f6 100644 --- a/test/unit/federation/federation_stream_test.go +++ b/test/unit/federation/federation_stream_test.go @@ -38,7 +38,7 @@ func setupStreamTest(t *testing.T) *streamTestEnv { serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) // Use a fake kubeClient in tests (nil — server skips ConfigMap creation). 
- srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } diff --git a/test/unit/federation/federation_tls_test.go b/test/unit/federation/federation_tls_test.go index 03d347f..2def2ca 100644 --- a/test/unit/federation/federation_tls_test.go +++ b/test/unit/federation/federation_tls_test.go @@ -286,7 +286,7 @@ func TestFederationServer_gRPC_AcceptsValidCert(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -334,7 +334,7 @@ func TestFederationServer_gRPC_RejectsNoCert(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -389,7 +389,7 @@ func TestFederationServer_gRPC_RejectsWrongCA(t *testing.T) { serverCertPEM, serverKeyPEM := serverCA.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, serverCA.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, 
serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) } @@ -460,7 +460,7 @@ func TestFederationClient_ClusterIDExtraction(t *testing.T) { serverCertPEM, serverKeyPEM := ca.issueServerCert(t, []string{"localhost"}) serverCertPath, serverKeyPath, caPath := writeTempCerts(t, serverCertPEM, serverKeyPEM, ca.caPEM()) - srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil) + srv, err := federation.NewFederationServer(caPath, serverCertPath, serverKeyPath, nil, federation.FederationServerOptions{}) if err != nil { t.Fatalf("NewFederationServer: %v", err) }