From 8da9848ad948fbf2db305683ffe5d62e75da3538 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Tue, 24 Feb 2026 22:26:12 +0100 Subject: [PATCH 1/2] [WIP] Allow deploy multiple clusters in same ns --- api/v1alpha1/nodeset_types.go | 6 + cmd/sconfigcontroller/main.go | 3 +- .../crd/bases/slurm.nebius.ai_nodesets.yaml | 6 + helm/nodesets/templates/nodeset.yaml | 2 + helm/nodesets/tests/basic_test.yaml | 28 ++++ helm/nodesets/values.yaml | 5 +- helm/slurm-cluster/templates/_helpers.tpl | 8 +- .../templates/slurm-cluster-cr.yaml | 8 +- .../templates/slurm-scripts-cm.yaml | 2 +- .../tests/exporter-rbac_test.yaml | 8 +- .../tests/volume-sources-filtering_test.yaml | 4 +- .../templates/slurmcluster-crd.yaml | 6 + helm/soperator/crds/slurmcluster-crd.yaml | 6 + internal/consts/annotation.go | 2 - .../clustercontroller/accounting.go | 2 +- .../clustercontroller/sconfigcontroller.go | 2 +- .../controller/nodesetcontroller/reconcile.go | 6 +- .../jailedconfig_controller.go | 18 ++- .../jailedconfig_controller_test.go | 1 + .../workertopology_controller.go | 47 ++++--- .../workertopology_controller_update_test.go | 28 ++-- internal/naming/naming.go | 15 +- internal/render/accounting/deployment.go | 2 +- internal/render/accounting/deployment_test.go | 2 +- internal/render/common/configmap.go | 3 +- internal/render/rest/rest.go | 2 +- .../render/sconfigcontroller/deployment.go | 2 +- internal/utils/resourcegetter/cluster.go | 29 ---- internal/utils/resourcegetter/nodeset.go | 9 +- internal/values/slurm_accounting.go | 2 +- internal/values/slurm_accounting_test.go | 2 +- internal/values/slurm_controller.go | 4 +- internal/values/slurm_login.go | 2 +- internal/values/slurm_nodeset.go | 2 +- internal/webhook/v1alpha1/nodeset_webhook.go | 26 +--- .../webhook/v1alpha1/nodeset_webhook_test.go | 128 +++++++++--------- 36 files changed, 230 insertions(+), 198 deletions(-) diff --git a/api/v1alpha1/nodeset_types.go b/api/v1alpha1/nodeset_types.go index a3dd64b83..feca25ffc 100644 --- a/api/v1alpha1/nodeset_types.go +++ b/api/v1alpha1/nodeset_types.go @@ -113,6 +113,12 @@ const ( // NodeSetSpec defines the desired state of NodeSet type NodeSetSpec struct { + // SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to. + // Must be in the same namespace as the NodeSet. + // + // +kubebuilder:validation:Required + SlurmClusterRefName string `json:"slurmClusterRefName"` + // Replicas specifies the number of worker nodes in the NodeSet. // // Defaults to 1 if not specified. diff --git a/cmd/sconfigcontroller/main.go b/cmd/sconfigcontroller/main.go index fca7d7344..1da45c012 100644 --- a/cmd/sconfigcontroller/main.go +++ b/cmd/sconfigcontroller/main.go @@ -176,7 +176,7 @@ func main() { WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, - LeaderElectionID: "vqeyz6ae.nebius.ai", + LeaderElectionID: clusterName + ".vqeyz6ae.nebius.ai", LeaderElectionReleaseOnCancel: true, Cache: cache.Options{ DefaultNamespaces: map[string]cache.Config{ @@ -215,6 +215,7 @@ func main() { if err = (sconfigcontroller.NewJailedConfigReconciler( mgr.GetClient(), mgr.GetScheme(), + clusterName, slurmAPIClient, jailFs, reconfigurePollInterval, diff --git a/config/crd/bases/slurm.nebius.ai_nodesets.yaml b/config/crd/bases/slurm.nebius.ai_nodesets.yaml index adb7aecea..08674ec71 100644 --- a/config/crd/bases/slurm.nebius.ai_nodesets.yaml +++ b/config/crd/bases/slurm.nebius.ai_nodesets.yaml @@ -2680,6 +2680,11 @@ spec: Defaults to 1 if not specified. format: int32 type: integer + slurmClusterRefName: + description: |- + SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string slurmd: description: Slurmd defines the Slurm worker daemon configuration. properties: @@ -11028,6 +11033,7 @@ spec: type: object required: - munge + - slurmClusterRefName - slurmd type: object status: diff --git a/helm/nodesets/templates/nodeset.yaml b/helm/nodesets/templates/nodeset.yaml index 3c7c5e91e..ab72b91d8 100644 --- a/helm/nodesets/templates/nodeset.yaml +++ b/helm/nodesets/templates/nodeset.yaml @@ -19,6 +19,8 @@ metadata: {{- end }} spec: + slurmClusterRefName: {{ $.Values.slurmClusterRefName | quote }} + {{- with (.replicas | default 1) }} replicas: {{ . }} {{- end }} diff --git a/helm/nodesets/tests/basic_test.yaml b/helm/nodesets/tests/basic_test.yaml index 7d6015ca0..5b89e7b7b 100644 --- a/helm/nodesets/tests/basic_test.yaml +++ b/helm/nodesets/tests/basic_test.yaml @@ -129,6 +129,34 @@ tests: - exists: path: metadata.labels["app.kubernetes.io/managed-by"] + - it: should set slurmClusterRefName from top-level value + set: + slurmClusterRefName: "my-cluster" + nodesets: + - name: test-workers + slurmd: + image: + repository: "test/slurm" + resources: + cpu: "1" + memory: "1Gi" + volumes: + spool: + emptyDir: {} + jail: + emptyDir: {} + jailSubMounts: [] + munge: + image: + repository: "test/munge" + resources: + cpu: "100m" + memory: "128Mi" + asserts: + - equal: + path: spec.slurmClusterRefName + value: "my-cluster" + - it: should set correct namespace set: nodesets: diff --git a/helm/nodesets/values.yaml b/helm/nodesets/values.yaml index 7c33ca1b0..a4e575857 100644 --- a/helm/nodesets/values.yaml +++ b/helm/nodesets/values.yaml @@ -5,6 +5,9 @@ # Global settings nameOverride: "" fullnameOverride: "" +# Name of the SlurmCluster this NodeSets release belongs to. +# Must be in the same namespace as the NodeSets. +slurmClusterRefName: "soperator" # Priority Classes configuration # Define priority classes that can be used by NodeSets priorityClasses: @@ -122,7 +125,7 @@ nodesets: # Could be any corev1.VolumeSource jail: persistentVolumeClaim: - claimName: &jailPvcClaimName "jail-pvc" + claimName: &jailPvcClaimName "soperator-jail-pvc" # Volumes being mounted inside Jail mount # Optional, defaults to empty list jailSubMounts: diff --git a/helm/slurm-cluster/templates/_helpers.tpl b/helm/slurm-cluster/templates/_helpers.tpl index 7f3d5d1b8..d45cea400 100644 --- a/helm/slurm-cluster/templates/_helpers.tpl +++ b/helm/slurm-cluster/templates/_helpers.tpl @@ -62,7 +62,7 @@ Create the name of the service account to use for exporter */}} {{- define "slurm-cluster.exporter.serviceAccountName" -}} {{- if .Values.slurmNodes.exporter.serviceAccount.create -}} - {{- default "slurm-exporter-sa" .Values.slurmNodes.exporter.serviceAccount.name }} + {{- default (printf "%s-slurm-exporter-sa" (include "slurm-cluster.name" .)) .Values.slurmNodes.exporter.serviceAccount.name }} {{- else -}} {{- default "default" .Values.slurmNodes.exporter.serviceAccount.name }} {{- end -}} @@ -107,3 +107,9 @@ Create the name of the role binding for slurm-controller {{- printf "%s-slurm-controller" (include "slurm-cluster.name" .) }} {{- end -}} +{{/* +Create the name of the slurm-scripts ConfigMap +*/}} +{{- define "slurm-cluster.slurmScripts.configMapName" -}} +{{- printf "%s-slurm-scripts" (include "slurm-cluster.name" .) }} +{{- end -}} diff --git a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml index 27d80c5ec..e0d99fb04 100644 --- a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml +++ b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml @@ -91,9 +91,9 @@ spec: {{- end }} {{- end }} volumeSources: - - name: slurm-scripts + - name: {{ include "slurm-cluster.slurmScripts.configMapName" . }} configMap: - name: slurm-scripts + name: {{ include "slurm-cluster.slurmScripts.configMapName" . }} defaultMode: 0755 {{- range .Values.volumeSources }} - name: {{ .name | quote }} @@ -325,10 +325,10 @@ spec: customMounts: - name: slurm-scripts mountPath: /opt/slurm_scripts/ - volumeSourceName: slurm-scripts + volumeSourceName: {{ include "slurm-cluster.slurmScripts.configMapName" . }} - name: slurm-scripts-jail mountPath: /mnt/jail.upper/opt/slurm_scripts/ - volumeSourceName: slurm-scripts + volumeSourceName: {{ include "slurm-cluster.slurmScripts.configMapName" . }} {{- if .Values.slurmNodes.login.volumes.customMounts }} {{- .Values.slurmNodes.login.volumes.customMounts | toYaml | nindent 10 }} {{- end }} diff --git a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml index bfa21a123..6fdab4119 100644 --- a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml +++ b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: namespace: {{ .Release.Namespace }} - name: slurm-scripts + name: {{ include "slurm-cluster.slurmScripts.configMapName" . }} labels: app: {{ .Chart.Name }} release: {{ .Release.Name }} diff --git a/helm/slurm-cluster/tests/exporter-rbac_test.yaml b/helm/slurm-cluster/tests/exporter-rbac_test.yaml index 413d76477..331d92ed5 100644 --- a/helm/slurm-cluster/tests/exporter-rbac_test.yaml +++ b/helm/slurm-cluster/tests/exporter-rbac_test.yaml @@ -15,7 +15,7 @@ tests: of: ServiceAccount - equal: path: metadata.name - value: slurm-exporter-sa + value: test-cluster-slurm-exporter-sa - equal: path: metadata.namespace value: NAMESPACE @@ -114,7 +114,7 @@ tests: path: subjects content: kind: ServiceAccount - name: slurm-exporter-sa + name: test-cluster-exporter-sa namespace: NAMESPACE # Test RoleBinding with custom ServiceAccount name @@ -162,7 +162,7 @@ tests: asserts: - equal: path: spec.slurmNodes.exporter.serviceAccountName - value: slurm-exporter-sa + value: test-cluster-exporter-sa # Test SlurmCluster CR uses custom ServiceAccount name - it: should set custom serviceAccountName in SlurmCluster CR @@ -207,4 +207,4 @@ tests: asserts: - equal: path: spec.slurmNodes.exporter.serviceAccountName - value: slurm-exporter-sa + value: test-cluster-exporter-sa diff --git a/helm/slurm-cluster/tests/volume-sources-filtering_test.yaml b/helm/slurm-cluster/tests/volume-sources-filtering_test.yaml index 462a48874..c7b3aa340 100644 --- a/helm/slurm-cluster/tests/volume-sources-filtering_test.yaml +++ b/helm/slurm-cluster/tests/volume-sources-filtering_test.yaml @@ -117,7 +117,7 @@ tests: # Verify correct names are preserved (slurm-scripts is always first) - equal: path: spec.volumeSources[0].name - value: "slurm-scripts" + value: "test-cluster-slurm-scripts" - equal: path: spec.volumeSources[1].name value: "jail" @@ -206,7 +206,7 @@ tests: # Verify correct values - equal: path: spec.volumeSources[0].name - value: "slurm-scripts" + value: "test-cluster-slurm-scripts" - equal: path: spec.volumeSources[1].name value: "simple-pvc" diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml index 0bdccd0c3..2bba5d548 100644 --- a/helm/soperator-crds/templates/slurmcluster-crd.yaml +++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml @@ -17766,6 +17766,11 @@ spec: Defaults to 1 if not specified. format: int32 type: integer + slurmClusterRefName: + description: |- + SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string slurmd: description: Slurmd defines the Slurm worker daemon configuration. properties: @@ -26114,6 +26119,7 @@ spec: type: object required: - munge + - slurmClusterRefName - slurmd type: object status: diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml index 0bdccd0c3..2bba5d548 100644 --- a/helm/soperator/crds/slurmcluster-crd.yaml +++ b/helm/soperator/crds/slurmcluster-crd.yaml @@ -17766,6 +17766,11 @@ spec: Defaults to 1 if not specified. format: int32 type: integer + slurmClusterRefName: + description: |- + SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string slurmd: description: Slurmd defines the Slurm worker daemon configuration. properties: @@ -26114,6 +26119,7 @@ spec: type: object required: - munge + - slurmClusterRefName - slurmd type: object status: diff --git a/internal/consts/annotation.go b/internal/consts/annotation.go index 12e58ba64..ed47ddc34 100644 --- a/internal/consts/annotation.go +++ b/internal/consts/annotation.go @@ -6,6 +6,4 @@ const ( AnnotationDefaultContainerName = "kubectl.kubernetes.io/default-container" AnnotationClusterName = K8sGroupNameSoperator + "/cluster" AnnotationActiveCheckName = K8sGroupNameSoperator + "/activecheck" - - AnnotationParentalClusterRefName = K8sGroupNameSoperator + "/parental-cluster-ref" ) diff --git a/internal/controller/clustercontroller/accounting.go b/internal/controller/clustercontroller/accounting.go index 5b2362bab..c7d6c045c 100644 --- a/internal/controller/clustercontroller/accounting.go +++ b/internal/controller/clustercontroller/accounting.go @@ -340,7 +340,7 @@ func (r SlurmClusterReconciler) ReconcileAccounting( if !isAccountingEnabled { stepLogger.V(1).Info("Removing") - deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting) + deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting, clusterValues.Name) if err = r.Deployment.Cleanup(stepCtx, cluster, deploymentName); err != nil { return fmt.Errorf("cleanup accounting Deployment: %w", err) } diff --git a/internal/controller/clustercontroller/sconfigcontroller.go b/internal/controller/clustercontroller/sconfigcontroller.go index 0a9ea11c8..d5d812536 100644 --- a/internal/controller/clustercontroller/sconfigcontroller.go +++ b/internal/controller/clustercontroller/sconfigcontroller.go @@ -93,7 +93,7 @@ func (r SlurmClusterReconciler) ValidateSConfigController( ctx, types.NamespacedName{ Namespace: clusterValues.Namespace, - Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController), + Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController, clusterValues.Name), }, existing, ) diff --git a/internal/controller/nodesetcontroller/reconcile.go b/internal/controller/nodesetcontroller/reconcile.go index b6fa87572..8c9ba2ca1 100644 --- a/internal/controller/nodesetcontroller/reconcile.go +++ b/internal/controller/nodesetcontroller/reconcile.go @@ -68,9 +68,9 @@ func (r *NodeSetReconciler) reconcile(ctx context.Context, nodeSet *slurmv1alpha err error ) { - clusterName, hasClusterRef := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName] - if !hasClusterRef { - err = fmt.Errorf("getting parental cluster ref from annotations") + clusterName := nodeSet.Spec.SlurmClusterRefName + if clusterName == "" { + err = fmt.Errorf("NodeSet %q has empty spec.slurmClusterRefName", nodeSet.Name) logger.Error(err, "No parent cluster ref found") return ctrl.Result{}, err } diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller.go b/internal/controller/sconfigcontroller/jailedconfig_controller.go index 68576d317..c6cd3eef5 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller.go @@ -62,6 +62,7 @@ type JailedConfigReconciler struct { client.Client Scheme *runtime.Scheme + clusterName string slurmAPIClient slurmapi.Client clock Clock fs Fs @@ -331,7 +332,10 @@ func (r *JailedConfigReconciler) reconcileWithAggregation(ctx context.Context, j jailedConfigs := &slurmv1alpha1.JailedConfigList{} err := r.Client.List(ctx, jailedConfigs, client.InNamespace(jailedConfig.Namespace), - client.MatchingLabels{consts.LabelJailedAggregationKey: aggregationKey}, + client.MatchingLabels{ + consts.LabelJailedAggregationKey: aggregationKey, + consts.LabelInstanceKey: r.clusterName, + }, ) if err != nil { return ctrl.Result{}, fmt.Errorf("listing JailedConfigs with aggregation key %q: %w", aggregationKey, err) @@ -557,6 +561,7 @@ func (r *JailedConfigReconciler) shouldInitializeConditions(ctx context.Context, func NewJailedConfigReconciler( client client.Client, scheme *runtime.Scheme, + clusterName string, slurmAPIClient slurmapi.Client, fs Fs, reconfigurePollInterval time.Duration, @@ -571,6 +576,7 @@ func NewJailedConfigReconciler( return &JailedConfigReconciler{ Client: client, Scheme: scheme, + clusterName: clusterName, slurmAPIClient: slurmAPIClient, fs: fs, reconfigurePollInterval: reconfigurePollInterval, @@ -592,14 +598,18 @@ func (r *JailedConfigReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurren }); err != nil { return err } + clusterPredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[consts.LabelInstanceKey] == r.clusterName + }) + return ctrl.NewControllerManagedBy(mgr). - For(&slurmv1alpha1.JailedConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + For(&slurmv1alpha1.JailedConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{}, clusterPredicate)). Watches( &corev1.ConfigMap{}, handler.EnqueueRequestsFromMapFunc(r.findObjectsForConfigMap), - builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}), + builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}, clusterPredicate), ). - Named("jailedconfig"). + Named(r.clusterName + "-jailedconfig"). WithOptions(controllerconfig.ControllerOptionsWithRateLimit(maxConcurrency, cacheSyncTimeout, 15*time.Second, 1*time.Minute)). Complete(r) } diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go index 1b57031ca..1362eeb49 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go @@ -87,6 +87,7 @@ func newTestJailedConfigController( sctrl := NewJailedConfigReconciler( mgr.GetClient(), mgr.GetScheme(), + "soperator", apiClient, fakeFs, 1*time.Second, // Poll interval for tests diff --git a/internal/controller/topologyconfcontroller/workertopology_controller.go b/internal/controller/topologyconfcontroller/workertopology_controller.go index d8554ed8a..49c51d546 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller.go @@ -7,6 +7,7 @@ import ( "encoding/json" "fmt" "path/filepath" + "regexp" "strconv" "strings" "time" @@ -41,6 +42,7 @@ var ( RequeueAfter: 1 * time.Minute, Requeue: true, } + TopologyConfigRegex = regexp.MustCompile(`.*-` + consts.ConfigMapNameTopologyConfig + `$`) ) const ( @@ -108,13 +110,13 @@ func (r *WorkerTopologyReconciler) Reconcile(ctx context.Context, req ctrl.Reque existingTopologyConfig := existing.Data[consts.ConfigMapKeyTopologyConfig] if r.calculateConfigHash(desired) == r.calculateConfigHash(existingTopologyConfig) { logger.Info("Topology config unchanged, skipping update") - if err := r.ensureJailedConfig(ctx, req.Namespace); err != nil { + if err := r.ensureJailedConfig(ctx, req.Namespace, slurmCluster.Name); err != nil { return ctrl.Result{}, fmt.Errorf("ensure JailedConfig: %w", err) } return DefaultRequeueResult, nil } - if err := r.updateTopologyConfigMap(ctx, req.Namespace, desired); err != nil { + if err := r.updateTopologyConfigMap(ctx, req.Namespace, slurmCluster.Name, desired); err != nil { logger.Error(err, "Update ConfigMap with topology config") return ctrl.Result{}, fmt.Errorf("update ConfigMap with topology config: %w", err) } @@ -128,6 +130,10 @@ func isClusterReconciliationNeeded(slurmCluster *slurmv1.SlurmCluster) bool { slurmCluster.Spec.SlurmConfig.TopologyPlugin == consts.SlurmTopologyBlock } +func topologyConfigMapName(clusterName string) string { + return clusterName + "-" + consts.ConfigMapNameTopologyConfig +} + func (r *WorkerTopologyReconciler) getNodeTopologyLabelsConfigMap( ctx context.Context, req ctrl.Request, logger logr.Logger) (*corev1.ConfigMap, error) { configMap := &corev1.ConfigMap{ @@ -145,15 +151,19 @@ func (r *WorkerTopologyReconciler) getNodeTopologyLabelsConfigMap( return configMap, nil } -func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace string, config string) *corev1.ConfigMap { +func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace string, config string, clusterName string) *corev1.ConfigMap { + cmName := topologyConfigMapName(clusterName) return &corev1.ConfigMap{ TypeMeta: ctrl.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: cmName, Namespace: namespace, + Labels: map[string]string{ + consts.LabelInstanceKey: clusterName, + }, }, Data: map[string]string{ consts.ConfigMapKeyTopologyConfig: config, @@ -161,22 +171,24 @@ func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace string, con } } -func (r *WorkerTopologyReconciler) renderTopologyJailedConfig(namespace string) *v1alpha1.JailedConfig { +func (r *WorkerTopologyReconciler) renderTopologyJailedConfig(namespace string, clusterName string) *v1alpha1.JailedConfig { + cmName := topologyConfigMapName(clusterName) return &v1alpha1.JailedConfig{ TypeMeta: ctrl.TypeMeta{ APIVersion: v1alpha1.GroupVersion.String(), Kind: "JailedConfig", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: cmName, Namespace: namespace, Labels: map[string]string{ + consts.LabelInstanceKey: clusterName, consts.LabelJailedAggregationKey: consts.LabelJailedAggregationCommonValue, }, }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameTopologyConfig, + Name: cmName, }, Items: []corev1.KeyToPath{ { @@ -267,15 +279,16 @@ func (r *WorkerTopologyReconciler) calculateConfigHash(config string) string { return hex.EncodeToString(hash[:]) } -func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, namespace string, config string) error { - configMapKey := client.ObjectKey{Name: consts.ConfigMapNameTopologyConfig, Namespace: namespace} +func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, namespace, clusterName, config string) error { + cmName := topologyConfigMapName(clusterName) + configMapKey := client.ObjectKey{Name: cmName, Namespace: namespace} existingConfigMap := &corev1.ConfigMap{} err := r.Client.Get(ctx, configMapKey, existingConfigMap) if err != nil { if apierrors.IsNotFound(err) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: cmName, Namespace: namespace, }, Data: map[string]string{ @@ -283,10 +296,10 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, }, } if err := r.Client.Create(ctx, cm); err != nil { - return fmt.Errorf("create ConfigMap %s: %w", consts.ConfigMapNameTopologyConfig, err) + return fmt.Errorf("create ConfigMap %s: %w", cmName, err) } } else { - return fmt.Errorf("get ConfigMap %s: %w", consts.ConfigMapNameTopologyConfig, err) + return fmt.Errorf("get ConfigMap %s: %w", cmName, err) } } else { existingConfigMap.Data[consts.ConfigMapKeyTopologyConfig] = config @@ -295,7 +308,7 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, } } - if err := r.ensureJailedConfig(ctx, namespace); err != nil { + if err := r.ensureJailedConfig(ctx, namespace, clusterName); err != nil { return fmt.Errorf("ensure JailedConfig: %w", err) } @@ -304,8 +317,8 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, // ensureJailedConfig ensures the JailedConfig for topology exists and matches the desired state. // If it doesn't exist, it creates one. If it exists, it updates the spec to match desired. -func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, namespace string) error { - desired := r.renderTopologyJailedConfig(namespace) +func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, namespace string, clusterName string) error { + desired := r.renderTopologyJailedConfig(namespace, clusterName) existing := &v1alpha1.JailedConfig{} err := r.Client.Get(ctx, client.ObjectKeyFromObject(desired), existing) @@ -382,7 +395,7 @@ func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, return false }, DeleteFunc: func(e event.DeleteEvent) bool { - return e.Object.GetName() == consts.ConfigMapNameTopologyConfig + return TopologyConfigRegex.MatchString(e.Object.GetName()) }, UpdateFunc: func(e event.UpdateEvent) bool { return false @@ -398,7 +411,7 @@ func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, return false }, DeleteFunc: func(e event.DeleteEvent) bool { - return e.Object.GetName() == consts.ConfigMapNameTopologyConfig + return TopologyConfigRegex.MatchString(e.Object.GetName()) }, UpdateFunc: func(e event.UpdateEvent) bool { return false diff --git a/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go b/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go index 48689e6e9..1903c9c88 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go @@ -24,6 +24,8 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { utilruntime.Must(v1alpha1.AddToScheme(scheme)) namespace := "test-namespace" + clusterName := "test-cluster" + expectedCMName := topologyConfigMapName(clusterName) tests := []struct { name string @@ -41,7 +43,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { existingObjects: []client.Object{ &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, @@ -57,13 +59,13 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { existingObjects: []client.Object{ &v1alpha1.JailedConfig{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, }, Items: []corev1.KeyToPath{ { @@ -81,7 +83,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { existingObjects: []client.Object{ &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, @@ -91,13 +93,13 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { }, &v1alpha1.JailedConfig{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, }, Items: []corev1.KeyToPath{ { @@ -115,7 +117,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { existingObjects: []client.Object{ &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, @@ -125,13 +127,13 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { }, &v1alpha1.JailedConfig{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, ResourceVersion: "1000", }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, }, Items: []corev1.KeyToPath{ { @@ -162,7 +164,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { } ctx := context.Background() - err := reconciler.updateTopologyConfigMap(ctx, namespace, "new-topology-config") + err := reconciler.updateTopologyConfigMap(ctx, namespace, clusterName, "new-topology-config") if tt.expectedError { assert.Error(t, err) @@ -173,7 +175,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { // Verify ConfigMap was updated var updatedConfigMap corev1.ConfigMap err = fakeClient.Get(ctx, types.NamespacedName{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, }, &updatedConfigMap) assert.NoError(t, err) @@ -182,11 +184,11 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { // Verify JailedConfig exists and has correct spec var updatedJailedConfig v1alpha1.JailedConfig err = fakeClient.Get(ctx, types.NamespacedName{ - Name: consts.ConfigMapNameTopologyConfig, + Name: expectedCMName, Namespace: namespace, }, &updatedJailedConfig) assert.NoError(t, err) - assert.Equal(t, consts.ConfigMapNameTopologyConfig, updatedJailedConfig.Spec.ConfigMap.Name) + assert.Equal(t, expectedCMName, updatedJailedConfig.Spec.ConfigMap.Name) assert.Len(t, updatedJailedConfig.Spec.Items, 1) assert.Equal(t, consts.ConfigMapKeyTopologyConfig, updatedJailedConfig.Spec.Items[0].Key) assert.Equal(t, filepath.Join("/etc/slurm/", consts.ConfigMapKeyTopologyConfig), updatedJailedConfig.Spec.Items[0].Path) diff --git a/internal/naming/naming.go b/internal/naming/naming.go index 589fa55ee..1a28c14b5 100644 --- a/internal/naming/naming.go +++ b/internal/naming/naming.go @@ -83,33 +83,34 @@ func BuildAppArmorProfileName(clusterName, namespace string) string { }.String() } -func BuildStatefulSetName(componentType consts.ComponentType) string { +func BuildStatefulSetName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, - clusterName: "", + clusterName: clusterName, entity: "", }.String() } -func BuildNodeSetStatefulSetName(nodeSetName string) string { +func BuildNodeSetStatefulSetName(clusterName, nodeSetName string) string { return namedEntity{ componentType: nil, - clusterName: "", + clusterName: clusterName, entity: nodeSetName, }.String() } -func BuildDaemonSetName(componentType consts.ComponentType) string { +func BuildDaemonSetName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, - clusterName: "", + clusterName: clusterName, entity: "", }.String() } -func BuildDeploymentName(componentType consts.ComponentType) string { +func BuildDeploymentName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, + clusterName: clusterName, entity: "", }.String() } diff --git a/internal/render/accounting/deployment.go b/internal/render/accounting/deployment.go index af4dd85ad..f5f76800a 100644 --- a/internal/render/accounting/deployment.go +++ b/internal/render/accounting/deployment.go @@ -42,7 +42,7 @@ func RenderDeployment( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: naming.BuildDeploymentName(consts.ComponentTypeAccounting), + Name: naming.BuildDeploymentName(consts.ComponentTypeAccounting, clusterName), Namespace: namespace, Labels: labels, }, diff --git a/internal/render/accounting/deployment_test.go b/internal/render/accounting/deployment_test.go index 9b96242d6..89a92bf6f 100644 --- a/internal/render/accounting/deployment_test.go +++ b/internal/render/accounting/deployment_test.go @@ -17,7 +17,7 @@ func Test_RenderDeployment(t *testing.T) { deployment, err := accounting.RenderDeployment(defaultNamespace, defaultNameCluster, acc, defaultNodeFilter, defaultVolumeSources) assert.NoError(t, err) - assert.Equal(t, naming.BuildDeploymentName(consts.ComponentTypeAccounting), deployment.Name) + assert.Equal(t, naming.BuildDeploymentName(consts.ComponentTypeAccounting, defaultNameCluster), deployment.Name) assert.Equal(t, defaultNamespace, deployment.Namespace) assert.Equal(t, common.RenderLabels(consts.ComponentTypeAccounting, defaultNameCluster), deployment.Labels) diff --git a/internal/render/common/configmap.go b/internal/render/common/configmap.go index 350478fcb..97da1be0d 100644 --- a/internal/render/common/configmap.go +++ b/internal/render/common/configmap.go @@ -250,7 +250,8 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile { { svcName := cluster.NodeController.Service.Name - res.AddProperty("SlurmctldHost", fmt.Sprintf("%s(%s)", "controller-0", svcName)) + controllerName := cluster.NodeController.StatefulSet.Name + "-0" + res.AddProperty("SlurmctldHost", fmt.Sprintf("%s(%s)", controllerName, svcName)) } res.AddComment("") diff --git a/internal/render/rest/rest.go b/internal/render/rest/rest.go index b7d5573ce..b45e47594 100644 --- a/internal/render/rest/rest.go +++ b/internal/render/rest/rest.go @@ -50,7 +50,7 @@ func RenderDeploymentREST( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: naming.BuildDeploymentName(consts.ComponentTypeREST), + Name: naming.BuildDeploymentName(consts.ComponentTypeREST, clusterName), Namespace: namespace, Labels: labels, }, diff --git a/internal/render/sconfigcontroller/deployment.go b/internal/render/sconfigcontroller/deployment.go index 25b47b77f..246beaecf 100644 --- a/internal/render/sconfigcontroller/deployment.go +++ b/internal/render/sconfigcontroller/deployment.go @@ -45,7 +45,7 @@ func RenderDeployment( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController), + Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController, clusterName), Namespace: clusterNamespace, Labels: labels, }, diff --git a/internal/utils/resourcegetter/cluster.go b/internal/utils/resourcegetter/cluster.go index 4a566f294..cd8e4880b 100644 --- a/internal/utils/resourcegetter/cluster.go +++ b/internal/utils/resourcegetter/cluster.go @@ -5,7 +5,6 @@ import ( "fmt" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -13,34 +12,6 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" ) -// GetClusterInNamespace returns a [slurmv1.SlurmCluster] if it's found in the namespace. -// It assumes that there may be no or just one cluster in the namespace. -func GetClusterInNamespace(ctx context.Context, r client.Reader, namespace string) (*slurmv1.SlurmCluster, error) { - logger := log.FromContext(ctx) - - clusterList := slurmv1.SlurmClusterList{} - if err := r.List(ctx, &clusterList, client.InNamespace(namespace)); err != nil { - logger.Error(err, fmt.Sprintf("Failed to list %s in namespace %q", slurmv1.KindSlurmCluster, namespace)) - return nil, fmt.Errorf("listing %s: %w", slurmv1.KindSlurmCluster, err) - } - - if len(clusterList.Items) == 0 { - logger.V(1).Info(fmt.Sprintf("No %s resources found in namespace %q", slurmv1.KindSlurmCluster, namespace)) - return nil, apierrors.NewNotFound(schema.GroupResource{ - Group: slurmv1.GroupVersion.Group, - Resource: "slurmclusters", - }, fmt.Sprintf("no SlurmCluster found in namespace %s", namespace)) - } - - if len(clusterList.Items) > 1 { - err := fmt.Errorf("multiple %s resources found in namespace %q", slurmv1.KindSlurmCluster, namespace) - logger.Error(err, fmt.Sprintf("%d %s resources found in namespace %q. This should not happen and definitely is a bug", len(clusterList.Items), slurmv1.KindSlurmCluster, namespace)) - return nil, err - } - - return clusterList.Items[0].DeepCopy(), nil -} - // GetCluster returns a [slurmv1.SlurmCluster] if it's found by types.NamespacedName. func GetCluster(ctx context.Context, r client.Reader, name types.NamespacedName) (*slurmv1.SlurmCluster, error) { logger := log.FromContext(ctx) diff --git a/internal/utils/resourcegetter/nodeset.go b/internal/utils/resourcegetter/nodeset.go index 51554c3df..a8011c029 100644 --- a/internal/utils/resourcegetter/nodeset.go +++ b/internal/utils/resourcegetter/nodeset.go @@ -11,11 +11,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/utils/sliceutils" ) -// ListNodeSetsByClusterRef returns a list of [slurmv1alpha1.NodeSet] that have consts.AnnotationParentalClusterRefName annotation with clusterRef.Name value. +// ListNodeSetsByClusterRef returns a list of [slurmv1alpha1.NodeSet] whose spec.slurmClusterRefName matches clusterRef.Name. func ListNodeSetsByClusterRef(ctx context.Context, r client.Reader, clusterRef types.NamespacedName) ([]slurmv1alpha1.NodeSet, error) { logger := log.FromContext(ctx) @@ -29,11 +28,7 @@ func ListNodeSetsByClusterRef(ctx context.Context, r client.Reader, clusterRef t return slices.SortedFunc( sliceutils.FilterSliceSeq(nodeSetList.Items, func(nodeSet slurmv1alpha1.NodeSet) bool { - clusterName, found := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName] - if found && clusterName == clusterRef.Name { - return true - } - return false + return nodeSet.Spec.SlurmClusterRefName == clusterRef.Name }), func(a, b slurmv1alpha1.NodeSet) int { return strings.Compare(a.Name, b.Name) diff --git a/internal/values/slurm_accounting.go b/internal/values/slurm_accounting.go index aaea19303..08e4de92c 100644 --- a/internal/values/slurm_accounting.go +++ b/internal/values/slurm_accounting.go @@ -48,7 +48,7 @@ func buildAccountingFrom(clusterName string, maintenance *consts.MaintenanceMode CustomInitContainers: accounting.CustomInitContainers, Service: buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeAccounting, clusterName)), Deployment: buildDeploymentFrom( - naming.BuildDeploymentName(consts.ComponentTypeAccounting), + naming.BuildDeploymentName(consts.ComponentTypeAccounting, clusterName), ), ExternalDB: accounting.ExternalDB, MariaDb: accounting.MariaDbOperator, diff --git a/internal/values/slurm_accounting_test.go b/internal/values/slurm_accounting_test.go index 23172354d..2ad0c985b 100644 --- a/internal/values/slurm_accounting_test.go +++ b/internal/values/slurm_accounting_test.go @@ -60,7 +60,7 @@ func TestBuildAccountingFrom(t *testing.T) { assert.Equal(t, buildContainerFrom(accounting.Munge, consts.ContainerNameMunge).Port, result.ContainerMunge.Port) assert.Equal(t, buildContainerFrom(accounting.Munge, consts.ContainerNameMunge).Resources, result.ContainerMunge.Resources) assert.Equal(t, buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeAccounting, defaultNameCluster)), result.Service) - assert.Equal(t, buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeAccounting)), result.Deployment) + assert.Equal(t, buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeAccounting, defaultNameCluster)), result.Deployment) assert.Equal(t, accounting.ExternalDB, result.ExternalDB) assert.Equal(t, accounting.Enabled, result.Enabled) assert.Equal(t, slurmv1.NodeVolume{VolumeSourceName: ptr.To(consts.VolumeNameJail)}, result.VolumeJail) diff --git a/internal/values/slurm_controller.go b/internal/values/slurm_controller.go index 35488a594..936606a0e 100644 --- a/internal/values/slurm_controller.go +++ b/internal/values/slurm_controller.go @@ -33,13 +33,13 @@ type SlurmController struct { func buildSlurmControllerFrom(clusterName string, maintenance *consts.MaintenanceMode, controller *slurmv1.SlurmNodeController) SlurmController { // Controller always has 1 replica statefulSet := buildStatefulSetWithMaxUnavailableFrom( - naming.BuildStatefulSetName(consts.ComponentTypeController), + naming.BuildStatefulSetName(consts.ComponentTypeController, clusterName), consts.SingleReplicas, nil, ) daemonSet := buildDaemonSetFrom( - naming.BuildDaemonSetName(consts.ComponentTypeController), + naming.BuildDaemonSetName(consts.ComponentTypeController, clusterName), ) res := SlurmController{ diff --git a/internal/values/slurm_login.go b/internal/values/slurm_login.go index f1eaf35d6..0a7a3374a 100644 --- a/internal/values/slurm_login.go +++ b/internal/values/slurm_login.go @@ -63,7 +63,7 @@ func buildSlurmLoginFrom(clusterName string, maintenance *consts.MaintenanceMode Service: svc, HeadlessService: headlessSvc, StatefulSet: buildStatefulSetFrom( - naming.BuildStatefulSetName(consts.ComponentTypeLogin), + naming.BuildStatefulSetName(consts.ComponentTypeLogin, clusterName), login.SlurmNode.Size, ), SSHDConfigMapName: sshdConfigMapName, diff --git a/internal/values/slurm_nodeset.go b/internal/values/slurm_nodeset.go index 18c4eac9a..a4381d099 100644 --- a/internal/values/slurm_nodeset.go +++ b/internal/values/slurm_nodeset.go @@ -105,7 +105,7 @@ func BuildSlurmNodeSetFrom( GPU: nsSpec.GPU.DeepCopy(), // StatefulSet: buildStatefulSetWithMaxUnavailableFrom( - naming.BuildNodeSetStatefulSetName(nodeSet.Name), + naming.BuildNodeSetStatefulSetName(clusterName, nodeSet.Name), nsSpec.Replicas, nsSpec.MaxUnavailable, ), diff --git a/internal/webhook/v1alpha1/nodeset_webhook.go b/internal/webhook/v1alpha1/nodeset_webhook.go index 355b2a38d..25eb6fa5e 100644 --- a/internal/webhook/v1alpha1/nodeset_webhook.go +++ b/internal/webhook/v1alpha1/nodeset_webhook.go @@ -28,8 +28,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/utils/resourcegetter" ) // nodesetLog is for logging in this package. @@ -54,7 +52,7 @@ type NodeSetCustomDefaulter struct { var _ webhook.CustomDefaulter = &NodeSetCustomDefaulter{} // Default implements webhook.CustomDefaulter so a webhook will be registered for the Kind NodeSet. -func (d *NodeSetCustomDefaulter) Default(ctx context.Context, obj runtime.Object) error { +func (d *NodeSetCustomDefaulter) Default(_ context.Context, obj runtime.Object) error { nodeSet, ok := obj.(*slurmv1alpha1.NodeSet) if !ok { @@ -62,28 +60,6 @@ func (d *NodeSetCustomDefaulter) Default(ctx context.Context, obj runtime.Object } nodesetLog.Info("Defaulting for NodeSet", "name", nodeSet.GetName()) - if err := defaultNodeSetParentalClusterRef(ctx, d.Client, nodeSet); err != nil { - return err - } - - return nil -} - -func defaultNodeSetParentalClusterRef(ctx context.Context, client client.Client, nodeSet *slurmv1alpha1.NodeSet) error { - if _, hasClusterRef := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName]; hasClusterRef { - return nil - } - - cluster, err := resourcegetter.GetClusterInNamespace(ctx, client, nodeSet.Namespace) - if err != nil { - return fmt.Errorf("seeking parental cluster: %w", err) - } - - if nodeSet.Annotations == nil { - nodeSet.Annotations = map[string]string{} - } - nodeSet.Annotations[consts.AnnotationParentalClusterRefName] = cluster.Name - return nil } diff --git a/internal/webhook/v1alpha1/nodeset_webhook_test.go b/internal/webhook/v1alpha1/nodeset_webhook_test.go index eb30d34c7..2e6d55f85 100644 --- a/internal/webhook/v1alpha1/nodeset_webhook_test.go +++ b/internal/webhook/v1alpha1/nodeset_webhook_test.go @@ -22,83 +22,83 @@ import ( "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - slurmv1 "nebius.ai/slurm-operator/api/v1" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - "nebius.ai/slurm-operator/internal/consts" . "nebius.ai/slurm-operator/internal/webhook/v1alpha1" ) -func TestDefaultSlurmClusterRef(t *testing.T) { - defaulter := &NodeSetCustomDefaulter{ - Client: newClientBuilder().Build(), - } - const ( - testNamespace = "default" - testNodeSetName = "test-nodeset" - testClusterName = "test-cluster" - ) - - t.Run("Parental cluster ref exists", func(t *testing.T) { - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testNodeSetName, - Annotations: map[string]string{ - consts.AnnotationParentalClusterRefName: testClusterName, - }, - }, - } - - err := defaulter.Default(context.Background(), obj) +const ( + testNamespace = "default" + testNodeSetName = "test-nodeset" + testClusterName = "test-cluster" +) + +// --- Validator tests --- + +func TestNodeSetValidator_ValidateCreate(t *testing.T) { + validator := &NodeSetCustomValidator{} + + t.Run("accepts NodeSet with slurmClusterRefName set", func(t *testing.T) { + obj := nodeSetWithClusterRef(testClusterName) + _, err := validator.ValidateCreate(context.Background(), obj) assert.NoError(t, err) }) - t.Run("Parental cluster ref is set", func(t *testing.T) { - defer func() { - defaulter.Client = newClientBuilder().Build() - }() - - defaulter.Client = newClientBuilder(). - WithObjects(&slurmv1.SlurmCluster{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - }). - Build() - - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - } - - err := defaulter.Default(context.Background(), obj) - assert.NoError(t, err) - assert.Equal(t, testClusterName, obj.GetAnnotations()[consts.AnnotationParentalClusterRefName]) + t.Run("rejects NodeSet without slurmClusterRefName", func(t *testing.T) { + obj := nodeSetWithoutClusterRef() + _, err := validator.ValidateCreate(context.Background(), obj) + assert.Error(t, err) + assert.Contains(t, err.Error(), "slurmClusterRefName") }) +} - t.Run("Parental cluster doesn't exist", func(t *testing.T) { - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - } +func TestNodeSetValidator_ValidateUpdate(t *testing.T) { + validator := &NodeSetCustomValidator{} + + t.Run("accepts update that keeps slurmClusterRefName", func(t *testing.T) { + old := nodeSetWithClusterRef(testClusterName) + updated := nodeSetWithClusterRef(testClusterName) + _, err := validator.ValidateUpdate(context.Background(), old, updated) + assert.NoError(t, err) + }) - err := defaulter.Default(context.Background(), obj) + t.Run("rejects update that clears slurmClusterRefName", func(t *testing.T) { + old := nodeSetWithClusterRef(testClusterName) + updated := nodeSetWithoutClusterRef() + _, err := validator.ValidateUpdate(context.Background(), old, updated) assert.Error(t, err) + assert.Contains(t, err.Error(), "slurmClusterRefName") + }) +} + +func TestNodeSetValidator_ValidateDelete(t *testing.T) { + validator := &NodeSetCustomValidator{} + + t.Run("always accepts delete", func(t *testing.T) { + _, err := validator.ValidateDelete(context.Background(), nodeSetWithoutClusterRef()) + assert.NoError(t, err) }) } -func newClientBuilder() *fake.ClientBuilder { - scheme := runtime.NewScheme() - _ = slurmv1.AddToScheme(scheme) - _ = slurmv1alpha1.AddToScheme(scheme) - return fake.NewClientBuilder(). - WithScheme(scheme) +// --- helpers --- + +func nodeSetWithClusterRef(clusterName string) *slurmv1alpha1.NodeSet { + return &slurmv1alpha1.NodeSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: testNodeSetName, + }, + Spec: slurmv1alpha1.NodeSetSpec{ + SlurmClusterRefName: clusterName, + }, + } +} + +func nodeSetWithoutClusterRef() *slurmv1alpha1.NodeSet { + return &slurmv1alpha1.NodeSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: testNodeSetName, + }, + } } From 111379d8a4c63c46e3a9f4cc2287edcf2d31d6ae Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Thu, 26 Feb 2026 11:45:53 +0100 Subject: [PATCH 2/2] fix test --- .../webhook/v1alpha1/nodeset_webhook_test.go | 104 ------------------ 1 file changed, 104 deletions(-) delete mode 100644 internal/webhook/v1alpha1/nodeset_webhook_test.go diff --git a/internal/webhook/v1alpha1/nodeset_webhook_test.go b/internal/webhook/v1alpha1/nodeset_webhook_test.go deleted file mode 100644 index 2e6d55f85..000000000 --- a/internal/webhook/v1alpha1/nodeset_webhook_test.go +++ /dev/null @@ -1,104 +0,0 @@ -/* -Copyright 2025 Nebius B.V. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1alpha1_test - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - . "nebius.ai/slurm-operator/internal/webhook/v1alpha1" -) - -const ( - testNamespace = "default" - testNodeSetName = "test-nodeset" - testClusterName = "test-cluster" -) - -// --- Validator tests --- - -func TestNodeSetValidator_ValidateCreate(t *testing.T) { - validator := &NodeSetCustomValidator{} - - t.Run("accepts NodeSet with slurmClusterRefName set", func(t *testing.T) { - obj := nodeSetWithClusterRef(testClusterName) - _, err := validator.ValidateCreate(context.Background(), obj) - assert.NoError(t, err) - }) - - t.Run("rejects NodeSet without slurmClusterRefName", func(t *testing.T) { - obj := nodeSetWithoutClusterRef() - _, err := validator.ValidateCreate(context.Background(), obj) - assert.Error(t, err) - assert.Contains(t, err.Error(), "slurmClusterRefName") - }) -} - -func TestNodeSetValidator_ValidateUpdate(t *testing.T) { - validator := &NodeSetCustomValidator{} - - t.Run("accepts update that keeps slurmClusterRefName", func(t *testing.T) { - old := nodeSetWithClusterRef(testClusterName) - updated := nodeSetWithClusterRef(testClusterName) - _, err := validator.ValidateUpdate(context.Background(), old, updated) - assert.NoError(t, err) - }) - - t.Run("rejects update that clears slurmClusterRefName", func(t *testing.T) { - old := nodeSetWithClusterRef(testClusterName) - updated := nodeSetWithoutClusterRef() - _, err := validator.ValidateUpdate(context.Background(), old, updated) - assert.Error(t, err) - assert.Contains(t, err.Error(), "slurmClusterRefName") - }) -} - -func TestNodeSetValidator_ValidateDelete(t *testing.T) { - validator := &NodeSetCustomValidator{} - - t.Run("always accepts delete", func(t *testing.T) { - _, err := validator.ValidateDelete(context.Background(), nodeSetWithoutClusterRef()) - assert.NoError(t, err) - }) -} - -// --- helpers --- - -func nodeSetWithClusterRef(clusterName string) *slurmv1alpha1.NodeSet { - return &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testNodeSetName, - }, - Spec: slurmv1alpha1.NodeSetSpec{ - SlurmClusterRefName: clusterName, - }, - } -} - -func nodeSetWithoutClusterRef() *slurmv1alpha1.NodeSet { - return &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testNodeSetName, - }, - } -}