Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/v1alpha1/nodeset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ const (

// NodeSetSpec defines the desired state of NodeSet
type NodeSetSpec struct {
// SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to.
// Must be in the same namespace as the NodeSet.
//
// +kubebuilder:validation:Required
SlurmClusterRefName string `json:"slurmClusterRefName"`

// Replicas specifies the number of worker nodes in the NodeSet.
//
// Defaults to 1 if not specified.
Expand Down
3 changes: 2 additions & 1 deletion cmd/sconfigcontroller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ func main() {
WebhookServer: webhookServer,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "vqeyz6ae.nebius.ai",
LeaderElectionID: clusterName + ".vqeyz6ae.nebius.ai",
LeaderElectionReleaseOnCancel: true,
Cache: cache.Options{
DefaultNamespaces: map[string]cache.Config{
Expand Down Expand Up @@ -215,6 +215,7 @@ func main() {
if err = (sconfigcontroller.NewJailedConfigReconciler(
mgr.GetClient(),
mgr.GetScheme(),
clusterName,
slurmAPIClient,
jailFs,
reconfigurePollInterval,
Expand Down
6 changes: 6 additions & 0 deletions config/crd/bases/slurm.nebius.ai_nodesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2680,6 +2680,11 @@ spec:
Defaults to 1 if not specified.
format: int32
type: integer
slurmClusterRefName:
description: |-
SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to.
Must be in the same namespace as the NodeSet.
type: string
slurmd:
description: Slurmd defines the Slurm worker daemon configuration.
properties:
Expand Down Expand Up @@ -11028,6 +11033,7 @@ spec:
type: object
required:
- munge
- slurmClusterRefName
- slurmd
type: object
status:
Expand Down
2 changes: 2 additions & 0 deletions helm/nodesets/templates/nodeset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ metadata:
{{- end }}

spec:
slurmClusterRefName: {{ $.Values.slurmClusterRefName | quote }}

{{- with (.replicas | default 1) }}
replicas: {{ . }}
{{- end }}
Expand Down
28 changes: 28 additions & 0 deletions helm/nodesets/tests/basic_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,34 @@ tests:
- exists:
path: metadata.labels["app.kubernetes.io/managed-by"]

- it: should set slurmClusterRefName from top-level value
set:
slurmClusterRefName: "my-cluster"
nodesets:
- name: test-workers
slurmd:
image:
repository: "test/slurm"
resources:
cpu: "1"
memory: "1Gi"
volumes:
spool:
emptyDir: {}
jail:
emptyDir: {}
jailSubMounts: []
munge:
image:
repository: "test/munge"
resources:
cpu: "100m"
memory: "128Mi"
asserts:
- equal:
path: spec.slurmClusterRefName
value: "my-cluster"

- it: should set correct namespace
set:
nodesets:
Expand Down
5 changes: 4 additions & 1 deletion helm/nodesets/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Global settings
nameOverride: ""
fullnameOverride: ""
# Name of the SlurmCluster that the NodeSets in this release belong to.
# The SlurmCluster must be in the same namespace as the NodeSets.
slurmClusterRefName: "soperator"
# Priority Classes configuration
# Define priority classes that can be used by NodeSets
priorityClasses:
Expand Down Expand Up @@ -122,7 +125,7 @@ nodesets:
# Could be any corev1.VolumeSource
jail:
persistentVolumeClaim:
claimName: &jailPvcClaimName "jail-pvc"
claimName: &jailPvcClaimName "soperator-jail-pvc"
# Volumes being mounted inside Jail mount
# Optional, defaults to empty list
jailSubMounts:
Expand Down
8 changes: 7 additions & 1 deletion helm/slurm-cluster/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Create the name of the service account to use for exporter
*/}}
{{- define "slurm-cluster.exporter.serviceAccountName" -}}
{{- if .Values.slurmNodes.exporter.serviceAccount.create -}}
{{- default "slurm-exporter-sa" .Values.slurmNodes.exporter.serviceAccount.name }}
{{- default (printf "%s-slurm-exporter-sa" (include "slurm-cluster.name" .)) .Values.slurmNodes.exporter.serviceAccount.name }}
{{- else -}}
{{- default "default" .Values.slurmNodes.exporter.serviceAccount.name }}
{{- end -}}
Expand Down Expand Up @@ -107,3 +107,9 @@ Create the name of the role binding for slurm-controller
{{- printf "%s-slurm-controller" (include "slurm-cluster.name" .) }}
{{- end -}}

{{/*
Name of the slurm-scripts ConfigMap, built as "<slurm-cluster.name>-slurm-scripts".
*/}}
{{- define "slurm-cluster.slurmScripts.configMapName" -}}
{{- $clusterName := include "slurm-cluster.name" . -}}
{{- printf "%s-slurm-scripts" $clusterName }}
{{- end -}}
8 changes: 4 additions & 4 deletions helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ spec:
{{- end }}
{{- end }}
volumeSources:
- name: slurm-scripts
- name: {{ include "slurm-cluster.slurmScripts.configMapName" . }}
configMap:
name: slurm-scripts
name: {{ include "slurm-cluster.slurmScripts.configMapName" . }}
defaultMode: 0755
{{- range .Values.volumeSources }}
- name: {{ .name | quote }}
Expand Down Expand Up @@ -325,10 +325,10 @@ spec:
customMounts:
- name: slurm-scripts
mountPath: /opt/slurm_scripts/
volumeSourceName: slurm-scripts
volumeSourceName: {{ include "slurm-cluster.slurmScripts.configMapName" . }}
- name: slurm-scripts-jail
mountPath: /mnt/jail.upper/opt/slurm_scripts/
volumeSourceName: slurm-scripts
volumeSourceName: {{ include "slurm-cluster.slurmScripts.configMapName" . }}
{{- if .Values.slurmNodes.login.volumes.customMounts }}
{{- .Values.slurmNodes.login.volumes.customMounts | toYaml | nindent 10 }}
{{- end }}
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/templates/slurm-scripts-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
namespace: {{ .Release.Namespace }}
name: slurm-scripts
name: {{ include "slurm-cluster.slurmScripts.configMapName" . }}
labels:
app: {{ .Chart.Name }}
release: {{ .Release.Name }}
Expand Down
8 changes: 4 additions & 4 deletions helm/slurm-cluster/tests/exporter-rbac_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ tests:
of: ServiceAccount
- equal:
path: metadata.name
value: slurm-exporter-sa
value: test-cluster-slurm-exporter-sa
- equal:
path: metadata.namespace
value: NAMESPACE
Expand Down Expand Up @@ -114,7 +114,7 @@ tests:
path: subjects
content:
kind: ServiceAccount
name: slurm-exporter-sa
name: test-cluster-exporter-sa
namespace: NAMESPACE

# Test RoleBinding with custom ServiceAccount name
Expand Down Expand Up @@ -162,7 +162,7 @@ tests:
asserts:
- equal:
path: spec.slurmNodes.exporter.serviceAccountName
value: slurm-exporter-sa
value: test-cluster-exporter-sa

# Test SlurmCluster CR uses custom ServiceAccount name
- it: should set custom serviceAccountName in SlurmCluster CR
Expand Down Expand Up @@ -207,4 +207,4 @@ tests:
asserts:
- equal:
path: spec.slurmNodes.exporter.serviceAccountName
value: slurm-exporter-sa
value: test-cluster-exporter-sa
4 changes: 2 additions & 2 deletions helm/slurm-cluster/tests/volume-sources-filtering_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ tests:
# Verify correct names are preserved (slurm-scripts is always first)
- equal:
path: spec.volumeSources[0].name
value: "slurm-scripts"
value: "test-cluster-slurm-scripts"
- equal:
path: spec.volumeSources[1].name
value: "jail"
Expand Down Expand Up @@ -206,7 +206,7 @@ tests:
# Verify correct values
- equal:
path: spec.volumeSources[0].name
value: "slurm-scripts"
value: "test-cluster-slurm-scripts"
- equal:
path: spec.volumeSources[1].name
value: "simple-pvc"
Expand Down
6 changes: 6 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17766,6 +17766,11 @@ spec:
Defaults to 1 if not specified.
format: int32
type: integer
slurmClusterRefName:
description: |-
SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to.
Must be in the same namespace as the NodeSet.
type: string
slurmd:
description: Slurmd defines the Slurm worker daemon configuration.
properties:
Expand Down Expand Up @@ -26114,6 +26119,7 @@ spec:
type: object
required:
- munge
- slurmClusterRefName
- slurmd
type: object
status:
Expand Down
6 changes: 6 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17766,6 +17766,11 @@ spec:
Defaults to 1 if not specified.
format: int32
type: integer
slurmClusterRefName:
description: |-
SlurmClusterRefName is the name of the parent SlurmCluster this NodeSet belongs to.
Must be in the same namespace as the NodeSet.
type: string
slurmd:
description: Slurmd defines the Slurm worker daemon configuration.
properties:
Expand Down Expand Up @@ -26114,6 +26119,7 @@ spec:
type: object
required:
- munge
- slurmClusterRefName
- slurmd
type: object
status:
Expand Down
2 changes: 0 additions & 2 deletions internal/consts/annotation.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,4 @@ const (
AnnotationDefaultContainerName = "kubectl.kubernetes.io/default-container"
AnnotationClusterName = K8sGroupNameSoperator + "/cluster"
AnnotationActiveCheckName = K8sGroupNameSoperator + "/activecheck"

AnnotationParentalClusterRefName = K8sGroupNameSoperator + "/parental-cluster-ref"
)
2 changes: 1 addition & 1 deletion internal/controller/clustercontroller/accounting.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ func (r SlurmClusterReconciler) ReconcileAccounting(

if !isAccountingEnabled {
stepLogger.V(1).Info("Removing")
deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting)
deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting, clusterValues.Name)
if err = r.Deployment.Cleanup(stepCtx, cluster, deploymentName); err != nil {
return fmt.Errorf("cleanup accounting Deployment: %w", err)
}
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/clustercontroller/sconfigcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func (r SlurmClusterReconciler) ValidateSConfigController(
ctx,
types.NamespacedName{
Namespace: clusterValues.Namespace,
Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController),
Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController, clusterValues.Name),
},
existing,
)
Expand Down
6 changes: 3 additions & 3 deletions internal/controller/nodesetcontroller/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ func (r *NodeSetReconciler) reconcile(ctx context.Context, nodeSet *slurmv1alpha
err error
)
{
clusterName, hasClusterRef := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName]
if !hasClusterRef {
err = fmt.Errorf("getting parental cluster ref from annotations")
clusterName := nodeSet.Spec.SlurmClusterRefName
if clusterName == "" {
err = fmt.Errorf("NodeSet %q has empty spec.slurmClusterRefName", nodeSet.Name)
logger.Error(err, "No parent cluster ref found")
return ctrl.Result{}, err
}
Expand Down
18 changes: 14 additions & 4 deletions internal/controller/sconfigcontroller/jailedconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ type JailedConfigReconciler struct {
client.Client
Scheme *runtime.Scheme

clusterName string
slurmAPIClient slurmapi.Client
clock Clock
fs Fs
Expand Down Expand Up @@ -331,7 +332,10 @@ func (r *JailedConfigReconciler) reconcileWithAggregation(ctx context.Context, j
jailedConfigs := &slurmv1alpha1.JailedConfigList{}
err := r.Client.List(ctx, jailedConfigs,
client.InNamespace(jailedConfig.Namespace),
client.MatchingLabels{consts.LabelJailedAggregationKey: aggregationKey},
client.MatchingLabels{
consts.LabelJailedAggregationKey: aggregationKey,
consts.LabelInstanceKey: r.clusterName,
},
)
if err != nil {
return ctrl.Result{}, fmt.Errorf("listing JailedConfigs with aggregation key %q: %w", aggregationKey, err)
Expand Down Expand Up @@ -557,6 +561,7 @@ func (r *JailedConfigReconciler) shouldInitializeConditions(ctx context.Context,
func NewJailedConfigReconciler(
client client.Client,
scheme *runtime.Scheme,
clusterName string,
slurmAPIClient slurmapi.Client,
fs Fs,
reconfigurePollInterval time.Duration,
Expand All @@ -571,6 +576,7 @@ func NewJailedConfigReconciler(
return &JailedConfigReconciler{
Client: client,
Scheme: scheme,
clusterName: clusterName,
slurmAPIClient: slurmAPIClient,
fs: fs,
reconfigurePollInterval: reconfigurePollInterval,
Expand All @@ -592,14 +598,18 @@ func (r *JailedConfigReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurren
}); err != nil {
return err
}
clusterPredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool {
return obj.GetLabels()[consts.LabelInstanceKey] == r.clusterName
})

return ctrl.NewControllerManagedBy(mgr).
For(&slurmv1alpha1.JailedConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
For(&slurmv1alpha1.JailedConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{}, clusterPredicate)).
Watches(
&corev1.ConfigMap{},
handler.EnqueueRequestsFromMapFunc(r.findObjectsForConfigMap),
builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}),
builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}, clusterPredicate),
).
Named("jailedconfig").
Named(r.clusterName + "-jailedconfig").
WithOptions(controllerconfig.ControllerOptionsWithRateLimit(maxConcurrency, cacheSyncTimeout, 15*time.Second, 1*time.Minute)).
Complete(r)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ func newTestJailedConfigController(
sctrl := NewJailedConfigReconciler(
mgr.GetClient(),
mgr.GetScheme(),
"soperator",
apiClient,
fakeFs,
1*time.Second, // Poll interval for tests
Expand Down
Loading
Loading