Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 119 additions & 3 deletions cmd/compute-domain-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,14 @@ import (
"path"
"syscall"

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/urfave/cli/v2"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
"k8s.io/component-base/logs"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
Expand All @@ -56,7 +60,8 @@ const (
)

type Flags struct {
kubeClientConfig pkgflags.KubeClientConfig
kubeClientConfig pkgflags.KubeClientConfig
leaderElectionConfig pkgflags.LeaderElectionConfig

podName string
namespace string
Expand Down Expand Up @@ -157,6 +162,7 @@ func newApp() *cli.App {
},
}

cliFlags = append(cliFlags, flags.leaderElectionConfig.Flags()...)
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
cliFlags = append(cliFlags, featureGateConfig.Flags()...)
cliFlags = append(cliFlags, loggingConfig.Flags()...)
Expand Down Expand Up @@ -217,12 +223,19 @@ func newApp() *cli.App {
controller := NewController(config)
ctx, cancel := context.WithCancel(c.Context)
go func() {
errChan <- controller.Run(ctx)
// Fallback to standalone mode if leader election is disabled
if !config.flags.leaderElectionConfig.Enabled {
klog.Info("Leader election disabled, starting controller directly")
errChan <- controller.Run(ctx)
return
}
errChan <- runWithLeaderElection(ctx, config, controller)
}()

for {
select {
case <-sigs:
case sig := <-sigs:
klog.InfoS("Received signal, shutting down", "signal", sig.String())
cancel()
case err := <-errChan:
cancel()
Expand Down Expand Up @@ -253,6 +266,109 @@ func newApp() *cli.App {
return app
}

func runWithLeaderElection(ctx context.Context, config *Config, controller *Controller) error {
klog.Info("Leader election enabled")
// Unique identity: PodName + UUID to prevent conflicts on restarts
id := uuid.New().String()
lockID := fmt.Sprintf("%s-%s", config.flags.podName, id)
klog.InfoS("Leader election candidate registered", "lockID", lockID,
"leaseName", config.flags.leaderElectionConfig.LeaseLockName,
"leaseNamespace", config.flags.leaderElectionConfig.LeaseLockNamespace)

// electorCtx controls the lifecycle of the leader election loop
electorCtx, cancelElector := context.WithCancel(ctx)
// Standard defer to ensure resources are cleaned up on function exit
defer cancelElector()

lock := &resourcelock.LeaseLock{
LeaseMeta: metav1.ObjectMeta{
Name: config.flags.leaderElectionConfig.LeaseLockName,
Namespace: config.flags.leaderElectionConfig.LeaseLockNamespace,
},
Client: config.clientsets.Core.CoordinationV1(),
LockConfig: resourcelock.ResourceLockConfig{
Identity: lockID,
},
}

controllerErrCh := make(chan error, 1)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want this to be a buffered channel? I would think we want it to be unbuffered.

callbacks := leaderelection.LeaderCallbacks{
OnStartedLeading: func(leaderCtx context.Context) {
klog.InfoS("Became leader, starting controller", "lockID", lockID)

// ARCHITECTURE NOTE:
// We use cancelElector() to ensure that if the controller logic exits
// (either gracefully or with an error), the entire leader election loop
// terminates. This triggers ReleaseOnCancel, clearing the lease holder
// identity and allowing standby replicas to take over immediately.
//
// By returning from run() after elector.Run() finishes, we rely on
// Kubernetes to restart the Pod, ensuring a clean in-memory state
// for the next leadership term.
defer cancelElector()

// NOTE: Use leaderCtx provided by the callback.
// It is automatically cancelled if leadership is lost.
if err := controller.Run(leaderCtx); err != nil {
select {
case controllerErrCh <- err:
default:
}
Comment on lines +313 to +316
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this select necessary? I would hope that erroring here would trigger elector.Run() to fail, meaning we can / should just do a direct write here, i.e.:

controllerErrCh <- err

Which will then block until the follow up call to:

err := <-controllerErrCh

happens later (which we want to block to ensure the controller has shutdown completely before returning the error).

klog.ErrorS(err, "Controller exited with error", "lockID", lockID)
} else {
klog.InfoS("Controller exited gracefully", "lockID", lockID)
}
},
OnStoppedLeading: func() {
// ARCHITECTURE NOTE:
// We only log here. The actual shutdown of the controller is handled by the
// cancellation of the leaderCtx passed to OnStartedLeading.
// When leadership is lost, the library cancels that context, triggering
// the controller's graceful shutdown logic.
klog.Warningf("Stopped leading, lockID: %s", lockID)
},
OnNewLeader: func(identity string) {
// OnNewLeader is called when a new leader is observed.
// We ignore the case where the "new" leader is ourselves to avoid
// redundant logs during initial election or re-election.
if identity == lockID {
klog.V(6).InfoS("OnNewLeader callback: observed leader is still ourselves", "lockID", lockID)
return
}
klog.InfoS("New leader elected", "leader", identity, "currentCandidate", lockID)
},
}

elector, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
Lock: lock,
LeaseDuration: config.flags.leaderElectionConfig.LeaseDuration,
RenewDeadline: config.flags.leaderElectionConfig.RenewDeadline,
RetryPeriod: config.flags.leaderElectionConfig.RetryPeriod,
Name: config.flags.leaderElectionConfig.LeaseLockName,
Callbacks: callbacks,
ReleaseOnCancel: true, // Steps down immediately by clearing the Lease holder
})
if err != nil {
return fmt.Errorf("failed to create leader elector: %w", err)
}

// Block until electorCtx is cancelled or leadership is lost
klog.InfoS("Starting leader election loop", "lockID", lockID)
elector.Run(electorCtx)

// If exiting due to a controller failure, propagate the error to main
select {
case err := <-controllerErrCh:
if err != nil {
klog.ErrorS(err, "Process exiting due to controller failure")
return fmt.Errorf("controller execution failed: %w", err)
}
default:
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above, I don't think we want a select here, just:

if err := <-controllerErrCh; err != nil {
	klog.ErrorS(err, "Process exiting due to controller failure")
	return fmt.Errorf("controller execution failed: %w", err)
}

We can / should block here until the controller has pushed an error into this channel.
Is there ever a case we could get here without that happening?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @klueska , thanks for the review!

Actually, making the channel unbuffered and removing the select blocks would lead to deadlocks in both the error path and the graceful shutdown path.

  1. Why make(chan error, 1) and the non-blocking write are required:
    elector.Run() does not automatically return when the callback (OnStartedLeading) errors out. We must explicitly trigger cancelElector() to stop the leader election loop. If we use an unbuffered channel and a blocking write (controllerErrCh <- err), the callback goroutine will block forever waiting for a reader. Since it's blocked, defer cancelElector() is never reached, elector.Run() never returns, and the reader at the bottom is never reached. Deadlock.

  2. Why the non-blocking read (the bottom select) is required:
    You asked: "Is there ever a case we could get here without that happening?" Yes, absolutely! During a normal pod termination (e.g., receiving SIGTERM), the global ctx is cancelled, causing elector.Run() to return gracefully. In this case, controller.Run() exits without error, and nothing is pushed to controllerErrCh. If we block on <-controllerErrCh at the bottom, the process will hang forever during graceful shutdown until Kubernetes SIGKILLs it. The select + default allows us to safely check for errors without hanging during a normal exit.

The current buffered channel + non-blocking select pattern acts as a safe 'error mailbox' across goroutine boundaries, ensuring we never block the crucial cancelElector() call or the graceful shutdown flow.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. That makes sense. I'm still not 100% happy with the way it reads with these selects, but I'll defer to @shivamerla to decide if something should be changed here.

klog.InfoS("Leader election loop ended gracefully", "lockID", lockID)
return nil
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we always want to return nil here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In our updated version, we do not always return nil. We check controllerErr after elector.Run returns. If the controller failed while it was the leader, we propagate that error so the process exits with code 1. If it returns nil, it means the process received a standard SIGTERM and is exiting gracefully.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

}

func SetupHTTPEndpoint(config *Config) error {
if config.flags.metricsPath != "" {
// To collect metrics data from the metric handler itself, we
Expand Down
14 changes: 13 additions & 1 deletion deployments/helm/nvidia-dra-driver-gpu/templates/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ metadata:
labels:
{{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }}
spec:
replicas: 1
replicas: {{ .Values.controller.replicas }}
Comment thread
klueska marked this conversation as resolved.
selector:
matchLabels:
{{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "controller") | nindent 6 }}
Expand Down Expand Up @@ -87,6 +87,18 @@ spec:
{{- with .Values.controller.containers.computeDomain.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: LEADER_ELECTION_ENABLED
value: "{{ .Values.controller.leaderElection.enabled }}"
- name: LEADER_ELECTION_LEASE_LOCK_NAME
value: "{{ include "nvidia-dra-driver-gpu.name" . }}-controller"
- name: LEADER_ELECTION_LEASE_LOCK_NAMESPACE
value: "{{ include "nvidia-dra-driver-gpu.namespace" . }}"
- name: LEADER_ELECTION_LEASE_DURATION
value: "{{ .Values.controller.leaderElection.leaseDuration }}"
- name: LEADER_ELECTION_RENEW_DEADLINE
value: "{{ .Values.controller.leaderElection.renewDeadline }}"
- name: LEADER_ELECTION_RETRY_PERIOD
value: "{{ .Values.controller.leaderElection.retryPeriod }}"
{{- with .Values.controller.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ rules:
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaimtemplates"]
verbs: ["get", "list", "watch", "create", "update", "delete"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch", "update"]
Expand Down
14 changes: 14 additions & 0 deletions deployments/helm/nvidia-dra-driver-gpu/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ webhook:
caBundle: ""

controller:
replicas: 1
leaderElection:
enabled: false
leaseDuration: "15s"
renewDeadline: "10s"
retryPeriod: "2s"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did we take inspiration for these values? :)

Copy link
Copy Markdown
Contributor Author

@herb-duan herb-duan Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/kubernetes/client-go/blob/v0.34.0/tools/leaderelection/leaderelection.go#L116

type LeaderElectionConfig struct {
	// Lock is the resource that will be used for locking
	Lock rl.Interface

	// LeaseDuration is the duration that non-leader candidates will
	// wait to force acquire leadership. This is measured against time of
	// last observed ack.
	//
	// A client needs to wait a full LeaseDuration without observing a change to
	// the record before it can attempt to take over. When all clients are
	// shutdown and a new set of clients are started with different names against
	// the same leader record, they must wait the full LeaseDuration before
	// attempting to acquire the lease. Thus LeaseDuration should be as short as
	// possible (within your tolerance for clock skew rate) to avoid a possible
	// long waits in the scenario.
	//
	// Core clients default this value to 15 seconds.
	LeaseDuration time.Duration
	// RenewDeadline is the duration that the acting master will retry
	// refreshing leadership before giving up.
	//
	// Core clients default this value to 10 seconds.
	RenewDeadline time.Duration
	// RetryPeriod is the duration the LeaderElector clients should wait
	// between tries of actions.
	//
	// Core clients default this value to 2 seconds.
	RetryPeriod time.Duration

the recommended defaults in the client-go leaderelection package.

priorityClassName: "system-node-critical"
podAnnotations: {}
podSecurityContext: {}
Expand All @@ -208,6 +214,14 @@ controller:
- matchExpressions:
- key: "node-role.kubernetes.io/control-plane"
operator: "Exists"
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
nvidia-dra-driver-gpu-component: controller
Comment thread
klueska marked this conversation as resolved.
topologyKey: kubernetes.io/hostname
# Network policy settings
networkPolicy:
# If the network policy is enabled or not
Expand Down
85 changes: 85 additions & 0 deletions pkg/flags/leaderelection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Copyright 2025 NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package flags

import (
"time"

"github.com/urfave/cli/v2"
)

type LeaderElectionConfig struct {
Enabled bool
LeaseLockName string
LeaseLockNamespace string
LeaseDuration time.Duration
RenewDeadline time.Duration
RetryPeriod time.Duration
}

func (l *LeaderElectionConfig) Flags() []cli.Flag {
return []cli.Flag{
&cli.BoolFlag{
Category: "Leader election:",
Name: "leader-election-enabled",
Usage: "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.",
Value: false,
Destination: &l.Enabled,
EnvVars: []string{"LEADER_ELECTION_ENABLED"},
},
&cli.StringFlag{
Category: "Leader election:",
Name: "leader-election-lease-lock-namespace",
Usage: "The lease lock resource namespace.",
Value: "default",
Destination: &l.LeaseLockNamespace,
EnvVars: []string{"LEADER_ELECTION_LEASE_LOCK_NAMESPACE"},
},
&cli.StringFlag{
Category: "Leader election:",
Name: "leader-election-lease-lock-name",
Usage: "The lease lock resource name.",
Value: "nvidia-compute-domain-controller",
Destination: &l.LeaseLockName,
EnvVars: []string{"LEADER_ELECTION_LEASE_LOCK_NAME"},
},
&cli.DurationFlag{
Category: "Leader election:",
Name: "leader-election-lease-duration",
Usage: "The duration that non-leader candidates will wait to force acquire leadership. This is measured against time of last observed ack.",
Value: 15 * time.Second,
Destination: &l.LeaseDuration,
EnvVars: []string{"LEADER_ELECTION_LEASE_DURATION"},
},
&cli.DurationFlag{
Category: "Leader election:",
Name: "leader-election-renew-deadline",
Usage: "The duration that the acting controlplane will retry refreshing leadership before giving up.",
Value: 10 * time.Second,
Destination: &l.RenewDeadline,
EnvVars: []string{"LEADER_ELECTION_RENEW_DEADLINE"},
},
&cli.DurationFlag{
Category: "Leader election:",
Name: "leader-election-retry-period",
Usage: "The duration the LeaderElector clients should wait between tries of actions.",
Value: 2 * time.Second,
Destination: &l.RetryPeriod,
EnvVars: []string{"LEADER_ELECTION_RETRY_PERIOD"},
},
}
}
13 changes: 13 additions & 0 deletions vendor/k8s.io/client-go/tools/leaderelection/OWNERS

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading