From 31520d1104ffc79797fcc1faa4f63189cced15c8 Mon Sep 17 00:00:00 2001 From: bainaichang <3215903958@qq.com> Date: Tue, 2 Jun 2026 12:46:06 +0800 Subject: [PATCH] Use hard link for applyingUpdate to ensure idempotent rename When client.Update fails after os.Rename(k0s.tmp, k0s), the reconciler retries but k0s.tmp is gone, causing an infinite error loop that leaves the node stuck in ApplyingUpdate status. Replace the single rename with a hard link + rename sequence: create k0s.new as a hard link to k0s.tmp, then rename k0s.new to k0s. Since both k0s.tmp and k0s.new share the same inode, k0s.tmp survives the rename. If client.Update fails and the reconciler re-triggers, the checks on k0s.tmp correctly detect the pending work and the whole sequence can be safely replayed. Fixes: https://github.com/k0sproject/k0s/issues/7703 Signed-off-by: bainaichang <3215903958@qq.com> --- pkg/autopilot/constant/static.go | 1 + pkg/autopilot/controller/signal/k0s/apply.go | 56 +++++++++++++++----- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/pkg/autopilot/constant/static.go b/pkg/autopilot/constant/static.go index c3341295e7e8..63be8ff48861 100644 --- a/pkg/autopilot/constant/static.go +++ b/pkg/autopilot/constant/static.go @@ -7,6 +7,7 @@ const ( AutopilotName = "autopilot" AutopilotNamespace = "k0s-autopilot" K0sTempFilename = "k0s.tmp" + K0sTempLinkFilename = "k0s.new" CentralCordoningLabel = "autopilot.k0sproject.io/central-cordoning" K0SControlNodeModeAnnotation = "autopilot.k0sproject.io/mode" K0SControlNodeModeController = "controller" diff --git a/pkg/autopilot/controller/signal/k0s/apply.go b/pkg/autopilot/controller/signal/k0s/apply.go index a0e1478109c8..cc315eabf2cf 100644 --- a/pkg/autopilot/controller/signal/k0s/apply.go +++ b/pkg/autopilot/controller/signal/k0s/apply.go @@ -108,21 +108,41 @@ func (r *applyingUpdate) Reconcile(ctx context.Context, req cr.Request) (cr.Resu return cr.Result{}, nil } + k0sBinaryFilenamePath := filepath.Join(r.k0sBinaryDir, "k0s") updateFilenamePath := filepath.Join(r.k0sBinaryDir, apconst.K0sTempFilename) - - // Ensure that the expected file exists - if _, err := os.Stat(updateFilenamePath); errors.Is(err, os.ErrNotExist) { - return cr.Result{}, fmt.Errorf("unable to find update file '%s': %w", apconst.K0sTempFilename, err) - } - - // Ensure that the new file is executable - if err := os.Chmod(updateFilenamePath, 0755); err != nil { - return cr.Result{}, fmt.Errorf("unable to chmod update file '%s': %w", apconst.K0sTempFilename, err) - } - - // Perform the update atomically - if err := os.Rename(updateFilenamePath, filepath.Join(r.k0sBinaryDir, "k0s")); err != nil { - return cr.Result{}, fmt.Errorf("unable to update (rename) to the new file: %w", err) + updateLinkFilenamePath := filepath.Join(r.k0sBinaryDir, apconst.K0sTempLinkFilename) + + // Check if the update file still exists. If not, the rename was already + // performed in a previous reconciler run whose client.Update failed. + // In that case the file operations can be skipped and we can proceed + // directly to updating the signaling status to Restart. + if _, err := os.Stat(updateFilenamePath); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return cr.Result{}, fmt.Errorf("unable to stat update file '%s': %w", updateFilenamePath, err) + } + logger.Info("Update file already applied, skipping file operations") + } else { + // Ensure the downloaded temporary file is executable + if err := os.Chmod(updateFilenamePath, 0755); err != nil { + return cr.Result{}, fmt.Errorf("unable to chmod update file '%s': %w", updateFilenamePath, err) + } + + // Clean up any stale link file from a previous failed rename attempt + os.Remove(updateLinkFilenamePath) + + // Create k0s.new as a hard link to k0s.tmp, sharing the same + // inode. This way k0s.tmp survives the subsequent rename, + // providing idempotency: if client.Update fails and the + // reconciler is re-triggered, k0s.tmp will still exist and + // the whole sequence can be replayed. + if err := os.Link(updateFilenamePath, updateLinkFilenamePath); err != nil { + return cr.Result{}, fmt.Errorf("unable to create hard link '%s' -> '%s': %w", updateLinkFilenamePath, updateFilenamePath, err) + } + + // Atomically replace the running k0s binary with the new version + if err := os.Rename(updateLinkFilenamePath, k0sBinaryFilenamePath); err != nil { + return cr.Result{}, fmt.Errorf("unable to rename '%s' -> '%s': %w", updateLinkFilenamePath, k0sBinaryFilenamePath, err) + } } // When the k0s process has been terminated, move to 'Restart' @@ -138,5 +158,13 @@ func (r *applyingUpdate) Reconcile(ctx context.Context, req cr.Request) (cr.Resu return cr.Result{Requeue: true}, fmt.Errorf("failed to update signal node to status '%s': %w", signalData.Status.Status, err) } + // Clean up k0s.tmp after a successful apply. If the file does not exist + // (e.g. this is a retry where the file was already removed), ignore the error. + if err := os.Remove(updateFilenamePath); err != nil { + if !errors.Is(err, os.ErrNotExist) { + logger.WithError(err).Warn("Failed to remove update file") + } + } + return cr.Result{}, nil }