From 5b9459b896ce49526be371eda961fc2cf71917ea Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Thu, 7 May 2026 16:14:02 +0200 Subject: [PATCH 1/9] feat: HTTP timeout configuration - Add IsTimeoutError/IsDeadlineExceeded helpers to cloudscale client - Configure client with tuned http.Transport (5s dial, 10s response header, 60s overall) - Wrap all .Create() call sites with IsTimeoutError checks and 90s requeue - Update reconcileNetwork and reconcileManagedNetwork to return (ctrl.Result, error) - Add comprehensive timeout handling unit tests across all controller files --- cmd/main.go | 18 ++- internal/cloudscale/client.go | 66 +++++++- internal/cloudscale/client_test.go | 82 ++++++++++ internal/controller/cloudscale_services.go | 8 + .../cloudscalecluster_controller.go | 16 +- .../cloudscalecluster_floatingip.go | 24 +-- .../cloudscalecluster_floatingip_test.go | 54 +++++-- .../cloudscalecluster_loadbalancer.go | 114 ++++++++----- .../cloudscalecluster_loadbalancer_test.go | 137 ++++++++++++---- .../controller/cloudscalecluster_network.go | 37 +++-- .../cloudscalecluster_network_test.go | 150 +++++++++++++++--- .../cloudscalemachine_controller.go | 6 +- .../controller/cloudscalemachine_server.go | 4 + .../cloudscalemachine_servergroup.go | 21 ++- .../cloudscalemachine_servergroup_test.go | 14 +- test/e2e/cloudscale_helpers.go | 6 +- test/e2e/e2e_suite_test.go | 7 +- 17 files changed, 615 insertions(+), 149 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 015214b..73770c8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,6 +35,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" ctrl "sigs.k8s.io/controller-runtime" + ctrlconfig "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" @@ -174,6 +175,21 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "cloudscale.infrastructure.cluster.x-k8s.io", + Controller: ctrlconfig.Controller{ + // MaxConcurrentReconciles is hardcoded to 1 for all controllers. This is + // intentional: CloudscaleCluster and CloudscaleMachine controllers both + // mutate shared cloudscale.ch resources (server groups, networks), and + // there is no distributed locking between them. + // + // If you increase this value, you MUST also audit (incomplete list): + // - deleteServerGroups: races with CloudscaleMachine server deletion + // - deleteNetwork: races with async LB pool member cleanup + // - reconcileServerGroup: races with cluster-level server group deletion + // + // In general, deletion ordering is not guaranteed between controllers. + // A value >1 requires explicit coordination or idempotency guarantees. + MaxConcurrentReconciles: 1, + }, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily // when the Manager ends. This requires the binary to immediately end when the // Manager is stopped, otherwise, this setting is unsafe. 
Setting this significantly
@@ -272,7 +288,7 @@ func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()

-	client := cloudscale.NewClient(token)
+	client := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout)

 	var regionInfo *cloudscale.RegionInfo
 	var flavorInfo *cloudscale.FlavorInfo
diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go
index e88b324..5033402 100644
--- a/internal/cloudscale/client.go
+++ b/internal/cloudscale/client.go
@@ -17,9 +17,13 @@ limitations under the License.
 package cloudscale

 import (
-	"context"
 	"errors"
+	"net"
+	"net/http"
+	"net/url"
+	"os"
 	"strings"
+	"time"

 	cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
 	"golang.org/x/oauth2"
@@ -40,9 +44,44 @@ type Client struct {
 	Flavors FlavorService
 }

-func NewClient(token string) *Client {
+// DefaultCloudscaleRequestTimeout is the default timeout applied to every HTTP
+// request made through this client (via http.Client.Timeout), including all Create calls.
+const DefaultCloudscaleRequestTimeout = 60 * time.Second
+
+func NewClient(token string, requestTimeout time.Duration) *Client {
 	tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token})
-	httpClient := oauth2.NewClient(context.Background(), tokenSource)
+	baseTransport := &http.Transport{
+		DialContext: (&net.Dialer{
+			Timeout:   5 * time.Second,
+			KeepAlive: 30 * time.Second,
+		}).DialContext,
+
+		TLSHandshakeTimeout:   5 * time.Second,
+		ResponseHeaderTimeout: 10 * time.Second,
+
+		// ForceAttemptHTTP2 must be set explicitly because a custom DialContext
+		// otherwise disables the transport's automatic HTTP/2 support.
+		ForceAttemptHTTP2: true,
+		HTTP2: &http.HTTP2Config{
+			// Ping connections after 5s without frames; close them if no
+			// ping reply arrives within 3s.
+			SendPingTimeout: 5 * time.Second,
+			PingTimeout:     3 * time.Second,
+		},

+		IdleConnTimeout:     90 * time.Second,
+		MaxIdleConns:        50,
+		MaxIdleConnsPerHost: 50,
+		MaxConnsPerHost:     0, // 0 means no per-host connection limit
+	}
+
+	httpClient := &http.Client{
+		Timeout: requestTimeout,
+		Transport: &oauth2.Transport{
+			Source: tokenSource,
+			Base:   baseTransport,
+		},
+	}

 	sdkClient := cloudscalesdk.NewClient(httpClient)

 	return &Client{
@@ -82,3 +121,28 @@ func IsFloatingIPNoPublicInterface(err error) bool {
 	}
 	return strings.Contains(err.Error(), "does not have a public interface with an IPv4 address")
 }
+
+// IsTimeoutError reports whether err indicates the HTTP request timed out
+// before receiving a response.
+func IsTimeoutError(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	var urlErr *url.Error
+	if errors.As(err, &urlErr) {
+		return urlErr.Timeout()
+	}
+	if errors.Is(err, os.ErrDeadlineExceeded) {
+		return true
+	}
+	return false
+}
+
+// IsDeadlineExceeded reports whether err indicates a deadline was exceeded.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+	return errors.Is(err, os.ErrDeadlineExceeded)
+}
diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go
index 2fa1fde..48c6f38 100644
--- a/internal/cloudscale/client_test.go
+++ b/internal/cloudscale/client_test.go
@@ -17,8 +17,12 @@ limitations under the License.
 package cloudscale

 import (
+	"errors"
 	"fmt"
+	"net/url"
+	"os"
 	"testing"
+	"time"

 	cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
 	. 
"github.com/onsi/gomega" @@ -46,3 +50,81 @@ func TestIsNotFound(t *testing.T) { }) } } + +func TestIsTimeoutError(t *testing.T) { + tests := []struct { + name string + err error + expected bool + }{ + {"nil error returns false", nil, false}, + { + "url.Error with Timeout=true returns true", + &url.Error{Op: "Post", URL: "https://api.example.com/v1/servers", Err: os.ErrDeadlineExceeded}, + true, + }, + { + "url.Error with Timeout=false returns false", + &url.Error{Op: "Get", URL: "https://api.example.com/v1/servers", Err: fmt.Errorf("connection refused")}, + false, + }, + { + "wrapped url.Error with Timeout=true returns true", + fmt.Errorf("outer: %w", &url.Error{Op: "Post", URL: "https://api.example.com/v1/servers", Err: os.ErrDeadlineExceeded}), + true, + }, + { + "os.ErrDeadlineExceeded returns true", + os.ErrDeadlineExceeded, + true, + }, + { + "generic error returns false", + fmt.Errorf("some other error"), + false, + }, + { + "ErrorResponse with 500 returns false", + &cloudscalesdk.ErrorResponse{StatusCode: 500}, + false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + result := IsTimeoutError(tt.err) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestNewClient_ReturnsNonNilClient(t *testing.T) { + g := NewWithT(t) + client := NewClient("fake-token", 30*time.Second) + g.Expect(client).ToNot(BeNil()) + g.Expect(client.LoadBalancers).ToNot(BeNil()) + g.Expect(client.Servers).ToNot(BeNil()) + g.Expect(client.Networks).ToNot(BeNil()) +} + +func TestIsDeadlineExceeded(t *testing.T) { + tests := []struct { + name string + err error + expected bool + }{ + {"nil error returns false", nil, false}, + {"os.ErrDeadlineExceeded returns true", os.ErrDeadlineExceeded, true}, + {"wrapped os.ErrDeadlineExceeded returns true", errors.Join(os.ErrDeadlineExceeded), true}, + {"generic error returns false", fmt.Errorf("timeout"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + result := IsDeadlineExceeded(tt.err) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go index f1835bb..ca472bc 100644 --- a/internal/controller/cloudscale_services.go +++ b/internal/controller/cloudscale_services.go @@ -3,6 +3,7 @@ package controller import ( "context" "fmt" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ -10,6 +11,13 @@ import ( "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +const ( + // CreateTimeoutRequeueInterval is the requeue interval after an HTTP timeout + // on a Create call. This MUST be longer than the HTTP client timeout (60s) so the + // original request has time to complete before we retry. + CreateTimeoutRequeueInterval = 90 * time.Second +) + // getListService is satisfied by all cloudscale SDK resource services. 
type getListService[T any] interface { Get(ctx context.Context, id string) (*T, error) diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 1f6b058..f66c73e 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -99,7 +99,7 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token) + cloudscaleClient := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) clusterScope, err := scope.NewClusterScope(scope.ClusterScopeParams{ Client: r.Client, @@ -135,11 +135,15 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust // update ready conditions upon returning from this function based on updated clusterScope. defer r.setReadyCondition(clusterScope) - if err := r.reconcileNetwork(ctx, clusterScope); err != nil { + result, err := r.reconcileNetwork(ctx, clusterScope) + if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling network: %w", err) } + if !result.IsZero() { + return result, nil + } - result, err := r.reconcileLoadBalancer(ctx, clusterScope) + result, err = r.reconcileLoadBalancer(ctx, clusterScope) if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer: %w", err) } @@ -147,9 +151,13 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust return result, nil } - if err := r.reconcileFloatingIP(ctx, clusterScope); err != nil { + result, err = r.reconcileFloatingIP(ctx, clusterScope) + if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling floating IP: %w", err) } + if !result.IsZero() { + return result, nil + } // Mark infrastructure as provisioned when all resources exist if clusterScope.CloudscaleCluster.Status.Initialization == nil { diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index a92cb85..e7e81e8 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -25,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" @@ -34,11 +35,11 @@ import ( // reconcileFloatingIP ensures the floating IP exists and is assigned to the correct target. // When no floating IP is configured, this sets the condition to true and returns. 
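
Every Create call site below repeats the same four-line timeout check. A possible follow-up (not in this patch; the helper name and exact signature are hypothetical) would centralize it next to CreateTimeoutRequeueInterval:

```go
// requeueOnCreateTimeout converts an HTTP timeout from a Create call into a
// delayed requeue and wraps any other error, mirroring the repeated block.
func requeueOnCreateTimeout(clusterScope *scope.ClusterScope, err error, what string) (ctrl.Result, error) {
	if cloudscale.IsTimeoutError(err) {
		clusterScope.Info(what+" creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval)
		return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil
	}
	return ctrl.Result{}, fmt.Errorf("creating %s: %w", what, err)
}
```

Each call site would then collapse to `if err != nil { return requeueOnCreateTimeout(clusterScope, err, "floating IP") }`.
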
-func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (reterr error) { +func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { fipSpec := clusterScope.CloudscaleCluster.Spec.FloatingIP if fipSpec == nil { r.setCondition(clusterScope, infrastructurev1beta2.FloatingIPReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.FloatingIPDisabledReason, "") - return nil + return ctrl.Result{}, nil } defer func() { @@ -51,7 +52,8 @@ func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, c // Pre-existing floating IP: just look it up and use its address if fipSpec.Address != "" { - return r.reconcilePreExistingFloatingIP(ctx, clusterScope, fipSpec.Address) + err := r.reconcilePreExistingFloatingIP(ctx, clusterScope, fipSpec.Address) + return ctrl.Result{}, err } // Managed floating IP: create if needed, then assign @@ -74,7 +76,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingFloatingIP(ctx context return r.ensureFloatingIPAssignment(ctx, clusterScope, fip) } -func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) error { +func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) (_ ctrl.Result, reterr error) { tags := clusterOwnershipTags(clusterScope.CloudscaleCluster) clusterScope.Info("reconcile managed floating IP") @@ -88,17 +90,17 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con tags, ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.FloatingIP = id if id != "" { // Existing floating IP: ensure it's assigned to the right target and set endpoint if err := r.ensureFloatingIPAssignment(ctx, clusterScope, fip); err != nil { - return err + return ctrl.Result{}, err } r.setControlPlaneEndpointFromFIP(clusterScope, fip) - return nil + return ctrl.Result{}, nil } // Create new floating IP @@ -134,7 +136,11 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con clusterScope.Info("Creating floating IP", "ipVersion", ipVersion, "target", target) fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(ctx, req) if err != nil { - return fmt.Errorf("creating floating IP: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating floating IP: %w", err) } ip := fip.IP() @@ -144,7 +150,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con "Created floating IP %s", ip) r.setControlPlaneEndpointFromFIP(clusterScope, fip) - return nil + return ctrl.Result{}, nil } type floatingIPTarget struct { diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index a9426e1..55f19c8 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -19,6 +19,8 @@ package controller import ( "context" "fmt" + "net/url" + "os" "testing" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ -83,7 
+85,7 @@ func TestReconcileFloatingIP_Disabled(t *testing.T) { // No FloatingIP spec = disabled r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.FloatingIPReadyCondition) @@ -110,7 +112,7 @@ func TestReconcileFloatingIP_PreExisting(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.FloatingIP).To(Equal("1.2.3.4")) @@ -136,7 +138,7 @@ func TestReconcileFloatingIP_ErrorSetsConditionFalse(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.FloatingIPReadyCondition) @@ -249,7 +251,7 @@ func TestReconcilePreExistingFloatingIP_RegionMismatchErrors(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("lpg")) @@ -300,7 +302,7 @@ func TestReconcilePreExistingFloatingIP_NoPublicInterfaceSetsConditionAndEvent(t } r := newFIPTestReconciler(cpMachine) - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("public interface")) g.Expect(err.Error()).To(ContainSubstring("control-plane machine template")) @@ -423,7 +425,7 @@ func TestReconcileManagedFloatingIP_ExistingFIPEnsuresAssignment(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).To(Equal("10.0.0.1")) @@ -452,7 +454,7 @@ func TestReconcileManagedFloatingIP_CreatesIPv4(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq).ToNot(BeNil()) @@ -488,7 +490,7 @@ func TestReconcileManagedFloatingIP_CreatesIPv6(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.IPVersion).To(Equal(6)) @@ -512,7 +514,7 @@ func TestReconcileManagedFloatingIP_CreateError(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("creating floating IP")) @@ -539,7 +541,7 @@ func TestReconcileManagedFloatingIP_AssignsToLBTarget(t 
*testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq).ToNot(BeNil()) @@ -932,6 +934,38 @@ func TestDeleteFloatingIP_DeleteError(t *testing.T) { g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.FloatingIPErrorReason)) } +// --- Timeout handling tests for Create() calls --- + +func TestReconcileManagedFloatingIP_CreateTimeoutRequeues(t *testing.T) { + g := NewWithT(t) + + fipService := &mockFloatingIPService{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.FloatingIP, error) { + return nil, nil + }, + createFn: func(ctx context.Context, req *cloudscalesdk.FloatingIPCreateRequest) (*cloudscalesdk.FloatingIP, error) { + if req.IPVersion == 4 { + // Simulate timeout via context deadline exceeded wrapped in url.Error + return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/floating_ips", Err: os.ErrDeadlineExceeded} + } + // IPv6 path: return success so we can verify timeout path above is hit first + return &cloudscalesdk.FloatingIP{Network: "2001:db8::1/128"}, nil + }, + } + + clusterScope := newFIPTestClusterScope(fipService) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = "lb-uuid" + fipSpec := &infrastructurev1beta2.FloatingIPSpec{} + + r := newFIPTestReconciler() + + result, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), + "Should requeue after CreateTimeoutRequeueInterval on timeout error") +} + // --- Mock FloatingIPService --- type mockFloatingIPService struct { diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index e6666bd..3aa5ce3 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -65,8 +65,10 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, }() // 1. Reconcile the load balancer itself - if err := r.reconcileLB(ctx, clusterScope); err != nil { + if result, err := r.reconcileLB(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer: %w", err) + } else if !result.IsZero() { + return result, nil } // Wait for LB to be running before creating pool/listener @@ -83,27 +85,41 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, if lb.Status != LoadBalancerRunningStatus { clusterScope.Info("Waiting for load balancer to be running", "status", lb.Status) lbPending = true - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + requeueAfter := 5 * time.Second + if lb.Status == "error" || lb.Status == "degraded" { + // During bootstrap, error/degraded is expected because the health + // monitor checks an empty pool (no CP machines ready yet). + requeueAfter = 30 * time.Second + } + return ctrl.Result{RequeueAfter: requeueAfter}, nil } // 2. Reconcile the pool - if err := r.reconcileLBPool(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBPool(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer pool: %w", err) + } else if !result.IsZero() { + return result, nil } // 3. 
Reconcile the listener - if err := r.reconcileLBListener(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBListener(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer listener: %w", err) + } else if !result.IsZero() { + return result, nil } // 4. Reconcile the health monitor - if err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err) + } else if !result.IsZero() { + return result, nil } // 5. Reconcile the members - if err := r.reconcileLBMembers(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBMembers(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer members: %w", err) + } else if !result.IsZero() { + return result, nil } // 6. Set the control plane endpoint from the VIP @@ -126,7 +142,7 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, } // reconcileLB ensures the load balancer exists. -func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerID, "load balancer", @@ -135,11 +151,11 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerID = id if id != "" { - return nil + return ctrl.Result{}, nil } // Create new load balancer @@ -161,7 +177,7 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc if lbSpec.Network != "" { subnetID, err := lbPrivateNetworkSubnetID(clusterScope) if err != nil { - return err + return ctrl.Result{}, err } req.VIPAddresses = &[]cloudscalesdk.VIPAddressRequest{ {Subnet: subnetID}, @@ -172,18 +188,22 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc clusterScope.Info("Creating load balancer", "zone", zone, "flavor", lbSpec.Flavor) lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(ctx, req) if err != nil { - return fmt.Errorf("creating load balancer: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Load balancer creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerID = lb.UUID clusterScope.Info("Created load balancer", "loadBalancerID", lb.UUID) r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "LoadBalancerCreated", "CreateLoadBalancer", "Created load balancer %s in zone %s", lb.UUID, zone) - return nil + return ctrl.Result{}, nil } // reconcileLBPool ensures the load balancer pool exists. 
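
The status-dependent backoff added to reconcileLoadBalancer above could also be expressed as a small pure function, which is straightforward to table-test; a sketch (a hypothetical extraction, not part of this patch):

```go
// lbRequeueInterval returns how long to wait before polling the load balancer
// status again: "error"/"degraded" are expected while the pool is still empty
// during bootstrap, so they back off longer than the normal creation poll.
func lbRequeueInterval(status string) time.Duration {
	switch status {
	case "error", "degraded":
		return 30 * time.Second
	default: // e.g. "creating"
		return 5 * time.Second
	}
}
```
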
-func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "load balancer pool", @@ -192,11 +212,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = id if id != "" { - return nil + return ctrl.Result{}, nil } // Create new pool @@ -215,18 +235,22 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust clusterScope.Info("Creating load balancer pool", "algorithm", algorithm) pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(ctx, req) if err != nil { - return fmt.Errorf("creating load balancer pool: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Load balancer pool creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer pool: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = pool.UUID clusterScope.Info("Created load balancer pool", "poolID", pool.UUID) r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "PoolCreated", "CreateLoadBalancerPool", "Created load balancer pool %s", pool.UUID) - return nil + return ctrl.Result{}, nil } // reconcileLBListener ensures the load balancer listener exists. 
-func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID, "load balancer listener", @@ -235,11 +259,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = id if id != "" { - return nil + return ctrl.Result{}, nil } apiServerPort := int(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.APIServerPort) @@ -256,7 +280,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c clusterScope.Info("Creating load balancer listener", "port", apiServerPort) listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(ctx, req) if err != nil { - return fmt.Errorf("creating load balancer listener: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Load balancer listener creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer listener: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = listener.UUID @@ -264,12 +292,12 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "ListenerCreated", "CreateLoadBalancerListener", "Created load balancer listener %s on port %d", listener.UUID, apiServerPort) - return nil + return ctrl.Result{}, nil } // reconcileLBHealthMonitor ensures the load balancer health monitor exists. // The health monitor performs TCP health checks on the API server port. 
-func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID, "load balancer health monitor", @@ -278,11 +306,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = id if id != "" { - return nil + return ctrl.Result{}, nil } healthMonitorSpec := clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.HealthMonitor @@ -302,7 +330,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte clusterScope.Info("Creating load balancer health monitor", "type", "tcp", "pool", clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "spec", healthMonitorSpec) monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(ctx, req) if err != nil { - return fmt.Errorf("creating load balancer health monitor: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Load balancer health monitor creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer health monitor: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = monitor.UUID @@ -310,20 +342,20 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "HealthMonitorCreated", "CreateLoadBalancerHealthMonitor", "Created load balancer health monitor %s", monitor.UUID) - return nil + return ctrl.Result{}, nil } -func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { // Fetch current members from the load balancer currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) if err != nil { - return fmt.Errorf("failed to get current load balancer members: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get current load balancer members: %w", err) } // Fetch control plane machines as desired members desiredMembers, err := r.getDesiredLoadBalancerMembers(ctx, clusterScope) if err != nil { - return fmt.Errorf("failed to get desired load balancer members: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get desired load balancer members: %w", err) } clusterScope.V(2).Info("reconcileLBMembers", "currentMembers", currentMembers, "desiredMembers", desiredMembers) @@ -343,12 +375,14 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl for _, desired := range desiredMembers { current, exists := currentByName[desired.Name] if !exists { - if err := r.createLoadBalancerMember(ctx, clusterScope, desired); err != nil { - return fmt.Errorf("failed to create load balancer member: %w", err) + if result, err := r.createLoadBalancerMember(ctx, 
clusterScope, desired); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to create load balancer member: %w", err) + } else if !result.IsZero() { + return result, nil } } else if current.Address != desired.Address { if err := r.updateLoadBalancerMember(ctx, clusterScope, current.UUID, desired.Address); err != nil { - return fmt.Errorf("failed to update load balancer member: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to update load balancer member: %w", err) } } } @@ -357,12 +391,12 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl for _, member := range currentMembers { if _, exists := desiredByName[member.Name]; !exists { if err := r.deleteLoadBalancerMember(ctx, clusterScope, member); err != nil { - return fmt.Errorf("failed to delete load balancer member: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to delete load balancer member: %w", err) } } } - return nil + return ctrl.Result{}, nil } func (r *CloudscaleClusterReconciler) getDesiredLoadBalancerMembers(ctx context.Context, clusterScope *scope.ClusterScope) ([]cloudscalesdk.LoadBalancerPoolMemberRequest, error) { @@ -470,16 +504,20 @@ func lbPrivateNetworkSubnetID(clusterScope *scope.ClusterScope) (string, error) return ns.SubnetID, nil } -func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMemberRequest) error { +func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMemberRequest) (ctrl.Result, error) { clusterScope.V(2).Info("Creating load balancer member", "member", member) cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) if err != nil { - return fmt.Errorf("creating load balancer member: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Load balancer member creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer member: %w", err) } clusterScope.V(2).Info("Created load balancer member", "member", member) clusterScope.CloudscaleCluster.Status.LoadBalancerMemberIDs = append(clusterScope.CloudscaleCluster.Status.LoadBalancerMemberIDs, cm.UUID) - return nil + return ctrl.Result{}, nil } func (r *CloudscaleClusterReconciler) updateLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, memberUUID, newAddress string) error { diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index fa34162..009eb56 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -19,6 +19,7 @@ package controller import ( "context" "testing" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" @@ -169,7 +170,7 @@ func TestReconcileLB_CreatesLoadBalancer(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("lb-uuid-123")) @@ -199,7 +200,7 @@ func TestReconcileLB_SkipsIfAlreadyExists(t 
*testing.T) { r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("existing-lb-uuid")) @@ -226,7 +227,7 @@ func TestReconcileLB_FindsExistingByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("found-lb-uuid")) @@ -250,7 +251,7 @@ func TestReconcileLB_ErrorsOnMultipleByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 load balancers matching tag filter")) @@ -283,7 +284,7 @@ func TestReconcileLB_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdLB).To(BeTrue(), "Should create a new LB when old one was deleted") @@ -309,7 +310,7 @@ func TestReconcileLB_UsesCustomFlavor(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.Flavor).To(Equal("lb-flex-2")) @@ -343,7 +344,7 @@ func TestReconcileLBPool_CreatesPool(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("pool-uuid-123")) @@ -374,7 +375,7 @@ func TestReconcileLBPool_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("existing-pool-uuid")) @@ -401,7 +402,7 @@ func TestReconcileLBPool_FindsExistingByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("found-pool-uuid")) @@ -425,7 +426,7 @@ func TestReconcileLBPool_ErrorsOnMultipleByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer pools matching tag filter")) @@ -459,7 +460,7 @@ func TestReconcileLBPool_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdPool).To(BeTrue(), "Should create a new pool when old one was deleted") @@ -487,7 +488,7 @@ func TestReconcileLBPool_UsesCustomAlgorithm(t 
*testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.Algorithm).To(Equal("least_connections")) @@ -520,7 +521,7 @@ func TestReconcileLBListener_CreatesListener(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("listener-uuid-123")) @@ -551,7 +552,7 @@ func TestReconcileLBListener_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("existing-listener-uuid")) @@ -578,7 +579,7 @@ func TestReconcileLBListener_FindsExistingByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("found-listener-uuid")) @@ -602,7 +603,7 @@ func TestReconcileLBListener_ErrorsOnMultipleByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer listeners matching tag filter")) @@ -636,7 +637,7 @@ func TestReconcileLBListener_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdListener).To(BeTrue(), "Should create a new listener when old one was deleted") @@ -664,7 +665,7 @@ func TestReconcileLBListener_UsesCustomPort(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.ProtocolPort).To(Equal(8443)) @@ -696,7 +697,7 @@ func TestReconcileLBHealthMonitor_CreatesMonitor(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("hm-uuid-123")) @@ -729,7 +730,7 @@ func TestReconcileLBHealthMonitor_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("existing-hm-uuid")) @@ -756,7 +757,7 @@ func TestReconcileLBHealthMonitor_FindsExistingByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := 
r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("found-hm-uuid")) @@ -780,7 +781,7 @@ func TestReconcileLBHealthMonitor_ErrorsOnMultipleByTag(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer health monitors matching tag filter")) @@ -814,7 +815,7 @@ func TestReconcileLBHealthMonitor_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdHM).To(BeTrue(), "Should create a new health monitor when old one was deleted") @@ -845,7 +846,7 @@ func TestReconcileLBHealthMonitor_UsesCustomThresholds(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.DelayS).To(Equal(10)) @@ -911,6 +912,84 @@ func TestReconcileLoadBalancer_WaitsForLBRunning(t *testing.T) { g.Expect(result.RequeueAfter).ToNot(BeZero(), "Should requeue when LB is not running") } +func TestReconcileLoadBalancer_ErrorStatusHasLongerRequeue(t *testing.T) { + g := NewWithT(t) + + lbService := &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{ + UUID: id, + Status: "error", + }, nil + }, + } + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: lbService, + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + + r := newTestReconciler() + + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(30*time.Second), "error status should requeue after 30s") +} + +func TestReconcileLoadBalancer_DegradedStatusHasLongerRequeue(t *testing.T) { + g := NewWithT(t) + + lbService := &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{ + UUID: id, + Status: "degraded", + }, nil + }, + } + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: lbService, + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + + r := newTestReconciler() + + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(30*time.Second), "degraded status should requeue after 30s") +} + +func TestReconcileLoadBalancer_CreatingStatusHasShorterRequeue(t *testing.T) { + g := NewWithT(t) + + lbService := &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{ + UUID: id, + Status: "creating", + }, nil + }, + } + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: lbService, + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + 
+ r := newTestReconciler() + + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(5*time.Second), "creating status should requeue after 5s") +} + func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { g := NewWithT(t) @@ -1158,7 +1237,7 @@ func TestReconcileLBMembers_AddsMissingMember(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdReq).ToNot(BeNil()) @@ -1195,7 +1274,7 @@ func TestReconcileLBMembers_RemovesStaleMember(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(deletedMemberID).To(Equal("stale-member-uuid")) @@ -1247,7 +1326,7 @@ func TestReconcileLBMembers_UpdatesChangedAddress(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(updatedMemberID).To(Equal("member-uuid-1")) @@ -1304,7 +1383,7 @@ func TestReconcileLBMembers_NoopWhenInSync(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) } @@ -1440,7 +1519,7 @@ func TestCreateLoadBalancerMember_AppendsToStatus(t *testing.T) { r := newTestReconciler() - err := r.createLoadBalancerMember(context.Background(), clusterScope, cloudscalesdk.LoadBalancerPoolMemberRequest{ + _, err := r.createLoadBalancerMember(context.Background(), clusterScope, cloudscalesdk.LoadBalancerPoolMemberRequest{ Name: "cp-machine-1", Address: "10.0.0.1", }) diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 381d9de..e3a6daa 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -25,6 +25,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" @@ -33,7 +34,7 @@ import ( // reconcileNetwork orchestrates network and subnet provisioning for all networks // defined in spec.networks. A single NetworkReadyCondition covers all networks. 
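
The `!result.IsZero()` checks threaded through these reconcilers rely on controller-runtime's `Result.IsZero`, which treats only the zero value as "continue with the next step". A self-contained sketch of those semantics:

```go
package main

import (
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

func main() {
	var done ctrl.Result // zero value: nothing to propagate, keep reconciling
	retry := ctrl.Result{RequeueAfter: 90 * time.Second}

	fmt.Println(done.IsZero())  // true  -> fall through to the next step
	fmt.Println(retry.IsZero()) // false -> return early and requeue
}
```
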
-func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (reterr error) { +func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { defer func() { if reterr != nil { r.setCondition(clusterScope, infrastructurev1beta2.NetworkReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.NetworkErrorReason, reterr.Error()) @@ -43,22 +44,26 @@ func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clus }() if len(clusterScope.CloudscaleCluster.Spec.Networks) == 0 { - return fmt.Errorf("no networks defined in spec") + return ctrl.Result{}, fmt.Errorf("no networks defined in spec") } for _, netSpec := range clusterScope.CloudscaleCluster.Spec.Networks { if netSpec.UUID != "" { if err := r.reconcilePreExistingNetwork(ctx, clusterScope, netSpec); err != nil { - return fmt.Errorf("reconciling pre-existing network %q: %w", netSpec.Name, err) + return ctrl.Result{}, fmt.Errorf("reconciling pre-existing network %q: %w", netSpec.Name, err) } } else { - if err := r.reconcileManagedNetwork(ctx, clusterScope, netSpec); err != nil { - return fmt.Errorf("reconciling managed network %q: %w", netSpec.Name, err) + result, err := r.reconcileManagedNetwork(ctx, clusterScope, netSpec) + if err != nil { + return ctrl.Result{}, fmt.Errorf("reconciling managed network %q: %w", netSpec.Name, err) + } + if !result.IsZero() { + return result, nil } } } - return nil + return ctrl.Result{}, nil } // reconcilePreExistingNetwork validates a pre-existing network exists and discovers its subnet. @@ -93,7 +98,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingNetwork(ctx context.Co } // reconcileManagedNetwork ensures a managed network and its subnet exist. 
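
A timed-out Create is not necessarily a failed Create: the API may finish provisioning after the client gives up. reconcileManagedNetwork stays idempotent because each pass first searches by ownership tag (via ensureResource) and only issues Create when nothing matches. A toy model of that flow (names are illustrative, not the repo's real signatures):

```go
package main

import "fmt"

// ensure models lookup-then-create: lookup stands in for the tag-filtered
// search, create for the SDK Create call.
func ensure(lookup func() string, create func() (string, error)) (string, error) {
	if id := lookup(); id != "" {
		return id, nil // adopt what an earlier, timed-out reconcile created
	}
	return create()
}

func main() {
	// Second reconcile after a timeout: the tag search finds the network,
	// so no duplicate Create is issued.
	id, _ := ensure(
		func() string { return "net-uuid-found-by-tag" },
		func() (string, error) { return "", fmt.Errorf("unreachable here") },
	)
	fmt.Println(id)
}
```
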
-func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) error { +func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) (_ ctrl.Result, reterr error) { ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus(netSpec.Name) // Reconcile the network resource @@ -112,7 +117,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex tags, ) if err != nil { - return err + return ctrl.Result{}, err } if resolvedNetworkID == "" { @@ -129,7 +134,11 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }, }) if err != nil { - return fmt.Errorf("creating network: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Network creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating network: %w", err) } resolvedNetworkID = network.UUID clusterScope.Info("Created network", "name", netSpec.Name, "networkID", network.UUID) @@ -151,7 +160,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex tags, ) if err != nil { - return err + return ctrl.Result{}, err } if resolvedSubnetID == "" { @@ -166,7 +175,11 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }, }) if err != nil { - return fmt.Errorf("creating subnet: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Subnet creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating subnet: %w", err) } resolvedSubnetID = subnet.UUID clusterScope.Info("Created subnet", "name", netSpec.Name, "subnetID", subnet.UUID) @@ -175,7 +188,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex } r.setNetworkStatus(clusterScope, netSpec.Name, resolvedNetworkID, resolvedSubnetID, netSpec.CIDR, true) - return nil + return ctrl.Result{}, nil } // deleteNetwork deletes all managed networks. Pre-existing networks are left untouched. 
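
The unit tests below simulate timeouts with hand-built `*url.Error` values. For reference, a stalled server really does produce that shape; a standalone stdlib-only sketch (the 100ms timeout is illustrative):

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"
)

func main() {
	// The handler stalls longer than the client is willing to wait.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(500 * time.Millisecond)
	}))
	defer srv.Close()

	client := &http.Client{Timeout: 100 * time.Millisecond}
	_, err := client.Get(srv.URL)

	// err is a *url.Error whose Timeout() reports true -- the first branch
	// IsTimeoutError checks.
	fmt.Println(err)
}
```
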
diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index ce12ef9..6769f6b 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -19,6 +19,8 @@ package controller import ( "context" "fmt" + "net/url" + "os" "testing" "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ -99,7 +101,7 @@ func TestReconcileNetwork_CreatesBothResources(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -144,7 +146,7 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -171,7 +173,7 @@ func TestReconcileNetwork_NetworkErrorStopsSubnet(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("api error")) @@ -195,7 +197,7 @@ func TestReconcileNetwork_SubnetErrorSurfaced(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("subnet api error")) @@ -232,7 +234,7 @@ func TestReconcileNetwork_FindsByTag(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -256,7 +258,7 @@ func TestReconcileNetwork_ErrorsOnMultipleNetworks(t *testing.T) { clusterScope := newTestClusterScope(networkService, &mockSubnetService{}) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 network/tests matching tag filter")) @@ -292,7 +294,7 @@ func TestReconcileNetwork_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(created).To(BeTrue(), "Should create a new network when old one was deleted") @@ -335,7 +337,7 @@ func TestReconcileNetwork_SubnetFindsByTag(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -367,7 +369,7 @@ func 
TestReconcileNetwork_SubnetErrorsOnMultiple(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 subnet/tests matching tag filter")) @@ -403,7 +405,7 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(created).To(BeTrue(), "Should create a new subnet when old one was deleted") @@ -434,7 +436,7 @@ func TestReconcileNetwork_CustomCIDR(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.CIDR).To(Equal("192.168.0.0/16")) @@ -462,7 +464,7 @@ func TestReconcileNetwork_ExplicitGateway(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.GatewayAddress).To(Equal("10.0.0.254")) @@ -659,7 +661,7 @@ func TestReconcileNetwork_PreExistingCachedShortCircuits(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -697,7 +699,7 @@ func TestReconcileNetwork_PreExistingReDiscoversWhenCIDRMissing(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -733,7 +735,7 @@ func TestReconcileNetwork_PreExistingFetchesAndSetsStatus(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -759,7 +761,7 @@ func TestReconcileNetwork_PreExistingGetError(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("network not found")) @@ -789,7 +791,7 @@ func TestReconcileNetwork_PreExistingZoneMismatchErrors(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("lpg1")) @@ -819,12 +821,124 @@ func TestReconcileNetwork_PreExistingNoSubnetsErrors(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("has no subnets")) } +// --- Timeout handling tests for Create() calls --- + +func 
TestReconcileNetwork_NetworkCreateTimeoutRequeues(t *testing.T) {
+	g := NewWithT(t)
+
+	createCalled := false
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) {
+			createCalled = true
+			return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: os.ErrDeadlineExceeded}
+		},
+	}
+	subnetService := &mockSubnetService{}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	result, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval),
+		"Should requeue after CreateTimeoutRequeueInterval on timeout error")
+	g.Expect(createCalled).To(BeTrue(), "Network create should have been called")
+}
+
+func TestReconcileNetwork_NetworkCreateGenericErrorPropagates(t *testing.T) {
+	g := NewWithT(t)
+
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) {
+			return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: fmt.Errorf("connection refused")}
+		},
+	}
+	subnetService := &mockSubnetService{}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	_, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("creating network"))
+}
+
+func TestReconcileNetwork_SubnetCreateTimeoutRequeues(t *testing.T) {
+	g := NewWithT(t)
+
+	var netCreated bool
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) {
+			netCreated = true
+			return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil
+		},
+	}
+	subnetService := &mockSubnetService{
+		createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) {
+			return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/subnets", Err: os.ErrDeadlineExceeded}
+		},
+	}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	result, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(netCreated).To(BeTrue(), "Network should have been created before hitting subnet")
+	g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval),
+		"Should requeue after CreateTimeoutRequeueInterval on timeout error")
+}
+
+func TestReconcileNetwork_SubnetCreateGenericErrorPropagates(t *testing.T) {
+	g := NewWithT(t)
+
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) {
+			return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil
+		},
+	}
+	subnetService := &mockSubnetService{
+		createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) {
+			return nil, fmt.Errorf("quota exceeded")
+		},
+	}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	_, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("creating subnet"))
+}
+
+func TestReconcileNetwork_NetworkCreateWrappedTimeoutRequeues(t *testing.T) {
+	g := NewWithT(t)
+
+	networkService := 
&mockNetworkService{ + createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + return nil, fmt.Errorf("outer: %w", &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: os.ErrDeadlineExceeded}) + }, + } + subnetService := &mockSubnetService{} + + clusterScope := newTestClusterScope(networkService, subnetService) + r := newTestReconciler() + + result, err := r.reconcileNetwork(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), + "Should requeue after CreateTimeoutRequeueInterval even when error is wrapped") +} + // --- Mock services --- type mockNetworkService struct { diff --git a/internal/controller/cloudscalemachine_controller.go b/internal/controller/cloudscalemachine_controller.go index b886f6b..52a0246 100644 --- a/internal/controller/cloudscalemachine_controller.go +++ b/internal/controller/cloudscalemachine_controller.go @@ -128,7 +128,7 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token) + cloudscaleClient := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ Client: r.Client, @@ -174,8 +174,10 @@ func (r *CloudscaleMachineReconciler) reconcileNormal(ctx context.Context, machi } if machineScope.CloudscaleMachine.Spec.ServerGroup != nil { - if err := r.reconcileServerGroup(ctx, machineScope); err != nil { + if result, err := r.reconcileServerGroup(ctx, machineScope); err != nil { return ctrl.Result{}, err + } else if !result.IsZero() { + return result, nil } } diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index e7fd681..1e86362 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -162,6 +162,10 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi server, err = machineScope.CloudscaleClient.Servers.Create(ctx, req) if err != nil { + if cloudscale.IsTimeoutError(err) { + machineScope.Info("Server creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } return ctrl.Result{}, fmt.Errorf("creating server: %w", err) } diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index 5687772..dcad664 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -23,6 +23,7 @@ import ( cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" @@ -31,7 +32,7 @@ import ( // reconcileServerGroup ensures the server group exists if specified. // Server groups are zone-scoped and created once per unique name+zone combination. 
-func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (reterr error) { +func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { defer func() { if reterr != nil { r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.ServerGroupReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.ServerGroupErrorReason, reterr.Error()) @@ -41,17 +42,17 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, }() if machineScope.CloudscaleMachine.Spec.ServerGroup == nil { - return nil // No server group requested + return ctrl.Result{}, nil // No server group requested } // If we already have a server group ID, verify it still exists if machineScope.CloudscaleMachine.Status.ServerGroupID != "" { _, err := machineScope.CloudscaleClient.ServerGroups.Get(ctx, machineScope.CloudscaleMachine.Status.ServerGroupID) if err == nil { - return nil + return ctrl.Result{}, nil } if !cloudscale.IsNotFound(err) { - return fmt.Errorf("getting server group: %w", err) + return ctrl.Result{}, fmt.Errorf("getting server group: %w", err) } // Server group was deleted externally, fall through to re-create machineScope.CloudscaleMachine.Status.ServerGroupID = "" @@ -64,13 +65,13 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, // so that all machines in the cluster can find the same server group. groups, err := machineScope.CloudscaleClient.ServerGroups.List(ctx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(machineScope.CloudscaleCluster))) if err != nil { - return fmt.Errorf("listing server groups: %w", err) + return ctrl.Result{}, fmt.Errorf("listing server groups: %w", err) } for _, g := range groups { if g.Name == groupName && g.Zone.Slug == zone { machineScope.CloudscaleMachine.Status.ServerGroupID = g.UUID - return nil + return ctrl.Result{}, nil } } @@ -86,11 +87,15 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, group, err := machineScope.CloudscaleClient.ServerGroups.Create(ctx, req) if err != nil { - return fmt.Errorf("creating server group: %w", err) + if cloudscale.IsTimeoutError(err) { + machineScope.Info("Server group creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) + return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + } + return ctrl.Result{}, fmt.Errorf("creating server group: %w", err) } machineScope.CloudscaleMachine.Status.ServerGroupID = group.UUID - return nil + return ctrl.Result{}, nil } // deleteServerGroup clears the server group reference from the machine status. 
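Note: the caller of reconcileServerGroup now has three outcomes to distinguish. A minimal sketch of the consuming side, mirroring the reconcileNormal hunk earlier in this patch:

	result, err := r.reconcileServerGroup(ctx, machineScope)
	if err != nil {
		// Real failure: surface it and let controller-runtime apply exponential backoff.
		return ctrl.Result{}, err
	}
	if !result.IsZero() {
		// Create timed out: honor the fixed-delay requeue instead of backing off.
		return result, nil
	}
	// Zero result and nil error: the server group exists; continue with the server itself.
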
diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index 4506be4..b1f472c 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -167,7 +167,7 @@ func TestReconcileServerGroup_NoServerGroup_Noop(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) @@ -202,7 +202,7 @@ func TestReconcileServerGroup_FindsExisting_SetsStatusID(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeFalse(), "Create should not be called when group exists") @@ -244,7 +244,7 @@ func TestReconcileServerGroup_SkipsNonMatchingName(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeTrue(), "Create should be called when no matching name found") @@ -282,7 +282,7 @@ func TestReconcileServerGroup_SkipsNonMatchingZone(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeTrue(), "Create should be called when no matching zone found") @@ -313,7 +313,7 @@ func TestReconcileServerGroup_CreatesNew_SetsStatusID(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineScope.CloudscaleMachine.Status.ServerGroupID).To(Equal("created-group-uuid")) @@ -347,7 +347,7 @@ func TestReconcileServerGroup_ListError_PropagatesError(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("listing server groups")) @@ -375,7 +375,7 @@ func TestReconcileServerGroup_CreateError_PropagatesError(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("creating server group")) diff --git a/test/e2e/cloudscale_helpers.go b/test/e2e/cloudscale_helpers.go index 6f33bb6..2dc7831 100644 --- a/test/e2e/cloudscale_helpers.go +++ b/test/e2e/cloudscale_helpers.go @@ -23,14 +23,12 @@ import ( "fmt" "strings" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" ) // newCloudscaleClient creates a new cloudscale API client from the given token. 
func newCloudscaleClient(token string) *cloudscale.Client { - c := cloudscale.NewClient(nil) - c.AuthToken = token - return c + return cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) } // resourceSnapshot holds a snapshot of cloudscale API resources for leak detection. diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index b3dbe62..387b1e9 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -25,15 +25,12 @@ import ( "path/filepath" "testing" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - - apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2" controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" @@ -41,8 +38,10 @@ import ( "sigs.k8s.io/cluster-api/test/framework" "sigs.k8s.io/cluster-api/test/framework/bootstrap" "sigs.k8s.io/cluster-api/test/framework/clusterctl" + ctrl "sigs.k8s.io/controller-runtime" infrav1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" ) var ( From f6f444e2f9dde02948f0c72b73eb3b0cf77a959c Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Thu, 7 May 2026 16:24:42 +0200 Subject: [PATCH 2/9] fix: skip server group deletion when owned servers are present Prevent 'Cannot delete non-empty server group' race condition during cluster deletion by checking CloudscaleMachine resources for owned servers before attempting server group deletion. When owned servers are still present, return a sentinel error that triggers a 10s graceful requeue instead of exponential backoff. Also skip server groups containing foreign servers (servers from other clusters sharing this server group) to prevent accidental deletion. 
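In short, the fix is a sentinel error translated into a fixed-delay requeue. Both halves condensed into one sketch (the complete versions are in the diff below):

	// cloudscalecluster_servergroup.go: signal "not deletable yet" without failing.
	var errServerGroupNotEmpty = errors.New("server group still contains owned servers")

	// reconcileDelete: convert the sentinel into a graceful 10s requeue rather than
	// an error that controller-runtime would back off on for up to 30 minutes.
	if err := r.deleteServerGroups(ctx, clusterScope); err != nil {
		if errors.Is(err, errServerGroupNotEmpty) {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
		}
		return ctrl.Result{}, fmt.Errorf("deleting server groups: %w", err)
	}
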
--- .../cloudscalecluster_controller.go | 5 + .../cloudscalecluster_servergroup.go | 59 ++++++++++ .../cloudscalecluster_servergroup_test.go | 111 ++++++++++++++++++ 3 files changed, 175 insertions(+) diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index f66c73e..199351c 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -18,7 +18,9 @@ package controller import ( "context" + "errors" "fmt" + "time" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -190,6 +192,9 @@ func (r *CloudscaleClusterReconciler) reconcileDelete(ctx context.Context, clust } if err := r.deleteServerGroups(ctx, clusterScope); err != nil { + if errors.Is(err, errServerGroupNotEmpty) { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } return ctrl.Result{}, fmt.Errorf("deleting server groups: %w", err) } diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go index 97b95f2..02f9684 100644 --- a/internal/controller/cloudscalecluster_servergroup.go +++ b/internal/controller/cloudscalecluster_servergroup.go @@ -18,17 +18,34 @@ package controller import ( "context" + "errors" "fmt" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + "k8s.io/apimachinery/pkg/util/sets" + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/controller-runtime/pkg/client" + infrav1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +// errServerGroupNotEmpty is returned when a server group still contains owned servers +// and cannot be deleted yet. +var errServerGroupNotEmpty = errors.New("server group still contains owned servers") + // deleteServerGroups deletes all server groups owned by the cluster. // This is called during cluster deletion, after all machines have been removed. +// When owned servers are still present in a group, the function returns +// errServerGroupNotEmpty to signal a graceful requeue, preventing the +// "Cannot delete non-empty server group" API error. 
func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, clusterScope *scope.ClusterScope) error { + ownedServerIDs, err := r.getOwnedServerIDs(ctx, clusterScope) + if err != nil { + return fmt.Errorf("listing owned servers: %w", err) + } + groups, err := clusterScope.CloudscaleClient.ServerGroups.List(ctx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(clusterScope.CloudscaleCluster))) if err != nil { @@ -36,6 +53,28 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl } for _, g := range groups { + hasOwnedServers := false + hasForeignServers := false + for _, s := range g.Servers { + if ownedServerIDs.Has(s.UUID) { + hasOwnedServers = true + } else { + hasForeignServers = true + } + } + + if hasOwnedServers { + clusterScope.Info("Waiting for owned servers to leave server group", + "serverGroupID", g.UUID, "name", g.Name) + return errServerGroupNotEmpty + } + + if hasForeignServers { + clusterScope.Info("Server group contains foreign servers, skipping deletion", + "serverGroupID", g.UUID, "name", g.Name) + continue + } + clusterScope.Info("Deleting server group", "serverGroupID", g.UUID, "name", g.Name) if err := clusterScope.CloudscaleClient.ServerGroups.Delete(ctx, g.UUID); err != nil { if !cloudscale.IsNotFound(err) { @@ -47,3 +86,23 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl return nil } + +// getOwnedServerIDs returns a set of server UUIDs from CloudscaleMachine resources +// that belong to the given cluster. +func (r *CloudscaleClusterReconciler) getOwnedServerIDs(ctx context.Context, clusterScope *scope.ClusterScope) (sets.Set[string], error) { + machineList := &infrav1beta2.CloudscaleMachineList{} + if err := r.List(ctx, machineList, + client.InNamespace(clusterScope.CloudscaleCluster.Namespace), + client.MatchingLabels{clusterv1.ClusterNameLabel: clusterScope.Cluster.Name}, + ); err != nil { + return nil, fmt.Errorf("listing CloudscaleMachines: %w", err) + } + + ids := sets.New[string]() + for _, m := range machineList.Items { + if m.Status.ServerID != "" { + ids.Insert(m.Status.ServerID) + } + } + return ids, nil +} diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index f55bdf2..b4f044d 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -178,3 +178,114 @@ func TestDeleteServerGroups_Ignores404(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) } + +func TestDeleteServerGroups_OwnedServerPresent_SkipsDeletion(t *testing.T) { + g := NewWithT(t) + + // The server group has a server that is owned by our cluster + serverGroupService := &mockServerGroupService{ + listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + return []cloudscale.ServerGroup{ + {UUID: "sg-1", Name: "group-1", Servers: []cloudscale.ServerStub{{UUID: "server-123"}}}, + }, nil + }, + deleteFn: func(ctx context.Context, id string) error { + // Delete should NOT be called because server group has owned servers + g.Fail("Delete must not be called when group has owned servers") + return nil + }, + } + + clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) + fakeClient := newTestFakeClient( + &infrastructurev1beta2.CloudscaleMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-1", + Namespace: "default", + Labels: map[string]string{ + clusterv1.ClusterNameLabel: 
"test-cluster", + }, + }, + Status: infrastructurev1beta2.CloudscaleMachineStatus{ + ServerID: "server-123", + }, + }, + ) + r := &CloudscaleClusterReconciler{ + Client: fakeClient, + } + + err := r.deleteServerGroups(context.Background(), clusterScope) + + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("server group still contains owned servers")) +} + +func TestDeleteServerGroups_ForeignServers_Skips(t *testing.T) { + g := NewWithT(t) + called := false + + // Server group has a server that is NOT owned by this cluster + serverGroupService := &mockServerGroupService{ + listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + return []cloudscale.ServerGroup{ + {UUID: "sg-foreign", Name: "foreign-group", Servers: []cloudscale.ServerStub{{UUID: "server-999"}}}, + }, nil + }, + deleteFn: func(ctx context.Context, id string) error { + called = true + return nil + }, + } + + clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) + fakeClient := newTestFakeClient(&infrastructurev1beta2.CloudscaleMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-1", + Namespace: "default", + Labels: map[string]string{ + clusterv1.ClusterNameLabel: "test-cluster", + }, + }, + Status: infrastructurev1beta2.CloudscaleMachineStatus{ + ServerID: "server-123", + }, + }) + r := &CloudscaleClusterReconciler{ + Client: fakeClient, + } + + err := r.deleteServerGroups(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(called).To(BeFalse()) +} + +func TestDeleteServerGroups_EmptyGroupName_DoesNotSkip(t *testing.T) { + g := NewWithT(t) + var deletedID string + + // Server group with no servers should be deleted immediately + serverGroupService := &mockServerGroupService{ + listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + return []cloudscale.ServerGroup{ + {UUID: "sg-empty", Name: "empty-group", Servers: nil}, + }, nil + }, + deleteFn: func(ctx context.Context, id string) error { + deletedID = id + return nil + }, + } + + clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) + fakeClient := newTestFakeClient() + r := &CloudscaleClusterReconciler{ + Client: fakeClient, + } + + err := r.deleteServerGroups(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(deletedID).To(Equal("sg-empty")) +} From 52ac3914a62aac197ab94bf643636f566768b821 Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Thu, 7 May 2026 16:50:14 +0200 Subject: [PATCH 3/9] chore: replace t.Fatal with gomega-style g.Fail --- .../cloudscalecluster_floatingip_test.go | 8 ++--- .../cloudscalecluster_loadbalancer_test.go | 28 ++++++++-------- .../cloudscalecluster_network_test.go | 21 ++++-------- .../cloudscalecluster_reconcile_test.go | 4 +-- .../cloudscalecluster_servergroup_test.go | 33 +++++++------------ .../cloudscalemachine_reconcile_test.go | 6 ++-- .../cloudscalemachine_server_test.go | 6 ++-- .../cloudscalemachine_servergroup_test.go | 2 +- internal/controller/suite_test.go | 8 +++++ 9 files changed, 53 insertions(+), 63 deletions(-) diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index 55f19c8..9fbb30f 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -162,7 +162,7 @@ func 
TestReconcilePreExistingFloatingIP_RefetchesAndKeepsAssignmentWhenCached(t }, nil }, updateFn: func(ctx context.Context, id string, req *cloudscalesdk.FloatingIPUpdateRequest) error { - t.Fatal("Update must not fire when FIP is already assigned to the LB") + g.Fail("Update must not fire when FIP is already assigned to the LB") return nil }, } @@ -645,7 +645,7 @@ func TestEnsureFloatingIPAssignment_LBAlreadyCorrect(t *testing.T) { fipService := &mockFloatingIPService{ updateFn: func(ctx context.Context, id string, req *cloudscalesdk.FloatingIPUpdateRequest) error { - t.Fatal("Update should not be called when assignment is correct") + g.Fail("Update should not be called when assignment is correct") return nil }, } @@ -821,7 +821,7 @@ func TestDeleteFloatingIP_PreExistingSkipsDeletionAndLeavesConditionUntouched(t fipService := &mockFloatingIPService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called for pre-existing floating IPs") + g.Fail("Delete should not be called for pre-existing floating IPs") return nil }, } @@ -873,7 +873,7 @@ func TestDeleteFloatingIP_NoStatusSkipsDeletion(t *testing.T) { fipService := &mockFloatingIPService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when status is empty") + g.Fail("Delete should not be called when status is empty") return nil }, } diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index 009eb56..47e5233 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -187,7 +187,7 @@ func TestReconcileLB_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancer{UUID: id, Status: "running"}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB already exists") + g.Fail("Create should not be called when LB already exists") return nil, nil }, } @@ -216,7 +216,7 @@ func TestReconcileLB_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB is found by tag") + g.Fail("Create should not be called when LB is found by tag") return nil, nil }, } @@ -362,7 +362,7 @@ func TestReconcileLBPool_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - t.Fatal("Create should not be called when pool already exists") + g.Fail("Create should not be called when pool already exists") return nil, nil }, } @@ -391,7 +391,7 @@ func TestReconcileLBPool_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - t.Fatal("Create should not be called when pool is found by tag") + g.Fail("Create should not be called when pool is found by tag") return nil, nil }, } @@ -539,7 +539,7 @@ func TestReconcileLBListener_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - t.Fatal("Create should not be called when 
listener already exists") + g.Fail("Create should not be called when listener already exists") return nil, nil }, } @@ -568,7 +568,7 @@ func TestReconcileLBListener_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - t.Fatal("Create should not be called when listener is found by tag") + g.Fail("Create should not be called when listener is found by tag") return nil, nil }, } @@ -717,7 +717,7 @@ func TestReconcileLBHealthMonitor_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - t.Fatal("Create should not be called when health monitor already exists") + g.Fail("Create should not be called when health monitor already exists") return nil, nil }, } @@ -746,7 +746,7 @@ func TestReconcileLBHealthMonitor_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - t.Fatal("Create should not be called when health monitor is found by tag") + g.Fail("Create should not be called when health monitor is found by tag") return nil, nil }, } @@ -864,11 +864,11 @@ func TestReconcileLoadBalancer_SkipsWhenDisabled(t *testing.T) { lbService := &mockLoadBalancerService{ createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB is disabled") + g.Fail("Create should not be called when LB is disabled") return nil, nil }, getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Get should not be called when LB is disabled") + g.Fail("Get should not be called when LB is disabled") return nil, nil }, } @@ -1060,7 +1060,7 @@ func TestDeleteLoadBalancer_SkipsWhenDisabled(t *testing.T) { lbService := &mockLoadBalancerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when LB is disabled") + g.Fail("Delete should not be called when LB is disabled") return nil }, } @@ -1344,15 +1344,15 @@ func TestReconcileLBMembers_NoopWhenInSync(t *testing.T) { }, nil }, createFn: func(ctx context.Context, poolID string, req *cloudscalesdk.LoadBalancerPoolMemberRequest) (*cloudscalesdk.LoadBalancerPoolMember, error) { - t.Fatal("Create should not be called when in sync") + g.Fail("Create should not be called when in sync") return nil, nil }, deleteFn: func(ctx context.Context, poolID, memberID string) error { - t.Fatal("Delete should not be called when in sync") + g.Fail("Delete should not be called when in sync") return nil }, updateFn: func(ctx context.Context, poolID, memberID string, req *cloudscalesdk.LoadBalancerPoolMemberRequest) error { - t.Fatal("Update should not be called when in sync") + g.Fail("Update should not be called when in sync") return nil }, } diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index 6769f6b..ac6aaa9 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -27,7 +27,6 @@ import ( "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/events" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" @@ -71,12 +70,6 @@ func newTestClusterScope(networkService cs.NetworkService, subnetService cs.Subn } } -func newTestReconciler() *CloudscaleClusterReconciler { - return &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } -} - // --- Orchestrator tests --- func TestReconcileNetwork_CreatesBothResources(t *testing.T) { @@ -125,7 +118,7 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { return &cloudscale.Network{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - t.Fatal("Network create should not be called") + g.Fail("Network create should not be called") return nil, nil }, } @@ -134,7 +127,7 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { return &cloudscale.Subnet{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - t.Fatal("Subnet create should not be called") + g.Fail("Create should not be called when subnet is found by tag") return nil, nil }, } @@ -165,7 +158,7 @@ func TestReconcileNetwork_NetworkErrorStopsSubnet(t *testing.T) { } subnetService := &mockSubnetService{ createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - t.Fatal("Subnet create should not be called when network fails") + g.Fail("Subnet create should not be called when network fails") return nil, nil }, } @@ -215,7 +208,7 @@ func TestReconcileNetwork_FindsByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - t.Fatal("Create should not be called when network is found by tag") + g.Fail("Create should not be called when network is found by tag") return nil, nil }, } @@ -503,7 +496,7 @@ func TestDeleteNetwork_SkipsIfNoNetwork(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no network exists") + g.Fail("Delete should not be called when no network exists") return nil }, } @@ -544,7 +537,7 @@ func TestDeleteNetwork_SkipsPreExistingNetwork(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called for Pre-existing networks") + g.Fail("Delete should not be called for Pre-existing networks") return nil }, } @@ -646,7 +639,7 @@ func TestReconcileNetwork_PreExistingCachedShortCircuits(t *testing.T) { networkService := &mockNetworkService{ getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - t.Fatal("Get should not be called when pre-existing status is cached") + g.Fail("Get should not be called when pre-existing status is cached") return nil, nil }, } diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index 55ba6ea..8ae740b 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -366,7 +366,7 @@ func TestReconcileDelete_LBDeleteErrorStopsDeletion(t *testing.T) { } mocks.networkService = &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Network delete should not be 
called when LB delete fails") + g.Fail("Network delete should not be called when LB delete fails") return nil }, } @@ -429,7 +429,7 @@ func TestReconcileDelete_LBDisabledSkipsLBDeletion(t *testing.T) { mocks.lbEnabled = false mocks.lbService = &mockLoadBalancerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("LB delete should not be called when LB is disabled") + g.Fail("LB delete should not be called when LB is disabled") return nil }, } diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index b4f044d..a664347 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -77,9 +77,7 @@ func TestDeleteServerGroups_DeletesAll(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -95,15 +93,13 @@ func TestDeleteServerGroups_NoGroups_Noop(t *testing.T) { return nil, nil }, deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server groups exist") + g.Fail("Delete should not be called when no server groups exist") return nil }, } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -120,9 +116,7 @@ func TestDeleteServerGroups_ListError_PropagatesError(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -145,9 +139,7 @@ func TestDeleteServerGroups_DeleteError_PropagatesError(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -170,9 +162,7 @@ func TestDeleteServerGroups_Ignores404(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -212,7 +202,8 @@ func TestDeleteServerGroups_OwnedServerPresent_SkipsDeletion(t *testing.T) { }, ) r := &CloudscaleClusterReconciler{ - Client: fakeClient, + Client: fakeClient, + recorder: events.NewFakeRecorder(10), } err := r.deleteServerGroups(context.Background(), clusterScope) @@ -252,7 +243,8 @@ func TestDeleteServerGroups_ForeignServers_Skips(t *testing.T) { }, }) r := &CloudscaleClusterReconciler{ - Client: fakeClient, + Client: fakeClient, + recorder: events.NewFakeRecorder(10), } err := r.deleteServerGroups(context.Background(), clusterScope) @@ -279,10 +271,7 @@ func TestDeleteServerGroups_EmptyGroupName_DoesNotSkip(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - fakeClient := newTestFakeClient() - r := &CloudscaleClusterReconciler{ - Client: fakeClient, - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) diff --git 
a/internal/controller/cloudscalemachine_reconcile_test.go b/internal/controller/cloudscalemachine_reconcile_test.go index 4f2dda4..53dff81 100644 --- a/internal/controller/cloudscalemachine_reconcile_test.go +++ b/internal/controller/cloudscalemachine_reconcile_test.go @@ -89,11 +89,11 @@ func TestMachineReconcileNormal_BootstrapDataNotReady(t *testing.T) { serverService := &mockServerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.Server, error) { - t.Fatal("Server Get should not be called when bootstrap data is not ready") + g.Fail("Server Get should not be called when bootstrap data is not ready") return nil, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Server Create should not be called when bootstrap data is not ready") + g.Fail("Server Create should not be called when bootstrap data is not ready") return nil, nil }, } @@ -242,7 +242,7 @@ func TestMachineReconcileDelete_NoServer(t *testing.T) { serverService := &mockServerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server exists") + g.Fail("Delete should not be called when no server exists") return nil }, } diff --git a/internal/controller/cloudscalemachine_server_test.go b/internal/controller/cloudscalemachine_server_test.go index d97a84c..b0b08e2 100644 --- a/internal/controller/cloudscalemachine_server_test.go +++ b/internal/controller/cloudscalemachine_server_test.go @@ -292,7 +292,7 @@ func TestReconcileServer_SkipsIfAlreadyExists(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Create should not be called when server already exists") + g.Fail("Create should not be called when server already exists") return nil, nil }, } @@ -324,7 +324,7 @@ func TestReconcileServer_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Create should not be called when server is found by tag") + g.Fail("Create should not be called when server is found by tag") return nil, nil }, } @@ -407,7 +407,7 @@ func TestDeleteServer_SkipsIfNoServer(t *testing.T) { g := NewWithT(t) serverService := &mockServerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server exists") + g.Fail("Delete should not be called when no server exists") return nil }, } diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index b1f472c..7e39804 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -155,7 +155,7 @@ func TestReconcileServerGroup_NoServerGroup_Noop(t *testing.T) { serverGroupService := &mockServerGroupService{ listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { - t.Fatal("List should not be called when no server group is specified") + g.Fail("List should not be called when no server group is specified") return nil, nil }, } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index ee6afbc..e6f77a2 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + 
"k8s.io/client-go/tools/events" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -115,3 +116,10 @@ func getFirstFoundEnvTestBinaryDir() string { } return "" } + +func newTestReconciler() *CloudscaleClusterReconciler { + return &CloudscaleClusterReconciler{ + Client: newTestFakeClient(), + recorder: events.NewFakeRecorder(10), + } +} From a16c925f6cc287222812fc42dd3962f44948ea42 Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Thu, 7 May 2026 17:01:38 +0200 Subject: [PATCH 4/9] fix: requeue on network deletion when LB pool members block it Convert the 'There are still one or more load balancer pool members' API error into a graceful 10s requeue instead of propagating it as a failure (which triggers exponential backoff up to 30 minutes). Added isLBPoolMembersError() helper to detect the specific API error message and errNetworkNotReady sentinel error for controlled flow through the delete path. --- .../cloudscalecluster_controller.go | 3 ++ .../controller/cloudscalecluster_network.go | 39 ++++++++++++++++--- .../cloudscalecluster_network_test.go | 25 ++++++++++++ 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 199351c..ceb9e74 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -199,6 +199,9 @@ func (r *CloudscaleClusterReconciler) reconcileDelete(ctx context.Context, clust } if err := r.deleteNetwork(ctx, clusterScope); err != nil { + if errors.Is(err, errNetworkNotReady) { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } return ctrl.Result{}, fmt.Errorf("deleting network: %w", err) } diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index e3a6daa..0d51422 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "strings" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" @@ -219,12 +220,25 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster } clusterScope.Info("Deleting network", "name", ns.Name, "networkID", ns.NetworkID) - if err := clusterScope.CloudscaleClient.Networks.Delete(ctx, ns.NetworkID); err != nil { - if !cloudscale.IsNotFound(err) { - remaining = append(remaining, ns) - errs = append(errs, fmt.Errorf("deleting network %s: %w", ns.Name, err)) - continue - } + err := clusterScope.CloudscaleClient.Networks.Delete(ctx, ns.NetworkID) + + // return sentinel error which will wait a pre-defined amount of time + if isLBPoolMembersError(err) { + clusterScope.Info("Network still has LB pool members, waiting for async cleanup", "networkID", ns.NetworkID) + return errNetworkNotReady + } + + isNotFound := cloudscale.IsNotFound(err) + + // in case of an actual error, keep the network in status and move on. + if err != nil && !isNotFound { + remaining = append(remaining, ns) + errs = append(errs, fmt.Errorf("deleting network %s: %w", ns.Name, err)) + continue + } + + // Idempotent not-found: log and fall through to the event. 
+	if isNotFound {
 		clusterScope.Info("Network already deleted", "name", ns.Name, "networkID", ns.NetworkID)
 	}
 
@@ -236,6 +250,19 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster
 	return errors.Join(errs...)
 }
 
+// errNetworkNotReady is returned when a network cannot be deleted because
+// it still has pending dependencies (e.g. LB pool members).
+var errNetworkNotReady = errors.New("network has pending dependencies")
+
+// isLBPoolMembersError reports whether err is the cloudscale API error
+// for deleting a network that still has load balancer pool members in it.
+func isLBPoolMembersError(err error) bool {
+	if err == nil {
+		return false
+	}
+	return strings.Contains(err.Error(), "load balancer pool members")
+}
+
 // setNetworkStatus updates or appends the network status entry for the given name.
 func (r *CloudscaleClusterReconciler) setNetworkStatus(clusterScope *scope.ClusterScope, name, networkID, subnetID, cidr string, managed bool) {
 	for i, ns := range clusterScope.CloudscaleCluster.Status.Networks {
diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go
index ac6aaa9..3689fcc 100644
--- a/internal/controller/cloudscalecluster_network_test.go
+++ b/internal/controller/cloudscalecluster_network_test.go
@@ -932,6 +932,31 @@ func TestReconcileNetwork_NetworkCreateWrappedTimeoutRequeues(t *testing.T) {
 		"Should requeue after CreateTimeoutRequeueInterval even when error is wrapped")
 }
 
+// --- LB pool-member error test ---
+
+func TestDeleteNetwork_RequeuesOnLBPoolMembersError(t *testing.T) {
+	g := NewWithT(t)
+
+	networkService := &mockNetworkService{
+		deleteFn: func(ctx context.Context, id string) error {
+			//goland:noinspection GoErrorStringFormat
+			return fmt.Errorf("There are still one or more load balancer pool members in this network.")
+		},
+	}
+
+	clusterScope := newTestClusterScope(networkService, &mockSubnetService{})
+	clusterScope.CloudscaleCluster.Status.Networks = []infrastructurev1beta2.NetworkStatus{
+		{Name: "test", NetworkID: "net-with-pool-members", SubnetID: "subnet-uuid", Managed: true},
+	}
+
+	r := newTestReconciler()
+
+	err := r.deleteNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("network has pending dependencies"))
+}
+
 // --- Mock services ---
 
 type mockNetworkService struct {

From 7d8d299cae06014a18335474050d646d9869f2c8 Mon Sep 17 00:00:00 2001
From: Michael Weibel
Date: Thu, 7 May 2026 17:18:34 +0200
Subject: [PATCH 5/9] refactor: drop unused IsDeadlineExceeded helper and use
 per-resource requeue delays

---
 internal/cloudscale/client.go                 |   8 -
 internal/cloudscale/client_test.go            |  22 --
 .../cloudscalecluster_floatingip.go           |  10 +-
 .../cloudscalecluster_floatingip_test.go      |  18 +-
 .../cloudscalecluster_loadbalancer.go         |  37 ++-
 .../cloudscalecluster_loadbalancer_test.go    |   4 +-
 .../controller/cloudscalecluster_network.go   |  23 +-
 .../cloudscalecluster_network_test.go         | 276 +++++++-----------
 .../cloudscalecluster_reconcile_test.go       |   4 +-
 .../cloudscalecluster_servergroup_test.go     |  42 +--
 .../controller/cloudscalemachine_server.go    |   6 +-
 .../cloudscalemachine_server_test.go          |  17 +-
 .../cloudscalemachine_servergroup.go          |   6 +-
 .../cloudscalemachine_servergroup_test.go     |   6 +-
 internal/controller/suite_test.go             |  18 +-
 15 files changed, 210 insertions(+), 287 deletions(-)

diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go
index 5033402..fb35083 100644
--- a/internal/cloudscale/client.go
+++ b/internal/cloudscale/client.go
@@ -134,11 +134,3 @@ 
func IsTimeoutError(err error) bool { } return false } - -// IsDeadlineExceeded reports whether err indicates a deadline was exceeded. -func IsDeadlineExceeded(err error) bool { - if err == nil { - return false - } - return errors.Is(err, os.ErrDeadlineExceeded) -} diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go index 48c6f38..b051bac 100644 --- a/internal/cloudscale/client_test.go +++ b/internal/cloudscale/client_test.go @@ -17,7 +17,6 @@ limitations under the License. package cloudscale import ( - "errors" "fmt" "net/url" "os" @@ -107,24 +106,3 @@ func TestNewClient_ReturnsNonNilClient(t *testing.T) { g.Expect(client.Servers).ToNot(BeNil()) g.Expect(client.Networks).ToNot(BeNil()) } - -func TestIsDeadlineExceeded(t *testing.T) { - tests := []struct { - name string - err error - expected bool - }{ - {"nil error returns false", nil, false}, - {"os.ErrDeadlineExceeded returns true", os.ErrDeadlineExceeded, true}, - {"wrapped os.ErrDeadlineExceeded returns true", errors.Join(os.ErrDeadlineExceeded), true}, - {"generic error returns false", fmt.Errorf("timeout"), false}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - g := NewWithT(t) - result := IsDeadlineExceeded(tt.err) - g.Expect(result).To(Equal(tt.expected)) - }) - } -} diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index e7e81e8..d2f5db9 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" @@ -33,6 +34,8 @@ import ( "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +const createFloatingIPTimeoutRequeueAfter = 5 * time.Second + // reconcileFloatingIP ensures the floating IP exists and is assigned to the correct target. // When no floating IP is configured, this sets the condition to true and returns. 
func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { @@ -76,7 +79,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingFloatingIP(ctx context return r.ensureFloatingIPAssignment(ctx, clusterScope, fip) } -func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) (ctrl.Result, error) { tags := clusterOwnershipTags(clusterScope.CloudscaleCluster) clusterScope.Info("reconcile managed floating IP") @@ -137,8 +140,9 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + const timeout = createFloatingIPTimeoutRequeueAfter + clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", timeout) + return ctrl.Result{RequeueAfter: timeout}, nil } return ctrl.Result{}, fmt.Errorf("creating floating IP: %w", err) } diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index 9fbb30f..f07007d 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -34,13 +34,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) // --- Test helpers --- -func newFIPTestClusterScope(fipService cs.FloatingIPService) *scope.ClusterScope { +func newFIPTestClusterScope(fipService cloudscale.FloatingIPService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -63,7 +63,7 @@ func newFIPTestClusterScope(fipService cs.FloatingIPService) *scope.ClusterScope }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ FloatingIPs: fipService, }, } @@ -944,12 +944,8 @@ func TestReconcileManagedFloatingIP_CreateTimeoutRequeues(t *testing.T) { return nil, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.FloatingIPCreateRequest) (*cloudscalesdk.FloatingIP, error) { - if req.IPVersion == 4 { - // Simulate timeout via context deadline exceeded wrapped in url.Error - return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/floating_ips", Err: os.ErrDeadlineExceeded} - } - // IPv6 path: return success so we can verify timeout path above is hit first - return &cloudscalesdk.FloatingIP{Network: "2001:db8::1/128"}, nil + // Simulate timeout via context deadline exceeded wrapped in url.Error + return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/floating_ips", Err: os.ErrDeadlineExceeded} }, } @@ -962,8 +958,8 @@ func TestReconcileManagedFloatingIP_CreateTimeoutRequeues(t 
*testing.T) { result, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), - "Should requeue after CreateTimeoutRequeueInterval on timeout error") + g.Expect(result.RequeueAfter).To(Equal(createFloatingIPTimeoutRequeueAfter), + "Should requeue after createFloatingIPTimeoutRequeueAfter on timeout error") } // --- Mock FloatingIPService --- diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index 3aa5ce3..6fad342 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -89,7 +89,7 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, if lb.Status == "error" || lb.Status == "degraded" { // During bootstrap, error/degraded is expected because the health // monitor checks an empty pool (no CP machines ready yet). - requeueAfter = 30 * time.Second + requeueAfter = 10 * time.Second } return ctrl.Result{RequeueAfter: requeueAfter}, nil } @@ -142,7 +142,7 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, } // reconcileLB ensures the load balancer exists. -func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerID, "load balancer", @@ -189,8 +189,9 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Load balancer creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating load balancer: %w", err) } @@ -203,7 +204,7 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc } // reconcileLBPool ensures the load balancer pool exists. 
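+// Like reconcileLB, a Create that times out requeues rather than erroring;
+// the next pass re-runs the ensureResource lookup before creating again, so
+// a slow-but-successful creation can be discovered instead of duplicated.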
-func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "load balancer pool", @@ -236,8 +237,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Load balancer pool creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer pool creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating load balancer pool: %w", err) } @@ -250,7 +252,7 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust } // reconcileLBListener ensures the load balancer listener exists. -func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID, "load balancer listener", @@ -281,8 +283,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Load balancer listener creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer listener creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating load balancer listener: %w", err) } @@ -297,7 +300,7 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c // reconcileLBHealthMonitor ensures the load balancer health monitor exists. // The health monitor performs TCP health checks on the API server port. 
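+// Until control-plane members join the pool, this monitor drives the load
+// balancer to error/degraded; reconcileLoadBalancer treats that as expected
+// during bootstrap and requeues instead of failing.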
-func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID, "load balancer health monitor", @@ -331,8 +334,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Load balancer health monitor creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer health monitor creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating load balancer health monitor: %w", err) } @@ -345,7 +349,7 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte return ctrl.Result{}, nil } -func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { // Fetch current members from the load balancer currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) if err != nil { @@ -509,8 +513,9 @@ func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Conte cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Load balancer member creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer member creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating load balancer member: %w", err) } diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index 47e5233..1631189 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -935,7 +935,7 @@ func TestReconcileLoadBalancer_ErrorStatusHasLongerRequeue(t *testing.T) { result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(30*time.Second), "error status should requeue after 30s") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "error status should requeue after 10s") } func TestReconcileLoadBalancer_DegradedStatusHasLongerRequeue(t *testing.T) { @@ -961,7 +961,7 @@ func TestReconcileLoadBalancer_DegradedStatusHasLongerRequeue(t *testing.T) { result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) 
g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(30*time.Second), "degraded status should requeue after 30s") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "degraded status should requeue after 10s") } func TestReconcileLoadBalancer_CreatingStatusHasShorterRequeue(t *testing.T) { diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 0d51422..7435015 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "strings" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" @@ -33,6 +34,8 @@ import ( "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +const createNetworkTimeoutRequeueAfter = 5 * time.Second + // reconcileNetwork orchestrates network and subnet provisioning for all networks // defined in spec.networks. A single NetworkReadyCondition covers all networks. func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { @@ -99,7 +102,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingNetwork(ctx context.Co } // reconcileManagedNetwork ensures a managed network and its subnet exist. -func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) (_ ctrl.Result, reterr error) { +func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) (ctrl.Result, error) { ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus(netSpec.Name) // Reconcile the network resource @@ -136,8 +139,8 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Network creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + clusterScope.Info("Network creation timed out, waiting before retry", "requeueAfter", createNetworkTimeoutRequeueAfter) + return ctrl.Result{RequeueAfter: createNetworkTimeoutRequeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating network: %w", err) } @@ -177,8 +180,8 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }) if err != nil { if cloudscale.IsTimeoutError(err) { - clusterScope.Info("Subnet creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + clusterScope.Info("Subnet creation timed out, waiting before retry", "requeueAfter", createNetworkTimeoutRequeueAfter) + return ctrl.Result{RequeueAfter: createNetworkTimeoutRequeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating subnet: %w", err) } @@ -209,8 +212,10 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster var errs []error for _, ns := range clusterScope.CloudscaleCluster.Status.Networks { + logger := clusterScope.WithValues("name", ns.Name, "networkID", ns.NetworkID) + if !ns.Managed { - clusterScope.Info("Skipping pre-existing network deletion", "name", ns.Name, "networkID", ns.NetworkID) + logger.Info("Skipping pre-existing network deletion") remaining = append(remaining, ns) 
continue } @@ -219,12 +224,12 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster continue } - clusterScope.Info("Deleting network", "name", ns.Name, "networkID", ns.NetworkID) + logger.Info("Deleting network") err := clusterScope.CloudscaleClient.Networks.Delete(ctx, ns.NetworkID) // return sentinel error which will wait a pre-defined amount of time if isLBPoolMembersError(err) { - clusterScope.Info("Network still has LB pool members, waiting for async cleanup", "networkID", ns.NetworkID) + logger.Info("Network still has LB pool members, waiting for async cleanup") return errNetworkNotReady } @@ -239,7 +244,7 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster // Idempotent not-found: log and fall through to the event. if isNotFound { - clusterScope.Info("Network already deleted", "name", ns.Name, "networkID", ns.NetworkID) + logger.Info("Network already deleted") } r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "NetworkDeleted", "DeleteNetwork", diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index 3689fcc..c688c23 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -23,14 +23,14 @@ import ( "os" "testing" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -38,7 +38,7 @@ const netUUID = "net-uuid-123" // --- Test helpers --- -func newTestClusterScope(networkService cs.NetworkService, subnetService cs.SubnetService) *scope.ClusterScope { +func newTestClusterScope(networkService cloudscale.NetworkService, subnetService cloudscale.SubnetService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -63,7 +63,7 @@ func newTestClusterScope(networkService cs.NetworkService, subnetService cs.Subn }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ Networks: networkService, Subnets: subnetService, }, @@ -75,19 +75,19 @@ func newTestClusterScope(networkService cs.NetworkService, subnetService cs.Subn func TestReconcileNetwork_CreatesBothResources(t *testing.T) { g := NewWithT(t) - var capturedNetReq *cloudscale.NetworkCreateRequest - var capturedSubReq *cloudscale.SubnetCreateRequest + var capturedNetReq *cloudscalesdk.NetworkCreateRequest + var capturedSubReq *cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { capturedNetReq = req - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) 
(*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedSubReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } @@ -114,19 +114,19 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { g.Fail("Network create should not be called") return nil, nil }, } subnetService := &mockSubnetService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Subnet, error) { - return &cloudscale.Subnet{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { + return &cloudscalesdk.Subnet{UUID: id}, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { g.Fail("Create should not be called when subnet is found by tag") return nil, nil }, @@ -152,12 +152,12 @@ func TestReconcileNetwork_NetworkErrorStopsSubnet(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { return nil, fmt.Errorf("api error") }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { g.Fail("Subnet create should not be called when network fails") return nil, nil }, @@ -177,12 +177,12 @@ func TestReconcileNetwork_SubnetErrorSurfaced(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: "net-uuid"}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: "net-uuid"}, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { return nil, fmt.Errorf("subnet api error") }, } @@ -202,23 +202,23 @@ func TestReconcileNetwork_FindsByTag(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { - return []cloudscale.Network{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { + return []cloudscalesdk.Network{ {UUID: 
"found-net-uuid", Name: "test-cluster"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { g.Fail("Create should not be called when network is found by tag") return nil, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "found-subnet-uuid", CIDR: "10.0.0.0/24"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { t.Fatal("Create should not be called when subnet is found by tag") return nil, nil }, @@ -240,8 +240,8 @@ func TestReconcileNetwork_ErrorsOnMultipleNetworks(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { - return []cloudscale.Network{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { + return []cloudscalesdk.Network{ {UUID: "net-uuid-1"}, {UUID: "net-uuid-2"}, }, nil @@ -263,20 +263,20 @@ func TestReconcileNetwork_RecreatesIfDeletedExternally(t *testing.T) { var created bool networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return nil, &cloudscale.ErrorResponse{StatusCode: 404} + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} }, - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { return nil, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { created = true - return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil + return &cloudscalesdk.Network{UUID: "new-net-uuid", Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - return &cloudscale.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { + return &cloudscalesdk.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil }, } @@ -302,17 +302,17 @@ func TestReconcileNetwork_SubnetFindsByTag(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - 
listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "found-subnet-uuid", CIDR: "10.0.0.0/24"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { t.Fatal("Create should not be called when subnet is found by tag") return nil, nil }, @@ -324,8 +324,8 @@ func TestReconcileNetwork_SubnetFindsByTag(t *testing.T) { {Name: "test", NetworkID: netUUID, Managed: true}, } // The networkService Get should return the existing network - networkService.getFn = func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + networkService.getFn = func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil } r := newTestReconciler() @@ -342,13 +342,13 @@ func TestReconcileNetwork_SubnetErrorsOnMultiple(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "subnet-uuid-1"}, {UUID: "subnet-uuid-2"}, }, nil @@ -374,20 +374,20 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { var created bool networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, } subnetService := &mockSubnetService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Subnet, error) { - return nil, &cloudscale.ErrorResponse{StatusCode: 404} + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { + return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} }, - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { return nil, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { created = true - return &cloudscale.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil }, } @@ -410,17 +410,17 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { func TestReconcileNetwork_CustomCIDR(t *testing.T) { g := NewWithT(t) - var capturedReq *cloudscale.SubnetCreateRequest + var capturedReq 
*cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } @@ -438,17 +438,17 @@ func TestReconcileNetwork_CustomCIDR(t *testing.T) { func TestReconcileNetwork_ExplicitGateway(t *testing.T) { g := NewWithT(t) - var capturedReq *cloudscale.SubnetCreateRequest + var capturedReq *cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } @@ -515,7 +515,7 @@ func TestDeleteNetwork_IgnoresAlreadyDeleted(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - return &cloudscale.ErrorResponse{StatusCode: 404} + return &cloudscalesdk.ErrorResponse{StatusCode: 404} }, } @@ -638,7 +638,7 @@ func TestReconcileNetwork_PreExistingCachedShortCircuits(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { g.Fail("Get should not be called when pre-existing status is cached") return nil, nil }, @@ -667,14 +667,14 @@ func TestReconcileNetwork_PreExistingReDiscoversWhenCIDRMissing(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { g.Expect(id).To(Equal("pre-existing-uuid")) - return &cloudscale.Network{ + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "rma1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "rma1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "pre-existing-subnet-uuid", CIDR: "192.168.0.0/24"}, }, }, nil @@ -707,14 +707,14 @@ func TestReconcileNetwork_PreExistingFetchesAndSetsStatus(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: 
func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { g.Expect(id).To(Equal("pre-existing-uuid")) - return &cloudscale.Network{ + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "rma1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "rma1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "discovered-subnet-uuid"}, }, }, nil @@ -742,7 +742,7 @@ func TestReconcileNetwork_PreExistingGetError(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { return nil, fmt.Errorf("network not found") }, } @@ -764,13 +764,13 @@ func TestReconcileNetwork_PreExistingZoneMismatchErrors(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "lpg1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "lpg1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "discovered-subnet-uuid"}, }, }, nil @@ -796,13 +796,13 @@ func TestReconcileNetwork_PreExistingNoSubnetsErrors(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "rma1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "rma1"}, }, - Subnets: []cloudscale.SubnetStub{}, + Subnets: []cloudscalesdk.SubnetStub{}, }, nil }, } @@ -826,7 +826,7 @@ func TestReconcileNetwork_NetworkCreateTimeoutRequeues(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: os.ErrDeadlineExceeded} }, } @@ -838,42 +838,23 @@ func TestReconcileNetwork_NetworkCreateTimeoutRequeues(t *testing.T) { result, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), - "Should requeue after CreateTimeoutRequeueInterval on timeout error") + g.Expect(result.RequeueAfter).To(Equal(createNetworkTimeoutRequeueAfter), + "Should requeue after createNetworkTimeoutRequeueAfter on timeout error") g.Expect(networkService.createFn).ToNot(BeNil(), "Network create should have been called") } -func TestReconcileNetwork_NetworkCreateGenericErrorPropagates(t *testing.T) { - g := NewWithT(t) - - networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req 
*cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: fmt.Errorf("connection refused")} - }, - } - subnetService := &mockSubnetService{} - - clusterScope := newTestClusterScope(networkService, subnetService) - r := newTestReconciler() - - _, err := r.reconcileNetwork(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("creating network")) -} - func TestReconcileNetwork_SubnetCreateTimeoutRequeues(t *testing.T) { g := NewWithT(t) var netCreated bool networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { netCreated = true - return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil + return &cloudscalesdk.Network{UUID: "new-net-uuid", Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/subnets", Err: os.ErrDeadlineExceeded} }, } @@ -885,51 +866,8 @@ func TestReconcileNetwork_SubnetCreateTimeoutRequeues(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) g.Expect(netCreated).To(BeTrue(), "Network should have been created before hitting subnet") - g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), - "Should requeue after CreateTimeoutRequeueInterval on timeout error") -} - -func TestReconcileNetwork_SubnetCreateGenericErrorPropagates(t *testing.T) { - g := NewWithT(t) - - networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil - }, - } - subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - return nil, fmt.Errorf("quota exceeded") - }, - } - - clusterScope := newTestClusterScope(networkService, subnetService) - r := newTestReconciler() - - _, err := r.reconcileNetwork(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("creating subnet")) -} - -func TestReconcileNetwork_NetworkCreateWrappedTimeoutRequeues(t *testing.T) { - g := NewWithT(t) - - networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return nil, fmt.Errorf("outer: %w", &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: os.ErrDeadlineExceeded}) - }, - } - subnetService := &mockSubnetService{} - - clusterScope := newTestClusterScope(networkService, subnetService) - r := newTestReconciler() - - result, err := r.reconcileNetwork(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(CreateTimeoutRequeueInterval), - "Should requeue after CreateTimeoutRequeueInterval even when error is wrapped") + g.Expect(result.RequeueAfter).To(Equal(createNetworkTimeoutRequeueAfter), + "Should requeue after createNetworkTimeoutRequeueAfter on timeout error") } // --- LB pool-member error 
test --- @@ -960,27 +898,27 @@ func TestDeleteNetwork_RequeuesOnLBPoolMembersError(t *testing.T) { // --- Mock services --- type mockNetworkService struct { - createFn func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) - getFn func(ctx context.Context, id string) (*cloudscale.Network, error) - listFn func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) + createFn func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) + getFn func(ctx context.Context, id string) (*cloudscalesdk.Network, error) + listFn func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) deleteFn func(ctx context.Context, id string) error } -func (m *mockNetworkService) Create(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { +func (m *mockNetworkService) Create(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { if m.createFn != nil { return m.createFn(ctx, req) } return nil, nil } -func (m *mockNetworkService) Get(ctx context.Context, id string) (*cloudscale.Network, error) { +func (m *mockNetworkService) Get(ctx context.Context, id string) (*cloudscalesdk.Network, error) { if m.getFn != nil { return m.getFn(ctx, id) } return nil, nil } -func (m *mockNetworkService) List(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { +func (m *mockNetworkService) List(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { if m.listFn != nil { return m.listFn(ctx, modifiers...) } @@ -995,27 +933,27 @@ func (m *mockNetworkService) Delete(ctx context.Context, id string) error { } type mockSubnetService struct { - createFn func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) - getFn func(ctx context.Context, id string) (*cloudscale.Subnet, error) - listFn func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) + createFn func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) + getFn func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) + listFn func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) deleteFn func(ctx context.Context, id string) error } -func (m *mockSubnetService) Create(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { +func (m *mockSubnetService) Create(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { if m.createFn != nil { return m.createFn(ctx, req) } return nil, nil } -func (m *mockSubnetService) Get(ctx context.Context, id string) (*cloudscale.Subnet, error) { +func (m *mockSubnetService) Get(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { if m.getFn != nil { return m.getFn(ctx, id) } return nil, nil } -func (m *mockSubnetService) List(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { +func (m *mockSubnetService) List(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { if m.listFn != nil { return m.listFn(ctx, modifiers...) 
} diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index 8ae740b..364efe6 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -33,7 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -74,7 +74,7 @@ func reconcileTestScope(opts reconcileTestOpts) *scope.ClusterScope { }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ Networks: opts.networkService, Subnets: opts.subnetService, ServerGroups: opts.serverGroupService, diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index a664347..d009d95 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,11 +29,11 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) -func newTestClusterScopeWithServerGroups(serverGroupService cs.ServerGroupService) *scope.ClusterScope { +func newTestClusterScopeWithServerGroups(serverGroupService cloudscale.ServerGroupService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -52,7 +52,7 @@ func newTestClusterScopeWithServerGroups(serverGroupService cs.ServerGroupServic Zone: "rma1", }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ ServerGroups: serverGroupService, }, } @@ -64,8 +64,8 @@ func TestDeleteServerGroups_DeletesAll(t *testing.T) { var deletedIDs []string serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-1", Name: "group-1"}, {UUID: "sg-2", Name: "group-2"}, }, nil @@ -89,7 +89,7 @@ func TestDeleteServerGroups_NoGroups_Noop(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { return nil, nil }, deleteFn: func(ctx context.Context, id string) error { @@ -110,7 +110,7 @@ func TestDeleteServerGroups_ListError_PropagatesError(t *testing.T) { g := NewWithT(t) 
serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { return nil, fmt.Errorf("api error") }, } @@ -128,8 +128,8 @@ func TestDeleteServerGroups_DeleteError_PropagatesError(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-1", Name: "group-1"}, }, nil }, @@ -151,13 +151,13 @@ func TestDeleteServerGroups_Ignores404(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-already-deleted", Name: "group-1"}, }, nil }, deleteFn: func(ctx context.Context, id string) error { - return &cloudscale.ErrorResponse{StatusCode: 404} + return &cloudscalesdk.ErrorResponse{StatusCode: 404} }, } @@ -174,9 +174,9 @@ func TestDeleteServerGroups_OwnedServerPresent_SkipsDeletion(t *testing.T) { // The server group has a server that is owned by our cluster serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ - {UUID: "sg-1", Name: "group-1", Servers: []cloudscale.ServerStub{{UUID: "server-123"}}}, + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ + {UUID: "sg-1", Name: "group-1", Servers: []cloudscalesdk.ServerStub{{UUID: "server-123"}}}, }, nil }, deleteFn: func(ctx context.Context, id string) error { @@ -218,9 +218,9 @@ func TestDeleteServerGroups_ForeignServers_Skips(t *testing.T) { // Server group has a server that is NOT owned by this cluster serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ - {UUID: "sg-foreign", Name: "foreign-group", Servers: []cloudscale.ServerStub{{UUID: "server-999"}}}, + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ + {UUID: "sg-foreign", Name: "foreign-group", Servers: []cloudscalesdk.ServerStub{{UUID: "server-999"}}}, }, nil }, deleteFn: func(ctx context.Context, id string) error { @@ -259,8 +259,8 @@ func TestDeleteServerGroups_EmptyGroupName_DoesNotSkip(t *testing.T) { // Server group with no servers should be deleted immediately serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return 
[]cloudscalesdk.ServerGroup{ {UUID: "sg-empty", Name: "empty-group", Servers: nil}, }, nil }, diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index 1e86362..09f8a11 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "maps" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" @@ -163,8 +164,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi server, err = machineScope.CloudscaleClient.Servers.Create(ctx, req) if err != nil { if cloudscale.IsTimeoutError(err) { - machineScope.Info("Server creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 30 * time.Second + machineScope.Info("Server creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating server: %w", err) } diff --git a/internal/controller/cloudscalemachine_server_test.go b/internal/controller/cloudscalemachine_server_test.go index b0b08e2..3616ef4 100644 --- a/internal/controller/cloudscalemachine_server_test.go +++ b/internal/controller/cloudscalemachine_server_test.go @@ -25,17 +25,14 @@ import ( . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/events" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/util/conditions" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -84,16 +81,8 @@ func (m *mockServerService) Update(ctx context.Context, id string, req *cloudsca return nil } -func newTestFakeClient(objs ...client.Object) client.Client { - scheme := runtime.NewScheme() - _ = corev1.AddToScheme(scheme) - _ = clusterv1.AddToScheme(scheme) - _ = infrastructurev1beta2.AddToScheme(scheme) - return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() -} - -func newTestMachineScopeWithServer(serverService cs.ServerService) *scope.MachineScope { - cloudscaleClient := &cs.Client{ +func newTestMachineScopeWithServer(serverService cloudscale.ServerService) *scope.MachineScope { + cloudscaleClient := &cloudscale.Client{ Servers: serverService, } diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index dcad664..01a6f26 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -88,8 +89,9 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, group, err := machineScope.CloudscaleClient.ServerGroups.Create(ctx, req) if err != nil { if 
cloudscale.IsTimeoutError(err) { - machineScope.Info("Server group creation timed out, waiting before retry", "requeueAfter", CreateTimeoutRequeueInterval) - return ctrl.Result{RequeueAfter: CreateTimeoutRequeueInterval}, nil + requeueAfter := 5 * time.Second + machineScope.Info("Server group creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil } return ctrl.Result{}, fmt.Errorf("creating server group: %w", err) } diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index 7e39804..e75f3e1 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -32,7 +32,7 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -79,8 +79,8 @@ func (m *mockServerGroupService) Update(ctx context.Context, id string, req *clo return nil } -func newTestMachineScopeWithServerGroup(serverGroupService cs.ServerGroupService) *scope.MachineScope { - cloudscaleClient := &cs.Client{ +func newTestMachineScopeWithServerGroup(serverGroupService cloudscale.ServerGroupService) *scope.MachineScope { + cloudscaleClient := &cloudscale.Client{ ServerGroups: serverGroupService, } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index e6f77a2..76b16b4 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -23,10 +23,14 @@ import ( "path/filepath" "testing" - "k8s.io/client-go/kubernetes/scheme" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/tools/events" + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -48,7 +52,7 @@ func TestMain(m *testing.M) { ctx, cancel = context.WithCancel(context.TODO()) - err := infrastructurev1beta2.AddToScheme(scheme.Scheme) + err := infrastructurev1beta2.AddToScheme(clientgoscheme.Scheme) if err != nil { fmt.Fprintf(os.Stderr, "Failed to add scheme: %v\n", err) os.Exit(1) @@ -77,7 +81,7 @@ func TestMain(m *testing.M) { os.Exit(1) } - k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + k8sClient, err = client.New(cfg, client.Options{Scheme: clientgoscheme.Scheme}) if err != nil { fmt.Fprintf(os.Stderr, "Failed to create client: %v\n", err) os.Exit(1) @@ -117,6 +121,14 @@ func getFirstFoundEnvTestBinaryDir() string { return "" } +func newTestFakeClient(objs ...client.Object) client.Client { + scheme := runtime.NewScheme() + _ = corev1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + _ = infrastructurev1beta2.AddToScheme(scheme) + return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() +} + func newTestReconciler() *CloudscaleClusterReconciler { return &CloudscaleClusterReconciler{ Client: newTestFakeClient(), From e767695fbabae17f422d068e0575466766e6342d Mon Sep 17 00:00:00 
2001
From: Michael Weibel
Date: Thu, 7 May 2026 17:59:55 +0200
Subject: [PATCH 6/9] feat: shared HTTP transport, per-call timeouts, and
 per-controller reconcile concurrency

- Share a single http.Transport (cloudscale.NewTransport) across all API
  clients for connection pooling and HTTP/2 multiplexing
- Drop the global http.Client timeout in favor of per-call context timeouts
  (ReadTimeout, WriteTimeout, DeleteTimeout)
- Add --cluster-concurrency (1-4) and --machine-concurrency (1-10) flags
  wired to per-controller MaxConcurrentReconciles
---
 cmd/main.go                                 | 70 +++++++--------
 internal/cloudscale/client.go               | 39 ++++++--
 internal/cloudscale/client_test.go          | 89 ++++++++++++++++++-
 internal/controller/cloudscale_services.go  | 12 ++-
 .../cloudscalecluster_controller.go         | 22 +++--
 .../cloudscalecluster_floatingip.go         | 16 +++-
 .../cloudscalecluster_loadbalancer.go       | 40 ++++++---
 .../controller/cloudscalecluster_network.go | 16 +++-
 .../cloudscalecluster_servergroup.go        |  8 +-
 .../cloudscalemachine_controller.go         | 22 +++--
 .../controller/cloudscalemachine_server.go  | 16 +++-
 .../cloudscalemachine_servergroup.go        | 23 ++++-
 test/e2e/cloudscale_helpers.go              |  2 +-
 13 files changed, 288 insertions(+), 87 deletions(-)

diff --git a/cmd/main.go b/cmd/main.go
index 73770c8..6621ee3 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -21,6 +21,7 @@ import (
 	"crypto/tls"
 	"flag"
 	"fmt"
+	"net/http"
 	"os"
 	"time"
 
@@ -35,7 +36,6 @@ import (
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	ctrl "sigs.k8s.io/controller-runtime"
-	ctrlconfig "sigs.k8s.io/controller-runtime/pkg/config"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
@@ -71,6 +71,8 @@ func main() {
 	var probeAddr string
 	var secureMetrics bool
 	var enableHTTP2 bool
+	var clusterConcurrency int
+	var machineConcurrency int
 	var watchFilter string
 	var tlsOpts []func(*tls.Config)
 	flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
@@ -90,6 +92,10 @@ func main() {
 	flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.")
 	flag.BoolVar(&enableHTTP2, "enable-http2", false,
 		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
+	flag.IntVar(&clusterConcurrency, "cluster-concurrency", 1,
+		"Maximum concurrent reconciles for CloudscaleCluster controller (1-4)")
+	flag.IntVar(&machineConcurrency, "machine-concurrency", 1,
+		"Maximum concurrent reconciles for CloudscaleMachine controller (1-10)")
 	flag.StringVar(&watchFilter, "watch-filter", "",
 		fmt.Sprintf("Label value that the controller watches to reconcile cluster-api objects. Label key is always %s. "+
 			"If unspecified, the controller watches for all cluster-api objects.", clusterv1.WatchLabel))
@@ -101,6 +107,15 @@ func main() {
 
 	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
 
+	if clusterConcurrency < 1 || clusterConcurrency > 4 {
+		setupLog.Error(fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag")
+		os.Exit(1)
+	}
+	if machineConcurrency < 1 || machineConcurrency > 10 {
+		setupLog.Error(fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag")
+		os.Exit(1)
+	}
+
 	// if the enable-http2 flag is false (the default), http/2 should be disabled
 	// due to its vulnerabilities. More specifically, disabling http/2 will
 	// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
@@ -175,31 +190,8 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "cloudscale.infrastructure.cluster.x-k8s.io",
-		Controller: ctrlconfig.Controller{
-			// MaxConcurrentReconciles is hardcoded to 1 for all controllers. 
This is - // intentional: CloudscaleCluster and CloudscaleMachine controllers both - // mutate shared cloudscale.ch resources (server groups, networks), and - // there is no distributed locking between them. - // - // If you increase this value, you MUST also audit (incomplete list): - // - deleteServerGroups: races with CloudscaleMachine server deletion - // - deleteNetwork: races with async LB pool member cleanup - // - reconcileServerGroup: races with cluster-level server group deletion - // - // In general, deletion ordering is not guaranteed between controllers. - // A value >1 requires explicit coordination or idempotency guarantees. - MaxConcurrentReconciles: 1, - }, - // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily - // when the Manager ends. This requires the binary to immediately end when the - // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly - // speeds up voluntary leader transitions as the new leader don't have to wait - // LeaseDuration time first. - // - // In the default scaffold provided, the program ends immediately after - // the manager stops, so would be fine to enable this option. However, - // if you are doing or is intended to do any operation such as perform cleanups - // after the manager stops then its usage might be unsafe. + // MaxConcurrentReconciles is set per-controller via --cluster-concurrency + // and --machine-concurrency flags, not globally. // LeaderElectionReleaseOnCancel: true, }) if err != nil { @@ -209,8 +201,12 @@ func main() { ctx := ctrl.SetupSignalHandler() + // Create a shared HTTP transport for all cloudscale API clients. + // This enables connection pooling and HTTP/2 multiplexing across reconciles. + transport := cloudscale.NewTransport() + // Fetch region information for controllers and webhooks - regionInfo, flavorInfo, err := fetchAPIInfo() + regionInfo, flavorInfo, err := fetchAPIInfo(transport) if err != nil { setupLog.Error(err, "unable to fetch API information") os.Exit(1) @@ -219,17 +215,21 @@ func main() { setupLog.Info("fetched flavor information", "flavors", len(flavorInfo.GetAllFlavors())) if err := (&controller.CloudscaleClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - WatchFilter: watchFilter, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + WatchFilter: watchFilter, + Transport: transport, + MaxConcurrentReconciles: clusterConcurrency, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleCluster") os.Exit(1) } if err := (&controller.CloudscaleMachineReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - WatchFilter: watchFilter, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + WatchFilter: watchFilter, + Transport: transport, + MaxConcurrentReconciles: machineConcurrency, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachine") os.Exit(1) @@ -279,7 +279,7 @@ func main() { // fetchAPIInfo fetches region and flavor information from cloudscale.ch API. // Requires CLOUDSCALE_API_TOKEN environment variable. 
-func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { +func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { token := os.Getenv("CLOUDSCALE_API_TOKEN") if token == "" { return nil, nil, fmt.Errorf("CLOUDSCALE_API_TOKEN environment variable is required") @@ -288,7 +288,7 @@ func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - client := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) + client := cloudscale.NewClient(token, transport) var regionInfo *cloudscale.RegionInfo var flavorInfo *cloudscale.FlavorInfo diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go index fb35083..777c555 100644 --- a/internal/cloudscale/client.go +++ b/internal/cloudscale/client.go @@ -44,19 +44,29 @@ type Client struct { Flavors FlavorService } -// DefaultCloudscaleRequestTimeout is the default HTTP request timeout for all Create calls. -const DefaultCloudscaleRequestTimeout = 60 * time.Second +const ( + // ReadTimeout is the context timeout for Get/List API calls. + ReadTimeout = 30 * time.Second -func NewClient(token string, requestTimeout time.Duration) *Client { - tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token}) - baseTransport := &http.Transport{ + // WriteTimeout is the context timeout for Create/Update API calls. + // Creates can take 60s+ under API load. + WriteTimeout = 2 * time.Minute + + // DeleteTimeout is the context timeout for Delete API calls. + DeleteTimeout = 30 * time.Second +) + +// NewTransport creates an http.Transport configured for the cloudscale.ch API. +// The returned transport should be created once and shared across all clients +// to benefit from connection pooling and HTTP/2 multiplexing. +func NewTransport() *http.Transport { + return &http.Transport{ DialContext: (&net.Dialer{ Timeout: 5 * time.Second, KeepAlive: 30 * time.Second, }).DialContext, - TLSHandshakeTimeout: 5 * time.Second, - ResponseHeaderTimeout: 10 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, // needs to be set because we also set DialContext ForceAttemptHTTP2: true, @@ -70,12 +80,23 @@ func NewClient(token string, requestTimeout time.Duration) *Client { MaxIdleConnsPerHost: 50, MaxConnsPerHost: 0, } +} + +// NewClient creates a cloudscale.ch API client using the given token and +// shared transport. The transport should be created once via NewTransport() +// and reused across clients. Each client gets its own oauth2 token injection +// but shares the underlying connection pool. +// +// No global HTTP timeout is set on the client. Instead, callers must use +// context.WithTimeout with ReadTimeout, WriteTimeout, or DeleteTimeout +// for each API call. +func NewClient(token string, transport *http.Transport) *Client { + tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token}) httpClient := &http.Client{ - Timeout: requestTimeout, Transport: &oauth2.Transport{ Source: tokenSource, - Base: baseTransport, + Base: transport, }, } sdkClient := cloudscalesdk.NewClient(httpClient) diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go index b051bac..8dcc75a 100644 --- a/internal/cloudscale/client_test.go +++ b/internal/cloudscale/client_test.go @@ -17,7 +17,11 @@ limitations under the License. 
package cloudscale import ( + "context" + "errors" "fmt" + "net" + "net/http" "net/url" "os" "testing" @@ -25,6 +29,7 @@ import ( cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" . "github.com/onsi/gomega" + "golang.org/x/oauth2" ) func TestIsNotFound(t *testing.T) { @@ -98,11 +103,93 @@ func TestIsTimeoutError(t *testing.T) { } } +func TestNewTransport(t *testing.T) { + g := NewWithT(t) + transport := NewTransport() + + g.Expect(transport).ToNot(BeNil()) + g.Expect(transport.ForceAttemptHTTP2).To(BeTrue()) + g.Expect(transport.TLSHandshakeTimeout).To(Equal(5 * time.Second)) + g.Expect(transport.IdleConnTimeout).To(Equal(90 * time.Second)) + g.Expect(transport.MaxIdleConns).To(Equal(50)) + g.Expect(transport.MaxIdleConnsPerHost).To(Equal(50)) + g.Expect(transport.MaxConnsPerHost).To(Equal(0)) + + // Verify HTTP/2 ping settings + g.Expect(transport.HTTP2).ToNot(BeNil()) + g.Expect(transport.HTTP2.SendPingTimeout).To(Equal(5 * time.Second)) + g.Expect(transport.HTTP2.PingTimeout).To(Equal(3 * time.Second)) + + // DialContext must be wired up; its 5s dial timeout is exercised in TestNewTransport_DialTimeout + g.Expect(transport.DialContext).ToNot(BeNil()) +} + +func TestNewTransport_DialTimeout(t *testing.T) { + g := NewWithT(t) + transport := NewTransport() + + // Dial to an unroutable address to verify timeout works + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + // 192.0.2.1 is TEST-NET-1 (RFC 5737), reserved for documentation and not routable on the public Internet + conn, err := transport.DialContext(ctx, "tcp", "192.0.2.1:443") + if conn != nil { + conn.Close() + } + + // Should fail with a timeout (dial timeout is 5s) + g.Expect(err).To(HaveOccurred()) + var netErr net.Error + g.Expect(err).To(BeAssignableToTypeOf(&net.OpError{})) + if ok := errors.As(err, &netErr); ok { + g.Expect(netErr.Timeout()).To(BeTrue()) + } +} + func TestNewClient_ReturnsNonNilClient(t *testing.T) { g := NewWithT(t) - client := NewClient("fake-token", 30*time.Second) + transport := NewTransport() + client := NewClient("fake-token", transport) g.Expect(client).ToNot(BeNil()) g.Expect(client.LoadBalancers).ToNot(BeNil()) g.Expect(client.Servers).ToNot(BeNil()) g.Expect(client.Networks).ToNot(BeNil()) } + +func TestNewClient_NoGlobalTimeout(t *testing.T) { + g := NewWithT(t) + + // Start a slow test server that takes 3s to respond + slowServer := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(3 * time.Second) + w.WriteHeader(http.StatusOK) + }) + server := &http.Server{Handler: slowServer} + listener, err := net.Listen("tcp", "127.0.0.1:0") + g.Expect(err).ToNot(HaveOccurred()) + go server.Serve(listener) + defer server.Close() + + transport := NewTransport() + + // The client should NOT have a global timeout - only context-based + tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: "test"}) + httpClient := &http.Client{ + Transport: &oauth2.Transport{ + Source: tokenSource, + Base: transport, + }, + } + // Verify no global timeout is set + g.Expect(httpClient.Timeout).To(Equal(time.Duration(0))) + + // Exercise the slow server: the 3s response must arrive intact, since only + // a caller-supplied context (not a client-level timeout) may cancel it. + resp, err := httpClient.Get("http://" + listener.Addr().String()) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + _ = resp.Body.Close() +} + +func TestTimeoutConstants(t *testing.T) { + g := NewWithT(t) + + // Verify timeout constants exist and have sensible values + g.Expect(ReadTimeout).To(Equal(30 * time.Second)) + g.Expect(WriteTimeout).To(Equal(2 * time.Minute)) + g.Expect(DeleteTimeout).To(Equal(30 * time.Second)) +} diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go index ca472bc..8053ea4 100644 --- a/internal/controller/cloudscale_services.go +++
b/internal/controller/cloudscale_services.go @@ -13,9 +13,9 @@ import ( const ( // CreateTimeoutRequeueInterval is the requeue interval after an HTTP timeout - // on a Create call. This MUST be longer than the HTTP client timeout (60s) so the + // on a Create call. This MUST be longer than cloudscale.WriteTimeout (2m) so the // original request has time to complete before we retry. - CreateTimeoutRequeueInterval = 90 * time.Second + CreateTimeoutRequeueInterval = 150 * time.Second ) // getListService is satisfied by all cloudscale SDK resource services. @@ -37,7 +37,9 @@ func ensureResource[T any]( tags cloudscalesdk.TagMap, ) (*T, string, error) { if currentID != "" { - resource, err := svc.Get(ctx, currentID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + resource, err := svc.Get(getCtx, currentID) if err == nil { return resource, currentID, nil } @@ -47,7 +49,9 @@ func ensureResource[T any]( // Resource was deleted externally, fall through to list/recreate } - items, err := svc.List(ctx, cloudscalesdk.WithTagFilter(tags)) + listCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + items, err := svc.List(listCtx, cloudscalesdk.WithTagFilter(tags)) if err != nil { return nil, "", fmt.Errorf("listing %ss: %w", resourceName, err) } diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index ceb9e74..95adaab 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "net/http" "time" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -34,6 +35,7 @@ import ( "sigs.k8s.io/cluster-api/util/predicates" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -47,9 +49,11 @@ import ( // CloudscaleClusterReconciler reconciles a CloudscaleCluster object type CloudscaleClusterReconciler struct { client.Client - Scheme *runtime.Scheme - recorder events.EventRecorder - WatchFilter string + Scheme *runtime.Scheme + recorder events.EventRecorder + WatchFilter string + Transport *http.Transport + MaxConcurrentReconciles int } // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudscaleclusters,verbs=get;list;watch;create;update;patch;delete @@ -62,6 +66,9 @@ type CloudscaleClusterReconciler struct { // Reconcile handles CloudscaleCluster reconciliation. 
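+// The reconcile body is bounded by a 5-minute context (set just below), so a
+// wedged API call cannot stall the workqueue; the deferred scope Close patches
+// status on a fresh context and therefore still succeeds after a timeout.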
func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + logger := logf.FromContext(ctx) cloudscaleCluster := &infrastructurev1beta2.CloudscaleCluster{} @@ -101,7 +108,7 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) + cloudscaleClient := cloudscale.NewClient(token, r.Transport) clusterScope, err := scope.NewClusterScope(scope.ClusterScopeParams{ Client: r.Client, @@ -115,7 +122,11 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re } defer func() { - if err := clusterScope.Close(ctx); err != nil && reterr == nil { + // Use a separate context for the status patch so it succeeds even + // when the reconcile context has timed out. + patchCtx, patchCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer patchCancel() + if err := clusterScope.Close(patchCtx); err != nil && reterr == nil { reterr = err } }() @@ -297,6 +308,7 @@ func (r *CloudscaleClusterReconciler) SetupWithManager(ctx context.Context, mgr return ctrl.NewControllerManagedBy(mgr). For(&infrastructurev1beta2.CloudscaleCluster{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.MaxConcurrentReconciles}). WithEventFilter(predicates.ResourceNotPaused(r.Scheme, logger)). WithEventFilter(predicates.ResourceHasFilterLabel(r.Scheme, logger, r.WatchFilter)). WithEventFilter(predicates.ResourceIsNotExternallyManaged(r.Scheme, logger)). diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index d2f5db9..b0b6d8e 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -64,7 +64,9 @@ func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, c } func (r *CloudscaleClusterReconciler) reconcilePreExistingFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, ip string) error { - fip, err := clusterScope.CloudscaleClient.FloatingIPs.Get(ctx, ip) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + fip, err := clusterScope.CloudscaleClient.FloatingIPs.Get(getCtx, ip) if err != nil { return fmt.Errorf("getting pre-existing floating IP %s: %w", ip, err) } @@ -137,7 +139,9 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con } clusterScope.Info("Creating floating IP", "ipVersion", ipVersion, "target", target) - fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(ctx, req) + createCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { const timeout = createFloatingIPTimeoutRequeueAfter @@ -225,7 +229,9 @@ func (r *CloudscaleClusterReconciler) ensureFloatingIPAssignment(ctx context.Con if needsUpdate { floatingIP := clusterScope.CloudscaleCluster.Status.FloatingIP clusterScope.Info("Reassigning floating IP", "ip", floatingIP, "target", target) - if err := clusterScope.CloudscaleClient.FloatingIPs.Update(ctx, floatingIP, updateReq); err != nil { + updateCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + if err := 
clusterScope.CloudscaleClient.FloatingIPs.Update(updateCtx, floatingIP, updateReq); err != nil { if cloudscale.IsFloatingIPNoPublicInterface(err) { return fmt.Errorf("floating IP cannot be assigned to control plane server: server must have a public interface when the load balancer is disabled; add {type: public} to the control-plane machine template interfaces") } @@ -283,7 +289,9 @@ func (r *CloudscaleClusterReconciler) deleteFloatingIP(ctx context.Context, clus } clusterScope.Info("Deleting floating IP", "id", floatingIP) - if err := clusterScope.CloudscaleClient.FloatingIPs.Delete(ctx, floatingIP); err != nil { + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancel() + if err := clusterScope.CloudscaleClient.FloatingIPs.Delete(deleteCtx, floatingIP); err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting floating IP: %w", err) } diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index 6fad342..859ce64 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -78,7 +78,9 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, } // Check if LB is running - lb, err := clusterScope.CloudscaleClient.LoadBalancers.Get(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerID) + getCtx, getCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer getCancel() + lb, err := clusterScope.CloudscaleClient.LoadBalancers.Get(getCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerID) if err != nil { return ctrl.Result{}, fmt.Errorf("getting load balancer status: %w", err) } @@ -186,7 +188,9 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc } clusterScope.Info("Creating load balancer", "zone", zone, "flavor", lbSpec.Flavor) - lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second @@ -234,7 +238,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust } clusterScope.Info("Creating load balancer pool", "algorithm", algorithm) - pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second @@ -280,7 +286,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c } clusterScope.Info("Creating load balancer listener", "port", apiServerPort) - listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second @@ -331,7 +339,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte } clusterScope.Info("Creating load balancer health monitor", "type", "tcp", "pool", 
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "spec", healthMonitorSpec) - monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second @@ -351,7 +361,9 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { // Fetch current members from the load balancer - currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) + listCtx, listCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer listCancel() + currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(listCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to get current load balancer members: %w", err) } @@ -510,7 +522,9 @@ func lbPrivateNetworkSubnetID(clusterScope *scope.ClusterScope) (string, error) func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMemberRequest) (ctrl.Result, error) { clusterScope.V(2).Info("Creating load balancer member", "member", member) - cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(createCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second @@ -527,7 +541,9 @@ func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Conte func (r *CloudscaleClusterReconciler) updateLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, memberUUID, newAddress string) error { clusterScope.V(2).Info("Updating load balancer member address", "memberUUID", memberUUID, "newAddress", newAddress) - err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Update(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, memberUUID, &cloudscalesdk.LoadBalancerPoolMemberRequest{ + updateCtx, updateCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer updateCancel() + err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Update(updateCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, memberUUID, &cloudscalesdk.LoadBalancerPoolMemberRequest{ Address: newAddress, }) if err != nil { @@ -539,7 +555,9 @@ func (r *CloudscaleClusterReconciler) updateLoadBalancerMember(ctx context.Conte func (r *CloudscaleClusterReconciler) deleteLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMember) error { clusterScope.V(2).Info("Deleting load balancer member", "member", member) - err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Delete(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, member.UUID) + deleteCtx, deleteCancel := context.WithTimeout(ctx, 
cloudscale.DeleteTimeout) + defer deleteCancel() + err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Delete(deleteCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, member.UUID) if err != nil { return fmt.Errorf("deleting load balancer member: %w", err) } @@ -569,7 +587,9 @@ func (r *CloudscaleClusterReconciler) deleteLoadBalancer(ctx context.Context, cl lbID := clusterScope.CloudscaleCluster.Status.LoadBalancerID if lbID != "" { clusterScope.Info("Deleting load balancer", "id", lbID) - if err := clusterScope.CloudscaleClient.LoadBalancers.Delete(ctx, lbID); err != nil { + deleteCtx, deleteCancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer deleteCancel() + if err := clusterScope.CloudscaleClient.LoadBalancers.Delete(deleteCtx, lbID); err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting load balancer: %w", err) } diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 7435015..551387d 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -82,7 +82,9 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingNetwork(ctx context.Co return nil } - network, err := clusterScope.CloudscaleClient.Networks.Get(ctx, netSpec.UUID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + network, err := clusterScope.CloudscaleClient.Networks.Get(getCtx, netSpec.UUID) if err != nil { return fmt.Errorf("getting pre-existing network %s: %w", netSpec.UUID, err) } @@ -127,7 +129,9 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex if resolvedNetworkID == "" { // Create new network clusterScope.Info("Creating network", "name", netSpec.Name) - network, err := clusterScope.CloudscaleClient.Networks.Create(ctx, &cloudscalesdk.NetworkCreateRequest{ + createCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + network, err := clusterScope.CloudscaleClient.Networks.Create(createCtx, &cloudscalesdk.NetworkCreateRequest{ Name: netSpec.Name, AutoCreateIPV4Subnet: ptr.To(false), ZonalResourceRequest: cloudscalesdk.ZonalResourceRequest{ @@ -170,7 +174,9 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex if resolvedSubnetID == "" { // Create new subnet clusterScope.Info("Creating subnet", "name", netSpec.Name, "cidr", netSpec.CIDR, "gateway", netSpec.GatewayAddress) - subnet, err := clusterScope.CloudscaleClient.Subnets.Create(ctx, &cloudscalesdk.SubnetCreateRequest{ + createSubnetCtx, cancelSubnet := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelSubnet() + subnet, err := clusterScope.CloudscaleClient.Subnets.Create(createSubnetCtx, &cloudscalesdk.SubnetCreateRequest{ Network: resolvedNetworkID, CIDR: netSpec.CIDR, GatewayAddress: netSpec.GatewayAddress, @@ -225,7 +231,9 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster } logger.Info("Deleting network") - err := clusterScope.CloudscaleClient.Networks.Delete(ctx, ns.NetworkID) + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancel() + err := clusterScope.CloudscaleClient.Networks.Delete(deleteCtx, ns.NetworkID) // return sentinel error which will wait a pre-defined amount of time if isLBPoolMembersError(err) { diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go index 02f9684..bd746dc 100644 --- 
a/internal/controller/cloudscalecluster_servergroup.go +++ b/internal/controller/cloudscalecluster_servergroup.go @@ -46,7 +46,9 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl return fmt.Errorf("listing owned servers: %w", err) } - groups, err := clusterScope.CloudscaleClient.ServerGroups.List(ctx, + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + groups, err := clusterScope.CloudscaleClient.ServerGroups.List(listCtx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(clusterScope.CloudscaleCluster))) if err != nil { return fmt.Errorf("listing server groups: %w", err) @@ -76,7 +78,9 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl } clusterScope.Info("Deleting server group", "serverGroupID", g.UUID, "name", g.Name) - if err := clusterScope.CloudscaleClient.ServerGroups.Delete(ctx, g.UUID); err != nil { + deleteCtx, cancelDelete := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancelDelete() + if err := clusterScope.CloudscaleClient.ServerGroups.Delete(deleteCtx, g.UUID); err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting server group %s: %w", g.UUID, err) } diff --git a/internal/controller/cloudscalemachine_controller.go b/internal/controller/cloudscalemachine_controller.go index 52a0246..16a5edb 100644 --- a/internal/controller/cloudscalemachine_controller.go +++ b/internal/controller/cloudscalemachine_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "net/http" "time" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -34,6 +35,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -52,9 +54,11 @@ const ( // CloudscaleMachineReconciler reconciles a CloudscaleMachine object type CloudscaleMachineReconciler struct { client.Client - Scheme *runtime.Scheme - recorder events.EventRecorder - WatchFilter string + Scheme *runtime.Scheme + recorder events.EventRecorder + WatchFilter string + Transport *http.Transport + MaxConcurrentReconciles int } // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudscalemachines,verbs=get;list;watch;create;update;patch;delete @@ -65,6 +69,9 @@ type CloudscaleMachineReconciler struct { // Reconcile handles CloudscaleMachine reconciliation. 
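+// As in the cluster controller, the reconcile body is bounded (3 minutes
+// here) and the deferred Close patches status on a separate 30s context.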
func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + logger := logf.FromContext(ctx) cloudscaleMachine := &infrastructurev1beta2.CloudscaleMachine{} @@ -128,7 +135,7 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) + cloudscaleClient := cloudscale.NewClient(token, r.Transport) machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ Client: r.Client, @@ -144,7 +151,11 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re } defer func() { - if err := machineScope.Close(ctx); err != nil && reterr == nil { + // Use a separate context for the status patch so it succeeds even + // when the reconcile context has timed out. + patchCtx, patchCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer patchCancel() + if err := machineScope.Close(patchCtx); err != nil && reterr == nil { reterr = err } }() @@ -263,6 +274,7 @@ func (r *CloudscaleMachineReconciler) SetupWithManager(ctx context.Context, mgr return ctrl.NewControllerManagedBy(mgr). For(&infrastructurev1beta2.CloudscaleMachine{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.MaxConcurrentReconciles}). WithEventFilter(predicates.ResourceNotPaused(r.Scheme, logger)). WithEventFilter(predicates.ResourceHasFilterLabel(r.Scheme, logger, r.WatchFilter)). WithEventFilter(predicates.ResourceIsNotExternallyManaged(r.Scheme, logger)). diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index 09f8a11..de1e54b 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -83,7 +83,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi // 1. If we have a server ID in status, verify it still exists if machineScope.CloudscaleMachine.Status.ServerID != "" { var err error - server, err = machineScope.CloudscaleClient.Servers.Get(ctx, machineScope.CloudscaleMachine.Status.ServerID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + server, err = machineScope.CloudscaleClient.Servers.Get(getCtx, machineScope.CloudscaleMachine.Status.ServerID) if err == nil { machineScope.V(2).Info("Server already exists", "serverID", machineScope.CloudscaleMachine.Status.ServerID, "status", server.Status) r.updateMachineFromServer(machineScope, server) @@ -102,7 +104,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi } // 2. 
Search for existing server by tag (idempotency after crash) - servers, err := machineScope.CloudscaleClient.Servers.List(ctx, + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + servers, err := machineScope.CloudscaleClient.Servers.List(listCtx, cloudscalesdk.WithTagFilter(r.machineLookupTag(machineScope))) if err != nil { return ctrl.Result{}, fmt.Errorf("listing servers: %w", err) @@ -161,7 +165,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi req.ServerGroups = []string{machineScope.CloudscaleMachine.Status.ServerGroupID} } - server, err = machineScope.CloudscaleClient.Servers.Create(ctx, req) + createCtx, cancelCreate := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelCreate() + server, err = machineScope.CloudscaleClient.Servers.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 30 * time.Second @@ -254,7 +260,9 @@ func (r *CloudscaleMachineReconciler) deleteServer(ctx context.Context, machineS serverID := machineScope.CloudscaleMachine.Status.ServerID machineScope.Info("Deleting server", "serverID", serverID) - if err := machineScope.CloudscaleClient.Servers.Delete(ctx, serverID); err != nil { + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancel() + if err := machineScope.CloudscaleClient.Servers.Delete(deleteCtx, serverID); err != nil { // Ignore 404 - server was already deleted externally if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting server: %w", err) diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index 01a6f26..913adbe 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "sync" "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ -31,6 +32,12 @@ import ( "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +// serverGroupMu serializes server group creation to prevent duplicates +// when multiple machines reconcile concurrently. The cloudscale API does +// not return a conflict on duplicate creation, so we serialize here. +// Safe: leader election guarantees a single replica. +var serverGroupMu sync.Mutex + // reconcileServerGroup ensures the server group exists if specified. // Server groups are zone-scoped and created once per unique name+zone combination. func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { @@ -48,7 +55,9 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, // If we already have a server group ID, verify it still exists if machineScope.CloudscaleMachine.Status.ServerGroupID != "" { - _, err := machineScope.CloudscaleClient.ServerGroups.Get(ctx, machineScope.CloudscaleMachine.Status.ServerGroupID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + _, err := machineScope.CloudscaleClient.ServerGroups.Get(getCtx, machineScope.CloudscaleMachine.Status.ServerGroupID) if err == nil { return ctrl.Result{}, nil } @@ -59,12 +68,18 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope.CloudscaleMachine.Status.ServerGroupID = "" } + // Serialize server group list+create to prevent duplicates under concurrent reconciles. 
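+ // The lock is held until this function returns, spanning the List and Create
+ // calls below (bounded by cloudscale.ReadTimeout and cloudscale.WriteTimeout),
+ // so concurrent machine reconciles that need a server group queue behind it.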
+ serverGroupMu.Lock() + defer serverGroupMu.Unlock() + zone := machineScope.CloudscaleCluster.Spec.Zone groupName := machineScope.CloudscaleMachine.Spec.ServerGroup.Name // Search for existing server group by name and zone using cluster-level tags // so that all machines in the cluster can find the same server group. - groups, err := machineScope.CloudscaleClient.ServerGroups.List(ctx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(machineScope.CloudscaleCluster))) + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + groups, err := machineScope.CloudscaleClient.ServerGroups.List(listCtx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(machineScope.CloudscaleCluster))) if err != nil { return ctrl.Result{}, fmt.Errorf("listing server groups: %w", err) } @@ -86,7 +101,9 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, }, } - group, err := machineScope.CloudscaleClient.ServerGroups.Create(ctx, req) + createCtx, cancelCreate := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelCreate() + group, err := machineScope.CloudscaleClient.ServerGroups.Create(createCtx, req) if err != nil { if cloudscale.IsTimeoutError(err) { requeueAfter := 5 * time.Second diff --git a/test/e2e/cloudscale_helpers.go b/test/e2e/cloudscale_helpers.go index 2dc7831..f46a973 100644 --- a/test/e2e/cloudscale_helpers.go +++ b/test/e2e/cloudscale_helpers.go @@ -28,7 +28,7 @@ import ( // newCloudscaleClient creates a new cloudscale API client from the given token. func newCloudscaleClient(token string) *cloudscale.Client { - return cloudscale.NewClient(token, cloudscale.DefaultCloudscaleRequestTimeout) + return cloudscale.NewClient(token, cloudscale.NewTransport()) } // resourceSnapshot holds a snapshot of cloudscale API resources for leak detection. From 029cea1971ba316b3e7db30a221e13673c173178 Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Fri, 8 May 2026 11:05:53 +0200 Subject: [PATCH 7/9] Fix load balancer deadlock for degraded/error status Replace the "not running" check in reconcileLoadBalancer with a switch that only blocks on "changing". When the LB is degraded or in error state, proceed with member reconciliation to remove stale pool members. Previously, a degraded LB could block forever, creating a deadlock. Also refactor and clean up the load balancer tests: - Add constants for all used cloudscale LB API statuses (running, changing, degraded, error). - Replace the invalid "creating" mock status with "changing". - Introduce reusable "exists" mock helpers and auto-default nil mocks, cutting orchestrator test setup from ~20 lines to ~3 lines each. - Remove 12 duplicate sub-reconciler tests already covered by ensureResource tests in cloudscale_services_test.go.
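In essence (logging and the lbPending bookkeeping elided), the new check is:

    switch lb.Status {
    case LoadBalancerRunningStatus:
        // proceed normally
    case LoadBalancerChangingStatus:
        return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
    default:
        // degraded, error, stopped, unknown: reconcile members so the
        // load balancer can recover instead of requeueing forever
    }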
--- cmd/main.go | 6 +- internal/cloudscale/client_test.go | 11 +- .../cloudscalecluster_loadbalancer.go | 26 +- .../cloudscalecluster_loadbalancer_test.go | 576 ++++-------------- .../cloudscalecluster_reconcile_test.go | 4 +- 5 files changed, 157 insertions(+), 466 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 6621ee3..1703d91 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -108,11 +108,13 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) if clusterConcurrency < 1 || clusterConcurrency > 4 { - setupLog.Error(fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag") + setupLog.Error( + fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag") os.Exit(1) } if machineConcurrency < 1 || machineConcurrency > 10 { - setupLog.Error(fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag") + setupLog.Error( + fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag") os.Exit(1) } diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go index 8dcc75a..3c91e92 100644 --- a/internal/cloudscale/client_test.go +++ b/internal/cloudscale/client_test.go @@ -135,7 +135,7 @@ func TestNewTransport_DialTimeout(t *testing.T) { // 192.0.2.1 is TEST-NET-1 (RFC 5737), reserved for documentation and not routable on the public Internet conn, err := transport.DialContext(ctx, "tcp", "192.0.2.1:443") if conn != nil { - conn.Close() + _ = conn.Close() } // Should fail with a timeout (dial timeout is 5s) @@ -168,8 +168,13 @@ func TestNewClient_NoGlobalTimeout(t *testing.T) { server := &http.Server{Handler: slowServer} listener, err := net.Listen("tcp", "127.0.0.1:0") g.Expect(err).ToNot(HaveOccurred()) - go server.Serve(listener) - defer server.Close() + go func() { + _ = server.Serve(listener) + }() + defer func() { + _ = server.Close() + }() transport := NewTransport() diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index 859ce64..720c93b 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -38,7 +38,10 @@ import ( const ( // LoadBalancerRunningStatus indicates the load balancer is ready. - LoadBalancerRunningStatus = "running" + LoadBalancerRunningStatus = "running" + LoadBalancerChangingStatus = "changing" + LoadBalancerDegradedStatus = "degraded" + LoadBalancerErrorStatus = "error" ) // reconcileLoadBalancer ensures the load balancer, pool, and listener exist for the control plane.
@@ -84,16 +87,19 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, if err != nil { return ctrl.Result{}, fmt.Errorf("getting load balancer status: %w", err) } - if lb.Status != LoadBalancerRunningStatus { - clusterScope.Info("Waiting for load balancer to be running", "status", lb.Status) + + switch lb.Status { + case LoadBalancerRunningStatus: + // proceed normally + case LoadBalancerChangingStatus: + clusterScope.Info("Waiting for load balancer to finish changing", "status", lb.Status) + lbPending = true + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + default: + // degraded, error, stopped, unknown — proceed with member reconciliation + // because removing stale members is the only path back to "running" + clusterScope.Info("Load balancer is not running, proceeding with member reconciliation", "status", lb.Status) lbPending = true - requeueAfter := 5 * time.Second - if lb.Status == "error" || lb.Status == "degraded" { - // During bootstrap, error/degraded is expected because the health - // monitor checks an empty pool (no CP machines ready yet). - requeueAfter = 10 * time.Second - } - return ctrl.Result{RequeueAfter: requeueAfter}, nil } // 2. Reconcile the pool diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index 1631189..2499d1d 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/events" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -93,6 +94,10 @@ func newTestClusterScopeWithLB(opts lbTestScopeOptions) *scope.ClusterScope { opts.healthMonitorDownThreshold = 3 } + // Default nil services to "exists" mocks so orchestrator tests + // only override the service they care about. + defaultLBServices(&opts) + cloudscaleClient := &cloudscale.Client{ LoadBalancers: opts.loadBalancerService, LoadBalancerPools: opts.poolService, @@ -143,6 +148,71 @@ func newTestClusterScopeWithLB(opts lbTestScopeOptions) *scope.ClusterScope { } } +// defaultLBServices fills in nil cloudscale service mocks with "exists" stubs +// so that orchestrator tests only need to override the one behaviour they care about. +func defaultLBServices(opts *lbTestScopeOptions) { + if opts.loadBalancerService == nil { + opts.loadBalancerService = existingLBMock(LoadBalancerRunningStatus) + } + if opts.poolService == nil { + opts.poolService = existingPoolMock() + } + if opts.listenerService == nil { + opts.listenerService = existingListenerMock() + } + if opts.healthMonitorService == nil { + opts.healthMonitorService = existingHealthMonitorMock() + } + if opts.poolMemberService == nil { + opts.poolMemberService = emptyMembersMock() + } +} + +// existingLBMock returns a mock that reports the LB already exists with the given status. +func existingLBMock(status string) *mockLoadBalancerService { + return &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{UUID: id, Status: status}, nil + }, + } +} + +// existingPoolMock returns a mock that reports the pool already exists. 
+func existingPoolMock() *mockLoadBalancerPoolService { + return &mockLoadBalancerPoolService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { + return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil + }, + } +} + +// existingListenerMock returns a mock that reports the listener already exists. +func existingListenerMock() *mockLoadBalancerListenerService { + return &mockLoadBalancerListenerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { + return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil + }, + } +} + +// existingHealthMonitorMock returns a mock that reports the health monitor already exists. +func existingHealthMonitorMock() *mockLoadBalancerHealthMonitorService { + return &mockLoadBalancerHealthMonitorService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { + return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil + }, + } +} + +// emptyMembersMock returns a mock that reports no pool members. +func emptyMembersMock() *mockLoadBalancerPoolMemberService { + return &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return nil, nil + }, + } +} + // ============================================================================ // Tests for reconcileLB // ============================================================================ @@ -158,7 +228,7 @@ func TestReconcileLB_CreatesLoadBalancer(t *testing.T) { return &cloudscalesdk.LoadBalancer{ UUID: "lb-uuid-123", Name: req.Name, - Status: "creating", + Status: LoadBalancerChangingStatus, }, nil }, } @@ -184,7 +254,7 @@ func TestReconcileLB_SkipsIfAlreadyExists(t *testing.T) { lbService := &mockLoadBalancerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{UUID: id, Status: "running"}, nil + return &cloudscalesdk.LoadBalancer{UUID: id, Status: LoadBalancerRunningStatus}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { g.Fail("Create should not be called when LB already exists") @@ -206,91 +276,6 @@ func TestReconcileLB_SkipsIfAlreadyExists(t *testing.T) { g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("existing-lb-uuid")) } -func TestReconcileLB_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - lbService := &mockLoadBalancerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return []cloudscalesdk.LoadBalancer{ - {UUID: "found-lb-uuid", Name: "test-cluster-cp-lb"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - g.Fail("Create should not be called when LB is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLB(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("found-lb-uuid")) -} - -func TestReconcileLB_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - lbService := &mockLoadBalancerService{ - listFn: func(ctx context.Context, modifiers 
...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return []cloudscalesdk.LoadBalancer{ - {UUID: "lb-uuid-1", Name: "test-cluster-cp-lb"}, - {UUID: "lb-uuid-2", Name: "test-cluster-cp-lb"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLB(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancers matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(BeEmpty()) -} - -func TestReconcileLB_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdLB bool - - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - createdLB = true - return &cloudscalesdk.LoadBalancer{UUID: "new-lb-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerID = "deleted-lb-uuid" - - r := newTestReconciler() - - _, err := r.reconcileLB(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdLB).To(BeTrue(), "Should create a new LB when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("new-lb-uuid")) -} - func TestReconcileLB_UsesCustomFlavor(t *testing.T) { g := NewWithT(t) @@ -381,92 +366,6 @@ func TestReconcileLBPool_SkipsIfAlreadyExists(t *testing.T) { g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("existing-pool-uuid")) } -func TestReconcileLBPool_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - poolService := &mockLoadBalancerPoolService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return []cloudscalesdk.LoadBalancerPool{ - {UUID: "found-pool-uuid", Name: "test-cluster-cp-pool"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - g.Fail("Create should not be called when pool is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - poolService: poolService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("found-pool-uuid")) -} - -func TestReconcileLBPool_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - poolService := &mockLoadBalancerPoolService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return []cloudscalesdk.LoadBalancerPool{ - {UUID: "pool-uuid-1", Name: "test-cluster-cp-pool"}, - {UUID: "pool-uuid-2", Name: "test-cluster-cp-pool"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - 
poolService: poolService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer pools matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(BeEmpty()) -} - -func TestReconcileLBPool_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdPool bool - - poolService := &mockLoadBalancerPoolService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - createdPool = true - return &cloudscalesdk.LoadBalancerPool{UUID: "new-pool-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - poolService: poolService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = "deleted-pool-uuid" - - r := newTestReconciler() - - _, err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdPool).To(BeTrue(), "Should create a new pool when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("new-pool-uuid")) -} - func TestReconcileLBPool_UsesCustomAlgorithm(t *testing.T) { g := NewWithT(t) @@ -558,92 +457,6 @@ func TestReconcileLBListener_SkipsIfAlreadyExists(t *testing.T) { g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("existing-listener-uuid")) } -func TestReconcileLBListener_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - listenerService := &mockLoadBalancerListenerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return []cloudscalesdk.LoadBalancerListener{ - {UUID: "found-listener-uuid", Name: "test-cluster-cp-listener"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - g.Fail("Create should not be called when listener is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("found-listener-uuid")) -} - -func TestReconcileLBListener_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - listenerService := &mockLoadBalancerListenerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return []cloudscalesdk.LoadBalancerListener{ - {UUID: "listener-uuid-1", Name: "test-cluster-cp-listener"}, - {UUID: "listener-uuid-2", Name: "test-cluster-cp-listener"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - r := 
newTestReconciler() - - _, err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer listeners matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(BeEmpty()) -} - -func TestReconcileLBListener_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdListener bool - - listenerService := &mockLoadBalancerListenerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - createdListener = true - return &cloudscalesdk.LoadBalancerListener{UUID: "new-listener-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = "deleted-listener-uuid" - - r := newTestReconciler() - - _, err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdListener).To(BeTrue(), "Should create a new listener when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("new-listener-uuid")) -} - func TestReconcileLBListener_UsesCustomPort(t *testing.T) { g := NewWithT(t) @@ -736,92 +549,6 @@ func TestReconcileLBHealthMonitor_SkipsIfAlreadyExists(t *testing.T) { g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("existing-hm-uuid")) } -func TestReconcileLBHealthMonitor_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return []cloudscalesdk.LoadBalancerHealthMonitor{ - {UUID: "found-hm-uuid"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - g.Fail("Create should not be called when health monitor is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - healthMonitorService: healthMonitorService, - lbEnabled: true, - }) - r := newTestReconciler() - - _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("found-hm-uuid")) -} - -func TestReconcileLBHealthMonitor_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return []cloudscalesdk.LoadBalancerHealthMonitor{ - {UUID: "hm-uuid-1"}, - {UUID: "hm-uuid-2"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - healthMonitorService: healthMonitorService, - lbEnabled: true, - 
}) - r := newTestReconciler() - - _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer health monitors matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(BeEmpty()) -} - -func TestReconcileLBHealthMonitor_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdHM bool - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - createdHM = true - return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: "new-hm-uuid"}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - healthMonitorService: healthMonitorService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = "deleted-hm-uuid" - - r := newTestReconciler() - - _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdHM).To(BeTrue(), "Should create a new health monitor when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("new-hm-uuid")) -} - func TestReconcileLBHealthMonitor_UsesCustomThresholds(t *testing.T) { g := NewWithT(t) @@ -886,20 +613,11 @@ func TestReconcileLoadBalancer_SkipsWhenDisabled(t *testing.T) { g.Expect(result.RequeueAfter).To(BeZero()) } -func TestReconcileLoadBalancer_WaitsForLBRunning(t *testing.T) { +func TestReconcileLoadBalancer_ChangingStatusBlocksAndRequeues(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "creating", // Not running yet - }, nil - }, - } - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, + loadBalancerService: existingLBMock(LoadBalancerChangingStatus), lbEnabled: true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID @@ -909,133 +627,93 @@ func TestReconcileLoadBalancer_WaitsForLBRunning(t *testing.T) { result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).ToNot(BeZero(), "Should requeue when LB is not running") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "changing status should requeue after 10s") } -func TestReconcileLoadBalancer_ErrorStatusHasLongerRequeue(t *testing.T) { +func TestReconcileLoadBalancer_ErrorStatusProceedsWithMemberReconciliation(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "error", - }, nil - }, - } - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, + loadBalancerService: 
existingLBMock(LoadBalancerErrorStatus), lbEnabled: true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID - r := newTestReconciler() + r := newTestReconcilerWithClient(newFakeClientForLB()) result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "error status should requeue after 10s") + g.Expect(result.RequeueAfter).To(BeZero(), "error status should not block member reconciliation") + + // LoadBalancerReadyCondition should be False because LB is not running + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) } -func TestReconcileLoadBalancer_DegradedStatusHasLongerRequeue(t *testing.T) { +func TestReconcileLoadBalancer_DegradedStatusProceedsWithMemberReconciliation(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "degraded", - }, nil - }, - } + var deletedMemberUUID string clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, + loadBalancerService: existingLBMock(LoadBalancerDegradedStatus), + poolMemberService: &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return []cloudscalesdk.LoadBalancerPoolMember{ + {Name: "stale-cp-0", UUID: "stale-uuid", Address: "10.0.0.10"}, + }, nil + }, + deleteFn: func(ctx context.Context, poolID, memberID string) error { + deletedMemberUUID = memberID + return nil + }, + }, + lbEnabled: true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID - r := newTestReconciler() + r := newTestReconcilerWithClient(newFakeClientForLB()) result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "degraded status should requeue after 10s") -} - -func TestReconcileLoadBalancer_CreatingStatusHasShorterRequeue(t *testing.T) { - g := NewWithT(t) - - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "creating", - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + g.Expect(result.RequeueAfter).To(BeZero(), "degraded status should not block member reconciliation") - r := newTestReconciler() - - result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + // Stale 
member should have been removed + g.Expect(deletedMemberUUID).To(Equal("stale-uuid")) - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(Equal(5*time.Second), "creating status should requeue after 5s") + // LoadBalancerReadyCondition should be False because LB is degraded + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) } func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "running", - VIPAddresses: []cloudscalesdk.VIPAddress{ - {Address: "203.0.113.10"}, - }, - }, nil - }, - } - - poolService := &mockLoadBalancerPoolService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { - return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil - }, - } - - listenerService := &mockLoadBalancerListenerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { - return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil - }, - } - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil - }, - } - - poolMemberService := &mockLoadBalancerPoolMemberService{ - listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { - return nil, nil - }, - } - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - poolService: poolService, - listenerService: listenerService, - healthMonitorService: healthMonitorService, - poolMemberService: poolMemberService, - lbEnabled: true, + loadBalancerService: &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{ + UUID: id, + Status: LoadBalancerRunningStatus, + VIPAddresses: []cloudscalesdk.VIPAddress{ + {Address: "203.0.113.10"}, + }, + }, nil + }, + }, + lbEnabled: true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index 364efe6..ab5b4ca 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -122,7 +122,7 @@ func defaultMocks() reconcileTestOpts { getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { return &cloudscalesdk.LoadBalancer{ UUID: id, - Status: "running", + Status: LoadBalancerRunningStatus, VIPAddresses: []cloudscalesdk.VIPAddress{ {Address: "1.2.3.4"}, }, @@ -248,7 +248,7 @@ func TestReconcileNormal_LBPendingReturnsRequeue(t *testing.T) { mocks := defaultMocks() mocks.lbService = &mockLoadBalancerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{UUID: id, Status: "creating"}, nil + return &cloudscalesdk.LoadBalancer{UUID: id, 
Status: LoadBalancerChangingStatus}, nil
 		},
 	}

From 184ba561dbb6f59052641433fa18496dd75178df Mon Sep 17 00:00:00 2001
From: Michael Weibel
Date: Tue, 12 May 2026 11:09:27 +0200
Subject: [PATCH 8/9] fix: network CIDR to avoid CNI CIDR overlap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cilium's default cluster-pool IPAM range is `10.0.0.0/8` (allocated as one
`/24` per node). If the managed network subnet overlaps with this range, the
loadbalancer health check may break, which results in a broken cluster.

Setting 172.18.0.0/24 avoids this:
- does not overlap with Cilium's default cluster-pool 10.0.0.0/8.
- does not overlap with the template's clusterNetwork.pods.cidrBlocks: ["192.168.0.0/16"] or services.cidrBlocks: ["10.96.0.0/12"].
- avoids 172.17.0.0/16 (Docker's default bridge on many Linux hosts).

Old range: 10.100.0.0/24 was still inside Cilium's 10.0.0.0/8 and only worked
for small clusters.
---
 README.md                                     |  7 +-
 api/v1beta2/cloudscalecluster_types.go        |  6 --
 cmd/main.go                                   |  2 -
 ...e.cluster.x-k8s.io_cloudscaleclusters.yaml | 13 ---
 internal/cloudscale/client_test.go            | 102 ------------------
 internal/controller/cloudscale_services.go    | 14 ++-
 .../cloudscalecluster_floatingip.go           |  5 +-
 .../cloudscalecluster_loadbalancer.go         | 22 +++-
 .../cloudscalecluster_loadbalancer_test.go    | 32 ++++++
 .../controller/cloudscalecluster_network.go   |  2 +-
 .../cloudscalecluster_servergroup.go          |  5 +-
 .../v1beta2/cloudscalecluster_webhook.go      |  8 +-
 .../v1beta2/cloudscalecluster_webhook_test.go | 24 +---
 templates/cluster-template.yaml               |  2 +-
 test/e2e/config/cloudscale.yaml               |  2 +-
 15 files changed, 78 insertions(+), 168 deletions(-)

diff --git a/README.md b/README.md
index 326fdcc..b027907 100644
--- a/README.md
+++ b/README.md
@@ -87,11 +87,16 @@ clusterctl generate cluster my-cluster \
 | Flavor                    | Network                   | CP Endpoint           | Node Connectivity | Extra Env Vars            | Notes                |
 |---------------------------|---------------------------|-----------------------|-------------------|---------------------------|----------------------|
-| *(default)*               | Managed (`10.100.0.0/24`) | Public LB (DualStack) | Public + cluster  | —                         |                      |
+| *(default)*               | Managed (`172.18.0.0/24`) | Public LB (DualStack) | Public + cluster  | —                         |                      |
 | `fip`                     | Pre-Existing              | Floating IP (IPv4)    | Public + cluster  | `CLOUDSCALE_NETWORK_UUID` |                      |
 | `public-lb-private-nodes` | Pre-Existing + NAT        | Public LB             | Private only      | `CLOUDSCALE_NETWORK_UUID` | Requires NAT gateway |
 | `pre-existing-network`    | Pre-Existing              | Public LB (DualStack) | Public + cluster  | `CLOUDSCALE_NETWORK_UUID` |                      |
 
+The default `networks[].cidr` is `172.18.0.0/24` so it does not overlap with the default Cilium
+cluster-pool IPAM range `10.0.0.0/8`. If you override `networks[].cidr` to a range inside
+`10.0.0.0/8`, make sure to configure your CNI's IP range correctly. Overlapping
+ranges may break, for example, the control-plane LB's health checks.
+
 ## Development
 
 This is a kubebuilder-scaffolded project. For new APIs, Webhooks, etc. [kubebuilder](https://book.kubebuilder.io/)
diff --git a/api/v1beta2/cloudscalecluster_types.go b/api/v1beta2/cloudscalecluster_types.go
index 7250115..294cb68 100644
--- a/api/v1beta2/cloudscalecluster_types.go
+++ b/api/v1beta2/cloudscalecluster_types.go
@@ -163,12 +163,6 @@ type LoadBalancerSpec struct {
 	// +optional
 	Network string `json:"network,omitempty"`
 
-	// IPFamily specifies the IP family for the LB VIP address(es).
-	// +kubebuilder:validation:Enum=IPv4;IPv6;DualStack
-	// +kubebuilder:default=DualStack
-	// +optional
-	IPFamily IPFamily `json:"ipFamily,omitempty"`
-
 	// HealthMonitor configures the load balancer health monitor.
 	// +optional
 	HealthMonitor HealthMonitorSpec `json:"healthMonitor,omitempty"`
diff --git a/cmd/main.go b/cmd/main.go
index 1703d91..b17c68a 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -192,8 +192,6 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "cloudscale.infrastructure.cluster.x-k8s.io",
-		// MaxConcurrentReconciles is set per-controller via --cluster-concurrency
-		// and --machine-concurrency flags, not globally.
 		// LeaderElectionReleaseOnCancel: true,
 	})
 	if err != nil {
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml
index 9fe7c87..d53dd94 100644
--- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml
+++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml
@@ -139,19 +139,6 @@ spec:
                     minimum: 1
                     type: integer
                 type: object
-              ipFamily:
-                allOf:
-                - enum:
-                  - IPv4
-                  - IPv6
-                  - DualStack
-                - enum:
-                  - IPv4
-                  - IPv6
-                  - DualStack
-                default: DualStack
-                description: IPFamily specifies the IP family for the LB VIP address(es).
-                type: string
               network:
                 description: |-
                   Network places the LB VIP on a private network (internal LB).
diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go
index 3c91e92..865c0a2 100644
--- a/internal/cloudscale/client_test.go
+++ b/internal/cloudscale/client_test.go
@@ -17,19 +17,13 @@ limitations under the License.
 package cloudscale
 
 import (
-	"context"
-	"errors"
 	"fmt"
-	"net"
-	"net/http"
 	"net/url"
 	"os"
 	"testing"
-	"time"
 
 	cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
 	. 
"github.com/onsi/gomega" - "golang.org/x/oauth2" ) func TestIsNotFound(t *testing.T) { @@ -102,99 +96,3 @@ func TestIsTimeoutError(t *testing.T) { }) } } - -func TestNewTransport(t *testing.T) { - g := NewWithT(t) - transport := NewTransport() - - g.Expect(transport).ToNot(BeNil()) - g.Expect(transport.ForceAttemptHTTP2).To(BeTrue()) - g.Expect(transport.TLSHandshakeTimeout).To(Equal(5 * time.Second)) - g.Expect(transport.IdleConnTimeout).To(Equal(90 * time.Second)) - g.Expect(transport.MaxIdleConns).To(Equal(50)) - g.Expect(transport.MaxIdleConnsPerHost).To(Equal(50)) - g.Expect(transport.MaxConnsPerHost).To(Equal(0)) - - // Verify HTTP/2 ping settings - g.Expect(transport.HTTP2).ToNot(BeNil()) - g.Expect(transport.HTTP2.SendPingTimeout).To(Equal(5 * time.Second)) - g.Expect(transport.HTTP2.PingTimeout).To(Equal(3 * time.Second)) - - // Verify dial settings by inspecting the DialContext function - g.Expect(transport.DialContext).ToNot(BeNil()) -} - -func TestNewTransport_DialTimeout(t *testing.T) { - g := NewWithT(t) - transport := NewTransport() - - // Dial to an unroutable address to verify timeout works - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // 192.0.2.1 is TEST-NET-1 (RFC 5737) - guaranteed unroutable - conn, err := transport.DialContext(ctx, "tcp", "192.0.2.1:443") - if conn != nil { - _ = conn.Close() - } - - // Should fail with a timeout (dial timeout is 5s) - g.Expect(err).To(HaveOccurred()) - var netErr net.Error - g.Expect(err).To(BeAssignableToTypeOf(&net.OpError{})) - if ok := errors.As(err, &netErr); ok { - g.Expect(netErr.Timeout()).To(BeTrue()) - } -} - -func TestNewClient_ReturnsNonNilClient(t *testing.T) { - g := NewWithT(t) - transport := NewTransport() - client := NewClient("fake-token", transport) - g.Expect(client).ToNot(BeNil()) - g.Expect(client.LoadBalancers).ToNot(BeNil()) - g.Expect(client.Servers).ToNot(BeNil()) - g.Expect(client.Networks).ToNot(BeNil()) -} - -func TestNewClient_NoGlobalTimeout(t *testing.T) { - g := NewWithT(t) - - // Start a slow test server that takes 3s to respond - slowServer := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - time.Sleep(3 * time.Second) - w.WriteHeader(http.StatusOK) - }) - server := &http.Server{Handler: slowServer} - listener, err := net.Listen("tcp", "127.0.0.1:0") - g.Expect(err).ToNot(HaveOccurred()) - go func() { - _ = server.Serve(listener) - - }() - defer func(server *http.Server) { - _ = server.Close() - }(server) - - transport := NewTransport() - - // The client should NOT have a global timeout - only context-based - tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: "test"}) - httpClient := &http.Client{ - Transport: &oauth2.Transport{ - Source: tokenSource, - Base: transport, - }, - } - // Verify no global timeout is set - g.Expect(httpClient.Timeout).To(Equal(time.Duration(0))) -} - -func TestTimeoutConstants(t *testing.T) { - g := NewWithT(t) - - // Verify timeout constants exist and have sensible values - g.Expect(ReadTimeout).To(Equal(30 * time.Second)) - g.Expect(WriteTimeout).To(Equal(2 * time.Minute)) - g.Expect(DeleteTimeout).To(Equal(30 * time.Second)) -} diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go index 8053ea4..664b571 100644 --- a/internal/controller/cloudscale_services.go +++ b/internal/controller/cloudscale_services.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ 
-11,13 +10,6 @@ import (
 	"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope"
 )
 
-const (
-	// CreateTimeoutRequeueInterval is the requeue interval after an HTTP timeout
-	// on a Create call. This MUST be longer than cloudscale.WriteTimeout (2m) so the
-	// original request has time to complete before we retry.
-	CreateTimeoutRequeueInterval = 150 * time.Second
-)
-
 // getListService is satisfied by all cloudscale SDK resource services.
 type getListService[T any] interface {
 	Get(ctx context.Context, id string) (*T, error)
@@ -49,6 +41,12 @@ func ensureResource[T any](
 		// Resource was deleted externally, fall through to list/recreate
 	}
 
+	// The tag-based lookup can run right after a Create: controller-runtime serves reads from a
+	// cache, so the status patch recording the new ID may not be visible yet. Expected log sequence:
+	// Reconcile 1: Create & Patch Status to record the ID
+	// Reconcile 2: Get 404, List 200 (found existing ...)
+	// Reconcile 3: Get 200 -> the cache is now up-to-date
+
 	listCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout)
 	defer cancel()
 	items, err := svc.List(listCtx, cloudscalesdk.WithTagFilter(tags))
diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go
index b0b6d8e..5cd5885 100644
--- a/internal/controller/cloudscalecluster_floatingip.go
+++ b/internal/controller/cloudscalecluster_floatingip.go
@@ -144,9 +144,8 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con
 	fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(createCtx, req)
 	if err != nil {
 		if cloudscale.IsTimeoutError(err) {
-			const timeout = createFloatingIPTimeoutRequeueAfter
-			clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", timeout)
-			return ctrl.Result{RequeueAfter: timeout}, nil
+			clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", createFloatingIPTimeoutRequeueAfter)
+			return ctrl.Result{RequeueAfter: createFloatingIPTimeoutRequeueAfter}, nil
 		}
 		return ctrl.Result{}, fmt.Errorf("creating floating IP: %w", err)
 	}
diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go
index 720c93b..e9e87c7 100644
--- a/internal/controller/cloudscalecluster_loadbalancer.go
+++ b/internal/controller/cloudscalecluster_loadbalancer.go
@@ -56,12 +56,13 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
 	}
 
 	var lbPending bool
+	lbPendingMsg := "Load balancer is not yet running"
 	defer func() {
 		if reterr != nil {
 			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerErrorReason, reterr.Error())
 		} else if lbPending {
-			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerNotReadyReason, "Load balancer is not yet running")
+			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerNotReadyReason, lbPendingMsg)
 		} else {
 			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.LoadBalancerRunningReason, "")
 		}
@@ -100,6 +101,25 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
 		// because removing stale members is the only path back to "running"
 		clusterScope.Info("Load
balancer is not running, proceeding with member reconciliation", "status", lb.Status) lbPending = true + lbPendingMsg = fmt.Sprintf("Load balancer is in %s", lb.Status) + if poolID := clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID; poolID != "" { + listCtx, listCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + members, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(listCtx, poolID) + listCancel() + if err != nil { + clusterScope.Info("Failed to list pool members for diagnostics", "err", err.Error()) + } else { + var down []string + for _, m := range members { + if m.MonitorStatus == "down" { + down = append(down, fmt.Sprintf("%s@%s:%d", m.Name, m.Address, m.ProtocolPort)) + } + } + if len(down) > 0 { + lbPendingMsg = fmt.Sprintf("%s; pool members failing health check: %v", lbPendingMsg, down) + } + } + } } // 2. Reconcile the pool diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index 2499d1d..6656068 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -698,6 +698,38 @@ func TestReconcileLoadBalancer_DegradedStatusProceedsWithMemberReconciliation(t g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) } +func TestReconcileLoadBalancer_ErrorStatusMessageNamesDownMembers(t *testing.T) { + g := NewWithT(t) + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: existingLBMock(LoadBalancerErrorStatus), + poolMemberService: &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return []cloudscalesdk.LoadBalancerPoolMember{ + {Name: "cp-0", UUID: "cp-uuid", Address: "10.0.0.110", ProtocolPort: 6443, MonitorStatus: "down"}, + }, nil + }, + }, + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID + + r := newTestReconcilerWithClient(newFakeClientForLB()) + + _, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + g.Expect(err).ToNot(HaveOccurred()) + + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) + g.Expect(cond.Message).To(ContainSubstring("error")) + g.Expect(cond.Message).To(ContainSubstring("cp-0@10.0.0.110:6443")) +} + func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { g := NewWithT(t) diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 551387d..180bc14 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -232,8 +232,8 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster logger.Info("Deleting network") deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) - defer cancel() err := clusterScope.CloudscaleClient.Networks.Delete(deleteCtx, ns.NetworkID) + cancel() // return sentinel 
error; the caller requeues after a pre-defined wait
 	if isLBPoolMembersError(err) {
diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go
index bd746dc..3f480e9 100644
--- a/internal/controller/cloudscalecluster_servergroup.go
+++ b/internal/controller/cloudscalecluster_servergroup.go
@@ -79,8 +79,9 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl
 		clusterScope.Info("Deleting server group", "serverGroupID", g.UUID, "name", g.Name)
 
 		deleteCtx, cancelDelete := context.WithTimeout(ctx, cloudscale.DeleteTimeout)
-		defer cancelDelete()
-		if err := clusterScope.CloudscaleClient.ServerGroups.Delete(deleteCtx, g.UUID); err != nil {
+		err := clusterScope.CloudscaleClient.ServerGroups.Delete(deleteCtx, g.UUID)
+		cancelDelete()
+		if err != nil {
 			if !cloudscale.IsNotFound(err) {
 				return fmt.Errorf("deleting server group %s: %w", g.UUID, err)
 			}
diff --git a/internal/webhook/v1beta2/cloudscalecluster_webhook.go b/internal/webhook/v1beta2/cloudscalecluster_webhook.go
index 8143a65..698989d 100644
--- a/internal/webhook/v1beta2/cloudscalecluster_webhook.go
+++ b/internal/webhook/v1beta2/cloudscalecluster_webhook.go
@@ -59,7 +59,7 @@ type CloudscaleClusterCustomDefaulter struct {
 	RegionInfo *cloudscale.RegionInfo
 }
 
-const defaultSubnetCIDR = "10.0.0.0/24"
+const defaultSubnetCIDR = "172.18.0.0/24"
 
 // Default implements webhook.CustomDefaulter so a webhook will be registered for the Kind CloudscaleCluster.
 func (d *CloudscaleClusterCustomDefaulter) Default(_ context.Context, cluster *infrastructurev1beta2.CloudscaleCluster) error {
@@ -93,9 +93,6 @@ func (d *CloudscaleClusterCustomDefaulter) Default(_ context.Context, cluster *i
 	if cluster.Spec.ControlPlaneLoadBalancer.APIServerPort == 0 {
 		cluster.Spec.ControlPlaneLoadBalancer.APIServerPort = 6443
 	}
-	if cluster.Spec.ControlPlaneLoadBalancer.IPFamily == "" {
-		cluster.Spec.ControlPlaneLoadBalancer.IPFamily = infrastructurev1beta2.IPFamilyDualStack
-	}
 
 	if cluster.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS == 0 {
 		cluster.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS = 5
@@ -405,7 +402,7 @@ func validateFloatingIPRequiresLBOrPreExisting(cluster *infrastructurev1beta2.Cl
 }
 
 // validateLBImmutability forbids changes to LB fields that are baked into the LB at creation.
-// Algorithm, Flavor, APIServerPort, IPFamily and the HealthMonitor settings cannot be reissued
+// Algorithm, Flavor, APIServerPort and the HealthMonitor settings cannot be reissued
 // to an existing cloudscale.ch LB, so changing them in spec would silently lie to the user.
func validateLBImmutability(oldLB, newLB *infrastructurev1beta2.LoadBalancerSpec, fldPath *field.Path) field.ErrorList { var allErrs field.ErrorList @@ -421,7 +418,6 @@ func validateLBImmutability(oldLB, newLB *infrastructurev1beta2.LoadBalancerSpec forbidIfChanged("algorithm", oldLB.Algorithm, newLB.Algorithm) forbidIfChanged("flavor", oldLB.Flavor, newLB.Flavor) forbidIfChanged("apiServerPort", oldLB.APIServerPort, newLB.APIServerPort) - forbidIfChanged("ipFamily", oldLB.IPFamily, newLB.IPFamily) hmPath := fldPath.Child("healthMonitor") hmForbid := func(child string, oldV, newV int) { diff --git a/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go b/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go index 33cfde5..84a6f70 100644 --- a/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go +++ b/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go @@ -153,15 +153,6 @@ func TestClusterDefaulting_APIServerPort(t *testing.T) { g.Expect(obj.Spec.ControlPlaneLoadBalancer.APIServerPort).To(Equal(int32(6443))) } -func TestClusterDefaulting_LBIPFamily(t *testing.T) { - g := NewWithT(t) - obj, _, _, defaulter := newClusterWebhookTestObjects() - obj.Spec.ControlPlaneLoadBalancer.IPFamily = "" - - g.Expect(defaulter.Default(ctx, obj)).To(Succeed()) - g.Expect(obj.Spec.ControlPlaneLoadBalancer.IPFamily).To(Equal(infrastructurev1beta2.IPFamilyDualStack)) -} - func TestClusterDefaulting_HealthMonitorFields(t *testing.T) { g := NewWithT(t) obj, _, _, defaulter := newClusterWebhookTestObjects() @@ -225,7 +216,6 @@ func TestClusterDefaulting_AllDefaultsApplied(t *testing.T) { g.Expect(obj.Spec.ControlPlaneLoadBalancer.Algorithm).To(Equal("round_robin")) g.Expect(obj.Spec.ControlPlaneLoadBalancer.Flavor).To(Equal("lb-standard")) g.Expect(obj.Spec.ControlPlaneLoadBalancer.APIServerPort).To(Equal(int32(6443))) - g.Expect(obj.Spec.ControlPlaneLoadBalancer.IPFamily).To(Equal(infrastructurev1beta2.IPFamilyDualStack)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS).To(Equal(5)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.TimeoutS).To(Equal(3)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.UpThreshold).To(Equal(2)) @@ -330,7 +320,7 @@ func TestClusterValidateCreate_GatewayWithinCIDR(t *testing.T) { obj.Spec.Region = RegionRma obj.Spec.Zone = ZoneRma1 obj.Spec.Networks = []infrastructurev1beta2.NetworkSpec{ - {Name: "main", CIDR: defaultSubnetCIDR, GatewayAddress: "10.0.0.1"}, + {Name: "main", CIDR: defaultSubnetCIDR, GatewayAddress: "172.18.0.1"}, } _, err := validator.ValidateCreate(ctx, obj) @@ -742,13 +732,6 @@ func TestClusterValidateUpdate_LBFieldsImmutable(t *testing.T) { mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { c.Spec.ControlPlaneLoadBalancer.APIServerPort = 8443 }, errPath: "controlPlaneLoadBalancer.apiServerPort", }, - { - name: "IPFamily", - mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { - c.Spec.ControlPlaneLoadBalancer.IPFamily = infrastructurev1beta2.IPFamilyIPv6 - }, - errPath: "controlPlaneLoadBalancer.ipFamily", - }, { name: "HealthMonitor.DelayS", mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { @@ -783,11 +766,10 @@ func TestClusterValidateUpdate_LBFieldsImmutable(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) obj, oldObj, validator := setupUpdateTestObjects() - // Seed health monitor + IPFamily on old so changes are visible. + // Seed health monitor on old so changes are visible. 
oldObj.Spec.ControlPlaneLoadBalancer.Algorithm = "round_robin" oldObj.Spec.ControlPlaneLoadBalancer.Flavor = "lb-standard" oldObj.Spec.ControlPlaneLoadBalancer.APIServerPort = 6443 - oldObj.Spec.ControlPlaneLoadBalancer.IPFamily = infrastructurev1beta2.IPFamilyDualStack oldObj.Spec.ControlPlaneLoadBalancer.HealthMonitor = infrastructurev1beta2.HealthMonitorSpec{ DelayS: 5, TimeoutS: 3, UpThreshold: 2, DownThreshold: 3, } @@ -833,7 +815,7 @@ func TestClusterValidateDelete_AlwaysSucceeds(t *testing.T) { func TestValidateGatewayInCIDR_ValidGateway(t *testing.T) { g := NewWithT(t) - errs := validateGatewayInCIDR(defaultSubnetCIDR, "10.0.0.1", field.NewPath("spec", "networks", "gatewayAddress")) + errs := validateGatewayInCIDR(defaultSubnetCIDR, "172.18.0.1", field.NewPath("spec", "networks", "gatewayAddress")) g.Expect(errs).To(BeEmpty()) } diff --git a/templates/cluster-template.yaml b/templates/cluster-template.yaml index d075f2a..f3f016b 100644 --- a/templates/cluster-template.yaml +++ b/templates/cluster-template.yaml @@ -39,7 +39,7 @@ spec: region: "${CLOUDSCALE_REGION}" networks: - name: "${CLUSTER_NAME}" - cidr: "10.0.0.0/24" + cidr: "172.18.0.0/24" gatewayAddress: "" # disable gateway, use public interface for internet access credentialsRef: name: "${CLUSTER_NAME}-credentials" diff --git a/test/e2e/config/cloudscale.yaml b/test/e2e/config/cloudscale.yaml index 55f6c7b..e26364b 100644 --- a/test/e2e/config/cloudscale.yaml +++ b/test/e2e/config/cloudscale.yaml @@ -73,7 +73,7 @@ variables: CLOUDSCALE_MACHINE_IMAGE_UPGRADE_FROM: "custom:ubuntu-2404-kube-v1.34.3" # CLOUDSCALE_SSH_PUBLIC_KEY: Set via environment variable CLOUDSCALE_ROOT_VOLUME_SIZE: "20" - CLOUDSCALE_NETWORK_CIDR: "10.100.0.0/24" + CLOUDSCALE_NETWORK_CIDR: "172.18.0.0/24" # Machine template names for upgrade tests CONTROL_PLANE_MACHINE_TEMPLATE_UPGRADE_TO: "k8s-upgrade-control-plane" From f0bb584c01904726289fcea92424688ad11aaf3e Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Tue, 12 May 2026 13:59:39 +0200 Subject: [PATCH 9/9] fix(e2e): increase timeout, use concurrency flags, default conformance-fast avoids test failures due to timeout because conformance takes a long time --- Makefile | 5 +++-- README.md | 16 ++++++++-------- config/e2e/kustomization.yaml | 11 +++++++++++ config/e2e/manager_concurrency_patch.yaml | 6 ++++++ test/e2e/config/cloudscale.yaml | 2 +- 5 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 config/e2e/kustomization.yaml create mode 100644 config/e2e/manager_concurrency_patch.yaml diff --git a/Makefile b/Makefile index d000dc2..88f18d2 100644 --- a/Makefile +++ b/Makefile @@ -99,7 +99,7 @@ E2E_CONF_FILE_SOURCE ?= $(shell pwd)/test/e2e/config/cloudscale.yaml E2E_CONF_FILE ?= $(shell pwd)/test/e2e/config/cloudscale.generated.yaml E2E_ARTIFACTS_FOLDER ?= $(shell pwd)/_artifacts E2E_TEMPLATES := test/e2e/data/infrastructure-cloudscale -GINKGO_TIMEOUT ?= 2h +GINKGO_TIMEOUT ?= 3h GINKGO_NODES ?= 1 SKIP_RESOURCE_CLEANUP ?= false USE_EXISTING_CLUSTER ?= false @@ -147,7 +147,8 @@ generate-e2e-config: ## Generate e2e config from template by resolving environme .PHONY: test-e2e test-e2e: TAG = $(E2E_TAG) -test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests +test-e2e: KUBETEST_CONFIGURATION = ./data/kubetest/conformance-fast.yaml +test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests (uses conformance-fast; for full conformance run test-e2e-conformance separately) $(GINKGO) -v 
--trace --tags=e2e \ --nodes=$(GINKGO_NODES) \ --timeout=$(GINKGO_TIMEOUT) \ diff --git a/README.md b/README.md index b027907..afd3721 100644 --- a/README.md +++ b/README.md @@ -124,14 +124,14 @@ filtering and are split into suites of increasing cost, scheduled accordingly: | Suite | Label | Description | ~Duration | Schedule | Make target | |-------------------------|---------------------------|------------------------------------------------------------------------------------------|-----------|----------|------------------------------------| -| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | < 5 min | Nightly | `test-e2e-lifecycle` | -| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | < 10 min | Weekly | `test-e2e-ha` | -| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.34 → v1.35) | < 10 min | Weekly | `test-e2e-upgrade` | -| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | < 15 min | Weekly | `test-e2e-self-hosted` | -| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | < 10 min | Weekly | `test-e2e-md-remediation` | -| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | < 10 min | Weekly | `test-e2e-pre-existing-networking` | -| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | < 60 min | Weekly | `test-e2e-conformance-fast` | -| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | < 120 min | Biweekly | `test-e2e-conformance` | +| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | ~5 min | Nightly | `test-e2e-lifecycle` | +| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | ~8 min | Weekly | `test-e2e-ha` | +| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.34 → v1.35) | ~25 min | Weekly | `test-e2e-upgrade` | +| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | ~13 min | Weekly | `test-e2e-self-hosted` | +| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | ~6 min | Weekly | `test-e2e-md-remediation` | +| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | ~30 min | Weekly | `test-e2e-pre-existing-networking` | +| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | ~55 min | Weekly | `test-e2e-conformance-fast` | +| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | ~120 min | Biweekly | `test-e2e-conformance` | Durations are approximate from a real CI run; conformance varies with cluster size. 
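The config/e2e overlay below injects --cluster-concurrency and --machine-concurrency into the
manager's args. As a rough sketch of how such flags typically map to per-controller parallelism
in controller-runtime (assumption: this only loosely mirrors the provider's actual main.go;
demoReconciler and the corev1.Pod watch target are stand-ins, not the provider's types):

package main

import (
	"context"
	"flag"

	corev1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
)

// demoReconciler stands in for the provider's cluster/machine reconcilers.
type demoReconciler struct{ client.Client }

func (r *demoReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return ctrl.Result{}, nil
}

func main() {
	var clusterConcurrency int
	flag.IntVar(&clusterConcurrency, "cluster-concurrency", 1,
		"maximum number of cluster reconciles running in parallel")
	flag.Parse()

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		panic(err)
	}

	// MaxConcurrentReconciles is a per-controller option, so each controller
	// can receive its own flag-driven value instead of one manager-wide knob.
	if err := ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}). // stand-in for the provider's CRD type
		WithOptions(controller.Options{MaxConcurrentReconciles: clusterConcurrency}).
		Complete(&demoReconciler{Client: mgr.GetClient()}); err != nil {
		panic(err)
	}

	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		panic(err)
	}
}

Values above 1 only pay off when concurrent reconciles cannot race each other on shared external
resources; the e2e overlay accepts that trade-off to speed up test runs.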
diff --git a/config/e2e/kustomization.yaml b/config/e2e/kustomization.yaml new file mode 100644 index 0000000..03c4f02 --- /dev/null +++ b/config/e2e/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../default + +patches: +- path: manager_concurrency_patch.yaml + target: + kind: Deployment + name: controller-manager \ No newline at end of file diff --git a/config/e2e/manager_concurrency_patch.yaml b/config/e2e/manager_concurrency_patch.yaml new file mode 100644 index 0000000..1629101 --- /dev/null +++ b/config/e2e/manager_concurrency_patch.yaml @@ -0,0 +1,6 @@ +- op: add + path: /spec/template/spec/containers/0/args/- + value: --cluster-concurrency=4 +- op: add + path: /spec/template/spec/containers/0/args/- + value: --machine-concurrency=10 \ No newline at end of file diff --git a/test/e2e/config/cloudscale.yaml b/test/e2e/config/cloudscale.yaml index e26364b..910e6be 100644 --- a/test/e2e/config/cloudscale.yaml +++ b/test/e2e/config/cloudscale.yaml @@ -42,7 +42,7 @@ providers: type: InfrastructureProvider versions: - name: v0.99.99 # dev version, always higher than any release - value: ../../../config/default + value: ../../../config/e2e type: kustomize contract: v1beta2 replacements:
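To make the overlap reasoning from PATCH 8 easy to verify, here is a small self-contained sketch
using Go's net/netip. The prefixes are the ones named in that commit message, and the printed
results mirror its bullet points; the program is purely illustrative and not part of the series:

package main

import (
	"fmt"
	"net/netip"
)

func main() {
	ciliumPool := netip.MustParsePrefix("10.0.0.0/8")      // Cilium cluster-pool IPAM default
	oldSubnet := netip.MustParsePrefix("10.100.0.0/24")    // previous managed-network default
	newSubnet := netip.MustParsePrefix("172.18.0.0/24")    // new managed-network default
	dockerBridge := netip.MustParsePrefix("172.17.0.0/16") // Docker's default bridge

	fmt.Println(ciliumPool.Overlaps(oldSubnet))   // true  -> pod CIDRs can collide, LB health checks may break
	fmt.Println(ciliumPool.Overlaps(newSubnet))   // false -> clear of Cilium's default range
	fmt.Println(dockerBridge.Overlaps(newSubnet)) // false -> clear of Docker's bridge as well
}

A validating webhook could run the same Overlaps check against the configured CNI ranges, but the
series keeps this as documentation plus a safe default instead.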