diff --git a/Makefile b/Makefile
index d000dc2..88f18d2 100644
--- a/Makefile
+++ b/Makefile
@@ -99,7 +99,7 @@ E2E_CONF_FILE_SOURCE ?= $(shell pwd)/test/e2e/config/cloudscale.yaml
 E2E_CONF_FILE ?= $(shell pwd)/test/e2e/config/cloudscale.generated.yaml
 E2E_ARTIFACTS_FOLDER ?= $(shell pwd)/_artifacts
 E2E_TEMPLATES := test/e2e/data/infrastructure-cloudscale
-GINKGO_TIMEOUT ?= 2h
+GINKGO_TIMEOUT ?= 3h
 GINKGO_NODES ?= 1
 SKIP_RESOURCE_CLEANUP ?= false
 USE_EXISTING_CLUSTER ?= false
@@ -147,7 +147,8 @@ generate-e2e-config: ## Generate e2e config from template by resolving environme
 .PHONY: test-e2e
 test-e2e: TAG = $(E2E_TAG)
-test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests
+test-e2e: KUBETEST_CONFIGURATION = ./data/kubetest/conformance-fast.yaml
+test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests (uses conformance-fast; for full conformance run test-e2e-conformance separately)
 	$(GINKGO) -v --trace --tags=e2e \
 		--nodes=$(GINKGO_NODES) \
 		--timeout=$(GINKGO_TIMEOUT) \
diff --git a/README.md b/README.md
index 326fdcc..afd3721 100644
--- a/README.md
+++ b/README.md
@@ -87,11 +87,16 @@ clusterctl generate cluster my-cluster \
 | Flavor                    | Network                   | CP Endpoint           | Node Connectivity | Extra Env Vars            | Notes                |
 |---------------------------|---------------------------|-----------------------|-------------------|---------------------------|----------------------|
-| *(default)*               | Managed (`10.100.0.0/24`) | Public LB (DualStack) | Public + cluster  | —                         |                      |
+| *(default)*               | Managed (`172.18.0.0/24`) | Public LB (DualStack) | Public + cluster  | —                         |                      |
 | `fip`                     | Pre-Existing              | Floating IP (IPv4)    | Public + cluster  | `CLOUDSCALE_NETWORK_UUID` |                      |
 | `public-lb-private-nodes` | Pre-Existing + NAT        | Public LB             | Private only      | `CLOUDSCALE_NETWORK_UUID` | Requires NAT gateway |
 | `pre-existing-network`    | Pre-Existing              | Public LB (DualStack) | Public + cluster  | `CLOUDSCALE_NETWORK_UUID` |                      |
 
+The default `networks[].cidr` is `172.18.0.0/24` so that it does not overlap with the default Cilium
+cluster-pool IPAM range of `10.0.0.0/8`. If you override `networks[].cidr` with a range inside
+`10.0.0.0/8`, make sure your CNI's IP range does not overlap with it. Overlapping
+ranges may break, for example, the control-plane LB's health checks.
+
 ## Development
 
 This is a kubebuilder-scaffolded project. For new APIs, Webhooks, etc. [kubebuilder](https://book.kubebuilder.io/)
@@ -119,14 +124,14 @@ filtering and are split into suites of increasing cost, scheduled accordingly:
 | Suite                   | Label                     | Description                                                                                | ~Duration | Schedule | Make target                        |
 |-------------------------|---------------------------|------------------------------------------------------------------------------------------|-----------|----------|------------------------------------|
-| Lifecycle               | `lifecycle`               | 1 CP + 1 worker: create, validate cloudscale resources, delete                            | < 5 min   | Nightly  | `test-e2e-lifecycle`               |
-| HA lifecycle            | `ha`                      | 3 CP + 2 workers with anti-affinity server groups                                         | < 10 min  | Weekly   | `test-e2e-ha`                      |
-| Cluster upgrade         | `upgrade`                 | Rolling K8s version upgrade (v1.34 → v1.35)                                               | < 10 min  | Weekly   | `test-e2e-upgrade`                 |
-| Self-hosted             | `self-hosted`             | clusterctl move (pivot) to workload cluster. 
Requires container image in public registry | < 15 min | Weekly | `test-e2e-self-hosted` | -| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | < 10 min | Weekly | `test-e2e-md-remediation` | -| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | < 10 min | Weekly | `test-e2e-pre-existing-networking` | -| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | < 60 min | Weekly | `test-e2e-conformance-fast` | -| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | < 120 min | Biweekly | `test-e2e-conformance` | +| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | ~5 min | Nightly | `test-e2e-lifecycle` | +| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | ~8 min | Weekly | `test-e2e-ha` | +| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.34 → v1.35) | ~25 min | Weekly | `test-e2e-upgrade` | +| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | ~13 min | Weekly | `test-e2e-self-hosted` | +| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | ~6 min | Weekly | `test-e2e-md-remediation` | +| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | ~30 min | Weekly | `test-e2e-pre-existing-networking` | +| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | ~55 min | Weekly | `test-e2e-conformance-fast` | +| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | ~120 min | Biweekly | `test-e2e-conformance` | Durations are approximate from a real CI run; conformance varies with cluster size. diff --git a/api/v1beta2/cloudscalecluster_types.go b/api/v1beta2/cloudscalecluster_types.go index 7250115..294cb68 100644 --- a/api/v1beta2/cloudscalecluster_types.go +++ b/api/v1beta2/cloudscalecluster_types.go @@ -163,12 +163,6 @@ type LoadBalancerSpec struct { // +optional Network string `json:"network,omitempty"` - // IPFamily specifies the IP family for the LB VIP address(es). - // +kubebuilder:validation:Enum=IPv4;IPv6;DualStack - // +kubebuilder:default=DualStack - // +optional - IPFamily IPFamily `json:"ipFamily,omitempty"` - // HealthMonitor configures the load balancer health monitor. // +optional HealthMonitor HealthMonitorSpec `json:"healthMonitor,omitempty"` diff --git a/cmd/main.go b/cmd/main.go index 015214b..b17c68a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -21,6 +21,7 @@ import ( "crypto/tls" "flag" "fmt" + "net/http" "os" "time" @@ -70,6 +71,8 @@ func main() { var probeAddr string var secureMetrics bool var enableHTTP2 bool + var clusterConcurrency int + var machineConcurrency int var watchFilter string var tlsOpts []func(*tls.Config) flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ @@ -89,6 +92,10 @@ func main() { flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") + flag.IntVar(&clusterConcurrency, "cluster-concurrency", 1, + "Maximum concurrent reconciles for CloudscaleCluster controller (1-4)") + flag.IntVar(&machineConcurrency, "machine-concurrency", 1, + "Maximum concurrent reconciles for CloudscaleMachine controller (1-10)") flag.StringVar(&watchFilter, "watch-filter", "", fmt.Sprintf("Label value that the controller watches to reconcile cluster-api objects. Label key is always %s. "+ "If unspecified, the controller watches for all cluster-api objects.", clusterv1.WatchLabel)) @@ -100,6 +107,17 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + if clusterConcurrency < 1 || clusterConcurrency > 4 { + setupLog.Error( + fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag") + os.Exit(1) + } + if machineConcurrency < 1 || machineConcurrency > 10 { + setupLog.Error( + fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag") + os.Exit(1) + } + // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancellation and @@ -174,16 +192,6 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "cloudscale.infrastructure.cluster.x-k8s.io", - // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily - // when the Manager ends. This requires the binary to immediately end when the - // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly - // speeds up voluntary leader transitions as the new leader don't have to wait - // LeaseDuration time first. - // - // In the default scaffold provided, the program ends immediately after - // the manager stops, so would be fine to enable this option. However, - // if you are doing or is intended to do any operation such as perform cleanups - // after the manager stops then its usage might be unsafe. // LeaderElectionReleaseOnCancel: true, }) if err != nil { @@ -193,8 +201,12 @@ func main() { ctx := ctrl.SetupSignalHandler() + // Create a shared HTTP transport for all cloudscale API clients. + // This enables connection pooling and HTTP/2 multiplexing across reconciles. 
+ transport := cloudscale.NewTransport() + // Fetch region information for controllers and webhooks - regionInfo, flavorInfo, err := fetchAPIInfo() + regionInfo, flavorInfo, err := fetchAPIInfo(transport) if err != nil { setupLog.Error(err, "unable to fetch API information") os.Exit(1) @@ -203,17 +215,21 @@ func main() { setupLog.Info("fetched flavor information", "flavors", len(flavorInfo.GetAllFlavors())) if err := (&controller.CloudscaleClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - WatchFilter: watchFilter, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + WatchFilter: watchFilter, + Transport: transport, + MaxConcurrentReconciles: clusterConcurrency, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleCluster") os.Exit(1) } if err := (&controller.CloudscaleMachineReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - WatchFilter: watchFilter, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + WatchFilter: watchFilter, + Transport: transport, + MaxConcurrentReconciles: machineConcurrency, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachine") os.Exit(1) @@ -263,7 +279,7 @@ func main() { // fetchAPIInfo fetches region and flavor information from cloudscale.ch API. // Requires CLOUDSCALE_API_TOKEN environment variable. -func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { +func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { token := os.Getenv("CLOUDSCALE_API_TOKEN") if token == "" { return nil, nil, fmt.Errorf("CLOUDSCALE_API_TOKEN environment variable is required") @@ -272,7 +288,7 @@ func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - client := cloudscale.NewClient(token) + client := cloudscale.NewClient(token, transport) var regionInfo *cloudscale.RegionInfo var flavorInfo *cloudscale.FlavorInfo diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml index 9fe7c87..d53dd94 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml @@ -139,19 +139,6 @@ spec: minimum: 1 type: integer type: object - ipFamily: - allOf: - - enum: - - IPv4 - - IPv6 - - DualStack - - enum: - - IPv4 - - IPv6 - - DualStack - default: DualStack - description: IPFamily specifies the IP family for the LB VIP address(es). - type: string network: description: |- Network places the LB VIP on a private network (internal LB). 
diff --git a/config/e2e/kustomization.yaml b/config/e2e/kustomization.yaml new file mode 100644 index 0000000..03c4f02 --- /dev/null +++ b/config/e2e/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../default + +patches: +- path: manager_concurrency_patch.yaml + target: + kind: Deployment + name: controller-manager \ No newline at end of file diff --git a/config/e2e/manager_concurrency_patch.yaml b/config/e2e/manager_concurrency_patch.yaml new file mode 100644 index 0000000..1629101 --- /dev/null +++ b/config/e2e/manager_concurrency_patch.yaml @@ -0,0 +1,6 @@ +- op: add + path: /spec/template/spec/containers/0/args/- + value: --cluster-concurrency=4 +- op: add + path: /spec/template/spec/containers/0/args/- + value: --machine-concurrency=10 \ No newline at end of file diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go index e88b324..7539c8f 100644 --- a/internal/cloudscale/client.go +++ b/internal/cloudscale/client.go @@ -17,9 +17,13 @@ limitations under the License. package cloudscale import ( - "context" "errors" + "net" + "net/http" + "net/url" + "os" "strings" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "golang.org/x/oauth2" @@ -40,9 +44,61 @@ type Client struct { Flavors FlavorService } -func NewClient(token string) *Client { +const ( + // ReadTimeout is the context timeout for Get/List API calls. + ReadTimeout = 1 * time.Second + + // WriteTimeout is the context timeout for Create/Update API calls. + // Creates can take 60s+ under API load. + WriteTimeout = 2 * time.Minute + + // DeleteTimeout is the context timeout for Delete API calls. + DeleteTimeout = 2 * time.Second +) + +// NewTransport creates an http.Transport configured for the cloudscale.ch API. +// The returned transport should be created once and shared across all clients +// to benefit from connection pooling and HTTP/2 multiplexing. +func NewTransport() *http.Transport { + return &http.Transport{ + DialContext: (&net.Dialer{ + Timeout: 5 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + + TLSHandshakeTimeout: 5 * time.Second, + + // needs to be set because we also set DialContext + ForceAttemptHTTP2: true, + HTTP2: &http.HTTP2Config{ + SendPingTimeout: 5 * time.Second, + PingTimeout: 3 * time.Second, + }, + + IdleConnTimeout: 90 * time.Second, + MaxIdleConns: 50, + MaxIdleConnsPerHost: 50, + MaxConnsPerHost: 0, + } +} + +// NewClient creates a cloudscale.ch API client using the given token and +// shared transport. The transport should be created once via NewTransport() +// and reused across clients. Each client gets its own oauth2 token injection +// but shares the underlying connection pool. +// +// No global HTTP timeout is set on the client. Instead, callers must use +// context.WithTimeout with ReadTimeout, WriteTimeout, or DeleteTimeout +// for each API call. 
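+//
+// A caller-side sketch (illustrative only; client and ctx are assumed to be
+// in scope):
+//
+//	getCtx, cancel := context.WithTimeout(ctx, ReadTimeout)
+//	defer cancel()
+//	fip, err := client.FloatingIPs.Get(getCtx, "1.2.3.4")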
+func NewClient(token string, transport *http.Transport) *Client {
 	tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token})
-	httpClient := oauth2.NewClient(context.Background(), tokenSource)
+
+	httpClient := &http.Client{
+		Transport: &oauth2.Transport{
+			Source: tokenSource,
+			Base:   transport,
+		},
+	}
 
 	sdkClient := cloudscalesdk.NewClient(httpClient)
 	return &Client{
@@ -82,3 +138,20 @@ func IsFloatingIPNoPublicInterface(err error) bool {
 	}
 	return strings.Contains(err.Error(), "does not have a public interface with an IPv4 address")
 }
+
+// IsTimeoutError reports whether err indicates the HTTP request timed out
+// before receiving a response.
+func IsTimeoutError(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	var urlErr *url.Error
+	if errors.As(err, &urlErr) {
+		return urlErr.Timeout()
+	}
+	if errors.Is(err, os.ErrDeadlineExceeded) {
+		return true
+	}
+	return false
+}
diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go
index 2fa1fde..865c0a2 100644
--- a/internal/cloudscale/client_test.go
+++ b/internal/cloudscale/client_test.go
@@ -18,6 +18,8 @@ package cloudscale
 
 import (
 	"fmt"
+	"net/url"
+	"os"
 	"testing"
 
 	cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
@@ -46,3 +48,51 @@ func TestIsNotFound(t *testing.T) {
 		})
 	}
 }
+
+func TestIsTimeoutError(t *testing.T) {
+	tests := []struct {
+		name     string
+		err      error
+		expected bool
+	}{
+		{"nil error returns false", nil, false},
+		{
+			"url.Error with Timeout=true returns true",
+			&url.Error{Op: "Post", URL: "https://api.example.com/v1/servers", Err: os.ErrDeadlineExceeded},
+			true,
+		},
+		{
+			"url.Error with Timeout=false returns false",
+			&url.Error{Op: "Get", URL: "https://api.example.com/v1/servers", Err: fmt.Errorf("connection refused")},
+			false,
+		},
+		{
+			"wrapped url.Error with Timeout=true returns true",
+			fmt.Errorf("outer: %w", &url.Error{Op: "Post", URL: "https://api.example.com/v1/servers", Err: os.ErrDeadlineExceeded}),
+			true,
+		},
+		{
+			"os.ErrDeadlineExceeded returns true",
+			os.ErrDeadlineExceeded,
+			true,
+		},
+		{
+			"generic error returns false",
+			fmt.Errorf("some other error"),
+			false,
+		},
+		{
+			"ErrorResponse with 500 returns false",
+			&cloudscalesdk.ErrorResponse{StatusCode: 500},
+			false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+			result := IsTimeoutError(tt.err)
+			g.Expect(result).To(Equal(tt.expected))
+		})
+	}
+}
diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go
index f1835bb..664b571 100644
--- a/internal/controller/cloudscale_services.go
+++ b/internal/controller/cloudscale_services.go
@@ -29,7 +29,9 @@ func ensureResource[T any](
 	tags cloudscalesdk.TagMap,
 ) (*T, string, error) {
 	if currentID != "" {
-		resource, err := svc.Get(ctx, currentID)
+		getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout)
+		defer cancel()
+		resource, err := svc.Get(getCtx, currentID)
 		if err == nil {
 			return resource, currentID, nil
 		}
@@ -39,7 +41,15 @@ func ensureResource[T any](
 		// Resource was deleted externally, fall through to list/recreate
 	}
 
-	items, err := svc.List(ctx, cloudscalesdk.WithTagFilter(tags))
+	// The tag-based lookup can run for a resource that was created just beforehand,
+	// because of the caching behavior of controller-runtime. The following log
+	// pattern is therefore expected:
+	// Reconcile 1: Create & Patch Status to have the ID recorded
+	// Reconcile 2: Get 404, List 200 (found existing ...)
+ // Reconcile 3: Get 200 -> here the cache is up-to-date + + listCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + items, err := svc.List(listCtx, cloudscalesdk.WithTagFilter(tags)) if err != nil { return nil, "", fmt.Errorf("listing %ss: %w", resourceName, err) } diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 1f6b058..95adaab 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -18,7 +18,10 @@ package controller import ( "context" + "errors" "fmt" + "net/http" + "time" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -32,6 +35,7 @@ import ( "sigs.k8s.io/cluster-api/util/predicates" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -45,9 +49,11 @@ import ( // CloudscaleClusterReconciler reconciles a CloudscaleCluster object type CloudscaleClusterReconciler struct { client.Client - Scheme *runtime.Scheme - recorder events.EventRecorder - WatchFilter string + Scheme *runtime.Scheme + recorder events.EventRecorder + WatchFilter string + Transport *http.Transport + MaxConcurrentReconciles int } // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudscaleclusters,verbs=get;list;watch;create;update;patch;delete @@ -60,6 +66,9 @@ type CloudscaleClusterReconciler struct { // Reconcile handles CloudscaleCluster reconciliation. func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + logger := logf.FromContext(ctx) cloudscaleCluster := &infrastructurev1beta2.CloudscaleCluster{} @@ -99,7 +108,7 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token) + cloudscaleClient := cloudscale.NewClient(token, r.Transport) clusterScope, err := scope.NewClusterScope(scope.ClusterScopeParams{ Client: r.Client, @@ -113,7 +122,11 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re } defer func() { - if err := clusterScope.Close(ctx); err != nil && reterr == nil { + // Use a separate context for the status patch so it succeeds even + // when the reconcile context has timed out. + patchCtx, patchCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer patchCancel() + if err := clusterScope.Close(patchCtx); err != nil && reterr == nil { reterr = err } }() @@ -135,11 +148,15 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust // update ready conditions upon returning from this function based on updated clusterScope. 
defer r.setReadyCondition(clusterScope) - if err := r.reconcileNetwork(ctx, clusterScope); err != nil { + result, err := r.reconcileNetwork(ctx, clusterScope) + if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling network: %w", err) } + if !result.IsZero() { + return result, nil + } - result, err := r.reconcileLoadBalancer(ctx, clusterScope) + result, err = r.reconcileLoadBalancer(ctx, clusterScope) if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer: %w", err) } @@ -147,9 +164,13 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust return result, nil } - if err := r.reconcileFloatingIP(ctx, clusterScope); err != nil { + result, err = r.reconcileFloatingIP(ctx, clusterScope) + if err != nil { return ctrl.Result{}, fmt.Errorf("reconciling floating IP: %w", err) } + if !result.IsZero() { + return result, nil + } // Mark infrastructure as provisioned when all resources exist if clusterScope.CloudscaleCluster.Status.Initialization == nil { @@ -182,10 +203,16 @@ func (r *CloudscaleClusterReconciler) reconcileDelete(ctx context.Context, clust } if err := r.deleteServerGroups(ctx, clusterScope); err != nil { + if errors.Is(err, errServerGroupNotEmpty) { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } return ctrl.Result{}, fmt.Errorf("deleting server groups: %w", err) } if err := r.deleteNetwork(ctx, clusterScope); err != nil { + if errors.Is(err, errNetworkNotReady) { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } return ctrl.Result{}, fmt.Errorf("deleting network: %w", err) } @@ -281,6 +308,7 @@ func (r *CloudscaleClusterReconciler) SetupWithManager(ctx context.Context, mgr return ctrl.NewControllerManagedBy(mgr). For(&infrastructurev1beta2.CloudscaleCluster{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.MaxConcurrentReconciles}). WithEventFilter(predicates.ResourceNotPaused(r.Scheme, logger)). WithEventFilter(predicates.ResourceHasFilterLabel(r.Scheme, logger, r.WatchFilter)). WithEventFilter(predicates.ResourceIsNotExternallyManaged(r.Scheme, logger)). diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index a92cb85..5cd5885 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -19,12 +19,14 @@ package controller import ( "context" "fmt" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" @@ -32,13 +34,15 @@ import ( "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +const createFloatingIPTimeoutRequeueAfter = 5 * time.Second + // reconcileFloatingIP ensures the floating IP exists and is assigned to the correct target. // When no floating IP is configured, this sets the condition to true and returns. 
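+// A non-zero ctrl.Result asks the caller to requeue without treating the
+// situation as an error, e.g. when a Create call hit its write timeout.
+// Caller-side sketch (illustrative; it mirrors reconcileNormal in this patch):
+//
+//	result, err := r.reconcileFloatingIP(ctx, clusterScope)
+//	if err != nil {
+//		return ctrl.Result{}, fmt.Errorf("reconciling floating IP: %w", err)
+//	}
+//	if !result.IsZero() {
+//		return result, nil
+//	}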
-func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (reterr error) { +func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { fipSpec := clusterScope.CloudscaleCluster.Spec.FloatingIP if fipSpec == nil { r.setCondition(clusterScope, infrastructurev1beta2.FloatingIPReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.FloatingIPDisabledReason, "") - return nil + return ctrl.Result{}, nil } defer func() { @@ -51,7 +55,8 @@ func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, c // Pre-existing floating IP: just look it up and use its address if fipSpec.Address != "" { - return r.reconcilePreExistingFloatingIP(ctx, clusterScope, fipSpec.Address) + err := r.reconcilePreExistingFloatingIP(ctx, clusterScope, fipSpec.Address) + return ctrl.Result{}, err } // Managed floating IP: create if needed, then assign @@ -59,7 +64,9 @@ func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, c } func (r *CloudscaleClusterReconciler) reconcilePreExistingFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, ip string) error { - fip, err := clusterScope.CloudscaleClient.FloatingIPs.Get(ctx, ip) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + fip, err := clusterScope.CloudscaleClient.FloatingIPs.Get(getCtx, ip) if err != nil { return fmt.Errorf("getting pre-existing floating IP %s: %w", ip, err) } @@ -74,7 +81,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingFloatingIP(ctx context return r.ensureFloatingIPAssignment(ctx, clusterScope, fip) } -func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) error { +func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope, fipSpec *infrastructurev1beta2.FloatingIPSpec) (ctrl.Result, error) { tags := clusterOwnershipTags(clusterScope.CloudscaleCluster) clusterScope.Info("reconcile managed floating IP") @@ -88,17 +95,17 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con tags, ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.FloatingIP = id if id != "" { // Existing floating IP: ensure it's assigned to the right target and set endpoint if err := r.ensureFloatingIPAssignment(ctx, clusterScope, fip); err != nil { - return err + return ctrl.Result{}, err } r.setControlPlaneEndpointFromFIP(clusterScope, fip) - return nil + return ctrl.Result{}, nil } // Create new floating IP @@ -132,9 +139,15 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con } clusterScope.Info("Creating floating IP", "ipVersion", ipVersion, "target", target) - fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(ctx, req) + createCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + fip, err = clusterScope.CloudscaleClient.FloatingIPs.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating floating IP: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Floating IP creation timed out, waiting before retry", "requeueAfter", createFloatingIPTimeoutRequeueAfter) + return ctrl.Result{RequeueAfter: createFloatingIPTimeoutRequeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating floating 
IP: %w", err) } ip := fip.IP() @@ -144,7 +157,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedFloatingIP(ctx context.Con "Created floating IP %s", ip) r.setControlPlaneEndpointFromFIP(clusterScope, fip) - return nil + return ctrl.Result{}, nil } type floatingIPTarget struct { @@ -215,7 +228,9 @@ func (r *CloudscaleClusterReconciler) ensureFloatingIPAssignment(ctx context.Con if needsUpdate { floatingIP := clusterScope.CloudscaleCluster.Status.FloatingIP clusterScope.Info("Reassigning floating IP", "ip", floatingIP, "target", target) - if err := clusterScope.CloudscaleClient.FloatingIPs.Update(ctx, floatingIP, updateReq); err != nil { + updateCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + if err := clusterScope.CloudscaleClient.FloatingIPs.Update(updateCtx, floatingIP, updateReq); err != nil { if cloudscale.IsFloatingIPNoPublicInterface(err) { return fmt.Errorf("floating IP cannot be assigned to control plane server: server must have a public interface when the load balancer is disabled; add {type: public} to the control-plane machine template interfaces") } @@ -273,7 +288,9 @@ func (r *CloudscaleClusterReconciler) deleteFloatingIP(ctx context.Context, clus } clusterScope.Info("Deleting floating IP", "id", floatingIP) - if err := clusterScope.CloudscaleClient.FloatingIPs.Delete(ctx, floatingIP); err != nil { + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancel() + if err := clusterScope.CloudscaleClient.FloatingIPs.Delete(deleteCtx, floatingIP); err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting floating IP: %w", err) } diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index a9426e1..f07007d 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -19,6 +19,8 @@ package controller import ( "context" "fmt" + "net/url" + "os" "testing" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" @@ -32,13 +34,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) // --- Test helpers --- -func newFIPTestClusterScope(fipService cs.FloatingIPService) *scope.ClusterScope { +func newFIPTestClusterScope(fipService cloudscale.FloatingIPService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -61,7 +63,7 @@ func newFIPTestClusterScope(fipService cs.FloatingIPService) *scope.ClusterScope }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ FloatingIPs: fipService, }, } @@ -83,7 +85,7 @@ func TestReconcileFloatingIP_Disabled(t *testing.T) { // No FloatingIP spec = disabled r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.FloatingIPReadyCondition) @@ -110,7 +112,7 @@ func TestReconcileFloatingIP_PreExisting(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), 
clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.FloatingIP).To(Equal("1.2.3.4")) @@ -136,7 +138,7 @@ func TestReconcileFloatingIP_ErrorSetsConditionFalse(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.FloatingIPReadyCondition) @@ -160,7 +162,7 @@ func TestReconcilePreExistingFloatingIP_RefetchesAndKeepsAssignmentWhenCached(t }, nil }, updateFn: func(ctx context.Context, id string, req *cloudscalesdk.FloatingIPUpdateRequest) error { - t.Fatal("Update must not fire when FIP is already assigned to the LB") + g.Fail("Update must not fire when FIP is already assigned to the LB") return nil }, } @@ -249,7 +251,7 @@ func TestReconcilePreExistingFloatingIP_RegionMismatchErrors(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("lpg")) @@ -300,7 +302,7 @@ func TestReconcilePreExistingFloatingIP_NoPublicInterfaceSetsConditionAndEvent(t } r := newFIPTestReconciler(cpMachine) - err := r.reconcileFloatingIP(context.Background(), clusterScope) + _, err := r.reconcileFloatingIP(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("public interface")) g.Expect(err.Error()).To(ContainSubstring("control-plane machine template")) @@ -423,7 +425,7 @@ func TestReconcileManagedFloatingIP_ExistingFIPEnsuresAssignment(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).To(Equal("10.0.0.1")) @@ -452,7 +454,7 @@ func TestReconcileManagedFloatingIP_CreatesIPv4(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq).ToNot(BeNil()) @@ -488,7 +490,7 @@ func TestReconcileManagedFloatingIP_CreatesIPv6(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.IPVersion).To(Equal(6)) @@ -512,7 +514,7 @@ func TestReconcileManagedFloatingIP_CreateError(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("creating floating IP")) @@ -539,7 +541,7 @@ func TestReconcileManagedFloatingIP_AssignsToLBTarget(t *testing.T) { r := newFIPTestReconciler() - err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + _, err := r.reconcileManagedFloatingIP(context.Background(), 
clusterScope, fipSpec) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq).ToNot(BeNil()) @@ -643,7 +645,7 @@ func TestEnsureFloatingIPAssignment_LBAlreadyCorrect(t *testing.T) { fipService := &mockFloatingIPService{ updateFn: func(ctx context.Context, id string, req *cloudscalesdk.FloatingIPUpdateRequest) error { - t.Fatal("Update should not be called when assignment is correct") + g.Fail("Update should not be called when assignment is correct") return nil }, } @@ -819,7 +821,7 @@ func TestDeleteFloatingIP_PreExistingSkipsDeletionAndLeavesConditionUntouched(t fipService := &mockFloatingIPService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called for pre-existing floating IPs") + g.Fail("Delete should not be called for pre-existing floating IPs") return nil }, } @@ -871,7 +873,7 @@ func TestDeleteFloatingIP_NoStatusSkipsDeletion(t *testing.T) { fipService := &mockFloatingIPService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when status is empty") + g.Fail("Delete should not be called when status is empty") return nil }, } @@ -932,6 +934,34 @@ func TestDeleteFloatingIP_DeleteError(t *testing.T) { g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.FloatingIPErrorReason)) } +// --- Timeout handling tests for Create() calls --- + +func TestReconcileManagedFloatingIP_CreateTimeoutRequeues(t *testing.T) { + g := NewWithT(t) + + fipService := &mockFloatingIPService{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.FloatingIP, error) { + return nil, nil + }, + createFn: func(ctx context.Context, req *cloudscalesdk.FloatingIPCreateRequest) (*cloudscalesdk.FloatingIP, error) { + // Simulate timeout via context deadline exceeded wrapped in url.Error + return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/floating_ips", Err: os.ErrDeadlineExceeded} + }, + } + + clusterScope := newFIPTestClusterScope(fipService) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = "lb-uuid" + fipSpec := &infrastructurev1beta2.FloatingIPSpec{} + + r := newFIPTestReconciler() + + result, err := r.reconcileManagedFloatingIP(context.Background(), clusterScope, fipSpec) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(createFloatingIPTimeoutRequeueAfter), + "Should requeue after createFloatingIPTimeoutRequeueAfter on timeout error") +} + // --- Mock FloatingIPService --- type mockFloatingIPService struct { diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index e6666bd..e9e87c7 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -38,7 +38,10 @@ import ( const ( // LoadBalancerRunningStatus indicates the load balancer is ready. - LoadBalancerRunningStatus = "running" + LoadBalancerRunningStatus = "running" + LoadBalancerChangingStatus = "changing" + LoadBalancerDegradedStatus = "degraded" + LoadBalancerErrorStatus = "error" ) // reconcileLoadBalancer ensures the load balancer, pool, and listener exist for the control plane. 
@@ -53,20 +56,23 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
 	}
 
 	var lbPending bool
+	lbPendingMsg := "Load balancer is not yet running"
 	defer func() {
 		if reterr != nil {
 			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerErrorReason, reterr.Error())
 		} else if lbPending {
-			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerNotReadyReason, "Load balancer is not yet running")
+			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.LoadBalancerNotReadyReason, lbPendingMsg)
 		} else {
 			r.setCondition(clusterScope, infrastructurev1beta2.LoadBalancerReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.LoadBalancerRunningReason, "")
 		}
 	}()
 
 	// 1. Reconcile the load balancer itself
-	if err := r.reconcileLB(ctx, clusterScope); err != nil {
+	if result, err := r.reconcileLB(ctx, clusterScope); err != nil {
 		return ctrl.Result{}, fmt.Errorf("reconciling load balancer: %w", err)
+	} else if !result.IsZero() {
+		return result, nil
 	}
 
 	// Wait for LB to be running before creating pool/listener
@@ -76,34 +82,72 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
 	}
 
 	// Check if LB is running
-	lb, err := clusterScope.CloudscaleClient.LoadBalancers.Get(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerID)
+	getCtx, getCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout)
+	defer getCancel()
+	lb, err := clusterScope.CloudscaleClient.LoadBalancers.Get(getCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerID)
 	if err != nil {
 		return ctrl.Result{}, fmt.Errorf("getting load balancer status: %w", err)
 	}
-	if lb.Status != LoadBalancerRunningStatus {
-		clusterScope.Info("Waiting for load balancer to be running", "status", lb.Status)
+
+	switch lb.Status {
+	case LoadBalancerRunningStatus:
+		// proceed normally
+	case LoadBalancerChangingStatus:
+		clusterScope.Info("Waiting for load balancer to finish changing", "status", lb.Status)
 		lbPending = true
-		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
+		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
+	default:
+		// degraded, error, stopped, unknown: proceed with member reconciliation,
+		// because removing stale members is the only path back to "running"
+		clusterScope.Info("Load balancer is not running, proceeding with member reconciliation", "status", lb.Status)
+		lbPending = true
+		lbPendingMsg = fmt.Sprintf("Load balancer is in status %s", lb.Status)
+		if poolID := clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID; poolID != "" {
+			listCtx, listCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout)
+			members, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(listCtx, poolID)
+			listCancel()
+			if err != nil {
+				clusterScope.Info("Failed to list pool members for diagnostics", "err", err.Error())
+			} else {
+				var down []string
+				for _, m := range members {
+					if m.MonitorStatus == "down" {
+						down = append(down, fmt.Sprintf("%s@%s:%d", m.Name, m.Address, m.ProtocolPort))
+					}
+				}
+				if len(down) > 0 {
+					lbPendingMsg = fmt.Sprintf("%s; pool members failing health check: %v", lbPendingMsg, down)
+				}
+			}
+		}
 	}
 
 	// 2. 
Reconcile the pool - if err := r.reconcileLBPool(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBPool(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer pool: %w", err) + } else if !result.IsZero() { + return result, nil } // 3. Reconcile the listener - if err := r.reconcileLBListener(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBListener(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer listener: %w", err) + } else if !result.IsZero() { + return result, nil } // 4. Reconcile the health monitor - if err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err) + } else if !result.IsZero() { + return result, nil } // 5. Reconcile the members - if err := r.reconcileLBMembers(ctx, clusterScope); err != nil { + if result, err := r.reconcileLBMembers(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer members: %w", err) + } else if !result.IsZero() { + return result, nil } // 6. Set the control plane endpoint from the VIP @@ -126,7 +170,7 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, } // reconcileLB ensures the load balancer exists. -func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerID, "load balancer", @@ -135,11 +179,11 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerID = id if id != "" { - return nil + return ctrl.Result{}, nil } // Create new load balancer @@ -161,7 +205,7 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc if lbSpec.Network != "" { subnetID, err := lbPrivateNetworkSubnetID(clusterScope) if err != nil { - return err + return ctrl.Result{}, err } req.VIPAddresses = &[]cloudscalesdk.VIPAddressRequest{ {Subnet: subnetID}, @@ -170,20 +214,27 @@ func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterSc } clusterScope.Info("Creating load balancer", "zone", zone, "flavor", lbSpec.Flavor) - lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + lb, err := clusterScope.CloudscaleClient.LoadBalancers.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating load balancer: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerID = lb.UUID clusterScope.Info("Created load balancer", "loadBalancerID", lb.UUID) r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "LoadBalancerCreated", "CreateLoadBalancer", "Created load 
balancer %s in zone %s", lb.UUID, zone) - return nil + return ctrl.Result{}, nil } // reconcileLBPool ensures the load balancer pool exists. -func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "load balancer pool", @@ -192,11 +243,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = id if id != "" { - return nil + return ctrl.Result{}, nil } // Create new pool @@ -213,20 +264,27 @@ func (r *CloudscaleClusterReconciler) reconcileLBPool(ctx context.Context, clust } clusterScope.Info("Creating load balancer pool", "algorithm", algorithm) - pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + pool, err := clusterScope.CloudscaleClient.LoadBalancerPools.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating load balancer pool: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer pool creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer pool: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = pool.UUID clusterScope.Info("Created load balancer pool", "poolID", pool.UUID) r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "PoolCreated", "CreateLoadBalancerPool", "Created load balancer pool %s", pool.UUID) - return nil + return ctrl.Result{}, nil } // reconcileLBListener ensures the load balancer listener exists. 
-func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID, "load balancer listener", @@ -235,11 +293,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = id if id != "" { - return nil + return ctrl.Result{}, nil } apiServerPort := int(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.APIServerPort) @@ -254,9 +312,16 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c } clusterScope.Info("Creating load balancer listener", "port", apiServerPort) - listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + listener, err := clusterScope.CloudscaleClient.LoadBalancerListeners.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating load balancer listener: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer listener creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer listener: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = listener.UUID @@ -264,12 +329,12 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "ListenerCreated", "CreateLoadBalancerListener", "Created load balancer listener %s on port %d", listener.UUID, apiServerPort) - return nil + return ctrl.Result{}, nil } // reconcileLBHealthMonitor ensures the load balancer health monitor exists. // The health monitor performs TCP health checks on the API server port. 
-func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID, "load balancer health monitor", @@ -278,11 +343,11 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte clusterOwnershipTags(clusterScope.CloudscaleCluster), ) if err != nil { - return err + return ctrl.Result{}, err } clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = id if id != "" { - return nil + return ctrl.Result{}, nil } healthMonitorSpec := clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.HealthMonitor @@ -300,9 +365,16 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte } clusterScope.Info("Creating load balancer health monitor", "type", "tcp", "pool", clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, "spec", healthMonitorSpec) - monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(ctx, req) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + monitor, err := clusterScope.CloudscaleClient.LoadBalancerHealthMonitors.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating load balancer health monitor: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer health monitor creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer health monitor: %w", err) } clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = monitor.UUID @@ -310,20 +382,22 @@ func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Conte r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "HealthMonitorCreated", "CreateLoadBalancerHealthMonitor", "Created load balancer health monitor %s", monitor.UUID) - return nil + return ctrl.Result{}, nil } -func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) error { +func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { // Fetch current members from the load balancer - currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) + listCtx, listCancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer listCancel() + currentMembers, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.List(listCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID) if err != nil { - return fmt.Errorf("failed to get current load balancer members: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get current load balancer members: %w", err) } // Fetch control plane machines as desired members desiredMembers, err := r.getDesiredLoadBalancerMembers(ctx, clusterScope) if err != nil { - return fmt.Errorf("failed to get desired load balancer members: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get desired load balancer members: %w", err) } clusterScope.V(2).Info("reconcileLBMembers", "currentMembers", currentMembers, 
"desiredMembers", desiredMembers) @@ -343,12 +417,14 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl for _, desired := range desiredMembers { current, exists := currentByName[desired.Name] if !exists { - if err := r.createLoadBalancerMember(ctx, clusterScope, desired); err != nil { - return fmt.Errorf("failed to create load balancer member: %w", err) + if result, err := r.createLoadBalancerMember(ctx, clusterScope, desired); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to create load balancer member: %w", err) + } else if !result.IsZero() { + return result, nil } } else if current.Address != desired.Address { if err := r.updateLoadBalancerMember(ctx, clusterScope, current.UUID, desired.Address); err != nil { - return fmt.Errorf("failed to update load balancer member: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to update load balancer member: %w", err) } } } @@ -357,12 +433,12 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl for _, member := range currentMembers { if _, exists := desiredByName[member.Name]; !exists { if err := r.deleteLoadBalancerMember(ctx, clusterScope, member); err != nil { - return fmt.Errorf("failed to delete load balancer member: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to delete load balancer member: %w", err) } } } - return nil + return ctrl.Result{}, nil } func (r *CloudscaleClusterReconciler) getDesiredLoadBalancerMembers(ctx context.Context, clusterScope *scope.ClusterScope) ([]cloudscalesdk.LoadBalancerPoolMemberRequest, error) { @@ -470,21 +546,30 @@ func lbPrivateNetworkSubnetID(clusterScope *scope.ClusterScope) (string, error) return ns.SubnetID, nil } -func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMemberRequest) error { +func (r *CloudscaleClusterReconciler) createLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMemberRequest) (ctrl.Result, error) { clusterScope.V(2).Info("Creating load balancer member", "member", member) - cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) + createCtx, createCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer createCancel() + cm, err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Create(createCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, &member) if err != nil { - return fmt.Errorf("creating load balancer member: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + clusterScope.Info("Load balancer member creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating load balancer member: %w", err) } clusterScope.V(2).Info("Created load balancer member", "member", member) clusterScope.CloudscaleCluster.Status.LoadBalancerMemberIDs = append(clusterScope.CloudscaleCluster.Status.LoadBalancerMemberIDs, cm.UUID) - return nil + return ctrl.Result{}, nil } func (r *CloudscaleClusterReconciler) updateLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, memberUUID, newAddress string) error { clusterScope.V(2).Info("Updating load balancer member address", "memberUUID", memberUUID, "newAddress", newAddress) - err := 
clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Update(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, memberUUID, &cloudscalesdk.LoadBalancerPoolMemberRequest{ + updateCtx, updateCancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer updateCancel() + err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Update(updateCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, memberUUID, &cloudscalesdk.LoadBalancerPoolMemberRequest{ Address: newAddress, }) if err != nil { @@ -496,7 +581,9 @@ func (r *CloudscaleClusterReconciler) updateLoadBalancerMember(ctx context.Conte func (r *CloudscaleClusterReconciler) deleteLoadBalancerMember(ctx context.Context, clusterScope *scope.ClusterScope, member cloudscalesdk.LoadBalancerPoolMember) error { clusterScope.V(2).Info("Deleting load balancer member", "member", member) - err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Delete(ctx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, member.UUID) + deleteCtx, deleteCancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer deleteCancel() + err := clusterScope.CloudscaleClient.LoadBalancerPoolMembers.Delete(deleteCtx, clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID, member.UUID) if err != nil { return fmt.Errorf("deleting load balancer member: %w", err) } @@ -526,7 +613,9 @@ func (r *CloudscaleClusterReconciler) deleteLoadBalancer(ctx context.Context, cl lbID := clusterScope.CloudscaleCluster.Status.LoadBalancerID if lbID != "" { clusterScope.Info("Deleting load balancer", "id", lbID) - if err := clusterScope.CloudscaleClient.LoadBalancers.Delete(ctx, lbID); err != nil { + deleteCtx, deleteCancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer deleteCancel() + if err := clusterScope.CloudscaleClient.LoadBalancers.Delete(deleteCtx, lbID); err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting load balancer: %w", err) } diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index fa34162..6656068 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -19,6 +19,7 @@ package controller import ( "context" "testing" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" @@ -27,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/events" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -92,6 +94,10 @@ func newTestClusterScopeWithLB(opts lbTestScopeOptions) *scope.ClusterScope { opts.healthMonitorDownThreshold = 3 } + // Default nil services to "exists" mocks so orchestrator tests + // only override the service they care about. + defaultLBServices(&opts) + cloudscaleClient := &cloudscale.Client{ LoadBalancers: opts.loadBalancerService, LoadBalancerPools: opts.poolService, @@ -142,6 +148,71 @@ func newTestClusterScopeWithLB(opts lbTestScopeOptions) *scope.ClusterScope { } } +// defaultLBServices fills in nil cloudscale service mocks with "exists" stubs +// so that orchestrator tests only need to override the one behaviour they care about. 
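+//
+// e.g. a test that only cares about pool behaviour can pass just poolService
+// and rely on the defaults for everything else (myPoolMock is a hypothetical
+// override):
+//
+//	scope := newTestClusterScopeWithLB(lbTestScopeOptions{
+//		poolService: myPoolMock,
+//		lbEnabled:   true,
+//	})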
+func defaultLBServices(opts *lbTestScopeOptions) { + if opts.loadBalancerService == nil { + opts.loadBalancerService = existingLBMock(LoadBalancerRunningStatus) + } + if opts.poolService == nil { + opts.poolService = existingPoolMock() + } + if opts.listenerService == nil { + opts.listenerService = existingListenerMock() + } + if opts.healthMonitorService == nil { + opts.healthMonitorService = existingHealthMonitorMock() + } + if opts.poolMemberService == nil { + opts.poolMemberService = emptyMembersMock() + } +} + +// existingLBMock returns a mock that reports the LB already exists with the given status. +func existingLBMock(status string) *mockLoadBalancerService { + return &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{UUID: id, Status: status}, nil + }, + } +} + +// existingPoolMock returns a mock that reports the pool already exists. +func existingPoolMock() *mockLoadBalancerPoolService { + return &mockLoadBalancerPoolService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { + return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil + }, + } +} + +// existingListenerMock returns a mock that reports the listener already exists. +func existingListenerMock() *mockLoadBalancerListenerService { + return &mockLoadBalancerListenerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { + return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil + }, + } +} + +// existingHealthMonitorMock returns a mock that reports the health monitor already exists. +func existingHealthMonitorMock() *mockLoadBalancerHealthMonitorService { + return &mockLoadBalancerHealthMonitorService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { + return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil + }, + } +} + +// emptyMembersMock returns a mock that reports no pool members. 
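+// Tests that need pre-existing members (e.g. the stale-member and down-member cases) override poolMemberService with their own listFn instead.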
+func emptyMembersMock() *mockLoadBalancerPoolMemberService { + return &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return nil, nil + }, + } +} + // ============================================================================ // Tests for reconcileLB // ============================================================================ @@ -157,7 +228,7 @@ func TestReconcileLB_CreatesLoadBalancer(t *testing.T) { return &cloudscalesdk.LoadBalancer{ UUID: "lb-uuid-123", Name: req.Name, - Status: "creating", + Status: LoadBalancerChangingStatus, }, nil }, } @@ -169,7 +240,7 @@ func TestReconcileLB_CreatesLoadBalancer(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("lb-uuid-123")) @@ -183,10 +254,10 @@ func TestReconcileLB_SkipsIfAlreadyExists(t *testing.T) { lbService := &mockLoadBalancerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{UUID: id, Status: "running"}, nil + return &cloudscalesdk.LoadBalancer{UUID: id, Status: LoadBalancerRunningStatus}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB already exists") + g.Fail("Create should not be called when LB already exists") return nil, nil }, } @@ -199,97 +270,12 @@ func TestReconcileLB_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("existing-lb-uuid")) } -func TestReconcileLB_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - lbService := &mockLoadBalancerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return []cloudscalesdk.LoadBalancer{ - {UUID: "found-lb-uuid", Name: "test-cluster-cp-lb"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLB(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("found-lb-uuid")) -} - -func TestReconcileLB_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - lbService := &mockLoadBalancerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return []cloudscalesdk.LoadBalancer{ - {UUID: "lb-uuid-1", Name: "test-cluster-cp-lb"}, - {UUID: "lb-uuid-2", Name: "test-cluster-cp-lb"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLB(context.Background(), clusterScope) 
- - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancers matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(BeEmpty()) -} - -func TestReconcileLB_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdLB bool - - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancer, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - createdLB = true - return &cloudscalesdk.LoadBalancer{UUID: "new-lb-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerID = "deleted-lb-uuid" - - r := newTestReconciler() - - err := r.reconcileLB(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdLB).To(BeTrue(), "Should create a new LB when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerID).To(Equal("new-lb-uuid")) -} - func TestReconcileLB_UsesCustomFlavor(t *testing.T) { g := NewWithT(t) @@ -309,7 +295,7 @@ func TestReconcileLB_UsesCustomFlavor(t *testing.T) { }) r := newTestReconciler() - err := r.reconcileLB(context.Background(), clusterScope) + _, err := r.reconcileLB(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.Flavor).To(Equal("lb-flex-2")) @@ -343,7 +329,7 @@ func TestReconcileLBPool_CreatesPool(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("pool-uuid-123")) @@ -361,7 +347,7 @@ func TestReconcileLBPool_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - t.Fatal("Create should not be called when pool already exists") + g.Fail("Create should not be called when pool already exists") return nil, nil }, } @@ -374,98 +360,12 @@ func TestReconcileLBPool_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("existing-pool-uuid")) } -func TestReconcileLBPool_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - poolService := &mockLoadBalancerPoolService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return []cloudscalesdk.LoadBalancerPool{ - {UUID: "found-pool-uuid", Name: "test-cluster-cp-pool"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - t.Fatal("Create should not be called when pool is found by tag") - return nil, nil - }, - } - - clusterScope := 
newTestClusterScopeWithLB(lbTestScopeOptions{ - poolService: poolService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("found-pool-uuid")) -} - -func TestReconcileLBPool_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - poolService := &mockLoadBalancerPoolService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return []cloudscalesdk.LoadBalancerPool{ - {UUID: "pool-uuid-1", Name: "test-cluster-cp-pool"}, - {UUID: "pool-uuid-2", Name: "test-cluster-cp-pool"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - poolService: poolService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer pools matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(BeEmpty()) -} - -func TestReconcileLBPool_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdPool bool - - poolService := &mockLoadBalancerPoolService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPool, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerPoolRequest) (*cloudscalesdk.LoadBalancerPool, error) { - createdPool = true - return &cloudscalesdk.LoadBalancerPool{UUID: "new-pool-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - poolService: poolService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = "deleted-pool-uuid" - - r := newTestReconciler() - - err := r.reconcileLBPool(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdPool).To(BeTrue(), "Should create a new pool when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID).To(Equal("new-pool-uuid")) -} - func TestReconcileLBPool_UsesCustomAlgorithm(t *testing.T) { g := NewWithT(t) @@ -487,7 +387,7 @@ func TestReconcileLBPool_UsesCustomAlgorithm(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBPool(context.Background(), clusterScope) + _, err := r.reconcileLBPool(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.Algorithm).To(Equal("least_connections")) @@ -520,7 +420,7 @@ func TestReconcileLBListener_CreatesListener(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("listener-uuid-123")) @@ -538,7 +438,7 @@ func TestReconcileLBListener_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) 
(*cloudscalesdk.LoadBalancerListener, error) { - t.Fatal("Create should not be called when listener already exists") + g.Fail("Create should not be called when listener already exists") return nil, nil }, } @@ -551,98 +451,12 @@ func TestReconcileLBListener_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("existing-listener-uuid")) } -func TestReconcileLBListener_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - listenerService := &mockLoadBalancerListenerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return []cloudscalesdk.LoadBalancerListener{ - {UUID: "found-listener-uuid", Name: "test-cluster-cp-listener"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - t.Fatal("Create should not be called when listener is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("found-listener-uuid")) -} - -func TestReconcileLBListener_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - listenerService := &mockLoadBalancerListenerService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return []cloudscalesdk.LoadBalancerListener{ - {UUID: "listener-uuid-1", Name: "test-cluster-cp-listener"}, - {UUID: "listener-uuid-2", Name: "test-cluster-cp-listener"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer listeners matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(BeEmpty()) -} - -func TestReconcileLBListener_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdListener bool - - listenerService := &mockLoadBalancerListenerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerListener, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerListenerRequest) (*cloudscalesdk.LoadBalancerListener, error) { - createdListener = true - return &cloudscalesdk.LoadBalancerListener{UUID: "new-listener-uuid", Name: req.Name}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - listenerService: listenerService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = 
"deleted-listener-uuid" - - r := newTestReconciler() - - err := r.reconcileLBListener(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdListener).To(BeTrue(), "Should create a new listener when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID).To(Equal("new-listener-uuid")) -} - func TestReconcileLBListener_UsesCustomPort(t *testing.T) { g := NewWithT(t) @@ -664,7 +478,7 @@ func TestReconcileLBListener_UsesCustomPort(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBListener(context.Background(), clusterScope) + _, err := r.reconcileLBListener(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.ProtocolPort).To(Equal(8443)) @@ -696,7 +510,7 @@ func TestReconcileLBHealthMonitor_CreatesMonitor(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("hm-uuid-123")) @@ -716,7 +530,7 @@ func TestReconcileLBHealthMonitor_SkipsIfAlreadyExists(t *testing.T) { return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - t.Fatal("Create should not be called when health monitor already exists") + g.Fail("Create should not be called when health monitor already exists") return nil, nil }, } @@ -729,98 +543,12 @@ func TestReconcileLBHealthMonitor_SkipsIfAlreadyExists(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("existing-hm-uuid")) } -func TestReconcileLBHealthMonitor_FindsExistingByTag(t *testing.T) { - g := NewWithT(t) - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return []cloudscalesdk.LoadBalancerHealthMonitor{ - {UUID: "found-hm-uuid"}, - }, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - t.Fatal("Create should not be called when health monitor is found by tag") - return nil, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - healthMonitorService: healthMonitorService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("found-hm-uuid")) -} - -func TestReconcileLBHealthMonitor_ErrorsOnMultipleByTag(t *testing.T) { - g := NewWithT(t) - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return []cloudscalesdk.LoadBalancerHealthMonitor{ - {UUID: "hm-uuid-1"}, - {UUID: "hm-uuid-2"}, - }, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - 
healthMonitorService: healthMonitorService, - lbEnabled: true, - }) - r := newTestReconciler() - - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("found 2 load balancer health monitors matching tag filter")) - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(BeEmpty()) -} - -func TestReconcileLBHealthMonitor_RecreatesIfDeletedExternally(t *testing.T) { - g := NewWithT(t) - - var createdHM bool - - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} - }, - listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { - return nil, nil - }, - createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - createdHM = true - return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: "new-hm-uuid"}, nil - }, - } - - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - healthMonitorService: healthMonitorService, - lbEnabled: true, - }) - clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = "deleted-hm-uuid" - - r := newTestReconciler() - - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) - - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(createdHM).To(BeTrue(), "Should create a new health monitor when old one was deleted") - g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("new-hm-uuid")) -} - func TestReconcileLBHealthMonitor_UsesCustomThresholds(t *testing.T) { g := NewWithT(t) @@ -845,7 +573,7 @@ func TestReconcileLBHealthMonitor_UsesCustomThresholds(t *testing.T) { r := newTestReconciler() - err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) + _, err := r.reconcileLBHealthMonitor(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.DelayS).To(Equal(10)) @@ -863,11 +591,11 @@ func TestReconcileLoadBalancer_SkipsWhenDisabled(t *testing.T) { lbService := &mockLoadBalancerService{ createFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerRequest) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Create should not be called when LB is disabled") + g.Fail("Create should not be called when LB is disabled") return nil, nil }, getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - t.Fatal("Get should not be called when LB is disabled") + g.Fail("Get should not be called when LB is disabled") return nil, nil }, } @@ -885,20 +613,11 @@ func TestReconcileLoadBalancer_SkipsWhenDisabled(t *testing.T) { g.Expect(result.RequeueAfter).To(BeZero()) } -func TestReconcileLoadBalancer_WaitsForLBRunning(t *testing.T) { +func TestReconcileLoadBalancer_ChangingStatusBlocksAndRequeues(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "creating", // Not running yet - }, nil - }, - } - clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, + loadBalancerService: existingLBMock(LoadBalancerChangingStatus), lbEnabled: 
true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID @@ -908,55 +627,125 @@ func TestReconcileLoadBalancer_WaitsForLBRunning(t *testing.T) { result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).ToNot(BeZero(), "Should requeue when LB is not running") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "changing status should requeue after 10s") } -func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { +func TestReconcileLoadBalancer_ErrorStatusProceedsWithMemberReconciliation(t *testing.T) { g := NewWithT(t) - lbService := &mockLoadBalancerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{ - UUID: id, - Status: "running", - VIPAddresses: []cloudscalesdk.VIPAddress{ - {Address: "203.0.113.10"}, - }, - }, nil - }, - } + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: existingLBMock(LoadBalancerErrorStatus), + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID - poolService := &mockLoadBalancerPoolService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerPool, error) { - return &cloudscalesdk.LoadBalancerPool{UUID: id}, nil - }, - } + r := newTestReconcilerWithClient(newFakeClientForLB()) - listenerService := &mockLoadBalancerListenerService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerListener, error) { - return &cloudscalesdk.LoadBalancerListener{UUID: id}, nil - }, - } + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) - healthMonitorService := &mockLoadBalancerHealthMonitorService{ - getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { - return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: id}, nil + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(BeZero(), "error status should not block member reconciliation") + + // LoadBalancerReadyCondition should be False because LB is not running + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) +} + +func TestReconcileLoadBalancer_DegradedStatusProceedsWithMemberReconciliation(t *testing.T) { + g := NewWithT(t) + + var deletedMemberUUID string + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: existingLBMock(LoadBalancerDegradedStatus), + poolMemberService: &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return []cloudscalesdk.LoadBalancerPoolMember{ + {Name: "stale-cp-0", UUID: "stale-uuid", Address: "10.0.0.10"}, + }, nil + }, + deleteFn: func(ctx context.Context, poolID, memberID string) error { + deletedMemberUUID = memberID + return nil + }, }, - } + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + 
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID - poolMemberService := &mockLoadBalancerPoolMemberService{ - listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { - return nil, nil + r := newTestReconcilerWithClient(newFakeClientForLB()) + + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(BeZero(), "degraded status should not block member reconciliation") + + // Stale member should have been removed + g.Expect(deletedMemberUUID).To(Equal("stale-uuid")) + + // LoadBalancerReadyCondition should be False because LB is degraded + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) +} + +func TestReconcileLoadBalancer_ErrorStatusMessageNamesDownMembers(t *testing.T) { + g := NewWithT(t) + + clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ + loadBalancerService: existingLBMock(LoadBalancerErrorStatus), + poolMemberService: &mockLoadBalancerPoolMemberService{ + listFn: func(ctx context.Context, poolID string, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerPoolMember, error) { + return []cloudscalesdk.LoadBalancerPoolMember{ + {Name: "cp-0", UUID: "cp-uuid", Address: "10.0.0.110", ProtocolPort: 6443, MonitorStatus: "down"}, + }, nil + }, }, - } + lbEnabled: true, + }) + clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID + clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID + + r := newTestReconcilerWithClient(newFakeClientForLB()) + + _, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + g.Expect(err).ToNot(HaveOccurred()) + + cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(infrastructurev1beta2.LoadBalancerNotReadyReason)) + g.Expect(cond.Message).To(ContainSubstring("error")) + g.Expect(cond.Message).To(ContainSubstring("cp-0@10.0.0.110:6443")) +} + +func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { + g := NewWithT(t) clusterScope := newTestClusterScopeWithLB(lbTestScopeOptions{ - loadBalancerService: lbService, - poolService: poolService, - listenerService: listenerService, - healthMonitorService: healthMonitorService, - poolMemberService: poolMemberService, - lbEnabled: true, + loadBalancerService: &mockLoadBalancerService{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { + return &cloudscalesdk.LoadBalancer{ + UUID: id, + Status: LoadBalancerRunningStatus, + VIPAddresses: []cloudscalesdk.VIPAddress{ + {Address: "203.0.113.10"}, + }, + }, nil + }, + }, + lbEnabled: true, }) clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID 
@@ -981,7 +770,7 @@ func TestDeleteLoadBalancer_SkipsWhenDisabled(t *testing.T) { lbService := &mockLoadBalancerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when LB is disabled") + g.Fail("Delete should not be called when LB is disabled") return nil }, } @@ -1158,7 +947,7 @@ func TestReconcileLBMembers_AddsMissingMember(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createdReq).ToNot(BeNil()) @@ -1195,7 +984,7 @@ func TestReconcileLBMembers_RemovesStaleMember(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(deletedMemberID).To(Equal("stale-member-uuid")) @@ -1247,7 +1036,7 @@ func TestReconcileLBMembers_UpdatesChangedAddress(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(updatedMemberID).To(Equal("member-uuid-1")) @@ -1265,15 +1054,15 @@ func TestReconcileLBMembers_NoopWhenInSync(t *testing.T) { }, nil }, createFn: func(ctx context.Context, poolID string, req *cloudscalesdk.LoadBalancerPoolMemberRequest) (*cloudscalesdk.LoadBalancerPoolMember, error) { - t.Fatal("Create should not be called when in sync") + g.Fail("Create should not be called when in sync") return nil, nil }, deleteFn: func(ctx context.Context, poolID, memberID string) error { - t.Fatal("Delete should not be called when in sync") + g.Fail("Delete should not be called when in sync") return nil }, updateFn: func(ctx context.Context, poolID, memberID string, req *cloudscalesdk.LoadBalancerPoolMemberRequest) error { - t.Fatal("Update should not be called when in sync") + g.Fail("Update should not be called when in sync") return nil }, } @@ -1304,7 +1093,7 @@ func TestReconcileLBMembers_NoopWhenInSync(t *testing.T) { r := newTestReconcilerWithClient(k8sClient) - err := r.reconcileLBMembers(context.Background(), clusterScope) + _, err := r.reconcileLBMembers(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) } @@ -1440,7 +1229,7 @@ func TestCreateLoadBalancerMember_AppendsToStatus(t *testing.T) { r := newTestReconciler() - err := r.createLoadBalancerMember(context.Background(), clusterScope, cloudscalesdk.LoadBalancerPoolMemberRequest{ + _, err := r.createLoadBalancerMember(context.Background(), clusterScope, cloudscalesdk.LoadBalancerPoolMemberRequest{ Name: "cp-machine-1", Address: "10.0.0.1", }) diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 381d9de..180bc14 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -20,20 +20,25 @@ import ( "context" "errors" "fmt" + "strings" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" 
"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +const createNetworkTimeoutRequeueAfter = 5 * time.Second + // reconcileNetwork orchestrates network and subnet provisioning for all networks // defined in spec.networks. A single NetworkReadyCondition covers all networks. -func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (reterr error) { +func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { defer func() { if reterr != nil { r.setCondition(clusterScope, infrastructurev1beta2.NetworkReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.NetworkErrorReason, reterr.Error()) @@ -43,22 +48,26 @@ func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clus }() if len(clusterScope.CloudscaleCluster.Spec.Networks) == 0 { - return fmt.Errorf("no networks defined in spec") + return ctrl.Result{}, fmt.Errorf("no networks defined in spec") } for _, netSpec := range clusterScope.CloudscaleCluster.Spec.Networks { if netSpec.UUID != "" { if err := r.reconcilePreExistingNetwork(ctx, clusterScope, netSpec); err != nil { - return fmt.Errorf("reconciling pre-existing network %q: %w", netSpec.Name, err) + return ctrl.Result{}, fmt.Errorf("reconciling pre-existing network %q: %w", netSpec.Name, err) } } else { - if err := r.reconcileManagedNetwork(ctx, clusterScope, netSpec); err != nil { - return fmt.Errorf("reconciling managed network %q: %w", netSpec.Name, err) + result, err := r.reconcileManagedNetwork(ctx, clusterScope, netSpec) + if err != nil { + return ctrl.Result{}, fmt.Errorf("reconciling managed network %q: %w", netSpec.Name, err) + } + if !result.IsZero() { + return result, nil } } } - return nil + return ctrl.Result{}, nil } // reconcilePreExistingNetwork validates a pre-existing network exists and discovers its subnet. @@ -73,7 +82,9 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingNetwork(ctx context.Co return nil } - network, err := clusterScope.CloudscaleClient.Networks.Get(ctx, netSpec.UUID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + network, err := clusterScope.CloudscaleClient.Networks.Get(getCtx, netSpec.UUID) if err != nil { return fmt.Errorf("getting pre-existing network %s: %w", netSpec.UUID, err) } @@ -93,7 +104,7 @@ func (r *CloudscaleClusterReconciler) reconcilePreExistingNetwork(ctx context.Co } // reconcileManagedNetwork ensures a managed network and its subnet exist. 
-func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) error { +func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Context, clusterScope *scope.ClusterScope, netSpec infrastructurev1beta2.NetworkSpec) (ctrl.Result, error) { ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus(netSpec.Name) // Reconcile the network resource @@ -112,13 +123,15 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex tags, ) if err != nil { - return err + return ctrl.Result{}, err } if resolvedNetworkID == "" { // Create new network clusterScope.Info("Creating network", "name", netSpec.Name) - network, err := clusterScope.CloudscaleClient.Networks.Create(ctx, &cloudscalesdk.NetworkCreateRequest{ + createCtx, cancel := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancel() + network, err := clusterScope.CloudscaleClient.Networks.Create(createCtx, &cloudscalesdk.NetworkCreateRequest{ Name: netSpec.Name, AutoCreateIPV4Subnet: ptr.To(false), ZonalResourceRequest: cloudscalesdk.ZonalResourceRequest{ @@ -129,7 +142,11 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }, }) if err != nil { - return fmt.Errorf("creating network: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Network creation timed out, waiting before retry", "requeueAfter", createNetworkTimeoutRequeueAfter) + return ctrl.Result{RequeueAfter: createNetworkTimeoutRequeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating network: %w", err) } resolvedNetworkID = network.UUID clusterScope.Info("Created network", "name", netSpec.Name, "networkID", network.UUID) @@ -151,13 +168,15 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex tags, ) if err != nil { - return err + return ctrl.Result{}, err } if resolvedSubnetID == "" { // Create new subnet clusterScope.Info("Creating subnet", "name", netSpec.Name, "cidr", netSpec.CIDR, "gateway", netSpec.GatewayAddress) - subnet, err := clusterScope.CloudscaleClient.Subnets.Create(ctx, &cloudscalesdk.SubnetCreateRequest{ + createSubnetCtx, cancelSubnet := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelSubnet() + subnet, err := clusterScope.CloudscaleClient.Subnets.Create(createSubnetCtx, &cloudscalesdk.SubnetCreateRequest{ Network: resolvedNetworkID, CIDR: netSpec.CIDR, GatewayAddress: netSpec.GatewayAddress, @@ -166,7 +185,11 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex }, }) if err != nil { - return fmt.Errorf("creating subnet: %w", err) + if cloudscale.IsTimeoutError(err) { + clusterScope.Info("Subnet creation timed out, waiting before retry", "requeueAfter", createNetworkTimeoutRequeueAfter) + return ctrl.Result{RequeueAfter: createNetworkTimeoutRequeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating subnet: %w", err) } resolvedSubnetID = subnet.UUID clusterScope.Info("Created subnet", "name", netSpec.Name, "subnetID", subnet.UUID) @@ -175,7 +198,7 @@ func (r *CloudscaleClusterReconciler) reconcileManagedNetwork(ctx context.Contex } r.setNetworkStatus(clusterScope, netSpec.Name, resolvedNetworkID, resolvedSubnetID, netSpec.CIDR, true) - return nil + return ctrl.Result{}, nil } // deleteNetwork deletes all managed networks. Pre-existing networks are left untouched. 
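+// Deletion returns errNetworkNotReady while the cloudscale API still reports attached load balancer pool members, so the caller can wait out the asynchronous LB cleanup and retry.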
@@ -195,8 +218,10 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster var errs []error for _, ns := range clusterScope.CloudscaleCluster.Status.Networks { + logger := clusterScope.WithValues("name", ns.Name, "networkID", ns.NetworkID) + if !ns.Managed { - clusterScope.Info("Skipping pre-existing network deletion", "name", ns.Name, "networkID", ns.NetworkID) + logger.Info("Skipping pre-existing network deletion") remaining = append(remaining, ns) continue } @@ -205,14 +230,29 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster continue } - clusterScope.Info("Deleting network", "name", ns.Name, "networkID", ns.NetworkID) - if err := clusterScope.CloudscaleClient.Networks.Delete(ctx, ns.NetworkID); err != nil { - if !cloudscale.IsNotFound(err) { - remaining = append(remaining, ns) - errs = append(errs, fmt.Errorf("deleting network %s: %w", ns.Name, err)) - continue - } - clusterScope.Info("Network already deleted", "name", ns.Name, "networkID", ns.NetworkID) + logger.Info("Deleting network") + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + err := clusterScope.CloudscaleClient.Networks.Delete(deleteCtx, ns.NetworkID) + cancel() + + // Return the sentinel error; the caller waits a pre-defined amount of time before retrying. + if isLBPoolMembersError(err) { + logger.Info("Network still has LB pool members, waiting for async cleanup") + return errNetworkNotReady + } + + isNotFound := cloudscale.IsNotFound(err) + + // On any other error, keep the network in status and move on to the next one. + if err != nil && !isNotFound { + remaining = append(remaining, ns) + errs = append(errs, fmt.Errorf("deleting network %s: %w", ns.Name, err)) + continue + } + + // Idempotent not-found: log and fall through to the event. + if isNotFound { + logger.Info("Network already deleted") } r.recorder.Eventf(clusterScope.CloudscaleCluster, nil, corev1.EventTypeNormal, "NetworkDeleted", "DeleteNetwork", @@ -223,6 +263,19 @@ func (r *CloudscaleClusterReconciler) deleteNetwork(ctx context.Context, cluster return errors.Join(errs...) } +// errNetworkNotReady is returned when a network cannot be deleted because +// it still has pending dependencies (e.g. LB pool members). +var errNetworkNotReady = errors.New("network has pending dependencies") + +// isLBPoolMembersError reports whether err is the cloudscale API error +// for deleting a network that still has load balancer pool members in it. +func isLBPoolMembersError(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "load balancer pool members") +} + // setNetworkStatus updates or appends the network status entry for the given name. func (r *CloudscaleClusterReconciler) setNetworkStatus(clusterScope *scope.ClusterScope, name, networkID, subnetID, cidr string, managed bool) { for i, ns := range clusterScope.CloudscaleCluster.Status.Networks { diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index ce12ef9..c688c23 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -19,17 +19,18 @@ package controller import ( "context" "fmt" + "net/url" + "os" "testing" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/events" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -37,7 +38,7 @@ const netUUID = "net-uuid-123" // --- Test helpers --- -func newTestClusterScope(networkService cs.NetworkService, subnetService cs.SubnetService) *scope.ClusterScope { +func newTestClusterScope(networkService cloudscale.NetworkService, subnetService cloudscale.SubnetService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -62,44 +63,38 @@ func newTestClusterScope(networkService cs.NetworkService, subnetService cs.Subn }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ Networks: networkService, Subnets: subnetService, }, } } -func newTestReconciler() *CloudscaleClusterReconciler { - return &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } -} - // --- Orchestrator tests --- func TestReconcileNetwork_CreatesBothResources(t *testing.T) { g := NewWithT(t) - var capturedNetReq *cloudscale.NetworkCreateRequest - var capturedSubReq *cloudscale.SubnetCreateRequest + var capturedNetReq *cloudscalesdk.NetworkCreateRequest + var capturedSubReq *cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { capturedNetReq = req - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedSubReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -119,20 +114,20 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - t.Fatal("Network create should not be called") + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + g.Fail("Network create should not be called") return nil, nil }, } subnetService := &mockSubnetService{ - getFn: 
func(ctx context.Context, id string) (*cloudscale.Subnet, error) { - return &cloudscale.Subnet{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { + return &cloudscalesdk.Subnet{UUID: id}, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - t.Fatal("Subnet create should not be called") + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { + g.Fail("Subnet create should not be called") return nil, nil }, } @@ -144,7 +139,7 @@ func TestReconcileNetwork_SkipsIfBothExist(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -157,13 +152,13 @@ func TestReconcileNetwork_NetworkErrorStopsSubnet(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { return nil, fmt.Errorf("api error") }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - t.Fatal("Subnet create should not be called when network fails") + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { + g.Fail("Subnet create should not be called when network fails") return nil, nil }, } @@ -171,7 +166,7 @@ func TestReconcileNetwork_NetworkErrorStopsSubnet(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("api error")) @@ -182,12 +177,12 @@ func TestReconcileNetwork_SubnetErrorSurfaced(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: "net-uuid"}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: "net-uuid"}, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { return nil, fmt.Errorf("subnet api error") }, } @@ -195,7 +190,7 @@ func TestReconcileNetwork_SubnetErrorSurfaced(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("subnet api error")) @@ -207,23 +202,23 @@ func TestReconcileNetwork_FindsByTag(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) 
([]cloudscale.Network, error) { - return []cloudscale.Network{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { + return []cloudscalesdk.Network{ {UUID: "found-net-uuid", Name: "test-cluster"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - t.Fatal("Create should not be called when network is found by tag") + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + g.Fail("Create should not be called when network is found by tag") return nil, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "found-subnet-uuid", CIDR: "10.0.0.0/24"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { - t.Fatal("Create should not be called when subnet is found by tag") + g.Fail("Create should not be called when subnet is found by tag") return nil, nil }, @@ -232,7 +227,7 @@ func TestReconcileNetwork_FindsByTag(t *testing.T) { clusterScope := newTestClusterScope(networkService, subnetService) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -245,8 +240,8 @@ func TestReconcileNetwork_ErrorsOnMultipleNetworks(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { - return []cloudscale.Network{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { + return []cloudscalesdk.Network{ {UUID: "net-uuid-1"}, {UUID: "net-uuid-2"}, }, nil @@ -256,7 +251,7 @@ func TestReconcileNetwork_ErrorsOnMultipleNetworks(t *testing.T) { clusterScope := newTestClusterScope(networkService, &mockSubnetService{}) r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 network/tests matching tag filter")) @@ -268,20 +263,20 @@ func TestReconcileNetwork_RecreatesIfDeletedExternally(t *testing.T) { var created bool networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return nil, &cloudscale.ErrorResponse{StatusCode: 404} + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} }, - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { return nil, nil }, - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) 
(*cloudscalesdk.Network, error) { created = true - return &cloudscale.Network{UUID: "new-net-uuid", Name: req.Name}, nil + return &cloudscalesdk.Network{UUID: "new-net-uuid", Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { - return &cloudscale.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { + return &cloudscalesdk.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil }, } @@ -292,7 +287,7 @@ func TestReconcileNetwork_RecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(created).To(BeTrue(), "Should create a new network when old one was deleted") @@ -307,17 +302,17 @@ func TestReconcileNetwork_SubnetFindsByTag(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "found-subnet-uuid", CIDR: "10.0.0.0/24"}, }, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { - t.Fatal("Create should not be called when subnet is found by tag") + g.Fail("Create should not be called when subnet is found by tag") return nil, nil }, @@ -329,13 +324,13 @@ func TestReconcileNetwork_SubnetFindsByTag(t *testing.T) { {Name: "test", NetworkID: netUUID, Managed: true}, } // The networkService Get should return the existing network - networkService.getFn = func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + networkService.getFn = func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil } r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("test") @@ -347,13 +342,13 @@ func TestReconcileNetwork_SubnetErrorsOnMultiple(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, } subnetService := &mockSubnetService{ - listFn: func(ctx context.Context, modifiers 
...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { - return []cloudscale.Subnet{ + listFn: func(ctx context.Context, modifiers 
...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { + return []cloudscalesdk.Subnet{ {UUID: "subnet-uuid-1"}, {UUID: "subnet-uuid-2"}, }, nil @@ -367,7 +362,7 @@ func TestReconcileNetwork_SubnetErrorsOnMultiple(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("found 2 subnet/tests matching tag filter")) @@ -379,20 +374,20 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { var created bool networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: id}, nil + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: id}, nil }, } subnetService := &mockSubnetService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Subnet, error) { - return nil, &cloudscale.ErrorResponse{StatusCode: 404} + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { + return nil, &cloudscalesdk.ErrorResponse{StatusCode: 404} }, - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { return nil, nil }, - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { created = true - return &cloudscale.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "new-subnet-uuid", CIDR: req.CIDR}, nil }, } @@ -403,7 +398,7 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(created).To(BeTrue(), "Should create a new subnet when old one was deleted") @@ -415,17 +410,17 @@ func TestReconcileNetwork_SubnetRecreatesIfDeletedExternally(t *testing.T) { func TestReconcileNetwork_CustomCIDR(t *testing.T) { g := NewWithT(t) - var capturedReq *cloudscale.SubnetCreateRequest + var capturedReq *cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } @@ -434,7 +429,7 @@ func TestReconcileNetwork_CustomCIDR(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := 
r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.CIDR).To(Equal("192.168.0.0/16")) @@ -443,17 +438,17 @@ func TestReconcileNetwork_CustomCIDR(t *testing.T) { func TestReconcileNetwork_ExplicitGateway(t *testing.T) { g := NewWithT(t) - var capturedReq *cloudscale.SubnetCreateRequest + var capturedReq *cloudscalesdk.SubnetCreateRequest networkService := &mockNetworkService{ - createFn: func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { - return &cloudscale.Network{UUID: netUUID, Name: req.Name}, nil + createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{UUID: netUUID, Name: req.Name}, nil }, } subnetService := &mockSubnetService{ - createFn: func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { + createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { capturedReq = req - return &cloudscale.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil + return &cloudscalesdk.Subnet{UUID: "subnet-uuid-123", CIDR: req.CIDR}, nil }, } @@ -462,7 +457,7 @@ func TestReconcileNetwork_ExplicitGateway(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(capturedReq.GatewayAddress).To(Equal("10.0.0.254")) @@ -501,7 +496,7 @@ func TestDeleteNetwork_SkipsIfNoNetwork(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no network exists") + g.Fail("Delete should not be called when no network exists") return nil }, } @@ -520,7 +515,7 @@ func TestDeleteNetwork_IgnoresAlreadyDeleted(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - return &cloudscale.ErrorResponse{StatusCode: 404} + return &cloudscalesdk.ErrorResponse{StatusCode: 404} }, } @@ -542,7 +537,7 @@ func TestDeleteNetwork_SkipsPreExistingNetwork(t *testing.T) { networkService := &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called for Pre-existing networks") + g.Fail("Delete should not be called for Pre-existing networks") return nil }, } @@ -643,8 +638,8 @@ func TestReconcileNetwork_PreExistingCachedShortCircuits(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - t.Fatal("Get should not be called when pre-existing status is cached") + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + g.Fail("Get should not be called when pre-existing status is cached") return nil, nil }, } @@ -659,7 +654,7 @@ func TestReconcileNetwork_PreExistingCachedShortCircuits(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -672,14 +667,14 @@ func TestReconcileNetwork_PreExistingReDiscoversWhenCIDRMissing(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + 
getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { g.Expect(id).To(Equal("pre-existing-uuid")) - return &cloudscale.Network{ + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "rma1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "rma1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "pre-existing-subnet-uuid", CIDR: "192.168.0.0/24"}, }, }, nil @@ -697,7 +692,7 @@ func TestReconcileNetwork_PreExistingReDiscoversWhenCIDRMissing(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -712,14 +707,14 @@ func TestReconcileNetwork_PreExistingFetchesAndSetsStatus(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { g.Expect(id).To(Equal("pre-existing-uuid")) - return &cloudscale.Network{ + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "rma1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "rma1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "discovered-subnet-uuid"}, }, }, nil @@ -733,7 +728,7 @@ func TestReconcileNetwork_PreExistingFetchesAndSetsStatus(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) ns := clusterScope.CloudscaleCluster.Status.GetNetworkStatus("pre-existing-net") @@ -747,7 +742,7 @@ func TestReconcileNetwork_PreExistingGetError(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { return nil, fmt.Errorf("network not found") }, } @@ -759,7 +754,7 @@ func TestReconcileNetwork_PreExistingGetError(t *testing.T) { r := newTestReconciler() - err := r.reconcileNetwork(context.Background(), clusterScope) + _, err := r.reconcileNetwork(context.Background(), clusterScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("network not found")) @@ -769,13 +764,13 @@ func TestReconcileNetwork_PreExistingZoneMismatchErrors(t *testing.T) { g := NewWithT(t) networkService := &mockNetworkService{ - getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) { - return &cloudscale.Network{ + getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) { + return &cloudscalesdk.Network{ UUID: "pre-existing-uuid", - ZonalResource: cloudscale.ZonalResource{ - Zone: cloudscale.ZoneStub{Slug: "lpg1"}, + ZonalResource: cloudscalesdk.ZonalResource{ + Zone: cloudscalesdk.ZoneStub{Slug: "lpg1"}, }, - Subnets: []cloudscale.SubnetStub{ + Subnets: []cloudscalesdk.SubnetStub{ {UUID: "discovered-subnet-uuid"}, }, }, nil @@ -789,7 +784,7 @@ func TestReconcileNetwork_PreExistingZoneMismatchErrors(t *testing.T) { r := newTestReconciler() - err := 
r.reconcileNetwork(context.Background(), clusterScope)
+	_, err := r.reconcileNetwork(context.Background(), clusterScope)
 	g.Expect(err).To(HaveOccurred())
 	g.Expect(err.Error()).To(ContainSubstring("lpg1"))
@@ -801,13 +796,13 @@ func TestReconcileNetwork_PreExistingNoSubnetsErrors(t *testing.T) {
 	g := NewWithT(t)
 	networkService := &mockNetworkService{
-		getFn: func(ctx context.Context, id string) (*cloudscale.Network, error) {
-			return &cloudscale.Network{
+		getFn: func(ctx context.Context, id string) (*cloudscalesdk.Network, error) {
+			return &cloudscalesdk.Network{
 				UUID: "pre-existing-uuid",
-				ZonalResource: cloudscale.ZonalResource{
-					Zone: cloudscale.ZoneStub{Slug: "rma1"},
+				ZonalResource: cloudscalesdk.ZonalResource{
+					Zone: cloudscalesdk.ZoneStub{Slug: "rma1"},
 				},
-				Subnets: []cloudscale.SubnetStub{},
+				Subnets: []cloudscalesdk.SubnetStub{},
 			}, nil
 		},
 	}
@@ -819,36 +814,111 @@
 	r := newTestReconciler()
-	err := r.reconcileNetwork(context.Background(), clusterScope)
+	_, err := r.reconcileNetwork(context.Background(), clusterScope)
 	g.Expect(err).To(HaveOccurred())
 	g.Expect(err.Error()).To(ContainSubstring("has no subnets"))
 }
+
+// --- Timeout handling tests for Create() calls ---
+
+func TestReconcileNetwork_NetworkCreateTimeoutRequeues(t *testing.T) {
+	g := NewWithT(t)
+
+	var createCalled bool
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) {
+			createCalled = true
+			return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/networks", Err: os.ErrDeadlineExceeded}
+		},
+	}
+	subnetService := &mockSubnetService{}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	result, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(result.RequeueAfter).To(Equal(createNetworkTimeoutRequeueAfter),
+		"Should requeue after createNetworkTimeoutRequeueAfter on timeout error")
+	g.Expect(createCalled).To(BeTrue(), "Network create should have been called")
+}
+
+func TestReconcileNetwork_SubnetCreateTimeoutRequeues(t *testing.T) {
+	g := NewWithT(t)
+
+	var netCreated bool
+	networkService := &mockNetworkService{
+		createFn: func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) {
+			netCreated = true
+			return &cloudscalesdk.Network{UUID: "new-net-uuid", Name: req.Name}, nil
+		},
+	}
+	subnetService := &mockSubnetService{
+		createFn: func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) {
+			return nil, &url.Error{Op: "Post", URL: "https://api.example.com/v1/subnets", Err: os.ErrDeadlineExceeded}
+		},
+	}
+
+	clusterScope := newTestClusterScope(networkService, subnetService)
+	r := newTestReconciler()
+
+	result, err := r.reconcileNetwork(context.Background(), clusterScope)
+
+	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(netCreated).To(BeTrue(), "Network should have been created before hitting subnet")
+	g.Expect(result.RequeueAfter).To(Equal(createNetworkTimeoutRequeueAfter),
+		"Should requeue after createNetworkTimeoutRequeueAfter on timeout error")
+}
+
+// --- LB pool-member error test ---
+
+func TestDeleteNetwork_RequeuesOnLBPoolMembersError(t *testing.T) {
+	g := NewWithT(t)
+
+	networkService := &mockNetworkService{
+		deleteFn: func(ctx context.Context, id string) error {
+			//goland:noinspection GoErrorStringFormat
+			return fmt.Errorf("There are still
one or more load balancer pool members in this network.") + }, + } + + clusterScope := newTestClusterScope(networkService, &mockSubnetService{}) + clusterScope.CloudscaleCluster.Status.Networks = []infrastructurev1beta2.NetworkStatus{ + {Name: "test", NetworkID: "net-with-pool-members", SubnetID: "subnet-uuid", Managed: true}, + } + + r := newTestReconciler() + + err := r.deleteNetwork(context.Background(), clusterScope) + + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("network has pending dependencies")) +} + // --- Mock services --- type mockNetworkService struct { - createFn func(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) - getFn func(ctx context.Context, id string) (*cloudscale.Network, error) - listFn func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) + createFn func(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) + getFn func(ctx context.Context, id string) (*cloudscalesdk.Network, error) + listFn func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) deleteFn func(ctx context.Context, id string) error } -func (m *mockNetworkService) Create(ctx context.Context, req *cloudscale.NetworkCreateRequest) (*cloudscale.Network, error) { +func (m *mockNetworkService) Create(ctx context.Context, req *cloudscalesdk.NetworkCreateRequest) (*cloudscalesdk.Network, error) { if m.createFn != nil { return m.createFn(ctx, req) } return nil, nil } -func (m *mockNetworkService) Get(ctx context.Context, id string) (*cloudscale.Network, error) { +func (m *mockNetworkService) Get(ctx context.Context, id string) (*cloudscalesdk.Network, error) { if m.getFn != nil { return m.getFn(ctx, id) } return nil, nil } -func (m *mockNetworkService) List(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Network, error) { +func (m *mockNetworkService) List(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Network, error) { if m.listFn != nil { return m.listFn(ctx, modifiers...) 
} @@ -863,27 +933,27 @@ func (m *mockNetworkService) Delete(ctx context.Context, id string) error { } type mockSubnetService struct { - createFn func(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) - getFn func(ctx context.Context, id string) (*cloudscale.Subnet, error) - listFn func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) + createFn func(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) + getFn func(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) + listFn func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) deleteFn func(ctx context.Context, id string) error } -func (m *mockSubnetService) Create(ctx context.Context, req *cloudscale.SubnetCreateRequest) (*cloudscale.Subnet, error) { +func (m *mockSubnetService) Create(ctx context.Context, req *cloudscalesdk.SubnetCreateRequest) (*cloudscalesdk.Subnet, error) { if m.createFn != nil { return m.createFn(ctx, req) } return nil, nil } -func (m *mockSubnetService) Get(ctx context.Context, id string) (*cloudscale.Subnet, error) { +func (m *mockSubnetService) Get(ctx context.Context, id string) (*cloudscalesdk.Subnet, error) { if m.getFn != nil { return m.getFn(ctx, id) } return nil, nil } -func (m *mockSubnetService) List(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.Subnet, error) { +func (m *mockSubnetService) List(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.Subnet, error) { if m.listFn != nil { return m.listFn(ctx, modifiers...) } diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index 55ba6ea..ab5b4ca 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -33,7 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -74,7 +74,7 @@ func reconcileTestScope(opts reconcileTestOpts) *scope.ClusterScope { }, }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ Networks: opts.networkService, Subnets: opts.subnetService, ServerGroups: opts.serverGroupService, @@ -122,7 +122,7 @@ func defaultMocks() reconcileTestOpts { getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { return &cloudscalesdk.LoadBalancer{ UUID: id, - Status: "running", + Status: LoadBalancerRunningStatus, VIPAddresses: []cloudscalesdk.VIPAddress{ {Address: "1.2.3.4"}, }, @@ -248,7 +248,7 @@ func TestReconcileNormal_LBPendingReturnsRequeue(t *testing.T) { mocks := defaultMocks() mocks.lbService = &mockLoadBalancerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.LoadBalancer, error) { - return &cloudscalesdk.LoadBalancer{UUID: id, Status: "creating"}, nil + return &cloudscalesdk.LoadBalancer{UUID: id, Status: LoadBalancerChangingStatus}, nil }, } @@ -366,7 +366,7 @@ func TestReconcileDelete_LBDeleteErrorStopsDeletion(t *testing.T) { } mocks.networkService = &mockNetworkService{ deleteFn: func(ctx context.Context, id string) error { - 
t.Fatal("Network delete should not be called when LB delete fails") + g.Fail("Network delete should not be called when LB delete fails") return nil }, } @@ -429,7 +429,7 @@ func TestReconcileDelete_LBDisabledSkipsLBDeletion(t *testing.T) { mocks.lbEnabled = false mocks.lbService = &mockLoadBalancerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("LB delete should not be called when LB is disabled") + g.Fail("LB delete should not be called when LB is disabled") return nil }, } diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go index 97b95f2..3f480e9 100644 --- a/internal/controller/cloudscalecluster_servergroup.go +++ b/internal/controller/cloudscalecluster_servergroup.go @@ -18,26 +18,70 @@ package controller import ( "context" + "errors" "fmt" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + "k8s.io/apimachinery/pkg/util/sets" + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/controller-runtime/pkg/client" + infrav1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +// errServerGroupNotEmpty is returned when a server group still contains owned servers +// and cannot be deleted yet. +var errServerGroupNotEmpty = errors.New("server group still contains owned servers") + // deleteServerGroups deletes all server groups owned by the cluster. // This is called during cluster deletion, after all machines have been removed. +// When owned servers are still present in a group, the function returns +// errServerGroupNotEmpty to signal a graceful requeue, preventing the +// "Cannot delete non-empty server group" API error. 
func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, clusterScope *scope.ClusterScope) error { - groups, err := clusterScope.CloudscaleClient.ServerGroups.List(ctx, + ownedServerIDs, err := r.getOwnedServerIDs(ctx, clusterScope) + if err != nil { + return fmt.Errorf("listing owned servers: %w", err) + } + + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + groups, err := clusterScope.CloudscaleClient.ServerGroups.List(listCtx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(clusterScope.CloudscaleCluster))) if err != nil { return fmt.Errorf("listing server groups: %w", err) } for _, g := range groups { + hasOwnedServers := false + hasForeignServers := false + for _, s := range g.Servers { + if ownedServerIDs.Has(s.UUID) { + hasOwnedServers = true + } else { + hasForeignServers = true + } + } + + if hasOwnedServers { + clusterScope.Info("Waiting for owned servers to leave server group", + "serverGroupID", g.UUID, "name", g.Name) + return errServerGroupNotEmpty + } + + if hasForeignServers { + clusterScope.Info("Server group contains foreign servers, skipping deletion", + "serverGroupID", g.UUID, "name", g.Name) + continue + } + clusterScope.Info("Deleting server group", "serverGroupID", g.UUID, "name", g.Name) - if err := clusterScope.CloudscaleClient.ServerGroups.Delete(ctx, g.UUID); err != nil { + deleteCtx, cancelDelete := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + err := clusterScope.CloudscaleClient.ServerGroups.Delete(deleteCtx, g.UUID) + cancelDelete() + if err != nil { if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting server group %s: %w", g.UUID, err) } @@ -47,3 +91,23 @@ func (r *CloudscaleClusterReconciler) deleteServerGroups(ctx context.Context, cl return nil } + +// getOwnedServerIDs returns a set of server UUIDs from CloudscaleMachine resources +// that belong to the given cluster. +func (r *CloudscaleClusterReconciler) getOwnedServerIDs(ctx context.Context, clusterScope *scope.ClusterScope) (sets.Set[string], error) { + machineList := &infrav1beta2.CloudscaleMachineList{} + if err := r.List(ctx, machineList, + client.InNamespace(clusterScope.CloudscaleCluster.Namespace), + client.MatchingLabels{clusterv1.ClusterNameLabel: clusterScope.Cluster.Name}, + ); err != nil { + return nil, fmt.Errorf("listing CloudscaleMachines: %w", err) + } + + ids := sets.New[string]() + for _, m := range machineList.Items { + if m.Status.ServerID != "" { + ids.Insert(m.Status.ServerID) + } + } + return ids, nil +} diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index f55bdf2..d009d95 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,11 +29,11 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) -func newTestClusterScopeWithServerGroups(serverGroupService cs.ServerGroupService) *scope.ClusterScope { +func newTestClusterScopeWithServerGroups(serverGroupService cloudscale.ServerGroupService) *scope.ClusterScope { return &scope.ClusterScope{ Logger: logr.Discard(), Cluster: &clusterv1.Cluster{ @@ -52,7 +52,7 @@ func newTestClusterScopeWithServerGroups(serverGroupService cs.ServerGroupServic Zone: "rma1", }, }, - CloudscaleClient: &cs.Client{ + CloudscaleClient: &cloudscale.Client{ ServerGroups: serverGroupService, }, } @@ -64,8 +64,8 @@ func TestDeleteServerGroups_DeletesAll(t *testing.T) { var deletedIDs []string serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-1", Name: "group-1"}, {UUID: "sg-2", Name: "group-2"}, }, nil @@ -77,9 +77,7 @@ func TestDeleteServerGroups_DeletesAll(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -91,19 +89,17 @@ func TestDeleteServerGroups_NoGroups_Noop(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { return nil, nil }, deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server groups exist") + g.Fail("Delete should not be called when no server groups exist") return nil }, } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -114,15 +110,13 @@ func TestDeleteServerGroups_ListError_PropagatesError(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { return nil, fmt.Errorf("api error") }, } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -134,8 +128,8 @@ func TestDeleteServerGroups_DeleteError_PropagatesError(t *testing.T) { g := NewWithT(t) serverGroupService := 
&mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-1", Name: "group-1"}, }, nil }, @@ -145,9 +139,7 @@ func TestDeleteServerGroups_DeleteError_PropagatesError(t *testing.T) { } clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) - r := &CloudscaleClusterReconciler{ - recorder: events.NewFakeRecorder(10), - } + r := newTestReconciler() err := r.deleteServerGroups(context.Background(), clusterScope) @@ -159,22 +151,130 @@ func TestDeleteServerGroups_Ignores404(t *testing.T) { g := NewWithT(t) serverGroupService := &mockServerGroupService{ - listFn: func(ctx context.Context, modifiers ...cloudscale.ListRequestModifier) ([]cloudscale.ServerGroup, error) { - return []cloudscale.ServerGroup{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ {UUID: "sg-already-deleted", Name: "group-1"}, }, nil }, deleteFn: func(ctx context.Context, id string) error { - return &cloudscale.ErrorResponse{StatusCode: 404} + return &cloudscalesdk.ErrorResponse{StatusCode: 404} + }, + } + + clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) + r := newTestReconciler() + + err := r.deleteServerGroups(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) +} + +func TestDeleteServerGroups_OwnedServerPresent_SkipsDeletion(t *testing.T) { + g := NewWithT(t) + + // The server group has a server that is owned by our cluster + serverGroupService := &mockServerGroupService{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ + {UUID: "sg-1", Name: "group-1", Servers: []cloudscalesdk.ServerStub{{UUID: "server-123"}}}, + }, nil + }, + deleteFn: func(ctx context.Context, id string) error { + // Delete should NOT be called because server group has owned servers + g.Fail("Delete must not be called when group has owned servers") + return nil + }, + } + + clusterScope := newTestClusterScopeWithServerGroups(serverGroupService) + fakeClient := newTestFakeClient( + &infrastructurev1beta2.CloudscaleMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-1", + Namespace: "default", + Labels: map[string]string{ + clusterv1.ClusterNameLabel: "test-cluster", + }, + }, + Status: infrastructurev1beta2.CloudscaleMachineStatus{ + ServerID: "server-123", + }, + }, + ) + r := &CloudscaleClusterReconciler{ + Client: fakeClient, + recorder: events.NewFakeRecorder(10), + } + + err := r.deleteServerGroups(context.Background(), clusterScope) + + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("server group still contains owned servers")) +} + +func TestDeleteServerGroups_ForeignServers_Skips(t *testing.T) { + g := NewWithT(t) + called := false + + // Server group has a server that is NOT owned by this cluster + serverGroupService := &mockServerGroupService{ + listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { + return []cloudscalesdk.ServerGroup{ + {UUID: "sg-foreign", Name: "foreign-group", Servers: []cloudscalesdk.ServerStub{{UUID: "server-999"}}}, + }, nil + }, + deleteFn: 
func(ctx context.Context, id string) error {
+			called = true
+			return nil
 		},
 	}
 	clusterScope := newTestClusterScopeWithServerGroups(serverGroupService)
+	fakeClient := newTestFakeClient(&infrastructurev1beta2.CloudscaleMachine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "machine-1",
+			Namespace: "default",
+			Labels: map[string]string{
+				clusterv1.ClusterNameLabel: "test-cluster",
+			},
+		},
+		Status: infrastructurev1beta2.CloudscaleMachineStatus{
+			ServerID: "server-123",
+		},
+	})
 	r := &CloudscaleClusterReconciler{
+		Client:   fakeClient,
 		recorder: events.NewFakeRecorder(10),
 	}
 	err := r.deleteServerGroups(context.Background(), clusterScope)
 	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(called).To(BeFalse())
+}
+
+func TestDeleteServerGroups_EmptyGroup_DeletesImmediately(t *testing.T) {
+	g := NewWithT(t)
+	var deletedID string
+
+	// Server group with no servers should be deleted immediately
+	serverGroupService := &mockServerGroupService{
+		listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) {
+			return []cloudscalesdk.ServerGroup{
+				{UUID: "sg-empty", Name: "empty-group", Servers: nil},
+			}, nil
+		},
+		deleteFn: func(ctx context.Context, id string) error {
+			deletedID = id
+			return nil
+		},
+	}
+
+	clusterScope := newTestClusterScopeWithServerGroups(serverGroupService)
+	r := newTestReconciler()
+
+	err := r.deleteServerGroups(context.Background(), clusterScope)
+
+	g.Expect(err).ToNot(HaveOccurred())
+	g.Expect(deletedID).To(Equal("sg-empty"))
 }
diff --git a/internal/controller/cloudscalemachine_controller.go b/internal/controller/cloudscalemachine_controller.go
index b886f6b..16a5edb 100644
--- a/internal/controller/cloudscalemachine_controller.go
+++ b/internal/controller/cloudscalemachine_controller.go
@@ -19,6 +19,7 @@ package controller
 import (
 	"context"
 	"fmt"
+	"net/http"
 	"time"

 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -34,6 +35,7 @@ import (
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/handler"
 	logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -52,9 +54,11 @@ const (
 // CloudscaleMachineReconciler reconciles a CloudscaleMachine object
 type CloudscaleMachineReconciler struct {
 	client.Client
-	Scheme      *runtime.Scheme
-	recorder    events.EventRecorder
-	WatchFilter string
+	Scheme                  *runtime.Scheme
+	recorder                events.EventRecorder
+	WatchFilter             string
+	Transport               *http.Transport
+	MaxConcurrentReconciles int
 }

 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudscalemachines,verbs=get;list;watch;create;update;patch;delete
@@ -65,6 +69,9 @@ type CloudscaleMachineReconciler struct {

 // Reconcile handles CloudscaleMachine reconciliation.
func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + logger := logf.FromContext(ctx) cloudscaleMachine := &infrastructurev1beta2.CloudscaleMachine{} @@ -128,7 +135,7 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token) + cloudscaleClient := cloudscale.NewClient(token, r.Transport) machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ Client: r.Client, @@ -144,7 +151,11 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re } defer func() { - if err := machineScope.Close(ctx); err != nil && reterr == nil { + // Use a separate context for the status patch so it succeeds even + // when the reconcile context has timed out. + patchCtx, patchCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer patchCancel() + if err := machineScope.Close(patchCtx); err != nil && reterr == nil { reterr = err } }() @@ -174,8 +185,10 @@ func (r *CloudscaleMachineReconciler) reconcileNormal(ctx context.Context, machi } if machineScope.CloudscaleMachine.Spec.ServerGroup != nil { - if err := r.reconcileServerGroup(ctx, machineScope); err != nil { + if result, err := r.reconcileServerGroup(ctx, machineScope); err != nil { return ctrl.Result{}, err + } else if !result.IsZero() { + return result, nil } } @@ -261,6 +274,7 @@ func (r *CloudscaleMachineReconciler) SetupWithManager(ctx context.Context, mgr return ctrl.NewControllerManagedBy(mgr). For(&infrastructurev1beta2.CloudscaleMachine{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.MaxConcurrentReconciles}). WithEventFilter(predicates.ResourceNotPaused(r.Scheme, logger)). WithEventFilter(predicates.ResourceHasFilterLabel(r.Scheme, logger, r.WatchFilter)). WithEventFilter(predicates.ResourceIsNotExternallyManaged(r.Scheme, logger)). 
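The controller change above bounds the whole reconcile under a three-minute deadline while deliberately patching status on a fresh context, so a reconcile that ran out of time can still persist what it learned. A condensed, self-contained sketch of that pattern (the durations mirror the change above, but `reconcileWithDeadline` and its parameters are illustrative names, not part of this diff):

```go
package controller

import (
	"context"
	"time"
)

// Condensed sketch of the deadline-plus-detached-patch pattern used in
// Reconcile above; the helper and its parameters are illustrative.
func reconcileWithDeadline(
	parent context.Context,
	closeScope func(context.Context) error, // e.g. machineScope.Close, patches status
	work func(context.Context) error, // the actual reconcile body
) (retErr error) {
	// Every API call made by work inherits this deadline.
	ctx, cancel := context.WithTimeout(parent, 3*time.Minute)
	defer cancel()

	defer func() {
		// Fresh context: the status patch must still succeed after ctx expires.
		patchCtx, patchCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer patchCancel()
		if err := closeScope(patchCtx); err != nil && retErr == nil {
			retErr = err
		}
	}()

	return work(ctx)
}
```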
diff --git a/internal/controller/cloudscalemachine_reconcile_test.go b/internal/controller/cloudscalemachine_reconcile_test.go index 4f2dda4..53dff81 100644 --- a/internal/controller/cloudscalemachine_reconcile_test.go +++ b/internal/controller/cloudscalemachine_reconcile_test.go @@ -89,11 +89,11 @@ func TestMachineReconcileNormal_BootstrapDataNotReady(t *testing.T) { serverService := &mockServerService{ getFn: func(ctx context.Context, id string) (*cloudscalesdk.Server, error) { - t.Fatal("Server Get should not be called when bootstrap data is not ready") + g.Fail("Server Get should not be called when bootstrap data is not ready") return nil, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Server Create should not be called when bootstrap data is not ready") + g.Fail("Server Create should not be called when bootstrap data is not ready") return nil, nil }, } @@ -242,7 +242,7 @@ func TestMachineReconcileDelete_NoServer(t *testing.T) { serverService := &mockServerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server exists") + g.Fail("Delete should not be called when no server exists") return nil }, } diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index e7fd681..de1e54b 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "maps" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" corev1 "k8s.io/api/core/v1" @@ -82,7 +83,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi // 1. If we have a server ID in status, verify it still exists if machineScope.CloudscaleMachine.Status.ServerID != "" { var err error - server, err = machineScope.CloudscaleClient.Servers.Get(ctx, machineScope.CloudscaleMachine.Status.ServerID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + server, err = machineScope.CloudscaleClient.Servers.Get(getCtx, machineScope.CloudscaleMachine.Status.ServerID) if err == nil { machineScope.V(2).Info("Server already exists", "serverID", machineScope.CloudscaleMachine.Status.ServerID, "status", server.Status) r.updateMachineFromServer(machineScope, server) @@ -101,7 +104,9 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi } // 2. 
Search for existing server by tag (idempotency after crash) - servers, err := machineScope.CloudscaleClient.Servers.List(ctx, + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + servers, err := machineScope.CloudscaleClient.Servers.List(listCtx, cloudscalesdk.WithTagFilter(r.machineLookupTag(machineScope))) if err != nil { return ctrl.Result{}, fmt.Errorf("listing servers: %w", err) @@ -160,8 +165,15 @@ func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machi req.ServerGroups = []string{machineScope.CloudscaleMachine.Status.ServerGroupID} } - server, err = machineScope.CloudscaleClient.Servers.Create(ctx, req) + createCtx, cancelCreate := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelCreate() + server, err = machineScope.CloudscaleClient.Servers.Create(createCtx, req) if err != nil { + if cloudscale.IsTimeoutError(err) { + requeueAfter := 30 * time.Second + machineScope.Info("Server creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } return ctrl.Result{}, fmt.Errorf("creating server: %w", err) } @@ -248,7 +260,9 @@ func (r *CloudscaleMachineReconciler) deleteServer(ctx context.Context, machineS serverID := machineScope.CloudscaleMachine.Status.ServerID machineScope.Info("Deleting server", "serverID", serverID) - if err := machineScope.CloudscaleClient.Servers.Delete(ctx, serverID); err != nil { + deleteCtx, cancel := context.WithTimeout(ctx, cloudscale.DeleteTimeout) + defer cancel() + if err := machineScope.CloudscaleClient.Servers.Delete(deleteCtx, serverID); err != nil { // Ignore 404 - server was already deleted externally if !cloudscale.IsNotFound(err) { return fmt.Errorf("deleting server: %w", err) diff --git a/internal/controller/cloudscalemachine_server_test.go b/internal/controller/cloudscalemachine_server_test.go index d97a84c..3616ef4 100644 --- a/internal/controller/cloudscalemachine_server_test.go +++ b/internal/controller/cloudscalemachine_server_test.go @@ -25,17 +25,14 @@ import ( . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/events" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/util/conditions" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -84,16 +81,8 @@ func (m *mockServerService) Update(ctx context.Context, id string, req *cloudsca return nil } -func newTestFakeClient(objs ...client.Object) client.Client { - scheme := runtime.NewScheme() - _ = corev1.AddToScheme(scheme) - _ = clusterv1.AddToScheme(scheme) - _ = infrastructurev1beta2.AddToScheme(scheme) - return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() -} - -func newTestMachineScopeWithServer(serverService cs.ServerService) *scope.MachineScope { - cloudscaleClient := &cs.Client{ +func newTestMachineScopeWithServer(serverService cloudscale.ServerService) *scope.MachineScope { + cloudscaleClient := &cloudscale.Client{ Servers: serverService, } @@ -292,7 +281,7 @@ func TestReconcileServer_SkipsIfAlreadyExists(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Create should not be called when server already exists") + g.Fail("Create should not be called when server already exists") return nil, nil }, } @@ -324,7 +313,7 @@ func TestReconcileServer_FindsExistingByTag(t *testing.T) { }, nil }, createFn: func(ctx context.Context, req *cloudscalesdk.ServerRequest) (*cloudscalesdk.Server, error) { - t.Fatal("Create should not be called when server is found by tag") + g.Fail("Create should not be called when server is found by tag") return nil, nil }, } @@ -407,7 +396,7 @@ func TestDeleteServer_SkipsIfNoServer(t *testing.T) { g := NewWithT(t) serverService := &mockServerService{ deleteFn: func(ctx context.Context, id string) error { - t.Fatal("Delete should not be called when no server exists") + g.Fail("Delete should not be called when no server exists") return nil }, } diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index 5687772..913adbe 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -19,19 +19,28 @@ package controller import ( "context" "fmt" + "sync" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) +// serverGroupMu serializes server group creation to prevent duplicates +// when multiple machines reconcile concurrently. The cloudscale API does +// not return a conflict on duplicate creation, so we serialize here. +// Safe: leader election guarantees a single replica. 
+var serverGroupMu sync.Mutex + // reconcileServerGroup ensures the server group exists if specified. // Server groups are zone-scoped and created once per unique name+zone combination. -func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (reterr error) { +func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { defer func() { if reterr != nil { r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.ServerGroupReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.ServerGroupErrorReason, reterr.Error()) @@ -41,36 +50,44 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, }() if machineScope.CloudscaleMachine.Spec.ServerGroup == nil { - return nil // No server group requested + return ctrl.Result{}, nil // No server group requested } // If we already have a server group ID, verify it still exists if machineScope.CloudscaleMachine.Status.ServerGroupID != "" { - _, err := machineScope.CloudscaleClient.ServerGroups.Get(ctx, machineScope.CloudscaleMachine.Status.ServerGroupID) + getCtx, cancel := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancel() + _, err := machineScope.CloudscaleClient.ServerGroups.Get(getCtx, machineScope.CloudscaleMachine.Status.ServerGroupID) if err == nil { - return nil + return ctrl.Result{}, nil } if !cloudscale.IsNotFound(err) { - return fmt.Errorf("getting server group: %w", err) + return ctrl.Result{}, fmt.Errorf("getting server group: %w", err) } // Server group was deleted externally, fall through to re-create machineScope.CloudscaleMachine.Status.ServerGroupID = "" } + // Serialize server group list+create to prevent duplicates under concurrent reconciles. + serverGroupMu.Lock() + defer serverGroupMu.Unlock() + zone := machineScope.CloudscaleCluster.Spec.Zone groupName := machineScope.CloudscaleMachine.Spec.ServerGroup.Name // Search for existing server group by name and zone using cluster-level tags // so that all machines in the cluster can find the same server group. 
- groups, err := machineScope.CloudscaleClient.ServerGroups.List(ctx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(machineScope.CloudscaleCluster))) + listCtx, cancelList := context.WithTimeout(ctx, cloudscale.ReadTimeout) + defer cancelList() + groups, err := machineScope.CloudscaleClient.ServerGroups.List(listCtx, cloudscalesdk.WithTagFilter(clusterOwnershipTags(machineScope.CloudscaleCluster))) if err != nil { - return fmt.Errorf("listing server groups: %w", err) + return ctrl.Result{}, fmt.Errorf("listing server groups: %w", err) } for _, g := range groups { if g.Name == groupName && g.Zone.Slug == zone { machineScope.CloudscaleMachine.Status.ServerGroupID = g.UUID - return nil + return ctrl.Result{}, nil } } @@ -84,13 +101,20 @@ func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, }, } - group, err := machineScope.CloudscaleClient.ServerGroups.Create(ctx, req) + createCtx, cancelCreate := context.WithTimeout(ctx, cloudscale.WriteTimeout) + defer cancelCreate() + group, err := machineScope.CloudscaleClient.ServerGroups.Create(createCtx, req) if err != nil { - return fmt.Errorf("creating server group: %w", err) + if cloudscale.IsTimeoutError(err) { + requeueAfter := 5 * time.Second + machineScope.Info("Server group creation timed out, waiting before retry", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, fmt.Errorf("creating server group: %w", err) } machineScope.CloudscaleMachine.Status.ServerGroupID = group.UUID - return nil + return ctrl.Result{}, nil } // deleteServerGroup clears the server group reference from the machine status. diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index 4506be4..e75f3e1 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -32,7 +32,7 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" - cs "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -79,8 +79,8 @@ func (m *mockServerGroupService) Update(ctx context.Context, id string, req *clo return nil } -func newTestMachineScopeWithServerGroup(serverGroupService cs.ServerGroupService) *scope.MachineScope { - cloudscaleClient := &cs.Client{ +func newTestMachineScopeWithServerGroup(serverGroupService cloudscale.ServerGroupService) *scope.MachineScope { + cloudscaleClient := &cloudscale.Client{ ServerGroups: serverGroupService, } @@ -155,7 +155,7 @@ func TestReconcileServerGroup_NoServerGroup_Noop(t *testing.T) { serverGroupService := &mockServerGroupService{ listFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.ServerGroup, error) { - t.Fatal("List should not be called when no server group is specified") + g.Fail("List should not be called when no server group is specified") return nil, nil }, } @@ -167,7 +167,7 @@ func TestReconcileServerGroup_NoServerGroup_Noop(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) @@ -202,7 +202,7 @@ func 
TestReconcileServerGroup_FindsExisting_SetsStatusID(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeFalse(), "Create should not be called when group exists") @@ -244,7 +244,7 @@ func TestReconcileServerGroup_SkipsNonMatchingName(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeTrue(), "Create should be called when no matching name found") @@ -282,7 +282,7 @@ func TestReconcileServerGroup_SkipsNonMatchingZone(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(createCalled).To(BeTrue(), "Create should be called when no matching zone found") @@ -313,7 +313,7 @@ func TestReconcileServerGroup_CreatesNew_SetsStatusID(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineScope.CloudscaleMachine.Status.ServerGroupID).To(Equal("created-group-uuid")) @@ -347,7 +347,7 @@ func TestReconcileServerGroup_ListError_PropagatesError(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("listing server groups")) @@ -375,7 +375,7 @@ func TestReconcileServerGroup_CreateError_PropagatesError(t *testing.T) { recorder: events.NewFakeRecorder(10), } - err := r.reconcileServerGroup(context.Background(), machineScope) + _, err := r.reconcileServerGroup(context.Background(), machineScope) g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(ContainSubstring("creating server group")) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index ee6afbc..76b16b4 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -23,9 +23,14 @@ import ( "path/filepath" "testing" - "k8s.io/client-go/kubernetes/scheme" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/events" + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -47,7 +52,7 @@ func TestMain(m *testing.M) { ctx, cancel = context.WithCancel(context.TODO()) - err := infrastructurev1beta2.AddToScheme(scheme.Scheme) + err := infrastructurev1beta2.AddToScheme(clientgoscheme.Scheme) if err != nil { fmt.Fprintf(os.Stderr, "Failed to add scheme: %v\n", err) os.Exit(1) @@ -76,7 +81,7 @@ func TestMain(m *testing.M) { os.Exit(1) } - k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + k8sClient, err = client.New(cfg, client.Options{Scheme: 
clientgoscheme.Scheme}) if err != nil { fmt.Fprintf(os.Stderr, "Failed to create client: %v\n", err) os.Exit(1) @@ -115,3 +120,18 @@ func getFirstFoundEnvTestBinaryDir() string { } return "" } + +func newTestFakeClient(objs ...client.Object) client.Client { + scheme := runtime.NewScheme() + _ = corev1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + _ = infrastructurev1beta2.AddToScheme(scheme) + return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() +} + +func newTestReconciler() *CloudscaleClusterReconciler { + return &CloudscaleClusterReconciler{ + Client: newTestFakeClient(), + recorder: events.NewFakeRecorder(10), + } +} diff --git a/internal/webhook/v1beta2/cloudscalecluster_webhook.go b/internal/webhook/v1beta2/cloudscalecluster_webhook.go index 8143a65..698989d 100644 --- a/internal/webhook/v1beta2/cloudscalecluster_webhook.go +++ b/internal/webhook/v1beta2/cloudscalecluster_webhook.go @@ -59,7 +59,7 @@ type CloudscaleClusterCustomDefaulter struct { RegionInfo *cloudscale.RegionInfo } -const defaultSubnetCIDR = "10.0.0.0/24" +const defaultSubnetCIDR = "172.18.0.0/24" // Default implements webhook.CustomDefaulter so a webhook will be registered for the Kind CloudscaleCluster. func (d *CloudscaleClusterCustomDefaulter) Default(_ context.Context, cluster *infrastructurev1beta2.CloudscaleCluster) error { @@ -93,9 +93,6 @@ func (d *CloudscaleClusterCustomDefaulter) Default(_ context.Context, cluster *i if cluster.Spec.ControlPlaneLoadBalancer.APIServerPort == 0 { cluster.Spec.ControlPlaneLoadBalancer.APIServerPort = 6443 } - if cluster.Spec.ControlPlaneLoadBalancer.IPFamily == "" { - cluster.Spec.ControlPlaneLoadBalancer.IPFamily = infrastructurev1beta2.IPFamilyDualStack - } if cluster.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS == 0 { cluster.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS = 5 @@ -405,7 +402,7 @@ func validateFloatingIPRequiresLBOrPreExisting(cluster *infrastructurev1beta2.Cl } // validateLBImmutability forbids changes to LB fields that are baked into the LB at creation. -// Algorithm, Flavor, APIServerPort, IPFamily and the HealthMonitor settings cannot be reissued +// Algorithm, Flavor, APIServerPort and the HealthMonitor settings cannot be reissued // to an existing cloudscale.ch LB, so changing them in spec would silently lie to the user. 
func validateLBImmutability(oldLB, newLB *infrastructurev1beta2.LoadBalancerSpec, fldPath *field.Path) field.ErrorList { var allErrs field.ErrorList @@ -421,7 +418,6 @@ func validateLBImmutability(oldLB, newLB *infrastructurev1beta2.LoadBalancerSpec forbidIfChanged("algorithm", oldLB.Algorithm, newLB.Algorithm) forbidIfChanged("flavor", oldLB.Flavor, newLB.Flavor) forbidIfChanged("apiServerPort", oldLB.APIServerPort, newLB.APIServerPort) - forbidIfChanged("ipFamily", oldLB.IPFamily, newLB.IPFamily) hmPath := fldPath.Child("healthMonitor") hmForbid := func(child string, oldV, newV int) { diff --git a/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go b/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go index 33cfde5..84a6f70 100644 --- a/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go +++ b/internal/webhook/v1beta2/cloudscalecluster_webhook_test.go @@ -153,15 +153,6 @@ func TestClusterDefaulting_APIServerPort(t *testing.T) { g.Expect(obj.Spec.ControlPlaneLoadBalancer.APIServerPort).To(Equal(int32(6443))) } -func TestClusterDefaulting_LBIPFamily(t *testing.T) { - g := NewWithT(t) - obj, _, _, defaulter := newClusterWebhookTestObjects() - obj.Spec.ControlPlaneLoadBalancer.IPFamily = "" - - g.Expect(defaulter.Default(ctx, obj)).To(Succeed()) - g.Expect(obj.Spec.ControlPlaneLoadBalancer.IPFamily).To(Equal(infrastructurev1beta2.IPFamilyDualStack)) -} - func TestClusterDefaulting_HealthMonitorFields(t *testing.T) { g := NewWithT(t) obj, _, _, defaulter := newClusterWebhookTestObjects() @@ -225,7 +216,6 @@ func TestClusterDefaulting_AllDefaultsApplied(t *testing.T) { g.Expect(obj.Spec.ControlPlaneLoadBalancer.Algorithm).To(Equal("round_robin")) g.Expect(obj.Spec.ControlPlaneLoadBalancer.Flavor).To(Equal("lb-standard")) g.Expect(obj.Spec.ControlPlaneLoadBalancer.APIServerPort).To(Equal(int32(6443))) - g.Expect(obj.Spec.ControlPlaneLoadBalancer.IPFamily).To(Equal(infrastructurev1beta2.IPFamilyDualStack)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.DelayS).To(Equal(5)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.TimeoutS).To(Equal(3)) g.Expect(obj.Spec.ControlPlaneLoadBalancer.HealthMonitor.UpThreshold).To(Equal(2)) @@ -330,7 +320,7 @@ func TestClusterValidateCreate_GatewayWithinCIDR(t *testing.T) { obj.Spec.Region = RegionRma obj.Spec.Zone = ZoneRma1 obj.Spec.Networks = []infrastructurev1beta2.NetworkSpec{ - {Name: "main", CIDR: defaultSubnetCIDR, GatewayAddress: "10.0.0.1"}, + {Name: "main", CIDR: defaultSubnetCIDR, GatewayAddress: "172.18.0.1"}, } _, err := validator.ValidateCreate(ctx, obj) @@ -742,13 +732,6 @@ func TestClusterValidateUpdate_LBFieldsImmutable(t *testing.T) { mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { c.Spec.ControlPlaneLoadBalancer.APIServerPort = 8443 }, errPath: "controlPlaneLoadBalancer.apiServerPort", }, - { - name: "IPFamily", - mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { - c.Spec.ControlPlaneLoadBalancer.IPFamily = infrastructurev1beta2.IPFamilyIPv6 - }, - errPath: "controlPlaneLoadBalancer.ipFamily", - }, { name: "HealthMonitor.DelayS", mutate: func(c *infrastructurev1beta2.CloudscaleCluster) { @@ -783,11 +766,10 @@ func TestClusterValidateUpdate_LBFieldsImmutable(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) obj, oldObj, validator := setupUpdateTestObjects() - // Seed health monitor + IPFamily on old so changes are visible. + // Seed health monitor on old so changes are visible. 
diff --git a/templates/cluster-template.yaml b/templates/cluster-template.yaml
index d075f2a..f3f016b 100644
--- a/templates/cluster-template.yaml
+++ b/templates/cluster-template.yaml
@@ -39,7 +39,7 @@ spec:
   region: "${CLOUDSCALE_REGION}"
   networks:
     - name: "${CLUSTER_NAME}"
-      cidr: "10.0.0.0/24"
+      cidr: "172.18.0.0/24"
       gatewayAddress: "" # disable gateway, use public interface for internet access
   credentialsRef:
     name: "${CLUSTER_NAME}-credentials"
diff --git a/test/e2e/cloudscale_helpers.go b/test/e2e/cloudscale_helpers.go
index 6f33bb6..f46a973 100644
--- a/test/e2e/cloudscale_helpers.go
+++ b/test/e2e/cloudscale_helpers.go
@@ -23,14 +23,12 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/cloudscale-ch/cloudscale-go-sdk/v8"
+	"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale"
 )
 
 // newCloudscaleClient creates a new cloudscale API client from the given token.
 func newCloudscaleClient(token string) *cloudscale.Client {
-	c := cloudscale.NewClient(nil)
-	c.AuthToken = token
-	return c
+	return cloudscale.NewClient(token, cloudscale.NewTransport())
 }
 
 // resourceSnapshot holds a snapshot of cloudscale API resources for leak detection.
diff --git a/test/e2e/config/cloudscale.yaml b/test/e2e/config/cloudscale.yaml
index 55f6c7b..910e6be 100644
--- a/test/e2e/config/cloudscale.yaml
+++ b/test/e2e/config/cloudscale.yaml
@@ -42,7 +42,7 @@ providers:
     type: InfrastructureProvider
     versions:
       - name: v0.99.99 # dev version, always higher than any release
-        value: ../../../config/default
+        value: ../../../config/e2e
         type: kustomize
         contract: v1beta2
         replacements:
@@ -73,7 +73,7 @@ variables:
   CLOUDSCALE_MACHINE_IMAGE_UPGRADE_FROM: "custom:ubuntu-2404-kube-v1.34.3"
   # CLOUDSCALE_SSH_PUBLIC_KEY: Set via environment variable
   CLOUDSCALE_ROOT_VOLUME_SIZE: "20"
-  CLOUDSCALE_NETWORK_CIDR: "10.100.0.0/24"
+  CLOUDSCALE_NETWORK_CIDR: "172.18.0.0/24"
 
   # Machine template names for upgrade tests
   CONTROL_PLANE_MACHINE_TEMPLATE_UPGRADE_TO: "k8s-upgrade-control-plane"
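Both the template and the e2e config above move the node subnet off `10.x` ranges. A quick way to sanity-check a chosen `networks[].cidr` against a pod CIDR is `netip.Prefix.Overlaps`; a sketch, with the pod range as an assumed example value:

```go
package main

import (
	"fmt"
	"net/netip"
)

func main() {
	// The new default node subnet from the template and e2e config.
	nodes := netip.MustParsePrefix("172.18.0.0/24")
	// Example pod CIDR; substitute whatever range your CNI is configured with.
	pods := netip.MustParsePrefix("10.0.0.0/8")

	fmt.Println(nodes.Overlaps(pods)) // false: the ranges are disjoint

	// The previous e2e CIDR sat inside 10.0.0.0/8 and would report a collision.
	old := netip.MustParsePrefix("10.100.0.0/24")
	fmt.Println(old.Overlaps(pods)) // true
}
```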
"github.com/onsi/gomega" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - - apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2" controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" @@ -41,8 +38,10 @@ import ( "sigs.k8s.io/cluster-api/test/framework" "sigs.k8s.io/cluster-api/test/framework/bootstrap" "sigs.k8s.io/cluster-api/test/framework/clusterctl" + ctrl "sigs.k8s.io/controller-runtime" infrav1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" ) var (