From 83823c0df41db16b68bd6ce92bf59031a891b941 Mon Sep 17 00:00:00 2001 From: Kamil Ubych Date: Thu, 2 Apr 2026 12:55:01 +0200 Subject: [PATCH 1/4] metrics --- cmd/main.go | 25 ++- .../controller/postgrescluster_controller.go | 39 +++- .../controller/postgresdatabase_controller.go | 25 ++- pkg/postgresql/cluster/core/cluster.go | 13 ++ pkg/postgresql/cluster/core/types.go | 2 + pkg/postgresql/database/core/database.go | 15 ++ .../database/core/database_unit_test.go | 2 +- pkg/postgresql/database/core/types.go | 4 +- pkg/postgresql/metrics/collector.go | 130 +++++++++++++ pkg/postgresql/metrics/noop.go | 22 +++ pkg/postgresql/metrics/ports.go | 102 ++++++++++ pkg/postgresql/metrics/prometheus.go | 175 ++++++++++++++++++ 12 files changed, 538 insertions(+), 16 deletions(-) create mode 100644 pkg/postgresql/metrics/collector.go create mode 100644 pkg/postgresql/metrics/noop.go create mode 100644 pkg/postgresql/metrics/ports.go create mode 100644 pkg/postgresql/metrics/prometheus.go diff --git a/cmd/main.go b/cmd/main.go index 332623f0d..5be84f744 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,6 +25,7 @@ import ( "path/filepath" "time" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" intController "github.com/splunk/splunk-operator/internal/controller" @@ -55,6 +56,7 @@ import ( "github.com/splunk/splunk-operator/internal/controller" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" //+kubebuilder:scaffold:imports //extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" ) @@ -282,18 +284,29 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Telemetry") os.Exit(1) } + pgRecorder := pgmetrics.NewPrometheusRecorder() + if err := pgmetrics.Register(crmetrics.Registry); err != nil { + setupLog.Error(err, "unable to register PostgreSQL metrics") + os.Exit(1) + } + pgFleetCollector := pgmetrics.NewFleetCollector() + if err := (&controller.PostgresDatabaseReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("postgresdatabase-controller"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("postgresdatabase-controller"), + Metrics: pgRecorder, + FleetCollector: pgFleetCollector, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PostgresDatabase") os.Exit(1) } if err := (&controller.PostgresClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("postgrescluster-controller"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("postgrescluster-controller"), + Metrics: pgRecorder, + FleetCollector: pgFleetCollector, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PostgresCluster") os.Exit(1) diff --git a/internal/controller/postgrescluster_controller.go b/internal/controller/postgrescluster_controller.go index 70b11c9e6..c25ea27d3 100644 --- a/internal/controller/postgrescluster_controller.go +++ b/internal/controller/postgrescluster_controller.go @@ -18,12 +18,15 @@ package controller import ( "context" + "time" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" clustercore "github.com/splunk/splunk-operator/pkg/postgresql/cluster/core" + pgmetrics 
"github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -42,8 +45,10 @@ const ( // PostgresClusterReconciler reconciles PostgresCluster resources. type PostgresClusterReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + Metrics pgmetrics.Recorder + FleetCollector *pgmetrics.FleetCollector } // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters,verbs=get;list;watch;create;update;patch;delete @@ -57,8 +62,34 @@ type PostgresClusterReconciler struct { // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *PostgresClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder} - return clustercore.PostgresClusterService(ctx, rc, req) + start := time.Now() + rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} + result, err := clustercore.PostgresClusterService(ctx, rc, req) + + resultLabel := pgmetrics.ResultSuccess + if err != nil { + resultLabel = pgmetrics.ResultError + r.Metrics.IncReconcileError(pgmetrics.ControllerCluster, classifyError(err)) + } else if result.RequeueAfter > 0 || result.Requeue { + resultLabel = pgmetrics.ResultRequeue + } + r.Metrics.ObserveReconcile(pgmetrics.ControllerCluster, resultLabel, time.Since(start)) + r.FleetCollector.CollectClusterMetrics(ctx, r.Client, r.Metrics) + + return result, err +} + +func classifyError(err error) string { + switch { + case apierrors.IsNotFound(err): + return pgmetrics.ErrorClassNotFound + case apierrors.IsConflict(err): + return pgmetrics.ErrorClassConflict + case apierrors.IsInvalid(err): + return pgmetrics.ErrorClassValidation + default: + return pgmetrics.ErrorClassUnknown + } } // SetupWithManager registers the controller and owned resource watches. diff --git a/internal/controller/postgresdatabase_controller.go b/internal/controller/postgresdatabase_controller.go index 0c6db9628..32bd16dda 100644 --- a/internal/controller/postgresdatabase_controller.go +++ b/internal/controller/postgresdatabase_controller.go @@ -19,11 +19,13 @@ package controller import ( "context" "reflect" + "time" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" dbadapter "github.com/splunk/splunk-operator/pkg/postgresql/database/adapter" dbcore "github.com/splunk/splunk-operator/pkg/postgresql/database/core" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -42,8 +44,10 @@ import ( // PostgresDatabaseReconciler reconciles a PostgresDatabase object. 
type PostgresDatabaseReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + Metrics pgmetrics.Recorder + FleetCollector *pgmetrics.FleetCollector } const ( @@ -61,6 +65,7 @@ const ( //+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + start := time.Now() logger := log.FromContext(ctx) postgresDB := &enterprisev4.PostgresDatabase{} @@ -71,8 +76,20 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req } return ctrl.Result{}, err } - rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder} - return dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository) + rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} + result, err := dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository) + + resultLabel := pgmetrics.ResultSuccess + if err != nil { + resultLabel = pgmetrics.ResultError + r.Metrics.IncReconcileError(pgmetrics.ControllerDatabase, classifyError(err)) + } else if result.RequeueAfter > 0 || result.Requeue { + resultLabel = pgmetrics.ResultRequeue + } + r.Metrics.ObserveReconcile(pgmetrics.ControllerDatabase, resultLabel, time.Since(start)) + r.FleetCollector.CollectDatabaseMetrics(ctx, r.Client, r.Metrics) + + return result, err } // SetupWithManager sets up the controller with the Manager. diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go index 3334011c6..94eda71d3 100644 --- a/pkg/postgresql/cluster/core/cluster.go +++ b/pkg/postgresql/cluster/core/cluster.go @@ -24,6 +24,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" password "github.com/sethvargo/go-password/password" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -78,6 +79,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. } logger.Error(err, "Failed to handle finalizer") rc.emitWarning(postgresCluster, EventCleanupFailed, fmt.Sprintf("Cleanup failed: %v", err)) + rc.Metrics.IncFinalizerOp(pgmetrics.ControllerCluster, pgmetrics.ResultError) errs := []error{err} if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterDeleteFailed, fmt.Sprintf("Failed to delete resources during cleanup: %v", err), failedClusterPhase); statusErr != nil { @@ -111,6 +113,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
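Both reconcile wrappers above route errors through classifyError, which collapses the open-ended space of Kubernetes API errors into four stable error_class label values and keeps the counter's cardinality bounded. The mapping is easy to pin down with a table-driven test; a minimal sketch, with the test name and file placement assumed (note that the helper only survives until patch 2/4, which drops it in favor of controller-runtime's built-in error counter):

package controller

import (
	"errors"
	"testing"

	pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/validation/field"
)

func TestClassifyError(t *testing.T) {
	gr := schema.GroupResource{Group: "enterprise.splunk.com", Resource: "postgresclusters"}
	cases := map[string]struct {
		err  error
		want string
	}{
		"not found": {apierrors.NewNotFound(gr, "demo"), pgmetrics.ErrorClassNotFound},
		"conflict":  {apierrors.NewConflict(gr, "demo", errors.New("stale object")), pgmetrics.ErrorClassConflict},
		"invalid": {apierrors.NewInvalid(schema.GroupKind{Group: gr.Group, Kind: "PostgresCluster"},
			"demo", field.ErrorList{}), pgmetrics.ErrorClassValidation},
		"anything else": {errors.New("boom"), pgmetrics.ErrorClassUnknown},
	}
	for name, tc := range cases {
		if got := classifyError(tc.err); got != tc.want {
			t.Errorf("%s: classifyError() = %q, want %q", name, got, tc.want)
		}
	}
}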
if err := c.Get(ctx, client.ObjectKey{Name: postgresCluster.Spec.Class}, clusterClass); err != nil { logger.Error(err, "Failed to fetch PostgresClusterClass", "className", postgresCluster.Spec.Class) rc.emitWarning(postgresCluster, EventClusterClassNotFound, fmt.Sprintf("ClusterClass %s not found", postgresCluster.Spec.Class)) + rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonClassNotFound) if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterClassNotFound, fmt.Sprintf("ClusterClass %s not found: %v", postgresCluster.Spec.Class, err), failedClusterPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") @@ -123,6 +126,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. if err != nil { logger.Error(err, "Failed to merge PostgresCluster configuration") rc.emitWarning(postgresCluster, EventConfigMergeFailed, fmt.Sprintf("Failed to merge configuration: %v", err)) + rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonInvalidConfig) if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonInvalidConfiguration, fmt.Sprintf("Failed to merge configuration: %v", err), failedClusterPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") @@ -169,6 +173,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. return ctrl.Result{}, err } rc.emitNormal(postgresCluster, EventSecretReady, fmt.Sprintf("Superuser secret %s created", postgresSecretName)) + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceSecret, pgmetrics.OpCreate, pgmetrics.ResultSuccess) logger.Info("Superuser secret ref persisted to status") } @@ -223,6 +228,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. return ctrl.Result{}, err } rc.emitNormal(postgresCluster, EventClusterCreationStarted, "CNPG cluster created, waiting for healthy state") + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceCluster, pgmetrics.OpCreate, pgmetrics.ResultSuccess) if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterBuildSucceeded, "CNPG Cluster created", pendingClusterPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") @@ -267,6 +273,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. return ctrl.Result{Requeue: true}, nil } rc.emitNormal(postgresCluster, EventClusterUpdateStarted, "CNPG cluster spec updated, waiting for healthy state") + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceCluster, pgmetrics.OpUpdate, pgmetrics.ResultSuccess) logger.Info("CNPG Cluster patched, requeueing for status update", "name", cnpgCluster.Name) return ctrl.Result{RequeueAfter: retryDelay}, nil } @@ -276,6 +283,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
if err := reconcileManagedRoles(ctx, c, postgresCluster, cnpgCluster); err != nil { logger.Error(err, "Failed to reconcile managed roles") rc.emitWarning(postgresCluster, EventManagedRolesFailed, fmt.Sprintf("Failed to reconcile managed roles: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultError) if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonManagedRolesFailed, fmt.Sprintf("Failed to reconcile managed roles: %v", err), failedClusterPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") @@ -326,6 +334,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. if mergedConfig.CNPG == nil || mergedConfig.CNPG.ConnectionPooler == nil { logger.Info("Connection pooler enabled but no config found in class or cluster spec, skipping", "class", postgresCluster.Spec.Class, "cluster", postgresCluster.Name) + rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonPoolerConfigMissing) if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerConfigMissing, fmt.Sprintf("Connection pooler is enabled but no config found in class %q or cluster %q", postgresCluster.Spec.Class, postgresCluster.Name), failedClusterPhase); statusErr != nil { @@ -351,6 +360,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. return ctrl.Result{}, err } rc.emitNormal(postgresCluster, EventPoolerCreationStarted, "Connection poolers created, waiting for readiness") + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourcePooler, pgmetrics.OpCreate, pgmetrics.ResultSuccess) logger.Info("Connection pooler creation started, requeueing") if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerCreating, "Connection poolers are being provisioned", provisioningClusterPhase); statusErr != nil { @@ -433,9 +443,11 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
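The ConfigMap switch that follows maps controllerutil.OperationResult values onto the operation label by hand. If more CreateOrUpdate call sites grow metrics later, the same mapping could be factored out; a sketch under that assumption (the helper name is illustrative, not part of the patch):

package core

import (
	pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// opFromResult translates a CreateOrUpdate outcome into the metric's
// operation label; the second return is false when there is nothing to record.
func opFromResult(res controllerutil.OperationResult) (string, bool) {
	switch res {
	case controllerutil.OperationResultCreated:
		return pgmetrics.OpCreate, true
	case controllerutil.OperationResultUpdated:
		return pgmetrics.OpUpdate, true
	default:
		return "", false // OperationResultNone: nothing changed, record nothing
	}
}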
switch createOrUpdateResult { case controllerutil.OperationResultCreated: rc.emitNormal(postgresCluster, EventConfigMapReady, fmt.Sprintf("ConfigMap %s created", desiredCM.Name)) + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceConfigMap, pgmetrics.OpCreate, pgmetrics.ResultSuccess) logger.Info("ConfigMap created", "name", desiredCM.Name) case controllerutil.OperationResultUpdated: rc.emitNormal(postgresCluster, EventConfigMapReady, fmt.Sprintf("ConfigMap %s updated", desiredCM.Name)) + rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceConfigMap, pgmetrics.OpUpdate, pgmetrics.ResultSuccess) logger.Info("ConfigMap updated", "name", desiredCM.Name) default: logger.Info("ConfigMap unchanged", "name", desiredCM.Name) @@ -1062,6 +1074,7 @@ func handleFinalizer(ctx context.Context, rc *ReconcileContext, cluster *enterpr return fmt.Errorf("removing finalizer: %w", err) } rc.emitNormal(cluster, EventCleanupComplete, fmt.Sprintf("Cleanup complete (policy: %s)", policy)) + rc.Metrics.IncFinalizerOp(pgmetrics.ControllerCluster, pgmetrics.ResultSuccess) logger.Info("Finalizer removed, cleanup complete") return nil } diff --git a/pkg/postgresql/cluster/core/types.go b/pkg/postgresql/cluster/core/types.go index 042a5ae82..3f2edfa4d 100644 --- a/pkg/postgresql/cluster/core/types.go +++ b/pkg/postgresql/cluster/core/types.go @@ -4,6 +4,7 @@ import ( "time" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -17,6 +18,7 @@ type ReconcileContext struct { Client client.Client Scheme *runtime.Scheme Recorder record.EventRecorder + Metrics pgmetrics.Recorder } // normalizedCNPGClusterSpec is a subset of cnpgv1.ClusterSpec fields used for drift detection. 
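With Metrics now part of ReconcileContext, the core service calls rc.Metrics unconditionally, so every ReconcileContext built in unit tests must carry a non-nil recorder or the service will panic on the first metric call. The NoopRecorder added later in this patch exists for exactly that; a sketch of the test-side wiring (fake client contents and the test name are assumptions):

package core

import (
	"testing"

	pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"
	"k8s.io/client-go/tools/record"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestReconcileContextWiring(t *testing.T) {
	rc := &ReconcileContext{
		Client:   fake.NewClientBuilder().Build(),
		Recorder: record.NewFakeRecorder(16),
		Metrics:  &pgmetrics.NoopRecorder{}, // never leave this nil
	}
	if rc.Metrics == nil {
		t.Fatal("Metrics must be populated; the core service calls it without nil checks")
	}
}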
diff --git a/pkg/postgresql/database/core/database.go b/pkg/postgresql/database/core/database.go index f84a35fd9..3ac7fa254 100644 --- a/pkg/postgresql/database/core/database.go +++ b/pkg/postgresql/database/core/database.go @@ -11,6 +11,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" "github.com/sethvargo/go-password/password" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -51,6 +52,7 @@ func PostgresDatabaseService( if err := handleDeletion(ctx, rc, postgresDB); err != nil { logger.Error(err, "Failed to clean up PostgresDatabase") rc.emitWarning(postgresDB, EventCleanupFailed, fmt.Sprintf("Cleanup failed: %v", err)) + rc.Metrics.IncFinalizerOp(pgmetrics.ControllerDatabase, pgmetrics.ResultError) return ctrl.Result{}, err } return ctrl.Result{}, nil @@ -81,6 +83,7 @@ func PostgresDatabaseService( if err != nil { if errors.IsNotFound(err) { rc.emitWarning(postgresDB, EventClusterNotFound, fmt.Sprintf("PostgresCluster %s not found", postgresDB.Spec.ClusterRef.Name)) + rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonClusterNotFound) if err := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterNotFound, "Cluster CR not found", pendingDBPhase); err != nil { return ctrl.Result{}, err } @@ -98,6 +101,7 @@ func PostgresDatabaseService( switch clusterStatus { case ClusterNotReady, ClusterNoProvisionerRef: rc.emitWarning(postgresDB, EventClusterNotReady, "Referenced PostgresCluster is not ready yet") + rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonClusterNotReady) if err := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterProvisioning, "Cluster is not in ready state yet", pendingDBPhase); err != nil { return ctrl.Result{}, err } @@ -119,6 +123,7 @@ func PostgresDatabaseService( conflictErr := fmt.Errorf("role conflict detected: %s", strings.Join(roleConflicts, ", ")) logger.Error(conflictErr, conflictMsg) rc.emitWarning(postgresDB, EventRoleConflict, conflictMsg) + rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonRoleConflict) errs := []error{conflictErr} if statusErr := updateStatus(rolesReady, metav1.ConditionFalse, reasonRoleConflict, conflictMsg, failedDBPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") @@ -142,12 +147,14 @@ func PostgresDatabaseService( // CNPG rejects a PasswordSecretRef pointing at a missing secret. 
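Each user-management step below follows the same shape: IncUserAction with result "error" inside the failure branch, and a matching "success" call after it. That pairing is easy to get wrong as branches multiply; one way to keep both outcomes in a single call site is a small helper, sketched here with an assumed name and placement. The secret reconciliation described by the comment just before the next hunk would be its first caller:

package core

import pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"

// recordUserAction derives the result label from err, so success and
// error counts for an action can never drift apart.
func recordUserAction(m pgmetrics.Recorder, action string, err error) {
	result := pgmetrics.ResultSuccess
	if err != nil {
		result = pgmetrics.ResultError
	}
	m.IncUserAction(action, result)
}

A call such as recordUserAction(rc.Metrics, pgmetrics.ActionSecretReconcile, err) placed once after reconcileUserSecrets would then replace both of the existing call sites for that action.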
if err := reconcileUserSecrets(ctx, c, rc.Scheme, postgresDB); err != nil { rc.emitWarning(postgresDB, EventUserSecretsFailed, fmt.Sprintf("Failed to reconcile user secrets: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionSecretReconcile, pgmetrics.ResultError) if statusErr := updateStatus(secretsReady, metav1.ConditionFalse, reasonSecretsCreationFailed, fmt.Sprintf("Failed to reconcile user secrets: %v", err), provisioningDBPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") } return ctrl.Result{}, err } + rc.Metrics.IncUserAction(pgmetrics.ActionSecretReconcile, pgmetrics.ResultSuccess) rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, secretsReady, EventSecretsReady, fmt.Sprintf("All secrets provisioned for %d databases", len(postgresDB.Spec.Databases))) if err := updateStatus(secretsReady, metav1.ConditionTrue, reasonSecretsCreated, fmt.Sprintf("All secrets provisioned for %d databases", len(postgresDB.Spec.Databases)), provisioningDBPhase); err != nil { @@ -159,12 +166,14 @@ func PostgresDatabaseService( endpoints := resolveClusterEndpoints(cluster, cnpgCluster, postgresDB.Namespace) if err := reconcileRoleConfigMaps(ctx, c, rc.Scheme, postgresDB, endpoints); err != nil { rc.emitWarning(postgresDB, EventAccessConfigFailed, fmt.Sprintf("Failed to reconcile ConfigMaps: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionConfigMapReconcile, pgmetrics.ResultError) if statusErr := updateStatus(configMapsReady, metav1.ConditionFalse, reasonConfigMapsCreationFailed, fmt.Sprintf("Failed to reconcile ConfigMaps: %v", err), provisioningDBPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") } return ctrl.Result{}, err } + rc.Metrics.IncUserAction(pgmetrics.ActionConfigMapReconcile, pgmetrics.ResultSuccess) rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, configMapsReady, EventConfigMapsReady, fmt.Sprintf("All ConfigMaps provisioned for %d databases", len(postgresDB.Spec.Databases))) if err := updateStatus(configMapsReady, metav1.ConditionTrue, reasonConfigMapsCreated, fmt.Sprintf("All ConfigMaps provisioned for %d databases", len(postgresDB.Spec.Databases)), provisioningDBPhase); err != nil { @@ -186,9 +195,11 @@ func PostgresDatabaseService( if err := patchManagedRoles(ctx, c, postgresDB, cluster); err != nil { logger.Error(err, "Failed to patch users in CNPG Cluster") rc.emitWarning(postgresDB, EventManagedRolesPatchFailed, fmt.Sprintf("Failed to patch managed roles: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultError) return ctrl.Result{}, err } rc.emitNormal(postgresDB, EventRoleReconciliationStarted, fmt.Sprintf("Patched managed roles, waiting for %d roles to reconcile", len(desiredUsers))) + rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultSuccess) if err := updateStatus(rolesReady, metav1.ConditionFalse, reasonWaitingForCNPG, fmt.Sprintf("Waiting for %d roles to be reconciled", len(desiredUsers)), provisioningDBPhase); err != nil { return ctrl.Result{}, err @@ -223,6 +234,7 @@ func PostgresDatabaseService( if err != nil { logger.Error(err, "Failed to reconcile CNPG Databases") rc.emitWarning(postgresDB, EventDatabasesReconcileFailed, fmt.Sprintf("Failed to reconcile databases: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionDatabaseReconcile, pgmetrics.ResultError) return ctrl.Result{}, err } if len(adopted) > 0 { @@ -279,12 +291,14 @@ func PostgresDatabaseService( if err := reconcileRWRolePrivileges(ctx, endpoints.RWHost, 
string(pw), dbNames, newDBRepo); err != nil { rc.emitWarning(postgresDB, EventPrivilegesGrantFailed, fmt.Sprintf("Failed to grant RW role privileges: %v", err)) + rc.Metrics.IncUserAction(pgmetrics.ActionPrivilegeGrant, pgmetrics.ResultError) if statusErr := updateStatus(privilegesReady, metav1.ConditionFalse, reasonPrivilegesGrantFailed, fmt.Sprintf("Failed to grant RW role privileges: %v", err), provisioningDBPhase); statusErr != nil { logger.Error(statusErr, "Failed to update status") } return ctrl.Result{}, err } + rc.Metrics.IncUserAction(pgmetrics.ActionPrivilegeGrant, pgmetrics.ResultSuccess) rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, privilegesReady, EventPrivilegesReady, fmt.Sprintf("RW role privileges granted for all %d databases", len(postgresDB.Spec.Databases))) if err := updateStatus(privilegesReady, metav1.ConditionTrue, reasonPrivilegesGranted, fmt.Sprintf("RW role privileges granted for all %d databases", len(postgresDB.Spec.Databases)), readyDBPhase); err != nil { @@ -543,6 +557,7 @@ func handleDeletion(ctx context.Context, rc *ReconcileContext, postgresDB *enter return fmt.Errorf("removing finalizer: %w", err) } rc.emitNormal(postgresDB, EventCleanupComplete, fmt.Sprintf("Cleanup complete (%d retained, %d deleted)", len(plan.retained), len(plan.deleted))) + rc.Metrics.IncFinalizerOp(pgmetrics.ControllerDatabase, pgmetrics.ResultSuccess) logger.Info("Cleanup complete", "retained", len(plan.retained), "deleted", len(plan.deleted)) return nil } diff --git a/pkg/postgresql/database/core/database_unit_test.go b/pkg/postgresql/database/core/database_unit_test.go index 8d4da6c52..6140c0b2a 100644 --- a/pkg/postgresql/database/core/database_unit_test.go +++ b/pkg/postgresql/database/core/database_unit_test.go @@ -306,7 +306,7 @@ func TestVerifyRolesReady(t *testing.T) { }, }, }, - wantErr: "user main_db_rw reconciliation failed: [reserved role]", + wantErr: "reconciling user main_db_rw: [reserved role]", }, { name: "returns missing roles that are not reconciled yet", diff --git a/pkg/postgresql/database/core/types.go b/pkg/postgresql/database/core/types.go index bf07fd19f..8a5579a00 100644 --- a/pkg/postgresql/database/core/types.go +++ b/pkg/postgresql/database/core/types.go @@ -4,16 +4,18 @@ import ( "time" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" ) -// ReconcileContext bundles infrastructure dependencies injected by the controller +// ReconcileContext bundles infrastructure dependencies injected by the controller. type ReconcileContext struct { Client client.Client Scheme *runtime.Scheme Recorder record.EventRecorder + Metrics pgmetrics.Recorder } type reconcileDBPhases string diff --git a/pkg/postgresql/metrics/collector.go b/pkg/postgresql/metrics/collector.go new file mode 100644 index 000000000..2fa83179f --- /dev/null +++ b/pkg/postgresql/metrics/collector.go @@ -0,0 +1,130 @@ +package metrics + +import ( + "context" + "sync" + "time" + + enterprisev4 "github.com/splunk/splunk-operator/api/v4" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const fleetCollectInterval = 2 * time.Second + +// FleetCollector recomputes fleet-state gauges from the K8s API (informer cache). +// It is rate-limited to avoid redundant work during burst reconciles. 
+// Each resource type has its own timestamp so they don't starve each other. +type FleetCollector struct { + mu sync.Mutex + lastClusterCollect time.Time + lastDatabaseCollect time.Time +} + +// NewFleetCollector returns a new FleetCollector. +func NewFleetCollector() *FleetCollector { + return &FleetCollector{} +} + +// CollectClusterMetrics lists all PostgresCluster resources and updates phase +// gauges, pooler gauges, and managed-user gauges. Skips if called within 2s +// of the last collection. +func (fc *FleetCollector) CollectClusterMetrics(ctx context.Context, c client.Client, recorder Recorder) { + if !fc.shouldCollectCluster() { + return + } + + logger := log.FromContext(ctx) + + var list enterprisev4.PostgresClusterList + if err := c.List(ctx, &list); err != nil { + logger.Error(err, "Failed to list PostgresClusters for fleet metrics") + return + } + + phases := make(map[string]float64) + var poolerEnabledCount float64 + managedUserStates := map[string]float64{ + "desired": 0, + "reconciled": 0, + "pending": 0, + "failed": 0, + } + + for i := range list.Items { + cluster := &list.Items[i] + + // Phase gauge. + phase := "Unknown" + if cluster.Status.Phase != nil { + phase = *cluster.Status.Phase + } + phases[phase]++ + + // Pooler-enabled count. + if cluster.Spec.ConnectionPoolerEnabled != nil && *cluster.Spec.ConnectionPoolerEnabled { + poolerEnabledCount++ + } + + // Managed users. + managedUserStates["desired"] += float64(len(cluster.Spec.ManagedRoles)) + if cluster.Status.ManagedRolesStatus != nil { + managedUserStates["reconciled"] += float64(len(cluster.Status.ManagedRolesStatus.Reconciled)) + managedUserStates["pending"] += float64(len(cluster.Status.ManagedRolesStatus.Pending)) + managedUserStates["failed"] += float64(len(cluster.Status.ManagedRolesStatus.Failed)) + } + } + + recorder.SetClusterPhases(phases, poolerEnabledCount) + recorder.SetManagedUsers(ControllerCluster, managedUserStates) +} + +// CollectDatabaseMetrics lists all PostgresDatabase resources and updates +// phase gauges. Skips if called within 2s of the last collection. +func (fc *FleetCollector) CollectDatabaseMetrics(ctx context.Context, c client.Client, recorder Recorder) { + if !fc.shouldCollectDatabase() { + return + } + + logger := log.FromContext(ctx) + + var list enterprisev4.PostgresDatabaseList + if err := c.List(ctx, &list); err != nil { + logger.Error(err, "Failed to list PostgresDatabases for fleet metrics") + return + } + + phases := make(map[string]float64) + for i := range list.Items { + db := &list.Items[i] + phase := "Unknown" + if db.Status.Phase != nil { + phase = *db.Status.Phase + } + phases[phase]++ + } + + recorder.SetDatabasePhases(phases) +} + +func (fc *FleetCollector) shouldCollectCluster() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + now := time.Now() + if now.Sub(fc.lastClusterCollect) < fleetCollectInterval { + return false + } + fc.lastClusterCollect = now + return true +} + +func (fc *FleetCollector) shouldCollectDatabase() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + now := time.Now() + if now.Sub(fc.lastDatabaseCollect) < fleetCollectInterval { + return false + } + fc.lastDatabaseCollect = now + return true +} diff --git a/pkg/postgresql/metrics/noop.go b/pkg/postgresql/metrics/noop.go new file mode 100644 index 000000000..ea4e3a4b2 --- /dev/null +++ b/pkg/postgresql/metrics/noop.go @@ -0,0 +1,22 @@ +package metrics + +import "time" + +// NoopRecorder is a no-op implementation of Recorder for use in tests. 
+type NoopRecorder struct{} + +func (n *NoopRecorder) ObserveReconcile(string, string, time.Duration) {} +func (n *NoopRecorder) IncReconcileError(string, string) {} +func (n *NoopRecorder) IncRequeue(string, string) {} +func (n *NoopRecorder) IncValidationFailure(string, string) {} +func (n *NoopRecorder) SetClusterPhases(map[string]float64, float64) {} +func (n *NoopRecorder) SetDatabasePhases(map[string]float64) {} +func (n *NoopRecorder) SetManagedUsers(string, map[string]float64) {} +func (n *NoopRecorder) IncUserAction(string, string) {} +func (n *NoopRecorder) SetPoolers(string, string, float64) {} +func (n *NoopRecorder) SetPoolerInstances(string, float64) {} +func (n *NoopRecorder) IncFinalizerOp(string, string) {} +func (n *NoopRecorder) IncOwnedResourceOp(string, string, string, string) {} + +// Compile-time interface check. +var _ Recorder = (*NoopRecorder)(nil) diff --git a/pkg/postgresql/metrics/ports.go b/pkg/postgresql/metrics/ports.go new file mode 100644 index 000000000..fdc1d17e2 --- /dev/null +++ b/pkg/postgresql/metrics/ports.go @@ -0,0 +1,102 @@ +package metrics + +import "time" + +// Reconcile result labels. +const ( + ResultSuccess = "success" + ResultError = "error" + ResultRequeue = "requeue" +) + +// Error class labels for IncReconcileError. +const ( + ErrorClassNotFound = "not_found" + ErrorClassConflict = "conflict" + ErrorClassValidation = "validation" + ErrorClassUnknown = "unknown" +) + +// Controller name labels. +const ( + ControllerCluster = "postgrescluster" + ControllerDatabase = "postgresdatabase" +) + +// Validation failure reason labels. +const ( + ReasonClassNotFound = "class_not_found" + ReasonInvalidConfig = "invalid_configuration" + ReasonClusterNotFound = "cluster_not_found" + ReasonClusterNotReady = "cluster_not_ready" + ReasonRoleConflict = "role_conflict" + ReasonPoolerConfigMissing = "pooler_config_missing" +) + +// User action labels. +const ( + ActionSecretReconcile = "secret_reconcile" + ActionConfigMapReconcile = "configmap_reconcile" + ActionRolePatch = "role_patch" + ActionDatabaseReconcile = "database_reconcile" + ActionPrivilegeGrant = "privilege_grant" +) + +// Owned resource operation labels. +const ( + OpCreate = "create" + OpUpdate = "update" +) + +// Owned resource kind labels. +const ( + ResourceSecret = "Secret" + ResourceCluster = "Cluster" + ResourcePooler = "Pooler" + ResourceConfigMap = "ConfigMap" + ResourceDatabase = "Database" +) + +// Recorder is the port for all PostgreSQL controller metrics. +// Core service packages depend on this interface, never on Prometheus directly. +// Adapters (PrometheusRecorder, NoopRecorder) live in this package. +type Recorder interface { + // ObserveReconcile records a completed reconciliation attempt with its + // outcome and duration. Called by the controller shell after the service returns. + ObserveReconcile(controller string, result string, duration time.Duration) + + // IncReconcileError increments the error counter with a stable error class. + IncReconcileError(controller string, errorClass string) + + // IncRequeue increments the requeue counter with a reason. + IncRequeue(controller string, reason string) + + // IncValidationFailure records a validation or configuration failure. + IncValidationFailure(controller string, reason string) + + // SetClusterPhases sets gauge values for cluster counts by phase. + // The phases map keys are phase strings (Ready, Pending, etc.) with counts as values. 
+ SetClusterPhases(phases map[string]float64, poolerEnabledCount float64) + + // SetDatabasePhases sets gauge values for database counts by phase. + SetDatabasePhases(phases map[string]float64) + + // SetManagedUsers sets the gauge for managed user states. + // The states map keys are state strings (desired, reconciled, pending, failed). + SetManagedUsers(controller string, states map[string]float64) + + // IncUserAction increments the user action counter. + IncUserAction(action string, result string) + + // SetPoolers sets pooler gauge values by type and state. + SetPoolers(poolerType string, state string, count float64) + + // SetPoolerInstances sets pooler instance gauge by type. + SetPoolerInstances(poolerType string, count float64) + + // IncFinalizerOp increments the finalizer operations counter. + IncFinalizerOp(controller string, result string) + + // IncOwnedResourceOp increments the owned resource operations counter. + IncOwnedResourceOp(controller string, resourceKind string, operation string, result string) +} diff --git a/pkg/postgresql/metrics/prometheus.go b/pkg/postgresql/metrics/prometheus.go new file mode 100644 index 000000000..0591628ac --- /dev/null +++ b/pkg/postgresql/metrics/prometheus.go @@ -0,0 +1,175 @@ +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_reconcile_total", + Help: "Total reconcile attempts for PostgreSQL controllers.", + }, []string{"controller", "result"}) + + reconcileDurationSeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "splunk_operator_postgres_reconcile_duration_seconds", + Help: "End-to-end reconcile duration for PostgreSQL controllers.", + Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 30}, + }, []string{"controller", "result"}) + + reconcileErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_reconcile_errors_total", + Help: "Reconcile failures grouped by stable error class.", + }, []string{"controller", "error_class"}) + + reconcileRequeuesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_reconcile_requeues_total", + Help: "Requeues caused by waiting, conflicts, or dependency state.", + }, []string{"controller", "reason"}) + + validationFailuresTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_validation_failures_total", + Help: "Validation and configuration failures.", + }, []string{"controller", "reason"}) + + clusters = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_clusters", + Help: "Current number of PostgresCluster resources by status phase.", + }, []string{"phase", "pooler_enabled"}) + + databases = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_databases", + Help: "Current number of PostgresDatabase resources by status phase.", + }, []string{"phase"}) + + managedUsers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_managed_users", + Help: "Current counts of managed users by state.", + }, []string{"controller", "state"}) + + userActionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_user_actions_total", + Help: "User-management actions such as secret reconcile, role patch, privilege grant.", + }, []string{"action", "result"}) + + poolers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + 
Name: "splunk_operator_postgres_poolers", + Help: "Current number of PgBouncer poolers by type and readiness state.", + }, []string{"type", "state"}) + + poolerInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_pooler_instances", + Help: "Current observed pooler instance count.", + }, []string{"type"}) + + finalizerOperationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_finalizer_operations_total", + Help: "Finalizer success and cleanup failures.", + }, []string{"controller", "result"}) + + ownedResourceOperationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_owned_resource_operations_total", + Help: "Create/update/delete outcomes for owned resources.", + }, []string{"controller", "resource_kind", "operation", "result"}) + + allCollectors = []prometheus.Collector{ + reconcileTotal, + reconcileDurationSeconds, + reconcileErrorsTotal, + reconcileRequeuesTotal, + validationFailuresTotal, + clusters, + databases, + managedUsers, + userActionsTotal, + poolers, + poolerInstances, + finalizerOperationsTotal, + ownedResourceOperationsTotal, + } +) + +// Register registers all PostgreSQL metrics with the given registerer. +// Call once at startup from cmd/main.go. +func Register(registerer prometheus.Registerer) error { + for _, c := range allCollectors { + if err := registerer.Register(c); err != nil { + return err + } + } + return nil +} + +// PrometheusRecorder implements Recorder using Prometheus client_golang. +type PrometheusRecorder struct{} + +// NewPrometheusRecorder returns a new PrometheusRecorder. +func NewPrometheusRecorder() *PrometheusRecorder { + return &PrometheusRecorder{} +} + +func (p *PrometheusRecorder) ObserveReconcile(controller string, result string, duration time.Duration) { + reconcileTotal.WithLabelValues(controller, result).Inc() + reconcileDurationSeconds.WithLabelValues(controller, result).Observe(duration.Seconds()) +} + +func (p *PrometheusRecorder) IncReconcileError(controller string, errorClass string) { + reconcileErrorsTotal.WithLabelValues(controller, errorClass).Inc() +} + +func (p *PrometheusRecorder) IncRequeue(controller string, reason string) { + reconcileRequeuesTotal.WithLabelValues(controller, reason).Inc() +} + +func (p *PrometheusRecorder) IncValidationFailure(controller string, reason string) { + validationFailuresTotal.WithLabelValues(controller, reason).Inc() +} + +func (p *PrometheusRecorder) SetClusterPhases(phases map[string]float64, poolerEnabledCount float64) { + // Reset all phase gauges before setting new values to avoid stale entries. + clusters.Reset() + for phase, count := range phases { + clusters.WithLabelValues(phase, "false").Set(count) + } + // pooler_enabled count is tracked separately — it's a cross-cutting dimension. 
+ if poolerEnabledCount > 0 { + clusters.WithLabelValues("", "true").Set(poolerEnabledCount) + } +} + +func (p *PrometheusRecorder) SetDatabasePhases(phases map[string]float64) { + databases.Reset() + for phase, count := range phases { + databases.WithLabelValues(phase).Set(count) + } +} + +func (p *PrometheusRecorder) SetManagedUsers(controller string, states map[string]float64) { + for state, count := range states { + managedUsers.WithLabelValues(controller, state).Set(count) + } +} + +func (p *PrometheusRecorder) IncUserAction(action string, result string) { + userActionsTotal.WithLabelValues(action, result).Inc() +} + +func (p *PrometheusRecorder) SetPoolers(poolerType string, state string, count float64) { + poolers.WithLabelValues(poolerType, state).Set(count) +} + +func (p *PrometheusRecorder) SetPoolerInstances(poolerType string, count float64) { + poolerInstances.WithLabelValues(poolerType).Set(count) +} + +func (p *PrometheusRecorder) IncFinalizerOp(controller string, result string) { + finalizerOperationsTotal.WithLabelValues(controller, result).Inc() +} + +func (p *PrometheusRecorder) IncOwnedResourceOp(controller string, resourceKind string, operation string, result string) { + ownedResourceOperationsTotal.WithLabelValues(controller, resourceKind, operation, result).Inc() +} + +// Compile-time interface check. +var _ Recorder = (*PrometheusRecorder)(nil) From a6a55765720b94cbeffb2213967a0632b2cd6417 Mon Sep 17 00:00:00 2001 From: Kamil Ubych Date: Fri, 3 Apr 2026 10:49:10 +0200 Subject: [PATCH 2/4] use Controller-runtime standard metrics --- .../controller/postgrescluster_controller.go | 26 ------------ .../controller/postgresdatabase_controller.go | 11 ----- pkg/postgresql/metrics/noop.go | 5 --- pkg/postgresql/metrics/ports.go | 28 +++---------- pkg/postgresql/metrics/prometheus.go | 40 ------------------- 5 files changed, 6 insertions(+), 104 deletions(-) diff --git a/internal/controller/postgrescluster_controller.go b/internal/controller/postgrescluster_controller.go index c25ea27d3..abfebdf37 100644 --- a/internal/controller/postgrescluster_controller.go +++ b/internal/controller/postgrescluster_controller.go @@ -18,7 +18,6 @@ package controller import ( "context" - "time" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" @@ -26,7 +25,6 @@ import ( pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -62,36 +60,12 @@ type PostgresClusterReconciler struct { // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *PostgresClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - start := time.Now() rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} result, err := clustercore.PostgresClusterService(ctx, rc, req) - - resultLabel := pgmetrics.ResultSuccess - if err != nil { - resultLabel = pgmetrics.ResultError - r.Metrics.IncReconcileError(pgmetrics.ControllerCluster, classifyError(err)) - } else if result.RequeueAfter > 0 || result.Requeue { - resultLabel = pgmetrics.ResultRequeue - } - r.Metrics.ObserveReconcile(pgmetrics.ControllerCluster, resultLabel, time.Since(start)) r.FleetCollector.CollectClusterMetrics(ctx, r.Client, r.Metrics) - return result, 
err } -func classifyError(err error) string { - switch { - case apierrors.IsNotFound(err): - return pgmetrics.ErrorClassNotFound - case apierrors.IsConflict(err): - return pgmetrics.ErrorClassConflict - case apierrors.IsInvalid(err): - return pgmetrics.ErrorClassValidation - default: - return pgmetrics.ErrorClassUnknown - } -} - // SetupWithManager registers the controller and owned resource watches. func (r *PostgresClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). diff --git a/internal/controller/postgresdatabase_controller.go b/internal/controller/postgresdatabase_controller.go index 32bd16dda..e1a68ef02 100644 --- a/internal/controller/postgresdatabase_controller.go +++ b/internal/controller/postgresdatabase_controller.go @@ -19,7 +19,6 @@ package controller import ( "context" "reflect" - "time" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" @@ -65,7 +64,6 @@ const ( //+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - start := time.Now() logger := log.FromContext(ctx) postgresDB := &enterprisev4.PostgresDatabase{} @@ -78,15 +76,6 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req } rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} result, err := dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository) - - resultLabel := pgmetrics.ResultSuccess - if err != nil { - resultLabel = pgmetrics.ResultError - r.Metrics.IncReconcileError(pgmetrics.ControllerDatabase, classifyError(err)) - } else if result.RequeueAfter > 0 || result.Requeue { - resultLabel = pgmetrics.ResultRequeue - } - r.Metrics.ObserveReconcile(pgmetrics.ControllerDatabase, resultLabel, time.Since(start)) r.FleetCollector.CollectDatabaseMetrics(ctx, r.Client, r.Metrics) return result, err diff --git a/pkg/postgresql/metrics/noop.go b/pkg/postgresql/metrics/noop.go index ea4e3a4b2..c82a23b41 100644 --- a/pkg/postgresql/metrics/noop.go +++ b/pkg/postgresql/metrics/noop.go @@ -1,13 +1,8 @@ package metrics -import "time" - // NoopRecorder is a no-op implementation of Recorder for use in tests. type NoopRecorder struct{} -func (n *NoopRecorder) ObserveReconcile(string, string, time.Duration) {} -func (n *NoopRecorder) IncReconcileError(string, string) {} -func (n *NoopRecorder) IncRequeue(string, string) {} func (n *NoopRecorder) IncValidationFailure(string, string) {} func (n *NoopRecorder) SetClusterPhases(map[string]float64, float64) {} func (n *NoopRecorder) SetDatabasePhases(map[string]float64) {} diff --git a/pkg/postgresql/metrics/ports.go b/pkg/postgresql/metrics/ports.go index fdc1d17e2..115767cd4 100644 --- a/pkg/postgresql/metrics/ports.go +++ b/pkg/postgresql/metrics/ports.go @@ -1,20 +1,9 @@ package metrics -import "time" - -// Reconcile result labels. +// Result labels for counters that track success/error outcomes. const ( ResultSuccess = "success" ResultError = "error" - ResultRequeue = "requeue" -) - -// Error class labels for IncReconcileError. -const ( - ErrorClassNotFound = "not_found" - ErrorClassConflict = "conflict" - ErrorClassValidation = "validation" - ErrorClassUnknown = "unknown" ) // Controller name labels. @@ -60,17 +49,12 @@ const ( // Recorder is the port for all PostgreSQL controller metrics. 
// Core service packages depend on this interface, never on Prometheus directly. // Adapters (PrometheusRecorder, NoopRecorder) live in this package. +// +// Reconcile-level metrics (total count, duration, error count) are handled +// automatically by controller-runtime — see controller_runtime_reconcile_total, +// controller_runtime_reconcile_time_seconds, controller_runtime_reconcile_errors_total. +// This interface covers domain-specific metrics only. type Recorder interface { - // ObserveReconcile records a completed reconciliation attempt with its - // outcome and duration. Called by the controller shell after the service returns. - ObserveReconcile(controller string, result string, duration time.Duration) - - // IncReconcileError increments the error counter with a stable error class. - IncReconcileError(controller string, errorClass string) - - // IncRequeue increments the requeue counter with a reason. - IncRequeue(controller string, reason string) - // IncValidationFailure records a validation or configuration failure. IncValidationFailure(controller string, reason string) diff --git a/pkg/postgresql/metrics/prometheus.go b/pkg/postgresql/metrics/prometheus.go index 0591628ac..f591099fe 100644 --- a/pkg/postgresql/metrics/prometheus.go +++ b/pkg/postgresql/metrics/prometheus.go @@ -1,33 +1,10 @@ package metrics import ( - "time" - "github.com/prometheus/client_golang/prometheus" ) var ( - reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "splunk_operator_postgres_reconcile_total", - Help: "Total reconcile attempts for PostgreSQL controllers.", - }, []string{"controller", "result"}) - - reconcileDurationSeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "splunk_operator_postgres_reconcile_duration_seconds", - Help: "End-to-end reconcile duration for PostgreSQL controllers.", - Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 30}, - }, []string{"controller", "result"}) - - reconcileErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "splunk_operator_postgres_reconcile_errors_total", - Help: "Reconcile failures grouped by stable error class.", - }, []string{"controller", "error_class"}) - - reconcileRequeuesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "splunk_operator_postgres_reconcile_requeues_total", - Help: "Requeues caused by waiting, conflicts, or dependency state.", - }, []string{"controller", "reason"}) - validationFailuresTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "splunk_operator_postgres_validation_failures_total", Help: "Validation and configuration failures.", @@ -74,10 +51,6 @@ var ( }, []string{"controller", "resource_kind", "operation", "result"}) allCollectors = []prometheus.Collector{ - reconcileTotal, - reconcileDurationSeconds, - reconcileErrorsTotal, - reconcileRequeuesTotal, validationFailuresTotal, clusters, databases, @@ -109,19 +82,6 @@ func NewPrometheusRecorder() *PrometheusRecorder { return &PrometheusRecorder{} } -func (p *PrometheusRecorder) ObserveReconcile(controller string, result string, duration time.Duration) { - reconcileTotal.WithLabelValues(controller, result).Inc() - reconcileDurationSeconds.WithLabelValues(controller, result).Observe(duration.Seconds()) -} - -func (p *PrometheusRecorder) IncReconcileError(controller string, errorClass string) { - reconcileErrorsTotal.WithLabelValues(controller, errorClass).Inc() -} - -func (p *PrometheusRecorder) IncRequeue(controller string, reason string) { - 
reconcileRequeuesTotal.WithLabelValues(controller, reason).Inc() -} - func (p *PrometheusRecorder) IncValidationFailure(controller string, reason string) { validationFailuresTotal.WithLabelValues(controller, reason).Inc() } From 68b423eaa852d4632486c608e3ee8697c25d23d3 Mon Sep 17 00:00:00 2001 From: Kamil Ubych Date: Tue, 7 Apr 2026 09:44:54 +0200 Subject: [PATCH 3/4] no rate limiting on fleet --- pkg/postgresql/metrics/collector.go | 41 +---------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/pkg/postgresql/metrics/collector.go b/pkg/postgresql/metrics/collector.go index 2fa83179f..b590f1ad4 100644 --- a/pkg/postgresql/metrics/collector.go +++ b/pkg/postgresql/metrics/collector.go @@ -2,24 +2,14 @@ package metrics import ( "context" - "sync" - "time" enterprisev4 "github.com/splunk/splunk-operator/api/v4" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) -const fleetCollectInterval = 2 * time.Second - // FleetCollector recomputes fleet-state gauges from the K8s API (informer cache). -// It is rate-limited to avoid redundant work during burst reconciles. -// Each resource type has its own timestamp so they don't starve each other. -type FleetCollector struct { - mu sync.Mutex - lastClusterCollect time.Time - lastDatabaseCollect time.Time -} +type FleetCollector struct{} // NewFleetCollector returns a new FleetCollector. func NewFleetCollector() *FleetCollector { @@ -30,10 +20,6 @@ func NewFleetCollector() *FleetCollector { // CollectClusterMetrics lists all PostgresCluster resources and updates phase -// gauges, pooler gauges, and managed-user gauges. Skips if called within 2s -// of the last collection. +// gauges, pooler gauges, and managed-user gauges. func (fc *FleetCollector) CollectClusterMetrics(ctx context.Context, c client.Client, recorder Recorder) { - if !fc.shouldCollectCluster() { - return - } - logger := log.FromContext(ctx) var list enterprisev4.PostgresClusterList @@ -82,10 +68,6 @@ func (fc *FleetCollector) CollectClusterMetrics(ctx context.Context, c client.Cl // CollectDatabaseMetrics lists all PostgresDatabase resources and updates -// phase gauges. Skips if called within 2s of the last collection. +// phase gauges.
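With the rate limiting gone, CollectClusterMetrics now runs a cache-backed List on every reconcile, so its behavior is worth pinning down against a fake client. A sketch of such a test; the scheme helper enterprisev4.AddToScheme and the capture type are assumptions:

package metrics_test

import (
	"context"
	"testing"

	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
	pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// captureRecorder embeds the no-op adapter and overrides only the method
// the test asserts on.
type captureRecorder struct {
	pgmetrics.NoopRecorder
	phases map[string]float64
}

func (c *captureRecorder) SetClusterPhases(phases map[string]float64, poolerEnabled float64) {
	c.phases = phases
}

func TestCollectClusterMetrics(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := enterprisev4.AddToScheme(scheme); err != nil { // assumed generated helper
		t.Fatal(err)
	}
	ready := "Ready"
	cluster := &enterprisev4.PostgresCluster{}
	cluster.Name = "demo"
	cluster.Status.Phase = &ready

	c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build()
	rec := &captureRecorder{}
	pgmetrics.NewFleetCollector().CollectClusterMetrics(context.Background(), c, rec)

	if rec.phases["Ready"] != 1 {
		t.Fatalf("want 1 Ready cluster, got %v", rec.phases)
	}
}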
func (fc *FleetCollector) CollectDatabaseMetrics(ctx context.Context, c client.Client, recorder Recorder) { - if !fc.shouldCollectDatabase() { - return - } - logger := log.FromContext(ctx) var list enterprisev4.PostgresDatabaseList @@ -107,24 +89,3 @@ func (fc *FleetCollector) CollectDatabaseMetrics(ctx context.Context, c client.C recorder.SetDatabasePhases(phases) } -func (fc *FleetCollector) shouldCollectCluster() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - now := time.Now() - if now.Sub(fc.lastClusterCollect) < fleetCollectInterval { - return false - } - fc.lastClusterCollect = now - return true -} - -func (fc *FleetCollector) shouldCollectDatabase() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - now := time.Now() - if now.Sub(fc.lastDatabaseCollect) < fleetCollectInterval { - return false - } - fc.lastDatabaseCollect = now - return true -} From b605d5dd24d0f6f7d285989ad0874735754c99f9 Mon Sep 17 00:00:00 2001 From: Kamil Ubych Date: Tue, 7 Apr 2026 10:51:45 +0200 Subject: [PATCH 4/4] metrics labeled by errors --- pkg/postgresql/cluster/core/cluster.go | 32 ++++------ pkg/postgresql/database/core/database.go | 27 ++++---- .../database/core/database_unit_test.go | 2 + pkg/postgresql/database/core/types.go | 7 ++- pkg/postgresql/metrics/noop.go | 15 ++--- pkg/postgresql/metrics/ports.go | 61 +++---------------- pkg/postgresql/metrics/prometheus.go | 46 +++----------- 7 files changed, 48 insertions(+), 142 deletions(-) diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go index 94eda71d3..43a8b82a3 100644 --- a/pkg/postgresql/cluster/core/cluster.go +++ b/pkg/postgresql/cluster/core/cluster.go @@ -68,7 +68,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. ctx = log.IntoContext(ctx, logger) updateStatus := func(conditionType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error { - return setStatus(ctx, c, postgresCluster, conditionType, status, reason, message, phase) + return setStatus(ctx, c, rc.Metrics, postgresCluster, conditionType, status, reason, message, phase) } // Finalizer handling must come before any other processing. @@ -79,7 +79,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. } logger.Error(err, "Failed to handle finalizer") rc.emitWarning(postgresCluster, EventCleanupFailed, fmt.Sprintf("Cleanup failed: %v", err)) - rc.Metrics.IncFinalizerOp(pgmetrics.ControllerCluster, pgmetrics.ResultError) errs := []error{err} if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterDeleteFailed, fmt.Sprintf("Failed to delete resources during cleanup: %v", err), failedClusterPhase); statusErr != nil { @@ -113,7 +112,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
 pkg/postgresql/cluster/core/cluster.go   | 32 ++++------
 pkg/postgresql/database/core/database.go | 27 ++++----
 .../database/core/database_unit_test.go  |  2 +
 pkg/postgresql/database/core/types.go    |  7 ++-
 pkg/postgresql/metrics/noop.go           | 15 ++---
 pkg/postgresql/metrics/ports.go          | 61 +++----------
 pkg/postgresql/metrics/prometheus.go     | 46 +++--------
 7 files changed, 48 insertions(+), 142 deletions(-)

diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go
index 94eda71d3..43a8b82a3 100644
--- a/pkg/postgresql/cluster/core/cluster.go
+++ b/pkg/postgresql/cluster/core/cluster.go
@@ -68,7 +68,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	ctx = log.IntoContext(ctx, logger)
 
 	updateStatus := func(conditionType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
-		return setStatus(ctx, c, postgresCluster, conditionType, status, reason, message, phase)
+		return setStatus(ctx, c, rc.Metrics, postgresCluster, conditionType, status, reason, message, phase)
 	}
 
 	// Finalizer handling must come before any other processing.
@@ -79,7 +79,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 		}
 		logger.Error(err, "Failed to handle finalizer")
 		rc.emitWarning(postgresCluster, EventCleanupFailed, fmt.Sprintf("Cleanup failed: %v", err))
-		rc.Metrics.IncFinalizerOp(pgmetrics.ControllerCluster, pgmetrics.ResultError)
 		errs := []error{err}
 		if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterDeleteFailed,
 			fmt.Sprintf("Failed to delete resources during cleanup: %v", err), failedClusterPhase); statusErr != nil {
@@ -113,7 +112,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	if err := c.Get(ctx, client.ObjectKey{Name: postgresCluster.Spec.Class}, clusterClass); err != nil {
 		logger.Error(err, "Failed to fetch PostgresClusterClass", "className", postgresCluster.Spec.Class)
 		rc.emitWarning(postgresCluster, EventClusterClassNotFound, fmt.Sprintf("ClusterClass %s not found", postgresCluster.Spec.Class))
-		rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonClassNotFound)
 		if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterClassNotFound,
 			fmt.Sprintf("ClusterClass %s not found: %v", postgresCluster.Spec.Class, err), failedClusterPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
@@ -126,7 +124,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	if err != nil {
 		logger.Error(err, "Failed to merge PostgresCluster configuration")
 		rc.emitWarning(postgresCluster, EventConfigMergeFailed, fmt.Sprintf("Failed to merge configuration: %v", err))
-		rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonInvalidConfig)
 		if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonInvalidConfiguration,
 			fmt.Sprintf("Failed to merge configuration: %v", err), failedClusterPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
@@ -173,7 +170,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 			return ctrl.Result{}, err
 		}
 		rc.emitNormal(postgresCluster, EventSecretReady, fmt.Sprintf("Superuser secret %s created", postgresSecretName))
-		rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceSecret, pgmetrics.OpCreate, pgmetrics.ResultSuccess)
 		logger.Info("Superuser secret ref persisted to status")
 	}
 
@@ -228,7 +224,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 			return ctrl.Result{}, err
 		}
 		rc.emitNormal(postgresCluster, EventClusterCreationStarted, "CNPG cluster created, waiting for healthy state")
-		rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceCluster, pgmetrics.OpCreate, pgmetrics.ResultSuccess)
 		if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterBuildSucceeded,
 			"CNPG Cluster created", pendingClusterPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
@@ -273,7 +268,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 			return ctrl.Result{Requeue: true}, nil
 		}
 		rc.emitNormal(postgresCluster, EventClusterUpdateStarted, "CNPG cluster spec updated, waiting for healthy state")
-		rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceCluster, pgmetrics.OpUpdate, pgmetrics.ResultSuccess)
 		logger.Info("CNPG Cluster patched, requeueing for status update", "name", cnpgCluster.Name)
 		return ctrl.Result{RequeueAfter: retryDelay}, nil
 	}
@@ -283,7 +277,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	if err := reconcileManagedRoles(ctx, c, postgresCluster, cnpgCluster); err != nil {
 		logger.Error(err, "Failed to reconcile managed roles")
 		rc.emitWarning(postgresCluster, EventManagedRolesFailed, fmt.Sprintf("Failed to reconcile managed roles: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultError)
 		if statusErr := updateStatus(clusterReady, metav1.ConditionFalse, reasonManagedRolesFailed,
 			fmt.Sprintf("Failed to reconcile managed roles: %v", err), failedClusterPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
@@ -334,7 +327,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 		if mergedConfig.CNPG == nil || mergedConfig.CNPG.ConnectionPooler == nil {
 			logger.Info("Connection pooler enabled but no config found in class or cluster spec, skipping",
 				"class", postgresCluster.Spec.Class, "cluster", postgresCluster.Name)
-			rc.Metrics.IncValidationFailure(pgmetrics.ControllerCluster, pgmetrics.ReasonPoolerConfigMissing)
 			if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerConfigMissing,
 				fmt.Sprintf("Connection pooler is enabled but no config found in class %q or cluster %q",
 					postgresCluster.Spec.Class, postgresCluster.Name), failedClusterPhase); statusErr != nil {
@@ -360,7 +352,6 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 				return ctrl.Result{}, err
 			}
 			rc.emitNormal(postgresCluster, EventPoolerCreationStarted, "Connection poolers created, waiting for readiness")
-			rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourcePooler, pgmetrics.OpCreate, pgmetrics.ResultSuccess)
 			logger.Info("Connection pooler creation started, requeueing")
 			if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerCreating,
 				"Connection poolers are being provisioned", provisioningClusterPhase); statusErr != nil {
@@ -394,7 +385,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 		default:
 			oldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions))
 			copy(oldConditions, postgresCluster.Status.Conditions)
-			if err := syncPoolerStatus(ctx, c, postgresCluster); err != nil {
+			if err := syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster); err != nil {
 				logger.Error(err, "Failed to sync pooler status")
 				rc.emitWarning(postgresCluster, EventPoolerReconcileFailed, fmt.Sprintf("Failed to sync pooler status: %v", err))
 				if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerReconciliationFailed,
@@ -443,11 +434,9 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	switch createOrUpdateResult {
 	case controllerutil.OperationResultCreated:
 		rc.emitNormal(postgresCluster, EventConfigMapReady, fmt.Sprintf("ConfigMap %s created", desiredCM.Name))
-		rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceConfigMap, pgmetrics.OpCreate, pgmetrics.ResultSuccess)
 		logger.Info("ConfigMap created", "name", desiredCM.Name)
 	case controllerutil.OperationResultUpdated:
 		rc.emitNormal(postgresCluster, EventConfigMapReady, fmt.Sprintf("ConfigMap %s updated", desiredCM.Name))
-		rc.Metrics.IncOwnedResourceOp(pgmetrics.ControllerCluster, pgmetrics.ResourceConfigMap, pgmetrics.OpUpdate, pgmetrics.ResultSuccess)
 		logger.Info("ConfigMap updated", "name", desiredCM.Name)
 	default:
 		logger.Info("ConfigMap unchanged", "name", desiredCM.Name)
@@ -462,7 +451,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	if postgresCluster.Status.Phase != nil {
 		oldPhase = *postgresCluster.Status.Phase
 	}
-	if err := syncStatus(ctx, c, postgresCluster, cnpgCluster); err != nil {
+	if err := syncStatus(ctx, c, rc.Metrics, postgresCluster, cnpgCluster); err != nil {
 		logger.Error(err, "Failed to sync status")
 		if apierrors.IsConflict(err) {
 			logger.Info("Conflict during status update, will requeue")
@@ -490,7 +479,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 			logger.Info("Poolers ready, syncing status")
 			poolerOldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions))
 			copy(poolerOldConditions, postgresCluster.Status.Conditions)
-			_ = syncPoolerStatus(ctx, c, postgresCluster)
+			_ = syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster)
 			rc.emitPoolerReadyTransition(postgresCluster, poolerOldConditions)
 		}
 	}
@@ -768,7 +757,7 @@ func deleteConnectionPoolers(ctx context.Context, c client.Client, cluster *ente
 }
 
 // syncPoolerStatus populates ConnectionPoolerStatus and the PoolerReady condition.
-func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster) error {
+func syncPoolerStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster) error {
 	rwPooler := &cnpgv1.Pooler{}
 	if err := c.Get(ctx, types.NamespacedName{
 		Name: poolerResourceName(cluster.Name, readWriteEndpoint),
@@ -789,13 +778,13 @@ func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev
 	rwDesired, rwScheduled := poolerInstanceCount(rwPooler)
 	roDesired, roScheduled := poolerInstanceCount(roPooler)
 
-	return setStatus(ctx, c, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady,
+	return setStatus(ctx, c, metrics, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady,
 		fmt.Sprintf("%s: %d/%d, %s: %d/%d", readWriteEndpoint, rwScheduled, rwDesired,
 			readOnlyEndpoint, roScheduled, roDesired), readyClusterPhase)
 }
 
 // syncStatus maps CNPG Cluster state to PostgresCluster status.
-func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error {
+func syncStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error {
 	cluster.Status.ProvisionerRef = &corev1.ObjectReference{
 		APIVersion: "postgresql.cnpg.io/v1",
 		Kind:       "Cluster",
@@ -848,13 +837,13 @@ func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Post
 		message = fmt.Sprintf("CNPG cluster phase: %s", cnpgCluster.Status.Phase)
 	}
 
-	return setStatus(ctx, c, cluster, clusterReady, condStatus, reason, message, phase)
+	return setStatus(ctx, c, metrics, cluster, clusterReady, condStatus, reason, message, phase)
 }
 
 // setStatus sets the phase, condition and persists the status.
 // It skips the API write when the resulting status is identical to the current
 // state, avoiding unnecessary etcd churn and ResourceVersion bumps on stable clusters.
-func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
+func setStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
 	before := cluster.Status.DeepCopy()
 
 	p := string(phase)
@@ -871,6 +860,8 @@ func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Postg
 		return nil
 	}
 
+	metrics.IncStatusTransition(pgmetrics.ControllerCluster, string(condType), string(status), string(reason))
+
 	if err := c.Status().Update(ctx, cluster); err != nil {
 		return fmt.Errorf("failed to update PostgresCluster status: %w", err)
 	}
@@ -1074,7 +1065,6 @@ func handleFinalizer(ctx context.Context, rc *ReconcileContext, cluster *enterpr
 		return fmt.Errorf("removing finalizer: %w", err)
 	}
 	rc.emitNormal(cluster, EventCleanupComplete, fmt.Sprintf("Cleanup complete (policy: %s)", policy))
-	rc.Metrics.IncFinalizerOp(pgmetrics.ControllerCluster, pgmetrics.ResultSuccess)
 	logger.Info("Finalizer removed, cleanup complete")
 	return nil
 }
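Note: setStatus returns before the metric call when the recomputed status is
identical, so the counter only moves on real transitions, not on steady-state
reconciles. A minimal capturing fake for asserting that in a unit test (a
hypothetical helper, not part of this patch; it embeds NoopRecorder so only
the method under test needs overriding):

    // fakeRecorder captures IncStatusTransition calls; every other Recorder
    // method falls through to the embedded no-op implementation.
    type fakeRecorder struct {
        pgmetrics.NoopRecorder
        transitions []string
    }

    func (f *fakeRecorder) IncStatusTransition(controller, condition, status, reason string) {
        f.transitions = append(f.transitions, controller+"/"+condition+"="+status+" reason="+reason)
    }

    // Compile-time interface check, mirroring the package's own adapters.
    var _ pgmetrics.Recorder = (*fakeRecorder)(nil)
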
diff --git a/pkg/postgresql/database/core/database.go b/pkg/postgresql/database/core/database.go
index 3ac7fa254..fa9ce4adf 100644
--- a/pkg/postgresql/database/core/database.go
+++ b/pkg/postgresql/database/core/database.go
@@ -44,7 +44,7 @@ func PostgresDatabaseService(
 	logger.Info("Reconciling PostgresDatabase")
 
 	updateStatus := func(conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
-		return persistStatus(ctx, c, postgresDB, conditionType, conditionStatus, reason, message, phase)
+		return persistStatus(ctx, c, rc.Metrics, postgresDB, conditionType, conditionStatus, reason, message, phase)
 	}
 
 	// Finalizer: cleanup on deletion, register on creation.
@@ -52,7 +52,6 @@ func PostgresDatabaseService(
 		if err := handleDeletion(ctx, rc, postgresDB); err != nil {
 			logger.Error(err, "Failed to clean up PostgresDatabase")
 			rc.emitWarning(postgresDB, EventCleanupFailed, fmt.Sprintf("Cleanup failed: %v", err))
-			rc.Metrics.IncFinalizerOp(pgmetrics.ControllerDatabase, pgmetrics.ResultError)
 			return ctrl.Result{}, err
 		}
 		return ctrl.Result{}, nil
@@ -83,7 +82,6 @@ func PostgresDatabaseService(
 	if err != nil {
 		if errors.IsNotFound(err) {
 			rc.emitWarning(postgresDB, EventClusterNotFound, fmt.Sprintf("PostgresCluster %s not found", postgresDB.Spec.ClusterRef.Name))
-			rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonClusterNotFound)
 			if err := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterNotFound, "Cluster CR not found", pendingDBPhase); err != nil {
 				return ctrl.Result{}, err
 			}
@@ -101,7 +99,6 @@ func PostgresDatabaseService(
 	switch clusterStatus {
 	case ClusterNotReady, ClusterNoProvisionerRef:
 		rc.emitWarning(postgresDB, EventClusterNotReady, "Referenced PostgresCluster is not ready yet")
-		rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonClusterNotReady)
 		if err := updateStatus(clusterReady, metav1.ConditionFalse, reasonClusterProvisioning, "Cluster is not in ready state yet", pendingDBPhase); err != nil {
 			return ctrl.Result{}, err
 		}
@@ -123,7 +120,6 @@ func PostgresDatabaseService(
 		conflictErr := fmt.Errorf("role conflict detected: %s", strings.Join(roleConflicts, ", "))
 		logger.Error(conflictErr, conflictMsg)
 		rc.emitWarning(postgresDB, EventRoleConflict, conflictMsg)
-		rc.Metrics.IncValidationFailure(pgmetrics.ControllerDatabase, pgmetrics.ReasonRoleConflict)
 		errs := []error{conflictErr}
 		if statusErr := updateStatus(rolesReady, metav1.ConditionFalse, reasonRoleConflict, conflictMsg, failedDBPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
@@ -147,14 +143,12 @@ func PostgresDatabaseService(
 	// CNPG rejects a PasswordSecretRef pointing at a missing secret.
 	if err := reconcileUserSecrets(ctx, c, rc.Scheme, postgresDB); err != nil {
 		rc.emitWarning(postgresDB, EventUserSecretsFailed, fmt.Sprintf("Failed to reconcile user secrets: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionSecretReconcile, pgmetrics.ResultError)
 		if statusErr := updateStatus(secretsReady, metav1.ConditionFalse, reasonSecretsCreationFailed,
 			fmt.Sprintf("Failed to reconcile user secrets: %v", err), provisioningDBPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
 		}
 		return ctrl.Result{}, err
 	}
-	rc.Metrics.IncUserAction(pgmetrics.ActionSecretReconcile, pgmetrics.ResultSuccess)
 	rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, secretsReady, EventSecretsReady,
 		fmt.Sprintf("All secrets provisioned for %d databases", len(postgresDB.Spec.Databases)))
 	if err := updateStatus(secretsReady, metav1.ConditionTrue, reasonSecretsCreated,
 		fmt.Sprintf("All secrets provisioned for %d databases", len(postgresDB.Spec.Databases)), provisioningDBPhase); err != nil {
@@ -166,14 +160,12 @@ func PostgresDatabaseService(
 	endpoints := resolveClusterEndpoints(cluster, cnpgCluster, postgresDB.Namespace)
 	if err := reconcileRoleConfigMaps(ctx, c, rc.Scheme, postgresDB, endpoints); err != nil {
 		rc.emitWarning(postgresDB, EventAccessConfigFailed, fmt.Sprintf("Failed to reconcile ConfigMaps: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionConfigMapReconcile, pgmetrics.ResultError)
 		if statusErr := updateStatus(configMapsReady, metav1.ConditionFalse, reasonConfigMapsCreationFailed,
 			fmt.Sprintf("Failed to reconcile ConfigMaps: %v", err), provisioningDBPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
 		}
 		return ctrl.Result{}, err
 	}
-	rc.Metrics.IncUserAction(pgmetrics.ActionConfigMapReconcile, pgmetrics.ResultSuccess)
 	rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, configMapsReady, EventConfigMapsReady,
 		fmt.Sprintf("All ConfigMaps provisioned for %d databases", len(postgresDB.Spec.Databases)))
 	if err := updateStatus(configMapsReady, metav1.ConditionTrue, reasonConfigMapsCreated,
 		fmt.Sprintf("All ConfigMaps provisioned for %d databases", len(postgresDB.Spec.Databases)), provisioningDBPhase); err != nil {
@@ -195,11 +187,13 @@ func PostgresDatabaseService(
 	if err := patchManagedRoles(ctx, c, postgresDB, cluster); err != nil {
 		logger.Error(err, "Failed to patch users in CNPG Cluster")
 		rc.emitWarning(postgresDB, EventManagedRolesPatchFailed, fmt.Sprintf("Failed to patch managed roles: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultError)
+		if statusErr := updateStatus(rolesReady, metav1.ConditionFalse, reasonUsersCreationFailed,
+			fmt.Sprintf("Failed to patch managed roles: %v", err), failedDBPhase); statusErr != nil {
+			logger.Error(statusErr, "Failed to update status")
+		}
 		return ctrl.Result{}, err
 	}
 	rc.emitNormal(postgresDB, EventRoleReconciliationStarted, fmt.Sprintf("Patched managed roles, waiting for %d roles to reconcile", len(desiredUsers)))
-	rc.Metrics.IncUserAction(pgmetrics.ActionRolePatch, pgmetrics.ResultSuccess)
 	if err := updateStatus(rolesReady, metav1.ConditionFalse, reasonWaitingForCNPG,
 		fmt.Sprintf("Waiting for %d roles to be reconciled", len(desiredUsers)), provisioningDBPhase); err != nil {
 		return ctrl.Result{}, err
@@ -234,7 +228,10 @@ func PostgresDatabaseService(
 	if err != nil {
 		logger.Error(err, "Failed to reconcile CNPG Databases")
 		rc.emitWarning(postgresDB, EventDatabasesReconcileFailed, fmt.Sprintf("Failed to reconcile databases: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionDatabaseReconcile, pgmetrics.ResultError)
+		if statusErr := updateStatus(databasesReady, metav1.ConditionFalse, reasonDatabaseReconcileFailed,
+			fmt.Sprintf("Failed to reconcile databases: %v", err), failedDBPhase); statusErr != nil {
+			logger.Error(statusErr, "Failed to update status")
+		}
 		return ctrl.Result{}, err
 	}
 	if len(adopted) > 0 {
@@ -291,14 +288,12 @@ func PostgresDatabaseService(
 	if err := reconcileRWRolePrivileges(ctx, endpoints.RWHost, string(pw), dbNames, newDBRepo); err != nil {
 		rc.emitWarning(postgresDB, EventPrivilegesGrantFailed, fmt.Sprintf("Failed to grant RW role privileges: %v", err))
-		rc.Metrics.IncUserAction(pgmetrics.ActionPrivilegeGrant, pgmetrics.ResultError)
 		if statusErr := updateStatus(privilegesReady, metav1.ConditionFalse, reasonPrivilegesGrantFailed,
 			fmt.Sprintf("Failed to grant RW role privileges: %v", err), provisioningDBPhase); statusErr != nil {
 			logger.Error(statusErr, "Failed to update status")
 		}
 		return ctrl.Result{}, err
 	}
-	rc.Metrics.IncUserAction(pgmetrics.ActionPrivilegeGrant, pgmetrics.ResultSuccess)
 	rc.emitOnConditionTransition(postgresDB, postgresDB.Status.Conditions, privilegesReady, EventPrivilegesReady,
 		fmt.Sprintf("RW role privileges granted for all %d databases", len(postgresDB.Spec.Databases)))
 	if err := updateStatus(privilegesReady, metav1.ConditionTrue, reasonPrivilegesGranted,
 		fmt.Sprintf("RW role privileges granted for all %d databases", len(postgresDB.Spec.Databases)), readyDBPhase); err != nil {
@@ -507,8 +502,9 @@ func verifyDatabasesReady(ctx context.Context, c client.Client, postgresDB *ente
 	return notReady, nil
 }
 
-func persistStatus(ctx context.Context, c client.Client, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
+func persistStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
 	applyStatus(db, conditionType, conditionStatus, reason, message, phase)
+	metrics.IncStatusTransition(pgmetrics.ControllerDatabase, string(conditionType), string(conditionStatus), string(reason))
 	return c.Status().Update(ctx, db)
 }
 
@@ -557,7 +553,6 @@ func handleDeletion(ctx context.Context, rc *ReconcileContext, postgresDB *enter
 		return fmt.Errorf("removing finalizer: %w", err)
 	}
 	rc.emitNormal(postgresDB, EventCleanupComplete, fmt.Sprintf("Cleanup complete (%d retained, %d deleted)", len(plan.retained), len(plan.deleted)))
-	rc.Metrics.IncFinalizerOp(pgmetrics.ControllerDatabase, pgmetrics.ResultSuccess)
 	logger.Info("Cleanup complete", "retained", len(plan.retained), "deleted", len(plan.deleted))
 	return nil
 }
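Note: the database controller now follows one pattern on every fallible step:
return the error (so controller-runtime's reconcile counters tick) and also
write a False condition whose reason becomes the metric label. Distilled
shape, with reconcileDatabases standing in for any of the real calls (a
sketch only, not the exact code above):

    if err := reconcileDatabases(); err != nil { // hypothetical stand-in
        // Surface the failure as a condition; the reason doubles as the label.
        if statusErr := updateStatus(databasesReady, metav1.ConditionFalse,
            reasonDatabaseReconcileFailed, err.Error(), failedDBPhase); statusErr != nil {
            logger.Error(statusErr, "Failed to update status")
        }
        return ctrl.Result{}, err
    }
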
diff --git a/pkg/postgresql/database/core/database_unit_test.go b/pkg/postgresql/database/core/database_unit_test.go
index 6140c0b2a..68399ade4 100644
--- a/pkg/postgresql/database/core/database_unit_test.go
+++ b/pkg/postgresql/database/core/database_unit_test.go
@@ -20,6 +20,7 @@ import (
 
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
@@ -591,6 +592,7 @@ func TestSetStatus(t *testing.T) {
 	err := persistStatus(
 		context.Background(),
 		c,
+		&pgmetrics.NoopRecorder{},
 		postgresDB,
 		clusterReady,
 		metav1.ConditionTrue,
diff --git a/pkg/postgresql/database/core/types.go b/pkg/postgresql/database/core/types.go
index 8a5579a00..f8a110423 100644
--- a/pkg/postgresql/database/core/types.go
+++ b/pkg/postgresql/database/core/types.go
@@ -78,9 +78,10 @@ const (
 	reasonUsersAvailable           conditionReasons = "UsersAvailable"
 	reasonRoleConflict             conditionReasons = "RoleConflict"
 	reasonConfigMapsCreationFailed conditionReasons = "ConfigMapsCreationFailed"
-	reasonConfigMapsCreated        conditionReasons = "ConfigMapsCreated"
-	reasonPrivilegesGranted        conditionReasons = "PrivilegesGranted"
-	reasonPrivilegesGrantFailed    conditionReasons = "PrivilegesGrantFailed"
+	reasonConfigMapsCreated        conditionReasons = "ConfigMapsCreated"
+	reasonDatabaseReconcileFailed  conditionReasons = "DatabaseReconcileFailed"
+	reasonPrivilegesGranted        conditionReasons = "PrivilegesGranted"
+	reasonPrivilegesGrantFailed    conditionReasons = "PrivilegesGrantFailed"
 
 	// ClusterReady sentinel values returned by ensureClusterReady.
 	// Exported so the controller adapter can switch on them if needed.
diff --git a/pkg/postgresql/metrics/noop.go b/pkg/postgresql/metrics/noop.go
index c82a23b41..965f29904 100644
--- a/pkg/postgresql/metrics/noop.go
+++ b/pkg/postgresql/metrics/noop.go
@@ -3,15 +3,12 @@ package metrics
 // NoopRecorder is a no-op implementation of Recorder for use in tests.
 type NoopRecorder struct{}
 
-func (n *NoopRecorder) IncValidationFailure(string, string)               {}
-func (n *NoopRecorder) SetClusterPhases(map[string]float64, float64)      {}
-func (n *NoopRecorder) SetDatabasePhases(map[string]float64)              {}
-func (n *NoopRecorder) SetManagedUsers(string, map[string]float64)        {}
-func (n *NoopRecorder) IncUserAction(string, string)                      {}
-func (n *NoopRecorder) SetPoolers(string, string, float64)                {}
-func (n *NoopRecorder) SetPoolerInstances(string, float64)                {}
-func (n *NoopRecorder) IncFinalizerOp(string, string)                     {}
-func (n *NoopRecorder) IncOwnedResourceOp(string, string, string, string) {}
+func (n *NoopRecorder) IncStatusTransition(string, string, string, string) {}
+func (n *NoopRecorder) SetClusterPhases(map[string]float64, float64)       {}
+func (n *NoopRecorder) SetDatabasePhases(map[string]float64)               {}
+func (n *NoopRecorder) SetManagedUsers(string, map[string]float64)         {}
+func (n *NoopRecorder) SetPoolers(string, string, float64)                 {}
+func (n *NoopRecorder) SetPoolerInstances(string, float64)                 {}
 
 // Compile-time interface check.
 var _ Recorder = (*NoopRecorder)(nil)
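Note: with the port shrunk to six methods, tests only need the no-op adapter
threaded through the status writers, as TestSetStatus does above. For example
(sketch; the fake client c and the postgresDB fixture are assumed to come
from the surrounding test):

    err := persistStatus(context.Background(), c, &pgmetrics.NoopRecorder{},
        postgresDB, rolesReady, metav1.ConditionTrue, reasonUsersAvailable,
        "all roles reconciled", readyDBPhase)
    require.NoError(t, err)
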
diff --git a/pkg/postgresql/metrics/ports.go b/pkg/postgresql/metrics/ports.go
index 115767cd4..0e084973e 100644
--- a/pkg/postgresql/metrics/ports.go
+++ b/pkg/postgresql/metrics/ports.go
@@ -1,51 +1,11 @@
 package metrics
 
-// Result labels for counters that track success/error outcomes.
-const (
-	ResultSuccess = "success"
-	ResultError   = "error"
-)
-
 // Controller name labels.
 const (
 	ControllerCluster  = "postgrescluster"
 	ControllerDatabase = "postgresdatabase"
 )
 
-// Validation failure reason labels.
-const (
-	ReasonClassNotFound       = "class_not_found"
-	ReasonInvalidConfig       = "invalid_configuration"
-	ReasonClusterNotFound     = "cluster_not_found"
-	ReasonClusterNotReady     = "cluster_not_ready"
-	ReasonRoleConflict        = "role_conflict"
-	ReasonPoolerConfigMissing = "pooler_config_missing"
-)
-
-// User action labels.
-const (
-	ActionSecretReconcile    = "secret_reconcile"
-	ActionConfigMapReconcile = "configmap_reconcile"
-	ActionRolePatch          = "role_patch"
-	ActionDatabaseReconcile  = "database_reconcile"
-	ActionPrivilegeGrant     = "privilege_grant"
-)
-
-// Owned resource operation labels.
-const (
-	OpCreate = "create"
-	OpUpdate = "update"
-)
-
-// Owned resource kind labels.
-const (
-	ResourceSecret    = "Secret"
-	ResourceCluster   = "Cluster"
-	ResourcePooler    = "Pooler"
-	ResourceConfigMap = "ConfigMap"
-	ResourceDatabase  = "Database"
-)
-
 // Recorder is the port for all PostgreSQL controller metrics.
 // Core service packages depend on this interface, never on Prometheus directly.
 // Adapters (PrometheusRecorder, NoopRecorder) live in this package.
@@ -53,34 +13,27 @@ const (
 // Reconcile-level metrics (total count, duration, error count) are handled
 // automatically by controller-runtime — see controller_runtime_reconcile_total,
 // controller_runtime_reconcile_time_seconds, controller_runtime_reconcile_errors_total.
-// This interface covers domain-specific metrics only.
+//
+// Domain-specific business metrics are emitted automatically via IncStatusTransition
+// every time a status condition is written. Fleet-level gauges are populated by the
+// collector on each reconcile.
 type Recorder interface {
-	// IncValidationFailure records a validation or configuration failure.
-	IncValidationFailure(controller string, reason string)
+	// IncStatusTransition increments the status transition counter.
+	// Called automatically by persistStatus/setStatus — no manual calls needed in service code.
+	IncStatusTransition(controller, condition, status, reason string)
 
 	// SetClusterPhases sets gauge values for cluster counts by phase.
-	// The phases map keys are phase strings (Ready, Pending, etc.) with counts as values.
 	SetClusterPhases(phases map[string]float64, poolerEnabledCount float64)
 
 	// SetDatabasePhases sets gauge values for database counts by phase.
 	SetDatabasePhases(phases map[string]float64)
 
 	// SetManagedUsers sets the gauge for managed user states.
-	// The states map keys are state strings (desired, reconciled, pending, failed).
 	SetManagedUsers(controller string, states map[string]float64)
 
-	// IncUserAction increments the user action counter.
-	IncUserAction(action string, result string)
-
 	// SetPoolers sets pooler gauge values by type and state.
 	SetPoolers(poolerType string, state string, count float64)
 
 	// SetPoolerInstances sets pooler instance gauge by type.
 	SetPoolerInstances(poolerType string, count float64)
-
-	// IncFinalizerOp increments the finalizer operations counter.
-	IncFinalizerOp(controller string, result string)
-
-	// IncOwnedResourceOp increments the owned resource operations counter.
-	IncOwnedResourceOp(controller string, resourceKind string, operation string, result string)
 }
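Note: the slimmed port keeps the hexagonal split intact: core packages
compile against this interface only, so a different backend is a drop-in
change at wiring time. A sketch of a third adapter, e.g. a structured-logging
recorder for local debugging (hypothetical, not part of this patch; assumes
import "log/slog"):

    // LogRecorder mirrors PrometheusRecorder but writes to a logger.
    type LogRecorder struct{ L *slog.Logger }

    func (r *LogRecorder) IncStatusTransition(controller, condition, status, reason string) {
        r.L.Info("status transition", "controller", controller,
            "condition", condition, "status", status, "reason", reason)
    }
    func (r *LogRecorder) SetClusterPhases(phases map[string]float64, poolerEnabled float64) {
        r.L.Info("cluster phases", "phases", phases, "pooler_enabled", poolerEnabled)
    }
    func (r *LogRecorder) SetDatabasePhases(phases map[string]float64) {
        r.L.Info("database phases", "phases", phases)
    }
    func (r *LogRecorder) SetManagedUsers(controller string, states map[string]float64) {
        r.L.Info("managed users", "controller", controller, "states", states)
    }
    func (r *LogRecorder) SetPoolers(poolerType, state string, count float64) {
        r.L.Info("poolers", "type", poolerType, "state", state, "count", count)
    }
    func (r *LogRecorder) SetPoolerInstances(poolerType string, count float64) {
        r.L.Info("pooler instances", "type", poolerType, "count", count)
    }

    var _ Recorder = (*LogRecorder)(nil)
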
diff --git a/pkg/postgresql/metrics/prometheus.go b/pkg/postgresql/metrics/prometheus.go
index f591099fe..7fa227529 100644
--- a/pkg/postgresql/metrics/prometheus.go
+++ b/pkg/postgresql/metrics/prometheus.go
@@ -5,10 +5,10 @@ import (
 )
 
 var (
-	validationFailuresTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "splunk_operator_postgres_validation_failures_total",
-		Help: "Validation and configuration failures.",
-	}, []string{"controller", "reason"})
+	statusTransitionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "splunk_operator_postgres_status_transitions_total",
+		Help: "Status condition transitions by controller, condition type, status, and reason.",
+	}, []string{"controller", "condition", "status", "reason"})
 
 	clusters = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Name: "splunk_operator_postgres_clusters",
@@ -25,11 +25,6 @@ var (
 		Help: "Current counts of managed users by state.",
 	}, []string{"controller", "state"})
 
-	userActionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "splunk_operator_postgres_user_actions_total",
-		Help: "User-management actions such as secret reconcile, role patch, privilege grant.",
-	}, []string{"action", "result"})
-
 	poolers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Name: "splunk_operator_postgres_poolers",
 		Help: "Current number of PgBouncer poolers by type and readiness state.",
@@ -40,26 +35,13 @@ var (
 		Help: "Current observed pooler instance count.",
 	}, []string{"type"})
 
-	finalizerOperationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "splunk_operator_postgres_finalizer_operations_total",
-		Help: "Finalizer success and cleanup failures.",
-	}, []string{"controller", "result"})
-
-	ownedResourceOperationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "splunk_operator_postgres_owned_resource_operations_total",
-		Help: "Create/update/delete outcomes for owned resources.",
-	}, []string{"controller", "resource_kind", "operation", "result"})
-
 	allCollectors = []prometheus.Collector{
-		validationFailuresTotal,
+		statusTransitionsTotal,
 		clusters,
 		databases,
 		managedUsers,
-		userActionsTotal,
 		poolers,
 		poolerInstances,
-		finalizerOperationsTotal,
-		ownedResourceOperationsTotal,
 	}
 )
 
@@ -82,17 +64,15 @@ func NewPrometheusRecorder() *PrometheusRecorder {
 	return &PrometheusRecorder{}
 }
 
-func (p *PrometheusRecorder) IncValidationFailure(controller string, reason string) {
-	validationFailuresTotal.WithLabelValues(controller, reason).Inc()
+func (p *PrometheusRecorder) IncStatusTransition(controller, condition, status, reason string) {
+	statusTransitionsTotal.WithLabelValues(controller, condition, status, reason).Inc()
 }
 
 func (p *PrometheusRecorder) SetClusterPhases(phases map[string]float64, poolerEnabledCount float64) {
-	// Reset all phase gauges before setting new values to avoid stale entries.
 	clusters.Reset()
 	for phase, count := range phases {
 		clusters.WithLabelValues(phase, "false").Set(count)
 	}
-	// pooler_enabled count is tracked separately — it's a cross-cutting dimension.
 	if poolerEnabledCount > 0 {
 		clusters.WithLabelValues("", "true").Set(poolerEnabledCount)
 	}
@@ -111,10 +91,6 @@ func (p *PrometheusRecorder) SetManagedUsers(controller string, states map[strin
 	}
 }
 
-func (p *PrometheusRecorder) IncUserAction(action string, result string) {
-	userActionsTotal.WithLabelValues(action, result).Inc()
-}
-
 func (p *PrometheusRecorder) SetPoolers(poolerType string, state string, count float64) {
 	poolers.WithLabelValues(poolerType, state).Set(count)
 }
@@ -123,13 +99,5 @@ func (p *PrometheusRecorder) SetPoolerInstances(poolerType string, count float64
 	poolerInstances.WithLabelValues(poolerType).Set(count)
 }
 
-func (p *PrometheusRecorder) IncFinalizerOp(controller string, result string) {
-	finalizerOperationsTotal.WithLabelValues(controller, result).Inc()
-}
-
-func (p *PrometheusRecorder) IncOwnedResourceOp(controller string, resourceKind string, operation string, result string) {
-	ownedResourceOperationsTotal.WithLabelValues(controller, resourceKind, operation, result).Inc()
-}
-
 // Compile-time interface check.
 var _ Recorder = (*PrometheusRecorder)(nil)
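
Note: end to end, callers share one recorder between both controllers and the
fleet collector; the counter then tracks every persisted condition flip. A
small wiring sketch (label values are illustrative; the condition string
assumes the conditionTypes constants serialize as shown):

    rec := pgmetrics.NewPrometheusRecorder()

    // One increment per persisted condition change.
    rec.IncStatusTransition(pgmetrics.ControllerDatabase,
        "DatabasesReady", "False", "DatabaseReconcileFailed")

    // Fleet gauges are replaced wholesale on each collection pass.
    rec.SetClusterPhases(map[string]float64{"Ready": 4, "Failed": 1}, 2)
    rec.SetPoolers("rw", "ready", 2)
    rec.SetPoolerInstances("rw", 2)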