diff --git a/cmd/main.go b/cmd/main.go index 332623f0d..5be84f744 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,6 +25,7 @@ import ( "path/filepath" "time" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" intController "github.com/splunk/splunk-operator/internal/controller" @@ -55,6 +56,7 @@ import ( "github.com/splunk/splunk-operator/internal/controller" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" //+kubebuilder:scaffold:imports //extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" ) @@ -282,18 +284,29 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Telemetry") os.Exit(1) } + pgRecorder := pgmetrics.NewPrometheusRecorder() + if err := pgmetrics.Register(crmetrics.Registry); err != nil { + setupLog.Error(err, "unable to register PostgreSQL metrics") + os.Exit(1) + } + pgFleetCollector := pgmetrics.NewFleetCollector() + if err := (&controller.PostgresDatabaseReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("postgresdatabase-controller"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("postgresdatabase-controller"), + Metrics: pgRecorder, + FleetCollector: pgFleetCollector, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PostgresDatabase") os.Exit(1) } if err := (&controller.PostgresClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("postgrescluster-controller"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("postgrescluster-controller"), + Metrics: pgRecorder, + FleetCollector: pgFleetCollector, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PostgresCluster") 
os.Exit(1) diff --git a/internal/controller/postgrescluster_controller.go b/internal/controller/postgrescluster_controller.go index 70b11c9e6..abfebdf37 100644 --- a/internal/controller/postgrescluster_controller.go +++ b/internal/controller/postgrescluster_controller.go @@ -22,6 +22,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" clustercore "github.com/splunk/splunk-operator/pkg/postgresql/cluster/core" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -42,8 +43,10 @@ const ( // PostgresClusterReconciler reconciles PostgresCluster resources. type PostgresClusterReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + Metrics pgmetrics.Recorder + FleetCollector *pgmetrics.FleetCollector } // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters,verbs=get;list;watch;create;update;patch;delete @@ -57,8 +60,10 @@ type PostgresClusterReconciler struct { // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *PostgresClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder} - return clustercore.PostgresClusterService(ctx, rc, req) + rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} + result, err := clustercore.PostgresClusterService(ctx, rc, req) + r.FleetCollector.CollectClusterMetrics(ctx, r.Client, r.Metrics) + return result, err } // SetupWithManager registers the controller and owned resource watches. 
diff --git a/internal/controller/postgresdatabase_controller.go b/internal/controller/postgresdatabase_controller.go index 0c6db9628..e1a68ef02 100644 --- a/internal/controller/postgresdatabase_controller.go +++ b/internal/controller/postgresdatabase_controller.go @@ -24,6 +24,7 @@ import ( enterprisev4 "github.com/splunk/splunk-operator/api/v4" dbadapter "github.com/splunk/splunk-operator/pkg/postgresql/database/adapter" dbcore "github.com/splunk/splunk-operator/pkg/postgresql/database/core" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -42,8 +43,10 @@ import ( // PostgresDatabaseReconciler reconciles a PostgresDatabase object. type PostgresDatabaseReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + Metrics pgmetrics.Recorder + FleetCollector *pgmetrics.FleetCollector } const ( @@ -71,8 +74,11 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req } return ctrl.Result{}, err } - rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder} - return dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository) + rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics} + result, err := dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository) + r.FleetCollector.CollectDatabaseMetrics(ctx, r.Client, r.Metrics) + + return result, err } // SetupWithManager sets up the controller with the Manager. 
diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go index 3334011c6..43a8b82a3 100644 --- a/pkg/postgresql/cluster/core/cluster.go +++ b/pkg/postgresql/cluster/core/cluster.go @@ -24,6 +24,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" password "github.com/sethvargo/go-password/password" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -67,7 +68,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. ctx = log.IntoContext(ctx, logger) updateStatus := func(conditionType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error { - return setStatus(ctx, c, postgresCluster, conditionType, status, reason, message, phase) + return setStatus(ctx, c, rc.Metrics, postgresCluster, conditionType, status, reason, message, phase) } // Finalizer handling must come before any other processing. @@ -384,7 +385,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. default: oldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions)) copy(oldConditions, postgresCluster.Status.Conditions) - if err := syncPoolerStatus(ctx, c, postgresCluster); err != nil { + if err := syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster); err != nil { logger.Error(err, "Failed to sync pooler status") rc.emitWarning(postgresCluster, EventPoolerReconcileFailed, fmt.Sprintf("Failed to sync pooler status: %v", err)) if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerReconciliationFailed, @@ -450,7 +451,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
if postgresCluster.Status.Phase != nil { oldPhase = *postgresCluster.Status.Phase } - if err := syncStatus(ctx, c, postgresCluster, cnpgCluster); err != nil { + if err := syncStatus(ctx, c, rc.Metrics, postgresCluster, cnpgCluster); err != nil { logger.Error(err, "Failed to sync status") if apierrors.IsConflict(err) { logger.Info("Conflict during status update, will requeue") @@ -478,7 +479,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. logger.Info("Poolers ready, syncing status") poolerOldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions)) copy(poolerOldConditions, postgresCluster.Status.Conditions) - _ = syncPoolerStatus(ctx, c, postgresCluster) + _ = syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster) rc.emitPoolerReadyTransition(postgresCluster, poolerOldConditions) } } @@ -756,7 +757,7 @@ func deleteConnectionPoolers(ctx context.Context, c client.Client, cluster *ente } // syncPoolerStatus populates ConnectionPoolerStatus and the PoolerReady condition. 
-func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster) error { +func syncPoolerStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster) error { rwPooler := &cnpgv1.Pooler{} if err := c.Get(ctx, types.NamespacedName{ Name: poolerResourceName(cluster.Name, readWriteEndpoint), @@ -777,13 +778,13 @@ func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev rwDesired, rwScheduled := poolerInstanceCount(rwPooler) roDesired, roScheduled := poolerInstanceCount(roPooler) - return setStatus(ctx, c, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady, + return setStatus(ctx, c, metrics, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady, fmt.Sprintf("%s: %d/%d, %s: %d/%d", readWriteEndpoint, rwScheduled, rwDesired, readOnlyEndpoint, roScheduled, roDesired), readyClusterPhase) } // syncStatus maps CNPG Cluster state to PostgresCluster status. -func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error { +func syncStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error { cluster.Status.ProvisionerRef = &corev1.ObjectReference{ APIVersion: "postgresql.cnpg.io/v1", Kind: "Cluster", @@ -836,13 +837,13 @@ func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Post message = fmt.Sprintf("CNPG cluster phase: %s", cnpgCluster.Status.Phase) } - return setStatus(ctx, c, cluster, clusterReady, condStatus, reason, message, phase) + return setStatus(ctx, c, metrics, cluster, clusterReady, condStatus, reason, message, phase) } // setStatus sets the phase, condition and persists the status. // It skips the API write when the resulting status is identical to the current // state, avoiding unnecessary etcd churn and ResourceVersion bumps on stable clusters. 
-func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error { +func setStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error { before := cluster.Status.DeepCopy() p := string(phase) @@ -859,6 +860,8 @@ func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Postg return nil } + metrics.IncStatusTransition(pgmetrics.ControllerCluster, string(condType), string(status), string(reason)) + if err := c.Status().Update(ctx, cluster); err != nil { return fmt.Errorf("failed to update PostgresCluster status: %w", err) } diff --git a/pkg/postgresql/cluster/core/types.go b/pkg/postgresql/cluster/core/types.go index 042a5ae82..3f2edfa4d 100644 --- a/pkg/postgresql/cluster/core/types.go +++ b/pkg/postgresql/cluster/core/types.go @@ -4,6 +4,7 @@ import ( "time" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -17,6 +18,7 @@ type ReconcileContext struct { Client client.Client Scheme *runtime.Scheme Recorder record.EventRecorder + Metrics pgmetrics.Recorder } // normalizedCNPGClusterSpec is a subset of cnpgv1.ClusterSpec fields used for drift detection. 
diff --git a/pkg/postgresql/database/core/database.go b/pkg/postgresql/database/core/database.go index f84a35fd9..fa9ce4adf 100644 --- a/pkg/postgresql/database/core/database.go +++ b/pkg/postgresql/database/core/database.go @@ -11,6 +11,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" "github.com/sethvargo/go-password/password" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -43,7 +44,7 @@ func PostgresDatabaseService( logger.Info("Reconciling PostgresDatabase") updateStatus := func(conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error { - return persistStatus(ctx, c, postgresDB, conditionType, conditionStatus, reason, message, phase) + return persistStatus(ctx, c, rc.Metrics, postgresDB, conditionType, conditionStatus, reason, message, phase) } // Finalizer: cleanup on deletion, register on creation. 
@@ -186,6 +187,10 @@ func PostgresDatabaseService( if err := patchManagedRoles(ctx, c, postgresDB, cluster); err != nil { logger.Error(err, "Failed to patch users in CNPG Cluster") rc.emitWarning(postgresDB, EventManagedRolesPatchFailed, fmt.Sprintf("Failed to patch managed roles: %v", err)) + if statusErr := updateStatus(rolesReady, metav1.ConditionFalse, reasonUsersCreationFailed, + fmt.Sprintf("Failed to patch managed roles: %v", err), failedDBPhase); statusErr != nil { + logger.Error(statusErr, "Failed to update status") + } return ctrl.Result{}, err } rc.emitNormal(postgresDB, EventRoleReconciliationStarted, fmt.Sprintf("Patched managed roles, waiting for %d roles to reconcile", len(desiredUsers))) @@ -223,6 +228,10 @@ func PostgresDatabaseService( if err != nil { logger.Error(err, "Failed to reconcile CNPG Databases") rc.emitWarning(postgresDB, EventDatabasesReconcileFailed, fmt.Sprintf("Failed to reconcile databases: %v", err)) + if statusErr := updateStatus(databasesReady, metav1.ConditionFalse, reasonDatabaseReconcileFailed, + fmt.Sprintf("Failed to reconcile databases: %v", err), failedDBPhase); statusErr != nil { + logger.Error(statusErr, "Failed to update status") + } return ctrl.Result{}, err } if len(adopted) > 0 { @@ -493,8 +502,9 @@ func verifyDatabasesReady(ctx context.Context, c client.Client, postgresDB *ente return notReady, nil } -func persistStatus(ctx context.Context, c client.Client, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error { +func persistStatus(ctx context.Context, c client.Client, metrics pgmetrics.Recorder, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error { applyStatus(db, conditionType, conditionStatus, reason, message, phase) + 
metrics.IncStatusTransition(pgmetrics.ControllerDatabase, string(conditionType), string(conditionStatus), string(reason)) return c.Status().Update(ctx, db) } diff --git a/pkg/postgresql/database/core/database_unit_test.go b/pkg/postgresql/database/core/database_unit_test.go index 8d4da6c52..68399ade4 100644 --- a/pkg/postgresql/database/core/database_unit_test.go +++ b/pkg/postgresql/database/core/database_unit_test.go @@ -20,6 +20,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -306,7 +307,7 @@ func TestVerifyRolesReady(t *testing.T) { }, }, }, - wantErr: "user main_db_rw reconciliation failed: [reserved role]", + wantErr: "reconciling user main_db_rw: [reserved role]", }, { name: "returns missing roles that are not reconciled yet", @@ -591,6 +592,7 @@ func TestSetStatus(t *testing.T) { err := persistStatus( context.Background(), c, + &pgmetrics.NoopRecorder{}, postgresDB, clusterReady, metav1.ConditionTrue, diff --git a/pkg/postgresql/database/core/types.go b/pkg/postgresql/database/core/types.go index bf07fd19f..f8a110423 100644 --- a/pkg/postgresql/database/core/types.go +++ b/pkg/postgresql/database/core/types.go @@ -4,16 +4,18 @@ import ( "time" enterprisev4 "github.com/splunk/splunk-operator/api/v4" + pgmetrics "github.com/splunk/splunk-operator/pkg/postgresql/metrics" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" ) -// ReconcileContext bundles infrastructure dependencies injected by the controller +// ReconcileContext bundles infrastructure dependencies injected by the controller. 
type ReconcileContext struct { Client client.Client Scheme *runtime.Scheme Recorder record.EventRecorder + Metrics pgmetrics.Recorder } type reconcileDBPhases string @@ -76,9 +78,10 @@ const ( reasonUsersAvailable conditionReasons = "UsersAvailable" reasonRoleConflict conditionReasons = "RoleConflict" reasonConfigMapsCreationFailed conditionReasons = "ConfigMapsCreationFailed" - reasonConfigMapsCreated conditionReasons = "ConfigMapsCreated" - reasonPrivilegesGranted conditionReasons = "PrivilegesGranted" - reasonPrivilegesGrantFailed conditionReasons = "PrivilegesGrantFailed" + reasonConfigMapsCreated conditionReasons = "ConfigMapsCreated" + reasonDatabaseReconcileFailed conditionReasons = "DatabaseReconcileFailed" + reasonPrivilegesGranted conditionReasons = "PrivilegesGranted" + reasonPrivilegesGrantFailed conditionReasons = "PrivilegesGrantFailed" // ClusterReady sentinel values returned by ensureClusterReady. // Exported so the controller adapter can switch on them if needed. diff --git a/pkg/postgresql/metrics/collector.go b/pkg/postgresql/metrics/collector.go new file mode 100644 index 000000000..b590f1ad4 --- /dev/null +++ b/pkg/postgresql/metrics/collector.go @@ -0,0 +1,91 @@ +package metrics + +import ( + "context" + + enterprisev4 "github.com/splunk/splunk-operator/api/v4" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// FleetCollector recomputes fleet-state gauges from the K8s API (informer cache). +type FleetCollector struct{} + +// NewFleetCollector returns a new FleetCollector. +func NewFleetCollector() *FleetCollector { + return &FleetCollector{} +} + +// CollectClusterMetrics lists all PostgresCluster resources and updates phase +// gauges, pooler gauges, and managed-user gauges. List errors are logged +// and the gauges are left unchanged. 
+func (fc *FleetCollector) CollectClusterMetrics(ctx context.Context, c client.Client, recorder Recorder) { + logger := log.FromContext(ctx) + + var list enterprisev4.PostgresClusterList + if err := c.List(ctx, &list); err != nil { + logger.Error(err, "Failed to list PostgresClusters for fleet metrics") + return + } + + phases := make(map[string]float64) + var poolerEnabledCount float64 + managedUserStates := map[string]float64{ + "desired": 0, + "reconciled": 0, + "pending": 0, + "failed": 0, + } + + for i := range list.Items { + cluster := &list.Items[i] + + // Phase gauge. + phase := "Unknown" + if cluster.Status.Phase != nil { + phase = *cluster.Status.Phase + } + phases[phase]++ + + // Pooler-enabled count. + if cluster.Spec.ConnectionPoolerEnabled != nil && *cluster.Spec.ConnectionPoolerEnabled { + poolerEnabledCount++ + } + + // Managed users. + managedUserStates["desired"] += float64(len(cluster.Spec.ManagedRoles)) + if cluster.Status.ManagedRolesStatus != nil { + managedUserStates["reconciled"] += float64(len(cluster.Status.ManagedRolesStatus.Reconciled)) + managedUserStates["pending"] += float64(len(cluster.Status.ManagedRolesStatus.Pending)) + managedUserStates["failed"] += float64(len(cluster.Status.ManagedRolesStatus.Failed)) + } + } + + recorder.SetClusterPhases(phases, poolerEnabledCount) + recorder.SetManagedUsers(ControllerCluster, managedUserStates) +} + +// CollectDatabaseMetrics lists all PostgresDatabase resources and updates +// phase gauges. List errors are logged and the gauges are left unchanged. 
+func (fc *FleetCollector) CollectDatabaseMetrics(ctx context.Context, c client.Client, recorder Recorder) { + logger := log.FromContext(ctx) + + var list enterprisev4.PostgresDatabaseList + if err := c.List(ctx, &list); err != nil { + logger.Error(err, "Failed to list PostgresDatabases for fleet metrics") + return + } + + phases := make(map[string]float64) + for i := range list.Items { + db := &list.Items[i] + phase := "Unknown" + if db.Status.Phase != nil { + phase = *db.Status.Phase + } + phases[phase]++ + } + + recorder.SetDatabasePhases(phases) +} + diff --git a/pkg/postgresql/metrics/noop.go b/pkg/postgresql/metrics/noop.go new file mode 100644 index 000000000..965f29904 --- /dev/null +++ b/pkg/postgresql/metrics/noop.go @@ -0,0 +1,14 @@ +package metrics + +// NoopRecorder is a no-op implementation of Recorder for use in tests. +type NoopRecorder struct{} + +func (n *NoopRecorder) IncStatusTransition(string, string, string, string) {} +func (n *NoopRecorder) SetClusterPhases(map[string]float64, float64) {} +func (n *NoopRecorder) SetDatabasePhases(map[string]float64) {} +func (n *NoopRecorder) SetManagedUsers(string, map[string]float64) {} +func (n *NoopRecorder) SetPoolers(string, string, float64) {} +func (n *NoopRecorder) SetPoolerInstances(string, float64) {} + +// Compile-time interface check. +var _ Recorder = (*NoopRecorder)(nil) diff --git a/pkg/postgresql/metrics/ports.go b/pkg/postgresql/metrics/ports.go new file mode 100644 index 000000000..0e084973e --- /dev/null +++ b/pkg/postgresql/metrics/ports.go @@ -0,0 +1,39 @@ +package metrics + +// Controller name labels. +const ( + ControllerCluster = "postgrescluster" + ControllerDatabase = "postgresdatabase" +) + +// Recorder is the port for all PostgreSQL controller metrics. +// Core service packages depend on this interface, never on Prometheus directly. +// Adapters (PrometheusRecorder, NoopRecorder) live in this package. 
+// +// Reconcile-level metrics (total count, duration, error count) are handled +// automatically by controller-runtime — see controller_runtime_reconcile_total, +// controller_runtime_reconcile_time_seconds, controller_runtime_reconcile_errors_total. +// +// Domain-specific business metrics are emitted automatically via IncStatusTransition +// every time a status condition is written. Fleet-level gauges are populated by the +// collector on each reconcile. +type Recorder interface { + // IncStatusTransition increments the status transition counter. + // Called automatically by persistStatus/setStatus — no manual calls needed in service code. + IncStatusTransition(controller, condition, status, reason string) + + // SetClusterPhases sets gauge values for cluster counts by phase. + SetClusterPhases(phases map[string]float64, poolerEnabledCount float64) + + // SetDatabasePhases sets gauge values for database counts by phase. + SetDatabasePhases(phases map[string]float64) + + // SetManagedUsers sets the gauge for managed user states. + SetManagedUsers(controller string, states map[string]float64) + + // SetPoolers sets pooler gauge values by type and state. + SetPoolers(poolerType string, state string, count float64) + + // SetPoolerInstances sets pooler instance gauge by type. 
+ SetPoolerInstances(poolerType string, count float64) +} diff --git a/pkg/postgresql/metrics/prometheus.go b/pkg/postgresql/metrics/prometheus.go new file mode 100644 index 000000000..7fa227529 --- /dev/null +++ b/pkg/postgresql/metrics/prometheus.go @@ -0,0 +1,103 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + statusTransitionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "splunk_operator_postgres_status_transitions_total", + Help: "Status condition transitions by controller, condition type, status, and reason.", + }, []string{"controller", "condition", "status", "reason"}) + + clusters = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_clusters", + Help: "Current number of PostgresCluster resources by status phase.", + }, []string{"phase", "pooler_enabled"}) + + databases = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_databases", + Help: "Current number of PostgresDatabase resources by status phase.", + }, []string{"phase"}) + + managedUsers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_managed_users", + Help: "Current counts of managed users by state.", + }, []string{"controller", "state"}) + + poolers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_poolers", + Help: "Current number of PgBouncer poolers by type and readiness state.", + }, []string{"type", "state"}) + + poolerInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "splunk_operator_postgres_pooler_instances", + Help: "Current observed pooler instance count.", + }, []string{"type"}) + + allCollectors = []prometheus.Collector{ + statusTransitionsTotal, + clusters, + databases, + managedUsers, + poolers, + poolerInstances, + } +) + +// Register registers all PostgreSQL metrics with the given registerer. +// Call once at startup from cmd/main.go. 
+func Register(registerer prometheus.Registerer) error { + for _, c := range allCollectors { + if err := registerer.Register(c); err != nil { + return err + } + } + return nil +} + +// PrometheusRecorder implements Recorder using Prometheus client_golang. +type PrometheusRecorder struct{} + +// NewPrometheusRecorder returns a new PrometheusRecorder. +func NewPrometheusRecorder() *PrometheusRecorder { + return &PrometheusRecorder{} +} + +func (p *PrometheusRecorder) IncStatusTransition(controller, condition, status, reason string) { + statusTransitionsTotal.WithLabelValues(controller, condition, status, reason).Inc() +} + +func (p *PrometheusRecorder) SetClusterPhases(phases map[string]float64, poolerEnabledCount float64) { + clusters.Reset() + for phase, count := range phases { + clusters.WithLabelValues(phase, "false").Set(count) + } + if poolerEnabledCount > 0 { + clusters.WithLabelValues("", "true").Set(poolerEnabledCount) + } +} + +func (p *PrometheusRecorder) SetDatabasePhases(phases map[string]float64) { + databases.Reset() + for phase, count := range phases { + databases.WithLabelValues(phase).Set(count) + } +} + +func (p *PrometheusRecorder) SetManagedUsers(controller string, states map[string]float64) { + for state, count := range states { + managedUsers.WithLabelValues(controller, state).Set(count) + } +} + +func (p *PrometheusRecorder) SetPoolers(poolerType string, state string, count float64) { + poolers.WithLabelValues(poolerType, state).Set(count) +} + +func (p *PrometheusRecorder) SetPoolerInstances(poolerType string, count float64) { + poolerInstances.WithLabelValues(poolerType).Set(count) +} + +// Compile-time interface check. +var _ Recorder = (*PrometheusRecorder)(nil)