From 976ecd11c196e494ca75ec0cc15a09918ee9e185 Mon Sep 17 00:00:00 2001 From: dpishchenkov Date: Wed, 1 Apr 2026 12:56:10 +0200 Subject: [PATCH 1/4] add basic grafana --- api/v4/postgrescluster_types.go | 24 + api/v4/postgresclusterclass_types.go | 30 + cmd/main.go | 2 + ...nterprise_v4_postgresclusterclass_dev.yaml | 5 + .../controller/postgrescluster_controller.go | 2 +- internal/controller/suite_test.go | 4 + pkg/postgresql/cluster/core/cluster.go | 58 ++ .../dashboards/postgres_observability.json | 136 ++++ pkg/postgresql/cluster/core/events.go | 39 +- pkg/postgresql/cluster/core/monitoring.go | 614 ++++++++++++++++++ .../cluster/core/monitoring_unit_test.go | 472 ++++++++++++++ 11 files changed, 1366 insertions(+), 20 deletions(-) create mode 100644 pkg/postgresql/cluster/core/dashboards/postgres_observability.json create mode 100644 pkg/postgresql/cluster/core/monitoring.go create mode 100644 pkg/postgresql/cluster/core/monitoring_unit_test.go diff --git a/api/v4/postgrescluster_types.go b/api/v4/postgrescluster_types.go index 3e3dd0da7..a1445c03b 100644 --- a/api/v4/postgrescluster_types.go +++ b/api/v4/postgrescluster_types.go @@ -108,6 +108,30 @@ type PostgresClusterSpec struct { // +kubebuilder:default=Retain // +optional ClusterDeletionPolicy *string `json:"clusterDeletionPolicy,omitempty"` + + // Observability contains configuration for monitoring and observability features. + // +optional + Observability *PostgresObservabilityOverride `json:"observability,omitempty"` +} + +// PostgresObservabilityOverride overrides observability configuration options for PostgresClusterClass. 
+type PostgresObservabilityOverride struct { + + // +optional + PostgreSQL *FeatureDisableOverride `json:"postgresql,omitempty"` + + // +optional + PgBouncer *FeatureDisableOverride `json:"pgbouncer,omitempty"` + + // +optional + GrafanaDashboard *FeatureDisableOverride `json:"grafanaDashboard,omitempty"` +} + +type FeatureDisableOverride struct { + // Disabled set to true disables the feature for this cluster even if it's enabled in the class. + // +kubebuilder:default=false + // +optional + Disabled *bool `json:"disabled,omitempty"` } // PostgresClusterResources defines references to Kubernetes resources related to the PostgresCluster, such as ConfigMaps and Secrets. diff --git a/api/v4/postgresclusterclass_types.go b/api/v4/postgresclusterclass_types.go index 7f02e5633..743e98722 100644 --- a/api/v4/postgresclusterclass_types.go +++ b/api/v4/postgresclusterclass_types.go @@ -99,6 +99,13 @@ type PostgresClusterClassConfig struct { // +kubebuilder:default=false // +optional ConnectionPoolerEnabled *bool `json:"connectionPoolerEnabled,omitempty"` + + // Observability contains configuration for metrics and dashboards. + // When enabled, creates metrics resources and Grafana dashboard for clusters using this class. + // Can be overridden in PostgresCluster CR. + // +kubebuilder:default={} + // +optional + Observability *PostgresObservabilityClassConfig `json:"observability,omitempty"` } // ConnectionPoolerMode defines the PgBouncer connection pooling strategy. @@ -172,6 +179,29 @@ type PostgresClusterClassStatus struct { Phase *string `json:"phase,omitempty"` } +type PostgresObservabilityClassConfig struct { + // +optional + PostgreSQL *MetricsClassConfig `json:"postgresql,omitempty"` + // +optional + PgBouncer *MetricsClassConfig `json:"pgbouncer,omitempty"` + // +optional + GrafanaDashboard *GrafanaDashboardClassConfig `json:"grafanaDashboard,omitempty"` +} + +type MetricsClassConfig struct { + // Enabled controls whether metrics resources should be created for this target. 
+ // +kubebuilder:default=false + // +optional + Enabled *bool `json:"enabled,omitempty"` +} + +type GrafanaDashboardClassConfig struct { + // Enabled controls whether a Grafana dashboard ConfigMap should be created for this class. + // +kubebuilder:default=false + // +optional + Enabled *bool `json:"enabled,omitempty"` +} + // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster diff --git a/cmd/main.go b/cmd/main.go index 332623f0d..b9770b34f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -55,6 +55,7 @@ import ( "github.com/splunk/splunk-operator/internal/controller" cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" //+kubebuilder:scaffold:imports //extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" ) @@ -69,6 +70,7 @@ func init() { utilruntime.Must(enterpriseApi.AddToScheme(scheme)) utilruntime.Must(enterpriseApiV3.AddToScheme(scheme)) utilruntime.Must(cnpgv1.AddToScheme(scheme)) + utilruntime.Must(monitoringv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme //utilruntime.Must(extapi.AddToScheme(scheme)) } diff --git a/config/samples/enterprise_v4_postgresclusterclass_dev.yaml b/config/samples/enterprise_v4_postgresclusterclass_dev.yaml index a9846e36c..560958794 100644 --- a/config/samples/enterprise_v4_postgresclusterclass_dev.yaml +++ b/config/samples/enterprise_v4_postgresclusterclass_dev.yaml @@ -27,6 +27,11 @@ spec: cpu: "1" memory: "2Gi" connectionPoolerEnabled: true + observability: + grafanaDashboard: + enabled: true + pgbouncer: + enabled: true cnpg: # Restart method - tolerate downtime in dev diff --git a/internal/controller/postgrescluster_controller.go b/internal/controller/postgrescluster_controller.go index 70b11c9e6..55e04ff99 100644 --- a/internal/controller/postgrescluster_controller.go +++ b/internal/controller/postgrescluster_controller.go @@ -45,7 +45,7 @@ type 
PostgresClusterReconciler struct { Scheme *runtime.Scheme Recorder record.EventRecorder } - +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters/finalizers,verbs=update diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 9356a011f..8518541be 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -39,6 +39,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterpriseApi "github.com/splunk/splunk-operator/api/v4" //+kubebuilder:scaffold:imports @@ -109,6 +110,9 @@ var _ = BeforeSuite(func(ctx context.Context) { err = enterpriseApi.AddToScheme(clientgoscheme.Scheme) Expect(err).NotTo(HaveOccurred()) + err = monitoringv1.AddToScheme(clientgoscheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + //+kubebuilder:scaffold:scheme // Create New Manager for controller diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go index 3334011c6..f7e8ba5d7 100644 --- a/pkg/postgresql/cluster/core/cluster.go +++ b/pkg/postgresql/cluster/core/cluster.go @@ -396,6 +396,64 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
rc.emitPoolerReadyTransition(postgresCluster, oldConditions) } + if err := reconcilePostgreSQLMetricsService(ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass)); err != nil { + return ctrl.Result{}, err + } + + poolerMetricsEnabled := isConnectionPoolerMetricsEnabled(postgresCluster, clusterClass) + rwPoolerMetricsEnabled := poolerMetricsEnabled && rwPoolerExists + roPoolerMetricsEnabled := poolerMetricsEnabled && roPoolerExists + if err := reconcileConnectionPoolerMetricsService(ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled); err != nil { + return ctrl.Result{}, err + } + if err := reconcileConnectionPoolerMetricsService(ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, roPoolerMetricsEnabled); err != nil { + return ctrl.Result{}, err + } + + if err := reconcileGrafanaDashboardConfigMap(ctx, c, rc.Scheme, postgresCluster, isGrafanaDashboardEnabled(postgresCluster, clusterClass)); err != nil { + return ctrl.Result{}, err + } + + serviceMonitorUnavailableEmitted := false + handleServiceMonitorError := func(err error) (bool, error) { + if err == nil { + return false, nil + } + if !isServiceMonitorUnavailable(err) { + return false, err + } + if !serviceMonitorUnavailableEmitted { + serviceMonitorUnavailableEmitted = true + logger.Info("ServiceMonitor CRD unavailable, continuing without ServiceMonitors") + rc.emitWarning(postgresCluster, EventServiceMonitorUnavailable, + "ServiceMonitor CRD not found; continuing without Prometheus ServiceMonitors") + } + return true, nil + } + + if handled, err := handleServiceMonitorError( + reconcilePostgreSQLMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass)), + ); err != nil { + return ctrl.Result{}, err + } else if handled { + logger.Info("Skipped PostgreSQL ServiceMonitor reconciliation") + } + + if handled, err := handleServiceMonitorError( + 
reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled), + ); err != nil { + return ctrl.Result{}, err + } else if handled { + logger.Info("Skipped RW PgBouncer ServiceMonitor reconciliation") + } + if handled, err := handleServiceMonitorError( + reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, roPoolerMetricsEnabled), + ); err != nil { + return ctrl.Result{}, err + } else if handled { + logger.Info("Skipped RO PgBouncer ServiceMonitor reconciliation") + } + // Reconcile ConfigMap when CNPG cluster is healthy. if cnpgCluster.Status.Phase == cnpgv1.PhaseHealthy { logger.Info("CNPG Cluster healthy, reconciling ConfigMap") diff --git a/pkg/postgresql/cluster/core/dashboards/postgres_observability.json b/pkg/postgresql/cluster/core/dashboards/postgres_observability.json new file mode 100644 index 000000000..bbdf6eda7 --- /dev/null +++ b/pkg/postgresql/cluster/core/dashboards/postgres_observability.json @@ -0,0 +1,136 @@ +{ + "title": "PostgreSQL __CLUSTER_NAME__", + "uid": "pg-__CLUSTER_NAME__", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "timezone": "browser", + "tags": ["postgresql", "cnpg", "pgbouncer"], + "editable": true, + "graphTooltip": 0, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "PostgreSQL Instances", + "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "count(max by (pod) (cnpg_pg_postmaster_start_time_seconds{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", + "refId": "A" + } + ], + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "orientation": "horizontal", + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "RW PgBouncer Pods Up", + "gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "round(sum(max by (pod) 
(cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"})))", + "refId": "A" + } + ], + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "orientation": "horizontal", + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "RO PgBouncer Pods Up", + "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "round(sum(max by (pod) (cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"})))", + "refId": "A" + } + ], + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "orientation": "horizontal", + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Total Database Size", + "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "sum(max by (datname) (cnpg_pg_database_size_bytes{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "orientation": "horizontal", + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "WAL Files by Pod", + "gridPos": { "x": 0, "y": 4, "w": 8, "h": 8 }, + "targets": [ + { + "expr": "round(max by (pod) (cnpg_pg_wal_files_total{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", + "legendFormat": "{{pod}}", + "refId": "A" + } + ] + }, + { + "id": 6, + "type": "timeseries", + "title": "Archived WAL Rate by Pod", + "gridPos": { "x": 8, "y": 4, "w": 8, "h": 8 }, + "targets": [ + { + "expr": "max by (pod) (rate(cnpg_pg_stat_archiver_archived_count{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}[5m]))", + "legendFormat": "{{pod}}", + "refId": "A" + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "PgBouncer Active Clients", + "gridPos": { "x": 16, "y": 4, "w": 8, "h": 8 }, + 
"targets": [ + { + "expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"}))", + "legendFormat": "rw", + "refId": "A" + }, + { + "expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"}))", + "legendFormat": "ro", + "refId": "B" + } + ] + } + ], + "templating": { + "list": [] + }, + "annotations": { + "list": [] + } +} diff --git a/pkg/postgresql/cluster/core/events.go b/pkg/postgresql/cluster/core/events.go index afcfd768e..73ded6cd5 100644 --- a/pkg/postgresql/cluster/core/events.go +++ b/pkg/postgresql/cluster/core/events.go @@ -10,25 +10,26 @@ import ( ) const ( - EventSecretReady = "SecretReady" - EventConfigMapReady = "ConfigMapReady" - EventClusterAdopted = "ClusterAdopted" - EventClusterCreationStarted = "ClusterCreationStarted" - EventClusterUpdateStarted = "ClusterUpdateStarted" - EventClusterReady = "ClusterReady" - EventPoolerCreationStarted = "PoolerCreationStarted" - EventPoolerReady = "PoolerReady" - EventCleanupComplete = "CleanupComplete" - EventClusterClassNotFound = "ClusterClassNotFound" - EventConfigMergeFailed = "ConfigMergeFailed" - EventSecretReconcileFailed = "SecretReconcileFailed" - EventClusterCreateFailed = "ClusterCreateFailed" - EventClusterUpdateFailed = "ClusterUpdateFailed" - EventManagedRolesFailed = "ManagedRolesFailed" - EventPoolerReconcileFailed = "PoolerReconcileFailed" - EventConfigMapReconcileFailed = "ConfigMapReconcileFailed" - EventClusterDegraded = "ClusterDegraded" - EventCleanupFailed = "CleanupFailed" + EventSecretReady = "SecretReady" + EventConfigMapReady = "ConfigMapReady" + EventClusterAdopted = "ClusterAdopted" + EventClusterCreationStarted = "ClusterCreationStarted" + EventClusterUpdateStarted = "ClusterUpdateStarted" + EventClusterReady = "ClusterReady" + EventPoolerCreationStarted = "PoolerCreationStarted" + EventPoolerReady = "PoolerReady" + EventCleanupComplete = "CleanupComplete" + 
EventClusterClassNotFound = "ClusterClassNotFound" + EventConfigMergeFailed = "ConfigMergeFailed" + EventSecretReconcileFailed = "SecretReconcileFailed" + EventClusterCreateFailed = "ClusterCreateFailed" + EventClusterUpdateFailed = "ClusterUpdateFailed" + EventManagedRolesFailed = "ManagedRolesFailed" + EventPoolerReconcileFailed = "PoolerReconcileFailed" + EventConfigMapReconcileFailed = "ConfigMapReconcileFailed" + EventServiceMonitorUnavailable = "ServiceMonitorUnavailable" + EventClusterDegraded = "ClusterDegraded" + EventCleanupFailed = "CleanupFailed" ) func (rc *ReconcileContext) emitNormal(obj client.Object, reason, message string) { diff --git a/pkg/postgresql/cluster/core/monitoring.go b/pkg/postgresql/cluster/core/monitoring.go new file mode 100644 index 000000000..7c942d22a --- /dev/null +++ b/pkg/postgresql/cluster/core/monitoring.go @@ -0,0 +1,614 @@ +package core + +import ( + "context" + _ "embed" + "fmt" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + enterprisev4 "github.com/splunk/splunk-operator/api/v4" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "strings" +) + +const ( + // metrics + postgresMetricsServiceSuffix = "-postgres-metrics" + postgresMetricsPortName = "metrics" + postgresMetricsPort = int32(9187) + poolerMetricsPortName = "metrics" + poolerMetricsPort = int32(9127) + grafanaDashboardConfigMapSuffix = "-grafana-dashboard" + + // labels + labelManagedBy = "app.kubernetes.io/managed-by" + labelManagedByValue = "postgrescluster-controller" + 
labelObservabilityComponent = "enterprise.splunk.com/observability-component" + cnpgClusterLabelName = "cnpg.io/cluster" + cnpgPoolerNameLabel = "cnpg.io/poolerName" + cnpgPodRoleInstance = "instance" + cnpgPodRoleLabelName = "cnpg.io/podRole" + grafanaDashboardLabelKey = "grafana_dashboard" + grafanaDashboardLabelValue = "1" +) + +func isPostgreSQLMetricsEnabled(cluster *enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { + if class == nil || class.Spec.Config == nil || class.Spec.Config.Observability == nil { + return false + } + classCfg := class.Spec.Config.Observability.PostgreSQL + if classCfg == nil || classCfg.Enabled == nil || !*classCfg.Enabled { + return false + } + if cluster == nil || cluster.Spec.Observability == nil || cluster.Spec.Observability.PostgreSQL == nil { + return true + } + override := cluster.Spec.Observability.PostgreSQL.Disabled + return override == nil || !*override +} + +func isConnectionPoolerEnabled(cluster *enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { + if class == nil || class.Spec.Config == nil || class.Spec.Config.ConnectionPoolerEnabled == nil { + return false + } + if !*class.Spec.Config.ConnectionPoolerEnabled { + return false + } + if cluster == nil || cluster.Spec.ConnectionPoolerEnabled == nil { + return true + } + return *cluster.Spec.ConnectionPoolerEnabled +} + +func isConnectionPoolerMetricsEnabled(cluster *enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { + if !isConnectionPoolerEnabled(cluster, class) { + return false + } + if class == nil || class.Spec.Config == nil || class.Spec.Config.Observability == nil { + return false + } + classCfg := class.Spec.Config.Observability.PgBouncer + if classCfg == nil || classCfg.Enabled == nil || !*classCfg.Enabled { + return false + } + if cluster == nil || cluster.Spec.Observability == nil || cluster.Spec.Observability.PgBouncer == nil { + return true + } + override := 
cluster.Spec.Observability.PgBouncer.Disabled + return override == nil || !*override +} + +func isGrafanaDashboardEnabled(cluster *enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { + if class == nil || class.Spec.Config == nil || class.Spec.Config.Observability == nil { + return false + } + classCfg := class.Spec.Config.Observability.GrafanaDashboard + if classCfg == nil || classCfg.Enabled == nil || !*classCfg.Enabled { + return false + } + if cluster == nil || cluster.Spec.Observability == nil || cluster.Spec.Observability.GrafanaDashboard == nil { + return true + } + override := cluster.Spec.Observability.GrafanaDashboard.Disabled + return override == nil || !*override +} + +func buildPostgreSQLMetricsService(scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster) (*corev1.Service, error) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: cluster.Name + postgresMetricsServiceSuffix, + Namespace: cluster.Namespace, + Labels: map[string]string{ + labelManagedBy: labelManagedByValue, + labelObservabilityComponent: "postgresql-metrics", + cnpgClusterLabelName: cluster.Name, + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Selector: map[string]string{ + cnpgClusterLabelName: cluster.Name, + cnpgPodRoleLabelName: cnpgPodRoleInstance, + }, + Ports: []corev1.ServicePort{ + { + Name: postgresMetricsPortName, + Port: postgresMetricsPort, + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromString(postgresMetricsPortName), + }, + }, + }, + } + + if err := ctrl.SetControllerReference(cluster, svc, scheme); err != nil { + return nil, fmt.Errorf("setting controller reference on PostgreSQL metrics Service: %w", err) + } + + return svc, nil +} + +func poolerMetricsServiceName(clusterName, poolerType string) string { + return fmt.Sprintf("%s-pooler-%s-metrics", clusterName, poolerType) +} +func buildConnectionPoolerMetricsService( + scheme *runtime.Scheme, + cluster 
*enterprisev4.PostgresCluster, + poolerType string, +) (*corev1.Service, error) { + poolerName := poolerResourceName(cluster.Name, poolerType) + + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: poolerMetricsServiceName(cluster.Name, poolerType), + Namespace: cluster.Namespace, + Labels: map[string]string{ + labelManagedBy: labelManagedByValue, + labelObservabilityComponent: "pgbouncer-metrics", + cnpgClusterLabelName: cluster.Name, + cnpgPoolerNameLabel: poolerName, + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Selector: map[string]string{ + cnpgPoolerNameLabel: poolerName, + }, + Ports: []corev1.ServicePort{ + { + Name: poolerMetricsPortName, + Port: poolerMetricsPort, + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromString(poolerMetricsPortName), + }, + }, + }, + } + + if err := ctrl.SetControllerReference(cluster, svc, scheme); err != nil { + return nil, fmt.Errorf("setting controller reference on PgBouncer metrics Service: %w", err) + } + + return svc, nil +} + +func buildGrafanaDashboardConfigMap(scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster) (*corev1.ConfigMap, error) { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cluster.Name + grafanaDashboardConfigMapSuffix, + Namespace: cluster.Namespace, + Labels: map[string]string{ + labelManagedBy: labelManagedByValue, + labelObservabilityComponent: "grafana-dashboard", + cnpgClusterLabelName: cluster.Name, + grafanaDashboardLabelKey: grafanaDashboardLabelValue, + }, + }, + Data: map[string]string{ + "dashboard.json": buildBasicGrafanaDashboard(cluster), + }, + } + + if err := ctrl.SetControllerReference(cluster, cm, scheme); err != nil { + return nil, fmt.Errorf("setting controller reference on Grafana dashboard ConfigMap: %w", err) + } + + return cm, nil +} + +func isServiceMonitorUnavailable(err error) bool { + if err == nil { + return false + } + + if apierrors.IsNotFound(err) || apimeta.IsNoMatchError(err) { + return 
true + } + + // Fall back to the missing-kind message for errors that hide their type. Do not match the bare + // resource name: that would also swallow Forbidden and Conflict errors on ServiceMonitors. + return strings.Contains(err.Error(), "no matches for kind \"ServiceMonitor\"") +} + +func reconcilePostgreSQLMetricsService(ctx context.Context, c client.Client, scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster, enabled bool) error { + logger := log.FromContext(ctx) + serviceName := cluster.Name + postgresMetricsServiceSuffix + + if !enabled { + existing := &corev1.Service{} + err := c.Get(ctx, types.NamespacedName{Name: serviceName, Namespace: cluster.Namespace}, existing) + switch { + case apierrors.IsNotFound(err): + return nil + case err != nil: + return fmt.Errorf("getting PostgreSQL metrics Service %s: %w", serviceName, err) + } + + logger.Info("Deleting PostgreSQL metrics Service", "name", serviceName) + if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting PostgreSQL metrics Service %s: %w", serviceName, err) + } + return nil + } + + desired, err := buildPostgreSQLMetricsService(scheme, cluster) + if err != nil { + return fmt.Errorf("building PostgreSQL metrics Service: %w", err) + } + + live := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: desired.Name, + Namespace: desired.Namespace, + }, + } + + _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { + live.Labels = desired.Labels + live.Annotations = desired.Annotations + live.Spec.Type = desired.Spec.Type + live.Spec.Selector = desired.Spec.Selector + live.Spec.Ports = desired.Spec.Ports + + if !metav1.IsControlledBy(live, cluster) { + if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { + return fmt.Errorf("setting controller reference on PostgreSQL metrics Service: %w", err) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("reconciling PostgreSQL metrics Service %s: %w", desired.Name, err) + } + + return nil +} + +func reconcileConnectionPoolerMetricsService( + ctx context.Context, 
+ c client.Client, + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, + poolerType string, + enabled bool, +) error { + logger := log.FromContext(ctx) + serviceName := poolerMetricsServiceName(cluster.Name, poolerType) + + if !enabled { + existing := &corev1.Service{} + err := c.Get(ctx, types.NamespacedName{Name: serviceName, Namespace: cluster.Namespace}, existing) + switch { + case apierrors.IsNotFound(err): + return nil + case err != nil: + return fmt.Errorf("getting PgBouncer metrics Service %s: %w", serviceName, err) + } + + logger.Info("Deleting PgBouncer metrics Service", "name", serviceName, "poolerType", poolerType) + if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting PgBouncer metrics Service %s: %w", serviceName, err) + } + return nil + } + + desired, err := buildConnectionPoolerMetricsService(scheme, cluster, poolerType) + if err != nil { + return fmt.Errorf("building PgBouncer metrics Service for %s pooler: %w", poolerType, err) + } + + live := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: desired.Name, + Namespace: desired.Namespace, + }, + } + + _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { + live.Labels = desired.Labels + live.Annotations = desired.Annotations + live.Spec.Type = desired.Spec.Type + live.Spec.Selector = desired.Spec.Selector + live.Spec.Ports = desired.Spec.Ports + + if !metav1.IsControlledBy(live, cluster) { + if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { + return fmt.Errorf("setting controller reference on PgBouncer metrics Service: %w", err) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("reconciling PgBouncer metrics Service %s: %w", desired.Name, err) + } + + return nil +} + +func reconcileGrafanaDashboardConfigMap( + ctx context.Context, + c client.Client, + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, + enabled bool, +) error { + logger := 
log.FromContext(ctx) + configMapName := cluster.Name + grafanaDashboardConfigMapSuffix + + if !enabled { + existing := &corev1.ConfigMap{} + err := c.Get(ctx, types.NamespacedName{Name: configMapName, Namespace: cluster.Namespace}, existing) + switch { + case apierrors.IsNotFound(err): + return nil + case err != nil: + return fmt.Errorf("getting Grafana dashboard ConfigMap %s: %w", configMapName, err) + } + + logger.Info("Deleting Grafana dashboard ConfigMap", "name", configMapName) + if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting Grafana dashboard ConfigMap %s: %w", configMapName, err) + } + return nil + } + + desired, err := buildGrafanaDashboardConfigMap(scheme, cluster) + if err != nil { + return fmt.Errorf("building Grafana dashboard ConfigMap: %w", err) + } + + live := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: desired.Name, + Namespace: desired.Namespace, + }, + } + + _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { + live.Labels = desired.Labels + live.Annotations = desired.Annotations + live.Data = desired.Data + + if !metav1.IsControlledBy(live, cluster) { + if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { + return fmt.Errorf("setting controller reference on Grafana dashboard ConfigMap: %w", err) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("reconciling Grafana dashboard ConfigMap %s: %w", desired.Name, err) + } + + return nil +} + +func postgresMetricsServiceMonitorName(clusterName string) string { + return clusterName + "-postgres-metrics-monitor" +} + +func poolerMetricsServiceMonitorName(clusterName, poolerType string) string { + return fmt.Sprintf("%s-pooler-%s-metrics-monitor", clusterName, poolerType) +} + +func buildPostgreSQLMetricsServiceMonitor( + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, +) (*monitoringv1.ServiceMonitor, error) { + sm := &monitoringv1.ServiceMonitor{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: postgresMetricsServiceMonitorName(cluster.Name), + Namespace: cluster.Namespace, + Labels: map[string]string{ + labelManagedBy: labelManagedByValue, + labelObservabilityComponent: "postgresql-metrics", + cnpgClusterLabelName: cluster.Name, + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + labelObservabilityComponent: "postgresql-metrics", + cnpgClusterLabelName: cluster.Name, + }, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Port: postgresMetricsPortName, + Path: "/metrics", + Scheme: "http", + }, + }, + }, + } + + if err := ctrl.SetControllerReference(cluster, sm, scheme); err != nil { + return nil, fmt.Errorf("setting controller reference on PostgreSQL ServiceMonitor: %w", err) + } + + return sm, nil +} + +func buildConnectionPoolerMetricsServiceMonitor( + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, + poolerType string, +) (*monitoringv1.ServiceMonitor, error) { + poolerName := poolerResourceName(cluster.Name, poolerType) + + sm := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: poolerMetricsServiceMonitorName(cluster.Name, poolerType), + Namespace: cluster.Namespace, + Labels: map[string]string{ + labelManagedBy: labelManagedByValue, + labelObservabilityComponent: "pgbouncer-metrics", + cnpgClusterLabelName: cluster.Name, + cnpgPoolerNameLabel: poolerName, + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + labelObservabilityComponent: "pgbouncer-metrics", + cnpgClusterLabelName: cluster.Name, + cnpgPoolerNameLabel: poolerName, + }, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Port: poolerMetricsPortName, + Path: "/metrics", + Scheme: "http", + }, + }, + }, + } + + if err := ctrl.SetControllerReference(cluster, sm, scheme); err != nil { + return nil, fmt.Errorf("setting controller reference on PgBouncer ServiceMonitor: %w", err) + } + + return sm, 
nil +} + +func reconcilePostgreSQLMetricsServiceMonitor( + ctx context.Context, + c client.Client, + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, + enabled bool, +) error { + logger := log.FromContext(ctx) + name := postgresMetricsServiceMonitorName(cluster.Name) + + if !enabled { + existing := &monitoringv1.ServiceMonitor{} + err := c.Get(ctx, types.NamespacedName{Name: name, Namespace: cluster.Namespace}, existing) + switch { + case apierrors.IsNotFound(err): + return nil + case err != nil: + return fmt.Errorf("getting PostgreSQL ServiceMonitor %s: %w", name, err) + } + + logger.Info("Deleting PostgreSQL ServiceMonitor", "name", name) + if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting PostgreSQL ServiceMonitor %s: %w", name, err) + } + return nil + } + + desired, err := buildPostgreSQLMetricsServiceMonitor(scheme, cluster) + if err != nil { + return fmt.Errorf("building PostgreSQL ServiceMonitor: %w", err) + } + + live := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: desired.Name, + Namespace: desired.Namespace, + }, + } + + _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { + live.Labels = desired.Labels + live.Annotations = desired.Annotations + live.Spec = desired.Spec + + if !metav1.IsControlledBy(live, cluster) { + if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { + return fmt.Errorf("setting controller reference on PostgreSQL ServiceMonitor: %w", err) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("reconciling PostgreSQL ServiceMonitor %s: %w", desired.Name, err) + } + + return nil +} + +func reconcileConnectionPoolerMetricsServiceMonitor( + ctx context.Context, + c client.Client, + scheme *runtime.Scheme, + cluster *enterprisev4.PostgresCluster, + poolerType string, + enabled bool, +) error { + logger := log.FromContext(ctx) + name := poolerMetricsServiceMonitorName(cluster.Name, poolerType) + + 
if !enabled { + existing := &monitoringv1.ServiceMonitor{} + err := c.Get(ctx, types.NamespacedName{Name: name, Namespace: cluster.Namespace}, existing) + switch { + case apierrors.IsNotFound(err): + return nil + case err != nil: + return fmt.Errorf("getting PgBouncer ServiceMonitor %s: %w", name, err) + } + + logger.Info("Deleting PgBouncer ServiceMonitor", "name", name, "poolerType", poolerType) + if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting PgBouncer ServiceMonitor %s: %w", name, err) + } + return nil + } + + desired, err := buildConnectionPoolerMetricsServiceMonitor(scheme, cluster, poolerType) + if err != nil { + return fmt.Errorf("building PgBouncer ServiceMonitor for %s pooler: %w", poolerType, err) + } + + live := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: desired.Name, + Namespace: desired.Namespace, + }, + } + + _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { + live.Labels = desired.Labels + live.Annotations = desired.Annotations + live.Spec = desired.Spec + + if !metav1.IsControlledBy(live, cluster) { + if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { + return fmt.Errorf("setting controller reference on PgBouncer ServiceMonitor: %w", err) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("reconciling PgBouncer ServiceMonitor %s: %w", desired.Name, err) + } + + return nil +} + +//go:embed dashboards/postgres_observability.json +var postgresObservabilityDashboardTemplate string + +func buildBasicGrafanaDashboard(cluster *enterprisev4.PostgresCluster) string { + replacer := strings.NewReplacer( + "__CLUSTER_NAME__", cluster.Name, + "__NAMESPACE__", cluster.Namespace, + "__POSTGRES_SERVICE__", cluster.Name+postgresMetricsServiceSuffix, + "__RW_POOLER_SERVICE__", poolerMetricsServiceName(cluster.Name, readWriteEndpoint), + "__RO_POOLER_SERVICE__", poolerMetricsServiceName(cluster.Name, readOnlyEndpoint), + ) + 
+ return replacer.Replace(postgresObservabilityDashboardTemplate) +} diff --git a/pkg/postgresql/cluster/core/monitoring_unit_test.go b/pkg/postgresql/cluster/core/monitoring_unit_test.go new file mode 100644 index 000000000..6c1d0715a --- /dev/null +++ b/pkg/postgresql/cluster/core/monitoring_unit_test.go @@ -0,0 +1,472 @@ +package core + +import ( + "encoding/json" + "errors" + "testing" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + enterprisev4 "github.com/splunk/splunk-operator/api/v4" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" +) + +func TestIsPostgreSQLMetricsEnabled(t *testing.T) { + tests := []struct { + name string + cluster *enterprisev4.PostgresCluster + class *enterprisev4.PostgresClusterClass + want bool + }{ + { + name: "disabled when class observability is absent", + class: &enterprisev4.PostgresClusterClass{ + Spec: enterprisev4.PostgresClusterClassSpec{ + Config: &enterprisev4.PostgresClusterClassConfig{}, + }, + }, + want: false, + }, + { + name: "enabled when class enables and cluster override is unset", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + ptr.To(true), + nil, + nil, + nil, + ), + want: true, + }, + { + name: "disabled when cluster override disables", + cluster: &enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + Observability: &enterprisev4.PostgresObservabilityOverride{ + PostgreSQL: &enterprisev4.FeatureDisableOverride{Disabled: ptr.To(true)}, + }, + }, + }, + class: newClassWithObservability( + ptr.To(true), + nil, + nil, + nil, + ), + want: false, + }, + { + name: "disabled when class disables even if cluster has override struct", + cluster: 
&enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + Observability: &enterprisev4.PostgresObservabilityOverride{ + PostgreSQL: &enterprisev4.FeatureDisableOverride{Disabled: ptr.To(false)}, + }, + }, + }, + class: newClassWithObservability( + ptr.To(false), + nil, + nil, + nil, + ), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isPostgreSQLMetricsEnabled(tt.cluster, tt.class) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestIsConnectionPoolerEnabled(t *testing.T) { + tests := []struct { + name string + cluster *enterprisev4.PostgresCluster + class *enterprisev4.PostgresClusterClass + want bool + }{ + { + name: "disabled when class config is absent", + class: &enterprisev4.PostgresClusterClass{}, + want: false, + }, + { + name: "inherits enabled class setting when cluster override is unset", + cluster: &enterprisev4.PostgresCluster{}, + class: &enterprisev4.PostgresClusterClass{ + Spec: enterprisev4.PostgresClusterClassSpec{ + Config: &enterprisev4.PostgresClusterClassConfig{ + ConnectionPoolerEnabled: ptr.To(true), + }, + }, + }, + want: true, + }, + { + name: "cluster can disable class enabled pooler", + cluster: &enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + ConnectionPoolerEnabled: ptr.To(false), + }, + }, + class: &enterprisev4.PostgresClusterClass{ + Spec: enterprisev4.PostgresClusterClassSpec{ + Config: &enterprisev4.PostgresClusterClassConfig{ + ConnectionPoolerEnabled: ptr.To(true), + }, + }, + }, + want: false, + }, + { + name: "class disabled wins", + cluster: &enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + ConnectionPoolerEnabled: ptr.To(true), + }, + }, + class: &enterprisev4.PostgresClusterClass{ + Spec: enterprisev4.PostgresClusterClassSpec{ + Config: &enterprisev4.PostgresClusterClassConfig{ + ConnectionPoolerEnabled: ptr.To(false), + }, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + got := isConnectionPoolerEnabled(tt.cluster, tt.class) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestIsConnectionPoolerMetricsEnabled(t *testing.T) { + tests := []struct { + name string + cluster *enterprisev4.PostgresCluster + class *enterprisev4.PostgresClusterClass + want bool + }{ + { + name: "disabled when pooler itself is disabled", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + nil, + ptr.To(true), + nil, + ptr.To(false), + ), + want: false, + }, + { + name: "enabled when pooler and pgbouncer metrics are enabled", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + nil, + ptr.To(true), + ptr.To(true), + ptr.To(true), + ), + want: true, + }, + { + name: "disabled when cluster override disables pgbouncer metrics", + cluster: &enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + Observability: &enterprisev4.PostgresObservabilityOverride{ + PgBouncer: &enterprisev4.FeatureDisableOverride{Disabled: ptr.To(true)}, + }, + }, + }, + class: newClassWithObservability( + nil, + ptr.To(true), + ptr.To(true), + ptr.To(true), + ), + want: false, + }, + { + name: "disabled when class disables pgbouncer metrics", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + nil, + ptr.To(true), + ptr.To(false), + ptr.To(true), + ), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isConnectionPoolerMetricsEnabled(tt.cluster, tt.class) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestIsGrafanaDashboardEnabled(t *testing.T) { + tests := []struct { + name string + cluster *enterprisev4.PostgresCluster + class *enterprisev4.PostgresClusterClass + want bool + }{ + { + name: "enabled when class enables and cluster override is unset", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + nil, + nil, + nil, + ptr.To(true), + ), + want: true, + 
}, + { + name: "disabled when cluster override disables dashboard", + cluster: &enterprisev4.PostgresCluster{ + Spec: enterprisev4.PostgresClusterSpec{ + Observability: &enterprisev4.PostgresObservabilityOverride{ + GrafanaDashboard: &enterprisev4.FeatureDisableOverride{Disabled: ptr.To(true)}, + }, + }, + }, + class: newClassWithObservability( + nil, + nil, + nil, + ptr.To(true), + ), + want: false, + }, + { + name: "disabled when class disables dashboard", + cluster: &enterprisev4.PostgresCluster{}, + class: newClassWithObservability( + nil, + nil, + nil, + ptr.To(false), + ), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isGrafanaDashboardEnabled(tt.cluster, tt.class) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestBuildPostgreSQLMetricsService(t *testing.T) { + scheme := newMonitoringTestScheme(t) + cluster := newTestMonitoringCluster() + + svc, err := buildPostgreSQLMetricsService(scheme, cluster) + require.NoError(t, err) + + assert.Equal(t, "postgresql-cluster-dev-postgres-metrics", svc.Name) + assert.Equal(t, cluster.Namespace, svc.Namespace) + assert.Equal(t, "postgresql-metrics", svc.Labels[labelObservabilityComponent]) + assert.Equal(t, cluster.Name, svc.Labels[cnpgClusterLabelName]) + assert.Equal(t, cluster.Name, svc.Spec.Selector[cnpgClusterLabelName]) + assert.Equal(t, cnpgPodRoleInstance, svc.Spec.Selector[cnpgPodRoleLabelName]) + require.Len(t, svc.Spec.Ports, 1) + assert.Equal(t, postgresMetricsPortName, svc.Spec.Ports[0].Name) + assert.Equal(t, postgresMetricsPort, svc.Spec.Ports[0].Port) + assert.Equal(t, postgresMetricsPortName, svc.Spec.Ports[0].TargetPort.StrVal) + assertMonitoringOwnerRef(t, svc.OwnerReferences, cluster) +} + +func TestBuildConnectionPoolerMetricsService(t *testing.T) { + scheme := newMonitoringTestScheme(t) + cluster := newTestMonitoringCluster() + + svc, err := buildConnectionPoolerMetricsService(scheme, cluster, readWriteEndpoint) + require.NoError(t, err) + 
+ assert.Equal(t, "postgresql-cluster-dev-pooler-rw-metrics", svc.Name) + assert.Equal(t, "pgbouncer-metrics", svc.Labels[labelObservabilityComponent]) + assert.Equal(t, poolerResourceName(cluster.Name, readWriteEndpoint), svc.Labels[cnpgPoolerNameLabel]) + assert.Equal(t, poolerResourceName(cluster.Name, readWriteEndpoint), svc.Spec.Selector[cnpgPoolerNameLabel]) + require.Len(t, svc.Spec.Ports, 1) + assert.Equal(t, poolerMetricsPortName, svc.Spec.Ports[0].Name) + assert.Equal(t, poolerMetricsPort, svc.Spec.Ports[0].Port) + assert.Equal(t, poolerMetricsPortName, svc.Spec.Ports[0].TargetPort.StrVal) + assertMonitoringOwnerRef(t, svc.OwnerReferences, cluster) +} + +func TestBuildGrafanaDashboardConfigMap(t *testing.T) { + scheme := newMonitoringTestScheme(t) + cluster := newTestMonitoringCluster() + + cm, err := buildGrafanaDashboardConfigMap(scheme, cluster) + require.NoError(t, err) + + assert.Equal(t, "postgresql-cluster-dev-grafana-dashboard", cm.Name) + assert.Equal(t, "grafana-dashboard", cm.Labels[labelObservabilityComponent]) + assert.Equal(t, grafanaDashboardLabelValue, cm.Labels[grafanaDashboardLabelKey]) + assert.Contains(t, cm.Data, "dashboard.json") + assert.NotContains(t, cm.Data["dashboard.json"], "__CLUSTER_NAME__") + assert.Contains(t, cm.Data["dashboard.json"], cluster.Name) + assert.Contains(t, cm.Data["dashboard.json"], cluster.Namespace) + assert.Contains(t, cm.Data["dashboard.json"], cluster.Name+postgresMetricsServiceSuffix) + assert.Contains(t, cm.Data["dashboard.json"], poolerMetricsServiceName(cluster.Name, readWriteEndpoint)) + assert.Contains(t, cm.Data["dashboard.json"], poolerMetricsServiceName(cluster.Name, readOnlyEndpoint)) + + var dashboard map[string]any + require.NoError(t, json.Unmarshal([]byte(cm.Data["dashboard.json"]), &dashboard)) + assertMonitoringOwnerRef(t, cm.OwnerReferences, cluster) +} + +func TestBuildPostgreSQLMetricsServiceMonitor(t *testing.T) { + scheme := newMonitoringTestScheme(t) + cluster := 
newTestMonitoringCluster() + + sm, err := buildPostgreSQLMetricsServiceMonitor(scheme, cluster) + require.NoError(t, err) + + assert.Equal(t, "postgresql-cluster-dev-postgres-metrics-monitor", sm.Name) + assert.Equal(t, "postgresql-metrics", sm.Labels[labelObservabilityComponent]) + assert.Equal(t, cluster.Name, sm.Spec.Selector.MatchLabels[cnpgClusterLabelName]) + require.Len(t, sm.Spec.Endpoints, 1) + assert.Equal(t, postgresMetricsPortName, sm.Spec.Endpoints[0].Port) + assert.Equal(t, "/metrics", sm.Spec.Endpoints[0].Path) + assert.Equal(t, "http", sm.Spec.Endpoints[0].Scheme) + assertMonitoringOwnerRef(t, sm.OwnerReferences, cluster) +} + +func TestBuildConnectionPoolerMetricsServiceMonitor(t *testing.T) { + scheme := newMonitoringTestScheme(t) + cluster := newTestMonitoringCluster() + + sm, err := buildConnectionPoolerMetricsServiceMonitor(scheme, cluster, readOnlyEndpoint) + require.NoError(t, err) + + assert.Equal(t, "postgresql-cluster-dev-pooler-ro-metrics-monitor", sm.Name) + assert.Equal(t, "pgbouncer-metrics", sm.Labels[labelObservabilityComponent]) + assert.Equal(t, poolerResourceName(cluster.Name, readOnlyEndpoint), sm.Labels[cnpgPoolerNameLabel]) + assert.Equal(t, poolerResourceName(cluster.Name, readOnlyEndpoint), sm.Spec.Selector.MatchLabels[cnpgPoolerNameLabel]) + require.Len(t, sm.Spec.Endpoints, 1) + assert.Equal(t, poolerMetricsPortName, sm.Spec.Endpoints[0].Port) + assert.Equal(t, "/metrics", sm.Spec.Endpoints[0].Path) + assert.Equal(t, "http", sm.Spec.Endpoints[0].Scheme) + assertMonitoringOwnerRef(t, sm.OwnerReferences, cluster) +} + +func TestIsServiceMonitorUnavailable(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "nil error", + err: nil, + want: false, + }, + { + name: "not found error", + err: apierrors.NewNotFound(schema.GroupResource{Group: "monitoring.coreos.com", Resource: "servicemonitors"}, "test"), + want: true, + }, + { + name: "kind match string error", + err: errors.New("no matches 
for kind \"ServiceMonitor\" in version \"monitoring.coreos.com/v1\""), + want: true, + }, + { + name: "resource string error", + err: errors.New("servicemonitors.monitoring.coreos.com not found"), + want: true, + }, + { + name: "unrelated error", + err: errors.New("boom"), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isServiceMonitorUnavailable(tt.err) + assert.Equal(t, tt.want, got) + }) + } +} + +func newMonitoringTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, monitoringv1.AddToScheme(scheme)) + require.NoError(t, enterprisev4.AddToScheme(scheme)) + + return scheme +} + +func newTestMonitoringCluster() *enterprisev4.PostgresCluster { + return &enterprisev4.PostgresCluster{ + TypeMeta: metav1.TypeMeta{ + APIVersion: enterprisev4.GroupVersion.String(), + Kind: "PostgresCluster", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "postgresql-cluster-dev", + Namespace: "test", + UID: "cluster-uid", + }, + } +} + +func newClassWithObservability( + postgresEnabled *bool, + poolerEnabled *bool, + pgBouncerMetricsEnabled *bool, + grafanaEnabled *bool, +) *enterprisev4.PostgresClusterClass { + return &enterprisev4.PostgresClusterClass{ + Spec: enterprisev4.PostgresClusterClassSpec{ + Config: &enterprisev4.PostgresClusterClassConfig{ + ConnectionPoolerEnabled: poolerEnabled, + Observability: &enterprisev4.PostgresObservabilityClassConfig{ + PostgreSQL: &enterprisev4.MetricsClassConfig{Enabled: postgresEnabled}, + PgBouncer: &enterprisev4.MetricsClassConfig{Enabled: pgBouncerMetricsEnabled}, + GrafanaDashboard: &enterprisev4.GrafanaDashboardClassConfig{Enabled: grafanaEnabled}, + }, + }, + }, + } +} + +func assertMonitoringOwnerRef(t *testing.T, ownerRefs []metav1.OwnerReference, cluster *enterprisev4.PostgresCluster) { + t.Helper() + + require.Len(t, ownerRefs, 1) + assert.Equal(t, cluster.APIVersion, 
ownerRefs[0].APIVersion) + assert.Equal(t, cluster.Kind, ownerRefs[0].Kind) + assert.Equal(t, cluster.Name, ownerRefs[0].Name) + assert.Equal(t, cluster.UID, ownerRefs[0].UID) + require.NotNil(t, ownerRefs[0].Controller) + assert.True(t, *ownerRefs[0].Controller) +} From aa8924f649d26d477560caa5b4f156f1a2f5b1d6 Mon Sep 17 00:00:00 2001 From: dpishchenkov Date: Thu, 2 Apr 2026 16:46:17 +0200 Subject: [PATCH 2/4] removed grafana dashboard from code --- api/v4/postgrescluster_types.go | 5 +- api/v4/postgresclusterclass_types.go | 13 +- ...nterprise_v4_postgresclusterclass_dev.yaml | 3 +- pkg/postgresql/cluster/core/cluster.go | 39 +---- .../dashboards/postgres_observability.json | 136 ----------------- pkg/postgresql/cluster/core/events.go | 39 +++-- pkg/postgresql/cluster/core/monitoring.go | 143 +----------------- .../cluster/core/monitoring_unit_test.go | 137 +---------------- 8 files changed, 37 insertions(+), 478 deletions(-) delete mode 100644 pkg/postgresql/cluster/core/dashboards/postgres_observability.json diff --git a/api/v4/postgrescluster_types.go b/api/v4/postgrescluster_types.go index a1445c03b..8d42d5c1d 100644 --- a/api/v4/postgrescluster_types.go +++ b/api/v4/postgrescluster_types.go @@ -109,7 +109,7 @@ type PostgresClusterSpec struct { // +optional ClusterDeletionPolicy *string `json:"clusterDeletionPolicy,omitempty"` - // Observability contains configuration for monitoring and observability features. + // Observability contains configuration for metrics exposure features. 
// +optional Observability *PostgresObservabilityOverride `json:"observability,omitempty"` } @@ -122,9 +122,6 @@ type PostgresObservabilityOverride struct { // +optional PgBouncer *FeatureDisableOverride `json:"pgbouncer,omitempty"` - - // +optional - GrafanaDashboard *FeatureDisableOverride `json:"grafanaDashboard,omitempty"` } type FeatureDisableOverride struct { diff --git a/api/v4/postgresclusterclass_types.go b/api/v4/postgresclusterclass_types.go index 743e98722..74085d191 100644 --- a/api/v4/postgresclusterclass_types.go +++ b/api/v4/postgresclusterclass_types.go @@ -100,8 +100,8 @@ type PostgresClusterClassConfig struct { // +optional ConnectionPoolerEnabled *bool `json:"connectionPoolerEnabled,omitempty"` - // Observability contains configuration for metrics and dashboards. - // When enabled, creates metrics resources and Grafana dashboard for clusters using this class. + // Observability contains configuration for metrics exposure. + // When enabled, creates metrics resources for clusters using this class. // Can be overridden in PostgresCluster CR. // +kubebuilder:default={} // +optional @@ -184,8 +184,6 @@ type PostgresObservabilityClassConfig struct { PostgreSQL *MetricsClassConfig `json:"postgresql,omitempty"` // +optional PgBouncer *MetricsClassConfig `json:"pgbouncer,omitempty"` - // +optional - GrafanaDashboard *GrafanaDashboardClassConfig `json:"grafanaDashboard,omitempty"` } type MetricsClassConfig struct { @@ -195,13 +193,6 @@ type MetricsClassConfig struct { Enabled *bool `json:"enabled,omitempty"` } -type GrafanaDashboardClassConfig struct { - // Enabled controls whether a Grafana dashboard ConfigMap should be created for this class. 
- // +kubebuilder:default=false - // +optional - Enabled *bool `json:"enabled,omitempty"` -} - // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster diff --git a/config/samples/enterprise_v4_postgresclusterclass_dev.yaml b/config/samples/enterprise_v4_postgresclusterclass_dev.yaml index 560958794..082d5fad9 100644 --- a/config/samples/enterprise_v4_postgresclusterclass_dev.yaml +++ b/config/samples/enterprise_v4_postgresclusterclass_dev.yaml @@ -28,7 +28,7 @@ spec: memory: "2Gi" connectionPoolerEnabled: true observability: - grafanaDashboard: + postgresql: enabled: true pgbouncer: enabled: true @@ -41,4 +41,3 @@ spec: mode: transaction config: max_client_conn: "100" - diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go index f7e8ba5d7..77846a8e4 100644 --- a/pkg/postgresql/cluster/core/cluster.go +++ b/pkg/postgresql/cluster/core/cluster.go @@ -410,48 +410,21 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl. 
return ctrl.Result{}, err } - if err := reconcileGrafanaDashboardConfigMap(ctx, c, rc.Scheme, postgresCluster, isGrafanaDashboardEnabled(postgresCluster, clusterClass)); err != nil { - return ctrl.Result{}, err - } - - serviceMonitorUnavailableEmitted := false - handleServiceMonitorError := func(err error) (bool, error) { - if err == nil { - return false, nil - } - if !isServiceMonitorUnavailable(err) { - return false, err - } - if !serviceMonitorUnavailableEmitted { - serviceMonitorUnavailableEmitted = true - logger.Info("ServiceMonitor CRD unavailable, continuing without ServiceMonitors") - rc.emitWarning(postgresCluster, EventServiceMonitorUnavailable, - "ServiceMonitor CRD not found; continuing without Prometheus ServiceMonitors") - } - return true, nil - } - - if handled, err := handleServiceMonitorError( - reconcilePostgreSQLMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass)), + if err := reconcilePostgreSQLMetricsServiceMonitor( + ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass), ); err != nil { return ctrl.Result{}, err - } else if handled { - logger.Info("Skipped PostgreSQL ServiceMonitor reconciliation") } - if handled, err := handleServiceMonitorError( - reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled), + if err := reconcileConnectionPoolerMetricsServiceMonitor( + ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled, ); err != nil { return ctrl.Result{}, err - } else if handled { - logger.Info("Skipped RW PgBouncer ServiceMonitor reconciliation") } - if handled, err := handleServiceMonitorError( - reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, roPoolerMetricsEnabled), + if err := reconcileConnectionPoolerMetricsServiceMonitor( + ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, 
roPoolerMetricsEnabled, ); err != nil { return ctrl.Result{}, err - } else if handled { - logger.Info("Skipped RO PgBouncer ServiceMonitor reconciliation") } // Reconcile ConfigMap when CNPG cluster is healthy. diff --git a/pkg/postgresql/cluster/core/dashboards/postgres_observability.json b/pkg/postgresql/cluster/core/dashboards/postgres_observability.json deleted file mode 100644 index bbdf6eda7..000000000 --- a/pkg/postgresql/cluster/core/dashboards/postgres_observability.json +++ /dev/null @@ -1,136 +0,0 @@ -{ - "title": "PostgreSQL __CLUSTER_NAME__", - "uid": "pg-__CLUSTER_NAME__", - "schemaVersion": 39, - "version": 1, - "refresh": "30s", - "timezone": "browser", - "tags": ["postgresql", "cnpg", "pgbouncer"], - "editable": true, - "graphTooltip": 0, - "panels": [ - { - "id": 1, - "type": "stat", - "title": "PostgreSQL Instances", - "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 }, - "targets": [ - { - "expr": "count(max by (pod) (cnpg_pg_postmaster_start_time_seconds{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", - "refId": "A" - } - ], - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "RW PgBouncer Pods Up", - "gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 }, - "targets": [ - { - "expr": "round(sum(max by (pod) (cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"})))", - "refId": "A" - } - ], - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "RO PgBouncer Pods Up", - "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 }, - "targets": [ - { - "expr": "round(sum(max by (pod) (cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"})))", - "refId": "A" - } - ], - "options": { - "reduceOptions": { "calcs": 
["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Total Database Size", - "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 }, - "targets": [ - { - "expr": "sum(max by (datname) (cnpg_pg_database_size_bytes{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "bytes" - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - } - }, - { - "id": 5, - "type": "timeseries", - "title": "WAL Files by Pod", - "gridPos": { "x": 0, "y": 4, "w": 8, "h": 8 }, - "targets": [ - { - "expr": "round(max by (pod) (cnpg_pg_wal_files_total{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))", - "legendFormat": "{{pod}}", - "refId": "A" - } - ] - }, - { - "id": 6, - "type": "timeseries", - "title": "Archived WAL Rate by Pod", - "gridPos": { "x": 8, "y": 4, "w": 8, "h": 8 }, - "targets": [ - { - "expr": "max by (pod) (rate(cnpg_pg_stat_archiver_archived_count{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}[5m]))", - "legendFormat": "{{pod}}", - "refId": "A" - } - ] - }, - { - "id": 7, - "type": "timeseries", - "title": "PgBouncer Active Clients", - "gridPos": { "x": 16, "y": 4, "w": 8, "h": 8 }, - "targets": [ - { - "expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"}))", - "legendFormat": "rw", - "refId": "A" - }, - { - "expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"}))", - "legendFormat": "ro", - "refId": "B" - } - ] - } - ], - "templating": { - "list": [] - }, - "annotations": { - "list": [] - } -} diff --git a/pkg/postgresql/cluster/core/events.go b/pkg/postgresql/cluster/core/events.go index 73ded6cd5..afcfd768e 100644 --- 
a/pkg/postgresql/cluster/core/events.go +++ b/pkg/postgresql/cluster/core/events.go @@ -10,26 +10,25 @@ import ( ) const ( - EventSecretReady = "SecretReady" - EventConfigMapReady = "ConfigMapReady" - EventClusterAdopted = "ClusterAdopted" - EventClusterCreationStarted = "ClusterCreationStarted" - EventClusterUpdateStarted = "ClusterUpdateStarted" - EventClusterReady = "ClusterReady" - EventPoolerCreationStarted = "PoolerCreationStarted" - EventPoolerReady = "PoolerReady" - EventCleanupComplete = "CleanupComplete" - EventClusterClassNotFound = "ClusterClassNotFound" - EventConfigMergeFailed = "ConfigMergeFailed" - EventSecretReconcileFailed = "SecretReconcileFailed" - EventClusterCreateFailed = "ClusterCreateFailed" - EventClusterUpdateFailed = "ClusterUpdateFailed" - EventManagedRolesFailed = "ManagedRolesFailed" - EventPoolerReconcileFailed = "PoolerReconcileFailed" - EventConfigMapReconcileFailed = "ConfigMapReconcileFailed" - EventServiceMonitorUnavailable = "ServiceMonitorUnavailable" - EventClusterDegraded = "ClusterDegraded" - EventCleanupFailed = "CleanupFailed" + EventSecretReady = "SecretReady" + EventConfigMapReady = "ConfigMapReady" + EventClusterAdopted = "ClusterAdopted" + EventClusterCreationStarted = "ClusterCreationStarted" + EventClusterUpdateStarted = "ClusterUpdateStarted" + EventClusterReady = "ClusterReady" + EventPoolerCreationStarted = "PoolerCreationStarted" + EventPoolerReady = "PoolerReady" + EventCleanupComplete = "CleanupComplete" + EventClusterClassNotFound = "ClusterClassNotFound" + EventConfigMergeFailed = "ConfigMergeFailed" + EventSecretReconcileFailed = "SecretReconcileFailed" + EventClusterCreateFailed = "ClusterCreateFailed" + EventClusterUpdateFailed = "ClusterUpdateFailed" + EventManagedRolesFailed = "ManagedRolesFailed" + EventPoolerReconcileFailed = "PoolerReconcileFailed" + EventConfigMapReconcileFailed = "ConfigMapReconcileFailed" + EventClusterDegraded = "ClusterDegraded" + EventCleanupFailed = "CleanupFailed" ) func (rc 
*ReconcileContext) emitNormal(obj client.Object, reason, message string) { diff --git a/pkg/postgresql/cluster/core/monitoring.go b/pkg/postgresql/cluster/core/monitoring.go index 7c942d22a..c2f1267f8 100644 --- a/pkg/postgresql/cluster/core/monitoring.go +++ b/pkg/postgresql/cluster/core/monitoring.go @@ -2,13 +2,12 @@ package core import ( "context" - _ "embed" "fmt" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" enterprisev4 "github.com/splunk/splunk-operator/api/v4" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -17,17 +16,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" - "strings" ) const ( // metrics - postgresMetricsServiceSuffix = "-postgres-metrics" - postgresMetricsPortName = "metrics" - postgresMetricsPort = int32(9187) - poolerMetricsPortName = "metrics" - poolerMetricsPort = int32(9127) - grafanaDashboardConfigMapSuffix = "-grafana-dashboard" + postgresMetricsServiceSuffix = "-postgres-metrics" + postgresMetricsPortName = "metrics" + postgresMetricsPort = int32(9187) + poolerMetricsPortName = "metrics" + poolerMetricsPort = int32(9127) // labels labelManagedBy = "app.kubernetes.io/managed-by" @@ -37,8 +34,6 @@ const ( cnpgPoolerNameLabel = "cnpg.io/poolerName" cnpgPodRoleInstance = "instance" cnpgPodRoleLabelName = "cnpg.io/podRole" - grafanaDashboardLabelKey = "grafana_dashboard" - grafanaDashboardLabelValue = "1" ) func isPostgreSQLMetricsEnabled(cluster *enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { @@ -87,21 +82,6 @@ func isConnectionPoolerMetricsEnabled(cluster *enterprisev4.PostgresCluster, cla return override == nil || !*override } -func isGrafanaDashboardEnabled(cluster 
*enterprisev4.PostgresCluster, class *enterprisev4.PostgresClusterClass) bool { - if class == nil || class.Spec.Config == nil || class.Spec.Config.Observability == nil { - return false - } - classCfg := class.Spec.Config.Observability.GrafanaDashboard - if classCfg == nil || classCfg.Enabled == nil || !*classCfg.Enabled { - return false - } - if cluster == nil || cluster.Spec.Observability == nil || cluster.Spec.Observability.GrafanaDashboard == nil { - return true - } - override := cluster.Spec.Observability.GrafanaDashboard.Disabled - return override == nil || !*override -} - func buildPostgreSQLMetricsService(scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster) (*corev1.Service, error) { svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ @@ -181,44 +161,6 @@ func buildConnectionPoolerMetricsService( return svc, nil } -func buildGrafanaDashboardConfigMap(scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster) (*corev1.ConfigMap, error) { - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: cluster.Name + grafanaDashboardConfigMapSuffix, - Namespace: cluster.Namespace, - Labels: map[string]string{ - labelManagedBy: labelManagedByValue, - labelObservabilityComponent: "grafana-dashboard", - cnpgClusterLabelName: cluster.Name, - grafanaDashboardLabelKey: grafanaDashboardLabelValue, - }, - }, - Data: map[string]string{ - "dashboard.json": buildBasicGrafanaDashboard(cluster), - }, - } - - if err := ctrl.SetControllerReference(cluster, cm, scheme); err != nil { - return nil, fmt.Errorf("setting controller reference on Grafana dashboard ConfigMap: %w", err) - } - - return cm, nil -} - -func isServiceMonitorUnavailable(err error) bool { - if err == nil { - return false - } - - if apierrors.IsNotFound(err) || apimeta.IsNoMatchError(err) { - return true - } - - msg := err.Error() - return strings.Contains(msg, "no matches for kind \"ServiceMonitor\"") || - strings.Contains(msg, "servicemonitors.monitoring.coreos.com") -} - func 
reconcilePostgreSQLMetricsService(ctx context.Context, c client.Client, scheme *runtime.Scheme, cluster *enterprisev4.PostgresCluster, enabled bool) error { logger := log.FromContext(ctx) serviceName := cluster.Name + postgresMetricsServiceSuffix @@ -334,64 +276,6 @@ func reconcileConnectionPoolerMetricsService( return nil } -func reconcileGrafanaDashboardConfigMap( - ctx context.Context, - c client.Client, - scheme *runtime.Scheme, - cluster *enterprisev4.PostgresCluster, - enabled bool, -) error { - logger := log.FromContext(ctx) - configMapName := cluster.Name + grafanaDashboardConfigMapSuffix - - if !enabled { - existing := &corev1.ConfigMap{} - err := c.Get(ctx, types.NamespacedName{Name: configMapName, Namespace: cluster.Namespace}, existing) - switch { - case apierrors.IsNotFound(err): - return nil - case err != nil: - return fmt.Errorf("getting Grafana dashboard ConfigMap %s: %w", configMapName, err) - } - - logger.Info("Deleting Grafana dashboard ConfigMap", "name", configMapName) - if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("deleting Grafana dashboard ConfigMap %s: %w", configMapName, err) - } - return nil - } - - desired, err := buildGrafanaDashboardConfigMap(scheme, cluster) - if err != nil { - return fmt.Errorf("building Grafana dashboard ConfigMap: %w", err) - } - - live := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: desired.Name, - Namespace: desired.Namespace, - }, - } - - _, err = controllerutil.CreateOrUpdate(ctx, c, live, func() error { - live.Labels = desired.Labels - live.Annotations = desired.Annotations - live.Data = desired.Data - - if !metav1.IsControlledBy(live, cluster) { - if err := ctrl.SetControllerReference(cluster, live, scheme); err != nil { - return fmt.Errorf("setting controller reference on Grafana dashboard ConfigMap: %w", err) - } - } - return nil - }) - if err != nil { - return fmt.Errorf("reconciling Grafana dashboard ConfigMap %s: %w", desired.Name, err) 
- } - - return nil -} - func postgresMetricsServiceMonitorName(clusterName string) string { return clusterName + "-postgres-metrics-monitor" } @@ -597,18 +481,3 @@ func reconcileConnectionPoolerMetricsServiceMonitor( return nil } - -//go:embed dashboards/postgres_observability.json -var postgresObservabilityDashboardTemplate string - -func buildBasicGrafanaDashboard(cluster *enterprisev4.PostgresCluster) string { - replacer := strings.NewReplacer( - "__CLUSTER_NAME__", cluster.Name, - "__NAMESPACE__", cluster.Namespace, - "__POSTGRES_SERVICE__", cluster.Name+postgresMetricsServiceSuffix, - "__RW_POOLER_SERVICE__", poolerMetricsServiceName(cluster.Name, readWriteEndpoint), - "__RO_POOLER_SERVICE__", poolerMetricsServiceName(cluster.Name, readOnlyEndpoint), - ) - - return replacer.Replace(postgresObservabilityDashboardTemplate) -} diff --git a/pkg/postgresql/cluster/core/monitoring_unit_test.go b/pkg/postgresql/cluster/core/monitoring_unit_test.go index 6c1d0715a..545ea25da 100644 --- a/pkg/postgresql/cluster/core/monitoring_unit_test.go +++ b/pkg/postgresql/cluster/core/monitoring_unit_test.go @@ -1,8 +1,6 @@ package core import ( - "encoding/json" - "errors" "testing" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" @@ -10,10 +8,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/utils/ptr" ) @@ -40,7 +36,6 @@ func TestIsPostgreSQLMetricsEnabled(t *testing.T) { ptr.To(true), nil, nil, - nil, ), want: true, }, @@ -57,7 +52,6 @@ func TestIsPostgreSQLMetricsEnabled(t *testing.T) { ptr.To(true), nil, nil, - nil, ), want: false, }, @@ -74,7 +68,6 @@ func TestIsPostgreSQLMetricsEnabled(t *testing.T) { ptr.To(false), nil, nil, - nil, ), want: false, }, @@ -168,7 +161,6 @@ func 
TestIsConnectionPoolerMetricsEnabled(t *testing.T) { nil, ptr.To(true), nil, - ptr.To(false), ), want: false, }, @@ -179,7 +171,6 @@ func TestIsConnectionPoolerMetricsEnabled(t *testing.T) { nil, ptr.To(true), ptr.To(true), - ptr.To(true), ), want: true, }, @@ -196,7 +187,6 @@ func TestIsConnectionPoolerMetricsEnabled(t *testing.T) { nil, ptr.To(true), ptr.To(true), - ptr.To(true), ), want: false, }, @@ -207,7 +197,6 @@ func TestIsConnectionPoolerMetricsEnabled(t *testing.T) { nil, ptr.To(true), ptr.To(false), - ptr.To(true), ), want: false, }, @@ -221,62 +210,6 @@ func TestIsConnectionPoolerMetricsEnabled(t *testing.T) { } } -func TestIsGrafanaDashboardEnabled(t *testing.T) { - tests := []struct { - name string - cluster *enterprisev4.PostgresCluster - class *enterprisev4.PostgresClusterClass - want bool - }{ - { - name: "enabled when class enables and cluster override is unset", - cluster: &enterprisev4.PostgresCluster{}, - class: newClassWithObservability( - nil, - nil, - nil, - ptr.To(true), - ), - want: true, - }, - { - name: "disabled when cluster override disables dashboard", - cluster: &enterprisev4.PostgresCluster{ - Spec: enterprisev4.PostgresClusterSpec{ - Observability: &enterprisev4.PostgresObservabilityOverride{ - GrafanaDashboard: &enterprisev4.FeatureDisableOverride{Disabled: ptr.To(true)}, - }, - }, - }, - class: newClassWithObservability( - nil, - nil, - nil, - ptr.To(true), - ), - want: false, - }, - { - name: "disabled when class disables dashboard", - cluster: &enterprisev4.PostgresCluster{}, - class: newClassWithObservability( - nil, - nil, - nil, - ptr.To(false), - ), - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := isGrafanaDashboardEnabled(tt.cluster, tt.class) - assert.Equal(t, tt.want, got) - }) - } -} - func TestBuildPostgreSQLMetricsService(t *testing.T) { scheme := newMonitoringTestScheme(t) cluster := newTestMonitoringCluster() @@ -315,29 +248,6 @@ func 
TestBuildConnectionPoolerMetricsService(t *testing.T) { assertMonitoringOwnerRef(t, svc.OwnerReferences, cluster) } -func TestBuildGrafanaDashboardConfigMap(t *testing.T) { - scheme := newMonitoringTestScheme(t) - cluster := newTestMonitoringCluster() - - cm, err := buildGrafanaDashboardConfigMap(scheme, cluster) - require.NoError(t, err) - - assert.Equal(t, "postgresql-cluster-dev-grafana-dashboard", cm.Name) - assert.Equal(t, "grafana-dashboard", cm.Labels[labelObservabilityComponent]) - assert.Equal(t, grafanaDashboardLabelValue, cm.Labels[grafanaDashboardLabelKey]) - assert.Contains(t, cm.Data, "dashboard.json") - assert.NotContains(t, cm.Data["dashboard.json"], "__CLUSTER_NAME__") - assert.Contains(t, cm.Data["dashboard.json"], cluster.Name) - assert.Contains(t, cm.Data["dashboard.json"], cluster.Namespace) - assert.Contains(t, cm.Data["dashboard.json"], cluster.Name+postgresMetricsServiceSuffix) - assert.Contains(t, cm.Data["dashboard.json"], poolerMetricsServiceName(cluster.Name, readWriteEndpoint)) - assert.Contains(t, cm.Data["dashboard.json"], poolerMetricsServiceName(cluster.Name, readOnlyEndpoint)) - - var dashboard map[string]any - require.NoError(t, json.Unmarshal([]byte(cm.Data["dashboard.json"]), &dashboard)) - assertMonitoringOwnerRef(t, cm.OwnerReferences, cluster) -} - func TestBuildPostgreSQLMetricsServiceMonitor(t *testing.T) { scheme := newMonitoringTestScheme(t) cluster := newTestMonitoringCluster() @@ -373,47 +283,6 @@ func TestBuildConnectionPoolerMetricsServiceMonitor(t *testing.T) { assertMonitoringOwnerRef(t, sm.OwnerReferences, cluster) } -func TestIsServiceMonitorUnavailable(t *testing.T) { - tests := []struct { - name string - err error - want bool - }{ - { - name: "nil error", - err: nil, - want: false, - }, - { - name: "not found error", - err: apierrors.NewNotFound(schema.GroupResource{Group: "monitoring.coreos.com", Resource: "servicemonitors"}, "test"), - want: true, - }, - { - name: "kind match string error", - err: 
errors.New("no matches for kind \"ServiceMonitor\" in version \"monitoring.coreos.com/v1\""), - want: true, - }, - { - name: "resource string error", - err: errors.New("servicemonitors.monitoring.coreos.com not found"), - want: true, - }, - { - name: "unrelated error", - err: errors.New("boom"), - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := isServiceMonitorUnavailable(tt.err) - assert.Equal(t, tt.want, got) - }) - } -} - func newMonitoringTestScheme(t *testing.T) *runtime.Scheme { t.Helper() @@ -443,16 +312,14 @@ func newClassWithObservability( postgresEnabled *bool, poolerEnabled *bool, pgBouncerMetricsEnabled *bool, - grafanaEnabled *bool, ) *enterprisev4.PostgresClusterClass { return &enterprisev4.PostgresClusterClass{ Spec: enterprisev4.PostgresClusterClassSpec{ Config: &enterprisev4.PostgresClusterClassConfig{ ConnectionPoolerEnabled: poolerEnabled, Observability: &enterprisev4.PostgresObservabilityClassConfig{ - PostgreSQL: &enterprisev4.MetricsClassConfig{Enabled: postgresEnabled}, - PgBouncer: &enterprisev4.MetricsClassConfig{Enabled: pgBouncerMetricsEnabled}, - GrafanaDashboard: &enterprisev4.GrafanaDashboardClassConfig{Enabled: grafanaEnabled}, + PostgreSQL: &enterprisev4.MetricsClassConfig{Enabled: postgresEnabled}, + PgBouncer: &enterprisev4.MetricsClassConfig{Enabled: pgBouncerMetricsEnabled}, }, }, }, From ce20fcd1342bb7108347f04fcbd3539def7ef805 Mon Sep 17 00:00:00 2001 From: dpishchenkov Date: Fri, 3 Apr 2026 15:03:00 +0200 Subject: [PATCH 3/4] add grafana sample to docs --- docs/PostgreSQLObservabilityDashboard.json | 928 +++++++++++++++++++++ docs/PostgreSQLObservabilityDashboard.md | 65 ++ 2 files changed, 993 insertions(+) create mode 100644 docs/PostgreSQLObservabilityDashboard.json create mode 100644 docs/PostgreSQLObservabilityDashboard.md diff --git a/docs/PostgreSQLObservabilityDashboard.json b/docs/PostgreSQLObservabilityDashboard.json new file mode 100644 index 000000000..aa0ffc765 
--- /dev/null +++ b/docs/PostgreSQLObservabilityDashboard.json @@ -0,0 +1,928 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "count(count by (pod) (cnpg_pg_postmaster_start_time_seconds{namespace=\"$namespace\",service=\"$cluster-postgres-metrics\"}))", + "legendFormat": "postgres pods", + "range": true, + "refId": "A" + } + ], + "title": "PostgreSQL Targets", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "index": 0, + "text": "Down" + }, + "1": { + "index": 1, + "text": "Up" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "max(cnpg_pgbouncer_up{namespace=\"$namespace\",service=\"$cluster-pooler-rw-metrics\"})", + "legendFormat": "rw", + "range": true, + "refId": "A" + } + ], + "title": "RW Pooler", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "index": 0, + "text": "Down" + }, + "1": { + "index": 1, + "text": "Up" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "max(cnpg_pgbouncer_up{namespace=\"$namespace\",service=\"$cluster-pooler-ro-metrics\"})", + "legendFormat": "ro", + "range": true, + "refId": "A" + } + ], + "title": "RO Pooler", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + 
}, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(cnpg_pg_stat_archiver_archived_count{namespace=\"$namespace\",service=\"$cluster-postgres-metrics\"}[5m]))", + "legendFormat": "archive rate", + "range": true, + "refId": "A" + } + ], + "title": "Archive Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(splunk_operator_postgres_databases{phase=\"Failed\"})", + "legendFormat": "failed", + "range": true, + "refId": "A" + } + ], + "title": "Failed Databases", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 12, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (datname) (cnpg_pg_database_size_bytes{namespace=\"$namespace\",service=\"$cluster-postgres-metrics\"})", + "legendFormat": "{{datname}}", + "range": true, + "refId": "A" + } + ], + "title": "Database Size by Database", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 4 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (service) (cnpg_pgbouncer_pools_cl_active{namespace=\"$namespace\",service=~\"$cluster-pooler-(rw|ro)-metrics\"})", + "legendFormat": "{{service}} active", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by (service) (cnpg_pgbouncer_pools_cl_waiting{namespace=\"$namespace\",service=~\"$cluster-pooler-(rw|ro)-metrics\"})", + "legendFormat": "{{service}} waiting", + "range": true, + "refId": "B" + } + ], + "title": "PgBouncer Client Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.25 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 4 + }, + "id": 8, + "options": { + "legend": { + 
"calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(cnpg_pg_stat_archiver_archived_count{namespace=\"$namespace\",service=\"$cluster-postgres-metrics\"}[5m]))", + "legendFormat": "archived WAL / sec", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(cnpg_pg_wal_files_total{namespace=\"$namespace\",service=\"$cluster-postgres-metrics\"})", + "legendFormat": "wal files total", + "range": true, + "refId": "B" + } + ], + "title": "WAL Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (phase) (splunk_operator_postgres_databases)", + 
"legendFormat": "{{phase}}", + "range": true, + "refId": "A" + } + ], + "title": "Fleet Database Phases", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (controller, result) (rate(splunk_operator_postgres_reconcile_total[5m]))", + "legendFormat": "{{controller}} {{result}}", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by (controller, error_class) (rate(splunk_operator_postgres_reconcile_errors_total[5m]))", + "legendFormat": "{{controller}} errors {{error_class}}", + "range": true, + "refId": "B" + } + ], + "title": "Controller Reconcile Activity", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "postgresql", + "cnpg", + "pgbouncer", + "splunk-operator", + "reference" 
+ ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(cnpg_pg_postmaster_start_time_seconds, namespace)", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(cnpg_pg_postmaster_start_time_seconds, namespace)", + "refId": "Prometheus-namespace" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(cnpg_pg_postmaster_start_time_seconds{namespace=\"$namespace\"}, service)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(cnpg_pg_postmaster_start_time_seconds{namespace=\"$namespace\"}, service)", + "refId": "Prometheus-cluster" + }, + "refresh": 2, + "regex": "/(.*)-postgres-metrics/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "PostgreSQL Observability Reference", + "uid": "postgresql-observability-reference", + "version": 1, + "weekStart": "" +} diff --git a/docs/PostgreSQLObservabilityDashboard.md b/docs/PostgreSQLObservabilityDashboard.md new file mode 100644 index 000000000..109ef78d2 --- /dev/null +++ 
b/docs/PostgreSQLObservabilityDashboard.md @@ -0,0 +1,65 @@ +# PostgreSQL Observability Dashboard Example + +This file provides a reference Grafana dashboard for the PostgreSQL observability model described in the PostgreSQL observability notes. + +The dashboard JSON lives at: + +- [PostgreSQLObservabilityDashboard.json](/Users/dpishchenkov/splunk-operator/docs/PostgreSQLObservabilityDashboard.json) + +## Purpose + +This dashboard is a reference artifact only. + +It is meant to show how a Grafana dashboard could combine: + +- runtime PostgreSQL and PgBouncer metrics exposed through the `PostgresCluster` observability path +- controller metrics emitted by the PostgreSQL controllers + +It is not meant to imply that Grafana runtime resources are managed by the operator. + +## Panels Included + +The sample dashboard includes: + +- PostgreSQL target count +- RW and RO PgBouncer availability +- WAL archive rate +- failed `PostgresDatabase` count +- database size by database +- PgBouncer active and waiting clients +- WAL activity +- fleet database phases +- controller reconcile activity and errors + +## Assumptions + +The sample queries assume: + +- Prometheus is scraping the PostgreSQL metrics `Service` created by the `PostgresCluster` controller +- Prometheus is scraping the PgBouncer metrics `Service` objects created for RW and RO poolers +- Prometheus series include `namespace` and `service` labels +- the cluster metrics service is named `<cluster-name>-postgres-metrics` +- the PgBouncer metrics services are named `<cluster-name>-pooler-rw-metrics` and `<cluster-name>-pooler-ro-metrics` +- the controller metrics branch is present for the `splunk_operator_postgres_*` metrics + +If your Prometheus relabeling differs, you may need to adjust the dashboard queries. + +## Import Notes + +To use the dashboard: + +1. Import the JSON file into Grafana. +2. Select the correct Prometheus datasource. +3. Choose the namespace. +4. Choose the cluster name using the derived `cluster` variable. 
+ +## Notes On Candidate Metrics + +The following PgBouncer wait metrics are good candidates for the sample's queries (the shipped panels currently use only `cnpg_pgbouncer_pools_cl_waiting`) but should still be verified against actual exporter output in the merged branch: + +- `cnpg_pgbouncer_pools_cl_waiting` +- `cnpg_pgbouncer_pools_maxwait` +- `cnpg_pgbouncer_stats_avg_wait_time` +- `cnpg_pgbouncer_stats_total_wait_time` + +If those exact series are not present, keep the panel shape and replace the query with the actual exported metric name. From 943a51933886c7e2db1b6fce4b6b8b89fd4d0236 Mon Sep 17 00:00:00 2001 From: dpishchenkov Date: Thu, 9 Apr 2026 15:16:00 +0200 Subject: [PATCH 4/4] style: fix links to docs. --- docs/PostgreSQLObservabilityDashboard.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/PostgreSQLObservabilityDashboard.md b/docs/PostgreSQLObservabilityDashboard.md index 109ef78d2..22343afff 100644 --- a/docs/PostgreSQLObservabilityDashboard.md +++ b/docs/PostgreSQLObservabilityDashboard.md @@ -4,7 +4,7 @@ This file provides a reference Grafana dashboard for the PostgreSQL observabilit The dashboard JSON lives at: -- [PostgreSQLObservabilityDashboard.json](/Users/dpishchenkov/splunk-operator/docs/PostgreSQLObservabilityDashboard.json) +- [PostgreSQLObservabilityDashboard.json](./PostgreSQLObservabilityDashboard.json) ## Purpose