From e66a364651a83a3de1b5f593ef5f0fa0a71b4aff Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Mon, 18 May 2026 17:53:22 +0530 Subject: [PATCH 1/6] feat: dbaas operator metrics --- dbaas-operator/cmd/main.go | 10 +- dbaas-operator/dev/operator-metrics.md | 176 +++++ dbaas-operator/go.mod | 5 +- .../dashboards/dbaas-operator-dashboard.json | 630 ++++++++++++++++++ .../dbaas-operator/templates/Dashboard.yaml | 17 + .../databasedeclaration_controller.go | 82 +++ .../controller/dbpolicy_controller.go | 53 +- .../controller/externaldatabase_controller.go | 161 ++++- dbaas-operator/internal/controller/metrics.go | 194 ++++++ .../internal/controller/metrics_test.go | 52 ++ 10 files changed, 1349 insertions(+), 31 deletions(-) create mode 100644 dbaas-operator/dev/operator-metrics.md create mode 100644 dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json create mode 100644 dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml create mode 100644 dbaas-operator/internal/controller/metrics.go create mode 100644 dbaas-operator/internal/controller/metrics_test.go diff --git a/dbaas-operator/cmd/main.go b/dbaas-operator/cmd/main.go index 7486fc4f2..a255f5bc6 100644 --- a/dbaas-operator/cmd/main.go +++ b/dbaas-operator/cmd/main.go @@ -95,15 +95,15 @@ func main() { BindAddress: metricsAddr, } - // ── Operator namespace ──────────────────────────────────────────────────── + // Operator namespace. cloudNamespace := os.Getenv("CLOUD_NAMESPACE") if cloudNamespace == "" { - setupLog.Errorf("CLOUD_NAMESPACE env var is not set — ownership checks will not work correctly") + setupLog.Errorf("CLOUD_NAMESPACE env var is not set - ownership checks will not work correctly") os.Exit(1) } setupLog.Infof("operator namespace cloud-namespace=%v", cloudNamespace) - // ── dbaas-aggregator client ─────────────────────────────────────────────── + // dbaas-aggregator client. aggregatorURL := os.Getenv("DBAAS_AGGREGATOR_URL") if aggregatorURL == "" { aggregatorURL = "http://dbaas-aggregator:8080" @@ -165,14 +165,14 @@ func main() { } setupLog.Infof("backoff configured base=%v max=%v", backoffBaseDelay, backoffMaxDelay) - // ── Ownership resolver ──────────────────────────────────────────────────── + // Ownership resolver. ownershipResolver := ownership.NewOwnershipResolver(cloudNamespace, mgr.GetClient()) if err := mgr.Add(&ownershipWarmupRunnable{resolver: ownershipResolver}); err != nil { setupLog.Errorf("Failed to register ownership warmup runnable: %v", err) os.Exit(1) } - // ── NamespaceBinding controller (always enabled) ─────────────────────────── + // NamespaceBinding controller (always enabled). edbChecker := ownership.NewKindChecker( mgr.GetClient(), func() *dbaasv1.ExternalDatabaseList { return &dbaasv1.ExternalDatabaseList{} }, diff --git a/dbaas-operator/dev/operator-metrics.md b/dbaas-operator/dev/operator-metrics.md new file mode 100644 index 000000000..9bc420c56 --- /dev/null +++ b/dbaas-operator/dev/operator-metrics.md @@ -0,0 +1,176 @@ +# Operator Metrics + +Custom Prometheus metrics exposed by the DBaaS Operator at `/metrics`. +Each metric is registered at startup and scraped by Prometheus. + +--- + +## Secret Watcher + +### `dbaas_reconcile_trigger_total` +**Type:** Counter + +**Labels:** `controller`, `trigger` + +`controller`: `externaldatabase` | `dbpolicy` | `databasedeclaration` + +`trigger`: `spec_change` | `secret_change` | `namespace_binding_change` | `polling` + +Counts every reconcile invocation, tagged by what caused it. A `secret_change` increment means the watcher detected a credential rotation and re-registered the database automatically without a CR spec change. + +**Dashboard:** Stacked time series by trigger. A spike in `secret_change` is direct proof the feature fired. + +--- + +### `dbaas_secret_rotation_propagation_seconds` +**Type:** Histogram + +Measures the end-to-end time from a Secret change being detected to the ExternalDatabase reaching `Succeeded`. This is the SLO metric for credential rotation: it answers "how quickly does a password change take effect?" + +The start timestamp is kept in memory by the active operator process. If the operator restarts after the Secret event and before the ExternalDatabase succeeds, that specific propagation sample is not recorded. + +**Dashboard:** P50 / P90 / P99 latency lines over time. + +--- + +### `dbaas_secret_resolution_errors_total` +**Type:** Counter + +**Labels:** `namespace`, `reason` + +Counts failures reading a credentials Secret during reconcile. Each increment means a credential rotation or CR create/update could not resolve valid credentials until the Secret is fixed. + +**Dashboard:** Error rate time series per namespace/reason. Any non-zero sustained rate is actionable. + +--- + +## Aggregator Interaction + +### `dbaas_aggregator_request_duration_seconds` +**Type:** Histogram + +**Labels:** `controller`, `operation` (`register_external_database` | `apply_config` | `poll_status`) + +Tracks HTTP call latency to dbaas-aggregator per operation type. Identifies whether the aggregator is the bottleneck in slow registrations or provisioning. + +**Dashboard:** P50 / P90 / P99 latency lines, split by controller and operation. + +--- + +### `dbaas_aggregator_requests_total` +**Type:** Counter + +**Labels:** `controller`, `operation`, `result` (`success` | `auth_error` | `spec_rejection` | `server_error` | `network_error`) + +Counts every aggregator call by outcome. The `result` label routes failures to the right owner: + +| Result | Meaning | Owner | +|---|---|---| +| `success` | Call succeeded | - | +| `spec_rejection` | Aggregator rejected the payload by content | Application team / CR owner | +| `auth_error` | Operator credentials invalid | Platform team | +| `server_error` | Aggregator returned 5xx | Aggregator team | +| `network_error` | No response was received | Network / platform team | + +**Dashboard:** Stacked rate by result. Failure buckets are split because they have different owners. + +--- + +## Dashboard-Only Resource Health Substitutes + +The operator does not currently export `dbaas_resource_phase`, `dbaas_resources_stuck`, or `dbaas_async_operations_in_flight`. + +Those gauges require exact current-state knowledge across DBaaS CRs. A periodic sweeper can compute them, but it must list every relevant CR in each owned namespace. That cost scales linearly with the number of CRs and becomes less attractive as DBaaS adoption grows. + +A custom in-memory tracker could avoid periodic listing, but it would need to handle startup sync, status-only updates, deletes, operator restarts, leader changes, and NamespaceBinding ownership changes. That complexity is not justified for the current dashboard signal. + +The dashboard can still project close substitutes from event/rate metrics. These are not exact current-state counts; they show recent activity that usually explains why resources would be unhealthy. + +### Substitute for `dbaas_resource_phase` + +`dbaas_resource_phase` would answer: "how many CRs are currently in each phase?" + +Without a direct state gauge, use failure distribution over the dashboard range as the closest operational view of resource health: + +```promql +sum by (controller, result) ( + increase(dbaas_aggregator_requests_total{result!="success"}[$__range]) +) +``` + +```promql +sum by (namespace, reason) ( + increase(dbaas_secret_resolution_errors_total[$__range]) +) +``` + +Dashboard title: `Recent Failed Operations by Owner` + +This shows failure pressure by owner bucket, but it does not count current CR phases. + +### Substitute for `dbaas_resources_stuck` + +`dbaas_resources_stuck` would answer: "has any CR stayed in a bad state long enough to need attention?" + +Without a direct phase gauge, use repeated failure activity as a stuck-like signal. This is intentionally different from the resource-health distribution above: it looks for failures that keep recurring in the recent window. + +```promql +sum by (controller, result) ( + rate(dbaas_aggregator_requests_total{result!="success"}[15m]) +) > 0 +``` + +```promql +sum by (namespace, reason) ( + rate(dbaas_secret_resolution_errors_total[15m]) +) > 0 +``` + +Dashboard title: `Sustained Failure Signals` + +This is a close alerting substitute for quiet stuck resources, but it is event-based. If a CR failed once and then no more reconciles happened, this query will eventually go quiet. + +If a reliable non-sweep phase metric is introduced later, the preferred stuck query would be: + +```promql +min_over_time(dbaas_resource_phase{phase=~"BackingOff|InvalidConfiguration"}[15m]) > 0 +``` + +### Substitute for `dbaas_async_operations_in_flight` + +`dbaas_async_operations_in_flight` would answer: "how many DatabaseDeclarations currently have an active tracking ID?" + +Without a direct state gauge, use a backlog proxy: polling activity without matching terminal completions. + +```promql +sum( + rate(dbaas_reconcile_trigger_total{controller="databasedeclaration",trigger="polling"}[15m]) +) +``` + +```promql +sum( + rate(dbaas_async_operation_duration_seconds_count[15m]) +) +``` + +Dashboard title: `Async Provisioning Activity` + +If the first query is non-zero while the second is near zero, DatabaseDeclarations are being polled but not reaching terminal states. It does not count currently in-flight operations. + +--- + +## Async Provisioning + +### `dbaas_async_operation_duration_seconds` +**Type:** Histogram + +**Labels:** `result` (`success` | `failed` | `terminated`) + +End-to-end provisioning time from async submission (HTTP 202) to a terminal state. This is the user-visible provisioning SLO: the time a user waits from submitting a DatabaseDeclaration to having a usable database. + +`terminated` means the aggregator cancelled the operation mid-flight; the operator resubmits automatically. + +The start timestamp is kept in memory after the operator submits an async operation. If the operator restarts before the terminal poll result, that operation is still reconciled correctly, but its duration sample is not recorded. + +**Dashboard:** P50 / P90 / P99 per result label. Completion rate (operations/s) as a secondary panel. diff --git a/dbaas-operator/go.mod b/dbaas-operator/go.mod index 9a62f68d2..4fe5eb6e5 100644 --- a/dbaas-operator/go.mod +++ b/dbaas-operator/go.mod @@ -10,7 +10,9 @@ require ( github.com/netcracker/qubership-core-lib-go/v3 v3.8.0 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 + github.com/prometheus/client_golang v1.23.2 k8s.io/api v0.35.4 + k8s.io/apiextensions-apiserver v0.35.0 k8s.io/apimachinery v0.35.4 k8s.io/client-go v0.35.4 sigs.k8s.io/controller-runtime v0.23.3 @@ -43,6 +45,7 @@ require ( github.com/knadh/koanf/providers/env v1.1.0 // indirect github.com/knadh/koanf/providers/file v1.2.1 // indirect github.com/knadh/koanf/v2 v2.3.4 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect @@ -51,7 +54,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect @@ -76,7 +78,6 @@ require ( gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect diff --git a/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json new file mode 100644 index 000000000..58c1f7ec6 --- /dev/null +++ b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json @@ -0,0 +1,630 @@ +{ + "title": "DBaaS Operator", + "uid": "dbaas-operator", + "schemaVersion": 39, + "version": 3, + "refresh": "10s", + "time": { + "from": "now-15m", + "to": "now" + }, + "timezone": "browser", + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Platform Monitoring Prometheus", + "value": "Platform Monitoring Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Cloud", + "multi": false, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "panels": [ + { + "id": 1, + "type": "row", + "title": "Secret Watcher - Credential Rotation", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "type": "timeseries", + "title": "Reconcile Triggers by Source (EDB)", + "description": "Compares how many reconciles are driven by Secret rotations vs spec changes. A spike in secret_change confirms the watcher is firing.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "secret_change" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "spec_change" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (trigger) (rate(dbaas_reconcile_trigger_total{controller=\"externaldatabase\"}[1m]))", + "legendFormat": "{{trigger}}" + } + ] + }, + { + "id": 3, + "type": "timeseries", + "title": "Secret Rotation Propagation Time (seconds)", + "description": "End-to-end time from a Secret change being detected to the ExternalDatabase reaching Succeeded. This is the SLO metric for credential rotation.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "legendFormat": "P50" + }, + { + "expr": "histogram_quantile(0.90, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "legendFormat": "P90" + }, + { + "expr": "histogram_quantile(0.99, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "legendFormat": "P99" + } + ] + }, + { + "id": 4, + "type": "timeseries", + "title": "Secret Resolution Errors", + "description": "Failures reading a credentials Secret. Reasons are actionable: secret_not_found, key_missing, key_empty, forbidden, or secret_read_failed.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "fixedColor": "red", + "mode": "fixed" + }, + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (namespace, reason) (rate(dbaas_secret_resolution_errors_total[1m]))", + "legendFormat": "{{namespace}} / {{reason}}" + } + ] + }, + { + "id": 5, + "type": "row", + "title": "Aggregator Interaction Health", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + } + }, + { + "id": 6, + "type": "timeseries", + "title": "Aggregator Call Latency (seconds)", + "description": "P50/P90/P99 latency per controller and operation. Feeds into SLO alerting - if P99 rises, the aggregator is slow.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "legendFormat": "P50 {{controller}}/{{operation}}" + }, + { + "expr": "histogram_quantile(0.90, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "legendFormat": "P90 {{controller}}/{{operation}}" + }, + { + "expr": "histogram_quantile(0.99, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "legendFormat": "P99 {{controller}}/{{operation}}" + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Aggregator Call Outcomes (calls/s)", + "description": "Result breakdown: spec_rejection = CR/spec owner, auth_error = operator credentials/RBAC, server_error = aggregator, network_error = connectivity/platform.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "spec_rejection" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "auth_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "server_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "network_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (result) (rate(dbaas_aggregator_requests_total[1m]))", + "legendFormat": "{{result}}" + } + ] + }, + { + "id": 8, + "type": "row", + "title": "Dashboard-Only Resource Health Substitutes", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + } + }, + { + "id": 9, + "type": "timeseries", + "title": "Recent Failed Operations by Owner", + "description": "Dashboard-only substitute for resource phase health. Shows recent failed operations by owner bucket instead of exact current CR phase counts.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by (controller, result) (increase(dbaas_aggregator_requests_total{result!=\"success\"}[$__range]))", + "legendFormat": "aggregator {{controller}} / {{result}}", + "refId": "A" + }, + { + "expr": "sum by (namespace, reason) (increase(dbaas_secret_resolution_errors_total[$__range]))", + "legendFormat": "secret {{namespace}} / {{reason}}", + "refId": "B" + } + ] + }, + { + "id": 10, + "type": "timeseries", + "title": "Sustained Failure Signals", + "description": "Dashboard-only substitute for stuck resources. Shows failure signals that continue over the recent window; it is event-based, not an exact stuck CR count.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by (controller, result) (rate(dbaas_aggregator_requests_total{result!=\"success\"}[15m])) \u003e 0", + "legendFormat": "aggregator {{controller}} / {{result}}", + "refId": "A" + }, + { + "expr": "sum by (namespace, reason) (rate(dbaas_secret_resolution_errors_total[15m])) \u003e 0", + "legendFormat": "secret {{namespace}} / {{reason}}", + "refId": "B" + } + ] + }, + { + "id": 11, + "type": "timeseries", + "title": "Async Provisioning Activity", + "description": "Dashboard-only substitute for async operations in flight. Compares DatabaseDeclaration polling activity with terminal completions.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 35 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(dbaas_reconcile_trigger_total{controller=\"databasedeclaration\",trigger=\"polling\"}[15m]))", + "legendFormat": "polling rate", + "refId": "A" + }, + { + "expr": "sum(rate(dbaas_async_operation_duration_seconds_count[15m]))", + "legendFormat": "completion rate", + "refId": "B" + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Async Provisioning End-to-End Duration (seconds)", + "description": "How long it takes from HTTP 202 (async accepted) to a terminal state (completed / failed / terminated). This is the user-visible provisioning SLO.", + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 44 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "legendFormat": "P50 {{result}}" + }, + { + "expr": "histogram_quantile(0.90, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "legendFormat": "P90 {{result}}" + }, + { + "expr": "histogram_quantile(0.99, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "legendFormat": "P99 {{result}}" + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Async Completion Rate by Outcome", + "description": "Rate of async operations completing per second, split by result (success / failed / terminated). Terminated means the aggregator cancelled and the operator will resubmit.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "terminated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (result) (rate(dbaas_async_operation_duration_seconds_count[1m]))", + "legendFormat": "{{result}}" + } + ] + } + ] +} diff --git a/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml b/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml new file mode 100644 index 000000000..197a346ee --- /dev/null +++ b/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml @@ -0,0 +1,17 @@ +{{- if and .Values.DBAAS_OPERATOR_ENABLED .Values.MONITORING_ENABLED }} +--- +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: dbaas-operator-dashboard + labels: + app: grafana + app.kubernetes.io/name: "{{ .Values.SERVICE_NAME }}" + app.kubernetes.io/component: monitoring + app.kubernetes.io/part-of: '{{ .Values.APPLICATION_NAME | default "dbaas" }}' + app.kubernetes.io/managed-by: "{{ .Values.MANAGED_BY }}" + deployment.netcracker.com/sessionId: '{{ .Values.DEPLOYMENT_SESSION_ID | default "unimplemented" }}' +spec: + name: dbaas-operator-dashboard.json + json: {{ .Files.Get "dashboards/dbaas-operator-dashboard.json" | toJson | replace "#namespace#" .Values.NAMESPACE }} +{{- end }} diff --git a/dbaas-operator/internal/controller/databasedeclaration_controller.go b/dbaas-operator/internal/controller/databasedeclaration_controller.go index 83fe36b0c..eab94d769 100644 --- a/dbaas-operator/internal/controller/databasedeclaration_controller.go +++ b/dbaas-operator/internal/controller/databasedeclaration_controller.go @@ -25,9 +25,11 @@ import ( "errors" "fmt" "net/http" + "sync" "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -60,6 +62,16 @@ type DatabaseDeclarationReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + // asyncStartMu guards asyncStartTimes. + asyncStartMu sync.Mutex + // asyncStartTimes records the wall-clock time when an async operation was + // submitted (HTTP 202), keyed by "namespace/name". Consumed when the + // operation reaches a terminal state (COMPLETED, FAILED, TERMINATED). + asyncStartTimes map[string]time.Time + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { @@ -68,9 +80,15 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd := &dbaasv1.DatabaseDeclaration{} if err := r.Get(ctx, req.NamespacedName, dd); err != nil { + if apierrors.IsNotFound(err) { + r.clearBindingTrigger(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + key := req.Namespace + "/" + req.Name + bindingTriggered := r.consumeBindingTrigger(key) + owned, result, err := checkOwnership(ctx, r.Ownership, dd.Namespace, dd.Name, "DatabaseDeclaration") if err != nil { return ctrl.Result{}, err @@ -102,6 +120,14 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd.Status.PendingOperationGeneration = 0 } + trigger := triggerSpecChange + if bindingTriggered { + trigger = triggerNamespaceBindingChange + } else if dd.Status.TrackingID != "" { + trigger = triggerPolling + } + recordReconcileTrigger(controllerDD, trigger) + if dd.Status.TrackingID != "" { return r.reconcilePoll(ctx, dd) } @@ -120,7 +146,9 @@ func (r *DatabaseDeclarationReconciler) reconcileSubmit(ctx context.Context, dd // ── Call aggregator ─────────────────────────────────────────────────────── payload := r.buildPayload(dd) dd.Status.LastRequestID = requestID + aggStart := time.Now() resp, err := r.Aggregator.ApplyConfig(ctx, payload) + recordAggregatorCall(controllerDD, operationApplyConfig, aggStart, err) if err != nil { log.ErrorC(ctx, "failed to apply DatabaseDeclaration to dbaas-aggregator: %v", err) return handleAggregatorError(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, r.Recorder, dd, err, requestID) @@ -129,6 +157,13 @@ func (r *DatabaseDeclarationReconciler) reconcileSubmit(ctx context.Context, dd if resp.TrackingID != "" { // HTTP 202 Accepted — async operation started. log.InfoC(ctx, "database provisioning started asynchronously. trackingId = %v, microserviceName = %v", resp.TrackingID, dd.Spec.Classifier.MicroserviceName) + ddKey := dd.Namespace + "/" + dd.Name + r.asyncStartMu.Lock() + if r.asyncStartTimes == nil { + r.asyncStartTimes = make(map[string]time.Time) + } + r.asyncStartTimes[ddKey] = time.Now() + r.asyncStartMu.Unlock() markProvisioningStarted(dd, resp.TrackingID) r.Recorder.Eventf(dd, corev1.EventTypeNormal, EventReasonProvisioningStarted, "database provisioning started asynchronously (trackingId=%s, requestId=%s)", @@ -155,7 +190,9 @@ func (r *DatabaseDeclarationReconciler) reconcilePoll(ctx context.Context, dd *d dd.Status.Phase = dbaasv1.PhaseWaitingForDependency dd.Status.LastRequestID = requestID + aggStart := time.Now() resp, err := r.Aggregator.GetOperationStatus(ctx, trackingID) + recordAggregatorCall(controllerDD, operationPollStatus, aggStart, err) if err != nil { return r.handlePollError(ctx, dd, trackingID, requestID, err) } @@ -343,6 +380,22 @@ func clearPendingOperation(dd *dbaasv1.DatabaseDeclaration) { dd.Status.PendingOperationGeneration = 0 } +// observeAsyncCompletion records the end-to-end async operation duration and +// consumes the in-memory start stamp. Safe to call even when no start stamp exists +// (e.g. operator restarted mid-operation — in that case duration is not recorded). +func (r *DatabaseDeclarationReconciler) observeAsyncCompletion(dd *dbaasv1.DatabaseDeclaration, result string) { + ddKey := dd.Namespace + "/" + dd.Name + r.asyncStartMu.Lock() + start, ok := r.asyncStartTimes[ddKey] + if ok { + delete(r.asyncStartTimes, ddKey) + } + r.asyncStartMu.Unlock() + if ok { + dbaasAsyncOperationDurationSeconds.WithLabelValues(result).Observe(time.Since(start).Seconds()) + } +} + func (r *DatabaseDeclarationReconciler) handlePollError( ctx context.Context, dd *dbaasv1.DatabaseDeclaration, @@ -399,6 +452,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( log.InfoC(ctx, "database provisioned. trackingId = %v, microserviceName = %v", trackingID, dd.Spec.Classifier.MicroserviceName) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, resultSuccess) markSucceeded(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonDatabaseProvisioned) r.Recorder.Eventf(dd, corev1.EventTypeNormal, EventReasonDatabaseProvisioned, "database provisioned (microserviceName=%s, trackingId=%s)", @@ -410,6 +464,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( log.InfoC(ctx, "database provisioning failed trackingId=%v status=%v reason=%v", trackingID, resp.Status, reason) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, "failed") markPermanentFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonAggregatorRejected, reason) r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonAggregatorRejected, @@ -423,6 +478,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( // Clear the stale trackingID so the next reconcile enters the SUBMIT branch. log.InfoC(ctx, "provisioning was terminated, clearing trackingId for resubmit trackingId=%v", trackingID) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, "terminated") markTransientFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonOperationTerminated, "provisioning was terminated by the aggregator, resubmitting") r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonOperationTerminated, @@ -477,7 +533,33 @@ func (r *DatabaseDeclarationReconciler) enqueueForBinding(ctx context.Context, o } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } + +func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +func (r *DatabaseDeclarationReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +func (r *DatabaseDeclarationReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} diff --git a/dbaas-operator/internal/controller/dbpolicy_controller.go b/dbaas-operator/internal/controller/dbpolicy_controller.go index e079ad1a2..aef2c341d 100644 --- a/dbaas-operator/internal/controller/dbpolicy_controller.go +++ b/dbaas-operator/internal/controller/dbpolicy_controller.go @@ -21,8 +21,11 @@ package controller import ( "context" + "sync" + "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" @@ -49,6 +52,9 @@ type DbPolicyReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { @@ -56,9 +62,15 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r dp := &dbaasv1.DbPolicy{} if err := r.Get(ctx, req.NamespacedName, dp); err != nil { + if apierrors.IsNotFound(err) { + r.clearBindingTrigger(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + key := req.Namespace + "/" + req.Name + bindingTriggered := r.consumeBindingTrigger(key) + owned, result, err := checkOwnership(ctx, r.Ownership, dp.Namespace, dp.Name, "DbPolicy") if err != nil { return ctrl.Result{}, err @@ -67,6 +79,12 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r return result, nil } + trigger := triggerSpecChange + if bindingTriggered { + trigger = triggerNamespaceBindingChange + } + recordReconcileTrigger(controllerDP, trigger) + // Snapshot for the status patch at the end of reconcile. original := dp.DeepCopy() @@ -92,9 +110,12 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r payload := r.buildPayload(dp) dp.Status.LastRequestID = requestID - if _, err := r.Aggregator.ApplyConfig(ctx, payload); err != nil { - log.ErrorC(ctx, "failed to apply DbPolicy to dbaas-aggregator: %v", err) - return handleAggregatorError(&dp.Status.Phase, &dp.Status.Conditions, dp.Generation, r.Recorder, dp, err, requestID) + aggStart := time.Now() + _, aggErr := r.Aggregator.ApplyConfig(ctx, payload) + recordAggregatorCall(controllerDP, operationApplyConfig, aggStart, aggErr) + if aggErr != nil { + log.ErrorC(ctx, "failed to apply DbPolicy to dbaas-aggregator: %v", aggErr) + return handleAggregatorError(&dp.Status.Phase, &dp.Status.Conditions, dp.Generation, r.Recorder, dp, aggErr, requestID) } log.InfoC(ctx, "DbPolicy applied successfully microserviceName=%v", dp.Spec.MicroserviceName) @@ -164,7 +185,33 @@ func (r *DbPolicyReconciler) enqueueForBinding(ctx context.Context, obj client.O } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } + +func (r *DbPolicyReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +func (r *DbPolicyReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +func (r *DbPolicyReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index b8626f04e..3d001e488 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -24,8 +24,11 @@ import ( "context" "fmt" "maps" + "sync" + "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" @@ -54,6 +57,16 @@ type ExternalDatabaseReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + // secretTriggerMu guards the Secret-trigger maps below. + secretTriggerMu sync.Mutex + // secretTriggerStamps is consumed when classifying the next reconcile. + // secretPropagationStamps is kept until Succeeded so retries are included. + secretTriggerStamps map[string]struct{} + secretPropagationStamps map[string]time.Time + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { @@ -61,17 +74,34 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req edb := &dbaasv1.ExternalDatabase{} if err := r.Get(ctx, req.NamespacedName, edb); err != nil { + if apierrors.IsNotFound(err) { + r.clearSecretTrigger(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + // Determine whether this reconcile was triggered by a Secret change or a + // spec change, and record the trigger counter. + edbKey := req.Namespace + "/" + req.Name + fromSecret := r.consumeSecretTrigger(edbKey) + + trigger := triggerSpecChange + if fromSecret { + trigger = triggerSecretChange + } else if r.consumeBindingTrigger(edbKey) { + trigger = triggerNamespaceBindingChange + } + // Skip namespaces not owned by this operator instance. owned, result, err := checkOwnership(ctx, r.Ownership, edb.Namespace, edb.Name, "ExternalDatabase") if err != nil { return ctrl.Result{}, err } if !owned { + r.clearSecretTrigger(edbKey) return result, nil } + recordReconcileTrigger(controllerEDB, trigger) // Snapshot for the status patch at the end of reconcile. original := edb.DeepCopy() @@ -85,13 +115,13 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req }() // Mark as Processing while we work. - // Conditions are NOT cleared here — setCondition upserts each type in place, + // Conditions are NOT cleared here - setCondition upserts each type in place, // preserving LastTransitionTime when Status has not changed. // This makes conditions durable API state across reconcile cycles. edb.Status.Phase = dbaasv1.PhaseProcessing // Validate that classifier.namespace, if set, matches the CR's own namespace. - // A mismatch is a permanent misconfiguration — no retry. + // A mismatch is a permanent misconfiguration - no retry. if ns := edb.Spec.Classifier["namespace"]; ns != "" && ns != edb.Namespace { return invalidSpec(ctx, &edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, @@ -119,6 +149,7 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req aggReq, err := r.buildRequest(ctx, edb) if err != nil { log.ErrorC(ctx, "failed to build registration request: %v", err) + dbaasSecretResolutionErrorsTotal.WithLabelValues(edb.Namespace, secretResolutionReason(err)).Inc() markTransientFailure(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonSecretError, err.Error()) r.Recorder.Eventf(edb, corev1.EventTypeWarning, EventReasonSecretError, @@ -132,13 +163,19 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req // Call the aggregator. edb.Status.LastRequestID = requestID - if err := r.Aggregator.RegisterExternalDatabase(ctx, namespace, aggReq); err != nil { - log.ErrorC(ctx, "failed to register external database in dbaas-aggregator: %v", err) - return handleAggregatorError(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, err, requestID) + aggStart := time.Now() + aggErr := r.Aggregator.RegisterExternalDatabase(ctx, namespace, aggReq) + recordAggregatorCall(controllerEDB, operationRegisterEDB, aggStart, aggErr) + if aggErr != nil { + log.ErrorC(ctx, "failed to register external database in dbaas-aggregator: %v", aggErr) + return handleAggregatorError(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, aggErr, requestID) } log.InfoC(ctx, "external database registered successfully. type: %v, dbName: %v", edb.Spec.Type, edb.Spec.DbName) markSucceeded(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonDatabaseRegistered) + if secretStart, ok := r.consumeSecretPropagation(edbKey); ok { + dbaasSecretRotationPropagationSeconds.Observe(time.Since(secretStart).Seconds()) + } r.Recorder.Eventf(edb, corev1.EventTypeNormal, EventReasonDatabaseRegistered, "registered with dbaas-aggregator (type=%s, dbName=%s)", edb.Spec.Type, edb.Spec.DbName) return ctrl.Result{}, nil @@ -210,12 +247,22 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( ref := cp.CredentialsSecretRef secret := &corev1.Secret{} if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: ref.Name}, secret); err != nil { - return fmt.Errorf( - "connectionProperties[%d]: get Secret %q: %w", - index, ref.Name, err) + reason := secretReasonReadFailed + switch { + case apierrors.IsNotFound(err): + reason = secretReasonNotFound + case apierrors.IsForbidden(err): + reason = secretReasonForbidden + } + return &secretResolutionError{ + reason: reason, + err: fmt.Errorf( + "connectionProperties[%d]: get Secret %q: %w", + index, ref.Name, err), + } } - // Defence-in-depth duplicate name check — CRD CEL validation should catch this + // Defence-in-depth duplicate name check - CRD CEL validation should catch this // first, but we guard here too in case validation is bypassed. seen := make(map[string]string, len(ref.Keys)) for _, km := range ref.Keys { @@ -228,14 +275,20 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( val, ok := secret.Data[km.Key] if !ok { - return fmt.Errorf( - "connectionProperties[%d]: Secret %q missing key %q", - index, ref.Name, km.Key) + return &secretResolutionError{ + reason: secretReasonKeyMissing, + err: fmt.Errorf( + "connectionProperties[%d]: Secret %q missing key %q", + index, ref.Name, km.Key), + } } if len(val) == 0 { - return fmt.Errorf( - "connectionProperties[%d]: Secret %q key %q is empty", - index, ref.Name, km.Key) + return &secretResolutionError{ + reason: secretReasonKeyEmpty, + err: fmt.Errorf( + "connectionProperties[%d]: Secret %q key %q is empty", + index, ref.Name, km.Key), + } } flat[km.Name] = string(val) } @@ -246,14 +299,14 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( // GenerationChangedPredicate ensures the controller reconciles only when the // spec changes (metadata.generation increments), not on its own status updates. // -// opts allows callers to customise the controller's behaviour — most notably +// opts allows callers to customise the controller's behaviour - most notably // the RateLimiter, which controls the exponential backoff applied when // Reconcile returns an error (BackingOff phase). Pass // ctrlcontroller.Options{} to keep the controller-runtime defaults. func (r *ExternalDatabaseReconciler) SetupWithManager(mgr ctrl.Manager, opts ctrlcontroller.Options) error { // Register the field index before building the controller. // controller-runtime calls indexSecretNames for every ExternalDatabase to - // build the reverse map ("namespace/secretName" → EDBs), then keeps it up to date + // build the reverse map ("namespace/secretName" -> EDBs), then keeps it up to date // automatically as EDBs are created, updated, or deleted. if err := mgr.GetFieldIndexer().IndexField( context.Background(), @@ -274,7 +327,7 @@ func (r *ExternalDatabaseReconciler) SetupWithManager(mgr ctrl.Manager, opts ctr handler.EnqueueRequestsFromMapFunc(r.enqueueForBinding)). // Re-enqueue ExternalDatabases that reference a Secret when that Secret // changes, so credential rotations take effect without a spec change. - // WatchesMetadata avoids caching Secret data (credentials) in operator memory — + // WatchesMetadata avoids caching Secret data (credentials) in operator memory - // enqueueForSecret only needs the name/namespace to query the field index. WatchesMetadata(&corev1.Secret{}, handler.EnqueueRequestsFromMapFunc(r.enqueueForSecret), @@ -294,14 +347,76 @@ func (r *ExternalDatabaseReconciler) enqueueForBinding(ctx context.Context, obj } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } +func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +func (r *ExternalDatabaseReconciler) stampSecretTrigger(key string, startedAt time.Time) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + if r.secretTriggerStamps == nil { + r.secretTriggerStamps = make(map[string]struct{}) + } + r.secretTriggerStamps[key] = struct{}{} + if r.secretPropagationStamps == nil { + r.secretPropagationStamps = make(map[string]time.Time) + } + if _, exists := r.secretPropagationStamps[key]; !exists { + r.secretPropagationStamps[key] = startedAt + } +} + +func (r *ExternalDatabaseReconciler) consumeSecretTrigger(key string) bool { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + if _, ok := r.secretTriggerStamps[key]; !ok { + return false + } + delete(r.secretTriggerStamps, key) + return true +} + +func (r *ExternalDatabaseReconciler) consumeSecretPropagation(key string) (time.Time, bool) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + start, ok := r.secretPropagationStamps[key] + if ok { + delete(r.secretPropagationStamps, key) + } + return start, ok +} + +func (r *ExternalDatabaseReconciler) clearSecretTrigger(key string) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + delete(r.secretTriggerStamps, key) + delete(r.secretPropagationStamps, key) +} + // indexSecretNames is the indexer function registered at startup. // controller-runtime calls it for every ExternalDatabase to populate and -// maintain the reverse map: "namespace/secretName" → []ExternalDatabase. +// maintain the reverse map: "namespace/secretName" -> []ExternalDatabase. // The key includes the namespace to prevent spurious cross-namespace reconciles // when two namespaces have secrets with the same name. func indexSecretNames(obj client.Object) []string { @@ -320,11 +435,11 @@ func indexSecretNames(obj client.Object) []string { } // enqueueForSecret maps a Secret event to reconcile requests for ExternalDatabases -// that reference it. Uses the field index for an O(1) lookup — no full list scan. +// that reference it. Uses the field index for an O(1) lookup - no full list scan. func (r *ExternalDatabaseReconciler) enqueueForSecret(ctx context.Context, obj client.Object) []reconcile.Request { list := &dbaasv1.ExternalDatabaseList{} if err := r.List(ctx, list, - // MatchingFields resolves against the in-memory index — no API server call, no scan. + // MatchingFields resolves against the in-memory index - no API server call, no scan. // The key encodes namespace+name so results are already namespace-scoped. client.MatchingFields{secretNamesIndex: obj.GetNamespace() + "/" + obj.GetName()}, ); err != nil { @@ -332,8 +447,12 @@ func (r *ExternalDatabaseReconciler) enqueueForSecret(ctx context.Context, obj c obj.GetNamespace(), obj.GetName(), err) return nil } + + now := time.Now() reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + key := list.Items[i].Namespace + "/" + list.Items[i].Name + r.stampSecretTrigger(key, now) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs diff --git a/dbaas-operator/internal/controller/metrics.go b/dbaas-operator/internal/controller/metrics.go new file mode 100644 index 000000000..8217a4a08 --- /dev/null +++ b/dbaas-operator/internal/controller/metrics.go @@ -0,0 +1,194 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "errors" + "time" + + "github.com/prometheus/client_golang/prometheus" + + aggregatorclient "github.com/netcracker/qubership-dbaas/dbaas-operator/internal/client" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// Label value constants. + +// Keeping the constants here because they are only relevant to Prometheus metrics. +const ( + controllerEDB = "externaldatabase" + controllerDD = "databasedeclaration" + controllerDP = "dbpolicy" + + triggerSpecChange = "spec_change" + triggerSecretChange = "secret_change" + triggerNamespaceBindingChange = "namespace_binding_change" + triggerPolling = "polling" + + resultSuccess = "success" + resultAuthError = "auth_error" + resultSpecRejection = "spec_rejection" + resultServerError = "server_error" + resultNetworkError = "network_error" + + secretReasonNotFound = "secret_not_found" + secretReasonKeyMissing = "key_missing" + secretReasonKeyEmpty = "key_empty" + secretReasonForbidden = "forbidden" + secretReasonReadFailed = "secret_read_failed" + + operationRegisterEDB = "register_external_database" + operationApplyConfig = "apply_config" + operationPollStatus = "poll_status" +) + +// Metric declarations. + +// dbaasReconcileTriggerTotal counts reconcile invocations by meaningful source. +// Controller-runtime already exposes reconcile totals; this metric adds the +// trigger dimension that the framework cannot infer. +var dbaasReconcileTriggerTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_reconcile_trigger_total", + Help: "Total reconcile invocations by trigger source.", + }, + []string{"controller", "trigger"}, +) + +// dbaasSecretResolutionErrorsTotal counts failures reading credential Secrets, +// labelled by error category. A non-zero value means a credential rotation +// left a database without valid credentials - direct service impact. +var dbaasSecretResolutionErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_secret_resolution_errors_total", + Help: "Total failures reading credential Secrets referenced by ExternalDatabase.", + }, + []string{"namespace", "reason"}, +) + +// dbaasSecretRotationPropagationSeconds measures end-to-end time from a +// Secret change triggering a reconcile to the ExternalDatabase reaching +// Succeeded. This is the SLO metric for the credential-rotation feature. +var dbaasSecretRotationPropagationSeconds = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "dbaas_secret_rotation_propagation_seconds", + Help: "Time from a Secret change trigger to ExternalDatabase reaching Succeeded.", + Buckets: []float64{0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600}, + }, +) + +// dbaasAggregatorRequestDurationSeconds tracks HTTP call latency to dbaas-aggregator. +// P50/P90/P99 per operation type feed latency SLO alerting. +var dbaasAggregatorRequestDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "dbaas_aggregator_request_duration_seconds", + Help: "Duration of HTTP calls to dbaas-aggregator by controller and operation.", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + []string{"controller", "operation"}, +) + +// dbaasAggregatorRequestsTotal counts aggregator calls by controller, operation, +// and result. The result label distinguishes user errors (spec_rejection) from +// platform errors (auth_error, server_error) so each can be routed to the +// correct owner. +var dbaasAggregatorRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_aggregator_requests_total", + Help: "Total calls to dbaas-aggregator, labelled by controller, operation, and result.", + }, + []string{"controller", "operation", "result"}, +) + +// dbaasAsyncOperationDurationSeconds measures end-to-end provisioning time from +// async submit (HTTP 202) to final poll outcome. This is the user-visible DD SLO. +var dbaasAsyncOperationDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "dbaas_async_operation_duration_seconds", + Help: "End-to-end duration of async DatabaseDeclaration provisioning operations.", + Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600, 7200}, + }, + []string{"result"}, +) + +func init() { + metrics.Registry.MustRegister( + dbaasReconcileTriggerTotal, + dbaasSecretResolutionErrorsTotal, + dbaasSecretRotationPropagationSeconds, + dbaasAggregatorRequestDurationSeconds, + dbaasAggregatorRequestsTotal, + dbaasAsyncOperationDurationSeconds, + ) +} + +// Helper functions. + +// recordAggregatorCall records both the duration and the outcome counter for a +// single aggregator HTTP call. Call it immediately after every aggregator +// method returns, passing the start time and the error (nil = success). +func recordAggregatorCall(controller, operation string, start time.Time, err error) { + dbaasAggregatorRequestDurationSeconds. + WithLabelValues(controller, operation). + Observe(time.Since(start).Seconds()) + dbaasAggregatorRequestsTotal. + WithLabelValues(controller, operation, aggregatorResult(err)). + Inc() +} + +// aggregatorResult maps an error from AggregatorClient to a result label. +func aggregatorResult(err error) string { + if err == nil { + return resultSuccess + } + var aggErr *aggregatorclient.AggregatorError + if errors.As(err, &aggErr) { + if aggErr.IsAuthError() { + return resultAuthError + } + if aggErr.IsSpecRejection() { + return resultSpecRejection + } + return resultServerError + } + return resultNetworkError +} + +type secretResolutionError struct { + reason string + err error +} + +func (e *secretResolutionError) Error() string { + return e.err.Error() +} + +func (e *secretResolutionError) Unwrap() error { + return e.err +} + +func secretResolutionReason(err error) string { + var secretErr *secretResolutionError + if errors.As(err, &secretErr) { + return secretErr.reason + } + return secretReasonReadFailed +} + +func recordReconcileTrigger(controller, trigger string) { + dbaasReconcileTriggerTotal.WithLabelValues(controller, trigger).Inc() +} diff --git a/dbaas-operator/internal/controller/metrics_test.go b/dbaas-operator/internal/controller/metrics_test.go new file mode 100644 index 000000000..68a81f43f --- /dev/null +++ b/dbaas-operator/internal/controller/metrics_test.go @@ -0,0 +1,52 @@ +package controller + +import ( + "errors" + "fmt" + "testing" + + aggregatorclient "github.com/netcracker/qubership-dbaas/dbaas-operator/internal/client" +) + +func TestAggregatorResultClassifiesOwnershipBuckets(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + {name: "success", err: nil, want: resultSuccess}, + {name: "auth", err: &aggregatorclient.AggregatorError{StatusCode: 401}, want: resultAuthError}, + {name: "spec rejection", err: &aggregatorclient.AggregatorError{StatusCode: 409}, want: resultSpecRejection}, + {name: "server", err: &aggregatorclient.AggregatorError{StatusCode: 500}, want: resultServerError}, + {name: "wrapped server", err: fmt.Errorf("wrapped: %w", &aggregatorclient.AggregatorError{StatusCode: 503}), want: resultServerError}, + {name: "network", err: errors.New("dial tcp: connection refused"), want: resultNetworkError}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := aggregatorResult(tt.err); got != tt.want { + t.Fatalf("aggregatorResult() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestSecretResolutionReason(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + {name: "typed", err: &secretResolutionError{reason: secretReasonKeyMissing, err: errors.New("missing")}, want: secretReasonKeyMissing}, + {name: "wrapped typed", err: fmt.Errorf("wrapped: %w", &secretResolutionError{reason: secretReasonForbidden, err: errors.New("forbidden")}), want: secretReasonForbidden}, + {name: "generic", err: errors.New("other"), want: secretReasonReadFailed}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := secretResolutionReason(tt.err); got != tt.want { + t.Fatalf("secretResolutionReason() = %q, want %q", got, tt.want) + } + }) + } +} From 3cd06d87cb43198f6bd8c3d25432aee2a42d4fbb Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Tue, 19 May 2026 14:58:28 +0530 Subject: [PATCH 2/6] feat: pushing fixes --- dbaas-operator/cmd/main.go | 10 ++-- dbaas-operator/dev/operator-metrics.md | 4 ++ .../dashboards/dbaas-operator-dashboard.json | 47 ++++++++++--------- .../databasedeclaration_controller.go | 12 ++++- .../controller/dbpolicy_controller.go | 8 ++++ .../controller/externaldatabase_controller.go | 43 +++++++++++------ dbaas-operator/internal/controller/metrics.go | 3 ++ 7 files changed, 83 insertions(+), 44 deletions(-) diff --git a/dbaas-operator/cmd/main.go b/dbaas-operator/cmd/main.go index a255f5bc6..7486fc4f2 100644 --- a/dbaas-operator/cmd/main.go +++ b/dbaas-operator/cmd/main.go @@ -95,15 +95,15 @@ func main() { BindAddress: metricsAddr, } - // Operator namespace. + // ── Operator namespace ──────────────────────────────────────────────────── cloudNamespace := os.Getenv("CLOUD_NAMESPACE") if cloudNamespace == "" { - setupLog.Errorf("CLOUD_NAMESPACE env var is not set - ownership checks will not work correctly") + setupLog.Errorf("CLOUD_NAMESPACE env var is not set — ownership checks will not work correctly") os.Exit(1) } setupLog.Infof("operator namespace cloud-namespace=%v", cloudNamespace) - // dbaas-aggregator client. + // ── dbaas-aggregator client ─────────────────────────────────────────────── aggregatorURL := os.Getenv("DBAAS_AGGREGATOR_URL") if aggregatorURL == "" { aggregatorURL = "http://dbaas-aggregator:8080" @@ -165,14 +165,14 @@ func main() { } setupLog.Infof("backoff configured base=%v max=%v", backoffBaseDelay, backoffMaxDelay) - // Ownership resolver. + // ── Ownership resolver ──────────────────────────────────────────────────── ownershipResolver := ownership.NewOwnershipResolver(cloudNamespace, mgr.GetClient()) if err := mgr.Add(&ownershipWarmupRunnable{resolver: ownershipResolver}); err != nil { setupLog.Errorf("Failed to register ownership warmup runnable: %v", err) os.Exit(1) } - // NamespaceBinding controller (always enabled). + // ── NamespaceBinding controller (always enabled) ─────────────────────────── edbChecker := ownership.NewKindChecker( mgr.GetClient(), func() *dbaasv1.ExternalDatabaseList { return &dbaasv1.ExternalDatabaseList{} }, diff --git a/dbaas-operator/dev/operator-metrics.md b/dbaas-operator/dev/operator-metrics.md index 9bc420c56..ad64638dc 100644 --- a/dbaas-operator/dev/operator-metrics.md +++ b/dbaas-operator/dev/operator-metrics.md @@ -18,6 +18,8 @@ Each metric is registered at startup and scraped by Prometheus. Counts every reconcile invocation, tagged by what caused it. A `secret_change` increment means the watcher detected a credential rotation and re-registered the database automatically without a CR spec change. +Trigger classification is best-effort under overlapping events for the same object. The metric is useful for dashboard-level distribution and feature proof, but it should not be used as exact causal tracing or as an alert source. + **Dashboard:** Stacked time series by trigger. A spike in `secret_change` is direct proof the feature fired. --- @@ -173,4 +175,6 @@ End-to-end provisioning time from async submission (HTTP 202) to a terminal stat The start timestamp is kept in memory after the operator submits an async operation. If the operator restarts before the terminal poll result, that operation is still reconciled correctly, but its duration sample is not recorded. +Operational note: alerts on this histogram should rely on absolute latency percentiles, not on sample-rate dips. A drop in `rate(dbaas_async_operation_duration_seconds_count[...])` after an operator restart usually reflects lost samples, not a slowdown in actual provisioning. + **Dashboard:** P50 / P90 / P99 per result label. Completion rate (operations/s) as a secondary panel. diff --git a/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json index 58c1f7ec6..2b4acf6c5 100644 --- a/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json +++ b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json @@ -3,7 +3,7 @@ "uid": "dbaas-operator", "schemaVersion": 39, "version": 3, - "refresh": "10s", + "refresh": "30s", "time": { "from": "now-15m", "to": "now" @@ -106,7 +106,7 @@ }, "targets": [ { - "expr": "sum by (trigger) (rate(dbaas_reconcile_trigger_total{controller=\"externaldatabase\"}[1m]))", + "expr": "sum by (trigger) (rate(dbaas_reconcile_trigger_total{namespace=\"#namespace#\",controller=\"externaldatabase\"}[1m]))", "legendFormat": "{{trigger}}" } ] @@ -139,15 +139,15 @@ }, "targets": [ { - "expr": "histogram_quantile(0.50, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", "legendFormat": "P50" }, { - "expr": "histogram_quantile(0.90, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.90, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", "legendFormat": "P90" }, { - "expr": "histogram_quantile(0.99, rate(dbaas_secret_rotation_propagation_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.99, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", "legendFormat": "P99" } ] @@ -184,8 +184,8 @@ }, "targets": [ { - "expr": "sum by (namespace, reason) (rate(dbaas_secret_resolution_errors_total[1m]))", - "legendFormat": "{{namespace}} / {{reason}}" + "expr": "sum by (exported_namespace, reason) (rate(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[1m]))", + "legendFormat": "{{exported_namespace}} / {{reason}}" } ] }, @@ -229,15 +229,15 @@ }, "targets": [ { - "expr": "histogram_quantile(0.50, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P50 {{controller}}/{{operation}}" }, { - "expr": "histogram_quantile(0.90, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.90, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P90 {{controller}}/{{operation}}" }, { - "expr": "histogram_quantile(0.99, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P99 {{controller}}/{{operation}}" } ] @@ -347,7 +347,7 @@ }, "targets": [ { - "expr": "sum by (result) (rate(dbaas_aggregator_requests_total[1m]))", + "expr": "sum by (result) (rate(dbaas_aggregator_requests_total{namespace=\"#namespace#\"}[1m]))", "legendFormat": "{{result}}" } ] @@ -400,13 +400,13 @@ }, "targets": [ { - "expr": "sum by (controller, result) (increase(dbaas_aggregator_requests_total{result!=\"success\"}[$__range]))", + "expr": "sum by (controller, result) (increase(dbaas_aggregator_requests_total{namespace=\"#namespace#\",result!=\"success\"}[$__range]))", "legendFormat": "aggregator {{controller}} / {{result}}", "refId": "A" }, { - "expr": "sum by (namespace, reason) (increase(dbaas_secret_resolution_errors_total[$__range]))", - "legendFormat": "secret {{namespace}} / {{reason}}", + "expr": "sum by (exported_namespace, reason) (increase(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[$__range]))", + "legendFormat": "secret {{exported_namespace}} / {{reason}}", "refId": "B" } ] @@ -447,13 +447,13 @@ }, "targets": [ { - "expr": "sum by (controller, result) (rate(dbaas_aggregator_requests_total{result!=\"success\"}[15m])) \u003e 0", + "expr": "sum by (controller, result) (rate(dbaas_aggregator_requests_total{namespace=\"#namespace#\",result!=\"success\"}[15m])) \u003e 0", "legendFormat": "aggregator {{controller}} / {{result}}", "refId": "A" }, { - "expr": "sum by (namespace, reason) (rate(dbaas_secret_resolution_errors_total[15m])) \u003e 0", - "legendFormat": "secret {{namespace}} / {{reason}}", + "expr": "sum by (exported_namespace, reason) (rate(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[15m])) \u003e 0", + "legendFormat": "secret {{exported_namespace}} / {{reason}}", "refId": "B" } ] @@ -494,12 +494,12 @@ }, "targets": [ { - "expr": "sum(rate(dbaas_reconcile_trigger_total{controller=\"databasedeclaration\",trigger=\"polling\"}[15m]))", + "expr": "sum(rate(dbaas_reconcile_trigger_total{namespace=\"#namespace#\",controller=\"databasedeclaration\",trigger=\"polling\"}[15m]))", "legendFormat": "polling rate", "refId": "A" }, { - "expr": "sum(rate(dbaas_async_operation_duration_seconds_count[15m]))", + "expr": "sum(rate(dbaas_async_operation_duration_seconds_count{namespace=\"#namespace#\"}[15m]))", "legendFormat": "completion rate", "refId": "B" } @@ -533,15 +533,15 @@ }, "targets": [ { - "expr": "histogram_quantile(0.50, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P50 {{result}}" }, { - "expr": "histogram_quantile(0.90, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.90, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P90 {{result}}" }, { - "expr": "histogram_quantile(0.99, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", "legendFormat": "P99 {{result}}" } ] @@ -621,10 +621,11 @@ }, "targets": [ { - "expr": "sum by (result) (rate(dbaas_async_operation_duration_seconds_count[1m]))", + "expr": "sum by (result) (rate(dbaas_async_operation_duration_seconds_count{namespace=\"#namespace#\"}[1m]))", "legendFormat": "{{result}}" } ] } ] } + diff --git a/dbaas-operator/internal/controller/databasedeclaration_controller.go b/dbaas-operator/internal/controller/databasedeclaration_controller.go index eab94d769..79f66e62f 100644 --- a/dbaas-operator/internal/controller/databasedeclaration_controller.go +++ b/dbaas-operator/internal/controller/databasedeclaration_controller.go @@ -464,7 +464,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( log.InfoC(ctx, "database provisioning failed trackingId=%v status=%v reason=%v", trackingID, resp.Status, reason) clearPendingOperation(dd) - r.observeAsyncCompletion(dd, "failed") + r.observeAsyncCompletion(dd, asyncResultFailed) markPermanentFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonAggregatorRejected, reason) r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonAggregatorRejected, @@ -478,7 +478,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( // Clear the stale trackingID so the next reconcile enters the SUBMIT branch. log.InfoC(ctx, "provisioning was terminated, clearing trackingId for resubmit trackingId=%v", trackingID) clearPendingOperation(dd) - r.observeAsyncCompletion(dd, "terminated") + r.observeAsyncCompletion(dd, asyncResultTerminated) markTransientFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonOperationTerminated, "provisioning was terminated by the aggregator, resubmitting") r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonOperationTerminated, @@ -539,6 +539,10 @@ func (r *DatabaseDeclarationReconciler) enqueueForBinding(ctx context.Context, o return reqs } +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -548,6 +552,10 @@ func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { r.bindingTriggerStamps[key] = struct{}{} } +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *DatabaseDeclarationReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/dbpolicy_controller.go b/dbaas-operator/internal/controller/dbpolicy_controller.go index aef2c341d..09821a006 100644 --- a/dbaas-operator/internal/controller/dbpolicy_controller.go +++ b/dbaas-operator/internal/controller/dbpolicy_controller.go @@ -191,6 +191,10 @@ func (r *DbPolicyReconciler) enqueueForBinding(ctx context.Context, obj client.O return reqs } +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *DbPolicyReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -200,6 +204,10 @@ func (r *DbPolicyReconciler) stampBindingTrigger(key string) { r.bindingTriggerStamps[key] = struct{}{} } +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *DbPolicyReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index 3d001e488..005283945 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -72,18 +72,24 @@ type ExternalDatabaseReconciler struct { func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { ctx, requestID := initReconcileContext(ctx) + edbKey := req.Namespace + "/" + req.Name edb := &dbaasv1.ExternalDatabase{} if err := r.Get(ctx, req.NamespacedName, edb); err != nil { if apierrors.IsNotFound(err) { - r.clearSecretTrigger(req.Namespace + "/" + req.Name) + r.clearSecretTrigger(edbKey) } return ctrl.Result{}, client.IgnoreNotFound(err) } // Determine whether this reconcile was triggered by a Secret change or a // spec change, and record the trigger counter. - edbKey := req.Namespace + "/" + req.Name fromSecret := r.consumeSecretTrigger(edbKey) + defer func() { + secretStart, ok := r.consumeSecretPropagation(edbKey) + if ok && edb.Status.Phase == dbaasv1.PhaseSucceeded { + dbaasSecretRotationPropagationSeconds.Observe(time.Since(secretStart).Seconds()) + } + }() trigger := triggerSpecChange if fromSecret { @@ -115,13 +121,13 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req }() // Mark as Processing while we work. - // Conditions are NOT cleared here - setCondition upserts each type in place, + // Conditions are NOT cleared here — setCondition upserts each type in place, // preserving LastTransitionTime when Status has not changed. // This makes conditions durable API state across reconcile cycles. edb.Status.Phase = dbaasv1.PhaseProcessing // Validate that classifier.namespace, if set, matches the CR's own namespace. - // A mismatch is a permanent misconfiguration - no retry. + // A mismatch is a permanent misconfiguration — no retry. if ns := edb.Spec.Classifier["namespace"]; ns != "" && ns != edb.Namespace { return invalidSpec(ctx, &edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, @@ -173,9 +179,6 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req log.InfoC(ctx, "external database registered successfully. type: %v, dbName: %v", edb.Spec.Type, edb.Spec.DbName) markSucceeded(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonDatabaseRegistered) - if secretStart, ok := r.consumeSecretPropagation(edbKey); ok { - dbaasSecretRotationPropagationSeconds.Observe(time.Since(secretStart).Seconds()) - } r.Recorder.Eventf(edb, corev1.EventTypeNormal, EventReasonDatabaseRegistered, "registered with dbaas-aggregator (type=%s, dbName=%s)", edb.Spec.Type, edb.Spec.DbName) return ctrl.Result{}, nil @@ -262,7 +265,7 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( } } - // Defence-in-depth duplicate name check - CRD CEL validation should catch this + // Defence-in-depth duplicate name check — CRD CEL validation should catch this // first, but we guard here too in case validation is bypassed. seen := make(map[string]string, len(ref.Keys)) for _, km := range ref.Keys { @@ -299,14 +302,14 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( // GenerationChangedPredicate ensures the controller reconciles only when the // spec changes (metadata.generation increments), not on its own status updates. // -// opts allows callers to customise the controller's behaviour - most notably +// opts allows callers to customise the controller's behaviour — most notably // the RateLimiter, which controls the exponential backoff applied when // Reconcile returns an error (BackingOff phase). Pass // ctrlcontroller.Options{} to keep the controller-runtime defaults. func (r *ExternalDatabaseReconciler) SetupWithManager(mgr ctrl.Manager, opts ctrlcontroller.Options) error { // Register the field index before building the controller. // controller-runtime calls indexSecretNames for every ExternalDatabase to - // build the reverse map ("namespace/secretName" -> EDBs), then keeps it up to date + // build the reverse map ("namespace/secretName" → EDBs), then keeps it up to date // automatically as EDBs are created, updated, or deleted. if err := mgr.GetFieldIndexer().IndexField( context.Background(), @@ -327,7 +330,7 @@ func (r *ExternalDatabaseReconciler) SetupWithManager(mgr ctrl.Manager, opts ctr handler.EnqueueRequestsFromMapFunc(r.enqueueForBinding)). // Re-enqueue ExternalDatabases that reference a Secret when that Secret // changes, so credential rotations take effect without a spec change. - // WatchesMetadata avoids caching Secret data (credentials) in operator memory - + // WatchesMetadata avoids caching Secret data (credentials) in operator memory — // enqueueForSecret only needs the name/namespace to query the field index. WatchesMetadata(&corev1.Secret{}, handler.EnqueueRequestsFromMapFunc(r.enqueueForSecret), @@ -353,6 +356,10 @@ func (r *ExternalDatabaseReconciler) enqueueForBinding(ctx context.Context, obj return reqs } +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -362,6 +369,10 @@ func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { r.bindingTriggerStamps[key] = struct{}{} } +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// for the same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -372,6 +383,10 @@ func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { return true } +// stampSecretTrigger records that the next reconcile for key was most likely +// caused by a Secret change. This is best-effort: overlapping triggers for the +// same key can swap labels between queued reconciles, so the metric is +// informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) stampSecretTrigger(key string, startedAt time.Time) { r.secretTriggerMu.Lock() defer r.secretTriggerMu.Unlock() @@ -416,7 +431,7 @@ func (r *ExternalDatabaseReconciler) clearSecretTrigger(key string) { // indexSecretNames is the indexer function registered at startup. // controller-runtime calls it for every ExternalDatabase to populate and -// maintain the reverse map: "namespace/secretName" -> []ExternalDatabase. +// maintain the reverse map: "namespace/secretName" → []ExternalDatabase. // The key includes the namespace to prevent spurious cross-namespace reconciles // when two namespaces have secrets with the same name. func indexSecretNames(obj client.Object) []string { @@ -435,11 +450,11 @@ func indexSecretNames(obj client.Object) []string { } // enqueueForSecret maps a Secret event to reconcile requests for ExternalDatabases -// that reference it. Uses the field index for an O(1) lookup - no full list scan. +// that reference it. Uses the field index for an O(1) lookup — no full list scan. func (r *ExternalDatabaseReconciler) enqueueForSecret(ctx context.Context, obj client.Object) []reconcile.Request { list := &dbaasv1.ExternalDatabaseList{} if err := r.List(ctx, list, - // MatchingFields resolves against the in-memory index - no API server call, no scan. + // MatchingFields resolves against the in-memory index — no API server call, no scan. // The key encodes namespace+name so results are already namespace-scoped. client.MatchingFields{secretNamesIndex: obj.GetNamespace() + "/" + obj.GetName()}, ); err != nil { diff --git a/dbaas-operator/internal/controller/metrics.go b/dbaas-operator/internal/controller/metrics.go index 8217a4a08..effb67f4a 100644 --- a/dbaas-operator/internal/controller/metrics.go +++ b/dbaas-operator/internal/controller/metrics.go @@ -45,6 +45,9 @@ const ( resultServerError = "server_error" resultNetworkError = "network_error" + asyncResultFailed = "failed" + asyncResultTerminated = "terminated" + secretReasonNotFound = "secret_not_found" secretReasonKeyMissing = "key_missing" secretReasonKeyEmpty = "key_empty" From 979dded75767be9659c97a4c33ba0860c129d114 Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Tue, 19 May 2026 15:36:45 +0530 Subject: [PATCH 3/6] feat: added tests for trigger stamp --- .../controller/trigger_stamps_test.go | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 dbaas-operator/internal/controller/trigger_stamps_test.go diff --git a/dbaas-operator/internal/controller/trigger_stamps_test.go b/dbaas-operator/internal/controller/trigger_stamps_test.go new file mode 100644 index 000000000..5b419b74a --- /dev/null +++ b/dbaas-operator/internal/controller/trigger_stamps_test.go @@ -0,0 +1,127 @@ +package controller + +import ( + "sync" + "testing" + "time" +) + +func TestExternalDatabaseSecretTriggerLifecycle(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + start := time.Unix(100, 0) + later := start.Add(time.Minute) + + r.stampSecretTrigger(key, start) + r.stampSecretTrigger(key, later) + + if !r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() = false, want true") + } + if r.consumeSecretTrigger(key) { + t.Fatalf("second consumeSecretTrigger() = true, want false") + } + + gotStart, ok := r.consumeSecretPropagation(key) + if !ok { + t.Fatalf("consumeSecretPropagation() ok = false, want true") + } + if !gotStart.Equal(start) { + t.Fatalf("consumeSecretPropagation() = %v, want earliest %v", gotStart, start) + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("second consumeSecretPropagation() ok = true, want false") + } +} + +func TestExternalDatabaseClearSecretTriggerClearsTriggerAndPropagation(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + + r.stampSecretTrigger(key, time.Unix(100, 0)) + r.clearSecretTrigger(key) + + if r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() after clear = true, want false") + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("consumeSecretPropagation() after clear ok = true, want false") + } +} + +func TestExternalDatabaseBindingTriggerLifecycle(t *testing.T) { + r := &ExternalDatabaseReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestDatabaseDeclarationBindingTriggerLifecycle(t *testing.T) { + r := &DatabaseDeclarationReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestDbPolicyBindingTriggerLifecycle(t *testing.T) { + r := &DbPolicyReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestExternalDatabaseTriggerStampsConcurrentAccess(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + + var wg sync.WaitGroup + for i := 0; i < 50; i++ { + wg.Add(4) + go func(i int) { + defer wg.Done() + r.stampSecretTrigger(key, time.Unix(int64(i), 0)) + }(i) + go func() { + defer wg.Done() + _ = r.consumeSecretTrigger(key) + }() + go func() { + defer wg.Done() + r.stampBindingTrigger(key) + }() + go func() { + defer wg.Done() + _ = r.consumeBindingTrigger(key) + }() + } + wg.Wait() + + r.clearSecretTrigger(key) + r.clearBindingTrigger(key) + + if r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() after concurrent clear = true, want false") + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("consumeSecretPropagation() after concurrent clear ok = true, want false") + } + if r.consumeBindingTrigger(key) { + t.Fatalf("consumeBindingTrigger() after concurrent clear = true, want false") + } +} + +func assertBindingTriggerLifecycle(t *testing.T, stamp func(string), consume func(string) bool, clear func(string)) { + t.Helper() + + key := "test-ns/test-resource" + + stamp(key) + stamp(key) + + if !consume(key) { + t.Fatalf("consume() = false, want true") + } + if consume(key) { + t.Fatalf("second consume() = true, want false") + } + + stamp(key) + clear(key) + if consume(key) { + t.Fatalf("consume() after clear = true, want false") + } +} From 02f9b2fab2e9eb9a2e39d24cfa2effaf5443ee3c Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Tue, 19 May 2026 19:55:38 +0530 Subject: [PATCH 4/6] feat: adding the clear trigger --- .../internal/controller/externaldatabase_controller.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index 1cf3e383b..1a3212a42 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -29,8 +29,8 @@ import ( "time" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" @@ -79,6 +79,7 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req if err := r.Get(ctx, req.NamespacedName, edb); err != nil { if apierrors.IsNotFound(err) { r.clearSecretTrigger(edbKey) + r.clearBindingTrigger(edbKey) } return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -107,6 +108,7 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req } if !owned { r.clearSecretTrigger(edbKey) + r.clearBindingTrigger(edbKey) return result, nil } recordReconcileTrigger(controllerEDB, trigger) @@ -433,6 +435,12 @@ func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { return true } +func (r *ExternalDatabaseReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} + // stampSecretTrigger records that the next reconcile for key was most likely // caused by a Secret change. This is best-effort: overlapping triggers for the // same key can swap labels between queued reconciles, so the metric is From 7339e78d6eaefdd1ec22f457452b537bef1bae8d Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Wed, 20 May 2026 11:40:01 +0530 Subject: [PATCH 5/6] feat: updating comments, and one test --- .../internal/controller/databasedeclaration_controller.go | 1 + dbaas-operator/internal/controller/dbpolicy_controller.go | 1 + .../internal/controller/externaldatabase_controller.go | 4 +++- dbaas-operator/internal/controller/trigger_stamps_test.go | 6 +++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dbaas-operator/internal/controller/databasedeclaration_controller.go b/dbaas-operator/internal/controller/databasedeclaration_controller.go index 79f66e62f..5c4ef0a23 100644 --- a/dbaas-operator/internal/controller/databasedeclaration_controller.go +++ b/dbaas-operator/internal/controller/databasedeclaration_controller.go @@ -566,6 +566,7 @@ func (r *DatabaseDeclarationReconciler) consumeBindingTrigger(key string) bool { return true } +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. func (r *DatabaseDeclarationReconciler) clearBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/dbpolicy_controller.go b/dbaas-operator/internal/controller/dbpolicy_controller.go index 09821a006..1dcf015b9 100644 --- a/dbaas-operator/internal/controller/dbpolicy_controller.go +++ b/dbaas-operator/internal/controller/dbpolicy_controller.go @@ -218,6 +218,7 @@ func (r *DbPolicyReconciler) consumeBindingTrigger(key string) bool { return true } +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. func (r *DbPolicyReconciler) clearBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index 1a3212a42..e78d312b0 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -63,7 +63,8 @@ type ExternalDatabaseReconciler struct { // secretTriggerMu guards the Secret-trigger maps below. secretTriggerMu sync.Mutex // secretTriggerStamps is consumed when classifying the next reconcile. - // secretPropagationStamps is kept until Succeeded so retries are included. + // secretPropagationStamps records the first Secret-change time for the next + // reconcile. It is consumed on reconcile exit and observed only on Succeeded. secretTriggerStamps map[string]struct{} secretPropagationStamps map[string]time.Time @@ -435,6 +436,7 @@ func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { return true } +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. func (r *ExternalDatabaseReconciler) clearBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/trigger_stamps_test.go b/dbaas-operator/internal/controller/trigger_stamps_test.go index 5b419b74a..cd5602b94 100644 --- a/dbaas-operator/internal/controller/trigger_stamps_test.go +++ b/dbaas-operator/internal/controller/trigger_stamps_test.go @@ -70,7 +70,7 @@ func TestExternalDatabaseTriggerStampsConcurrentAccess(t *testing.T) { var wg sync.WaitGroup for i := 0; i < 50; i++ { - wg.Add(4) + wg.Add(5) go func(i int) { defer wg.Done() r.stampSecretTrigger(key, time.Unix(int64(i), 0)) @@ -79,6 +79,10 @@ func TestExternalDatabaseTriggerStampsConcurrentAccess(t *testing.T) { defer wg.Done() _ = r.consumeSecretTrigger(key) }() + go func() { + defer wg.Done() + _, _ = r.consumeSecretPropagation(key) + }() go func() { defer wg.Done() r.stampBindingTrigger(key) From 634437bec85c2143638ffabd9f9d5ad4a3aab30c Mon Sep 17 00:00:00 2001 From: Abhimanyu Roy Date: Wed, 20 May 2026 15:31:36 +0530 Subject: [PATCH 6/6] feat: pushing improvements --- dbaas-operator/dev/operator-metrics.md | 10 ++++---- .../databasedeclaration_controller.go | 23 +++++++++++++++---- .../controller/dbpolicy_controller.go | 8 +++---- .../controller/externaldatabase_controller.go | 23 ++++++++----------- dbaas-operator/internal/controller/metrics.go | 2 +- .../internal/controller/metrics_test.go | 1 + .../controller/trigger_stamps_test.go | 16 +++++++++++++ 7 files changed, 55 insertions(+), 28 deletions(-) diff --git a/dbaas-operator/dev/operator-metrics.md b/dbaas-operator/dev/operator-metrics.md index ad64638dc..bb9322959 100644 --- a/dbaas-operator/dev/operator-metrics.md +++ b/dbaas-operator/dev/operator-metrics.md @@ -18,7 +18,7 @@ Each metric is registered at startup and scraped by Prometheus. Counts every reconcile invocation, tagged by what caused it. A `secret_change` increment means the watcher detected a credential rotation and re-registered the database automatically without a CR spec change. -Trigger classification is best-effort under overlapping events for the same object. The metric is useful for dashboard-level distribution and feature proof, but it should not be used as exact causal tracing or as an alert source. +Trigger classification is best-effort. Overlapping events for the same object can swap labels between queued reconciles. A stamped trigger can also be consumed by a reconcile that is later skipped on namespace ownership, for example while a matching `NamespaceBinding` has not yet propagated to the informer cache; the follow-up reconcile then falls back to `spec_change`. The metric is useful for dashboard-level distribution and feature proof, but it should not be used as exact causal tracing or as an alert source. **Dashboard:** Stacked time series by trigger. A spike in `secret_change` is direct proof the feature fired. @@ -101,8 +101,8 @@ sum by (controller, result) ( ``` ```promql -sum by (namespace, reason) ( - increase(dbaas_secret_resolution_errors_total[$__range]) +sum by (exported_namespace, reason) ( + increase(dbaas_secret_resolution_errors_total{namespace=""}[$__range]) ) ``` @@ -123,8 +123,8 @@ sum by (controller, result) ( ``` ```promql -sum by (namespace, reason) ( - rate(dbaas_secret_resolution_errors_total[15m]) +sum by (exported_namespace, reason) ( + rate(dbaas_secret_resolution_errors_total{namespace=""}[15m]) ) > 0 ``` diff --git a/dbaas-operator/internal/controller/databasedeclaration_controller.go b/dbaas-operator/internal/controller/databasedeclaration_controller.go index 5c4ef0a23..fc5525c92 100644 --- a/dbaas-operator/internal/controller/databasedeclaration_controller.go +++ b/dbaas-operator/internal/controller/databasedeclaration_controller.go @@ -81,7 +81,9 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd := &dbaasv1.DatabaseDeclaration{} if err := r.Get(ctx, req.NamespacedName, dd); err != nil { if apierrors.IsNotFound(err) { - r.clearBindingTrigger(req.Namespace + "/" + req.Name) + key := req.Namespace + "/" + req.Name + r.clearBindingTrigger(key) + r.clearAsyncStart(key) } return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -94,6 +96,8 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. return ctrl.Result{}, err } if !owned { + r.clearBindingTrigger(key) + r.clearAsyncStart(key) return result, nil } @@ -118,6 +122,7 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd.Status.PendingOperationGeneration, dd.Generation, dd.Status.TrackingID) dd.Status.TrackingID = "" dd.Status.PendingOperationGeneration = 0 + r.clearAsyncStart(key) } trigger := triggerSpecChange @@ -396,6 +401,13 @@ func (r *DatabaseDeclarationReconciler) observeAsyncCompletion(dd *dbaasv1.Datab } } +// clearAsyncStart drops any pending async-operation start stamp for key. +func (r *DatabaseDeclarationReconciler) clearAsyncStart(key string) { + r.asyncStartMu.Lock() + defer r.asyncStartMu.Unlock() + delete(r.asyncStartTimes, key) +} + func (r *DatabaseDeclarationReconciler) handlePollError( ctx context.Context, dd *dbaasv1.DatabaseDeclaration, @@ -419,6 +431,7 @@ func (r *DatabaseDeclarationReconciler) handlePollError( // 404 — trackingId expired or never existed; clear it so the next // reconcile re-submits the operation. log.InfoC(ctx, "trackingId not found, will re-submit on next reconcile trackingId=%v", trackingID) + r.clearAsyncStart(dd.Namespace + "/" + dd.Name) clearPendingOperation(dd) markTransientFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonAggregatorError, "operation trackingId not found — will re-submit on next reconcile") @@ -541,8 +554,8 @@ func (r *DatabaseDeclarationReconciler) enqueueForBinding(ctx context.Context, o // stampBindingTrigger records that the next reconcile for key was most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -554,8 +567,8 @@ func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { // consumeBindingTrigger classifies the next reconcile for key as most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *DatabaseDeclarationReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/dbpolicy_controller.go b/dbaas-operator/internal/controller/dbpolicy_controller.go index 1dcf015b9..6b8227ba3 100644 --- a/dbaas-operator/internal/controller/dbpolicy_controller.go +++ b/dbaas-operator/internal/controller/dbpolicy_controller.go @@ -193,8 +193,8 @@ func (r *DbPolicyReconciler) enqueueForBinding(ctx context.Context, obj client.O // stampBindingTrigger records that the next reconcile for key was most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *DbPolicyReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -206,8 +206,8 @@ func (r *DbPolicyReconciler) stampBindingTrigger(key string) { // consumeBindingTrigger classifies the next reconcile for key as most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *DbPolicyReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index e78d312b0..487e0c8d9 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -88,9 +88,10 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req // Determine whether this reconcile was triggered by a Secret change or a // spec change, and record the trigger counter. fromSecret := r.consumeSecretTrigger(edbKey) + observeSecretPropagation := false defer func() { secretStart, ok := r.consumeSecretPropagation(edbKey) - if ok && edb.Status.Phase == dbaasv1.PhaseSucceeded { + if ok && observeSecretPropagation { dbaasSecretRotationPropagationSeconds.Observe(time.Since(secretStart).Seconds()) } }() @@ -184,6 +185,7 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req log.InfoC(ctx, "external database registered successfully. type: %v, dbName: %v", edb.Spec.Type, edb.Spec.DbName) markSucceeded(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonDatabaseRegistered) + observeSecretPropagation = true r.Recorder.Eventf(edb, corev1.EventTypeNormal, EventReasonDatabaseRegistered, "registered with dbaas-aggregator (type=%s, dbName=%s)", edb.Spec.Type, edb.Spec.DbName) return ctrl.Result{}, nil @@ -210,11 +212,6 @@ func (r *ExternalDatabaseReconciler) buildRequest( }, nil } -// classifierToFlatMap converts the typed Classifier struct into the flat -// string-to-string map expected by dbaas-aggregator on the wire. -// The aggregator treats the classifier as SortedMap with -// top-level keys: microserviceName, scope, namespace, tenantId, plus any -// additional adapter-specific entries from customKeys merged into the top level. // classifierToFlatMap converts a typed Classifier into the map expected by // dbaas-aggregator's ExternalDatabaseRequestV3.classifier (declared on the // Java side as SortedMap). Scalar fields are emitted as @@ -411,8 +408,8 @@ func (r *ExternalDatabaseReconciler) enqueueForBinding(ctx context.Context, obj // stampBindingTrigger records that the next reconcile for key was most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -424,8 +421,8 @@ func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { // consumeBindingTrigger classifies the next reconcile for key as most likely // caused by a NamespaceBinding change. This is best-effort: overlapping triggers -// for the same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { r.bindingTriggerMu.Lock() defer r.bindingTriggerMu.Unlock() @@ -444,9 +441,9 @@ func (r *ExternalDatabaseReconciler) clearBindingTrigger(key string) { } // stampSecretTrigger records that the next reconcile for key was most likely -// caused by a Secret change. This is best-effort: overlapping triggers for the -// same key can swap labels between queued reconciles, so the metric is -// informational and should not be used as exact causal tracing. +// caused by a Secret change. This is best-effort: overlapping triggers or +// ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. func (r *ExternalDatabaseReconciler) stampSecretTrigger(key string, startedAt time.Time) { r.secretTriggerMu.Lock() defer r.secretTriggerMu.Unlock() diff --git a/dbaas-operator/internal/controller/metrics.go b/dbaas-operator/internal/controller/metrics.go index effb67f4a..9140b5aa1 100644 --- a/dbaas-operator/internal/controller/metrics.go +++ b/dbaas-operator/internal/controller/metrics.go @@ -78,7 +78,7 @@ var dbaasReconcileTriggerTotal = prometheus.NewCounterVec( var dbaasSecretResolutionErrorsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "dbaas_secret_resolution_errors_total", - Help: "Total failures reading credential Secrets referenced by ExternalDatabase.", + Help: "Failures reading credential Secrets referenced by ExternalDatabase, scoped to namespaces owned by this operator instance. Labelled by namespace and failure category (secret_not_found, key_missing, key_empty, forbidden, secret_read_failed).", }, []string{"namespace", "reason"}, ) diff --git a/dbaas-operator/internal/controller/metrics_test.go b/dbaas-operator/internal/controller/metrics_test.go index 68a81f43f..5795a2f76 100644 --- a/dbaas-operator/internal/controller/metrics_test.go +++ b/dbaas-operator/internal/controller/metrics_test.go @@ -20,6 +20,7 @@ func TestAggregatorResultClassifiesOwnershipBuckets(t *testing.T) { {name: "server", err: &aggregatorclient.AggregatorError{StatusCode: 500}, want: resultServerError}, {name: "wrapped server", err: fmt.Errorf("wrapped: %w", &aggregatorclient.AggregatorError{StatusCode: 503}), want: resultServerError}, {name: "network", err: errors.New("dial tcp: connection refused"), want: resultNetworkError}, + {name: "wrapped network", err: fmt.Errorf("connect: %w", errors.New("dial timeout")), want: resultNetworkError}, } for _, tt := range tests { diff --git a/dbaas-operator/internal/controller/trigger_stamps_test.go b/dbaas-operator/internal/controller/trigger_stamps_test.go index cd5602b94..faf17b238 100644 --- a/dbaas-operator/internal/controller/trigger_stamps_test.go +++ b/dbaas-operator/internal/controller/trigger_stamps_test.go @@ -59,6 +59,22 @@ func TestDatabaseDeclarationBindingTriggerLifecycle(t *testing.T) { assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) } +func TestDatabaseDeclarationClearAsyncStart(t *testing.T) { + key := "test-ns/test-dd" + r := &DatabaseDeclarationReconciler{ + asyncStartTimes: map[string]time.Time{ + key: time.Unix(100, 0), + }, + } + + r.clearAsyncStart(key) + r.clearAsyncStart(key) + + if _, ok := r.asyncStartTimes[key]; ok { + t.Fatalf("asyncStartTimes[%q] exists after clearAsyncStart, want deleted", key) + } +} + func TestDbPolicyBindingTriggerLifecycle(t *testing.T) { r := &DbPolicyReconciler{} assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger)