diff --git a/dbaas-operator/dev/operator-metrics.md b/dbaas-operator/dev/operator-metrics.md new file mode 100644 index 000000000..bb9322959 --- /dev/null +++ b/dbaas-operator/dev/operator-metrics.md @@ -0,0 +1,180 @@ +# Operator Metrics + +Custom Prometheus metrics exposed by the DBaaS Operator at `/metrics`. +Each metric is registered at startup and scraped by Prometheus. + +--- + +## Secret Watcher + +### `dbaas_reconcile_trigger_total` +**Type:** Counter + +**Labels:** `controller`, `trigger` + +`controller`: `externaldatabase` | `dbpolicy` | `databasedeclaration` + +`trigger`: `spec_change` | `secret_change` | `namespace_binding_change` | `polling` + +Counts every reconcile invocation, tagged by what caused it. A `secret_change` increment means the watcher detected a credential rotation and re-registered the database automatically without a CR spec change. + +Trigger classification is best-effort. Overlapping events for the same object can swap labels between queued reconciles. A stamped trigger can also be consumed by a reconcile that is later skipped on namespace ownership, for example while a matching `NamespaceBinding` has not yet propagated to the informer cache; the follow-up reconcile then falls back to `spec_change`. The metric is useful for dashboard-level distribution and feature proof, but it should not be used as exact causal tracing or as an alert source. + +**Dashboard:** Stacked time series by trigger. A spike in `secret_change` is direct proof the feature fired. + +--- + +### `dbaas_secret_rotation_propagation_seconds` +**Type:** Histogram + +Measures the end-to-end time from a Secret change being detected to the ExternalDatabase reaching `Succeeded`. This is the SLO metric for credential rotation: it answers "how quickly does a password change take effect?" + +The start timestamp is kept in memory by the active operator process. If the operator restarts after the Secret event and before the ExternalDatabase succeeds, that specific propagation sample is not recorded. + +**Dashboard:** P50 / P90 / P99 latency lines over time. + +--- + +### `dbaas_secret_resolution_errors_total` +**Type:** Counter + +**Labels:** `namespace`, `reason` + +Counts failures reading a credentials Secret during reconcile. Each increment means a credential rotation or CR create/update could not resolve valid credentials until the Secret is fixed. + +**Dashboard:** Error rate time series per namespace/reason. Any non-zero sustained rate is actionable. + +--- + +## Aggregator Interaction + +### `dbaas_aggregator_request_duration_seconds` +**Type:** Histogram + +**Labels:** `controller`, `operation` (`register_external_database` | `apply_config` | `poll_status`) + +Tracks HTTP call latency to dbaas-aggregator per operation type. Identifies whether the aggregator is the bottleneck in slow registrations or provisioning. + +**Dashboard:** P50 / P90 / P99 latency lines, split by controller and operation. + +--- + +### `dbaas_aggregator_requests_total` +**Type:** Counter + +**Labels:** `controller`, `operation`, `result` (`success` | `auth_error` | `spec_rejection` | `server_error` | `network_error`) + +Counts every aggregator call by outcome. The `result` label routes failures to the right owner: + +| Result | Meaning | Owner | +|---|---|---| +| `success` | Call succeeded | - | +| `spec_rejection` | Aggregator rejected the payload by content | Application team / CR owner | +| `auth_error` | Operator credentials invalid | Platform team | +| `server_error` | Aggregator returned 5xx | Aggregator team | +| `network_error` | No response was received | Network / platform team | + +**Dashboard:** Stacked rate by result. Failure buckets are split because they have different owners. + +--- + +## Dashboard-Only Resource Health Substitutes + +The operator does not currently export `dbaas_resource_phase`, `dbaas_resources_stuck`, or `dbaas_async_operations_in_flight`. + +Those gauges require exact current-state knowledge across DBaaS CRs. A periodic sweeper can compute them, but it must list every relevant CR in each owned namespace. That cost scales linearly with the number of CRs and becomes less attractive as DBaaS adoption grows. + +A custom in-memory tracker could avoid periodic listing, but it would need to handle startup sync, status-only updates, deletes, operator restarts, leader changes, and NamespaceBinding ownership changes. That complexity is not justified for the current dashboard signal. + +The dashboard can still project close substitutes from event/rate metrics. These are not exact current-state counts; they show recent activity that usually explains why resources would be unhealthy. + +### Substitute for `dbaas_resource_phase` + +`dbaas_resource_phase` would answer: "how many CRs are currently in each phase?" + +Without a direct state gauge, use failure distribution over the dashboard range as the closest operational view of resource health: + +```promql +sum by (controller, result) ( + increase(dbaas_aggregator_requests_total{result!="success"}[$__range]) +) +``` + +```promql +sum by (exported_namespace, reason) ( + increase(dbaas_secret_resolution_errors_total{namespace=""}[$__range]) +) +``` + +Dashboard title: `Recent Failed Operations by Owner` + +This shows failure pressure by owner bucket, but it does not count current CR phases. + +### Substitute for `dbaas_resources_stuck` + +`dbaas_resources_stuck` would answer: "has any CR stayed in a bad state long enough to need attention?" + +Without a direct phase gauge, use repeated failure activity as a stuck-like signal. This is intentionally different from the resource-health distribution above: it looks for failures that keep recurring in the recent window. + +```promql +sum by (controller, result) ( + rate(dbaas_aggregator_requests_total{result!="success"}[15m]) +) > 0 +``` + +```promql +sum by (exported_namespace, reason) ( + rate(dbaas_secret_resolution_errors_total{namespace=""}[15m]) +) > 0 +``` + +Dashboard title: `Sustained Failure Signals` + +This is a close alerting substitute for quiet stuck resources, but it is event-based. If a CR failed once and then no more reconciles happened, this query will eventually go quiet. + +If a reliable non-sweep phase metric is introduced later, the preferred stuck query would be: + +```promql +min_over_time(dbaas_resource_phase{phase=~"BackingOff|InvalidConfiguration"}[15m]) > 0 +``` + +### Substitute for `dbaas_async_operations_in_flight` + +`dbaas_async_operations_in_flight` would answer: "how many DatabaseDeclarations currently have an active tracking ID?" + +Without a direct state gauge, use a backlog proxy: polling activity without matching terminal completions. + +```promql +sum( + rate(dbaas_reconcile_trigger_total{controller="databasedeclaration",trigger="polling"}[15m]) +) +``` + +```promql +sum( + rate(dbaas_async_operation_duration_seconds_count[15m]) +) +``` + +Dashboard title: `Async Provisioning Activity` + +If the first query is non-zero while the second is near zero, DatabaseDeclarations are being polled but not reaching terminal states. It does not count currently in-flight operations. + +--- + +## Async Provisioning + +### `dbaas_async_operation_duration_seconds` +**Type:** Histogram + +**Labels:** `result` (`success` | `failed` | `terminated`) + +End-to-end provisioning time from async submission (HTTP 202) to a terminal state. This is the user-visible provisioning SLO: the time a user waits from submitting a DatabaseDeclaration to having a usable database. + +`terminated` means the aggregator cancelled the operation mid-flight; the operator resubmits automatically. + +The start timestamp is kept in memory after the operator submits an async operation. If the operator restarts before the terminal poll result, that operation is still reconciled correctly, but its duration sample is not recorded. + +Operational note: alerts on this histogram should rely on absolute latency percentiles, not on sample-rate dips. A drop in `rate(dbaas_async_operation_duration_seconds_count[...])` after an operator restart usually reflects lost samples, not a slowdown in actual provisioning. + +**Dashboard:** P50 / P90 / P99 per result label. Completion rate (operations/s) as a secondary panel. diff --git a/dbaas-operator/go.mod b/dbaas-operator/go.mod index 4e6e66456..9ce5685fb 100644 --- a/dbaas-operator/go.mod +++ b/dbaas-operator/go.mod @@ -6,11 +6,13 @@ require ( github.com/go-logr/logr v1.4.3 github.com/go-resty/resty/v2 v2.17.2 github.com/google/uuid v1.6.0 + github.com/prometheus/client_golang v1.23.2 github.com/netcracker/qubership-core-lib-go-error-handling/v3 v3.5.3 github.com/netcracker/qubership-core-lib-go/v3 v3.9.1 github.com/onsi/ginkgo/v2 v2.28.3 github.com/onsi/gomega v1.40.0 k8s.io/api v0.35.4 + k8s.io/apiextensions-apiserver v0.35.0 k8s.io/apimachinery v0.35.4 k8s.io/client-go v0.35.4 sigs.k8s.io/controller-runtime v0.23.3 @@ -43,6 +45,7 @@ require ( github.com/knadh/koanf/providers/env v1.1.0 // indirect github.com/knadh/koanf/providers/file v1.2.1 // indirect github.com/knadh/koanf/v2 v2.3.4 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect @@ -51,7 +54,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect @@ -76,7 +78,6 @@ require ( gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect diff --git a/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json new file mode 100644 index 000000000..2b4acf6c5 --- /dev/null +++ b/dbaas-operator/helm-templates/dbaas-operator/dashboards/dbaas-operator-dashboard.json @@ -0,0 +1,631 @@ +{ + "title": "DBaaS Operator", + "uid": "dbaas-operator", + "schemaVersion": 39, + "version": 3, + "refresh": "30s", + "time": { + "from": "now-15m", + "to": "now" + }, + "timezone": "browser", + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Platform Monitoring Prometheus", + "value": "Platform Monitoring Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Cloud", + "multi": false, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "panels": [ + { + "id": 1, + "type": "row", + "title": "Secret Watcher - Credential Rotation", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "type": "timeseries", + "title": "Reconcile Triggers by Source (EDB)", + "description": "Compares how many reconciles are driven by Secret rotations vs spec changes. A spike in secret_change confirms the watcher is firing.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "secret_change" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "spec_change" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (trigger) (rate(dbaas_reconcile_trigger_total{namespace=\"#namespace#\",controller=\"externaldatabase\"}[1m]))", + "legendFormat": "{{trigger}}" + } + ] + }, + { + "id": 3, + "type": "timeseries", + "title": "Secret Rotation Propagation Time (seconds)", + "description": "End-to-end time from a Secret change being detected to the ExternalDatabase reaching Succeeded. This is the SLO metric for credential rotation.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", + "legendFormat": "P50" + }, + { + "expr": "histogram_quantile(0.90, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", + "legendFormat": "P90" + }, + { + "expr": "histogram_quantile(0.99, rate(dbaas_secret_rotation_propagation_seconds_bucket{namespace=\"#namespace#\"}[5m]))", + "legendFormat": "P99" + } + ] + }, + { + "id": 4, + "type": "timeseries", + "title": "Secret Resolution Errors", + "description": "Failures reading a credentials Secret. Reasons are actionable: secret_not_found, key_missing, key_empty, forbidden, or secret_read_failed.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "fixedColor": "red", + "mode": "fixed" + }, + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (exported_namespace, reason) (rate(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[1m]))", + "legendFormat": "{{exported_namespace}} / {{reason}}" + } + ] + }, + { + "id": 5, + "type": "row", + "title": "Aggregator Interaction Health", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + } + }, + { + "id": 6, + "type": "timeseries", + "title": "Aggregator Call Latency (seconds)", + "description": "P50/P90/P99 latency per controller and operation. Feeds into SLO alerting - if P99 rises, the aggregator is slow.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P50 {{controller}}/{{operation}}" + }, + { + "expr": "histogram_quantile(0.90, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P90 {{controller}}/{{operation}}" + }, + { + "expr": "histogram_quantile(0.99, sum by (controller, operation, le) (rate(dbaas_aggregator_request_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P99 {{controller}}/{{operation}}" + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Aggregator Call Outcomes (calls/s)", + "description": "Result breakdown: spec_rejection = CR/spec owner, auth_error = operator credentials/RBAC, server_error = aggregator, network_error = connectivity/platform.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "spec_rejection" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "auth_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "server_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "network_error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (result) (rate(dbaas_aggregator_requests_total{namespace=\"#namespace#\"}[1m]))", + "legendFormat": "{{result}}" + } + ] + }, + { + "id": 8, + "type": "row", + "title": "Dashboard-Only Resource Health Substitutes", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + } + }, + { + "id": 9, + "type": "timeseries", + "title": "Recent Failed Operations by Owner", + "description": "Dashboard-only substitute for resource phase health. Shows recent failed operations by owner bucket instead of exact current CR phase counts.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by (controller, result) (increase(dbaas_aggregator_requests_total{namespace=\"#namespace#\",result!=\"success\"}[$__range]))", + "legendFormat": "aggregator {{controller}} / {{result}}", + "refId": "A" + }, + { + "expr": "sum by (exported_namespace, reason) (increase(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[$__range]))", + "legendFormat": "secret {{exported_namespace}} / {{reason}}", + "refId": "B" + } + ] + }, + { + "id": 10, + "type": "timeseries", + "title": "Sustained Failure Signals", + "description": "Dashboard-only substitute for stuck resources. Shows failure signals that continue over the recent window; it is event-based, not an exact stuck CR count.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by (controller, result) (rate(dbaas_aggregator_requests_total{namespace=\"#namespace#\",result!=\"success\"}[15m])) \u003e 0", + "legendFormat": "aggregator {{controller}} / {{result}}", + "refId": "A" + }, + { + "expr": "sum by (exported_namespace, reason) (rate(dbaas_secret_resolution_errors_total{namespace=\"#namespace#\"}[15m])) \u003e 0", + "legendFormat": "secret {{exported_namespace}} / {{reason}}", + "refId": "B" + } + ] + }, + { + "id": 11, + "type": "timeseries", + "title": "Async Provisioning Activity", + "description": "Dashboard-only substitute for async operations in flight. Compares DatabaseDeclaration polling activity with terminal completions.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 35 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(dbaas_reconcile_trigger_total{namespace=\"#namespace#\",controller=\"databasedeclaration\",trigger=\"polling\"}[15m]))", + "legendFormat": "polling rate", + "refId": "A" + }, + { + "expr": "sum(rate(dbaas_async_operation_duration_seconds_count{namespace=\"#namespace#\"}[15m]))", + "legendFormat": "completion rate", + "refId": "B" + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Async Provisioning End-to-End Duration (seconds)", + "description": "How long it takes from HTTP 202 (async accepted) to a terminal state (completed / failed / terminated). This is the user-visible provisioning SLO.", + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 44 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P50 {{result}}" + }, + { + "expr": "histogram_quantile(0.90, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P90 {{result}}" + }, + { + "expr": "histogram_quantile(0.99, sum by (result, le) (rate(dbaas_async_operation_duration_seconds_bucket{namespace=\"#namespace#\"}[5m])))", + "legendFormat": "P99 {{result}}" + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Async Completion Rate by Outcome", + "description": "Rate of async operations completing per second, split by result (success / failed / terminated). Terminated means the aggregator cancelled and the operator will resubmit.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "terminated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum by (result) (rate(dbaas_async_operation_duration_seconds_count{namespace=\"#namespace#\"}[1m]))", + "legendFormat": "{{result}}" + } + ] + } + ] +} + diff --git a/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml b/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml new file mode 100644 index 000000000..197a346ee --- /dev/null +++ b/dbaas-operator/helm-templates/dbaas-operator/templates/Dashboard.yaml @@ -0,0 +1,17 @@ +{{- if and .Values.DBAAS_OPERATOR_ENABLED .Values.MONITORING_ENABLED }} +--- +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: dbaas-operator-dashboard + labels: + app: grafana + app.kubernetes.io/name: "{{ .Values.SERVICE_NAME }}" + app.kubernetes.io/component: monitoring + app.kubernetes.io/part-of: '{{ .Values.APPLICATION_NAME | default "dbaas" }}' + app.kubernetes.io/managed-by: "{{ .Values.MANAGED_BY }}" + deployment.netcracker.com/sessionId: '{{ .Values.DEPLOYMENT_SESSION_ID | default "unimplemented" }}' +spec: + name: dbaas-operator-dashboard.json + json: {{ .Files.Get "dashboards/dbaas-operator-dashboard.json" | toJson | replace "#namespace#" .Values.NAMESPACE }} +{{- end }} diff --git a/dbaas-operator/internal/controller/databasedeclaration_controller.go b/dbaas-operator/internal/controller/databasedeclaration_controller.go index 83fe36b0c..fc5525c92 100644 --- a/dbaas-operator/internal/controller/databasedeclaration_controller.go +++ b/dbaas-operator/internal/controller/databasedeclaration_controller.go @@ -25,9 +25,11 @@ import ( "errors" "fmt" "net/http" + "sync" "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -60,6 +62,16 @@ type DatabaseDeclarationReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + // asyncStartMu guards asyncStartTimes. + asyncStartMu sync.Mutex + // asyncStartTimes records the wall-clock time when an async operation was + // submitted (HTTP 202), keyed by "namespace/name". Consumed when the + // operation reaches a terminal state (COMPLETED, FAILED, TERMINATED). + asyncStartTimes map[string]time.Time + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { @@ -68,14 +80,24 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd := &dbaasv1.DatabaseDeclaration{} if err := r.Get(ctx, req.NamespacedName, dd); err != nil { + if apierrors.IsNotFound(err) { + key := req.Namespace + "/" + req.Name + r.clearBindingTrigger(key) + r.clearAsyncStart(key) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + key := req.Namespace + "/" + req.Name + bindingTriggered := r.consumeBindingTrigger(key) + owned, result, err := checkOwnership(ctx, r.Ownership, dd.Namespace, dd.Name, "DatabaseDeclaration") if err != nil { return ctrl.Result{}, err } if !owned { + r.clearBindingTrigger(key) + r.clearAsyncStart(key) return result, nil } @@ -100,7 +122,16 @@ func (r *DatabaseDeclarationReconciler) Reconcile(ctx context.Context, req ctrl. dd.Status.PendingOperationGeneration, dd.Generation, dd.Status.TrackingID) dd.Status.TrackingID = "" dd.Status.PendingOperationGeneration = 0 + r.clearAsyncStart(key) + } + + trigger := triggerSpecChange + if bindingTriggered { + trigger = triggerNamespaceBindingChange + } else if dd.Status.TrackingID != "" { + trigger = triggerPolling } + recordReconcileTrigger(controllerDD, trigger) if dd.Status.TrackingID != "" { return r.reconcilePoll(ctx, dd) @@ -120,7 +151,9 @@ func (r *DatabaseDeclarationReconciler) reconcileSubmit(ctx context.Context, dd // ── Call aggregator ─────────────────────────────────────────────────────── payload := r.buildPayload(dd) dd.Status.LastRequestID = requestID + aggStart := time.Now() resp, err := r.Aggregator.ApplyConfig(ctx, payload) + recordAggregatorCall(controllerDD, operationApplyConfig, aggStart, err) if err != nil { log.ErrorC(ctx, "failed to apply DatabaseDeclaration to dbaas-aggregator: %v", err) return handleAggregatorError(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, r.Recorder, dd, err, requestID) @@ -129,6 +162,13 @@ func (r *DatabaseDeclarationReconciler) reconcileSubmit(ctx context.Context, dd if resp.TrackingID != "" { // HTTP 202 Accepted — async operation started. log.InfoC(ctx, "database provisioning started asynchronously. trackingId = %v, microserviceName = %v", resp.TrackingID, dd.Spec.Classifier.MicroserviceName) + ddKey := dd.Namespace + "/" + dd.Name + r.asyncStartMu.Lock() + if r.asyncStartTimes == nil { + r.asyncStartTimes = make(map[string]time.Time) + } + r.asyncStartTimes[ddKey] = time.Now() + r.asyncStartMu.Unlock() markProvisioningStarted(dd, resp.TrackingID) r.Recorder.Eventf(dd, corev1.EventTypeNormal, EventReasonProvisioningStarted, "database provisioning started asynchronously (trackingId=%s, requestId=%s)", @@ -155,7 +195,9 @@ func (r *DatabaseDeclarationReconciler) reconcilePoll(ctx context.Context, dd *d dd.Status.Phase = dbaasv1.PhaseWaitingForDependency dd.Status.LastRequestID = requestID + aggStart := time.Now() resp, err := r.Aggregator.GetOperationStatus(ctx, trackingID) + recordAggregatorCall(controllerDD, operationPollStatus, aggStart, err) if err != nil { return r.handlePollError(ctx, dd, trackingID, requestID, err) } @@ -343,6 +385,29 @@ func clearPendingOperation(dd *dbaasv1.DatabaseDeclaration) { dd.Status.PendingOperationGeneration = 0 } +// observeAsyncCompletion records the end-to-end async operation duration and +// consumes the in-memory start stamp. Safe to call even when no start stamp exists +// (e.g. operator restarted mid-operation — in that case duration is not recorded). +func (r *DatabaseDeclarationReconciler) observeAsyncCompletion(dd *dbaasv1.DatabaseDeclaration, result string) { + ddKey := dd.Namespace + "/" + dd.Name + r.asyncStartMu.Lock() + start, ok := r.asyncStartTimes[ddKey] + if ok { + delete(r.asyncStartTimes, ddKey) + } + r.asyncStartMu.Unlock() + if ok { + dbaasAsyncOperationDurationSeconds.WithLabelValues(result).Observe(time.Since(start).Seconds()) + } +} + +// clearAsyncStart drops any pending async-operation start stamp for key. +func (r *DatabaseDeclarationReconciler) clearAsyncStart(key string) { + r.asyncStartMu.Lock() + defer r.asyncStartMu.Unlock() + delete(r.asyncStartTimes, key) +} + func (r *DatabaseDeclarationReconciler) handlePollError( ctx context.Context, dd *dbaasv1.DatabaseDeclaration, @@ -366,6 +431,7 @@ func (r *DatabaseDeclarationReconciler) handlePollError( // 404 — trackingId expired or never existed; clear it so the next // reconcile re-submits the operation. log.InfoC(ctx, "trackingId not found, will re-submit on next reconcile trackingId=%v", trackingID) + r.clearAsyncStart(dd.Namespace + "/" + dd.Name) clearPendingOperation(dd) markTransientFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonAggregatorError, "operation trackingId not found — will re-submit on next reconcile") @@ -399,6 +465,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( log.InfoC(ctx, "database provisioned. trackingId = %v, microserviceName = %v", trackingID, dd.Spec.Classifier.MicroserviceName) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, resultSuccess) markSucceeded(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonDatabaseProvisioned) r.Recorder.Eventf(dd, corev1.EventTypeNormal, EventReasonDatabaseProvisioned, "database provisioned (microserviceName=%s, trackingId=%s)", @@ -410,6 +477,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( log.InfoC(ctx, "database provisioning failed trackingId=%v status=%v reason=%v", trackingID, resp.Status, reason) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, asyncResultFailed) markPermanentFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonAggregatorRejected, reason) r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonAggregatorRejected, @@ -423,6 +491,7 @@ func (r *DatabaseDeclarationReconciler) handlePollResponse( // Clear the stale trackingID so the next reconcile enters the SUBMIT branch. log.InfoC(ctx, "provisioning was terminated, clearing trackingId for resubmit trackingId=%v", trackingID) clearPendingOperation(dd) + r.observeAsyncCompletion(dd, asyncResultTerminated) markTransientFailure(&dd.Status.Phase, &dd.Status.Conditions, dd.Generation, EventReasonOperationTerminated, "provisioning was terminated by the aggregator, resubmitting") r.Recorder.Eventf(dd, corev1.EventTypeWarning, EventReasonOperationTerminated, @@ -477,7 +546,42 @@ func (r *DatabaseDeclarationReconciler) enqueueForBinding(ctx context.Context, o } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } + +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *DatabaseDeclarationReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *DatabaseDeclarationReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. +func (r *DatabaseDeclarationReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} diff --git a/dbaas-operator/internal/controller/dbpolicy_controller.go b/dbaas-operator/internal/controller/dbpolicy_controller.go index e079ad1a2..6b8227ba3 100644 --- a/dbaas-operator/internal/controller/dbpolicy_controller.go +++ b/dbaas-operator/internal/controller/dbpolicy_controller.go @@ -21,8 +21,11 @@ package controller import ( "context" + "sync" + "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" @@ -49,6 +52,9 @@ type DbPolicyReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { @@ -56,9 +62,15 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r dp := &dbaasv1.DbPolicy{} if err := r.Get(ctx, req.NamespacedName, dp); err != nil { + if apierrors.IsNotFound(err) { + r.clearBindingTrigger(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + key := req.Namespace + "/" + req.Name + bindingTriggered := r.consumeBindingTrigger(key) + owned, result, err := checkOwnership(ctx, r.Ownership, dp.Namespace, dp.Name, "DbPolicy") if err != nil { return ctrl.Result{}, err @@ -67,6 +79,12 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r return result, nil } + trigger := triggerSpecChange + if bindingTriggered { + trigger = triggerNamespaceBindingChange + } + recordReconcileTrigger(controllerDP, trigger) + // Snapshot for the status patch at the end of reconcile. original := dp.DeepCopy() @@ -92,9 +110,12 @@ func (r *DbPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r payload := r.buildPayload(dp) dp.Status.LastRequestID = requestID - if _, err := r.Aggregator.ApplyConfig(ctx, payload); err != nil { - log.ErrorC(ctx, "failed to apply DbPolicy to dbaas-aggregator: %v", err) - return handleAggregatorError(&dp.Status.Phase, &dp.Status.Conditions, dp.Generation, r.Recorder, dp, err, requestID) + aggStart := time.Now() + _, aggErr := r.Aggregator.ApplyConfig(ctx, payload) + recordAggregatorCall(controllerDP, operationApplyConfig, aggStart, aggErr) + if aggErr != nil { + log.ErrorC(ctx, "failed to apply DbPolicy to dbaas-aggregator: %v", aggErr) + return handleAggregatorError(&dp.Status.Phase, &dp.Status.Conditions, dp.Generation, r.Recorder, dp, aggErr, requestID) } log.InfoC(ctx, "DbPolicy applied successfully microserviceName=%v", dp.Spec.MicroserviceName) @@ -164,7 +185,42 @@ func (r *DbPolicyReconciler) enqueueForBinding(ctx context.Context, obj client.O } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } + +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *DbPolicyReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *DbPolicyReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. +func (r *DbPolicyReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} diff --git a/dbaas-operator/internal/controller/externaldatabase_controller.go b/dbaas-operator/internal/controller/externaldatabase_controller.go index 618f642cc..487e0c8d9 100644 --- a/dbaas-operator/internal/controller/externaldatabase_controller.go +++ b/dbaas-operator/internal/controller/externaldatabase_controller.go @@ -25,9 +25,12 @@ import ( "encoding/json" "fmt" "maps" + "sync" + "time" corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" @@ -56,24 +59,61 @@ type ExternalDatabaseReconciler struct { Aggregator *aggregatorclient.AggregatorClient Recorder record.EventRecorder Ownership *ownership.OwnershipResolver + + // secretTriggerMu guards the Secret-trigger maps below. + secretTriggerMu sync.Mutex + // secretTriggerStamps is consumed when classifying the next reconcile. + // secretPropagationStamps records the first Secret-change time for the next + // reconcile. It is consumed on reconcile exit and observed only on Succeeded. + secretTriggerStamps map[string]struct{} + secretPropagationStamps map[string]time.Time + + bindingTriggerMu sync.Mutex + bindingTriggerStamps map[string]struct{} } func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { ctx, requestID := initReconcileContext(ctx) + edbKey := req.Namespace + "/" + req.Name edb := &dbaasv1.ExternalDatabase{} if err := r.Get(ctx, req.NamespacedName, edb); err != nil { + if apierrors.IsNotFound(err) { + r.clearSecretTrigger(edbKey) + r.clearBindingTrigger(edbKey) + } return ctrl.Result{}, client.IgnoreNotFound(err) } + // Determine whether this reconcile was triggered by a Secret change or a + // spec change, and record the trigger counter. + fromSecret := r.consumeSecretTrigger(edbKey) + observeSecretPropagation := false + defer func() { + secretStart, ok := r.consumeSecretPropagation(edbKey) + if ok && observeSecretPropagation { + dbaasSecretRotationPropagationSeconds.Observe(time.Since(secretStart).Seconds()) + } + }() + + trigger := triggerSpecChange + if fromSecret { + trigger = triggerSecretChange + } else if r.consumeBindingTrigger(edbKey) { + trigger = triggerNamespaceBindingChange + } + // Skip namespaces not owned by this operator instance. owned, result, err := checkOwnership(ctx, r.Ownership, edb.Namespace, edb.Name, "ExternalDatabase") if err != nil { return ctrl.Result{}, err } if !owned { + r.clearSecretTrigger(edbKey) + r.clearBindingTrigger(edbKey) return result, nil } + recordReconcileTrigger(controllerEDB, trigger) // Snapshot for the status patch at the end of reconcile. original := edb.DeepCopy() @@ -121,6 +161,7 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req aggReq, err := r.buildRequest(ctx, edb) if err != nil { log.ErrorC(ctx, "failed to build registration request: %v", err) + dbaasSecretResolutionErrorsTotal.WithLabelValues(edb.Namespace, secretResolutionReason(err)).Inc() markTransientFailure(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonSecretError, err.Error()) r.Recorder.Eventf(edb, corev1.EventTypeWarning, EventReasonSecretError, @@ -134,13 +175,17 @@ func (r *ExternalDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req // Call the aggregator. edb.Status.LastRequestID = requestID - if err := r.Aggregator.RegisterExternalDatabase(ctx, namespace, aggReq); err != nil { - log.ErrorC(ctx, "failed to register external database in dbaas-aggregator: %v", err) - return handleAggregatorError(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, err, requestID) + aggStart := time.Now() + aggErr := r.Aggregator.RegisterExternalDatabase(ctx, namespace, aggReq) + recordAggregatorCall(controllerEDB, operationRegisterEDB, aggStart, aggErr) + if aggErr != nil { + log.ErrorC(ctx, "failed to register external database in dbaas-aggregator: %v", aggErr) + return handleAggregatorError(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, r.Recorder, edb, aggErr, requestID) } log.InfoC(ctx, "external database registered successfully. type: %v, dbName: %v", edb.Spec.Type, edb.Spec.DbName) markSucceeded(&edb.Status.Phase, &edb.Status.Conditions, edb.Generation, EventReasonDatabaseRegistered) + observeSecretPropagation = true r.Recorder.Eventf(edb, corev1.EventTypeNormal, EventReasonDatabaseRegistered, "registered with dbaas-aggregator (type=%s, dbName=%s)", edb.Spec.Type, edb.Spec.DbName) return ctrl.Result{}, nil @@ -167,11 +212,6 @@ func (r *ExternalDatabaseReconciler) buildRequest( }, nil } -// classifierToFlatMap converts the typed Classifier struct into the flat -// string-to-string map expected by dbaas-aggregator on the wire. -// The aggregator treats the classifier as SortedMap with -// top-level keys: microserviceName, scope, namespace, tenantId, plus any -// additional adapter-specific entries from customKeys merged into the top level. // classifierToFlatMap converts a typed Classifier into the map expected by // dbaas-aggregator's ExternalDatabaseRequestV3.classifier (declared on the // Java side as SortedMap). Scalar fields are emitted as @@ -260,9 +300,19 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( ref := cp.CredentialsSecretRef secret := &corev1.Secret{} if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: ref.Name}, secret); err != nil { - return fmt.Errorf( - "connectionProperties[%d]: get Secret %q: %w", - index, ref.Name, err) + reason := secretReasonReadFailed + switch { + case apierrors.IsNotFound(err): + reason = secretReasonNotFound + case apierrors.IsForbidden(err): + reason = secretReasonForbidden + } + return &secretResolutionError{ + reason: reason, + err: fmt.Errorf( + "connectionProperties[%d]: get Secret %q: %w", + index, ref.Name, err), + } } // Defence-in-depth duplicate name check — CRD CEL validation should catch this @@ -278,14 +328,20 @@ func (r *ExternalDatabaseReconciler) applySecretCredentials( val, ok := secret.Data[km.Key] if !ok { - return fmt.Errorf( - "connectionProperties[%d]: Secret %q missing key %q", - index, ref.Name, km.Key) + return &secretResolutionError{ + reason: secretReasonKeyMissing, + err: fmt.Errorf( + "connectionProperties[%d]: Secret %q missing key %q", + index, ref.Name, km.Key), + } } if len(val) == 0 { - return fmt.Errorf( - "connectionProperties[%d]: Secret %q key %q is empty", - index, ref.Name, km.Key) + return &secretResolutionError{ + reason: secretReasonKeyEmpty, + err: fmt.Errorf( + "connectionProperties[%d]: Secret %q key %q is empty", + index, ref.Name, km.Key), + } } flat[km.Name] = string(val) } @@ -344,11 +400,92 @@ func (r *ExternalDatabaseReconciler) enqueueForBinding(ctx context.Context, obj } reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + r.stampBindingTrigger(list.Items[i].Namespace + "/" + list.Items[i].Name) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs } +// stampBindingTrigger records that the next reconcile for key was most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *ExternalDatabaseReconciler) stampBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if r.bindingTriggerStamps == nil { + r.bindingTriggerStamps = make(map[string]struct{}) + } + r.bindingTriggerStamps[key] = struct{}{} +} + +// consumeBindingTrigger classifies the next reconcile for key as most likely +// caused by a NamespaceBinding change. This is best-effort: overlapping triggers +// or ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *ExternalDatabaseReconciler) consumeBindingTrigger(key string) bool { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + if _, ok := r.bindingTriggerStamps[key]; !ok { + return false + } + delete(r.bindingTriggerStamps, key) + return true +} + +// clearBindingTrigger drops any pending NamespaceBinding trigger stamp for key. +func (r *ExternalDatabaseReconciler) clearBindingTrigger(key string) { + r.bindingTriggerMu.Lock() + defer r.bindingTriggerMu.Unlock() + delete(r.bindingTriggerStamps, key) +} + +// stampSecretTrigger records that the next reconcile for key was most likely +// caused by a Secret change. This is best-effort: overlapping triggers or +// ownership skips can swap or drop labels between queued reconciles, so the +// metric is informational and should not be used as exact causal tracing. +func (r *ExternalDatabaseReconciler) stampSecretTrigger(key string, startedAt time.Time) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + if r.secretTriggerStamps == nil { + r.secretTriggerStamps = make(map[string]struct{}) + } + r.secretTriggerStamps[key] = struct{}{} + if r.secretPropagationStamps == nil { + r.secretPropagationStamps = make(map[string]time.Time) + } + if _, exists := r.secretPropagationStamps[key]; !exists { + r.secretPropagationStamps[key] = startedAt + } +} + +func (r *ExternalDatabaseReconciler) consumeSecretTrigger(key string) bool { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + if _, ok := r.secretTriggerStamps[key]; !ok { + return false + } + delete(r.secretTriggerStamps, key) + return true +} + +func (r *ExternalDatabaseReconciler) consumeSecretPropagation(key string) (time.Time, bool) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + start, ok := r.secretPropagationStamps[key] + if ok { + delete(r.secretPropagationStamps, key) + } + return start, ok +} + +func (r *ExternalDatabaseReconciler) clearSecretTrigger(key string) { + r.secretTriggerMu.Lock() + defer r.secretTriggerMu.Unlock() + delete(r.secretTriggerStamps, key) + delete(r.secretPropagationStamps, key) +} + // indexSecretNames is the indexer function registered at startup. // controller-runtime calls it for every ExternalDatabase to populate and // maintain the reverse map: "namespace/secretName" → []ExternalDatabase. @@ -382,8 +519,12 @@ func (r *ExternalDatabaseReconciler) enqueueForSecret(ctx context.Context, obj c obj.GetNamespace(), obj.GetName(), err) return nil } + + now := time.Now() reqs := make([]reconcile.Request, 0, len(list.Items)) for i := range list.Items { + key := list.Items[i].Namespace + "/" + list.Items[i].Name + r.stampSecretTrigger(key, now) reqs = append(reqs, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&list.Items[i])}) } return reqs diff --git a/dbaas-operator/internal/controller/metrics.go b/dbaas-operator/internal/controller/metrics.go new file mode 100644 index 000000000..9140b5aa1 --- /dev/null +++ b/dbaas-operator/internal/controller/metrics.go @@ -0,0 +1,197 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "errors" + "time" + + "github.com/prometheus/client_golang/prometheus" + + aggregatorclient "github.com/netcracker/qubership-dbaas/dbaas-operator/internal/client" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// Label value constants. + +// Keeping the constants here because they are only relevant to Prometheus metrics. +const ( + controllerEDB = "externaldatabase" + controllerDD = "databasedeclaration" + controllerDP = "dbpolicy" + + triggerSpecChange = "spec_change" + triggerSecretChange = "secret_change" + triggerNamespaceBindingChange = "namespace_binding_change" + triggerPolling = "polling" + + resultSuccess = "success" + resultAuthError = "auth_error" + resultSpecRejection = "spec_rejection" + resultServerError = "server_error" + resultNetworkError = "network_error" + + asyncResultFailed = "failed" + asyncResultTerminated = "terminated" + + secretReasonNotFound = "secret_not_found" + secretReasonKeyMissing = "key_missing" + secretReasonKeyEmpty = "key_empty" + secretReasonForbidden = "forbidden" + secretReasonReadFailed = "secret_read_failed" + + operationRegisterEDB = "register_external_database" + operationApplyConfig = "apply_config" + operationPollStatus = "poll_status" +) + +// Metric declarations. + +// dbaasReconcileTriggerTotal counts reconcile invocations by meaningful source. +// Controller-runtime already exposes reconcile totals; this metric adds the +// trigger dimension that the framework cannot infer. +var dbaasReconcileTriggerTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_reconcile_trigger_total", + Help: "Total reconcile invocations by trigger source.", + }, + []string{"controller", "trigger"}, +) + +// dbaasSecretResolutionErrorsTotal counts failures reading credential Secrets, +// labelled by error category. A non-zero value means a credential rotation +// left a database without valid credentials - direct service impact. +var dbaasSecretResolutionErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_secret_resolution_errors_total", + Help: "Failures reading credential Secrets referenced by ExternalDatabase, scoped to namespaces owned by this operator instance. Labelled by namespace and failure category (secret_not_found, key_missing, key_empty, forbidden, secret_read_failed).", + }, + []string{"namespace", "reason"}, +) + +// dbaasSecretRotationPropagationSeconds measures end-to-end time from a +// Secret change triggering a reconcile to the ExternalDatabase reaching +// Succeeded. This is the SLO metric for the credential-rotation feature. +var dbaasSecretRotationPropagationSeconds = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "dbaas_secret_rotation_propagation_seconds", + Help: "Time from a Secret change trigger to ExternalDatabase reaching Succeeded.", + Buckets: []float64{0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600}, + }, +) + +// dbaasAggregatorRequestDurationSeconds tracks HTTP call latency to dbaas-aggregator. +// P50/P90/P99 per operation type feed latency SLO alerting. +var dbaasAggregatorRequestDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "dbaas_aggregator_request_duration_seconds", + Help: "Duration of HTTP calls to dbaas-aggregator by controller and operation.", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + []string{"controller", "operation"}, +) + +// dbaasAggregatorRequestsTotal counts aggregator calls by controller, operation, +// and result. The result label distinguishes user errors (spec_rejection) from +// platform errors (auth_error, server_error) so each can be routed to the +// correct owner. +var dbaasAggregatorRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "dbaas_aggregator_requests_total", + Help: "Total calls to dbaas-aggregator, labelled by controller, operation, and result.", + }, + []string{"controller", "operation", "result"}, +) + +// dbaasAsyncOperationDurationSeconds measures end-to-end provisioning time from +// async submit (HTTP 202) to final poll outcome. This is the user-visible DD SLO. +var dbaasAsyncOperationDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "dbaas_async_operation_duration_seconds", + Help: "End-to-end duration of async DatabaseDeclaration provisioning operations.", + Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600, 7200}, + }, + []string{"result"}, +) + +func init() { + metrics.Registry.MustRegister( + dbaasReconcileTriggerTotal, + dbaasSecretResolutionErrorsTotal, + dbaasSecretRotationPropagationSeconds, + dbaasAggregatorRequestDurationSeconds, + dbaasAggregatorRequestsTotal, + dbaasAsyncOperationDurationSeconds, + ) +} + +// Helper functions. + +// recordAggregatorCall records both the duration and the outcome counter for a +// single aggregator HTTP call. Call it immediately after every aggregator +// method returns, passing the start time and the error (nil = success). +func recordAggregatorCall(controller, operation string, start time.Time, err error) { + dbaasAggregatorRequestDurationSeconds. + WithLabelValues(controller, operation). + Observe(time.Since(start).Seconds()) + dbaasAggregatorRequestsTotal. + WithLabelValues(controller, operation, aggregatorResult(err)). + Inc() +} + +// aggregatorResult maps an error from AggregatorClient to a result label. +func aggregatorResult(err error) string { + if err == nil { + return resultSuccess + } + var aggErr *aggregatorclient.AggregatorError + if errors.As(err, &aggErr) { + if aggErr.IsAuthError() { + return resultAuthError + } + if aggErr.IsSpecRejection() { + return resultSpecRejection + } + return resultServerError + } + return resultNetworkError +} + +type secretResolutionError struct { + reason string + err error +} + +func (e *secretResolutionError) Error() string { + return e.err.Error() +} + +func (e *secretResolutionError) Unwrap() error { + return e.err +} + +func secretResolutionReason(err error) string { + var secretErr *secretResolutionError + if errors.As(err, &secretErr) { + return secretErr.reason + } + return secretReasonReadFailed +} + +func recordReconcileTrigger(controller, trigger string) { + dbaasReconcileTriggerTotal.WithLabelValues(controller, trigger).Inc() +} diff --git a/dbaas-operator/internal/controller/metrics_test.go b/dbaas-operator/internal/controller/metrics_test.go new file mode 100644 index 000000000..5795a2f76 --- /dev/null +++ b/dbaas-operator/internal/controller/metrics_test.go @@ -0,0 +1,53 @@ +package controller + +import ( + "errors" + "fmt" + "testing" + + aggregatorclient "github.com/netcracker/qubership-dbaas/dbaas-operator/internal/client" +) + +func TestAggregatorResultClassifiesOwnershipBuckets(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + {name: "success", err: nil, want: resultSuccess}, + {name: "auth", err: &aggregatorclient.AggregatorError{StatusCode: 401}, want: resultAuthError}, + {name: "spec rejection", err: &aggregatorclient.AggregatorError{StatusCode: 409}, want: resultSpecRejection}, + {name: "server", err: &aggregatorclient.AggregatorError{StatusCode: 500}, want: resultServerError}, + {name: "wrapped server", err: fmt.Errorf("wrapped: %w", &aggregatorclient.AggregatorError{StatusCode: 503}), want: resultServerError}, + {name: "network", err: errors.New("dial tcp: connection refused"), want: resultNetworkError}, + {name: "wrapped network", err: fmt.Errorf("connect: %w", errors.New("dial timeout")), want: resultNetworkError}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := aggregatorResult(tt.err); got != tt.want { + t.Fatalf("aggregatorResult() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestSecretResolutionReason(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + {name: "typed", err: &secretResolutionError{reason: secretReasonKeyMissing, err: errors.New("missing")}, want: secretReasonKeyMissing}, + {name: "wrapped typed", err: fmt.Errorf("wrapped: %w", &secretResolutionError{reason: secretReasonForbidden, err: errors.New("forbidden")}), want: secretReasonForbidden}, + {name: "generic", err: errors.New("other"), want: secretReasonReadFailed}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := secretResolutionReason(tt.err); got != tt.want { + t.Fatalf("secretResolutionReason() = %q, want %q", got, tt.want) + } + }) + } +} diff --git a/dbaas-operator/internal/controller/trigger_stamps_test.go b/dbaas-operator/internal/controller/trigger_stamps_test.go new file mode 100644 index 000000000..faf17b238 --- /dev/null +++ b/dbaas-operator/internal/controller/trigger_stamps_test.go @@ -0,0 +1,147 @@ +package controller + +import ( + "sync" + "testing" + "time" +) + +func TestExternalDatabaseSecretTriggerLifecycle(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + start := time.Unix(100, 0) + later := start.Add(time.Minute) + + r.stampSecretTrigger(key, start) + r.stampSecretTrigger(key, later) + + if !r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() = false, want true") + } + if r.consumeSecretTrigger(key) { + t.Fatalf("second consumeSecretTrigger() = true, want false") + } + + gotStart, ok := r.consumeSecretPropagation(key) + if !ok { + t.Fatalf("consumeSecretPropagation() ok = false, want true") + } + if !gotStart.Equal(start) { + t.Fatalf("consumeSecretPropagation() = %v, want earliest %v", gotStart, start) + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("second consumeSecretPropagation() ok = true, want false") + } +} + +func TestExternalDatabaseClearSecretTriggerClearsTriggerAndPropagation(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + + r.stampSecretTrigger(key, time.Unix(100, 0)) + r.clearSecretTrigger(key) + + if r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() after clear = true, want false") + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("consumeSecretPropagation() after clear ok = true, want false") + } +} + +func TestExternalDatabaseBindingTriggerLifecycle(t *testing.T) { + r := &ExternalDatabaseReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestDatabaseDeclarationBindingTriggerLifecycle(t *testing.T) { + r := &DatabaseDeclarationReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestDatabaseDeclarationClearAsyncStart(t *testing.T) { + key := "test-ns/test-dd" + r := &DatabaseDeclarationReconciler{ + asyncStartTimes: map[string]time.Time{ + key: time.Unix(100, 0), + }, + } + + r.clearAsyncStart(key) + r.clearAsyncStart(key) + + if _, ok := r.asyncStartTimes[key]; ok { + t.Fatalf("asyncStartTimes[%q] exists after clearAsyncStart, want deleted", key) + } +} + +func TestDbPolicyBindingTriggerLifecycle(t *testing.T) { + r := &DbPolicyReconciler{} + assertBindingTriggerLifecycle(t, r.stampBindingTrigger, r.consumeBindingTrigger, r.clearBindingTrigger) +} + +func TestExternalDatabaseTriggerStampsConcurrentAccess(t *testing.T) { + r := &ExternalDatabaseReconciler{} + key := "test-ns/test-edb" + + var wg sync.WaitGroup + for i := 0; i < 50; i++ { + wg.Add(5) + go func(i int) { + defer wg.Done() + r.stampSecretTrigger(key, time.Unix(int64(i), 0)) + }(i) + go func() { + defer wg.Done() + _ = r.consumeSecretTrigger(key) + }() + go func() { + defer wg.Done() + _, _ = r.consumeSecretPropagation(key) + }() + go func() { + defer wg.Done() + r.stampBindingTrigger(key) + }() + go func() { + defer wg.Done() + _ = r.consumeBindingTrigger(key) + }() + } + wg.Wait() + + r.clearSecretTrigger(key) + r.clearBindingTrigger(key) + + if r.consumeSecretTrigger(key) { + t.Fatalf("consumeSecretTrigger() after concurrent clear = true, want false") + } + if _, ok := r.consumeSecretPropagation(key); ok { + t.Fatalf("consumeSecretPropagation() after concurrent clear ok = true, want false") + } + if r.consumeBindingTrigger(key) { + t.Fatalf("consumeBindingTrigger() after concurrent clear = true, want false") + } +} + +func assertBindingTriggerLifecycle(t *testing.T, stamp func(string), consume func(string) bool, clear func(string)) { + t.Helper() + + key := "test-ns/test-resource" + + stamp(key) + stamp(key) + + if !consume(key) { + t.Fatalf("consume() = false, want true") + } + if consume(key) { + t.Fatalf("second consume() = true, want false") + } + + stamp(key) + clear(key) + if consume(key) { + t.Fatalf("consume() after clear = true, want false") + } +}