From d326f191797d625df9e6b89337708431a9218f9d Mon Sep 17 00:00:00 2001 From: Ben Blattberg Date: Mon, 2 Jun 2025 13:28:04 -0500 Subject: [PATCH] Align pgmonitor with Containers Containers alerts/dashboards have some changes; this PR aligns the two sources. --- .../471-containers-dashboard-updates.yml | 3 +++ grafana/containers/postgresql_details.json | 2 +- grafana/containers/postgresql_overview.json | 2 +- grafana/containers/prometheus_alerts.json | 6 ++--- ...nchy-alert-rules-pg.yml.containers.example | 26 ++++++++++++++++--- 5 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 changelogs/fragments/471-containers-dashboard-updates.yml diff --git a/changelogs/fragments/471-containers-dashboard-updates.yml b/changelogs/fragments/471-containers-dashboard-updates.yml new file mode 100644 index 0000000..4ae48a5 --- /dev/null +++ b/changelogs/fragments/471-containers-dashboard-updates.yml @@ -0,0 +1,3 @@ +minor_changes: + - Updated containers dashboards/alerts to allow OTel or postgres-exporter values + - Add requested PGNoPrimary and PGNoReplica alerts for containers diff --git a/grafana/containers/postgresql_details.json b/grafana/containers/postgresql_details.json index 2f9f6b3..b54f304 100644 --- a/grafana/containers/postgresql_details.json +++ b/grafana/containers/postgresql_details.json @@ -151,7 +151,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ", + "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}, ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}, ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"})", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/grafana/containers/postgresql_overview.json b/grafana/containers/postgresql_overview.json index b965463..f9bf2e9 100644 --- a/grafana/containers/postgresql_overview.json +++ b/grafana/containers/postgresql_overview.json @@ -163,7 +163,7 @@ "targets": [ { "$hashKey": "object:243", - "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", + "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/grafana/containers/prometheus_alerts.json b/grafana/containers/prometheus_alerts.json index f41aa48..e0090cf 100644 --- a/grafana/containers/prometheus_alerts.json +++ b/grafana/containers/prometheus_alerts.json @@ -136,7 +136,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))", + "expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -208,7 +208,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))", + "expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -280,7 +280,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(pg_up) or count(up)", + "expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", diff --git a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example index 0ccd9e3..32f5652 100644 --- a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example +++ b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example @@ -22,7 +22,7 @@ groups: ########## SYSTEM RULES ########## - alert: ExporterDown - expr: avg_over_time(up[5m]) < 0.5 + expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5 for: 10s labels: service: system @@ -35,15 +35,35 @@ groups: ########## POSTGRESQL RULES ########## - alert: PGIsUp - expr: pg_up < 1 + expr: pg_up < 1 or patroni_postgres_running < 1 for: 60s labels: service: postgresql severity: critical severity_num: 300 annotations: - summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database' + summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database' + - alert: PGNoPrimary + expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2 + for: 30s + labels: + service: postgresql + severity: critical + severity_num: 300 + annotations: + summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance' + +# Alert on missing or absent replicas +# - alert: PGNoReplica +# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1 +# for: 30s +# labels: +# service: postgresql +# severity: critical +# severity_num: 300 +# annotations: +# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance' # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num". #