Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .yamllint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ extends: default

ignore: |
catalog/druid/chart/templates/
# Persona dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
# Authored dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
# JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
dashboards/base/platform/agent-*.yaml
dashboards/base/platform/portal.yaml
# Alert rule groups carry long PromQL exprs in their query model.
dashboards/base/alerting/portal.yaml

rules:
line-length:
Expand Down
12 changes: 12 additions & 0 deletions dashboards/base/alerting/folder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Folder for the authored SLO / burn-rate alert rules. The grafana-operator
# creates it in the external Amazon Managed Grafana, and each
# GrafanaAlertRuleGroup CR points at it by resource name through folderRef.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaFolder
metadata:
  name: slo-alerts
spec:
  # Selects the Grafana instance(s) labelled dashboards=external.
  instanceSelector:
    matchLabels:
      dashboards: external
  title: 'SLO & burn-rate alerts'
204 changes: 204 additions & 0 deletions dashboards/base/alerting/portal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
# portal SLO alerting: Grafana-managed alert rules that the grafana-operator
# reconciles onto the external Amazon Managed Grafana. The two burn-rate rules
# mirror the windows defined by the observability-slo standard (nanohype#123).
# Each of them is a dual-window check — the long AND the short window must both
# exceed the burn factor — written as a product of PromQL `> bool` comparisons,
# so one instant query returns 1 (both windows over the factor) or 0.
#
# Every rule below has the same two-step shape: query A runs PromQL against the
# datasource with UID managed-prometheus, and expression B (named by the rule's
# `condition`) applies a threshold to A's value.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaAlertRuleGroup
metadata:
  name: portal-slo
spec:
  instanceSelector:
    matchLabels:
      dashboards: external
  # Name of the GrafanaFolder resource the rules are filed under.
  folderRef: slo-alerts
  # Evaluation interval for every rule in this group.
  interval: 1m
  rules:
    # Fast burn: 14.4x the budget rate over both the 1h and the 5m window.
    # The 0.001 divisor is the error budget of the 99.9% objective.
    # NOTE(review): clamp_min(..., 1) floors the request rate at 1 req/s, so
    # below that traffic level the quotient is the raw 5xx rate rather than an
    # error ratio and the rule is less sensitive than the description states —
    # confirm this low-traffic damping is intended.
    - uid: portal-fast-burn
      title: PortalErrorBudgetFastBurn
      condition: B
      for: 2m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: portal
      annotations:
        summary: portal is burning its error budget fast (2% in 1h)
        description: >
          API 5xx burn rate exceeds 14.4x the 99.9% availability objective over
          both the 1h and 5m windows. At this rate the 30-day error budget is
          exhausted within ~2 days. Check recent deploys and upstream health.
      data:
        # A: 1 when both windows exceed the burn factor, otherwise 0.
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange:
            from: 3600
            to: 0
          model:
            refId: A
            datasource:
              type: prometheus
              uid: managed-prometheus
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: '(sum(rate(portal_http_request_duration_seconds_count{status=~"5.."}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1) / 0.001 > bool 14.4) * (sum(rate(portal_http_request_duration_seconds_count{status=~"5.."}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1) / 0.001 > bool 14.4)'
        # B: threshold over A — true when A is greater than 0.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 3600
            to: 0
          model:
            refId: B
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [0]
                operator:
                  type: and
                query:
                  params: [A]
                reducer:
                  type: last
                  params: []
    # Slow burn: 6x the budget rate over both the 6h and the 30m window.
    # Same expression shape as the fast-burn rule, including the
    # clamp_min(..., 1) floor on the request rate (NOTE(review): see the
    # low-traffic caveat — confirm the floor is intended here as well).
    - uid: portal-slow-burn
      title: PortalErrorBudgetSlowBurn
      condition: B
      for: 15m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: portal
      annotations:
        summary: portal is burning its error budget (5% in 6h)
        description: >
          API 5xx burn rate exceeds 6x the 99.9% availability objective over both
          the 6h and 30m windows — a sustained, slower budget burn. Investigate
          before it escalates to a fast burn.
      data:
        # A: 1 when both windows exceed the burn factor, otherwise 0.
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange:
            from: 21600
            to: 0
          model:
            refId: A
            datasource:
              type: prometheus
              uid: managed-prometheus
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: '(sum(rate(portal_http_request_duration_seconds_count{status=~"5.."}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1) / 0.001 > bool 6) * (sum(rate(portal_http_request_duration_seconds_count{status=~"5.."}[30m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30m])), 1) / 0.001 > bool 6)'
        # B: threshold over A — true when A is greater than 0.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 21600
            to: 0
          model:
            refId: B
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [0]
                operator:
                  type: and
                query:
                  params: [A]
                reducer:
                  type: last
                  params: []
    # Watcher stall: A is the largest "now minus last tick" value across the
    # watcher series, in seconds; B is true once it exceeds 900 s (15 min).
    # NOTE(review): with noDataState OK the rule stays quiet if the metric
    # disappears entirely (e.g. nothing is exporting it) — confirm that case
    # is covered elsewhere.
    - uid: portal-watcher-stall
      title: PortalWatcherStalled
      condition: B
      for: 5m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: portal
      annotations:
        summary: a portal watcher loop has stalled
        description: >
          The most-stale watcher loop has not ticked in over 15 minutes — the
          tenant/cluster inventory it reconciles is going stale. Check the worker
          logs for a wedged or panicking loop.
      data:
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange:
            from: 1800
            to: 0
          model:
            refId: A
            datasource:
              type: prometheus
              uid: managed-prometheus
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: 'max(time() - portal_watcher_last_tick_timestamp_seconds)'
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 1800
            to: 0
          model:
            refId: B
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [900]
                operator:
                  type: and
                query:
                  params: [A]
                reducer:
                  type: last
                  params: []
    # Worker job errors: A is the summed per-second rate of
    # portal_worker_job_errors_total over 15m; B is true above 0.1/s.
    # Lower severity (ticket) than the three paging rules above.
    - uid: portal-worker-job-errors
      title: PortalWorkerJobErrorsHigh
      condition: B
      for: 15m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: ticket
        service: portal
      annotations:
        summary: portal worker jobs are erroring
        description: >
          River job errors/panics are sustained above 0.1/s over 15m — a job kind
          is failing or silently retrying. Check portal_worker_job_errors_total by
          kind and the worker logs.
      data:
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange:
            from: 900
            to: 0
          model:
            refId: A
            datasource:
              type: prometheus
              uid: managed-prometheus
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: 'sum(rate(portal_worker_job_errors_total[15m]))'
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 900
            to: 0
          model:
            refId: B
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [0.1]
                operator:
                  type: and
                query:
                  params: [A]
                reducer:
                  type: last
                  params: []
3 changes: 3 additions & 0 deletions dashboards/base/datasources/prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ spec:
dashboards: external
datasource:
name: ManagedPrometheus
# Pinned UID so Grafana-managed alert rules (GrafanaAlertRuleGroup) can
# reference this datasource deterministically instead of relying on isDefault.
uid: managed-prometheus
type: prometheus
access: proxy
url: https://aps-workspaces.us-west-2.amazonaws.com/workspaces/PLACEHOLDER # patched per-env
Expand Down
7 changes: 7 additions & 0 deletions dashboards/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ resources:
- datasources/prometheus.yaml
- datasources/loki.yaml
- datasources/tempo.yaml
# Grafana-managed SLO / burn-rate alert rules (folder + per-system rule groups),
# reconciled onto the external Amazon Managed Grafana.
- alerting/folder.yaml
- alerting/portal.yaml
- platform/kubernetes-cluster.yaml
- platform/kubernetes-views-pods.yaml
- platform/kubernetes-views-namespaces.yaml
Expand All @@ -33,6 +37,9 @@ resources:
- platform/agent-finance.yaml
- platform/agent-ops.yaml
- platform/agent-founder.yaml
# Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
# and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
- platform/portal.yaml
- addons/kyverno.yaml
- addons/trivy-operator.yaml
- addons/falco.yaml
Expand Down
Loading