From f60f8241cc8793d398df3990fadbd80283286cfd Mon Sep 17 00:00:00 2001 From: Noah White Date: Wed, 24 Jun 2026 05:19:50 +0000 Subject: [PATCH] feat(OFF-178): add Grafana synthetic health check probes for tenant sites Establishes Grafana Synthetic Monitoring infrastructure to probe tenant health check endpoints. Creates an SM installation on the Grafana Cloud stack, configures HTTP checks that authenticate with X-Health-Check-Token from 3 US probe locations (Atlanta, New York, San Francisco), and routes alerts through a dedicated PagerDuty service. - grafana_synthetic_monitoring_installation enables SM on the stack - grafana_synthetic_monitoring_check probes https://${var.tenant_domain}/ every 2min - PagerDuty service + contact point + notification policy routing - CI workflows and infra-shell pass HEALTH_CHECK_TOKEN as TF_VAR - OpenTofu tests cover all new resources (14/14 passing) --- .github/workflows/deploy-dev.yml | 7 ++ .github/workflows/pr-tofu-plan-develop.yml | 5 ++ docker/scripts/infra-shell.sh | 14 ++++ opentofu/envs/dev/main.tofu | 8 +- .../envs/dev/tests/grafana-cloud.tofutest.hcl | 84 ++++++++++++++++++- opentofu/envs/dev/variables.tofu | 6 ++ opentofu/modules/grafana-cloud/main.tofu | 75 +++++++++++++++++ opentofu/modules/grafana-cloud/variables.tofu | 25 +++++- opentofu/modules/pagerduty/main.tofu | 62 ++++++++++++++ 9 files changed, 281 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 71e8069..179f146 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -136,6 +136,7 @@ jobs: BOOTSTRAP_R2_BUCKET: ${{ vars.BOOTSTRAP_R2_BUCKET }} ADMIN_IP: ${{ secrets.ADMIN_IP }} CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }} + HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }} run: | ./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env @@ -161,6 +162,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ 
env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} run: | docker run --rm \ @@ -180,6 +182,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ ghcr.io/noahwhite/ghost-stack-shell:latest \ bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init" @@ -201,6 +204,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -227,6 +231,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ @@ -290,6 +295,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -316,6 +322,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ diff --git a/.github/workflows/pr-tofu-plan-develop.yml b/.github/workflows/pr-tofu-plan-develop.yml index f28289f..d373af4 100644 --- 
a/.github/workflows/pr-tofu-plan-develop.yml +++ b/.github/workflows/pr-tofu-plan-develop.yml @@ -90,6 +90,7 @@ jobs: ADMIN_IP: ${{ secrets.ADMIN_IP }} # GitHub environment secret for Cloudflare Zone ID (from bootstrap outputs) CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }} + HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }} run: | ./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env @@ -114,6 +115,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} run: | docker run --rm \ @@ -133,6 +135,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ ghcr.io/noahwhite/ghost-stack-shell:latest \ bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init" @@ -178,6 +181,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -204,6 +208,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ diff --git a/docker/scripts/infra-shell.sh b/docker/scripts/infra-shell.sh index f78d0b4..1ea8fe8 100755 --- a/docker/scripts/infra-shell.sh +++ b/docker/scripts/infra-shell.sh @@ -234,6 +234,7 @@ prompt_if_empty "TF_VAR_pd_subdomain" "Enter your PagerDuty subdomain: " false 
prompt_if_empty "TF_VAR_pd_user_tok" "Enter your PagerDuty user API token: " true prompt_if_empty "TF_VAR_GC_ACCESS_TOK" "Enter your Grafana Cloud access token: " true prompt_if_empty "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "Enter your Grafana Cloud SOC DEV Terraform access token: " true +prompt_if_empty "TF_VAR_health_check_token" "Enter your health check token (X-Health-Check-Token): " true prompt_if_empty "TF_VAR_infisical_client_id" "Enter your Infisical management identity client ID: " false prompt_if_empty "TF_VAR_infisical_client_secret" "Enter your Infisical management identity client secret: " true prompt_if_empty "TF_VAR_infisical_org_id" "Enter your Infisical organization ID: " false @@ -267,6 +268,7 @@ export_var "TF_VAR_pd_subdomain" "${TF_VAR_pd_subdomain}" export_var "TF_VAR_pd_user_tok" "${TF_VAR_pd_user_tok}" export_var "TF_VAR_GC_ACCESS_TOK" "${TF_VAR_GC_ACCESS_TOK}" export_var "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK}" +export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}" export_var "TF_VAR_infisical_client_id" "${TF_VAR_infisical_client_id}" export_var "TF_VAR_infisical_client_secret" "${TF_VAR_infisical_client_secret}" export_var "TF_VAR_infisical_org_id" "${TF_VAR_infisical_org_id}" @@ -290,6 +292,16 @@ if [[ "$CI_MODE" == "true" ]]; then fi fi +# Set health check token in CI mode (from GitHub secrets, used by Grafana synthetic monitoring) +if [[ "$CI_MODE" == "true" ]]; then + if [[ -n "${HEALTH_CHECK_TOKEN:-}" ]]; then + TF_VAR_health_check_token="${HEALTH_CHECK_TOKEN}" + mask_value "$TF_VAR_health_check_token" + export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}" + echo "Using health check token from GitHub secret" + fi +fi + # Set admin subnets based on mode if [[ "$CI_MODE" == "true" ]]; then # CI mode: Use admin IP from GitHub secrets (required) @@ -353,6 +365,7 @@ echo "Using SSH public key: $PUBKEY_PATH" : "${TF_VAR_pd_user_tok:?Environment variable not set}" : 
"${TF_VAR_GC_ACCESS_TOK:?Environment variable not set}" : "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK:?Environment variable not set}" +: "${TF_VAR_health_check_token:?Environment variable not set}" : "${TF_VAR_infisical_client_id:?Environment variable not set}" : "${TF_VAR_infisical_client_secret:?Environment variable not set}" : "${TF_VAR_infisical_org_id:?Environment variable not set}" @@ -398,6 +411,7 @@ if [[ "$RUN_CONTAINER" == "true" ]]; then -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_VAR_infisical_client_id \ -e TF_VAR_infisical_client_secret \ -e TF_VAR_infisical_org_id \ diff --git a/opentofu/envs/dev/main.tofu b/opentofu/envs/dev/main.tofu index 7b77273..676d9f5 100644 --- a/opentofu/envs/dev/main.tofu +++ b/opentofu/envs/dev/main.tofu @@ -165,8 +165,12 @@ module "grafana-cloud" { providers = { grafana = grafana.cloud } - pagerduty_backup_integration_key = module.pagerduty.backup_integration_key - SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK + pagerduty_backup_integration_key = module.pagerduty.backup_integration_key + pagerduty_health_check_integration_key = module.pagerduty.health_check_integration_key + SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK + metrics_publisher_key = var.GC_ACCESS_TOK + health_check_token = var.health_check_token + tenant_domain = var.ghost_domain } module "pagerduty" { diff --git a/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl b/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl index a0c4c1d..1f249b9 100644 --- a/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl +++ b/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl @@ -57,6 +57,30 @@ mock_provider "grafana" { } } + mock_resource "grafana_synthetic_monitoring_installation" { + defaults = { + id = "test-sm-installation-id" + sm_access_token = "test-sm-access-token" + stack_sm_api_url = "https://synthetic-monitoring-api-test.grafana.net" + } + } + + mock_resource 
"grafana_synthetic_monitoring_check" { + defaults = { + id = "test-sm-check-id" + } + } + + mock_data "grafana_synthetic_monitoring_probes" { + defaults = { + probes = { + Atlanta = 1 + NewYork = 22 + SanFrancisco = 23 + } + } + } + mock_data "grafana_data_source" { defaults = { id = "test-datasource-id" @@ -82,8 +106,12 @@ run "grafana_cloud_module_tests" { } variables { - SOC_DEV_TERRAFORM_SA_TOK = "test-token" - pagerduty_backup_integration_key = "test-pd-integration-key" + SOC_DEV_TERRAFORM_SA_TOK = "test-token" + pagerduty_backup_integration_key = "test-pd-integration-key" + pagerduty_health_check_integration_key = "test-pd-health-check-key" + metrics_publisher_key = "test-metrics-publisher-key" + health_check_token = "test-health-check-token" + tenant_domain = "separationofconcerns.dev" } # Override all data sources to prevent real API calls @@ -212,6 +240,17 @@ run "grafana_cloud_module_tests" { } } + override_data { + target = data.grafana_synthetic_monitoring_probes.main + values = { + probes = { + Atlanta = 1 + NewYork = 22 + SanFrancisco = 23 + } + } + } + # Test cloud stack configuration assert { condition = grafana_cloud_stack.soc_dev.name == "separationofconcerns0dev.grafana.net" @@ -396,4 +435,45 @@ run "grafana_cloud_module_tests" { condition = jsondecode(grafana_dashboard.ghost_stack_backup.config_json).title == "Ghost Stack Backup" error_message = "Backup dashboard title should be 'Ghost Stack Backup'" } + + # Test synthetic monitoring resources (OFF-178) + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.job == "tenant-health-separationofconcerns.dev" + error_message = "SM check job should be 'tenant-health-separationofconcerns.dev'" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.target == "https://separationofconcerns.dev/" + error_message = "SM check target should be 'https://separationofconcerns.dev/'" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.enabled 
== true + error_message = "SM check should be enabled" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.frequency == 120000 + error_message = "SM check frequency should be 120000ms (2 minutes)" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.timeout == 10000 + error_message = "SM check timeout should be 10000ms (10 seconds)" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.alert_sensitivity == "medium" + error_message = "SM check alert sensitivity should be 'medium'" + } + + assert { + condition = length(grafana_synthetic_monitoring_check.tenant_health.probes) == 3 + error_message = "SM check should have 3 probes" + } + + assert { + condition = grafana_contact_point.pagerduty_health_check.name == "PagerDuty - Ghost Stack Health Check" + error_message = "Health check contact point name should be 'PagerDuty - Ghost Stack Health Check'" + } } diff --git a/opentofu/envs/dev/variables.tofu b/opentofu/envs/dev/variables.tofu index ac2f3bf..31c8c28 100644 --- a/opentofu/envs/dev/variables.tofu +++ b/opentofu/envs/dev/variables.tofu @@ -100,6 +100,12 @@ variable "SOC_DEV_TERRAFORM_SA_TOK" { sensitive = true } +variable "health_check_token" { + description = "Token for X-Health-Check-Token header used by Grafana synthetic health check probes" + type = string + sensitive = true +} + variable "cloudflare_zone_id" { description = "Cloudflare Zone ID (optional, reads from bootstrap state if not provided)" type = string diff --git a/opentofu/modules/grafana-cloud/main.tofu b/opentofu/modules/grafana-cloud/main.tofu index 4762494..640b675 100644 --- a/opentofu/modules/grafana-cloud/main.tofu +++ b/opentofu/modules/grafana-cloud/main.tofu @@ -10368,6 +10368,19 @@ resource "grafana_notification_policy" "root" { group_interval = "5m" repeat_interval = "4h" } + + policy { + matcher { + label = "service" + match = "=" + value = "health-check" + } + contact_point = 
grafana_contact_point.pagerduty_health_check.name + group_by = ["alertname", "instance"] + group_wait = "30s" + group_interval = "5m" + repeat_interval = "4h" + } } # Alert rules — 3 rules covering failure, missed window, compose recovery @@ -10810,3 +10823,65 @@ resource "grafana_dashboard" "ghost_stack_backup" { ] }) } + +# ============================================================================= +# Synthetic Health Check Probes (OFF-178) +# ============================================================================= + +resource "grafana_synthetic_monitoring_installation" "ghost_stack" { + stack_id = grafana_cloud_stack.soc_dev.id + metrics_publisher_key = var.metrics_publisher_key +} + +provider "grafana" { + alias = "sm" + sm_access_token = grafana_synthetic_monitoring_installation.ghost_stack.sm_access_token + sm_url = grafana_synthetic_monitoring_installation.ghost_stack.stack_sm_api_url +} + +data "grafana_synthetic_monitoring_probes" "main" { + provider = grafana.sm +} + +resource "grafana_synthetic_monitoring_check" "tenant_health" { + provider = grafana.sm + job = "tenant-health-${var.tenant_domain}" + target = "https://${var.tenant_domain}/" + enabled = true + probes = [ + data.grafana_synthetic_monitoring_probes.main.probes.Atlanta, + data.grafana_synthetic_monitoring_probes.main.probes.NewYork, + data.grafana_synthetic_monitoring_probes.main.probes.SanFrancisco, + ] + frequency = 120000 + timeout = 10000 + alert_sensitivity = "medium" + + labels = { + service = "health-check" + } + + settings { + http { + method = "GET" + headers = ["X-Health-Check-Token:${var.health_check_token}"] + fail_if_not_ssl = true + valid_status_codes = [200] + } + } +} + +# PagerDuty contact point for health check alerts +resource "grafana_contact_point" "pagerduty_health_check" { + provider = grafana.soc_dev + name = "PagerDuty - Ghost Stack Health Check" + + pagerduty { + integration_key = var.pagerduty_health_check_integration_key + severity = "critical" + 
disable_resolve_message = false + class = "health-check" + component = "caddy" + group = "ghost-stack-dev-01" + } +} diff --git a/opentofu/modules/grafana-cloud/variables.tofu b/opentofu/modules/grafana-cloud/variables.tofu index aaaaf35..b69b534 100644 --- a/opentofu/modules/grafana-cloud/variables.tofu +++ b/opentofu/modules/grafana-cloud/variables.tofu @@ -8,4 +8,27 @@ variable "pagerduty_backup_integration_key" { description = "PagerDuty Events API V2 integration key for the Ghost Stack backup service" type = string sensitive = true -} \ No newline at end of file +} + +variable "metrics_publisher_key" { + description = "Grafana Cloud access policy token for Synthetic Monitoring (requires stacks:read, metrics:write, logs:write, traces:write scopes)" + type = string + sensitive = true +} + +variable "pagerduty_health_check_integration_key" { + description = "PagerDuty Events API V2 integration key for the Ghost Stack health check service" + type = string + sensitive = true +} + +variable "health_check_token" { + description = "Token for X-Health-Check-Token header used by tenant health check probes" + type = string + sensitive = true +} + +variable "tenant_domain" { + description = "Tenant domain for health check probes (e.g., separationofconcerns.dev)" + type = string +} diff --git a/opentofu/modules/pagerduty/main.tofu b/opentofu/modules/pagerduty/main.tofu index b26d241..0feeb95 100644 --- a/opentofu/modules/pagerduty/main.tofu +++ b/opentofu/modules/pagerduty/main.tofu @@ -256,6 +256,68 @@ output "backup_integration_key" { sensitive = true } +# ============================================================================= +# Ghost Stack Health Check Alerting (OFF-178) +# ============================================================================= +resource "pagerduty_service" "ghost-stack-dev-01-health-check" { + name = "ghost-stack-dev-01-health-check" + description = "Synthetic health check probes for tenant sites on ghost-stack-dev-01" + 
auto_resolve_timeout = "null" + acknowledgement_timeout = "600" + escalation_policy = pagerduty_escalation_policy.ghost-stack-dev-01-tailscale-ep.id + alert_creation = "create_alerts_and_incidents" + + auto_pause_notifications_parameters { + enabled = true + timeout = 300 + } + + incident_urgency_rule { + type = "constant" + urgency = "high" + } +} + +resource "pagerduty_service_integration" "ghost-stack-dev-01-health-check_apiv2" { + name = "Events API V2" + type = "events_api_v2_inbound_integration" + service = pagerduty_service.ghost-stack-dev-01-health-check.id +} + +resource "pagerduty_service_dependency" "health-check-flatcar-sd" { + dependency { + dependent_service { + id = pagerduty_service.ghost-stack-dev-01-health-check.id + type = "service" + } + supporting_service { + id = pagerduty_service.flatcar_instance.id + type = "service" + } + } +} + +resource "pagerduty_service_dependency" "health-check-soc_blog-sd" { + dependency { + type = "service_dependency" + + dependent_service { + id = pagerduty_business_service.soc_blog.id + type = "business_service" + } + supporting_service { + id = pagerduty_service.ghost-stack-dev-01-health-check.id + type = "service" + } + } +} + +output "health_check_integration_key" { + description = "PagerDuty Events API V2 integration key for the health check service" + value = pagerduty_service_integration.ghost-stack-dev-01-health-check_apiv2.integration_key + sensitive = true +} + resource "pagerduty_schedule" "ghost-stack_dev" { name = "ghost-stack dev" time_zone = "America/New_York"