diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 71e8069..179f146 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -136,6 +136,7 @@ jobs: BOOTSTRAP_R2_BUCKET: ${{ vars.BOOTSTRAP_R2_BUCKET }} ADMIN_IP: ${{ secrets.ADMIN_IP }} CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }} + HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }} run: | ./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env @@ -161,6 +162,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} run: | docker run --rm \ @@ -180,6 +182,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ ghcr.io/noahwhite/ghost-stack-shell:latest \ bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init" @@ -201,6 +204,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -227,6 +231,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ @@ -290,6 +295,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + 
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -316,6 +322,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ diff --git a/.github/workflows/pr-tofu-plan-develop.yml b/.github/workflows/pr-tofu-plan-develop.yml index f28289f..d373af4 100644 --- a/.github/workflows/pr-tofu-plan-develop.yml +++ b/.github/workflows/pr-tofu-plan-develop.yml @@ -90,6 +90,7 @@ jobs: ADMIN_IP: ${{ secrets.ADMIN_IP }} # GitHub environment secret for Cloudflare Zone ID (from bootstrap outputs) CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }} + HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }} run: | ./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env @@ -114,6 +115,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} run: | docker run --rm \ @@ -133,6 +135,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ ghcr.io/noahwhite/ghost-stack-shell:latest \ bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init" @@ -178,6 +181,7 @@ jobs: TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }} TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }} TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }} + TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }} TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }} TF_VAR_admin_subnets: ${{ 
env.TF_VAR_admin_subnets }} TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }} @@ -204,6 +208,7 @@ jobs: -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_BACKEND_BUCKET \ -e TF_VAR_admin_subnets \ -e TF_VAR_admin_ip \ diff --git a/docker/scripts/infra-shell.sh b/docker/scripts/infra-shell.sh index f78d0b4..1ea8fe8 100755 --- a/docker/scripts/infra-shell.sh +++ b/docker/scripts/infra-shell.sh @@ -234,6 +234,7 @@ prompt_if_empty "TF_VAR_pd_subdomain" "Enter your PagerDuty subdomain: " false prompt_if_empty "TF_VAR_pd_user_tok" "Enter your PagerDuty user API token: " true prompt_if_empty "TF_VAR_GC_ACCESS_TOK" "Enter your Grafana Cloud access token: " true prompt_if_empty "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "Enter your Grafana Cloud SOC DEV Terraform access token: " true +prompt_if_empty "TF_VAR_health_check_token" "Enter your health check token (X-Health-Check-Token): " true prompt_if_empty "TF_VAR_infisical_client_id" "Enter your Infisical management identity client ID: " false prompt_if_empty "TF_VAR_infisical_client_secret" "Enter your Infisical management identity client secret: " true prompt_if_empty "TF_VAR_infisical_org_id" "Enter your Infisical organization ID: " false @@ -267,6 +268,7 @@ export_var "TF_VAR_pd_subdomain" "${TF_VAR_pd_subdomain}" export_var "TF_VAR_pd_user_tok" "${TF_VAR_pd_user_tok}" export_var "TF_VAR_GC_ACCESS_TOK" "${TF_VAR_GC_ACCESS_TOK}" export_var "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK}" +export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}" export_var "TF_VAR_infisical_client_id" "${TF_VAR_infisical_client_id}" export_var "TF_VAR_infisical_client_secret" "${TF_VAR_infisical_client_secret}" export_var "TF_VAR_infisical_org_id" "${TF_VAR_infisical_org_id}" @@ -290,6 +292,16 @@ if [[ "$CI_MODE" == "true" ]]; then fi fi +# CI mode only: if the HEALTH_CHECK_TOKEN GitHub secret is non-empty, copy it into TF_VAR_health_check_token, hand it to 
mask_value, and re-export it; an empty or unset secret leaves the variable as it was (the token feeds the X-Health-Check-Token header of the Grafana synthetic monitoring check) +if [[ 
"$CI_MODE" == "true" ]]; then + if [[ -n "${HEALTH_CHECK_TOKEN:-}" ]]; then + TF_VAR_health_check_token="${HEALTH_CHECK_TOKEN}" + mask_value "$TF_VAR_health_check_token" + export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}" + echo "Using health check token from GitHub secret" + fi +fi + # Set admin subnets based on mode if [[ "$CI_MODE" == "true" ]]; then # CI mode: Use admin IP from GitHub secrets (required) @@ -353,6 +365,7 @@ echo "Using SSH public key: $PUBKEY_PATH" : "${TF_VAR_pd_user_tok:?Environment variable not set}" : "${TF_VAR_GC_ACCESS_TOK:?Environment variable not set}" : "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK:?Environment variable not set}" +: "${TF_VAR_health_check_token:?Environment variable not set}" : "${TF_VAR_infisical_client_id:?Environment variable not set}" : "${TF_VAR_infisical_client_secret:?Environment variable not set}" : "${TF_VAR_infisical_org_id:?Environment variable not set}" @@ -398,6 +411,7 @@ if [[ "$RUN_CONTAINER" == "true" ]]; then -e TF_VAR_pd_user_tok \ -e TF_VAR_GC_ACCESS_TOK \ -e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \ + -e TF_VAR_health_check_token \ -e TF_VAR_infisical_client_id \ -e TF_VAR_infisical_client_secret \ -e TF_VAR_infisical_org_id \ diff --git a/opentofu/envs/dev/main.tofu b/opentofu/envs/dev/main.tofu index 7b77273..676d9f5 100644 --- a/opentofu/envs/dev/main.tofu +++ b/opentofu/envs/dev/main.tofu @@ -165,8 +165,12 @@ module "grafana-cloud" { providers = { grafana = grafana.cloud } - pagerduty_backup_integration_key = module.pagerduty.backup_integration_key - SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK + pagerduty_backup_integration_key = module.pagerduty.backup_integration_key + pagerduty_health_check_integration_key = module.pagerduty.health_check_integration_key + SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK + metrics_publisher_key = var.GC_ACCESS_TOK + health_check_token = var.health_check_token + tenant_domain = var.ghost_domain } module "pagerduty" { diff --git 
a/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl b/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl index a0c4c1d..1f249b9 100644 --- a/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl +++ b/opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl @@ -57,6 +57,30 @@ mock_provider "grafana" { } } + mock_resource "grafana_synthetic_monitoring_installation" { + defaults = { + id = "test-sm-installation-id" + sm_access_token = "test-sm-access-token" + stack_sm_api_url = "https://synthetic-monitoring-api-test.grafana.net" + } + } + + mock_resource "grafana_synthetic_monitoring_check" { + defaults = { + id = "test-sm-check-id" + } + } + + mock_data "grafana_synthetic_monitoring_probes" { + defaults = { + probes = { + Atlanta = 1 + NewYork = 22 + SanFrancisco = 23 + } + } + } + mock_data "grafana_data_source" { defaults = { id = "test-datasource-id" @@ -82,8 +106,12 @@ run "grafana_cloud_module_tests" { } variables { - SOC_DEV_TERRAFORM_SA_TOK = "test-token" - pagerduty_backup_integration_key = "test-pd-integration-key" + SOC_DEV_TERRAFORM_SA_TOK = "test-token" + pagerduty_backup_integration_key = "test-pd-integration-key" + pagerduty_health_check_integration_key = "test-pd-health-check-key" + metrics_publisher_key = "test-metrics-publisher-key" + health_check_token = "test-health-check-token" + tenant_domain = "separationofconcerns.dev" } # Override all data sources to prevent real API calls @@ -212,6 +240,17 @@ run "grafana_cloud_module_tests" { } } + override_data { + target = data.grafana_synthetic_monitoring_probes.main + values = { + probes = { + Atlanta = 1 + NewYork = 22 + SanFrancisco = 23 + } + } + } + # Test cloud stack configuration assert { condition = grafana_cloud_stack.soc_dev.name == "separationofconcerns0dev.grafana.net" @@ -396,4 +435,45 @@ run "grafana_cloud_module_tests" { condition = jsondecode(grafana_dashboard.ghost_stack_backup.config_json).title == "Ghost Stack Backup" error_message = "Backup dashboard title should be 'Ghost Stack Backup'" } + 
+ # Synthetic monitoring (OFF-178): pin the tenant_health check's job, target, enabled flag, frequency, timeout, alert sensitivity and probe count, plus the name of the health check contact point + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.job == "tenant-health-separationofconcerns.dev" + error_message = "SM check job should be 'tenant-health-separationofconcerns.dev'" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.target == "https://separationofconcerns.dev/" + error_message = "SM check target should be 'https://separationofconcerns.dev/'" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.enabled == true + error_message = "SM check should be enabled" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.frequency == 120000 + error_message = "SM check frequency should be 120000ms (2 minutes)" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.timeout == 10000 + error_message = "SM check timeout should be 10000ms (10 seconds)" + } + + assert { + condition = grafana_synthetic_monitoring_check.tenant_health.alert_sensitivity == "medium" + error_message = "SM check alert sensitivity should be 'medium'" + } + + assert { + condition = length(grafana_synthetic_monitoring_check.tenant_health.probes) == 3 + error_message = "SM check should have 3 probes" + } + + assert { + condition = grafana_contact_point.pagerduty_health_check.name == "PagerDuty - Ghost Stack Health Check" + error_message = "Health check contact point name should be 'PagerDuty - Ghost Stack Health Check'" + } } diff --git a/opentofu/envs/dev/variables.tofu b/opentofu/envs/dev/variables.tofu index ac2f3bf..31c8c28 100644 --- a/opentofu/envs/dev/variables.tofu +++ b/opentofu/envs/dev/variables.tofu @@ -100,6 +100,12 @@ variable "SOC_DEV_TERRAFORM_SA_TOK" { sensitive = true } +variable "health_check_token" { + description = "Token for X-Health-Check-Token header used by 
Grafana synthetic health check probes" + type = string + sensitive = true +} + variable "cloudflare_zone_id" { description = "Cloudflare 
Zone ID (optional, reads from bootstrap state if not provided)" type = string diff --git a/opentofu/modules/grafana-cloud/main.tofu b/opentofu/modules/grafana-cloud/main.tofu index 4762494..640b675 100644 --- a/opentofu/modules/grafana-cloud/main.tofu +++ b/opentofu/modules/grafana-cloud/main.tofu @@ -10368,6 +10368,19 @@ resource "grafana_notification_policy" "root" { group_interval = "5m" repeat_interval = "4h" } + + policy { + matcher { + label = "service" + match = "=" + value = "health-check" + } + contact_point = grafana_contact_point.pagerduty_health_check.name + group_by = ["alertname", "instance"] + group_wait = "30s" + group_interval = "5m" + repeat_interval = "4h" + } } # Alert rules — 3 rules covering failure, missed window, compose recovery @@ -10810,3 +10823,65 @@ resource "grafana_dashboard" "ghost_stack_backup" { ] }) } + +# ============================================================================= +# Synthetic Health Check Probes (OFF-178) +# ============================================================================= + +resource "grafana_synthetic_monitoring_installation" "ghost_stack" { + stack_id = grafana_cloud_stack.soc_dev.id + metrics_publisher_key = var.metrics_publisher_key +} + +provider "grafana" { + alias = "sm" + sm_access_token = grafana_synthetic_monitoring_installation.ghost_stack.sm_access_token + sm_url = grafana_synthetic_monitoring_installation.ghost_stack.stack_sm_api_url +} + +data "grafana_synthetic_monitoring_probes" "main" { + provider = grafana.sm +} + +resource "grafana_synthetic_monitoring_check" "tenant_health" { + provider = grafana.sm + job = "tenant-health-${var.tenant_domain}" + target = "https://${var.tenant_domain}/" + enabled = true + probes = [ + data.grafana_synthetic_monitoring_probes.main.probes.Atlanta, + data.grafana_synthetic_monitoring_probes.main.probes.NewYork, + data.grafana_synthetic_monitoring_probes.main.probes.SanFrancisco, + ] + frequency = 120000 + timeout = 10000 + alert_sensitivity = 
"medium" + + labels = { + service = "health-check" + } + + settings { + http { + method = "GET" + headers = ["X-Health-Check-Token:${var.health_check_token}"] + fail_if_not_ssl = true + valid_status_codes = [200] + } + } +} + +# PagerDuty contact point for health check alerts; the notification policy route matching service = health-check references it by name. NOTE(review): confirm that alerts raised for the synthetic check actually carry service=health-check; no alert rule setting that label is visible in this change +resource "grafana_contact_point" "pagerduty_health_check" { + provider = grafana.soc_dev + name = "PagerDuty - Ghost Stack Health Check" + + pagerduty { + integration_key = var.pagerduty_health_check_integration_key + severity = "critical" + disable_resolve_message = false + class = "health-check" + component = "caddy" + group = "ghost-stack-dev-01" + } +} diff --git a/opentofu/modules/grafana-cloud/variables.tofu b/opentofu/modules/grafana-cloud/variables.tofu index aaaaf35..b69b534 100644 --- a/opentofu/modules/grafana-cloud/variables.tofu +++ b/opentofu/modules/grafana-cloud/variables.tofu @@ -8,4 +8,27 @@ variable "pagerduty_backup_integration_key" { description = "PagerDuty Events API V2 integration key for the Ghost Stack backup service" type = string sensitive = true -} \ No newline at end of file +} + +variable "metrics_publisher_key" { + description = "Grafana Cloud access policy token for Synthetic Monitoring (requires stacks:read, metrics:write, logs:write, traces:write scopes)" + type = string + sensitive = true +} + +variable "pagerduty_health_check_integration_key" { + description = "PagerDuty Events API V2 integration key for the Ghost Stack health check service" + type = string + sensitive = true +} + +variable "health_check_token" { + description = "Token for X-Health-Check-Token header used by tenant health check probes" + type = string + sensitive = true +} + +variable "tenant_domain" { + description = "Tenant domain for health check probes 
(e.g., separationofconcerns.dev)" + type = string +} diff --git a/opentofu/modules/pagerduty/main.tofu b/opentofu/modules/pagerduty/main.tofu index b26d241..0feeb95 100644 --- a/opentofu/modules/pagerduty/main.tofu +++ 
b/opentofu/modules/pagerduty/main.tofu @@ -256,6 +256,68 @@ output "backup_integration_key" { sensitive = true } +# ============================================================================= +# Ghost Stack Health Check Alerting (OFF-178) +# ============================================================================= +resource "pagerduty_service" "ghost-stack-dev-01-health-check" { + name = "ghost-stack-dev-01-health-check" + description = "Synthetic health check probes for tenant sites on ghost-stack-dev-01" + auto_resolve_timeout = "null" + acknowledgement_timeout = "600" + escalation_policy = pagerduty_escalation_policy.ghost-stack-dev-01-tailscale-ep.id + alert_creation = "create_alerts_and_incidents" + + auto_pause_notifications_parameters { + enabled = true + timeout = 300 + } + + incident_urgency_rule { + type = "constant" + urgency = "high" + } +} + +resource "pagerduty_service_integration" "ghost-stack-dev-01-health-check_apiv2" { + name = "Events API V2" + type = "events_api_v2_inbound_integration" + service = pagerduty_service.ghost-stack-dev-01-health-check.id +} + +resource "pagerduty_service_dependency" "health-check-flatcar-sd" { + dependency { + dependent_service { + id = pagerduty_service.ghost-stack-dev-01-health-check.id + type = "service" + } + supporting_service { + id = pagerduty_service.flatcar_instance.id + type = "service" + } + } +} + +resource "pagerduty_service_dependency" "health-check-soc_blog-sd" { + dependency { + type = "service_dependency" + + dependent_service { + id = pagerduty_business_service.soc_blog.id + type = "business_service" + } + supporting_service { + id = pagerduty_service.ghost-stack-dev-01-health-check.id + type = "service" + } + } +} + +output "health_check_integration_key" { + description = "PagerDuty Events API V2 integration key for the health check service" + value = pagerduty_service_integration.ghost-stack-dev-01-health-check_apiv2.integration_key + sensitive = true +} + resource "pagerduty_schedule" 
"ghost-stack_dev" { name = "ghost-stack dev" time_zone = "America/New_York"