Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/deploy-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ jobs:
BOOTSTRAP_R2_BUCKET: ${{ vars.BOOTSTRAP_R2_BUCKET }}
ADMIN_IP: ${{ secrets.ADMIN_IP }}
CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }}
HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }}
run: |
./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env

Expand All @@ -161,6 +162,7 @@ jobs:
TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }}
TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }}
TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }}
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }}
TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }}
run: |
docker run --rm \
Expand All @@ -180,6 +182,7 @@ jobs:
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_BACKEND_BUCKET \
ghcr.io/noahwhite/ghost-stack-shell:latest \
bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init"
Expand All @@ -201,6 +204,7 @@ jobs:
TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }}
TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }}
TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }}
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }}
TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }}
TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }}
TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }}
Expand All @@ -227,6 +231,7 @@ jobs:
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_BACKEND_BUCKET \
-e TF_VAR_admin_subnets \
-e TF_VAR_admin_ip \
Expand Down Expand Up @@ -290,6 +295,7 @@ jobs:
TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }}
TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }}
TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }}
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }}
TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }}
TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }}
TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }}
Expand All @@ -316,6 +322,7 @@ jobs:
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_BACKEND_BUCKET \
-e TF_VAR_admin_subnets \
-e TF_VAR_admin_ip \
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/pr-tofu-plan-develop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ jobs:
ADMIN_IP: ${{ secrets.ADMIN_IP }}
# GitHub environment secret for Cloudflare Zone ID (from bootstrap outputs)
CLOUDFLARE_ZONE_ID: ${{ secrets.CLOUDFLARE_ZONE_ID }}
HEALTH_CHECK_TOKEN: ${{ secrets.HEALTH_CHECK_TOKEN }}
run: |
./docker/scripts/infra-shell.sh --ci --secrets-only --export-github-env

Expand All @@ -114,6 +115,7 @@ jobs:
TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }}
TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }}
TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }}
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }}
TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }}
run: |
docker run --rm \
Expand All @@ -133,6 +135,7 @@ jobs:
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_BACKEND_BUCKET \
ghcr.io/noahwhite/ghost-stack-shell:latest \
bash -c "git config --global --add safe.directory /home/devops/app && ./opentofu/scripts/tofu.sh dev init"
Expand Down Expand Up @@ -178,6 +181,7 @@ jobs:
TF_VAR_pd_user_tok: ${{ env.TF_VAR_pd_user_tok }}
TF_VAR_GC_ACCESS_TOK: ${{ env.TF_VAR_GC_ACCESS_TOK }}
TF_VAR_SOC_DEV_TERRAFORM_SA_TOK: ${{ env.TF_VAR_SOC_DEV_TERRAFORM_SA_TOK }}
TF_VAR_health_check_token: ${{ env.TF_VAR_health_check_token }}
TF_BACKEND_BUCKET: ${{ env.TF_BACKEND_BUCKET }}
TF_VAR_admin_subnets: ${{ env.TF_VAR_admin_subnets }}
TF_VAR_admin_ip: ${{ env.TF_VAR_admin_ip }}
Expand All @@ -204,6 +208,7 @@ jobs:
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_BACKEND_BUCKET \
-e TF_VAR_admin_subnets \
-e TF_VAR_admin_ip \
Expand Down
14 changes: 14 additions & 0 deletions docker/scripts/infra-shell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ prompt_if_empty "TF_VAR_pd_subdomain" "Enter your PagerDuty subdomain: " false
prompt_if_empty "TF_VAR_pd_user_tok" "Enter your PagerDuty user API token: " true
prompt_if_empty "TF_VAR_GC_ACCESS_TOK" "Enter your Grafana Cloud access token: " true
prompt_if_empty "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "Enter your Grafana Cloud SOC DEV Terraform access token: " true
prompt_if_empty "TF_VAR_health_check_token" "Enter your health check token (X-Health-Check-Token): " true
prompt_if_empty "TF_VAR_infisical_client_id" "Enter your Infisical management identity client ID: " false
prompt_if_empty "TF_VAR_infisical_client_secret" "Enter your Infisical management identity client secret: " true
prompt_if_empty "TF_VAR_infisical_org_id" "Enter your Infisical organization ID: " false
Expand Down Expand Up @@ -267,6 +268,7 @@ export_var "TF_VAR_pd_subdomain" "${TF_VAR_pd_subdomain}"
export_var "TF_VAR_pd_user_tok" "${TF_VAR_pd_user_tok}"
export_var "TF_VAR_GC_ACCESS_TOK" "${TF_VAR_GC_ACCESS_TOK}"
export_var "TF_VAR_SOC_DEV_TERRAFORM_SA_TOK" "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK}"
export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}"
export_var "TF_VAR_infisical_client_id" "${TF_VAR_infisical_client_id}"
export_var "TF_VAR_infisical_client_secret" "${TF_VAR_infisical_client_secret}"
export_var "TF_VAR_infisical_org_id" "${TF_VAR_infisical_org_id}"
Expand All @@ -290,6 +292,16 @@ if [[ "$CI_MODE" == "true" ]]; then
fi
fi

# CI only: when the HEALTH_CHECK_TOKEN GitHub secret is present, copy it into
# TF_VAR_health_check_token (used by Grafana synthetic monitoring). The value
# is handed to mask_value before export_var publishes it.
if [[ "$CI_MODE" == "true" && -n "${HEALTH_CHECK_TOKEN:-}" ]]; then
  TF_VAR_health_check_token="${HEALTH_CHECK_TOKEN}"
  mask_value "$TF_VAR_health_check_token"
  export_var "TF_VAR_health_check_token" "${TF_VAR_health_check_token}"
  echo "Using health check token from GitHub secret"
fi

# Set admin subnets based on mode
if [[ "$CI_MODE" == "true" ]]; then
# CI mode: Use admin IP from GitHub secrets (required)
Expand Down Expand Up @@ -353,6 +365,7 @@ echo "Using SSH public key: $PUBKEY_PATH"
: "${TF_VAR_pd_user_tok:?Environment variable not set}"
: "${TF_VAR_GC_ACCESS_TOK:?Environment variable not set}"
: "${TF_VAR_SOC_DEV_TERRAFORM_SA_TOK:?Environment variable not set}"
: "${TF_VAR_health_check_token:?Environment variable not set}"
: "${TF_VAR_infisical_client_id:?Environment variable not set}"
: "${TF_VAR_infisical_client_secret:?Environment variable not set}"
: "${TF_VAR_infisical_org_id:?Environment variable not set}"
Expand Down Expand Up @@ -398,6 +411,7 @@ if [[ "$RUN_CONTAINER" == "true" ]]; then
-e TF_VAR_pd_user_tok \
-e TF_VAR_GC_ACCESS_TOK \
-e TF_VAR_SOC_DEV_TERRAFORM_SA_TOK \
-e TF_VAR_health_check_token \
-e TF_VAR_infisical_client_id \
-e TF_VAR_infisical_client_secret \
-e TF_VAR_infisical_org_id \
Expand Down
8 changes: 6 additions & 2 deletions opentofu/envs/dev/main.tofu
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,12 @@ module "grafana-cloud" {
providers = {
grafana = grafana.cloud
}
pagerduty_backup_integration_key = module.pagerduty.backup_integration_key
SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK
pagerduty_backup_integration_key = module.pagerduty.backup_integration_key
pagerduty_health_check_integration_key = module.pagerduty.health_check_integration_key
SOC_DEV_TERRAFORM_SA_TOK = var.SOC_DEV_TERRAFORM_SA_TOK
metrics_publisher_key = var.GC_ACCESS_TOK
health_check_token = var.health_check_token
tenant_domain = var.ghost_domain
}

module "pagerduty" {
Expand Down
84 changes: 82 additions & 2 deletions opentofu/envs/dev/tests/grafana-cloud.tofutest.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,30 @@ mock_provider "grafana" {
}
}

# Mock defaults for the Synthetic Monitoring installation resource: placeholder
# id, access token, and API URL returned in place of real provider values.
mock_resource "grafana_synthetic_monitoring_installation" {
defaults = {
id = "test-sm-installation-id"
sm_access_token = "test-sm-access-token"
stack_sm_api_url = "https://synthetic-monitoring-api-test.grafana.net"
}
}

# Mock defaults for Synthetic Monitoring checks: only the id is stubbed.
mock_resource "grafana_synthetic_monitoring_check" {
defaults = {
id = "test-sm-check-id"
}
}

# Mock probe-name -> probe-id map for the probes data source. The three names
# are the ones the grafana-cloud module's tenant_health check looks up.
mock_data "grafana_synthetic_monitoring_probes" {
defaults = {
probes = {
Atlanta = 1
NewYork = 22
SanFrancisco = 23
}
}
}

mock_data "grafana_data_source" {
defaults = {
id = "test-datasource-id"
Expand All @@ -82,8 +106,12 @@ run "grafana_cloud_module_tests" {
}

variables {
SOC_DEV_TERRAFORM_SA_TOK = "test-token"
pagerduty_backup_integration_key = "test-pd-integration-key"
SOC_DEV_TERRAFORM_SA_TOK = "test-token"
pagerduty_backup_integration_key = "test-pd-integration-key"
pagerduty_health_check_integration_key = "test-pd-health-check-key"
metrics_publisher_key = "test-metrics-publisher-key"
health_check_token = "test-health-check-token"
tenant_domain = "separationofconcerns.dev"
}

# Override all data sources to prevent real API calls
Expand Down Expand Up @@ -212,6 +240,17 @@ run "grafana_cloud_module_tests" {
}
}

# Pin the probes data source to a fixed three-entry map so the probe lookups
# resolve without an API call; the probe-count assertion in this run expects 3.
override_data {
target = data.grafana_synthetic_monitoring_probes.main
values = {
probes = {
Atlanta = 1
NewYork = 22
SanFrancisco = 23
}
}
}

# Test cloud stack configuration
assert {
condition = grafana_cloud_stack.soc_dev.name == "separationofconcerns0dev.grafana.net"
Expand Down Expand Up @@ -396,4 +435,45 @@ run "grafana_cloud_module_tests" {
condition = jsondecode(grafana_dashboard.ghost_stack_backup.config_json).title == "Ghost Stack Backup"
error_message = "Backup dashboard title should be 'Ghost Stack Backup'"
}

# Test synthetic monitoring resources (OFF-178)
# Expected job and target values derive from tenant_domain =
# "separationofconcerns.dev" set in this run's variables block.
assert {
condition = grafana_synthetic_monitoring_check.tenant_health.job == "tenant-health-separationofconcerns.dev"
error_message = "SM check job should be 'tenant-health-separationofconcerns.dev'"
}

assert {
condition = grafana_synthetic_monitoring_check.tenant_health.target == "https://separationofconcerns.dev/"
error_message = "SM check target should be 'https://separationofconcerns.dev/'"
}

assert {
condition = grafana_synthetic_monitoring_check.tenant_health.enabled == true
error_message = "SM check should be enabled"
}

# Frequency and timeout are expressed in milliseconds.
assert {
condition = grafana_synthetic_monitoring_check.tenant_health.frequency == 120000
error_message = "SM check frequency should be 120000ms (2 minutes)"
}

assert {
condition = grafana_synthetic_monitoring_check.tenant_health.timeout == 10000
error_message = "SM check timeout should be 10000ms (10 seconds)"
}

assert {
condition = grafana_synthetic_monitoring_check.tenant_health.alert_sensitivity == "medium"
error_message = "SM check alert sensitivity should be 'medium'"
}

# One probe id per entry in the overridden probes map.
assert {
condition = length(grafana_synthetic_monitoring_check.tenant_health.probes) == 3
error_message = "SM check should have 3 probes"
}

# Contact point that receives health check alerts.
assert {
condition = grafana_contact_point.pagerduty_health_check.name == "PagerDuty - Ghost Stack Health Check"
error_message = "Health check contact point name should be 'PagerDuty - Ghost Stack Health Check'"
}
}
6 changes: 6 additions & 0 deletions opentofu/envs/dev/variables.tofu
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ variable "SOC_DEV_TERRAFORM_SA_TOK" {
sensitive = true
}

# Secret value for the X-Health-Check-Token header; marked sensitive so it is
# redacted from plan/apply output.
variable "health_check_token" {
  type        = string
  sensitive   = true
  description = "Token for X-Health-Check-Token header used by Grafana synthetic health check probes"
}

variable "cloudflare_zone_id" {
description = "Cloudflare Zone ID (optional, reads from bootstrap state if not provided)"
type = string
Expand Down
75 changes: 75 additions & 0 deletions opentofu/modules/grafana-cloud/main.tofu
Original file line number Diff line number Diff line change
Expand Up @@ -10368,6 +10368,19 @@ resource "grafana_notification_policy" "root" {
group_interval = "5m"
repeat_interval = "4h"
}

# Route alerts labelled service=health-check to the health check PagerDuty
# contact point, grouped per alert name and instance.
policy {
matcher {
label = "service"
match = "="
value = "health-check"
}
contact_point = grafana_contact_point.pagerduty_health_check.name
group_by = ["alertname", "instance"]
group_wait = "30s"
group_interval = "5m"
repeat_interval = "4h"
}
}

# Alert rules — 3 rules covering failure, missed window, compose recovery
Expand Down Expand Up @@ -10810,3 +10823,65 @@ resource "grafana_dashboard" "ghost_stack_backup" {
]
})
}

# =============================================================================
# Synthetic Health Check Probes (OFF-178)
# =============================================================================

# Installs Synthetic Monitoring on the soc_dev cloud stack. Its sm_access_token
# and stack_sm_api_url outputs configure the aliased "sm" provider below.
resource "grafana_synthetic_monitoring_installation" "ghost_stack" {
stack_id = grafana_cloud_stack.soc_dev.id
metrics_publisher_key = var.metrics_publisher_key
}

# Aliased grafana provider used by the Synthetic Monitoring data source and
# check; authenticated with the token and API URL from the installation above.
# NOTE(review): this provider is configured from managed-resource attributes,
# which are unknown until the installation exists - confirm that the first
# plan/apply on a fresh stack behaves as expected.
provider "grafana" {
alias = "sm"
sm_access_token = grafana_synthetic_monitoring_installation.ghost_stack.sm_access_token
sm_url = grafana_synthetic_monitoring_installation.ghost_stack.stack_sm_api_url
}

# Looks up the available probes through the "sm" provider; the check below
# reads probe ids from its `probes` map by name.
data "grafana_synthetic_monitoring_probes" "main" {
provider = grafana.sm
}

# HTTP health check against the tenant's root URL, run from three named probes.
# Job name and target are both derived from var.tenant_domain.
resource "grafana_synthetic_monitoring_check" "tenant_health" {
provider = grafana.sm
job = "tenant-health-${var.tenant_domain}"
target = "https://${var.tenant_domain}/"
enabled = true
# Probe ids are resolved by name from the probes data source.
probes = [
data.grafana_synthetic_monitoring_probes.main.probes.Atlanta,
data.grafana_synthetic_monitoring_probes.main.probes.NewYork,
data.grafana_synthetic_monitoring_probes.main.probes.SanFrancisco,
]
# Milliseconds: run every 2 minutes, give up after 10 seconds.
frequency = 120000
timeout = 10000
alert_sensitivity = "medium"

# Same label/value the notification policy matcher (service = health-check)
# routes on.
labels = {
service = "health-check"
}

settings {
http {
method = "GET"
# Sends the shared secret as the X-Health-Check-Token request header.
headers = ["X-Health-Check-Token:${var.health_check_token}"]
fail_if_not_ssl = true
valid_status_codes = [200]
}
}
}

# PagerDuty contact point for health check alerts.
# Referenced by name from the service=health-check notification policy route.
# The severity, class, component, and group fields are hard-coded here.
resource "grafana_contact_point" "pagerduty_health_check" {
provider = grafana.soc_dev
name = "PagerDuty - Ghost Stack Health Check"

pagerduty {
integration_key = var.pagerduty_health_check_integration_key
severity = "critical"
disable_resolve_message = false
class = "health-check"
component = "caddy"
group = "ghost-stack-dev-01"
}
}
25 changes: 24 additions & 1 deletion opentofu/modules/grafana-cloud/variables.tofu
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,27 @@ variable "pagerduty_backup_integration_key" {
description = "PagerDuty Events API V2 integration key for the Ghost Stack backup service"
type = string
sensitive = true
}
}

# Passed to the Synthetic Monitoring installation as its metrics publisher key.
variable "metrics_publisher_key" {
  type        = string
  sensitive   = true
  description = "Grafana Cloud access policy token for Synthetic Monitoring (requires stacks:read, metrics:write, logs:write, traces:write scopes)"
}

# Integration key consumed by the health check PagerDuty contact point.
variable "pagerduty_health_check_integration_key" {
  type        = string
  sensitive   = true
  description = "PagerDuty Events API V2 integration key for the Ghost Stack health check service"
}

# Secret placed in the X-Health-Check-Token header of the tenant health check.
variable "health_check_token" {
  type        = string
  sensitive   = true
  description = "Token for X-Health-Check-Token header used by tenant health check probes"
}

# Bare hostname of the tenant site. The module interpolates it into the check
# target ("https://<tenant_domain>/") and the job name, so a value that is
# empty or already carries a scheme or path would produce a malformed probe
# URL. The validation below rejects those values at plan time.
variable "tenant_domain" {
  description = "Tenant domain for health check probes (e.g., separationofconcerns.dev)"
  type        = string

  validation {
    # Non-empty, and free of "/" (which also rules out "scheme://") and whitespace.
    condition     = length(var.tenant_domain) > 0 && !can(regex("[/\\s]", var.tenant_domain))
    error_message = "tenant_domain must be a bare hostname such as separationofconcerns.dev, with no scheme, path, or whitespace."
  }
}
Loading
Loading