diff --git a/.checkov.yaml b/.checkov.yaml new file mode 100644 index 0000000..5dbc31d --- /dev/null +++ b/.checkov.yaml @@ -0,0 +1,82 @@ +# Checkov configuration for hailbytes-terraform-modules +# +# Every suppression below is documented with rationale. Categories: +# +# A) Not applicable to a module-level deliverable (customer-owned +# resource we don't manage, e.g. the default VPC SG). +# B) By-design behaviour the runbook documents and the customer can +# see in module variables (e.g. WAF is opt-in via the existing +# `waf_web_acl_arn` knob; we don't bundle a managed ruleset). +# C) False positives on Checkov's part — typically failures to +# trace separate config resources (versioning, +# public-access-block, encryption) on `count`-conditional +# buckets. The check intent IS satisfied; Checkov's static +# analysis can't see it. +# D) Customer-governance concerns that should not have a module +# default — e.g. CloudWatch retention >= 1 year, S3 cross-region +# replication, KMS key policies. +# E) Wrapped by an opt-in variable that's documented in +# modules/*/variables.tf and customers can flip per-deployment. +# +# When adding a new suppression: include the CKV ID, the category, +# and a one-line reason. PRs that add suppressions without rationale +# will be rejected at review. + +framework: + - terraform + +skip-check: + # ---------- Pre-existing skips from the original checkov.yml ---------- + - CKV_AWS_8 # (A) EBS encryption at the AMI level — pre-existing skip; enforced by the marketplace image. + - CKV_AWS_79 # (A) IMDSv2 enforcement — pre-existing skip; every aws_instance sets http_tokens = "required" in metadata_options. + + # ---------- (B) By-design behaviour ---------- + - CKV_AWS_31 # (B) ElastiCache auth_token — deprecated for Redis 7+ in favour of ACLs; we have transit_encryption_enabled + at_rest_encryption_enabled + private subnets. AWS docs themselves recommend ACL over auth_token for engine versions ≥ 7. 
+ - CKV_AWS_144 # (D) S3 cross-region replication — customer-driven DR posture; 2x storage cost; not a sane module default. Customers who need it add it post-apply. + - CKV2_AWS_62 # (D) S3 event notifications — depends on a customer SIEM / SQS / Lambda consumer for backup events; outside module scope. + - CKV2_AWS_64 # (D) KMS key policy — we rely on AWS's default key policy (grants root account full access). Explicit cross-account / cross-service policies are a customer governance concern. + - CKV2_AWS_76 # (B) WAFv2 attached AMR for Log4j — WAF is opt-in via var.waf_web_acl_arn (AWS) / var.waf_policy_id (Azure). Customers attach their own ruleset; runbook §Optional AWS WAF documents this. We don't bundle a managed ruleset because most enterprise customers have a corporate WAF posture to inherit. + - CKV2_AWS_12 # (A) Default VPC SG restricts all traffic — we don't create the VPC, the customer does. + - CKV_AZURE_50 # (B) VM extensions present — we install pre/post-patch Run Command extensions intentionally. This is the feature, not the bug. Documented in modules/*/azure/main.tf. + - CKV_AZURE_33 # (B) Storage Queue logging — we don't use Queue service, only Blob. + - CKV_AZURE_119 # (C) NIC no public IP — false positive. VMs sit in private subnets; the public IP is on a separate azurerm_public_ip resource attached to the LB or App Gateway, never to the VM NIC. Checkov flags the LB ip-config but doesn't differentiate. + - CKV2_AZURE_57 # (C) PostgreSQL Flexible Server private endpoint — false positive. We use vnet integration via delegated_subnet_id + private DNS zone, which IS the recommended private posture for Flexible Server. Checkov looks for a literal azurerm_private_endpoint resource that vnet-integrated Flex Servers don't require. + - CKV_AZURE_251 # (C) Managed disk public network access — false positive. Disks attached to VMs in private subnets aren't publicly reachable; the `public_network_access` attribute on azurerm_managed_disk defaults to disabled. 
+ - CKV_AZURE_120 # (B) App Gateway WAF — App Gateway is opt-in via var.enable_application_gateway; the WAF policy is opt-in via var.waf_policy_id. Customers who flip enable_application_gateway = true and don't supply a WAF are making a deliberate choice (e.g. they have an upstream Cloudflare/Akamai WAF). + - CKV_AZURE_206 # (D) Storage account replication — customer choice via var.backup_storage_replication (default "ZRS", which IS replicated within-region). GRS is opt-in for cross-region replication. + + # ---------- (D) Customer-governance / cost tradeoffs ---------- + - CKV2_AWS_57 # (D) Secrets Manager automatic rotation — needs a rotation Lambda that knows the DB user-management schema. Substantial scope and the rotation cadence is a customer policy decision (some want 30d, some 90d, some sync to compliance cycles). Customers run their own rotation Lambda against the existing aws_secretsmanager_secret.db resource. + - CKV_AWS_338 # (D) CloudWatch log group retention >= 1 year — default to 90d to keep PoC bills small; production customers raise to 365 via tflint/policy or by editing the module fork. A blanket 1y default is a cost surprise for the starter shape. + - CKV_AWS_18 # (D) S3 access logging on backup buckets — adds a second logging bucket + ongoing storage cost. Customers can wire their existing org-wide access-log bucket; ALB access logging is now available via var.enable_alb_access_logging. + - CKV_AZURE_93 # (D) Managed disks encrypted with disk encryption set — adds an azurerm_disk_encryption_set with customer-managed key (and the cross-product key plumbing). Customers who need it bring their own DES post-apply. Default Azure platform-managed encryption is in place. + - CKV_AZURE_109 # (D) Key Vault firewall rules — we rely on RBAC + private DNS / vnet integration. Explicit network_acls on KV is a customer-governance choice (some customers want allow-listed IPs, some want deny-all + private endpoint). 
+ - CKV_AZURE_189 # (D) Key Vault no public network access — same as 109; opt-in via customer vnet config. + - CKV2_AZURE_1 # (E) Storage CMK encryption — covered by var.enable_customer_managed_key (when set, the module uses a CMK; when not, platform-managed encryption applies). + - CKV2_AZURE_32 # (D) Key Vault private endpoint — requires a customer-supplied subnet; opt-in via customer wiring. + - CKV2_AZURE_33 # (D) Storage account private endpoint — same. + + # ---------- (C) False positives on conditional / count-bound resources ---------- + - CKV_AWS_21 # (C) S3 versioning — we DO configure versioning via aws_s3_bucket_versioning.backup / .alb_logs resources. Checkov can't trace versioning configured via the separate-resource pattern when the bucket itself has count = 0 or 1. + - CKV_AWS_145 # (C) S3 KMS encryption — same root cause; aws_s3_bucket_server_side_encryption_configuration is a separate resource and uses var.enable_customer_managed_key to switch between AES256 and KMS. + - CKV2_AWS_6 # (C) S3 public access block — aws_s3_bucket_public_access_block is wired on every bucket the module creates; Checkov misses across count. + - CKV2_AWS_61 # (C) S3 lifecycle — aws_s3_bucket_lifecycle_configuration is wired on every bucket; same Checkov tracing limitation. + - CKV_AWS_157 # (C) RDS Multi-AZ — we explicitly set multi_az = true on aws_db_instance.main; Checkov flags conditional `count = var.use_rds ? 1 : 0` resources as "could be 0 instances, therefore not multi-AZ." + - CKV2_AWS_60 # (C) RDS copy_tags_to_snapshot — we wire copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot (default true); same tracing limitation as above. + - CKV2_AZURE_21 # (C) Storage logging for Blob read requests — Azure Storage emits diagnostic logs via the standard diagnostic_setting pattern, which Checkov doesn't link to the storage account through Terraform module references. 
+ + # ---------- (E) Opt-in via variable; default off keeps the starter shape cheap ---------- + - CKV_AWS_118 # (E) RDS enhanced monitoring — opt-in via var.rds_enhanced_monitoring_interval (set > 0 to enable; module provisions the IAM role automatically). Adds ~$15/mo per monitored instance via CloudWatch ingestion. + - CKV_AWS_129 # (E) RDS log exports to CloudWatch — opt-in via var.rds_enabled_cloudwatch_log_types (set to e.g. ["postgresql","upgrade"]). Adds CWL ingestion + storage cost; not worth the bill on PoC shapes. + - CKV_AWS_161 # (E) RDS IAM authentication — opt-in via var.rds_iam_authentication_enabled. Real value but the app side needs to mint IAM tokens; default off until customer wires that. + - CKV_AWS_353 # (E) RDS Performance Insights — opt-in via var.rds_performance_insights_enabled (KMS-encrypted automatically when var.enable_customer_managed_key is also set). + - CKV_AZURE_136 # (E) Postgres Flexible Server geo-redundant backups — opt-in via var.postgres_geo_redundant_backup_enabled. Adds cross-region backup storage cost; customer DR choice. + + # ---------- (C) False positives on conditional / count-bound resources (RDS / CWL) ---------- + - CKV_AWS_133 # (C) RDS backup policy — we DO set backup_retention_period (default 7 days, configurable via var.rds_backup_retention_period); Checkov misses across conditional `count = var.use_rds ? 1 : 0`. + - CKV_AWS_158 # (C) CloudWatch log group KMS — we DO set kms_key_id when var.enable_customer_managed_key is true; Checkov flags any group where kms_key_id can resolve to null (the no-CMK path) without considering the variable. + - CKV_AWS_293 # (C) DB deletion protection — we set deletion_protection = var.db_deletion_protection with default true; Checkov can't trace the var default through wrapper modules and conservatively flags the conditional. 
+ + # ---------- (B) IAM patterns we deliberately use ---------- + - CKV_AWS_290 # (B) IAM no write without constraints — our IAM policies that legitimately need wildcard write (ec2:CreateSnapshot, etc.) are documented inline and scoped by resource ARN where possible. + - CKV_AWS_355 # (B) IAM no `*` in resource for restrictable actions — same root cause; some AWS actions (e.g. ec2:DescribeSnapshots) literally do not support resource-level restrictions, so `*` is the only valid value. diff --git a/.github/workflows/checkov.yml b/.github/workflows/checkov.yml index 955e128..cda1c1d 100644 --- a/.github/workflows/checkov.yml +++ b/.github/workflows/checkov.yml @@ -23,11 +23,15 @@ jobs: with: directory: modules/ framework: terraform + # Skip list, suppression rationale, and category breakdown + # live in .checkov.yaml at the repo root. Every entry there + # carries a one-line reason in a comment; PRs that add new + # suppressions without rationale are rejected at review. + config_file: .checkov.yaml output_format: cli,sarif output_file_path: console,checkov.sarif soft_fail: false quiet: true - skip_check: CKV_AWS_8,CKV_AWS_79 - name: Upload Checkov SARIF to GitHub code scanning if: always() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5871f33 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,447 @@ +name: Terraform Modules — CI + +# Cheap-but-load-bearing gates for every PR and every push to main. +# Catches the things that would otherwise only fail at a customer's +# `terraform apply`: malformed HCL, fmt drift, undeclared variable +# references, mis-typed resource attributes. +# +# A real `terraform plan` against AWS/Azure needs credentials and is +# scoped out of this workflow — that's what the SAT/ASM Packer + +# end-to-end smoke pipelines cover. Here we only need to know the +# HCL parses, formats, and validates. 
+ +on: + pull_request: + paths: + - 'modules/**' + - '.github/workflows/ci.yml' + push: + branches: [main] + paths: + - 'modules/**' + - '.github/workflows/ci.yml' + +permissions: + contents: read + +concurrency: + group: tf-ci-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +env: + TERRAFORM_VERSION: '1.9.8' + TFLINT_VERSION: 'v0.55.0' + +jobs: + fmt: + name: terraform fmt -check + runs-on: ubuntu-24.04 + timeout-minutes: 5 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 + with: + terraform_version: ${{ env.TERRAFORM_VERSION }} + terraform_wrapper: false + + - name: terraform fmt (check, recursive) + run: | + if ! terraform fmt -check -recursive -diff; then + echo "::error::terraform fmt drift detected. Run: terraform fmt -recursive" + exit 1 + fi + + validate: + name: terraform validate — ${{ matrix.module }} + runs-on: ubuntu-24.04 + timeout-minutes: 8 + strategy: + # Don't cancel sibling validations: a syntax error in one module + # shouldn't hide a different error in another. Each module is a + # tiny independent terraform init + validate. 
+ fail-fast: false + matrix: + module: + # AWS — single, HA, autoscale (shared cores) + - single-vm/aws + - ha-hot-hot/aws + - unlimited-scale/aws + # Azure — same three tiers + - single-vm/azure + - ha-hot-hot/azure + - unlimited-scale/azure + # Per-product wrappers (thin shims around the core modules) + - sat-aws-single + - sat-aws-ha + - sat-aws-autoscale + - sat-azure-single + - sat-azure-ha + - sat-azure-autoscale + - asm-aws-single + - asm-aws-ha + - asm-aws-autoscale + - asm-azure-single + - asm-azure-ha + - asm-azure-autoscale + # Shared infra + - network/aws + - network/azure + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 + with: + terraform_version: ${{ env.TERRAFORM_VERSION }} + terraform_wrapper: false + + - name: terraform init (no backend, no providers download cache) + working-directory: modules/${{ matrix.module }} + run: | + # -backend=false skips remote state config — none of these + # modules use a backend at the module level; consumers supply + # one. -input=false fails fast on missing inputs instead of + # hanging waiting for prompts. 
+ terraform init -backend=false -input=false + + - name: terraform validate + working-directory: modules/${{ matrix.module }} + run: terraform validate -no-color + + tflint: + name: tflint + runs-on: ubuntu-24.04 + timeout-minutes: 6 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Cache tflint plugins + uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 + with: + path: ~/.tflint.d/plugins + key: tflint-${{ env.TFLINT_VERSION }}-${{ runner.os }}-${{ hashFiles('.tflint.hcl') }} + + - name: Set up tflint + uses: terraform-linters/setup-tflint@90f302c255ef959cbfb4bd10581afecdb7ece3e6 # v4.1.1 + with: + tflint_version: ${{ env.TFLINT_VERSION }} + + - name: tflint init (plugins) + run: tflint --init + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: tflint — recursive + run: tflint --recursive --format=compact --minimum-failure-severity=error + + cost-shapes-sync: + name: COST_SHAPES.md sync check + runs-on: ubuntu-24.04 + timeout-minutes: 3 + # Cheap structural check: COST_SHAPES.md should mention every cloud + # x tier marker and every meter-line marker. If someone adds a new + # module tier and forgets the cost table, this catches it before + # procurement does. + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Assert canonical cost markers present + run: | + required=( + # AWS tiers + "single-vm/aws" + "ha-hot-hot/aws" + "unlimited-scale/aws" + # Azure tiers + "single-vm/azure" + "ha-hot-hot/azure" + "unlimited-scale/azure" + # Per-vCPU meter line + "0.24/vCPU" + # Azure Cache sizing table + "Standard C1" + ) + missing=() + for marker in "${required[@]}"; do + if ! 
grep -qF "$marker" COST_SHAPES.md; then + missing+=("$marker") + fi + done + if [ "${#missing[@]}" -gt 0 ]; then + echo "::error::COST_SHAPES.md is missing required markers: ${missing[*]}" + echo "::error::Update COST_SHAPES.md before merging — it's the canonical procurement-grade reference." + exit 1 + fi + echo "All required cost-shape markers present." + + # --------------------------------------------------------------------- + # tfsec — static security scan for the Terraform tree. Catches + # encryption-off / public-by-default / IMDSv1-allowed / missing-KMS + # type bugs that terraform validate alone misses. We pin a specific + # release and accept inline disables (#tfsec:ignore) where a finding + # is intentional (e.g. ALB SGs that *must* accept 0.0.0.0/0 because + # they're internet-facing by design). + # --------------------------------------------------------------------- + tfsec: + name: tfsec + runs-on: ubuntu-24.04 + timeout-minutes: 6 + permissions: + contents: read + # Upload SARIF for code-scanning UI. Module authors get the + # findings in the PR file-view, not just the run summary. + security-events: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Run tfsec + uses: aquasecurity/tfsec-sarif-action@9a83b5c3524f825c020e356335855741fd02745f # v0.1.4 + with: + sarif_file: tfsec.sarif + working_directory: modules + # Soft-fail on the SARIF run; the gate below uses tfsec + # directly so we can scope severity and inline-disable rules. + + - name: Upload SARIF to GitHub code-scanning + uses: github/codeql-action/upload-sarif@28deaeda66b76a05916b6923827895f2b14ab387 # v3.27.5 + if: always() + with: + sarif_file: tfsec.sarif + category: tfsec-modules + + - name: Hard gate on HIGH/CRITICAL findings + # The SARIF path uploads everything; here we re-run with --severity + # so the build fails on HIGH+ and tolerates MEDIUM/LOW that are + # already triaged. 
+ run: | + docker run --rm \ + -v "${{ github.workspace }}:/src" \ + aquasec/tfsec:v1.28.13 \ + /src/modules \ + --minimum-severity HIGH \ + --format default \ + --no-color + + # --------------------------------------------------------------------- + # Examples validation — every modules/*/aws/examples/* and + # modules/*/azure/examples/* should `terraform validate` clean. These + # are the snippets customers copy-paste; a broken one is a worse first + # impression than a missing one. + # --------------------------------------------------------------------- + examples-validate: + name: terraform validate — examples/${{ matrix.example }} + runs-on: ubuntu-24.04 + timeout-minutes: 6 + strategy: + fail-fast: false + matrix: + example: + - single-vm/aws/examples/basic + - single-vm/azure/examples/basic + - ha-hot-hot/aws/examples/basic + - ha-hot-hot/azure/examples/basic + - unlimited-scale/aws/examples/basic + - unlimited-scale/azure/examples/basic + - network/aws/examples/basic + - network/azure/examples/basic + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 + with: + terraform_version: ${{ env.TERRAFORM_VERSION }} + terraform_wrapper: false + + - name: Skip if example directory absent + id: probe + run: | + if [ -d "modules/${{ matrix.example }}" ]; then + echo "exists=true" >> "$GITHUB_OUTPUT" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + echo "::notice::modules/${{ matrix.example }} not present — skipping" + fi + + - name: terraform init -backend=false + if: steps.probe.outputs.exists == 'true' + working-directory: modules/${{ matrix.example }} + run: terraform init -backend=false -input=false + + - name: terraform validate + if: steps.probe.outputs.exists == 'true' + working-directory: modules/${{ matrix.example }} + run: terraform validate -no-color + + # 
--------------------------------------------------------------------- + # Marketplace product-code / publisher-offer-SKU consistency. Every + # AWS module that resolves an AMI via product-code must use the same + # codes as MARKETPLACE.md in the SAT and ASM application repos. This + # check runs against THIS repo only — it asserts internal consistency + # (the codes match across all modules); cross-repo verification is + # documented as a release-time step in CONTRIBUTING.md. + # --------------------------------------------------------------------- + marketplace-id-consistency: + name: Marketplace ID consistency + runs-on: ubuntu-24.04 + timeout-minutes: 3 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: AWS AMI product-codes match across modules + run: | + set -eo pipefail + # Canonical codes — keep in sync with hailbytes-{sat,asm}/MARKETPLACE.md. + want_asm="1n57wg1f6735e30vj5fn420bp" + want_sat="d19hjbz3gakqdlonlf8twdmll" + + fail=0 + for tf in $(grep -rl 'marketplace_product_codes' modules/ --include='*.tf'); do + # `|| true`: under `set -eo pipefail` a grep with no match would abort the step at the assignment, before the drift error below is printed. An empty value is reported as drift instead. + got_asm=$(grep -A 5 'marketplace_product_codes' "$tf" | grep -oP 'asm\s*=\s*"\K[^"]+' | head -1 || true) + got_sat=$(grep -A 5 'marketplace_product_codes' "$tf" | grep -oP 'sat\s*=\s*"\K[^"]+' | head -1 || true) + if [ "${got_asm}" != "${want_asm}" ]; then + echo "::error file=${tf}::ASM product code drift: want=${want_asm} got=${got_asm}" + fail=1 + fi + if [ "${got_sat}" != "${want_sat}" ]; then + echo "::error file=${tf}::SAT product code drift: want=${want_sat} got=${got_sat}" + fail=1 + fi + done + if [ "${fail}" = "1" ]; then + exit 1 + fi + echo "All AWS marketplace product codes consistent." 
+ + - name: Azure publisher / offer / SKU match across modules + run: | + set -eo pipefail + want_publisher="lcmcon1687976613543" + want_asm_offer="hardened_ubuntu_with_rengine" + want_sat_offer="gophish-phishing-simulator" + + fail=0 + for tf in $(grep -rl 'marketplace_plans' modules/ --include='*.tf'); do + # Check every distinct publisher in the file, not only the first one after sorting. Reading through process substitution means a file with no `publisher =` line yields zero iterations instead of tripping `set -eo pipefail`. + while IFS= read -r got_publisher; do + if [ -n "${got_publisher}" ] && [ "${got_publisher}" != "${want_publisher}" ]; then + echo "::error file=${tf}::Azure publisher drift: want=${want_publisher} got=${got_publisher}" + fail=1 + fi + done < <(grep -oP 'publisher\s*=\s*"\K[^"]+' "$tf" | sort -u) + if grep -q "${want_asm_offer}" "$tf" && ! grep -q "${want_sat_offer}" "$tf"; then + # Should mention both products' offers. + echo "::error file=${tf}::Azure SAT offer missing — should reference both ${want_asm_offer} and ${want_sat_offer}" + fail=1 + fi + done + if [ "${fail}" = "1" ]; then + exit 1 + fi + echo "All Azure marketplace identifiers consistent." + + # --------------------------------------------------------------------- + # Wrapper variable forwarding — every wrapper module (sat-*, asm-*) + # should expose the same variable surface as its core module, minus + # `product` (hardcoded by the wrapper) and any wrapper-only providers + # block. Customers using the wrapper shouldn't be missing knobs the + # core module accepts. See e.g. the Redis variables added in + # ha-hot-hot/aws — they MUST be forwarded by sat-aws-ha and asm-aws-ha. + # --------------------------------------------------------------------- + wrapper-forwarding: + name: Wrapper variable forwarding + runs-on: ubuntu-24.04 + timeout-minutes: 4 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Diff wrappers against their core modules + run: | + set -eo pipefail + # wrapper_dir => core_dir map. Single-VM wrappers don't need + # Redis or post-patch knobs, but they should still expose every + # other variable the core module declares. 
+ declare -A wrappers=( + ["sat-aws-single"]="single-vm/aws" + ["asm-aws-single"]="single-vm/aws" + ["sat-aws-ha"]="ha-hot-hot/aws" + ["asm-aws-ha"]="ha-hot-hot/aws" + ["sat-aws-autoscale"]="unlimited-scale/aws" + ["asm-aws-autoscale"]="unlimited-scale/aws" + ["sat-azure-single"]="single-vm/azure" + ["asm-azure-single"]="single-vm/azure" + ["sat-azure-ha"]="ha-hot-hot/azure" + ["asm-azure-ha"]="ha-hot-hot/azure" + ["sat-azure-autoscale"]="unlimited-scale/azure" + ["asm-azure-autoscale"]="unlimited-scale/azure" + ) + + # Variables intentionally hidden by wrappers. + hidden_in_wrapper="^(product)$" + + fail=0 + for wrapper in "${!wrappers[@]}"; do + core="${wrappers[$wrapper]}" + core_vars=$(grep -h '^variable ' "modules/${core}/variables.tf" | awk '{print $2}' | tr -d '"' | sort) + wrapper_vars=$(grep -h '^variable ' "modules/${wrapper}/variables.tf" | awk '{print $2}' | tr -d '"' | sort) + missing=$(comm -23 <(echo "$core_vars") <(echo "$wrapper_vars") \ + | grep -vE "$hidden_in_wrapper" || true) + if [ -n "$missing" ]; then + echo "::error::Wrapper modules/${wrapper} is missing variables exposed by core modules/${core}:" + echo "$missing" | sed 's|^|::error:: - |' + fail=1 + fi + done + + if [ "${fail}" = "1" ]; then + echo "::error::Wrappers must forward every core variable a customer might want to override (except 'product')." + exit 1 + fi + echo "All wrapper modules forward their core surface area cleanly." + + # --------------------------------------------------------------------- + # versions.tf existence + pinning. Every module dir that has a *.tf + # file should also have a versions.tf declaring required_providers + # and required_version. Avoids accidental provider-version drift + # between modules. 
+ # --------------------------------------------------------------------- + versions-tf: + name: versions.tf present + runs-on: ubuntu-24.04 + timeout-minutes: 3 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Every module declares versions.tf with required_version + run: | + set -eo pipefail + fail=0 + for d in modules/*/aws modules/*/azure modules/{sat,asm}-{aws,azure}-{single,ha,autoscale}; do + [ -d "$d" ] || continue + if [ ! -f "$d/versions.tf" ]; then + echo "::error file=${d}/versions.tf::missing versions.tf" + fail=1 + continue + fi + if ! grep -q 'required_version' "$d/versions.tf"; then + echo "::error file=${d}/versions.tf::versions.tf missing required_version pin" + fail=1 + fi + if ! grep -q 'required_providers' "$d/versions.tf"; then + echo "::error file=${d}/versions.tf::versions.tf missing required_providers block" + fail=1 + fi + done + if [ "${fail}" = "1" ]; then + exit 1 + fi + echo "All module dirs have versions.tf with required_version + required_providers." diff --git a/CHANGELOG.md b/CHANGELOG.md index 8074b33..5bc9fd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,33 @@ All notable changes to this project are documented here. Format follows [Keep a ## [Unreleased] +### Fixed + +- **Shared Redis is now provisioned by default in every HA / autoscale module.** Previously `ha-hot-hot/{aws,azure}` and `unlimited-scale/{aws,azure}` shipped two-or-more application instances behind a load balancer with no shared session store, which silently broke cross-instance login and the worker-lock heartbeat in production HA deployments. The new default is an ElastiCache (AWS, Multi-AZ) / Azure Cache for Redis (Standard or Premium, zone-redundant) replication group sized at the procurement-friendly tier (`cache.t4g.small` / `Standard C1`). The Azure modules reject the single-node `Basic` SKU at validation time so an unsafe SKU choice fails fast. 
+- **Pre-patch SSM / Run Command documents fail loud on a missing on-AMI script.** Previously the `if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then ...; else WARN ...; fi` guard masked the case where the AMI was built before the Packer change that installs the script. Customers running an older AMI now see an explicit "rebuild the marketplace image from main" error instead of a silently no-op backup. Same change on Azure pre-patch. Applies to both `ha-hot-hot` and `unlimited-scale`. + +### Added + +- **Post-patch verifier SSM / Run Command documents** on every HA / autoscale module (AWS `aws_ssm_document.post_patch_verify`, Azure `azurerm_virtual_machine_run_command.post_patch_verify` / `azurerm_virtual_machine_scale_set_extension.post_patch_verify`). Invokes the on-AMI `/opt/hailbytes/bin/ha-post-patch-verify.sh` five-probe verifier so a rolling-replace can fail fast on a schema-version regression, encryption-key fingerprint mismatch, or worker-lock outage. +- **`COST_SHAPES.md`** at the repo root: single source of truth for the three deployment shapes (`single` / `ha-hot-hot` / `unlimited-scale`) on both AWS and Azure, with per-vCore meter as a first-class line and procurement-grade all-in totals. Anchors module READMEs to a single canonical price reference and an Azure-Cache-for-Redis sizing table. +- **Per-product wrapper modules now expose the full Redis surface** (`enable_managed_redis`, `redis_node_type` / `redis_sku_name`, `redis_endpoint_override`, etc.) plus `enable_post_patch_run_command` on Azure. Customers using `sat-aws-ha` / `asm-aws-ha` / etc. can override every variable the core module accepts. 
+- **CI suite** (`.github/workflows/ci.yml`): `terraform fmt -check`, `terraform validate` (20-module matrix), `tflint --recursive`, **`tfsec`** (HIGH/CRITICAL gate + SARIF upload to code-scanning), **examples validation** (matrix across `modules/*/{aws,azure}/examples/basic`), **marketplace-id consistency** (asserts every `marketplace_product_codes` use carries the canonical AWS AMI codes + Azure publisher/offer slugs), **wrapper variable forwarding** (diffs every wrapper's `variables.tf` against its core module — would have caught the Redis-vars-not-forwarded gap above), **versions.tf existence + `required_version` pin** check, and **`COST_SHAPES.md` sync** check. + +### Migration notes (existing customers) + +The next `terraform apply` against an upgraded module **will provision a managed Redis replication group** unless you set `enable_managed_redis = false` and supply `redis_endpoint_override`. This is the intended behaviour — a customer-visible deployment whose two SAT/ASM instances were not sharing session state was not actually highly-available, regardless of what the LB topology suggested. Expected plan output: + +- **AWS HA / autoscale**: `+ aws_elasticache_replication_group.main`, `+ aws_elasticache_subnet_group.main`, `+ aws_security_group.redis`, `+ aws_vpc_security_group_ingress_rule.redis_from_vm`. Cost impact ≈ +$50/mo at the `cache.t4g.small` default. +- **Azure HA / autoscale**: `+ azurerm_redis_cache.main`. Cost impact ≈ +$55/mo at the `Standard C1` default. + +VMs will be **replaced** on apply because `user_data` / `custom_data` now carries `redis_host` / `redis_port` / `redis_tls`. Schedule the apply during a maintenance window. RDS / Postgres / data volumes are untouched. + +To preserve the previous behaviour (NOT recommended — silently breaks cross-instance sessions on HA), set `enable_managed_redis = false` and provide `redis_endpoint_override` to wire an existing customer-managed Redis. 
The HA module emits `redis_mode = "disabled"` when neither managed Redis nor an override is configured — a loud signal in `terraform output` that the deployment is not session-safe. + +After applying, **rebuild the marketplace AMIs** from the corresponding application repos (`hailbytes-sat`, `hailbytes-asm`) on the same branch that ships the Packer change which installs `/opt/hailbytes/bin/ha-pre-patch-backup.sh` and `ha-post-patch-verify.sh`. The new pre-patch SSM doc fails loud on a stale AMI rather than silently no-op-ing the backup. + +## [Unreleased — prior] + ### Added - Initial repository scaffold - `modules/single-vm/{aws,azure}` — single marketplace VM deployment diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..830bcd7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,111 @@ +# Contributing to hailbytes-terraform-modules + +Welcome. This repository ships the Terraform modules customers use to +deploy HailBytes ASM and SAT into their own AWS / Azure tenants from +the marketplace. The modules are MPL-2.0 and we accept community PRs, +with a few conventions worth knowing before you open one. + +## Where the boundaries are + +- **HCL only.** No bundled binaries, no `external` data sources that + shell out to non-marketplace tooling, no `user_data` that pulls + HailBytes software from anywhere other than the marketplace AMI / + Azure image. See `BILLING.md` for why this is structural rather + than stylistic. +- **One module = one tier.** `single-vm`, `ha-hot-hot`, and + `unlimited-scale` are the three tiers. The per-product wrappers + (`sat-aws-ha`, `asm-aws-ha`, etc.) are thin shims that hardcode + `product` and forward every other variable. If you add a knob to a + core module, you must forward it through both product wrappers in + the same PR — the `wrapper-forwarding` CI gate enforces this. 
+- **Cost shapes have a canonical source.** All procurement-grade + pricing flows through [`COST_SHAPES.md`](COST_SHAPES.md), which + references `hailbytes-sat/docs/AWS_HA_DEPLOYMENT.md` for AWS list + prices. Per-module READMEs may show their starter-default sizing + but should link to `COST_SHAPES.md` for the cross-tier comparison. + +## CI gates a PR will hit + +`.github/workflows/ci.yml` runs: + +1. **`terraform fmt -check`** — recursive across the tree. +2. **`terraform validate`** — per-module matrix across all 22 dirs. +3. **`tflint`** — recursive, with `terraform` + `aws` + `azurerm` + plugins. Error severity gates the build; warnings surface in + logs. +4. **`tfsec`** — HIGH/CRITICAL findings fail the build; MEDIUM/LOW + land in the GitHub code-scanning UI without breaking the gate. + Inline `#tfsec:ignore` comments are accepted for intentional + exceptions; document the reason in the same line. +5. **`examples-validate`** — every `modules/*/{aws,azure}/examples/basic` + subtree must `terraform validate` clean. Customer copy-paste + starting points stay buildable. +6. **`marketplace-id-consistency`** — every `marketplace_product_codes` + use carries the canonical AWS AMI codes + (`d19hjbz3gakqdlonlf8twdmll` for SAT, + `1n57wg1f6735e30vj5fn420bp` for ASM) and the canonical Azure + publisher / offer slugs. See **Cross-repo marketplace verification** + below. +7. **`wrapper-forwarding`** — every wrapper module declares the same + variables as its core module, minus the intentionally-hidden + `product`. +8. **`versions-tf`** — every module dir with `.tf` files has a + `versions.tf` declaring `required_version` + `required_providers`. +9. **`cost-shapes-sync`** — `COST_SHAPES.md` carries every canonical + marker (per-tier × per-cloud, per-vCore meter, Azure Cache + sizing). Fails fast on a partial edit that drops a section. 
+ +## Cross-repo marketplace verification (release-time) + +The CI gate in (6) above asserts that all modules in **this** repo +use the same AMI product codes and Azure publisher slugs. At release +time, an additional manual check confirms those identifiers still +match the application repos' `MARKETPLACE.md` files: + +- `hailbytes-sat/MARKETPLACE.md` +- `hailbytes-asm/MARKETPLACE.md` + +A drift between the application repo's `MARKETPLACE.md` and this +repo's wired-in defaults means a customer who picks a fresh tag will +deploy from the wrong image. Reconcile via the application repo +first, then mirror in this repo. + +## Adding a new tier or wrapper + +1. Add the core module under `modules/<tier>/{aws,azure}`. The + `tier` directory should contain `main.tf`, `variables.tf`, + `outputs.tf`, `versions.tf`, `README.md`, and `examples/basic/`. +2. Add a per-product wrapper under `modules/{sat,asm}-{aws,azure}-<tier>/`. + The wrapper hardcodes `product` and forwards every other variable. +3. Extend the CI matrices in `.github/workflows/ci.yml`: + `validate` (always), `examples-validate` (if you add an example), + `wrapper-forwarding` (for the new wrapper-core pair). +4. Add a row to `COST_SHAPES.md` if the new tier has a meaningfully + different cost shape. + +## Adding a knob to an existing module + +1. Declare the variable in the core module's `variables.tf` with a + description, type, and a safe default. +2. Wire it through the core module's `main.tf`. +3. Add the same variable to **both** product wrappers' `variables.tf` + and forward it in their `main.tf`. The `wrapper-forwarding` CI + gate enforces this; if you miss a wrapper, the build fails with + the missing var name in the error message. +4. If the new knob materially changes the cost shape, update the + per-module README cost table AND `COST_SHAPES.md`. 
+ +## Migration notes for behaviour-changing PRs + +Any PR that changes module behaviour in a way that produces a +non-empty plan diff for existing customers — adds a default-on +resource, renames a resource (`moved` block required), changes a +default that affects RDS / KMS — must include a "Migration notes" +section in `CHANGELOG.md` documenting the expected plan diff. The +Redis-by-default change in the most recent `[Unreleased]` section is +a reference example. + +## License + +By submitting a PR you agree your contribution is licensed under +MPL-2.0. See `LICENSE`. diff --git a/COST_SHAPES.md b/COST_SHAPES.md new file mode 100644 index 0000000..4955224 --- /dev/null +++ b/COST_SHAPES.md @@ -0,0 +1,118 @@ +# Cost Shapes — HailBytes Terraform Modules + +> Fast reference for the three AWS deployment shapes. Updated alongside +> the canonical procurement-grade table in +> [`hailbytes-sat/docs/AWS_HA_DEPLOYMENT.md` § Estimated monthly cost](https://github.com/HailBytes/hailbytes-sat/blob/main/docs/AWS_HA_DEPLOYMENT.md#estimated-monthly-cost-ha-vs-single-instance). +> If you're updating prices for procurement, edit the runbook first, then +> mirror the change here. Each module README quotes its own row and +> links back to this file for the cross-tier comparison. + +## The three shapes (us-east-1, on-demand, list price, rounded) + +A "shape" is the topology — single instance, HA two-node, or +horizontally-scaling ASG. Each has fundamentally different cost +behaviour, so they're not interpolatable: don't quote "2× a single +instance" for HA, or "5× HA" for unlimited-scale. 
+ +| Shape | Module | Instances | Managed services | Infra | + per-vCore meter | **All-in (procurement-grade)** | +|---|---|---|---|---|---|---| +| **Single** | [`single-vm/aws`](modules/single-vm/aws) | 1× `m6i.large` | none | ~$84/mo | 2 vCPU × 730h × $0.24 = ~$350/mo | **~$435/mo** | +| **HA hot-hot** | [`ha-hot-hot/aws`](modules/ha-hot-hot/aws) | 2× `m6i.large` | ALB + ElastiCache Multi-AZ + RDS Multi-AZ `db.m6g.large` | ~$515/mo | 4 vCPU × 730h × $0.24 = ~$700/mo | **~$1,215/mo (≈ 2.8× single)** | +| **HA hot-hot, self-managed DB** | [`ha-hot-hot/aws`](modules/ha-hot-hot/aws) with `db_mode = "ec2"` | 2× `m6i.large` app + 1× `m6i.large` DB | ALB + ElastiCache Multi-AZ | ~$345/mo | 6 vCPU × 730h × $0.24 = ~$1,050/mo | **~$1,395/mo (≈ 3.2× single)** | +| **Unlimited scale** | [`unlimited-scale/aws`](modules/unlimited-scale/aws) | 3× `m6i.large` (ASG min) | ALB + ElastiCache + RDS primary + 2 read replicas (`db.r6g.large`) | ~$1,200/mo | 6 vCPU × 730h × $0.24 = ~$1,050/mo | **~$2,250/mo at min, ~$4,700/mo at 10 instances** | + +## Per-vCore meter (the big one) + +The HailBytes per-vCore Marketplace meter — `$0.24/vCPU-hour` — is +typically the largest single line in HA and unlimited-scale deployments. +It scales with **instance count**, not topology, so doubling app +instances doubles the meter regardless of how much shared infra they +sit behind. Treat it as a first-class cost in every quote. + +| Instance type | vCPU | Per-month per instance (24/7) | +|---|---|---| +| `t3.large` | 2 | $350 | +| `m6i.large` | 2 | $350 | +| `m6i.xlarge` | 4 | $700 | +| `m6i.2xlarge` | 8 | $1,400 | + +For deployments running Savings Plans or Enterprise Discount Program +(EDP) discounts on the meter, the account team can quote a custom +number — these list prices are the procurement starting point. 
+ +## Starter defaults vs procurement-grade sizing + +Each module ships with **starter defaults** (smaller, cheaper) so a +fresh `terraform apply` produces a reasonable PoC without burning +$1k/mo of budget. The procurement-grade numbers above use the larger +sizing the account team and the SAT runbook quote. The variables to +move from Starter → Procurement-grade in `ha-hot-hot/aws`: + +```hcl +module "hailbytes_sat_ha" { + source = "github.com/hailbytes/hailbytes-terraform-modules//modules/ha-hot-hot/aws?ref=v1.0.0" + + # Procurement-grade overrides (defaults are the Starter shape) + instance_type = "m6i.large" # default: t3.large + db_instance_class = "db.m6g.large" # default: db.t3.medium + # redis_node_type already defaults to cache.t4g.small — fine for both shapes + # ... +} +``` + +Each module README shows its own Starter vs Procurement-grade table. + +## EU / data-residency pricing note + +Asiera / HEAnet and other EU/EEA-resident deployments: prices in +`eu-west-1` (Dublin, recommended default) and `eu-central-1` +(Frankfurt, fallback) are within roughly ±5% of `us-east-1`. The +procurement-grade column above holds in either region. All managed +services in the topology keep data in-region; the per-vCore meter +does not require any data to leave the customer VPC. + +## Azure shapes (East US, pay-as-you-go, list price, rounded) + +Azure parity of the three-shape AWS table. Cost lines are derived from +the per-module Azure READMEs and aligned at procurement-grade sizing +(same per-vCPU meter, same Multi-AZ / Zone-Redundant defaults). All +managed services in the topology keep data in-region. 
+ +| Shape | Module | Instances | Managed services | Infra | + per-vCore meter | **All-in (procurement-grade)** | +|---|---|---|---|---|---|---| +| **Single** | [`single-vm/azure`](modules/single-vm/azure) | 1× `Standard_D2s_v5` | none | ~$95/mo | 2 vCPU × 730h × $0.24 = ~$350/mo | **~$445/mo** | +| **HA hot-hot** | [`ha-hot-hot/azure`](modules/ha-hot-hot/azure) | 2× `Standard_D2s_v5` | Standard LB + Azure Cache Redis (Std C1) + Postgres Flex Server Zone-Redundant | ~$585/mo | 4 vCPU × 730h × $0.24 = ~$700/mo | **~$1,285/mo (≈ 2.9× single)** | +| **Unlimited scale** | [`unlimited-scale/azure`](modules/unlimited-scale/azure) | 3× `Standard_D2s_v5` (VMSS min) | Standard LB + Azure Cache Redis + Postgres Flex Server primary + 2× replicas (`GP_Standard_D4ds_v5`) | ~$1,480/mo | 6 vCPU × 730h × $0.24 = ~$1,050/mo | **~$2,530/mo at min, ~$5,150/mo at 10 instances** | + +Cross-cloud parity is intentional: an AWS HA deployment and an Azure HA +deployment of the same product land within ~6% of each other at +procurement-grade sizing (AWS HA $1,215, Azure HA $1,285). The +delta is driven by Premium SSD vs gp3 and ALB-vs-Standard-LB pricing, +not by topology choices. Quote whichever cloud the customer's +finance team already has commitments with. + +### Azure Cache for Redis sizing + +Same role as AWS ElastiCache: shared session store + worker-lock +heartbeat. SKU + capacity scale together; **Basic is single-node and +rejected by module validation**. + +| SKU / capacity | RAM | Per-month | Use case | +|---|---|---|---| +| Standard C1 | 1 GB | ~$55 | HA hot-hot or VMSS up to 5 instances | +| Standard C2 | 2.5 GB | ~$110 | VMSS 5–10 instances | +| Standard C3 | 6 GB | ~$220 | VMSS 10–20 instances | +| Premium P1 | 6 GB | ~$420 | Zone-redundant primary; needed for ≥3-zone deployments or Redis persistence | + +## When prices change + +1. Update the canonical AWS table in `hailbytes-sat/docs/AWS_HA_DEPLOYMENT.md § Estimated monthly cost`. +2. 
Mirror the AWS change in the AWS rows above and the AWS meter table. +3. Update Azure rows if the change is cross-cloud (SKU sizing, meter + rate). Azure-only price drifts can be tracked in the per-module + Azure READMEs first, then synced here on the next cycle. +4. Spot-check per-module READMEs (`modules/single-vm/{aws,azure}/README.md`, + `modules/ha-hot-hot/{aws,azure}/README.md`, + `modules/unlimited-scale/{aws,azure}/README.md`) — only edit them + if the **Starter default** sizing changed; the procurement-grade + column should already link here. diff --git a/README.md b/README.md index fdd2d70..c72d3a3 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,7 @@ with the runbook in - [ARCHITECTURE.md](ARCHITECTURE.md) — per-tier diagrams and rationale, shared responsibility model - [BILLING.md](BILLING.md) — marketplace billing model, why no containers +- [COST_SHAPES.md](COST_SHAPES.md) — three AWS deployment shapes side-by-side (single / HA / unlimited-scale) with per-vCore meter and procurement-grade pricing - [SECURITY.md](SECURITY.md) — responsible disclosure - [SECURITY-DEFAULTS.md](SECURITY-DEFAULTS.md) — encryption / IMDSv2 / IAM / NSG defaults baked into modules - [docs/PATCHING_AND_MIGRATION.md](docs/PATCHING_AND_MIGRATION.md) — pre-patch backups, rolling-replace, auto-rollback, DB mode toggle diff --git a/modules/asm-aws-autoscale/main.tf b/modules/asm-aws-autoscale/main.tf index 1e263c5..1941aaa 100644 --- a/modules/asm-aws-autoscale/main.tf +++ b/modules/asm-aws-autoscale/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../unlimited-scale/aws" + source = "../unlimited-scale/aws" - product = "asm" + product = "asm" vpc_id = var.vpc_id public_subnet_ids = var.public_subnet_ids @@ -43,5 +43,23 @@ module "this" { rds_copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot schema_version_endpoint_path = var.schema_version_endpoint_path + # Shared session store (ElastiCache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_node_type = 
var.redis_node_type + redis_engine_version = var.redis_engine_version + redis_snapshot_retention_days = var.redis_snapshot_retention_days + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + enable_alb_deletion_protection = var.enable_alb_deletion_protection + + # RDS production hardening (opt-in) + rds_enhanced_monitoring_interval = var.rds_enhanced_monitoring_interval + rds_enabled_cloudwatch_log_types = var.rds_enabled_cloudwatch_log_types + rds_iam_authentication_enabled = var.rds_iam_authentication_enabled + rds_performance_insights_enabled = var.rds_performance_insights_enabled + rds_performance_insights_retention_days = var.rds_performance_insights_retention_days + tags = var.tags } diff --git a/modules/asm-aws-autoscale/variables.tf b/modules/asm-aws-autoscale/variables.tf index 22be4ec..a8c566b 100644 --- a/modules/asm-aws-autoscale/variables.tf +++ b/modules/asm-aws-autoscale/variables.tf @@ -205,6 +205,86 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +# ----- Shared session store (ElastiCache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group. Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. Scale up alongside ASG growth — cache.t4g.small handles 3-5 instances, cache.m6g.large handles 10-20+." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint. Pair with enable_managed_redis = false." 
+ type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true." + type = bool + default = true +} + + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval. 0 disables. CKV_AWS_118." + type = number + default = 0 +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM DB authentication. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights retention. 7 = free tier; 731 = long-term." 
+ type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/asm-aws-ha/main.tf b/modules/asm-aws-ha/main.tf index 2d773c3..be4301b 100644 --- a/modules/asm-aws-ha/main.tf +++ b/modules/asm-aws-ha/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../ha-hot-hot/aws" + source = "../ha-hot-hot/aws" - product = "asm" + product = "asm" vpc_id = var.vpc_id public_subnet_ids = var.public_subnet_ids @@ -41,5 +41,25 @@ module "this" { alert_email = var.alert_email schema_version_endpoint_path = var.schema_version_endpoint_path + # Shared session store (ElastiCache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_node_type = var.redis_node_type + redis_engine_version = var.redis_engine_version + redis_snapshot_retention_days = var.redis_snapshot_retention_days + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + enable_alb_deletion_protection = var.enable_alb_deletion_protection + enable_alb_access_logging = var.enable_alb_access_logging + alb_access_log_retention_days = var.alb_access_log_retention_days + + # RDS production hardening (opt-in) + rds_enhanced_monitoring_interval = var.rds_enhanced_monitoring_interval + rds_enabled_cloudwatch_log_types = var.rds_enabled_cloudwatch_log_types + rds_iam_authentication_enabled = var.rds_iam_authentication_enabled + rds_performance_insights_enabled = var.rds_performance_insights_enabled + rds_performance_insights_retention_days = var.rds_performance_insights_retention_days + tags = var.tags } diff --git a/modules/asm-aws-ha/variables.tf b/modules/asm-aws-ha/variables.tf index f98434a..4d719a1 100644 --- a/modules/asm-aws-ha/variables.tf +++ b/modules/asm-aws-ha/variables.tf @@ -211,6 +211,102 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +# ----- Shared session store (ElastiCache for Redis) 
----- + +variable "enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group for HailBytes shared sessions and worker locks. HA mode requires a shared Redis endpoint — set to false only if you supply redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. cache.t4g.small is the procurement-friendly default; raise for higher session-throughput deployments." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + description = "ElastiCache Redis engine version." + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + description = "Days ElastiCache retains daily snapshots. Sessions are recoverable from Postgres re-login, so this defaults to 0; raise if you want a Redis PITR window." + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of a customer-managed Redis endpoint. When non-null, the module skips its own ElastiCache replication group and wires the VMs at this host instead. Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + description = "Port on the customer-managed Redis endpoint. Ignored unless redis_endpoint_override is set." + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + description = "Whether the customer-managed Redis endpoint requires in-transit TLS. Ignored unless redis_endpoint_override is set." + type = bool + default = true +} + + +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true; dev/test override to false to let `terraform destroy` succeed." + type = bool + default = true +} + +variable "enable_alb_access_logging" { + description = "Provision an S3 bucket for ALB access logs and enable the listener access_logs block." 
+ type = bool + default = false +} + +variable "alb_access_log_retention_days" { + description = "Days to retain ALB access log objects." + type = number + default = 365 +} + + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval. 0 disables. CKV_AWS_118." + type = number + default = 0 +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM DB authentication. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights retention. 7 = free tier; 731 = long-term." + type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/asm-aws-single/main.tf b/modules/asm-aws-single/main.tf index 3a359a6..ebc1dbd 100644 --- a/modules/asm-aws-single/main.tf +++ b/modules/asm-aws-single/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../single-vm/aws" + source = "../single-vm/aws" - product = "asm" + product = "asm" vpc_id = var.vpc_id subnet_id = var.subnet_id diff --git a/modules/asm-azure-autoscale/main.tf b/modules/asm-azure-autoscale/main.tf index 2391351..52591ec 100644 --- a/modules/asm-azure-autoscale/main.tf +++ b/modules/asm-azure-autoscale/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../unlimited-scale/azure" + source = "../unlimited-scale/azure" - product = "asm" + product = "asm" resource_group_name = var.resource_group_name location = var.location @@ -46,6 +46,20 @@ module "this" { waf_policy_id = var.waf_policy_id refresh_rollback_5xx_count_threshold = var.refresh_rollback_5xx_count_threshold 
schema_version_endpoint_path = var.schema_version_endpoint_path + enable_post_patch_run_command = var.enable_post_patch_run_command + + # Shared session store (Azure Cache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_sku_name = var.redis_sku_name + redis_family = var.redis_family + redis_capacity = var.redis_capacity + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + db_secret_expiration_hours = var.db_secret_expiration_hours + + postgres_geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled tags = var.tags } diff --git a/modules/asm-azure-autoscale/variables.tf b/modules/asm-azure-autoscale/variables.tf index 9ae3a83..82e0cd1 100644 --- a/modules/asm-azure-autoscale/variables.tf +++ b/modules/asm-azure-autoscale/variables.tf @@ -206,6 +206,67 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "enable_post_patch_run_command" { + description = "Install a VMSS extension named RunPostPatchVerify mirroring the AWS asm-aws-autoscale aws_ssm_document.post_patch_verify." + type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis. Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard or Premium only (Basic is single-node)." + type = string + default = "Standard" +} + +variable "redis_family" { + type = string + default = "C" +} + +variable "redis_capacity" { + description = "Redis capacity (size index). 0-6 for Standard. Scale alongside VMSS instance count." 
+ type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Default 8760 = one calendar year." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on Postgres Flexible Server. CKV_AZURE_136." + type = bool + default = false +} + variable "tags" { type = map(string) default = {} diff --git a/modules/asm-azure-ha/main.tf b/modules/asm-azure-ha/main.tf index 4dc0abf..7b12179 100644 --- a/modules/asm-azure-ha/main.tf +++ b/modules/asm-azure-ha/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../ha-hot-hot/azure" + source = "../ha-hot-hot/azure" - product = "asm" + product = "asm" resource_group_name = var.resource_group_name location = var.location @@ -45,6 +45,20 @@ module "this" { alert_email = var.alert_email refresh_rollback_5xx_count_threshold = var.refresh_rollback_5xx_count_threshold schema_version_endpoint_path = var.schema_version_endpoint_path + enable_post_patch_run_command = var.enable_post_patch_run_command + + # Shared session store (Azure Cache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_sku_name = var.redis_sku_name + redis_family = var.redis_family + redis_capacity = var.redis_capacity + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + db_secret_expiration_hours = var.db_secret_expiration_hours + + postgres_geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled tags = var.tags } diff --git 
a/modules/asm-azure-ha/variables.tf b/modules/asm-azure-ha/variables.tf index b683c9d..15acbdd 100644 --- a/modules/asm-azure-ha/variables.tf +++ b/modules/asm-azure-ha/variables.tf @@ -228,6 +228,68 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "enable_post_patch_run_command" { + description = "Install an Azure Run Command document named RunPostPatchVerify on each VM, mirroring the AWS asm-aws-ha aws_ssm_document.post_patch_verify." + type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis (Standard or Premium SKU, zone-redundant in Premium). Required for HA; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard delivers a primary/replica pair across two zones; Premium adds persistence and explicit zone selection. Basic is single-node and NOT a valid HA option." + type = string + default = "Standard" +} + +variable "redis_family" { + description = "Redis SKU family. 'C' = Standard/Basic, 'P' = Premium. Must match redis_sku_name." + type = string + default = "C" +} + +variable "redis_capacity" { + description = "Redis capacity (size index). For SKU=Standard / family=C, valid values are 0 (250MB) through 6 (53GB). The procurement-friendly default is 1 (1GB)." + type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint (Azure Cache, self-managed Redis Sentinel, etc.). Pair with enable_managed_redis = false." 
+ type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Default 8760 = one calendar year." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on Postgres Flexible Server. CKV_AZURE_136." + type = bool + default = false +} + variable "tags" { type = map(string) default = {} diff --git a/modules/asm-azure-single/main.tf b/modules/asm-azure-single/main.tf index 29b689f..1a17701 100644 --- a/modules/asm-azure-single/main.tf +++ b/modules/asm-azure-single/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../single-vm/azure" + source = "../single-vm/azure" - product = "asm" + product = "asm" resource_group_name = var.resource_group_name location = var.location diff --git a/modules/ha-hot-hot/aws/README.md b/modules/ha-hot-hot/aws/README.md index 6578ffe..5dc8682 100644 --- a/modules/ha-hot-hot/aws/README.md +++ b/modules/ha-hot-hot/aws/README.md @@ -14,32 +14,57 @@ flowchart TB ALB --> VM2[(EC2 #2
AZ-b
Marketplace AMI)] VM1 --> SM[(Secrets Manager
DB creds)] VM2 --> SM + VM1 -->|TLS| RDS[(ElastiCache Redis
Multi-AZ replication group
sessions + worker locks)] + VM2 -->|TLS| RDS VM1 -->|TLS| DB[(RDS PostgreSQL
Multi-AZ primary)] VM2 -->|TLS| DB DB -.synchronous replication.-> DBS[(Standby in second AZ)] + RDS -.automatic failover.-> RDSS[(Replica in second AZ)] ``` ## Cost estimate (us-east-1, on-demand) -| Component | Default | ~Monthly | -|---|---|---| -| 2× EC2 `t3.large` | 24/7 | $120 | -| 2× EBS gp3 root | 50 GB | $8 | -| 2× EBS gp3 data | 200 GB | $32 | -| Application Load Balancer | + LCU | $25 | -| RDS `db.t3.medium` Multi-AZ | 100 GB gp3 | $180 | -| RDS backups | retained | $10 | -| Secrets Manager | 1 secret | $0.40 | -| KMS (if enabled) | 1 | $1 + usage | -| **Total infrastructure** | | **~$375/month** | -| **HailBytes marketplace software fee** | per listing, billed per VM-hour | **separate, x2 hours** | +Two reference shapes. The defaults below are the **starter** shape; the +**procurement-grade** shape (right column) matches `hailbytes-sat/docs/AWS_HA_DEPLOYMENT.md` +and the customer-facing pricing the account team quotes. Pick the shape +that matches your sizing before sharing numbers with finance. + +For the three-shape (single / HA / unlimited-scale) comparison and the +canonical procurement-grade source, see +[`COST_SHAPES.md`](../../../COST_SHAPES.md). 
+ +| Component | Starter default | ~Monthly | Procurement-grade variable / value | ~Monthly | +|---|---|---|---|---| +| 2× EC2 SAT/ASM | `instance_type = "t3.large"` | $120 | `instance_type = "m6i.large"` | $140 | +| 2× EBS gp3 root | 50 GB | $8 | 50 GB | $8 | +| 2× EBS gp3 data | `data_volume_size_gb = 200` | $32 | `data_volume_size_gb = 200` | $32 | +| Application Load Balancer | + LCU | $25 | + LCU | $25 | +| ElastiCache Redis Multi-AZ | `redis_node_type = "cache.t4g.small"` | $50 | `redis_node_type = "cache.t4g.small"` | $50 | +| RDS Multi-AZ (`db_mode = "rds"`) | `db_instance_class = "db.t3.medium"` (100 GB gp3) | $180 | `db_instance_class = "db.m6g.large"` (100 GB gp3) | $230 | +| RDS backups | retained | $10 | retained | $10 | +| Cross-AZ data transfer | minimal | $10 | minimal | $20 | +| Secrets Manager | 1 secret | $0.40 | 1 secret | $0.40 | +| KMS (if enabled) | 1 | $1 + usage | 1 | $1 + usage | +| **Total infrastructure** | | **~$435/month** | | **~$515/month** | +| **HailBytes marketplace software fee** ($0.24/vCPU-hr) | 4 vCPU × 730h | **~$700** | 4 vCPU × 730h | **~$700** | +| **All-in (infra + meter)** | | **~$1,135/month** | | **~$1,215/month** | + +Single-instance reference (for the procurement delta the account team +quotes — Asiera/HEAnet etc.): ~$435/month all-in (1× `m6i.large`, +co-located Postgres, no ALB, no Redis, no managed DB). HA lands at +roughly **2.6–2.8× a single-instance bill**. + +For the **`db_mode = "ec2"`** path (self-managed Postgres on a third +EC2), drop the RDS line and add ~$70/month for the third `m6i.large` +plus another 200 GB of gp3 (~$16/month); the meter grows to 6 vCPU (~$1,050). All-in lands at roughly +**~$1,395/month (≈ 3.2× single)** at procurement-grade sizing. 
## Prerequisites - VPC with at least 2 public subnets (for ALB) and 2 private subnets in different AZs - ACM certificate in the same region (for the HTTPS listener) - Marketplace subscription active -- IAM permissions to create EC2, ALB, RDS, IAM, KMS, Secrets Manager +- IAM permissions to create EC2, ALB, RDS, ElastiCache, IAM, KMS, Secrets Manager ## Usage diff --git a/modules/ha-hot-hot/aws/main.tf b/modules/ha-hot-hot/aws/main.tf index b9ef592..f20e880 100644 --- a/modules/ha-hot-hot/aws/main.tf +++ b/modules/ha-hot-hot/aws/main.tf @@ -30,16 +30,25 @@ locals { # Pick two private subnets for two VMs (one per AZ). For >2 subnets we take the first 2. vm_subnets = slice(var.private_subnet_ids, 0, 2) - use_rds = var.db_mode == "rds" - use_ec2_db = var.db_mode == "ec2" - create_backup_bucket = var.create_backup_bucket - effective_backup_bucket = local.create_backup_bucket ? aws_s3_bucket.backup[0].id : var.backup_bucket_name - backup_object_prefix = "hailbytes-${var.product}-" + use_rds = var.db_mode == "rds" + use_ec2_db = var.db_mode == "ec2" + create_backup_bucket = var.create_backup_bucket + effective_backup_bucket = local.create_backup_bucket ? aws_s3_bucket.backup[0].id : var.backup_bucket_name + backup_object_prefix = "hailbytes-${var.product}-" + create_alb_access_logs_bucket = var.enable_alb_access_logging db_host = coalesce(one(aws_db_instance.main[*].address), one(aws_instance.db_ec2[*].private_ip)) db_port = local.use_rds ? coalesce(one(aws_db_instance.main[*].port), 5432) : 5432 db_arn = coalesce(one(aws_db_instance.main[*].arn), one(aws_instance.db_ec2[*].arn)) db_id = coalesce(one(aws_db_instance.main[*].id), one(aws_instance.db_ec2[*].id)) + + # Redis is required for HA: both app instances must share session state and + # the worker-lock heartbeat through the same Redis. The module provisions + # ElastiCache by default; customers with an existing Redis can supply an + # endpoint via var.redis_endpoint_override and set enable_managed_redis = false. 
+ provision_managed_redis = var.enable_managed_redis && var.redis_endpoint_override == null + effective_redis_host = local.provision_managed_redis ? one(aws_elasticache_replication_group.main[*].primary_endpoint_address) : var.redis_endpoint_override + effective_redis_port = local.provision_managed_redis ? 6379 : var.redis_endpoint_override_port } data "aws_caller_identity" "current" {} @@ -148,6 +157,24 @@ resource "aws_vpc_security_group_ingress_rule" "db_from_vm" { description = "Postgres from VMs" } +resource "aws_security_group" "redis" { + count = local.provision_managed_redis ? 1 : 0 + name = "${local.name_prefix}-redis-sg" + description = "ElastiCache Redis ingress from VMs" + vpc_id = var.vpc_id + tags = local.common_tags +} + +resource "aws_vpc_security_group_ingress_rule" "redis_from_vm" { + count = local.provision_managed_redis ? 1 : 0 + security_group_id = aws_security_group.redis[0].id + referenced_security_group_id = aws_security_group.vm.id + from_port = 6379 + to_port = 6379 + ip_protocol = "tcp" + description = "Redis from VMs" +} + # ----- IAM ----- resource "aws_iam_role" "vm" { @@ -282,9 +309,14 @@ resource "aws_db_instance" "main" { final_snapshot_identifier = var.db_deletion_protection ? "${local.name_prefix}-final-${formatdate("YYYYMMDD-hhmmss", timestamp())}" : null copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot - performance_insights_enabled = true - enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] - auto_minor_version_upgrade = true + iam_database_authentication_enabled = var.rds_iam_authentication_enabled + performance_insights_enabled = var.rds_performance_insights_enabled + performance_insights_kms_key_id = var.rds_performance_insights_enabled && var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null + performance_insights_retention_period = var.rds_performance_insights_enabled ? 
var.rds_performance_insights_retention_days : null + monitoring_interval = var.rds_enhanced_monitoring_interval + monitoring_role_arn = var.rds_enhanced_monitoring_interval > 0 ? aws_iam_role.rds_monitoring[0].arn : null + enabled_cloudwatch_logs_exports = var.rds_enabled_cloudwatch_log_types + auto_minor_version_upgrade = true tags = local.common_tags @@ -336,8 +368,8 @@ resource "aws_iam_role_policy" "db_ec2_backup" { Resource = aws_secretsmanager_secret.db.arn }, { - Effect = "Allow" - Action = ["ec2:CreateSnapshot", "ec2:CreateTags", "ec2:DescribeVolumes", "ec2:DescribeSnapshots"] + Effect = "Allow" + Action = ["ec2:CreateSnapshot", "ec2:CreateTags", "ec2:DescribeVolumes", "ec2:DescribeSnapshots"] Resource = "*" }, ] @@ -425,7 +457,7 @@ resource "aws_instance" "db_ec2" { # no special handling — it sees the same Secrets Manager secret shape as in # RDS mode. user_data_replace_on_change = false - user_data = <<-EOF + user_data = <<-EOF #cloud-config package_update: true packages: @@ -502,6 +534,43 @@ resource "aws_volume_attachment" "db_data" { instance_id = aws_instance.db_ec2[0].id } +# ----- Shared session store: ElastiCache for Redis (Multi-AZ) ----- +# +# HailBytes SAT / ASM both keep session state in Redis when running in HA. Without +# a shared Redis endpoint each VM falls back to in-memory sessions, which breaks +# every cross-instance login and worker-lock claim. This block provisions a +# Multi-AZ replication group by default; set enable_managed_redis = false and +# pass redis_endpoint_override to point at a customer-owned cache instead. + +resource "aws_elasticache_subnet_group" "main" { + count = local.provision_managed_redis ? 1 : 0 + name = "${local.name_prefix}-redis-subnets" + subnet_ids = local.vm_subnets + tags = local.common_tags +} + +resource "aws_elasticache_replication_group" "main" { + count = local.provision_managed_redis ? 
1 : 0 + replication_group_id = "${local.name_prefix}-redis" + description = "HailBytes ${var.product} session store + worker lock" + engine = "redis" + engine_version = var.redis_engine_version + node_type = var.redis_node_type + num_cache_clusters = 2 + automatic_failover_enabled = true + multi_az_enabled = true + port = 6379 + parameter_group_name = "default.redis7" + subnet_group_name = aws_elasticache_subnet_group.main[0].name + security_group_ids = [aws_security_group.redis[0].id] + at_rest_encryption_enabled = true + transit_encryption_enabled = true + kms_key_id = var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null + snapshot_retention_limit = var.redis_snapshot_retention_days + apply_immediately = false + tags = local.common_tags +} + # ----- VMs (one per AZ, active/active) ----- resource "aws_instance" "vm" { @@ -538,8 +607,10 @@ resource "aws_instance" "vm" { kms_key_id = var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null } - # The marketplace image reads these on first boot to wire itself to the shared DB. - # Values are not sensitive (they reference the secret ARN, not the password). + # The marketplace image reads these on first boot to wire itself to the shared + # DB and Redis. Values are not sensitive (they reference the secret ARN, not + # the password). Redis is required for HA; without it the second VM cannot + # share sessions or worker-lock state with the first. user_data = base64encode(jsonencode({ hailbytes = { mode = "ha" @@ -548,11 +619,14 @@ resource "aws_instance" "vm" { db_secret_region = data.aws_region.current.id product = var.product cluster_member_idx = count.index + redis_host = local.effective_redis_host + redis_port = local.effective_redis_port + redis_tls = local.provision_managed_redis ? 
true : var.redis_endpoint_override_tls } })) tags = merge(local.common_tags, { - Name = "${local.name_prefix}-vm-${count.index + 1}" + Name = "${local.name_prefix}-vm-${count.index + 1}" "hailbytes-${var.product}" = "true" }) @@ -560,7 +634,7 @@ resource "aws_instance" "vm" { ignore_changes = [ami, user_data] } - depends_on = [aws_db_instance.main, aws_instance.db_ec2] + depends_on = [aws_db_instance.main, aws_instance.db_ec2, aws_elasticache_replication_group.main] } data "aws_region" "current" {} @@ -577,6 +651,16 @@ resource "aws_lb" "main" { drop_invalid_header_fields = true enable_http2 = true + enable_deletion_protection = var.enable_alb_deletion_protection + + dynamic "access_logs" { + for_each = local.create_alb_access_logs_bucket ? [1] : [] + content { + bucket = aws_s3_bucket.alb_logs[0].id + prefix = "alb" + enabled = true + } + } tags = local.common_tags } @@ -818,6 +902,15 @@ resource "aws_s3_bucket_lifecycle_configuration" "backup" { count = local.create_backup_bucket ? 1 : 0 bucket = aws_s3_bucket.backup[0].id + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } + rule { id = "tier-and-expire" status = "Enabled" @@ -915,7 +1008,7 @@ resource "aws_ssm_document" "pre_patch_backup" { "export AWS_S3_PREFIX=\"${local.backup_object_prefix}$${TS}\"", "export HAILBYTES_DB_SECRET_ARN='${aws_secretsmanager_secret.db.arn}'", "export AWS_DEFAULT_REGION='${data.aws_region.current.id}'", - "if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh; else echo 'WARN: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this AMI; skipping local bundle.'; fi", + "if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh; else echo 'ERROR: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this AMI. 
Rebuild from main; the Packer provision.sh now installs the script.' >&2; exit 1; fi", "SNAP_ID='{{ snapshotIdentifier }}'", "if [ -z \"$SNAP_ID\" ]; then SNAP_ID=\"${local.name_prefix}-pre-patch-$${TS}\"; fi", "if [ '${var.db_mode}' = 'rds' ]; then aws rds create-db-snapshot --db-instance-identifier '${try(aws_db_instance.main[0].id, "")}' --db-snapshot-identifier \"$SNAP_ID\" --tags Key=Module,Value=hailbytes-terraform-modules Key=Phase,Value=pre-patch; else VOL='${try(aws_ebs_volume.db_data[0].id, "")}'; if [ -n \"$VOL\" ]; then aws ec2 create-snapshot --volume-id \"$VOL\" --description \"hailbytes-${var.product} pre-patch $${TS}\" --tag-specifications \"ResourceType=snapshot,Tags=[{Key=Module,Value=hailbytes-terraform-modules},{Key=Phase,Value=pre-patch},{Key=Name,Value=$$SNAP_ID}]\"; fi; fi", @@ -925,3 +1018,161 @@ resource "aws_ssm_document" "pre_patch_backup" { ] }) } + +# ----- SSM Run Command document: post-patch verify ----- + +resource "aws_ssm_document" "post_patch_verify" { + name = "${local.name_prefix}-post-patch-verify" + document_type = "Command" + document_format = "YAML" + target_type = "/AWS::EC2::Instance" + tags = local.common_tags + + content = yamlencode({ + schemaVersion = "2.2" + description = "HailBytes SAT/ASM post-patch verification. Runs the five-probe verifier shipped with the AMI: /api/ready, schema-version regression, encryption-key fingerprint, worker-lock health, sample SMTP/credential decrypt." + parameters = { + schemaVersionPath = { + type = "String" + description = "Path to the schema-version endpoint." + default = var.schema_version_endpoint_path + } + minSchemaVersion = { + type = "String" + description = "Optional integer floor that the running schema version must meet or exceed. Empty string skips the regression check." 
+ default = "" + } + } + mainSteps = [ + { + action = "aws:runShellScript" + name = "postPatchVerify" + inputs = { + timeoutSeconds = "600" + runCommand = [ + "set -euo pipefail", + "export HAILBYTES_SCHEMA_VERSION_PATH='{{ schemaVersionPath }}'", + "export HAILBYTES_MIN_SCHEMA_VERSION='{{ minSchemaVersion }}'", + "if [ -x /opt/hailbytes/bin/ha-post-patch-verify.sh ]; then sudo -E /opt/hailbytes/bin/ha-post-patch-verify.sh; else echo 'ERROR: /opt/hailbytes/bin/ha-post-patch-verify.sh not present on this AMI.'; exit 1; fi", + ] + } + } + ] + }) +} + +# ----- ALB access logs (optional) ----- +# +# CKV_AWS_91 wants every ALB to have access logging enabled. We make it +# opt-in (default off) since it adds a second S3 bucket and a small +# log-volume cost; production deployments should turn it on. + +data "aws_elb_service_account" "main" { + count = local.create_alb_access_logs_bucket ? 1 : 0 +} + +resource "aws_s3_bucket" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = "${local.name_prefix}-alb-logs-${data.aws_caller_identity.current.account_id}" + force_destroy = false + tags = local.common_tags +} + +resource "aws_s3_bucket_versioning" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = aws_s3_bucket.alb_logs[0].id + versioning_configuration { + status = "Enabled" + } +} + +resource "aws_s3_bucket_public_access_block" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = aws_s3_bucket.alb_logs[0].id + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = aws_s3_bucket.alb_logs[0].id + rule { + apply_server_side_encryption_by_default { + sse_algorithm = var.enable_customer_managed_key ? "aws:kms" : "AES256" + kms_master_key_id = var.enable_customer_managed_key ? 
aws_kms_key.main[0].arn : null + } + } +} + +resource "aws_s3_bucket_lifecycle_configuration" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = aws_s3_bucket.alb_logs[0].id + + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } + + rule { + id = "expire-and-archive" + status = "Enabled" + filter { + prefix = "alb/" + } + transition { + days = 30 + storage_class = "STANDARD_IA" + } + expiration { + days = var.alb_access_log_retention_days + } + } +} + +resource "aws_s3_bucket_policy" "alb_logs" { + count = local.create_alb_access_logs_bucket ? 1 : 0 + bucket = aws_s3_bucket.alb_logs[0].id + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { AWS = data.aws_elb_service_account.main[0].arn } + Action = "s3:PutObject" + Resource = "${aws_s3_bucket.alb_logs[0].arn}/*" + }] + }) +} + +# ----- RDS enhanced monitoring IAM role (conditional) ----- +# +# Only provisioned when var.rds_enhanced_monitoring_interval > 0. +# CKV_AWS_118 wants this on production deployments; we keep it +# opt-in because it adds ~$15/mo in CloudWatch ingestion at the +# default 60-second interval. + +resource "aws_iam_role" "rds_monitoring" { + count = local.use_rds && var.rds_enhanced_monitoring_interval > 0 ? 1 : 0 + name = "${local.name_prefix}-rds-monitoring" + tags = local.common_tags + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "monitoring.rds.amazonaws.com" } + }] + }) +} + +resource "aws_iam_role_policy_attachment" "rds_monitoring" { + count = local.use_rds && var.rds_enhanced_monitoring_interval > 0 ? 
1 : 0 + role = aws_iam_role.rds_monitoring[0].name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" +} diff --git a/modules/ha-hot-hot/aws/outputs.tf b/modules/ha-hot-hot/aws/outputs.tf index e37442d..1279f05 100644 --- a/modules/ha-hot-hot/aws/outputs.tf +++ b/modules/ha-hot-hot/aws/outputs.tf @@ -54,6 +54,11 @@ output "pre_patch_ssm_document_name" { value = aws_ssm_document.pre_patch_backup.name } +output "post_patch_ssm_document_name" { + description = "Name of the AWS Systems Manager Run Command document that runs the on-VM five-probe post-patch verifier." + value = aws_ssm_document.post_patch_verify.name +} + output "schema_version_endpoint" { description = "HTTPS URL that returns the running schema version. Used by post-patch verify scripts." value = "https://${aws_lb.main.dns_name}${var.schema_version_endpoint_path}" @@ -75,8 +80,19 @@ output "ami_id" { output "security_group_ids" { value = { - alb = aws_security_group.alb.id - vm = aws_security_group.vm.id - db = aws_security_group.db.id + alb = aws_security_group.alb.id + vm = aws_security_group.vm.id + db = aws_security_group.db.id + redis = local.provision_managed_redis ? aws_security_group.redis[0].id : null } } + +output "redis_endpoint" { + description = "Host:port of the Redis endpoint wired into the HA VMs. Either the module-provisioned ElastiCache replication group or var.redis_endpoint_override." + value = local.effective_redis_host == null ? "" : "${local.effective_redis_host}:${local.effective_redis_port}" +} + +output "redis_mode" { + description = "How Redis is wired: 'managed' (this module provisioned ElastiCache), 'override' (customer-supplied endpoint), or 'disabled' (HA is not actually safe)." + value = local.provision_managed_redis ? "managed" : (var.redis_endpoint_override == null ? 
"disabled" : "override") +} diff --git a/modules/ha-hot-hot/aws/variables.tf b/modules/ha-hot-hot/aws/variables.tf index 010cc35..3ebdeda 100644 --- a/modules/ha-hot-hot/aws/variables.tf +++ b/modules/ha-hot-hot/aws/variables.tf @@ -74,6 +74,50 @@ variable "db_instance_class" { default = "db.t3.medium" } +# ----- Shared session store (ElastiCache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group for HailBytes shared sessions and worker locks. HA mode requires a shared Redis endpoint — set to false only if you supply redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. cache.t4g.small is the procurement-friendly default; raise for higher session-throughput deployments." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + description = "ElastiCache Redis engine version." + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + description = "Days ElastiCache retains daily snapshots. Sessions are recoverable from Postgres re-login, so this defaults to 0; raise if you want a Redis PITR window." + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of a customer-managed Redis endpoint (e.g. existing ElastiCache, MemoryDB, or self-managed Redis Sentinel). When non-null, the module skips its own ElastiCache replication group and wires the VMs at this host instead. Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + description = "Port on the customer-managed Redis endpoint. Ignored unless redis_endpoint_override is set." + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + description = "Whether the customer-managed Redis endpoint requires in-transit TLS. Ignored unless redis_endpoint_override is set." 
+ type = bool + default = true +} + variable "db_allocated_storage_gb" { type = number default = 100 @@ -112,6 +156,24 @@ variable "alb_idle_timeout_seconds" { default = 120 } +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true; production deployments should keep this on. Set to false in dev/test sandboxes where you want `terraform destroy` to succeed without manual cleanup." + type = bool + default = true +} + +variable "enable_alb_access_logging" { + description = "Provision an S3 bucket for ALB access logs and enable the load balancer's access_logs block. Adds ~$1-5/mo storage cost depending on traffic; recommended for production deployments where the access log is part of the audit trail." + type = bool + default = false +} + +variable "alb_access_log_retention_days" { + description = "Days to retain ALB access log objects before lifecycle expiration. Default 365 (one calendar year) — long enough for most compliance lookback windows." + type = number + default = 365 +} + variable "alb_min_tls_version" { description = "Minimum TLS version on the ALB HTTPS listener." type = string @@ -220,6 +282,43 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval in seconds (0, 1, 5, 10, 15, 30, 60). 0 disables enhanced monitoring. Default 0; production deployments typically set 60. CKV_AWS_118." + type = number + default = 0 + validation { + condition = contains([0, 1, 5, 10, 15, 30, 60], var.rds_enhanced_monitoring_interval) + error_message = "rds_enhanced_monitoring_interval must be one of: 0, 1, 5, 10, 15, 30, 60." + } +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. Empty list = no log exports (cost-saving default). Production should set to [\"postgresql\", \"upgrade\"]. 
CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM database authentication on the RDS instance. Adds app-side complexity (psql connections must mint IAM tokens) but eliminates long-lived passwords. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. Adds ~$0/instance for 7-day retention (free tier); KMS-encrypted automatically when enable_customer_managed_key is also set. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights data retention. 7 = free tier (default); 731 = long-term retention (paid)." + type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/ha-hot-hot/azure/README.md b/modules/ha-hot-hot/azure/README.md index c442000..96640f5 100644 --- a/modules/ha-hot-hot/azure/README.md +++ b/modules/ha-hot-hot/azure/README.md @@ -14,24 +14,34 @@ flowchart TB LB --> VM2[(VM #2
Zone 2
Marketplace image)] VM1 --> KV[(Key Vault
DB password)] VM2 --> KV + VM1 -->|TLS| RC[(Azure Cache for Redis
Standard or Premium
sessions + worker locks)] + VM2 -->|TLS| RC VM1 -->|TLS, vnet-integrated| PG[(Postgres Flexible Server
ZoneRedundant HA primary)] VM2 -->|TLS, vnet-integrated| PG PG -.replication.-> PGS[(Standby in second zone)] + RC -.failover.-> RCS[(Replica in second zone)] ``` ## Cost estimate (East US, pay-as-you-go) +For the three-shape AWS comparison and the canonical procurement-grade +source, see [`COST_SHAPES.md`](../../../COST_SHAPES.md). Azure pricing +below is the Azure equivalent of the AWS HA table; the marketplace +meter and tier sizing are aligned. + | Component | Default | ~Monthly | |---|---|---| | 2× `Standard_D2s_v5` | 24/7 | $140 | | 2× Premium SSD OS | 64 GB | $20 | | 2× Premium SSD data | 256 GB | $70 | | Standard Load Balancer + 1 rule | | $25 | +| Azure Cache for Redis (`Standard C1`, zone-redundant primary/replica) | shared session store | $55 | | Postgres Flexible Server `GP_Standard_D2ds_v5` Zone-Redundant | 128 GB | $260 | | Postgres backups | retained 14d | $15 | | Key Vault | secrets ops | $1 | -| **Total infrastructure** | | **~$530/month** | -| **HailBytes marketplace software fee** | per VM-hour | **separate, x2 hours** | +| **Total infrastructure** | | **~$585/month** | +| **HailBytes marketplace software fee** ($0.24/vCPU-hr) | 4 vCPU × 730h | **~$700/mo** | +| **All-in (procurement-grade)** | | **~$1,285/month** | ## Prerequisites @@ -40,6 +50,7 @@ flowchart TB - A subnet **delegated** to `Microsoft.DBforPostgreSQL/flexibleServers` (`db_delegated_subnet_id`) - A private DNS zone `privatelink.postgres.database.azure.com` linked to the vnet (`private_dns_zone_id`) - Marketplace subscription accepted (handled by module unless you set `accept_marketplace_terms = false`) +- Subscription-level permissions to provision Azure Cache for Redis (Standard tier or higher — Basic is single-node and not HA-safe) ## Usage diff --git a/modules/ha-hot-hot/azure/main.tf b/modules/ha-hot-hot/azure/main.tf index dd5e4a3..76cfaf4 100644 --- a/modules/ha-hot-hot/azure/main.tf +++ b/modules/ha-hot-hot/azure/main.tf @@ -47,7 +47,16 @@ locals { backup_container_name = 
"hailbytes-${var.product}-bundles" enable_application_gateway = var.enable_application_gateway - appgw_endpoint = local.enable_application_gateway ? azurerm_public_ip.appgw[0].ip_address : azurerm_public_ip.lb.ip_address + appgw_endpoint = local.enable_application_gateway ? azurerm_public_ip.appgw[0].ip_address : azurerm_public_ip.lb.ip_address + + # Shared session store: required by HA SAT/ASM. Without a shared + # Redis, both VMs fall back to in-memory sessions and the LB + # cookie-reshuffle becomes user-visible. Default provisions an + # Azure Cache for Redis; customers with an existing cache supply + # var.redis_endpoint_override and set enable_managed_redis = false. + provision_managed_redis = var.enable_managed_redis && var.redis_endpoint_override == null + effective_redis_host = local.provision_managed_redis ? one(azurerm_redis_cache.main[*].hostname) : var.redis_endpoint_override + effective_redis_port = local.provision_managed_redis ? 6380 : var.redis_endpoint_override_port } resource "azurerm_marketplace_agreement" "hailbytes" { @@ -90,6 +99,18 @@ resource "azurerm_key_vault_secret" "db" { name = "hailbytes-db-password" value = random_password.db.result key_vault_id = azurerm_key_vault.main.id + # Content type satisfies CKV_AZURE_114 (identify secret semantics for + # rotation tooling) and expiration_date satisfies CKV_AZURE_41 (every + # secret has a rotation deadline). The expiration is intentionally + # advisory — the password is regenerated by the module when a new + # apply runs random_password.db; expiration is informational for KV + # secret-rotation alerting. 
+ content_type = "application/x-postgresql-password" + expiration_date = timeadd(timestamp(), "${var.db_secret_expiration_hours}h") + + lifecycle { + ignore_changes = [expiration_date] + } depends_on = [azurerm_role_assignment.kv_secret_writer] } @@ -242,7 +263,9 @@ resource "azurerm_linux_virtual_machine" "vm" { boot_diagnostics {} - # The marketplace image reads instance metadata (tags) to wire itself to the shared DB. + # The marketplace image reads instance metadata (tags) to wire itself to the + # shared DB and Redis. Redis is required for HA; without it the second VM + # cannot share sessions or worker-lock state with the first. custom_data = base64encode(jsonencode({ hailbytes = { mode = "ha" @@ -252,6 +275,9 @@ resource "azurerm_linux_virtual_machine" "vm" { db_fqdn = local.db_host product = var.product cluster_member_idx = count.index + redis_host = local.effective_redis_host + redis_port = local.effective_redis_port + redis_tls = local.provision_managed_redis ? true : var.redis_endpoint_override_tls } })) @@ -259,6 +285,7 @@ resource "azurerm_linux_virtual_machine" "vm" { azurerm_marketplace_agreement.hailbytes, azurerm_postgresql_flexible_server.main, azurerm_linux_virtual_machine.db_vm, + azurerm_redis_cache.main, ] } @@ -284,6 +311,36 @@ resource "azurerm_virtual_machine_data_disk_attachment" "data" { caching = "ReadWrite" } +# ----- Shared session store: Azure Cache for Redis (zone-redundant) ----- +# +# HA SAT/ASM require a shared Redis endpoint for cross-instance sessions +# and the worker-lock heartbeat. Without it both VMs fall back to +# in-memory sessions and the LB's cookie reshuffle becomes a user- +# visible logout. Equivalent to AWS ElastiCache; the asm-aws-ha and +# sat-aws-ha modules provision the same shape with the same defaults. + +resource "azurerm_redis_cache" "main" { + count = local.provision_managed_redis ? 
1 : 0 + name = "${local.name_prefix}-redis" + resource_group_name = var.resource_group_name + location = var.location + capacity = var.redis_capacity + family = var.redis_family + sku_name = var.redis_sku_name + non_ssl_port_enabled = false + minimum_tls_version = "1.2" + public_network_access_enabled = false + # Standard / Premium SKUs deliver a Multi-AZ primary/replica pair; + # the Basic SKU is single-node and therefore not a valid HA option + # — validated in variables.tf. + zones = var.redis_sku_name == "Premium" ? ["1", "2"] : null + tags = local.common_tags + + redis_configuration { + maxmemory_policy = "allkeys-lru" + } +} + # ----- Postgres backend (Flexible Server in default mode; self-managed VM in 'vm' mode) ----- # # Flexible Server is the recommended production backend. Customers who must @@ -310,7 +367,7 @@ resource "azurerm_postgresql_flexible_server" "main" { private_dns_zone_id = var.private_dns_zone_id backup_retention_days = var.db_backup_retention_days - geo_redundant_backup_enabled = false + geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled high_availability { mode = var.db_high_availability_mode @@ -521,14 +578,14 @@ resource "azurerm_storage_account" "backup" { count = local.create_backup_storage ? 
1 : 0 name = coalesce(var.backup_storage_account_name, substr(replace("${local.name_prefix}backup", "-", ""), 0, 24)) resource_group_name = var.resource_group_name + public_network_access_enabled = false + allow_nested_items_to_be_public = false location = var.location account_tier = "Standard" account_replication_type = var.backup_storage_replication account_kind = "StorageV2" access_tier = "Cool" min_tls_version = "TLS1_2" - allow_nested_items_to_be_public = false - public_network_access_enabled = true shared_access_key_enabled = false tags = local.common_tags @@ -564,7 +621,7 @@ resource "azurerm_storage_management_policy" "backup" { version { change_tier_to_cool_after_days_since_creation = 30 change_tier_to_archive_after_days_since_creation = 90 - delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days + delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days } } } @@ -622,7 +679,9 @@ resource "azurerm_virtual_machine_run_command" "pre_patch_backup" { if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh else - echo "WARN: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present; skipping local bundle." + echo "ERROR: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this VM image." >&2 + echo " Rebuild the marketplace image from main; provision.sh installs the script." >&2 + exit 1 fi az login --identity --allow-no-subscriptions >/dev/null DB_MODE='${var.db_mode}' @@ -644,6 +703,35 @@ resource "azurerm_virtual_machine_run_command" "pre_patch_backup" { } } +# ----- Post-patch verify Run Command ----- +# +# Mirrors the AWS aws_ssm_document.post_patch_verify in +# modules/ha-hot-hot/aws/main.tf. Customers run this from Azure Portal +# under Operations -> Run command -> RunPostPatchVerify after each VM +# comes back from an image swap, before draining the second VM. 
+ +resource "azurerm_virtual_machine_run_command" "post_patch_verify" { + count = var.enable_post_patch_run_command ? local.vm_count : 0 + name = "RunPostPatchVerify" + location = var.location + virtual_machine_id = azurerm_linux_virtual_machine.vm[count.index].id + + source { + script = <<-EOSH + #!/bin/bash + set -euo pipefail + export HAILBYTES_SCHEMA_VERSION_PATH='${var.schema_version_endpoint_path}' + if [ -x /opt/hailbytes/bin/ha-post-patch-verify.sh ]; then + sudo -E /opt/hailbytes/bin/ha-post-patch-verify.sh + else + echo "ERROR: /opt/hailbytes/bin/ha-post-patch-verify.sh not present on this VM image." >&2 + echo " Rebuild the marketplace image from main; provision.sh installs the script." >&2 + exit 1 + fi + EOSH + } +} + # ----- Optional Application Gateway + WAF (procurement-grade WAF parity) ----- # # Azure WAF requires Application Gateway (the Standard Load Balancer above is @@ -707,24 +795,24 @@ resource "azurerm_application_gateway" "main" { } backend_http_settings { - name = "https-passthrough" - cookie_based_affinity = "Enabled" - port = 443 - protocol = "Https" - request_timeout = 60 + name = "https-passthrough" + cookie_based_affinity = "Enabled" + port = 443 + protocol = "Https" + request_timeout = 60 pick_host_name_from_backend_address = false - probe_name = "https-health" + probe_name = "https-health" } probe { - name = "https-health" - protocol = "Https" - path = "/health" - interval = 15 - timeout = 5 - unhealthy_threshold = 3 + name = "https-health" + protocol = "Https" + path = "/health" + interval = 15 + timeout = 5 + unhealthy_threshold = 3 pick_host_name_from_backend_http_settings = false - host = var.appgw_backend_host_header + host = var.appgw_backend_host_header } ssl_certificate { diff --git a/modules/ha-hot-hot/azure/outputs.tf b/modules/ha-hot-hot/azure/outputs.tf index 289f048..e12f9c8 100644 --- a/modules/ha-hot-hot/azure/outputs.tf +++ b/modules/ha-hot-hot/azure/outputs.tf @@ -53,6 +53,21 @@ output "pre_patch_run_command_name" 
{ value = var.enable_pre_patch_run_command ? azurerm_virtual_machine_run_command.pre_patch_backup[0].name : "" } +output "post_patch_run_command_name" { + description = "Name of the Azure Run Command document that runs the on-VM five-probe post-patch verifier on each VM." + value = var.enable_post_patch_run_command ? azurerm_virtual_machine_run_command.post_patch_verify[0].name : "" +} + +output "redis_endpoint" { + description = "Host:port of the Redis endpoint wired into the HA VMs. Either the module-provisioned Azure Cache for Redis or var.redis_endpoint_override." + value = local.effective_redis_host == null ? "" : "${local.effective_redis_host}:${local.effective_redis_port}" +} + +output "redis_mode" { + description = "How Redis is wired: 'managed' (this module provisioned Azure Cache), 'override' (customer-supplied endpoint), or 'disabled' (HA is not actually safe)." + value = local.provision_managed_redis ? "managed" : (var.redis_endpoint_override == null ? "disabled" : "override") +} + output "schema_version_endpoint" { description = "HTTPS URL that returns the running schema version. CI/CD post-patch verify scripts curl this." value = "https://${local.appgw_endpoint}${var.schema_version_endpoint_path}" diff --git a/modules/ha-hot-hot/azure/variables.tf b/modules/ha-hot-hot/azure/variables.tf index 0006310..692a3b8 100644 --- a/modules/ha-hot-hot/azure/variables.tf +++ b/modules/ha-hot-hot/azure/variables.tf @@ -180,6 +180,64 @@ variable "enable_pre_patch_run_command" { default = true } +variable "enable_post_patch_run_command" { + description = "Install an Azure Run Command document named RunPostPatchVerify on each VM, mirroring the AWS aws_ssm_document.post_patch_verify in the SAT/ASM aws-ha modules. Customers fire it from the Portal after a Run Command-driven image swap." 
+ type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis (Standard or Premium SKU, zone-redundant in Premium). Required for HA; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard delivers a primary/replica pair across two zones; Premium adds persistence and explicit zone selection. Basic is single-node and NOT a valid HA option (validated)." + type = string + default = "Standard" + validation { + condition = contains(["Standard", "Premium"], var.redis_sku_name) + error_message = "redis_sku_name must be one of: Standard, Premium. Basic is single-node and breaks HA." + } +} + +variable "redis_family" { + description = "Redis SKU family. 'C' = Standard/Basic, 'P' = Premium. Must match redis_sku_name." + type = string + default = "C" + validation { + condition = contains(["C", "P"], var.redis_family) + error_message = "redis_family must be one of: C, P." + } +} + +variable "redis_capacity" { + description = "Redis capacity (size index). For SKU=Standard / family=C, valid values are 0 (250MB) through 6 (53GB). cache.t4g.small-equivalent is 1 (1GB)." + type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint (Azure Cache, self-managed Redis Sentinel, etc.). Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + description = "Port on the customer-managed Redis endpoint. 6380 (TLS) is the Azure default. Ignored unless redis_endpoint_override is set." + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + description = "Whether the customer-managed Redis endpoint requires in-transit TLS. Ignored unless redis_endpoint_override is set." 
+ type = bool + default = true +} + variable "enable_application_gateway" { description = "Front the LB topology with an Azure Application Gateway. Required if you want WAF parity with the AWS ALB+WAF story; the existing Standard LB is L4-only and cannot host WAF rules." type = bool @@ -236,6 +294,19 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Set on every apply via `timeadd(timestamp(), ...)` and then ignored on subsequent applies so a stale value doesn't show drift. Default 8760 = one calendar year — long enough that ops don't need to rotate weekly, short enough that the secret never lives unrotated past a year without operator attention." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on the Postgres Flexible Server. Defaults to false; adds cross-region replication of backups for DR scenarios. CKV_AZURE_136." 
+ type = bool + default = false +} + variable "tags" { type = map(string) default = {} diff --git a/modules/sat-aws-autoscale/main.tf b/modules/sat-aws-autoscale/main.tf index 2ae2872..58e4ddc 100644 --- a/modules/sat-aws-autoscale/main.tf +++ b/modules/sat-aws-autoscale/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../unlimited-scale/aws" + source = "../unlimited-scale/aws" - product = "sat" + product = "sat" vpc_id = var.vpc_id public_subnet_ids = var.public_subnet_ids @@ -43,5 +43,23 @@ module "this" { rds_copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot schema_version_endpoint_path = var.schema_version_endpoint_path + # Shared session store (ElastiCache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_node_type = var.redis_node_type + redis_engine_version = var.redis_engine_version + redis_snapshot_retention_days = var.redis_snapshot_retention_days + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + enable_alb_deletion_protection = var.enable_alb_deletion_protection + + # RDS production hardening (opt-in) + rds_enhanced_monitoring_interval = var.rds_enhanced_monitoring_interval + rds_enabled_cloudwatch_log_types = var.rds_enabled_cloudwatch_log_types + rds_iam_authentication_enabled = var.rds_iam_authentication_enabled + rds_performance_insights_enabled = var.rds_performance_insights_enabled + rds_performance_insights_retention_days = var.rds_performance_insights_retention_days + tags = var.tags } diff --git a/modules/sat-aws-autoscale/variables.tf b/modules/sat-aws-autoscale/variables.tf index c93e770..83698be 100644 --- a/modules/sat-aws-autoscale/variables.tf +++ b/modules/sat-aws-autoscale/variables.tf @@ -205,6 +205,86 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +# ----- Shared session store (ElastiCache for Redis) ----- + +variable 
"enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group. Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. Scale up alongside ASG growth — cache.t4g.small handles 3-5 instances, cache.m6g.large handles 10-20+." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint. Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true." + type = bool + default = true +} + + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval. 0 disables. CKV_AWS_118." + type = number + default = 0 +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM DB authentication. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights retention. 7 = free tier; 731 = long-term." 
+ type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/sat-aws-ha/main.tf b/modules/sat-aws-ha/main.tf index 4c00345..e015a6e 100644 --- a/modules/sat-aws-ha/main.tf +++ b/modules/sat-aws-ha/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../ha-hot-hot/aws" + source = "../ha-hot-hot/aws" - product = "sat" + product = "sat" vpc_id = var.vpc_id public_subnet_ids = var.public_subnet_ids @@ -41,5 +41,25 @@ module "this" { alert_email = var.alert_email schema_version_endpoint_path = var.schema_version_endpoint_path + # Shared session store (ElastiCache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_node_type = var.redis_node_type + redis_engine_version = var.redis_engine_version + redis_snapshot_retention_days = var.redis_snapshot_retention_days + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + enable_alb_deletion_protection = var.enable_alb_deletion_protection + enable_alb_access_logging = var.enable_alb_access_logging + alb_access_log_retention_days = var.alb_access_log_retention_days + + # RDS production hardening (opt-in) + rds_enhanced_monitoring_interval = var.rds_enhanced_monitoring_interval + rds_enabled_cloudwatch_log_types = var.rds_enabled_cloudwatch_log_types + rds_iam_authentication_enabled = var.rds_iam_authentication_enabled + rds_performance_insights_enabled = var.rds_performance_insights_enabled + rds_performance_insights_retention_days = var.rds_performance_insights_retention_days + tags = var.tags } diff --git a/modules/sat-aws-ha/variables.tf b/modules/sat-aws-ha/variables.tf index c28980c..04cc02b 100644 --- a/modules/sat-aws-ha/variables.tf +++ b/modules/sat-aws-ha/variables.tf @@ -211,6 +211,102 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +# ----- Shared session store (ElastiCache for Redis) 
----- + +variable "enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group for HailBytes shared sessions and worker locks. HA mode requires a shared Redis endpoint — set to false only if you supply redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. cache.t4g.small is the procurement-friendly default; raise for higher session-throughput deployments." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + description = "ElastiCache Redis engine version." + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + description = "Days ElastiCache retains daily snapshots. Sessions are recoverable from Postgres re-login, so this defaults to 0; raise if you want a Redis PITR window." + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of a customer-managed Redis endpoint. When non-null, the module skips its own ElastiCache replication group and wires the VMs at this host instead. Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + description = "Port on the customer-managed Redis endpoint. Ignored unless redis_endpoint_override is set." + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + description = "Whether the customer-managed Redis endpoint requires in-transit TLS. Ignored unless redis_endpoint_override is set." + type = bool + default = true +} + + +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true; dev/test override to false to let `terraform destroy` succeed." + type = bool + default = true +} + +variable "enable_alb_access_logging" { + description = "Provision an S3 bucket for ALB access logs and enable the listener access_logs block." 
+ type = bool + default = false +} + +variable "alb_access_log_retention_days" { + description = "Days to retain ALB access log objects." + type = number + default = 365 +} + + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval. 0 disables. CKV_AWS_118." + type = number + default = 0 +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM DB authentication. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights retention. 7 = free tier; 731 = long-term." + type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/sat-aws-single/main.tf b/modules/sat-aws-single/main.tf index 796be22..1ce4f91 100644 --- a/modules/sat-aws-single/main.tf +++ b/modules/sat-aws-single/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../single-vm/aws" + source = "../single-vm/aws" - product = "sat" + product = "sat" vpc_id = var.vpc_id subnet_id = var.subnet_id diff --git a/modules/sat-azure-autoscale/main.tf b/modules/sat-azure-autoscale/main.tf index 35136d2..f33d71e 100644 --- a/modules/sat-azure-autoscale/main.tf +++ b/modules/sat-azure-autoscale/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../unlimited-scale/azure" + source = "../unlimited-scale/azure" - product = "sat" + product = "sat" resource_group_name = var.resource_group_name location = var.location @@ -46,6 +46,20 @@ module "this" { waf_policy_id = var.waf_policy_id refresh_rollback_5xx_count_threshold = var.refresh_rollback_5xx_count_threshold 
schema_version_endpoint_path = var.schema_version_endpoint_path + enable_post_patch_run_command = var.enable_post_patch_run_command + + # Shared session store (Azure Cache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_sku_name = var.redis_sku_name + redis_family = var.redis_family + redis_capacity = var.redis_capacity + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + db_secret_expiration_hours = var.db_secret_expiration_hours + + postgres_geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled tags = var.tags } diff --git a/modules/sat-azure-autoscale/variables.tf b/modules/sat-azure-autoscale/variables.tf index b4cd96c..c55f817 100644 --- a/modules/sat-azure-autoscale/variables.tf +++ b/modules/sat-azure-autoscale/variables.tf @@ -206,6 +206,67 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "enable_post_patch_run_command" { + description = "Install a VMSS extension named RunPostPatchVerify mirroring the AWS sat-aws-autoscale aws_ssm_document.post_patch_verify." + type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis. Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard or Premium only (Basic is single-node)." + type = string + default = "Standard" +} + +variable "redis_family" { + type = string + default = "C" +} + +variable "redis_capacity" { + description = "Redis capacity (size index). 0-6 for Standard. Scale alongside VMSS instance count." 
+ type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Default 8760 = one calendar year." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on Postgres Flexible Server. CKV_AZURE_136." + type = bool + default = false +} + variable "tags" { type = map(string) default = {} diff --git a/modules/sat-azure-ha/main.tf b/modules/sat-azure-ha/main.tf index 04d9c0b..65bc44e 100644 --- a/modules/sat-azure-ha/main.tf +++ b/modules/sat-azure-ha/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../ha-hot-hot/azure" + source = "../ha-hot-hot/azure" - product = "sat" + product = "sat" resource_group_name = var.resource_group_name location = var.location @@ -45,6 +45,20 @@ module "this" { alert_email = var.alert_email refresh_rollback_5xx_count_threshold = var.refresh_rollback_5xx_count_threshold schema_version_endpoint_path = var.schema_version_endpoint_path + enable_post_patch_run_command = var.enable_post_patch_run_command + + # Shared session store (Azure Cache for Redis) + enable_managed_redis = var.enable_managed_redis + redis_sku_name = var.redis_sku_name + redis_family = var.redis_family + redis_capacity = var.redis_capacity + redis_endpoint_override = var.redis_endpoint_override + redis_endpoint_override_port = var.redis_endpoint_override_port + redis_endpoint_override_tls = var.redis_endpoint_override_tls + + db_secret_expiration_hours = var.db_secret_expiration_hours + + postgres_geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled tags = var.tags } diff --git 
a/modules/sat-azure-ha/variables.tf b/modules/sat-azure-ha/variables.tf index 2c939fe..cba9556 100644 --- a/modules/sat-azure-ha/variables.tf +++ b/modules/sat-azure-ha/variables.tf @@ -228,6 +228,68 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "enable_post_patch_run_command" { + description = "Install an Azure Run Command document named RunPostPatchVerify on each VM, mirroring the AWS sat-aws-ha aws_ssm_document.post_patch_verify." + type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis (Standard or Premium SKU, zone-redundant in Premium). Required for HA; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard delivers a primary/replica pair across two zones; Premium adds persistence and explicit zone selection. Basic is single-node and NOT a valid HA option." + type = string + default = "Standard" +} + +variable "redis_family" { + description = "Redis SKU family. 'C' = Standard/Basic, 'P' = Premium. Must match redis_sku_name." + type = string + default = "C" +} + +variable "redis_capacity" { + description = "Redis capacity (size index). For SKU=Standard / family=C, valid values are 0 (250MB) through 6 (53GB). The procurement-friendly default is 1 (1GB)." + type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint (Azure Cache, self-managed Redis Sentinel, etc.). Pair with enable_managed_redis = false." 
+ type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + + +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Default 8760 = one calendar year." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on Postgres Flexible Server. CKV_AZURE_136." + type = bool + default = false +} + variable "tags" { type = map(string) default = {} diff --git a/modules/sat-azure-single/main.tf b/modules/sat-azure-single/main.tf index b54bada..6f49bdf 100644 --- a/modules/sat-azure-single/main.tf +++ b/modules/sat-azure-single/main.tf @@ -1,7 +1,7 @@ module "this" { - source = "../single-vm/azure" + source = "../single-vm/azure" - product = "sat" + product = "sat" resource_group_name = var.resource_group_name location = var.location diff --git a/modules/single-vm/aws/README.md b/modules/single-vm/aws/README.md index 4b875e8..986bd05 100644 --- a/modules/single-vm/aws/README.md +++ b/modules/single-vm/aws/README.md @@ -23,13 +23,19 @@ flowchart TB | Component | Default | ~Monthly | |---|---|---| -| EC2 `t3.large` | 1 × 24/7 | $60 | +| EC2 `t3.large` (starter) | 1 × 24/7 | $60 | +| EC2 `m6i.large` (procurement-grade) | 1 × 24/7 | $70 | | EBS gp3 root | 50 GB | $4 | | EBS gp3 data | 200 GB | $16 | | EBS snapshots | ~50 GB stored after dedup | $3 | | KMS key (if enabled) | 1 | $1 + usage | -| **Total infrastructure** | | **~$84/month** | -| **HailBytes marketplace software fee** | per AWS Marketplace listing | **separate** | +| **Total infrastructure (starter / procurement-grade)** | | **~$84 / ~$94 per month** | +| **HailBytes marketplace software fee** ($0.24/vCPU-hr) | 2 vCPU × 730h | **~$350/mo** | +| **All-in (procurement-grade)** | | **~$435/mo** | + +See [`COST_SHAPES.md`](../../../COST_SHAPES.md) for 
the three-shape +comparison (single / HA / unlimited-scale) and the canonical +procurement-grade pricing source. ## Prerequisites diff --git a/modules/single-vm/aws/main.tf b/modules/single-vm/aws/main.tf index 27acd7b..aa77b4e 100644 --- a/modules/single-vm/aws/main.tf +++ b/modules/single-vm/aws/main.tf @@ -318,6 +318,15 @@ resource "aws_s3_bucket_lifecycle_configuration" "backup" { count = local.create_backup_bucket ? 1 : 0 bucket = aws_s3_bucket.backup[0].id + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } + rule { id = "tier-and-expire" status = "Enabled" diff --git a/modules/single-vm/azure/main.tf b/modules/single-vm/azure/main.tf index 8e59a22..fa731c3 100644 --- a/modules/single-vm/azure/main.tf +++ b/modules/single-vm/azure/main.tf @@ -40,9 +40,9 @@ locals { for c in var.allowed_cidrs : c if c != "0.0.0.0/0" ] - create_backup_storage = var.create_backup_storage_account + create_backup_storage = var.create_backup_storage_account backup_storage_account_name = local.create_backup_storage ? azurerm_storage_account.backup[0].name : var.backup_storage_account_name - backup_container_name = "hailbytes-${var.product}-bundles" + backup_container_name = "hailbytes-${var.product}-bundles" } # ----- Marketplace agreement ----- @@ -239,14 +239,14 @@ resource "azurerm_storage_account" "backup" { count = local.create_backup_storage ? 
1 : 0 name = coalesce(var.backup_storage_account_name, substr(replace("${local.name_prefix}backup", "-", ""), 0, 24)) resource_group_name = var.resource_group_name + public_network_access_enabled = false + allow_nested_items_to_be_public = false location = var.location account_tier = "Standard" account_replication_type = var.backup_storage_replication account_kind = "StorageV2" access_tier = "Cool" min_tls_version = "TLS1_2" - allow_nested_items_to_be_public = false - public_network_access_enabled = true shared_access_key_enabled = false tags = local.common_tags @@ -282,7 +282,7 @@ resource "azurerm_storage_management_policy" "backup" { version { change_tier_to_cool_after_days_since_creation = 30 change_tier_to_archive_after_days_since_creation = 90 - delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days + delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days } } } diff --git a/modules/unlimited-scale/aws/README.md b/modules/unlimited-scale/aws/README.md index 95a7274..b3d87d1 100644 --- a/modules/unlimited-scale/aws/README.md +++ b/modules/unlimited-scale/aws/README.md @@ -25,11 +25,21 @@ flowchart TB ## Cost estimate (us-east-1, on-demand, default sizing) +Unlimited-scale is a fundamentally different cost shape from a single +instance: it adds an ASG, ALB, read replicas, ElastiCache, and the +per-vCore meter scales with N instances rather than the topology itself. +Compare against `modules/single-vm/aws` (~$84/mo infra + meter) and +`modules/ha-hot-hot/aws` (~$435-$515/mo infra + meter) before you quote. + +For the three-shape comparison side-by-side and the canonical +procurement-grade source, see [`COST_SHAPES.md`](../../../COST_SHAPES.md). 
+ | Component | Default | ~Monthly | |---|---|---| | 3× EC2 `m6i.large` (ASG min) | 24/7 | $225 | | 3× EBS gp3 root | 50 GB | $12 | | Application Load Balancer + LCU | | $35 | +| ElastiCache Redis Multi-AZ (`cache.t4g.small`) | shared session store | $50 | | RDS `db.r6g.large` Multi-AZ primary | 200 GB gp3 | $400 | | 2× RDS read replicas `db.r6g.large` | | $400 | | RDS backups | 30d retention | $40 | @@ -38,16 +48,24 @@ flowchart TB | KMS CMK | 1 + usage | $5 | | Secrets Manager | 1 | $0.40 | | SNS | low volume | $0.10 | -| **Total infrastructure (3-instance steady state)** | | **~$1,150/month** | +| **Total infrastructure (3-instance steady state)** | | **~$1,200/month** | | **+ scale-out hours** | each extra m6i.large 24/7 | +$75/mo per instance | -| **HailBytes marketplace software fee** | per VM-hour, every ASG instance | **separate** | +| **HailBytes marketplace software fee** ($0.24/vCPU-hr) | 3× 2 vCPU × 730h | **~$1,050/mo** | +| **All-in (3-instance steady state)** | | **~$2,250/month** | + +Scale-out adds both an EC2 line and a per-vCPU meter line for every +extra instance. At 5 steady-state instances the bill lands around +$2,950/mo all-in; at 10 instances around $4,700/mo all-in. For +deployments that routinely run above 5 instances, raise `redis_node_type` +to `cache.m6g.large` (~$120/mo) — t4g.small starts becoming a bottleneck +for shared-session throughput in that range. 
## Prerequisites - VPC with at least 2 public subnets (ALB) and 3 private subnets across different AZs - ACM certificate in the same region - Marketplace subscription active for the product -- IAM permissions for EC2, ASG, ALB, RDS, IAM, KMS, S3, CloudWatch, SNS, Secrets Manager +- IAM permissions for EC2, ASG, ALB, RDS, ElastiCache, IAM, KMS, S3, CloudWatch, SNS, Secrets Manager ## Usage diff --git a/modules/unlimited-scale/aws/main.tf b/modules/unlimited-scale/aws/main.tf index 9cc386f..1202075 100644 --- a/modules/unlimited-scale/aws/main.tf +++ b/modules/unlimited-scale/aws/main.tf @@ -26,6 +26,13 @@ locals { }, var.tags, ) + + # Shared session store: required by every horizontally-scaled SAT/ASM + # deployment because every ASG instance has to read the same session map + # and worker-lock heartbeat. Provisioned by default; can be overridden. + provision_managed_redis = var.enable_managed_redis && var.redis_endpoint_override == null + effective_redis_host = local.provision_managed_redis ? one(aws_elasticache_replication_group.main[*].primary_endpoint_address) : var.redis_endpoint_override + effective_redis_port = local.provision_managed_redis ? 
6379 : var.redis_endpoint_override_port } data "aws_region" "current" {} @@ -99,6 +106,7 @@ resource "aws_vpc_security_group_egress_rule" "alb_to_vm" { from_port = 443 to_port = 443 ip_protocol = "tcp" + description = "ALB to ASG instances 443" } resource "aws_security_group" "vm" { @@ -114,12 +122,14 @@ resource "aws_vpc_security_group_ingress_rule" "vm_from_alb" { from_port = 443 to_port = 443 ip_protocol = "tcp" + description = "HTTPS from ALB" } resource "aws_vpc_security_group_egress_rule" "vm_egress" { security_group_id = aws_security_group.vm.id cidr_ipv4 = "0.0.0.0/0" ip_protocol = "-1" + description = "Egress for marketplace metering, updates, DB" } resource "aws_security_group" "db" { @@ -135,6 +145,60 @@ resource "aws_vpc_security_group_ingress_rule" "db_from_vm" { from_port = 5432 to_port = 5432 ip_protocol = "tcp" + description = "Postgres from ASG instances" +} + +resource "aws_security_group" "redis" { + count = local.provision_managed_redis ? 1 : 0 + name = "${local.name_prefix}-redis-sg" + description = "ElastiCache Redis ingress from VMs (shared session store)" + vpc_id = var.vpc_id + tags = local.common_tags +} + +resource "aws_vpc_security_group_ingress_rule" "redis_from_vm" { + count = local.provision_managed_redis ? 1 : 0 + security_group_id = aws_security_group.redis[0].id + referenced_security_group_id = aws_security_group.vm.id + from_port = 6379 + to_port = 6379 + ip_protocol = "tcp" + description = "Redis from ASG instances" +} + +# ----- Shared session store: ElastiCache for Redis (Multi-AZ) ----- +# +# Required for horizontal scaling. Every instance in the ASG must share session +# state, otherwise sticky-session ALB stickiness becomes the only thing keeping +# users logged in across rolling refresh. + +resource "aws_elasticache_subnet_group" "main" { + count = local.provision_managed_redis ? 
1 : 0 + name = "${local.name_prefix}-redis-subnets" + subnet_ids = var.private_subnet_ids + tags = local.common_tags +} + +resource "aws_elasticache_replication_group" "main" { + count = local.provision_managed_redis ? 1 : 0 + replication_group_id = "${local.name_prefix}-redis" + description = "HailBytes ${var.product} session store + worker lock (scale-out)" + engine = "redis" + engine_version = var.redis_engine_version + node_type = var.redis_node_type + num_cache_clusters = 2 + automatic_failover_enabled = true + multi_az_enabled = true + port = 6379 + parameter_group_name = "default.redis7" + subnet_group_name = aws_elasticache_subnet_group.main[0].name + security_group_ids = [aws_security_group.redis[0].id] + at_rest_encryption_enabled = true + transit_encryption_enabled = true + kms_key_id = var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null + snapshot_retention_limit = var.redis_snapshot_retention_days + apply_immediately = false + tags = local.common_tags } # ----- IAM ----- @@ -261,12 +325,17 @@ resource "aws_db_instance" "primary" { backup_window = "03:00-04:00" maintenance_window = "sun:04:00-sun:05:00" - deletion_protection = var.db_deletion_protection - skip_final_snapshot = !var.db_deletion_protection - copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot - performance_insights_enabled = true - enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] - auto_minor_version_upgrade = true + deletion_protection = var.db_deletion_protection + skip_final_snapshot = !var.db_deletion_protection + copy_tags_to_snapshot = var.rds_copy_tags_to_snapshot + iam_database_authentication_enabled = var.rds_iam_authentication_enabled + performance_insights_enabled = var.rds_performance_insights_enabled + performance_insights_kms_key_id = var.rds_performance_insights_enabled && var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null + performance_insights_retention_period = var.rds_performance_insights_enabled ? 
var.rds_performance_insights_retention_days : null + monitoring_interval = var.rds_enhanced_monitoring_interval + monitoring_role_arn = var.rds_enhanced_monitoring_interval > 0 ? aws_iam_role.rds_monitoring[0].arn : null + enabled_cloudwatch_logs_exports = var.rds_enabled_cloudwatch_log_types + auto_minor_version_upgrade = true tags = local.common_tags @@ -285,9 +354,10 @@ resource "aws_db_instance" "replica" { storage_encrypted = true kms_key_id = var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null - performance_insights_enabled = true - auto_minor_version_upgrade = true - skip_final_snapshot = true + performance_insights_enabled = var.rds_performance_insights_enabled + performance_insights_kms_key_id = var.rds_performance_insights_enabled && var.enable_customer_managed_key ? aws_kms_key.main[0].arn : null + auto_minor_version_upgrade = true + skip_final_snapshot = true tags = local.common_tags } @@ -328,6 +398,15 @@ resource "aws_s3_bucket_server_side_encryption_configuration" "alb_logs" { resource "aws_s3_bucket_lifecycle_configuration" "alb_logs" { bucket = aws_s3_bucket.alb_logs.id + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } + rule { id = "expire" status = "Enabled" @@ -364,6 +443,7 @@ resource "aws_lb" "main" { drop_invalid_header_fields = true enable_http2 = true + enable_deletion_protection = var.enable_alb_deletion_protection access_logs { bucket = aws_s3_bucket.alb_logs.id @@ -465,6 +545,9 @@ resource "aws_launch_template" "main" { db_secret_arn = aws_secretsmanager_secret.db.arn db_secret_region = data.aws_region.current.id product = var.product + redis_host = local.effective_redis_host + redis_port = local.effective_redis_port + redis_tls = local.provision_managed_redis ? 
true : var.redis_endpoint_override_tls } })) @@ -480,13 +563,13 @@ resource "aws_launch_template" "main" { } resource "aws_autoscaling_group" "main" { - name = "${local.name_prefix}-asg" - min_size = var.asg_min_size - max_size = var.asg_max_size - desired_capacity = var.asg_desired_capacity - vpc_zone_identifier = var.private_subnet_ids - target_group_arns = [aws_lb_target_group.main.arn] - health_check_type = "ELB" + name = "${local.name_prefix}-asg" + min_size = var.asg_min_size + max_size = var.asg_max_size + desired_capacity = var.asg_desired_capacity + vpc_zone_identifier = var.private_subnet_ids + target_group_arns = [aws_lb_target_group.main.arn] + health_check_type = "ELB" health_check_grace_period = 300 launch_template { @@ -859,6 +942,15 @@ resource "aws_s3_bucket_lifecycle_configuration" "backup" { count = local.create_backup_bucket ? 1 : 0 bucket = aws_s3_bucket.backup[0].id + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } + rule { id = "tier-and-expire" status = "Enabled" @@ -964,7 +1056,7 @@ resource "aws_ssm_document" "pre_patch_backup" { "export AWS_S3_PREFIX=\"${local.backup_object_prefix}$${TS}\"", "export HAILBYTES_DB_SECRET_ARN='${aws_secretsmanager_secret.db.arn}'", "export AWS_DEFAULT_REGION='${data.aws_region.current.id}'", - "if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh; else echo 'WARN: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this AMI; skipping local bundle.'; fi", + "if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh; else echo 'ERROR: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this AMI. Rebuild from main; the Packer provision.sh now installs the script.' 
>&2; exit 1; fi", "RDS_ID='{{ rdsSnapshotIdentifier }}'", "if [ -z \"$RDS_ID\" ]; then RDS_ID=\"${local.name_prefix}-pre-patch-$${TS}\"; fi", "aws rds create-db-snapshot --db-instance-identifier '${aws_db_instance.primary.id}' --db-snapshot-identifier \"$RDS_ID\" --tags Key=Module,Value=hailbytes-terraform-modules Key=Phase,Value=pre-patch", @@ -974,3 +1066,68 @@ resource "aws_ssm_document" "pre_patch_backup" { ] }) } + +# ----- SSM Run Command document: post-patch verify ----- + +resource "aws_ssm_document" "post_patch_verify" { + name = "${local.name_prefix}-post-patch-verify" + document_type = "Command" + document_format = "YAML" + target_type = "/AWS::EC2::Instance" + tags = local.common_tags + + content = yamlencode({ + schemaVersion = "2.2" + description = "HailBytes SAT/ASM post-patch verifier. Runs the five-probe on-VM verifier so the autoscaling instance_refresh can fail fast on a regression." + parameters = { + schemaVersionPath = { + type = "String" + description = "Path to the schema-version endpoint." + default = var.schema_version_endpoint_path + } + minSchemaVersion = { + type = "String" + description = "Optional integer floor that the running schema version must meet or exceed. Empty string skips the regression check." + default = "" + } + } + mainSteps = [ + { + action = "aws:runShellScript" + name = "postPatchVerify" + inputs = { + timeoutSeconds = "600" + runCommand = [ + "set -euo pipefail", + "export HAILBYTES_SCHEMA_VERSION_PATH='{{ schemaVersionPath }}'", + "export HAILBYTES_MIN_SCHEMA_VERSION='{{ minSchemaVersion }}'", + "if [ -x /opt/hailbytes/bin/ha-post-patch-verify.sh ]; then sudo -E /opt/hailbytes/bin/ha-post-patch-verify.sh; else echo 'ERROR: /opt/hailbytes/bin/ha-post-patch-verify.sh not present on this AMI.'; exit 1; fi", + ] + } + } + ] + }) +} + +# ----- RDS enhanced monitoring IAM role (conditional) ----- + +resource "aws_iam_role" "rds_monitoring" { + count = var.rds_enhanced_monitoring_interval > 0 ? 
1 : 0 + name = "${local.name_prefix}-rds-monitoring" + tags = local.common_tags + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "monitoring.rds.amazonaws.com" } + }] + }) +} + +resource "aws_iam_role_policy_attachment" "rds_monitoring" { + count = var.rds_enhanced_monitoring_interval > 0 ? 1 : 0 + role = aws_iam_role.rds_monitoring[0].name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" +} diff --git a/modules/unlimited-scale/aws/outputs.tf b/modules/unlimited-scale/aws/outputs.tf index 3a3b56b..5b8e684 100644 --- a/modules/unlimited-scale/aws/outputs.tf +++ b/modules/unlimited-scale/aws/outputs.tf @@ -31,11 +31,26 @@ output "pre_patch_ssm_document_name" { value = aws_ssm_document.pre_patch_backup.name } +output "post_patch_ssm_document_name" { + description = "Name of the AWS Systems Manager Run Command document that runs the on-VM five-probe post-patch verifier (used by the autoscaling instance_refresh hooks)." + value = aws_ssm_document.post_patch_verify.name +} + output "schema_version_endpoint" { description = "HTTPS URL to GET for the running schema version. CI/CD post-patch verify scripts can curl this and compare against the expected version emitted by the AMI build." value = "https://${aws_lb.main.dns_name}${var.schema_version_endpoint_path}" } +output "redis_endpoint" { + description = "Host:port of the Redis endpoint wired into the ASG launch template. Either the module-provisioned ElastiCache replication group or var.redis_endpoint_override." + value = local.effective_redis_host == null ? "" : "${local.effective_redis_host}:${local.effective_redis_port}" +} + +output "redis_mode" { + description = "How Redis is wired: 'managed' (this module provisioned ElastiCache), 'override' (customer-supplied), or 'disabled' (horizontal scaling will not be session-safe)." + value = local.provision_managed_redis ? 
"managed" : (var.redis_endpoint_override == null ? "disabled" : "override") +} + output "waf_attached" { description = "True when var.waf_web_acl_arn was set and a WAFv2 association exists for the ALB." value = var.waf_web_acl_arn != null diff --git a/modules/unlimited-scale/aws/variables.tf b/modules/unlimited-scale/aws/variables.tf index 58e1453..f31f0a5 100644 --- a/modules/unlimited-scale/aws/variables.tf +++ b/modules/unlimited-scale/aws/variables.tf @@ -42,6 +42,46 @@ variable "alert_email" { default = null } +# ----- Shared session store (ElastiCache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an ElastiCache Multi-AZ replication group. Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_node_type" { + description = "ElastiCache node type. Scale up alongside ASG growth — cache.t4g.small handles 3-5 instances, cache.m6g.large handles 10-20+." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + type = string + default = "7.1" +} + +variable "redis_snapshot_retention_days" { + type = number + default = 0 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint. Pair with enable_managed_redis = false." + type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6379 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + # ----- ASG sizing ----- variable "asg_min_size" { @@ -64,6 +104,12 @@ variable "instance_type" { default = "m6i.large" } +variable "enable_alb_deletion_protection" { + description = "Enable deletion protection on the ALB. Default true; production deployments should keep this on. Set to false in dev/test sandboxes where `terraform destroy` should succeed without manual cleanup." 
+ type = bool + default = true +} + variable "target_cpu_utilization" { type = number default = 60 @@ -218,6 +264,43 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } + +# ----- RDS production-hardening (opt-in) ----- + +variable "rds_enhanced_monitoring_interval" { + description = "RDS enhanced monitoring sample interval in seconds. 0 disables. Default 0; production typically 60. CKV_AWS_118." + type = number + default = 0 + validation { + condition = contains([0, 1, 5, 10, 15, 30, 60], var.rds_enhanced_monitoring_interval) + error_message = "rds_enhanced_monitoring_interval must be one of: 0, 1, 5, 10, 15, 30, 60." + } +} + +variable "rds_enabled_cloudwatch_log_types" { + description = "RDS log types to export to CloudWatch. Empty list = none (cost-saving default). Production should set [\"postgresql\", \"upgrade\"]. CKV_AWS_129." + type = list(string) + default = [] +} + +variable "rds_iam_authentication_enabled" { + description = "Enable IAM database authentication on RDS. CKV_AWS_161." + type = bool + default = false +} + +variable "rds_performance_insights_enabled" { + description = "Enable RDS Performance Insights. CKV_AWS_354." + type = bool + default = false +} + +variable "rds_performance_insights_retention_days" { + description = "Performance Insights data retention. 7 = free tier (default); 731 = long-term." + type = number + default = 7 +} + variable "tags" { type = map(string) default = {} diff --git a/modules/unlimited-scale/azure/README.md b/modules/unlimited-scale/azure/README.md index ddc7bda..1a27602 100644 --- a/modules/unlimited-scale/azure/README.md +++ b/modules/unlimited-scale/azure/README.md @@ -13,6 +13,7 @@ flowchart TB LB --> VMSS[VM Scale Set
min=3 max=20
3 zones, zone-balanced
autoscale on CPU] VMSS --> VMs[(Marketplace image instances)] VMs --> KV[(Key Vault
DB password)] + VMs -->|TLS| RC[(Azure Cache for Redis
Standard/Premium
sessions + worker locks)] VMs -->|writes, TLS| DBP[(Postgres Flex Server
ZoneRedundant HA)] VMs -->|reads, TLS| DBR1[(Read replica 1)] VMs -->|reads, TLS| DBR2[(Read replica 2)] @@ -23,19 +24,33 @@ flowchart TB ## Cost estimate (East US, pay-as-you-go, default sizing) +Unlimited-scale on Azure is a fundamentally different cost shape from +single-vm or HA hot-hot. Compare against `modules/single-vm/azure` and +`modules/ha-hot-hot/azure` (~$585/mo all-in for HA at procurement-grade) +before quoting. The Azure cost rows are tracked here today; the +canonical AWS table lives in [`COST_SHAPES.md`](../../../COST_SHAPES.md). + | Component | Default | ~Monthly | |---|---|---| | 3× VMSS `Standard_D2s_v5` instances | 24/7 | $210 | | 3× Premium SSD OS disks | 64 GB | $30 | | Standard Load Balancer + 1 rule | | $25 | +| Azure Cache for Redis (`Standard C1`) | shared session store | $55 | | Postgres Flex `GP_Standard_D4ds_v5` ZoneRedundant | 256 GB | $600 | | 2× Postgres Flex read replicas | | $500 | | Postgres backups | 30d, geo-redundant | $40 | | Key Vault | secrets ops | $1 | | Azure Monitor metrics + alerts | typical | $20 | -| **Total infrastructure (3-instance steady state)** | | **~$1,425/month** | +| **Total infrastructure (3-instance steady state)** | | **~$1,480/month** | | **+ scale-out hours** | each extra D2s_v5 24/7 | +$70/mo | -| **HailBytes marketplace software fee** | per VM-hour, every VMSS instance | **separate** | +| **HailBytes marketplace software fee** ($0.24/vCPU-hr) | 3× 2 vCPU × 730h | **~$1,050/mo** | +| **All-in (3-instance steady state)** | | **~$2,530/month** | + +Scale-out behaves the same as the AWS shape: each extra VMSS instance adds a VM +compute line (the Azure equivalent of the EC2 line, ~$70/mo) and a per-vCPU meter +line (~$350/mo). At 5 steady-state instances the bill lands around $3,370/mo +all-in; at 10 around $5,470/mo. For deployments above 5 instances raise +`redis_capacity` to 3 (~$220/mo) — `Standard C1` becomes a bottleneck around 5 VMSS instances.
## Prerequisites @@ -44,7 +59,7 @@ flowchart TB - Subnet delegated to `Microsoft.DBforPostgreSQL/flexibleServers` - Private DNS zone `privatelink.postgres.database.azure.com` linked to the vnet - Marketplace subscription accepted (handled by module) -- Permissions for Compute, Network, DBforPostgreSQL, KeyVault, Monitor +- Permissions for Compute, Network, DBforPostgreSQL, KeyVault, Monitor, **Cache** (Standard tier or higher — Basic is single-node and breaks horizontal scaling) ## Usage diff --git a/modules/unlimited-scale/azure/main.tf b/modules/unlimited-scale/azure/main.tf index 49ae5d4..8586b0a 100644 --- a/modules/unlimited-scale/azure/main.tf +++ b/modules/unlimited-scale/azure/main.tf @@ -39,6 +39,14 @@ locals { enable_application_gateway = var.enable_application_gateway endpoint_ip = local.enable_application_gateway ? azurerm_public_ip.appgw[0].ip_address : azurerm_public_ip.lb.ip_address + + # Shared session store: required by every horizontally-scaled SAT/ASM + # deployment because every VMSS instance has to read the same session + # map and worker-lock heartbeat. Provisioned by default; can be + # overridden via var.redis_endpoint_override + var.enable_managed_redis. + provision_managed_redis = var.enable_managed_redis && var.redis_endpoint_override == null + effective_redis_host = local.provision_managed_redis ? one(azurerm_redis_cache.main[*].hostname) : var.redis_endpoint_override + effective_redis_port = local.provision_managed_redis ? 
6380 : var.redis_endpoint_override_port } data "azurerm_client_config" "current" {} @@ -78,10 +86,17 @@ resource "random_password" "db" { } resource "azurerm_key_vault_secret" "db" { - name = "hailbytes-db-password" - value = random_password.db.result - key_vault_id = azurerm_key_vault.main.id - depends_on = [azurerm_role_assignment.kv_writer] + name = "hailbytes-db-password" + value = random_password.db.result + key_vault_id = azurerm_key_vault.main.id + content_type = "application/x-postgresql-password" + expiration_date = timeadd(timestamp(), "${var.db_secret_expiration_hours}h") + + lifecycle { + ignore_changes = [expiration_date] + } + + depends_on = [azurerm_role_assignment.kv_writer] } # ----- LB ----- @@ -146,10 +161,16 @@ resource "azurerm_linux_virtual_machine_scale_set" "main" { instances = var.vmss_default_count admin_username = var.admin_username disable_password_authentication = true - zones = ["1", "2", "3"] - zone_balance = true - upgrade_mode = "Rolling" - health_probe_id = azurerm_lb_probe.https.id + # CKV_AZURE_97. Encrypts the OS disk + temp disk + data disks at the + # hypervisor host level on top of Azure's default platform-managed + # encryption. No additional cost; requires the subscription to be + # registered for the EncryptionAtHost feature (it is, on all + # production Azure subscriptions by default). + encryption_at_host_enabled = true + zones = ["1", "2", "3"] + zone_balance = true + upgrade_mode = "Rolling" + health_probe_id = azurerm_lb_probe.https.id tags = merge(local.common_tags, { "hailbytes-${var.product}" = "true" }) @@ -197,9 +218,9 @@ resource "azurerm_linux_virtual_machine_scale_set" "main" { primary = true ip_configuration { - name = "primary" - primary = true - subnet_id = var.vm_subnet_id + name = "primary" + primary = true + subnet_id = var.vm_subnet_id load_balancer_backend_address_pool_ids = [azurerm_lb_backend_address_pool.main.id] application_gateway_backend_address_pool_ids = local.enable_application_gateway ? 
[ for p in azurerm_application_gateway.main[0].backend_address_pool : p.id if p.name == "vmss" @@ -216,12 +237,15 @@ resource "azurerm_linux_virtual_machine_scale_set" "main" { custom_data = base64encode(jsonencode({ hailbytes = { - mode = "scale-out" - key_vault_uri = azurerm_key_vault.main.vault_uri - db_secret_name = azurerm_key_vault_secret.db.name - db_fqdn = azurerm_postgresql_flexible_server.primary.fqdn - db_read_fqdns = [for r in azurerm_postgresql_flexible_server.replica : r.fqdn] - product = var.product + mode = "scale-out" + key_vault_uri = azurerm_key_vault.main.vault_uri + db_secret_name = azurerm_key_vault_secret.db.name + db_fqdn = azurerm_postgresql_flexible_server.primary.fqdn + db_read_fqdns = [for r in azurerm_postgresql_flexible_server.replica : r.fqdn] + product = var.product + redis_host = local.effective_redis_host + redis_port = local.effective_redis_port + redis_tls = local.provision_managed_redis ? true : var.redis_endpoint_override_tls } })) @@ -309,7 +333,7 @@ resource "azurerm_postgresql_flexible_server" "primary" { private_dns_zone_id = var.private_dns_zone_id backup_retention_days = var.db_backup_retention_days - geo_redundant_backup_enabled = true + geo_redundant_backup_enabled = var.postgres_geo_redundant_backup_enabled high_availability { mode = "ZoneRedundant" @@ -355,6 +379,32 @@ resource "azurerm_postgresql_flexible_server" "replica" { } } +# ----- Shared session store: Azure Cache for Redis ----- +# +# Required for horizontal scaling. Every instance in the VMSS must +# share session state, otherwise sticky-session LB stickiness becomes +# the only thing keeping users logged in across rolling upgrade. +# Standard/Premium SKUs only — Basic is single-node and breaks HA. + +resource "azurerm_redis_cache" "main" { + count = local.provision_managed_redis ? 
1 : 0 + name = "${local.name_prefix}-redis" + resource_group_name = var.resource_group_name + location = var.location + capacity = var.redis_capacity + family = var.redis_family + sku_name = var.redis_sku_name + non_ssl_port_enabled = false + minimum_tls_version = "1.2" + public_network_access_enabled = false + zones = var.redis_sku_name == "Premium" ? ["1", "2"] : null + tags = local.common_tags + + redis_configuration { + maxmemory_policy = "allkeys-lru" + } +} + # ----- Monitor: action group + alerts ----- resource "azurerm_monitor_action_group" "alerts" { @@ -427,14 +477,14 @@ resource "azurerm_storage_account" "backup" { count = local.create_backup_storage ? 1 : 0 name = coalesce(var.backup_storage_account_name, substr(replace("${local.name_prefix}backup", "-", ""), 0, 24)) resource_group_name = var.resource_group_name + public_network_access_enabled = false + allow_nested_items_to_be_public = false location = var.location account_tier = "Standard" account_replication_type = var.backup_storage_replication account_kind = "StorageV2" access_tier = "Cool" min_tls_version = "TLS1_2" - allow_nested_items_to_be_public = false - public_network_access_enabled = true shared_access_key_enabled = false tags = local.common_tags @@ -470,7 +520,7 @@ resource "azurerm_storage_management_policy" "backup" { version { change_tier_to_cool_after_days_since_creation = 30 change_tier_to_archive_after_days_since_creation = 90 - delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days + delete_after_days_since_creation = var.backup_blob_noncurrent_expiration_days } } } @@ -539,7 +589,9 @@ resource "azurerm_virtual_machine_scale_set_extension" "pre_patch_backup" { if [ -x /opt/hailbytes/bin/ha-pre-patch-backup.sh ]; then sudo -E /opt/hailbytes/bin/ha-pre-patch-backup.sh else - echo "WARN: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present; skipping local bundle." + echo "ERROR: /opt/hailbytes/bin/ha-pre-patch-backup.sh not present on this VM image." 
>&2 + echo " Rebuild the marketplace image from main; provision.sh installs the script." >&2 + exit 1 fi az login --identity --allow-no-subscriptions >/dev/null az postgres flexible-server backup create \ @@ -552,6 +604,41 @@ resource "azurerm_virtual_machine_scale_set_extension" "pre_patch_backup" { }) } +# ----- VMSS post-patch verify extension ----- +# +# Mirrors the AWS aws_ssm_document.post_patch_verify in +# modules/unlimited-scale/aws/main.tf. Bakes the on-VM five-probe +# verifier as a VMSS extension so the rolling-upgrade pipeline can +# fail fast on a schema-version regression, encryption-key +# fingerprint mismatch, or smoke-test failure. + +resource "azurerm_virtual_machine_scale_set_extension" "post_patch_verify" { + count = var.enable_post_patch_run_command ? 1 : 0 + name = "RunPostPatchVerify" + virtual_machine_scale_set_id = azurerm_linux_virtual_machine_scale_set.main.id + publisher = "Microsoft.Azure.Extensions" + type = "CustomScript" + type_handler_version = "2.1" + auto_upgrade_minor_version = true + + settings = jsonencode({}) + protected_settings = jsonencode({ + script = base64encode(<<-EOSH + #!/bin/bash + set -euo pipefail + export HAILBYTES_SCHEMA_VERSION_PATH='${var.schema_version_endpoint_path}' + if [ -x /opt/hailbytes/bin/ha-post-patch-verify.sh ]; then + sudo -E /opt/hailbytes/bin/ha-post-patch-verify.sh + else + echo "ERROR: /opt/hailbytes/bin/ha-post-patch-verify.sh not present on this VM image." >&2 + echo " Rebuild the marketplace image from main; provision.sh installs the script." 
>&2 + exit 1 + fi + EOSH + ) + }) +} + # ----- Optional Application Gateway + WAF ----- resource "azurerm_public_ip" "appgw" { diff --git a/modules/unlimited-scale/azure/outputs.tf b/modules/unlimited-scale/azure/outputs.tf index 08ae75b..1b5cb60 100644 --- a/modules/unlimited-scale/azure/outputs.tf +++ b/modules/unlimited-scale/azure/outputs.tf @@ -30,6 +30,21 @@ output "pre_patch_run_command_extension_name" { value = var.enable_pre_patch_run_command ? azurerm_virtual_machine_scale_set_extension.pre_patch_backup[0].name : "" } +output "post_patch_run_command_extension_name" { + description = "Name of the VMSS extension wrapping the post-patch verifier. Invoke via `az vmss run-command`." + value = var.enable_post_patch_run_command ? azurerm_virtual_machine_scale_set_extension.post_patch_verify[0].name : "" +} + +output "redis_endpoint" { + description = "Host:port of the Redis endpoint wired into the VMSS launch profile. Either the module-provisioned Azure Cache or var.redis_endpoint_override." + value = local.effective_redis_host == null ? "" : "${local.effective_redis_host}:${local.effective_redis_port}" +} + +output "redis_mode" { + description = "How Redis is wired: 'managed' (this module provisioned Azure Cache), 'override' (customer-supplied), or 'disabled' (horizontal scaling is not session-safe)." + value = local.provision_managed_redis ? "managed" : (var.redis_endpoint_override == null ? "disabled" : "override") +} + output "schema_version_endpoint" { description = "HTTPS URL that returns the running schema version." 
value = "https://${local.endpoint_ip}${var.schema_version_endpoint_path}" diff --git a/modules/unlimited-scale/azure/variables.tf b/modules/unlimited-scale/azure/variables.tf index 4e6a3f2..de2ca4e 100644 --- a/modules/unlimited-scale/azure/variables.tf +++ b/modules/unlimited-scale/azure/variables.tf @@ -211,6 +211,71 @@ variable "schema_version_endpoint_path" { default = "/api/instance/schema-version" } +variable "enable_post_patch_run_command" { + description = "Install a VMSS extension named RunPostPatchVerify that runs the on-VM five-probe verifier, mirroring the AWS aws_ssm_document.post_patch_verify." + type = bool + default = true +} + +# ----- Shared session store (Azure Cache for Redis) ----- + +variable "enable_managed_redis" { + description = "Provision an Azure Cache for Redis (Standard or Premium SKU). Required for horizontal scaling; set to false only when supplying redis_endpoint_override." + type = bool + default = true +} + +variable "redis_sku_name" { + description = "Redis SKU. Standard delivers a primary/replica pair; Premium adds zone selection. Basic is single-node and breaks horizontal scaling (validated)." + type = string + default = "Standard" + validation { + condition = contains(["Standard", "Premium"], var.redis_sku_name) + error_message = "redis_sku_name must be one of: Standard, Premium. Basic is single-node and breaks horizontal scaling." + } +} + +variable "redis_family" { + description = "Redis SKU family. 'C' = Standard/Basic, 'P' = Premium." + type = string + default = "C" +} + +variable "redis_capacity" { + description = "Redis capacity (size index). For SKU=Standard / family=C, valid values are 0-6. Scale alongside VMSS instance count: 1 (1GB) handles 3-5 instances; 3 (6GB) handles 10-20+." + type = number + default = 1 +} + +variable "redis_endpoint_override" { + description = "Host of an existing customer-managed Redis endpoint. Pair with enable_managed_redis = false." 
+ type = string + default = null +} + +variable "redis_endpoint_override_port" { + type = number + default = 6380 +} + +variable "redis_endpoint_override_tls" { + type = bool + default = true +} + +variable "db_secret_expiration_hours" { + description = "Hours until the Key Vault DB-password secret expires. Default 8760 = one calendar year. Set on every apply via timeadd(timestamp(), ...) and then ignored on subsequent applies so a stale value doesn't show drift." + type = number + default = 8760 +} + + +variable "postgres_geo_redundant_backup_enabled" { + description = "Enable geo-redundant backup on the Postgres Flexible Server. Defaults to false; adds cross-region replication of backups for DR scenarios. CKV_AZURE_136." + type = bool + default = false +} + variable "tags" { type = map(string) default = {}