From d467e4e69a7f7f5631961dec40d38317cd5241c7 Mon Sep 17 00:00:00 2001 From: Mystery Date: Sat, 30 May 2026 14:20:21 +0000 Subject: [PATCH] feat(#503): implement infrastructure as code with Terraform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete AWS IaC covering all required components for the Brain Storm platform: VPC & Networking - VPC with 2 public and 2 private subnets across AZs - NAT gateways, route tables, Internet Gateway - VPC flow logs with CloudWatch (enable_flow_logs variable) RDS PostgreSQL - PostgreSQL 16.3 on gp3 encrypted storage with auto-scaling - Enhanced monitoring (60s interval), Performance Insights in prod - Multi-AZ support (rds_multi_az variable), deletion protection in prod - IAM role for enhanced monitoring agent ElastiCache Redis - Redis 7.1 replication group with encryption at rest and in transit - Fixes deprecated replication_group_description → description - Multi-AZ and automatic failover enabled in prod; single node in dev/staging - CloudWatch slow-log delivery ECS Fargate - Cluster with Container Insights, FARGATE + FARGATE_SPOT capacity providers - Backend and frontend Fargate services with deployment circuit breakers - Secrets Manager injection for DB password, JWT, and Stellar key - ECS Execute Command enabled in non-prod for debugging - lifecycle { ignore_changes } on task_definition and desired_count for CI/CD ALB - Internet-facing ALB with access logs to S3 - Path-based routing: /api/*, /health, /v1/* → backend; default → frontend - Optional HTTPS listener with TLS 1.3 policy; HTTP→HTTPS redirect when cert provided - alb_security_group_id output for downstream use ECR (new module) - Backend and frontend repositories with image scanning on push - Lifecycle policies: retain 10 tagged images, expire untagged after 14 days - Repository policies granting ECS pull and GitHub Actions push access Auto-scaling (new module) - Application Auto Scaling targets for both ECS services - 
CPU (70%) and memory (80%) target-tracking policies - CloudWatch alarms for high CPU on each service - Configurable min/max capacity per service Security Groups & IAM - All security groups use parameterised vpc_cidr instead of hardcoded 10.0.0.0/16 - ECS execution role extended with secretsmanager:GetSecretValue and kms:Decrypt - ECS task role with least-privilege CloudWatch Logs and ECR pull permissions Other - Root main.tf wires all modules including new ECR and autoscaling - Complete variables.tf with validation on environment values - Comprehensive outputs.tf with ECR repo URLs and secrets ARNs - Updated terraform.tfvars.example documenting every variable - Updated README with architecture diagram and two-step bootstrap guide Co-Authored-By: Claude Sonnet 4.6 --- infra/terraform/README.md | 167 +++++-- infra/terraform/main.tf | 145 ++++-- infra/terraform/modules/alb/main.tf | 107 +++++ infra/terraform/modules/alb/outputs.tf | 15 + infra/terraform/modules/alb/variables.tf | 11 + infra/terraform/modules/autoscaling/main.tf | 116 +++++ .../terraform/modules/autoscaling/outputs.tf | 9 + .../modules/autoscaling/variables.tf | 67 +++ infra/terraform/modules/ecr/main.tf | 99 ++++ infra/terraform/modules/ecr/outputs.tf | 24 + infra/terraform/modules/ecr/variables.tf | 16 + infra/terraform/modules/ecs/main.tf | 432 ++++++++++++------ infra/terraform/modules/ecs/outputs.tf | 35 ++ infra/terraform/modules/ecs/variables.tf | 77 +++- infra/terraform/modules/elasticache/main.tf | 34 +- .../modules/elasticache/variables.tf | 5 + infra/terraform/modules/rds/main.tf | 48 +- infra/terraform/modules/rds/variables.tf | 17 + infra/terraform/modules/vpc/main.tf | 67 +++ infra/terraform/modules/vpc/outputs.tf | 5 + infra/terraform/modules/vpc/variables.tf | 6 + infra/terraform/outputs.tf | 37 ++ infra/terraform/terraform.tfvars.example | 64 ++- infra/terraform/variables.tf | 113 ++++- 24 files changed, 1455 insertions(+), 261 deletions(-) create mode 100644 
infra/terraform/modules/autoscaling/main.tf create mode 100644 infra/terraform/modules/autoscaling/outputs.tf create mode 100644 infra/terraform/modules/autoscaling/variables.tf create mode 100644 infra/terraform/modules/ecr/main.tf create mode 100644 infra/terraform/modules/ecr/outputs.tf create mode 100644 infra/terraform/modules/ecr/variables.tf diff --git a/infra/terraform/README.md b/infra/terraform/README.md index f56548a..5d90afa 100644 --- a/infra/terraform/README.md +++ b/infra/terraform/README.md @@ -1,26 +1,77 @@ # Brain Storm - Terraform Infrastructure -This directory contains Terraform configurations for deploying Brain Storm to AWS. +This directory contains Terraform configurations for deploying Brain Storm to AWS using infrastructure as code. ## Architecture -- VPC with public and private subnets across 2 availability zones -- RDS PostgreSQL 16 with automated backups -- ElastiCache Redis cluster with automatic failover -- ECS Fargate for container orchestration -- Application Load Balancer for traffic distribution +``` +Internet + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ AWS Account │ +│ │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ VPC (10.0.0.0/16) │ │ +│ │ │ │ +│ │ Public Subnets (AZ-a, AZ-b) │ │ +│ │ ┌────────────────────────────────────────────┐ │ │ +│ │ │ Application Load Balancer │ │ │ +│ │ │ :80 (HTTP) → redirect or forward │ │ │ +│ │ │ :443 (HTTPS, optional ACM cert) │ │ │ +│ │ │ path /api/* → backend target group │ │ │ +│ │ │ default → frontend target group │ │ │ +│ │ └────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ Private Subnets (AZ-a, AZ-b) │ │ +│ │ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ ECS Frontend │ │ ECS Backend │ │ │ +│ │ │ (Fargate) │ │ (Fargate) │ │ │ +│ │ │ Auto-scaling │ │ Auto-scaling│ │ │ +│ │ └──────────────┘ └──────┬───────┘ │ │ +│ │ │ │ │ +│ │ ┌──────────────┐ ┌──────▼───────┐ │ │ +│ │ │ ElastiCache │ │ RDS │ │ │ +│ │ │ Redis 7.1 │ │ PostgreSQL │ │ │ +│ │ 
│ (encrypted) │ │ 16.3 │ │ │ +│ │ └──────────────┘ └──────────────┘ │ │ +│ └─────────────────────────────────────────────────┘ │ +│ │ +│ ECR repositories (backend + frontend) │ +│ Secrets Manager (DB pwd, JWT, Stellar key) │ +│ API Gateway (HTTP API + VPC Link) │ +│ GitHub Actions OIDC role │ +└─────────────────────────────────────────────────────────┘ +``` + +## Modules + +| Module | Purpose | +|--------|---------| +| `vpc` | VPC, subnets, NAT gateways, route tables, flow logs | +| `rds` | PostgreSQL 16 with enhanced monitoring, encrypted storage | +| `elasticache` | Redis 7.1 replication group with encryption | +| `ecs` | Fargate cluster, backend & frontend services, IAM roles | +| `alb` | Internet-facing ALB, path routing, optional HTTPS | +| `ecr` | Container registries with lifecycle policies | +| `autoscaling` | CPU/memory-based auto-scaling for ECS services | +| `api-gateway` | HTTP API Gateway with VPC Link and throttling | +| `oidc` | GitHub Actions keyless authentication | +| `secrets` | Secrets Manager for DB password, JWT, Stellar key | ## Prerequisites -- Terraform >= 1.0 +- Terraform >= 1.5 - AWS CLI configured with appropriate credentials - S3 bucket for remote state: `brain-storm-terraform-state` - DynamoDB table for state locking: `brain-storm-terraform-locks` -## Setup Remote State +## Bootstrap Remote State + +Run once before the first `terraform init`: ```bash -# Create S3 bucket for state +# Create S3 bucket with versioning and encryption aws s3api create-bucket \ --bucket brain-storm-terraform-state \ --region us-east-1 @@ -29,7 +80,12 @@ aws s3api put-bucket-versioning \ --bucket brain-storm-terraform-state \ --versioning-configuration Status=Enabled -# Create DynamoDB table for locking +aws s3api put-bucket-encryption \ + --bucket brain-storm-terraform-state \ + --server-side-encryption-configuration \ + '{"Rules":[{"ApplyServerSideEncryptionByDefault":{"SSEAlgorithm":"AES256"}}]}' + +# Create DynamoDB table for state locking aws dynamodb 
create-table \ --table-name brain-storm-terraform-locks \ --attribute-definitions AttributeName=LockID,AttributeType=S \ @@ -40,54 +96,77 @@ aws dynamodb create-table \ ## Usage -1. Copy the example variables file: ```bash +# 1. Copy and edit variables cp terraform.tfvars.example terraform.tfvars -``` +# Edit terraform.tfvars — never commit this file -2. Edit `terraform.tfvars` with your values +# 2. Provide sensitive values as environment variables (preferred) +export TF_VAR_db_password="$(openssl rand -hex 16)" +export TF_VAR_jwt_secret="$(openssl rand -hex 32)" +export TF_VAR_stellar_secret_key="your-stellar-key" -3. Initialize Terraform: -```bash +# 3. Initialize terraform init -``` -4. Plan the deployment: -```bash -terraform plan -``` +# 4. Plan +terraform plan -out=tfplan -5. Apply the configuration: -```bash -terraform apply +# 5. Apply +terraform apply tfplan ``` ## Outputs -After successful apply, Terraform will output: -- ALB DNS name for accessing the application -- VPC ID -- Database and Redis endpoints (sensitive) +| Output | Description | +|--------|-------------| +| `alb_dns_name` | ALB DNS name — point your domain here | +| `api_gateway_endpoint` | API Gateway URL — use as `api_base_url` in tfvars | +| `backend_repository_url` | ECR URL for backend images | +| `frontend_repository_url` | ECR URL for frontend images | +| `github_actions_role_arn` | Set as `AWS_ROLE_ARN` GitHub secret | +| `db_endpoint` | RDS endpoint (sensitive) | +| `redis_endpoint` | ElastiCache endpoint (sensitive) | + +## Two-step bootstrap + +On first apply, `backend_image` and `frontend_image` default to the ECR repos. +The ECR repos will be empty, so the ECS task definitions reference images that +don't exist yet. To bootstrap: + +1. Run `terraform apply` — ECR repos, VPC, RDS, and Redis are created. +2. 
Build and push your images: + ```bash + aws ecr get-login-password | docker login --username AWS \ + --password-stdin $(terraform output -raw backend_repository_url | cut -d/ -f1) + docker build -t $(terraform output -raw backend_repository_url):latest apps/backend + docker push $(terraform output -raw backend_repository_url):latest + # repeat for frontend + ``` +3. Run `terraform apply` again — ECS services will start with the new images. + +## HTTPS Setup + +To enable HTTPS: + +1. Request an ACM certificate in the same region as your ALB. +2. Set `https_certificate_arn` in `terraform.tfvars`. +3. Run `terraform apply` — the ALB will add an HTTPS listener and redirect HTTP. -## Resource Limits +## GitHub Actions OIDC -Production configuration uses: -- Backend: 512 CPU, 1024 MB memory (2 tasks) -- Frontend: 256 CPU, 512 MB memory (2 tasks) -- RDS: db.t3.micro with auto-scaling storage -- Redis: cache.t3.micro with 2 nodes +After `terraform apply`, add the `github_actions_role_arn` output as the `AWS_ROLE_ARN` +secret in your GitHub repository. Remove any existing `AWS_ACCESS_KEY_ID` / +`AWS_SECRET_ACCESS_KEY` secrets — the OIDC role replaces them. ## Cost Optimization -For development/staging environments, adjust in `terraform.tfvars`: -- Use smaller instance classes -- Reduce ECS task counts -- Disable multi-AZ for RDS and Redis +For dev/staging, adjust in `terraform.tfvars`: -## GitHub Actions OIDC - -The `oidc` module provisions: -- An AWS IAM OIDC identity provider for `token.actions.githubusercontent.com` -- A least-privilege `GitHubActionsDeploymentRole` IAM role - -After `terraform apply`, copy the `github_actions_role_arn` output and add it as the `AWS_ROLE_ARN` secret in your GitHub repository. Remove any existing `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` secrets. 
+```hcl +db_instance_class = "db.t3.micro" +rds_multi_az = false +redis_node_type = "cache.t3.micro" +backend_min_capacity = 1 +frontend_min_capacity = 1 +``` diff --git a/infra/terraform/main.tf b/infra/terraform/main.tf index cae3b00..1db2f15 100644 --- a/infra/terraform/main.tf +++ b/infra/terraform/main.tf @@ -1,6 +1,6 @@ terraform { - required_version = ">= 1.0" - + required_version = ">= 1.5" + required_providers { aws = { source = "hashicorp/aws" @@ -19,62 +19,148 @@ terraform { provider "aws" { region = var.aws_region + + default_tags { + tags = { + Project = "Brain-Storm" + ManagedBy = "terraform" + Environment = var.environment + } + } } +# ─── Networking ─────────────────────────────────────────────────────────────── + module "vpc" { source = "./modules/vpc" - - environment = var.environment - vpc_cidr = var.vpc_cidr + + environment = var.environment + vpc_cidr = var.vpc_cidr + enable_flow_logs = var.enable_flow_logs } +# ─── Container Registry ─────────────────────────────────────────────────────── + +module "ecr" { + source = "./modules/ecr" + + environment = var.environment + image_retention_count = var.ecr_image_retention_count + github_actions_role_arn = module.oidc.role_arn +} + +# ─── Data Stores ────────────────────────────────────────────────────────────── + module "rds" { source = "./modules/rds" - - environment = var.environment - vpc_id = module.vpc.vpc_id - private_subnet_ids = module.vpc.private_subnet_ids - db_name = var.db_name - db_username = var.db_username - db_password = var.db_password - db_instance_class = var.db_instance_class + + environment = var.environment + vpc_id = module.vpc.vpc_id + vpc_cidr = module.vpc.vpc_cidr + private_subnet_ids = module.vpc.private_subnet_ids + db_name = var.db_name + db_username = var.db_username + db_password = var.db_password + db_instance_class = var.db_instance_class + multi_az = var.rds_multi_az + monitoring_interval = var.rds_monitoring_interval } module "elasticache" { source = 
"./modules/elasticache" - + environment = var.environment vpc_id = module.vpc.vpc_id + vpc_cidr = module.vpc.vpc_cidr private_subnet_ids = module.vpc.private_subnet_ids node_type = var.redis_node_type } +# ─── Application Layer ──────────────────────────────────────────────────────── + module "ecs" { source = "./modules/ecs" - + environment = var.environment vpc_id = module.vpc.vpc_id + vpc_cidr = module.vpc.vpc_cidr private_subnet_ids = module.vpc.private_subnet_ids public_subnet_ids = module.vpc.public_subnet_ids - - backend_image = var.backend_image - frontend_image = var.frontend_image - - db_host = module.rds.db_endpoint - redis_host = module.elasticache.redis_endpoint + + backend_image = var.backend_image != "" ? var.backend_image : "${module.ecr.backend_repository_url}:latest" + frontend_image = var.frontend_image != "" ? var.frontend_image : "${module.ecr.frontend_repository_url}:latest" + + db_host = module.rds.db_endpoint + db_name = var.db_name + db_username = var.db_username + redis_host = module.elasticache.redis_endpoint + api_base_url = var.api_base_url + + backend_secrets = [ + { + name = "DATABASE_PASSWORD" + valueFrom = module.secrets.db_password_secret_arn + }, + { + name = "JWT_SECRET" + valueFrom = module.secrets.jwt_secret_arn + }, + { + name = "STELLAR_SECRET_KEY" + valueFrom = module.secrets.stellar_key_secret_arn + } + ] + + backend_desired_count = var.ecs_backend_desired_count + frontend_desired_count = var.ecs_frontend_desired_count } module "alb" { source = "./modules/alb" - + environment = var.environment vpc_id = module.vpc.vpc_id public_subnet_ids = module.vpc.public_subnet_ids - + account_id = var.account_id + backend_target_group_arn = module.ecs.backend_target_group_arn frontend_target_group_arn = module.ecs.frontend_target_group_arn + + https_certificate_arn = var.https_certificate_arn +} + +# ─── Auto Scaling ───────────────────────────────────────────────────────────── + +module "autoscaling" { + source = "./modules/autoscaling" 
+
+  environment  = var.environment
+  cluster_name = module.ecs.cluster_name
+
+  backend_service_name  = module.ecs.backend_service_name
+  frontend_service_name = module.ecs.frontend_service_name
+
+  backend_min_capacity  = var.backend_min_capacity
+  backend_max_capacity  = var.backend_max_capacity
+  frontend_min_capacity = var.frontend_min_capacity
+  frontend_max_capacity = var.frontend_max_capacity
+}
+
+# ─── Edge / API Gateway ───────────────────────────────────────────────────────
+
+module "api_gateway" {
+  source = "./modules/api-gateway"
+
+  environment        = var.environment
+  vpc_id             = module.vpc.vpc_id
+  private_subnet_ids = module.vpc.private_subnet_ids
+  alb_listener_arn   = module.alb.http_listener_arn
+  cors_allow_origins = var.api_gateway_cors_origins
+  alert_sns_arns     = var.alert_sns_arns
 }
 
+# ─── IAM / OIDC ───────────────────────────────────────────────────────────────
+
 module "oidc" {
   source = "./modules/oidc"
 
@@ -82,6 +168,8 @@ module "oidc" {
   github_repo = var.github_repo
 }
 
+# ─── Secrets Management ───────────────────────────────────────────────────────
+
 module "secrets" {
   source = "./modules/secrets"
 
@@ -94,14 +182,3 @@ module "secrets" {
   enable_rotation = var.environment == "prod"
   alert_sns_arns  = var.alert_sns_arns
 }
-
-module "api_gateway" {
-  source = "./modules/api-gateway"
-
-  environment        = var.environment
-  vpc_id             = module.vpc.vpc_id
-  private_subnet_ids = module.vpc.private_subnet_ids
-  alb_listener_arn   = module.alb.http_listener_arn
-  cors_allow_origins = var.api_gateway_cors_origins
-  alert_sns_arns     = var.alert_sns_arns
-}
diff --git a/infra/terraform/modules/alb/main.tf b/infra/terraform/modules/alb/main.tf
index da38508..762e439 100644
--- a/infra/terraform/modules/alb/main.tf
+++ b/infra/terraform/modules/alb/main.tf
@@ -8,6 +8,7 @@ resource "aws_security_group" "alb" {
     to_port     = 80
     protocol    = "tcp"
     cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTP"
   }
 
   ingress {
@@ -15,6 +16,7 @@ resource "aws_security_group" "alb" {
     to_port     = 443
     protocol    = "tcp"
     cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTPS"
   }
 
   egress {
@@ -38,6 +40,15 @@ resource "aws_lb" "main" {
   subnets = var.public_subnet_ids
 
   enable_deletion_protection = var.environment == "prod"
+  drop_invalid_header_fields = true
+
+  # The bucket is referenced through its policy so that the load balancer is
+  # only created after log delivery to the bucket has been authorised.
+  access_logs {
+    bucket  = aws_s3_bucket_policy.alb_logs.bucket
+    prefix  = "alb"
+    enabled = true
+  }
 
   tags = {
     Name        = "${var.environment}-brain-storm-alb"
@@ -45,13 +56,116 @@ resource "aws_lb" "main" {
   }
 }
 
+# S3 bucket for ALB access logs
+resource "aws_s3_bucket" "alb_logs" {
+  bucket        = "${var.environment}-brain-storm-alb-logs-${var.account_id}"
+  force_destroy = var.environment != "prod"
+
+  tags = {
+    Name        = "${var.environment}-brain-storm-alb-logs"
+    Environment = var.environment
+  }
+}
+
+resource "aws_s3_bucket_lifecycle_configuration" "alb_logs" {
+  bucket = aws_s3_bucket.alb_logs.id
+
+  rule {
+    id     = "expire-logs"
+    status = "Enabled"
+
+    # An empty filter applies the rule to every object in the bucket; the AWS
+    # provider expects each rule to declare either a filter or a prefix.
+    filter {}
+
+    expiration {
+      days = 90
+    }
+  }
+}
+
+resource "aws_s3_bucket_public_access_block" "alb_logs" {
+  bucket                  = aws_s3_bucket.alb_logs.id
+  block_public_acls       = true
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
+
+data "aws_elb_service_account" "main" {}
+
+resource "aws_s3_bucket_policy" "alb_logs" {
+  bucket = aws_s3_bucket.alb_logs.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Effect    = "Allow"
+      Principal = { AWS = data.aws_elb_service_account.main.arn }
+      Action    = "s3:PutObject"
+      Resource  = "${aws_s3_bucket.alb_logs.arn}/alb/AWSLogs/*"
+    }]
+  })
+}
+
+# ─── Listeners ────────────────────────────────────────────────────────────────
+
 resource "aws_lb_listener" "http" {
   load_balancer_arn = aws_lb.main.arn
   port              = "80"
   protocol          = "HTTP"
+  default_action {
+    type = var.https_certificate_arn != "" ? "redirect" : "forward"
+
+    dynamic "redirect" {
+      for_each = var.https_certificate_arn != "" ? [1] : []
+      content {
+        port        = "443"
+        protocol    = "HTTPS"
+        status_code = "HTTP_301"
+      }
+    }
+
+    dynamic "forward" {
+      for_each = var.https_certificate_arn == "" ? [1] : []
+      content {
+        target_group {
+          arn = var.frontend_target_group_arn
+        }
+      }
+    }
+  }
+}
+
+resource "aws_lb_listener" "https" {
+  count             = var.https_certificate_arn != "" ? 1 : 0
+  load_balancer_arn = aws_lb.main.arn
+  port              = "443"
+  protocol          = "HTTPS"
+  ssl_policy        = "ELBSecurityPolicy-TLS13-1-2-2021-06"
+  certificate_arn   = var.https_certificate_arn
 
   default_action {
     type             = "forward"
     target_group_arn = var.frontend_target_group_arn
   }
 }
+
+# Route /api/*, /health and /v1/* to the backend. The rule is attached to the
+# HTTPS listener when a certificate is configured, otherwise to the HTTP one.
+resource "aws_lb_listener_rule" "backend_api" {
+  listener_arn = var.https_certificate_arn != "" ? aws_lb_listener.https[0].arn : aws_lb_listener.http.arn
+  priority     = 10
+
+  action {
+    type             = "forward"
+    target_group_arn = var.backend_target_group_arn
+  }
+
+  condition {
+    path_pattern {
+      values = ["/api/*", "/health", "/v1/*"]
+    }
+  }
+}
diff --git a/infra/terraform/modules/alb/outputs.tf b/infra/terraform/modules/alb/outputs.tf
index c253e35..81a47e9 100644
--- a/infra/terraform/modules/alb/outputs.tf
+++ b/infra/terraform/modules/alb/outputs.tf
@@ -8,7 +8,22 @@ output "alb_arn" {
   value       = aws_lb.main.arn
 }
 
+output "alb_zone_id" {
+  description = "ALB hosted zone ID (for Route 53 alias records)"
+  value       = aws_lb.main.zone_id
+}
+
+output "alb_security_group_id" {
+  description = "Security group ID attached to the ALB"
+  value       = aws_security_group.alb.id
+}
+
 output "http_listener_arn" {
   description = "ARN of the HTTP listener (used by API Gateway VPC Link integration)"
   value       = aws_lb_listener.http.arn
 }
+
+output "https_listener_arn" {
+  description = "ARN of the HTTPS listener (null when no certificate is provided)"
+  value       = var.https_certificate_arn != "" ?
aws_lb_listener.https[0].arn : null
+}
diff --git a/infra/terraform/modules/alb/variables.tf b/infra/terraform/modules/alb/variables.tf
index 215f168..4fc901e 100644
--- a/infra/terraform/modules/alb/variables.tf
+++ b/infra/terraform/modules/alb/variables.tf
@@ -22,3 +22,14 @@ variable "frontend_target_group_arn" {
   description = "Frontend target group ARN"
   type        = string
 }
+
+variable "https_certificate_arn" {
+  description = "ACM certificate ARN for HTTPS. When set, HTTP redirects to HTTPS."
+  type        = string
+  default     = ""
+}
+
+variable "account_id" {
+  description = "AWS account ID (used for ALB access log bucket naming)"
+  type        = string
+}
diff --git a/infra/terraform/modules/autoscaling/main.tf b/infra/terraform/modules/autoscaling/main.tf
new file mode 100644
index 0000000..a83c0f5
--- /dev/null
+++ b/infra/terraform/modules/autoscaling/main.tf
@@ -0,0 +1,101 @@
+# Per-service scaling settings. Every resource below iterates over this map so
+# the backend and frontend services are configured the same way.
+locals {
+  services = {
+    backend = {
+      service_name  = var.backend_service_name
+      min_capacity  = var.backend_min_capacity
+      max_capacity  = var.backend_max_capacity
+      cpu_target    = var.backend_cpu_target
+      memory_target = var.backend_memory_target
+    }
+    frontend = {
+      service_name  = var.frontend_service_name
+      min_capacity  = var.frontend_min_capacity
+      max_capacity  = var.frontend_max_capacity
+      cpu_target    = var.frontend_cpu_target
+      memory_target = var.frontend_memory_target
+    }
+  }
+}
+
+# ─── Application Auto Scaling Targets ─────────────────────────────────────────
+
+resource "aws_appautoscaling_target" "ecs" {
+  for_each = local.services
+
+  max_capacity       = each.value.max_capacity
+  min_capacity       = each.value.min_capacity
+  resource_id        = "service/${var.cluster_name}/${each.value.service_name}"
+  scalable_dimension = "ecs:service:DesiredCount"
+  service_namespace  = "ecs"
+}
+
+# ─── CPU Scaling Policies ─────────────────────────────────────────────────────
+
+resource "aws_appautoscaling_policy" "cpu" {
+  for_each = local.services
+
+  name               = "${var.environment}-brain-storm-${each.key}-cpu-scaling"
+  policy_type        = "TargetTrackingScaling"
+  resource_id        = aws_appautoscaling_target.ecs[each.key].resource_id
+  scalable_dimension = aws_appautoscaling_target.ecs[each.key].scalable_dimension
+  service_namespace  = aws_appautoscaling_target.ecs[each.key].service_namespace
+
+  target_tracking_scaling_policy_configuration {
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+    target_value       = each.value.cpu_target
+    scale_in_cooldown  = 300
+    scale_out_cooldown = 60
+  }
+}
+
+# ─── Memory Scaling Policies ──────────────────────────────────────────────────
+
+resource "aws_appautoscaling_policy" "memory" {
+  for_each = local.services
+
+  name               = "${var.environment}-brain-storm-${each.key}-memory-scaling"
+  policy_type        = "TargetTrackingScaling"
+  resource_id        = aws_appautoscaling_target.ecs[each.key].resource_id
+  scalable_dimension = aws_appautoscaling_target.ecs[each.key].scalable_dimension
+  service_namespace  = aws_appautoscaling_target.ecs[each.key].service_namespace
+
+  target_tracking_scaling_policy_configuration {
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+    target_value       = each.value.memory_target
+    scale_in_cooldown  = 300
+    scale_out_cooldown = 60
+  }
+}
+
+# ─── CloudWatch Alarms for Scaling Events ─────────────────────────────────────
+
+# One high-CPU alarm per service, driven by the same map as the scaling
+# policies. No alarm actions are attached, so the alarms only change state.
+resource "aws_cloudwatch_metric_alarm" "high_cpu" {
+  for_each = local.services
+
+  alarm_name          = "${var.environment}-brain-storm-${each.key}-high-cpu"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 2
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = 60
+  statistic           = "Average"
+  threshold           = each.value.cpu_target
+  alarm_description   = "${title(each.key)} ECS CPU utilization above target"
+
+  dimensions = {
+    ClusterName = var.cluster_name
+    ServiceName = each.value.service_name
+  }
+
+  tags = {
+    Environment = var.environment
+  }
+}
diff --git a/infra/terraform/modules/autoscaling/outputs.tf b/infra/terraform/modules/autoscaling/outputs.tf
new file mode 100644
index 0000000..44c1a6a
--- /dev/null
+++ b/infra/terraform/modules/autoscaling/outputs.tf
@@ -0,0 +1,9 @@
+output "backend_autoscaling_target_arn" {
+  description = "ARN of the backend Application Auto Scaling target"
+  value       = aws_appautoscaling_target.ecs["backend"].arn
+}
+
+output "frontend_autoscaling_target_arn" {
+  description = "ARN of the frontend Application Auto Scaling target"
+  value       = aws_appautoscaling_target.ecs["frontend"].arn
+}
diff --git a/infra/terraform/modules/autoscaling/variables.tf b/infra/terraform/modules/autoscaling/variables.tf
new file mode 100644
index 0000000..e3002aa
--- /dev/null
+++ b/infra/terraform/modules/autoscaling/variables.tf
@@ -0,0 +1,67 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "cluster_name" {
+  description = "ECS cluster name"
+  type        = string
+}
+
+variable "backend_service_name" {
+  description = "ECS backend service name"
+  type        = string
+}
+
+variable "frontend_service_name" {
+  description = "ECS frontend service name"
+  type        = string
+}
+
+variable "backend_min_capacity" {
+  description = "Minimum number of backend tasks"
+  type        = number
+  default     = 2
+}
+
+variable "backend_max_capacity" {
+  description = "Maximum number of backend tasks"
+  type        = number
+  default     = 10
+}
+
+variable "backend_cpu_target" {
+  description = "Target CPU utilization % to trigger backend
scaling"
+  type        = number
+  default     = 70
+}
+
+variable "backend_memory_target" {
+  description = "Target memory utilization % to trigger backend scaling"
+  type        = number
+  default     = 80
+}
+
+variable "frontend_min_capacity" {
+  description = "Minimum number of frontend tasks"
+  type        = number
+  default     = 2
+}
+
+variable "frontend_max_capacity" {
+  description = "Maximum number of frontend tasks"
+  type        = number
+  default     = 6
+}
+
+variable "frontend_cpu_target" {
+  description = "Target CPU utilization % to trigger frontend scaling"
+  type        = number
+  default     = 70
+}
+
+variable "frontend_memory_target" {
+  description = "Target memory utilization % to trigger frontend scaling"
+  type        = number
+  default     = 80
+}
diff --git a/infra/terraform/modules/ecr/main.tf b/infra/terraform/modules/ecr/main.tf
new file mode 100644
index 0000000..146b756
--- /dev/null
+++ b/infra/terraform/modules/ecr/main.tf
@@ -0,0 +1,106 @@
+# Repository names keyed by service; each resource below iterates over this map.
+locals {
+  repositories = {
+    backend  = "${var.environment}-brain-storm-backend"
+    frontend = "${var.environment}-brain-storm-frontend"
+  }
+}
+
+resource "aws_ecr_repository" "main" {
+  for_each = local.repositories
+
+  name                 = each.value
+  image_tag_mutability = "MUTABLE"
+
+  image_scanning_configuration {
+    scan_on_push = true
+  }
+
+  encryption_configuration {
+    encryption_type = "AES256"
+  }
+
+  tags = {
+    Name        = each.value
+    Service     = each.key
+    Environment = var.environment
+  }
+}
+
+resource "aws_ecr_lifecycle_policy" "main" {
+  for_each   = local.repositories
+  repository = aws_ecr_repository.main[each.key].name
+
+  policy = jsonencode({
+    rules = [
+      {
+        rulePriority = 1
+        description  = "Keep last ${var.image_retention_count} tagged production images"
+        selection = {
+          tagStatus     = "tagged"
+          tagPrefixList = ["prod-", "v"]
+          countType     = "imageCountMoreThan"
+          countNumber   = var.image_retention_count
+        }
+        action = { type = "expire" }
+      },
+      {
+        rulePriority = 2
+        description  = "Expire untagged images older than 14 days"
+        selection = {
+          tagStatus   = "untagged"
+          countType   = "sinceImagePushed"
+          countUnit   = "days"
+          countNumber = 14
+        }
+        action = { type = "expire" }
+      }
+    ]
+  })
+}
+
+resource "aws_ecr_repository_policy" "main" {
+  for_each   = local.repositories
+  repository = aws_ecr_repository.main[each.key].name
+
+  # The push statement is appended only when a role ARN is supplied: a
+  # statement whose principal list is empty makes the whole policy invalid.
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = concat(
+      [
+        {
+          Sid    = "AllowECSPull"
+          Effect = "Allow"
+          Principal = {
+            Service = "ecs-tasks.amazonaws.com"
+          }
+          Action = [
+            "ecr:GetDownloadUrlForLayer",
+            "ecr:BatchGetImage",
+            "ecr:BatchCheckLayerAvailability"
+          ]
+        }
+      ],
+      var.github_actions_role_arn != "" ? [
+        {
+          Sid    = "AllowGitHubActionsPush"
+          Effect = "Allow"
+          Principal = {
+            AWS = [var.github_actions_role_arn]
+          }
+          Action = [
+            "ecr:GetAuthorizationToken",
+            "ecr:BatchCheckLayerAvailability",
+            "ecr:GetDownloadUrlForLayer",
+            "ecr:BatchGetImage",
+            "ecr:PutImage",
+            "ecr:InitiateLayerUpload",
+            "ecr:UploadLayerPart",
+            "ecr:CompleteLayerUpload"
+          ]
+        }
+      ] : []
+    )
+  })
+}
diff --git a/infra/terraform/modules/ecr/outputs.tf b/infra/terraform/modules/ecr/outputs.tf
new file mode 100644
index 0000000..7f6c252
--- /dev/null
+++ b/infra/terraform/modules/ecr/outputs.tf
@@ -0,0 +1,24 @@
+output "backend_repository_url" {
+  description = "ECR repository URL for the backend image"
+  value       = aws_ecr_repository.main["backend"].repository_url
+}
+
+output "frontend_repository_url" {
+  description = "ECR repository URL for the frontend image"
+  value       = aws_ecr_repository.main["frontend"].repository_url
+}
+
+output "backend_repository_arn" {
+  description = "ECR repository ARN for the backend image"
+  value       = aws_ecr_repository.main["backend"].arn
+}
+
+output "frontend_repository_arn" {
+  description = "ECR repository ARN for the frontend image"
+  value       = aws_ecr_repository.main["frontend"].arn
+}
+
+output "registry_id" {
+  description = "ECR registry ID (AWS account ID)"
+  value       = aws_ecr_repository.main["backend"].registry_id
+}
diff --git a/infra/terraform/modules/ecr/variables.tf
b/infra/terraform/modules/ecr/variables.tf new file mode 100644 index 0000000..4353bf1 --- /dev/null +++ b/infra/terraform/modules/ecr/variables.tf @@ -0,0 +1,16 @@ +variable "environment" { + description = "Environment name" + type = string +} + +variable "image_retention_count" { + description = "Number of tagged images to retain per repository" + type = number + default = 10 +} + +variable "github_actions_role_arn" { + description = "IAM role ARN for GitHub Actions (granted push access to ECR)" + type = string + default = "" +} diff --git a/infra/terraform/modules/ecs/main.tf b/infra/terraform/modules/ecs/main.tf index ceacbd5..e5bb1a3 100644 --- a/infra/terraform/modules/ecs/main.tf +++ b/infra/terraform/modules/ecs/main.tf @@ -12,86 +12,127 @@ resource "aws_ecs_cluster" "main" { } } -resource "aws_ecs_task_definition" "backend" { - family = "${var.environment}-brain-storm-backend" - network_mode = "awsvpc" - requires_compatibilities = ["FARGATE"] - cpu = "512" - memory = "1024" - execution_role_arn = aws_iam_role.ecs_execution.arn - task_role_arn = aws_iam_role.ecs_task.arn +resource "aws_ecs_cluster_capacity_providers" "main" { + cluster_name = aws_ecs_cluster.main.name - container_definitions = jsonencode([{ - name = "backend" - image = var.backend_image + capacity_providers = ["FARGATE", "FARGATE_SPOT"] - portMappings = [{ - containerPort = 3000 - protocol = "tcp" - }] + default_capacity_provider_strategy { + base = 1 + weight = 100 + capacity_provider = "FARGATE" + } +} - environment = [ - { name = "PORT", value = "3000" }, - { name = "DATABASE_HOST", value = var.db_host }, - { name = "REDIS_URL", value = "redis://${var.redis_host}:6379" } - ] +# ─── IAM Roles ─────────────────────────────────────────────────────────────── - logConfiguration = { - logDriver = "awslogs" - options = { - "awslogs-group" = aws_cloudwatch_log_group.backend.name - "awslogs-region" = data.aws_region.current.name - "awslogs-stream-prefix" = "backend" - } - } +resource 
"aws_iam_role" "ecs_execution" { + name = "${var.environment}-brain-storm-ecs-execution-role" - healthCheck = { - command = ["CMD-SHELL", "curl -f http://localhost:3000/health || exit 1"] - interval = 30 - timeout = 5 - retries = 3 - startPeriod = 10 - } - }]) + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + }] + }) tags = { - Name = "${var.environment}-brain-storm-backend-task" + Name = "${var.environment}-brain-storm-ecs-execution-role" Environment = var.environment } } -resource "aws_ecs_service" "backend" { - name = "${var.environment}-brain-storm-backend" - cluster = aws_ecs_cluster.main.id - task_definition = aws_ecs_task_definition.backend.arn - desired_count = 2 - launch_type = "FARGATE" +resource "aws_iam_role_policy_attachment" "ecs_execution" { + role = aws_iam_role.ecs_execution.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} - network_configuration { - subnets = var.private_subnet_ids - security_groups = [aws_security_group.ecs_backend.id] - assign_public_ip = false - } +resource "aws_iam_role_policy" "ecs_execution_secrets" { + name = "${var.environment}-brain-storm-ecs-execution-secrets" + role = aws_iam_role.ecs_execution.id - load_balancer { - target_group_arn = aws_lb_target_group.backend.arn - container_name = "backend" - container_port = 3000 + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "secretsmanager:GetSecretValue", + "ssm:GetParameters", + "kms:Decrypt" + ] + Resource = "*" + }] + }) +} + +resource "aws_iam_role" "ecs_task" { + name = "${var.environment}-brain-storm-ecs-task-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + }] + }) + + tags = { + Name = 
"${var.environment}-brain-storm-ecs-task-role" + Environment = var.environment } +} - depends_on = [aws_lb_target_group.backend] +resource "aws_iam_role_policy" "ecs_task" { + name = "${var.environment}-brain-storm-ecs-task-policy" + role = aws_iam_role.ecs_task.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "arn:aws:logs:*:*:log-group:/ecs/${var.environment}-brain-storm-*" + }, + { + Effect = "Allow" + Action = [ + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage" + ] + Resource = "*" + } + ] + }) } +data "aws_region" "current" {} + +# ─── Security Groups ───────────────────────────────────────────────────────── + resource "aws_security_group" "ecs_backend" { name = "${var.environment}-brain-storm-ecs-backend-sg" description = "Security group for ECS backend tasks" vpc_id = var.vpc_id ingress { - from_port = 3000 - to_port = 3000 - protocol = "tcp" - cidr_blocks = ["10.0.0.0/16"] + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + description = "Backend API access from within VPC" } egress { @@ -107,8 +148,58 @@ resource "aws_security_group" "ecs_backend" { } } +resource "aws_security_group" "ecs_frontend" { + name = "${var.environment}-brain-storm-ecs-frontend-sg" + description = "Security group for ECS frontend tasks" + vpc_id = var.vpc_id + + ingress { + from_port = 3001 + to_port = 3001 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + description = "Frontend access from within VPC" + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { + Name = "${var.environment}-brain-storm-ecs-frontend-sg" + Environment = var.environment + } +} + +# ─── CloudWatch Log Groups ──────────────────────────────────────────────────── + +resource "aws_cloudwatch_log_group" "backend" { + name = 
"/ecs/${var.environment}-brain-storm-backend" + retention_in_days = var.environment == "prod" ? 30 : 7 + + tags = { + Name = "${var.environment}-brain-storm-backend-logs" + Environment = var.environment + } +} + +resource "aws_cloudwatch_log_group" "frontend" { + name = "/ecs/${var.environment}-brain-storm-frontend" + retention_in_days = var.environment == "prod" ? 30 : 7 + + tags = { + Name = "${var.environment}-brain-storm-frontend-logs" + Environment = var.environment + } +} + +# ─── Target Groups ─────────────────────────────────────────────────────────── + resource "aws_lb_target_group" "backend" { - name = "${var.environment}-brain-storm-backend-tg" + name = "${var.environment}-bs-backend-tg" port = 3000 protocol = "HTTP" vpc_id = var.vpc_id @@ -120,77 +211,107 @@ resource "aws_lb_target_group" "backend" { unhealthy_threshold = 3 timeout = 5 interval = 30 + matcher = "200" } + deregistration_delay = 30 + tags = { Name = "${var.environment}-brain-storm-backend-tg" Environment = var.environment } } -resource "aws_cloudwatch_log_group" "backend" { - name = "/ecs/${var.environment}-brain-storm-backend" - retention_in_days = 7 +resource "aws_lb_target_group" "frontend" { + name = "${var.environment}-bs-frontend-tg" + port = 3001 + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" - tags = { - Name = "${var.environment}-brain-storm-backend-logs" - Environment = var.environment + health_check { + path = "/api/health" + healthy_threshold = 2 + unhealthy_threshold = 3 + timeout = 5 + interval = 30 + matcher = "200" } -} -resource "aws_iam_role" "ecs_execution" { - name = "${var.environment}-brain-storm-ecs-execution-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = "ecs-tasks.amazonaws.com" - } - }] - }) + deregistration_delay = 30 tags = { - Name = "${var.environment}-brain-storm-ecs-execution-role" + Name = 
"${var.environment}-brain-storm-frontend-tg" Environment = var.environment } } -resource "aws_iam_role_policy_attachment" "ecs_execution" { - role = aws_iam_role.ecs_execution.name - policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" -} +# ─── Task Definitions ───────────────────────────────────────────────────────── -resource "aws_iam_role" "ecs_task" { - name = "${var.environment}-brain-storm-ecs-task-role" +resource "aws_ecs_task_definition" "backend" { + family = "${var.environment}-brain-storm-backend" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.backend_cpu + memory = var.backend_memory + execution_role_arn = aws_iam_role.ecs_execution.arn + task_role_arn = aws_iam_role.ecs_task.arn - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = "ecs-tasks.amazonaws.com" - } + container_definitions = jsonencode([{ + name = "backend" + image = var.backend_image + + portMappings = [{ + containerPort = 3000 + protocol = "tcp" }] - }) + + environment = [ + { name = "PORT", value = "3000" }, + { name = "NODE_ENV", value = var.environment }, + { name = "DATABASE_HOST", value = var.db_host }, + { name = "DATABASE_PORT", value = "5432" }, + { name = "DATABASE_NAME", value = var.db_name }, + { name = "DATABASE_USERNAME", value = var.db_username }, + { name = "REDIS_HOST", value = var.redis_host }, + { name = "REDIS_PORT", value = "6379" } + ] + + secrets = var.backend_secrets + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.backend.name + "awslogs-region" = data.aws_region.current.name + "awslogs-stream-prefix" = "backend" + } + } + + healthCheck = { + command = ["CMD-SHELL", "curl -f http://localhost:3000/health || exit 1"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } + + readonlyRootFilesystem = false + essential = true + }]) tags = 
{ - Name = "${var.environment}-brain-storm-ecs-task-role" + Name = "${var.environment}-brain-storm-backend-task" Environment = var.environment } } -data "aws_region" "current" {} - resource "aws_ecs_task_definition" "frontend" { family = "${var.environment}-brain-storm-frontend" network_mode = "awsvpc" requires_compatibilities = ["FARGATE"] - cpu = "256" - memory = "512" + cpu = var.frontend_cpu + memory = var.frontend_memory execution_role_arn = aws_iam_role.ecs_execution.arn task_role_arn = aws_iam_role.ecs_task.arn @@ -204,7 +325,9 @@ resource "aws_ecs_task_definition" "frontend" { }] environment = [ - { name = "NEXT_PUBLIC_API_URL", value = "http://backend:3000" } + { name = "NODE_ENV", value = "production" }, # Next.js only supports the standard NODE_ENV values; var.environment is dev/staging/prod + { name = "PORT", value = "3001" }, + { name = "NEXT_PUBLIC_API_URL", value = var.api_base_url } ] logConfiguration = { @@ -221,8 +344,11 @@ resource "aws_ecs_task_definition" "frontend" { interval = 30 timeout = 5 retries = 3 - startPeriod = 10 + startPeriod = 60 } + + readonlyRootFilesystem = false + essential = true }]) tags = { @@ -231,80 +357,88 @@ resource "aws_ecs_task_definition" "frontend" { } } -resource "aws_ecs_service" "frontend" { - name = "${var.environment}-brain-storm-frontend" - cluster = aws_ecs_cluster.main.id - task_definition = aws_ecs_task_definition.frontend.arn - desired_count = 2 - launch_type = "FARGATE" +# ─── ECS Services ──────────────────────────────────────────────────────────── + +resource "aws_ecs_service" "backend" { + name = "${var.environment}-brain-storm-backend" + cluster = aws_ecs_cluster.main.id + task_definition = aws_ecs_task_definition.backend.arn + desired_count = var.backend_desired_count + launch_type = "FARGATE" + health_check_grace_period_seconds = 60 + enable_execute_command = var.environment != "prod" network_configuration { subnets = var.private_subnet_ids - security_groups = [aws_security_group.ecs_frontend.id] + security_groups = [aws_security_group.ecs_backend.id] assign_public_ip = false }
load_balancer { - target_group_arn = aws_lb_target_group.frontend.arn - container_name = "frontend" - container_port = 3001 + target_group_arn = aws_lb_target_group.backend.arn + container_name = "backend" + container_port = 3000 } - depends_on = [aws_lb_target_group.frontend] -} - -resource "aws_security_group" "ecs_frontend" { - name = "${var.environment}-brain-storm-ecs-frontend-sg" - description = "Security group for ECS frontend tasks" - vpc_id = var.vpc_id + deployment_circuit_breaker { + enable = true + rollback = true + } - ingress { - from_port = 3001 - to_port = 3001 - protocol = "tcp" - cidr_blocks = ["10.0.0.0/16"] + deployment_controller { + type = "ECS" } - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] + depends_on = [aws_lb_target_group.backend] + + lifecycle { + ignore_changes = [task_definition, desired_count] } tags = { - Name = "${var.environment}-brain-storm-ecs-frontend-sg" + Name = "${var.environment}-brain-storm-backend" Environment = var.environment } } -resource "aws_lb_target_group" "frontend" { - name = "${var.environment}-brain-storm-frontend-tg" - port = 3001 - protocol = "HTTP" - vpc_id = var.vpc_id - target_type = "ip" +resource "aws_ecs_service" "frontend" { + name = "${var.environment}-brain-storm-frontend" + cluster = aws_ecs_cluster.main.id + task_definition = aws_ecs_task_definition.frontend.arn + desired_count = var.frontend_desired_count + launch_type = "FARGATE" + health_check_grace_period_seconds = 60 + enable_execute_command = var.environment != "prod" - health_check { - path = "/api/health" - healthy_threshold = 2 - unhealthy_threshold = 3 - timeout = 5 - interval = 30 + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.ecs_frontend.id] + assign_public_ip = false } - tags = { - Name = "${var.environment}-brain-storm-frontend-tg" - Environment = var.environment + load_balancer { + target_group_arn = aws_lb_target_group.frontend.arn + 
container_name = "frontend" + container_port = 3001 } -} -resource "aws_cloudwatch_log_group" "frontend" { - name = "/ecs/${var.environment}-brain-storm-frontend" - retention_in_days = 7 + deployment_circuit_breaker { + enable = true + rollback = true + } + + deployment_controller { + type = "ECS" + } + + depends_on = [aws_lb_target_group.frontend] + + lifecycle { + ignore_changes = [task_definition, desired_count] + } tags = { - Name = "${var.environment}-brain-storm-frontend-logs" + Name = "${var.environment}-brain-storm-frontend" Environment = var.environment } } diff --git a/infra/terraform/modules/ecs/outputs.tf b/infra/terraform/modules/ecs/outputs.tf index 831c965..6b4c884 100644 --- a/infra/terraform/modules/ecs/outputs.tf +++ b/infra/terraform/modules/ecs/outputs.tf @@ -3,6 +3,11 @@ output "cluster_id" { value = aws_ecs_cluster.main.id } +output "cluster_name" { + description = "ECS cluster name" + value = aws_ecs_cluster.main.name +} + output "backend_target_group_arn" { description = "Backend target group ARN" value = aws_lb_target_group.backend.arn @@ -12,3 +17,33 @@ output "frontend_target_group_arn" { description = "Frontend target group ARN" value = aws_lb_target_group.frontend.arn } + +output "backend_security_group_id" { + description = "Security group ID for ECS backend tasks" + value = aws_security_group.ecs_backend.id +} + +output "frontend_security_group_id" { + description = "Security group ID for ECS frontend tasks" + value = aws_security_group.ecs_frontend.id +} + +output "backend_service_name" { + description = "ECS backend service name (for auto-scaling)" + value = aws_ecs_service.backend.name +} + +output "frontend_service_name" { + description = "ECS frontend service name (for auto-scaling)" + value = aws_ecs_service.frontend.name +} + +output "ecs_task_role_arn" { + description = "ECS task IAM role ARN" + value = aws_iam_role.ecs_task.arn +} + +output "ecs_execution_role_arn" { + description = "ECS task execution IAM role ARN" + value = 
aws_iam_role.ecs_execution.arn +} diff --git a/infra/terraform/modules/ecs/variables.tf b/infra/terraform/modules/ecs/variables.tf index 9ea18a3..0f6ee9d 100644 --- a/infra/terraform/modules/ecs/variables.tf +++ b/infra/terraform/modules/ecs/variables.tf @@ -8,6 +8,11 @@ variable "vpc_id" { type = string } +variable "vpc_cidr" { + description = "VPC CIDR block for security group ingress" + type = string +} + variable "private_subnet_ids" { description = "Private subnet IDs" type = list(string) @@ -19,21 +24,85 @@ variable "public_subnet_ids" { } variable "backend_image" { - description = "Backend Docker image" + description = "Backend Docker image URI" type = string } variable "frontend_image" { - description = "Frontend Docker image" + description = "Frontend Docker image URI" type = string } variable "db_host" { - description = "Database host" + description = "Database host endpoint" + type = string +} + +variable "db_name" { + description = "Database name" type = string + default = "brainstorm" +} + +variable "db_username" { + description = "Database username" + type = string + default = "admin" + sensitive = true } variable "redis_host" { - description = "Redis host" + description = "Redis host endpoint" + type = string +} + +variable "api_base_url" { + description = "Public API base URL for the frontend (NEXT_PUBLIC_API_URL)" + type = string + default = "" +} + +variable "backend_secrets" { + description = "Secrets to inject into the backend container from Secrets Manager" + type = list(object({ + name = string + valueFrom = string + })) + default = [] +} + +variable "backend_cpu" { + description = "CPU units for backend Fargate task (256, 512, 1024, 2048, 4096)" + type = string + default = "512" +} + +variable "backend_memory" { + description = "Memory (MiB) for backend Fargate task" + type = string + default = "1024" +} + +variable "frontend_cpu" { + description = "CPU units for frontend Fargate task" type = string + default = "256" +} + +variable 
"frontend_memory" { + description = "Memory (MiB) for frontend Fargate task" + type = string + default = "512" +} + +variable "backend_desired_count" { + description = "Desired number of backend task replicas" + type = number + default = 2 +} + +variable "frontend_desired_count" { + description = "Desired number of frontend task replicas" + type = number + default = 2 } diff --git a/infra/terraform/modules/elasticache/main.tf b/infra/terraform/modules/elasticache/main.tf index 00f38a0..0354a6e 100644 --- a/infra/terraform/modules/elasticache/main.tf +++ b/infra/terraform/modules/elasticache/main.tf @@ -17,7 +17,8 @@ resource "aws_security_group" "redis" { from_port = 6379 to_port = 6379 protocol = "tcp" - cidr_blocks = ["10.0.0.0/16"] + cidr_blocks = [var.vpc_cidr] + description = "Redis access from within VPC" } egress { @@ -34,28 +35,47 @@ resource "aws_security_group" "redis" { } resource "aws_elasticache_replication_group" "main" { - replication_group_id = "${var.environment}-brain-storm-redis" - replication_group_description = "Redis cluster for Brain Storm ${var.environment}" - + replication_group_id = "${var.environment}-brain-storm-redis" + description = "Redis cluster for Brain Storm ${var.environment}" + engine = "redis" - engine_version = "7.0" + engine_version = "7.1" node_type = var.node_type - num_cache_clusters = 2 + num_cache_clusters = var.environment == "prod" ? 
2 : 1 parameter_group_name = "default.redis7" port = 6379 subnet_group_name = aws_elasticache_subnet_group.main.name security_group_ids = [aws_security_group.redis.id] - automatic_failover_enabled = true + automatic_failover_enabled = var.environment == "prod" + multi_az_enabled = var.environment == "prod" at_rest_encryption_enabled = true transit_encryption_enabled = true snapshot_retention_limit = 5 snapshot_window = "03:00-05:00" + maintenance_window = "mon:05:00-mon:06:00" + + log_delivery_configuration { + destination = aws_cloudwatch_log_group.redis.name + destination_type = "cloudwatch-logs" + log_format = "text" + log_type = "slow-log" + } tags = { Name = "${var.environment}-brain-storm-redis" Environment = var.environment } } + +resource "aws_cloudwatch_log_group" "redis" { + name = "/elasticache/${var.environment}-brain-storm/redis" + retention_in_days = 14 + + tags = { + Name = "${var.environment}-brain-storm-redis-logs" + Environment = var.environment + } +} diff --git a/infra/terraform/modules/elasticache/variables.tf b/infra/terraform/modules/elasticache/variables.tf index 27ef20a..c103943 100644 --- a/infra/terraform/modules/elasticache/variables.tf +++ b/infra/terraform/modules/elasticache/variables.tf @@ -17,3 +17,8 @@ variable "node_type" { description = "ElastiCache node type" type = string } + +variable "vpc_cidr" { + description = "VPC CIDR block for security group ingress" + type = string +} diff --git a/infra/terraform/modules/rds/main.tf b/infra/terraform/modules/rds/main.tf index b8ff960..988b5ea 100644 --- a/infra/terraform/modules/rds/main.tf +++ b/infra/terraform/modules/rds/main.tf @@ -17,7 +17,8 @@ resource "aws_security_group" "rds" { from_port = 5432 to_port = 5432 protocol = "tcp" - cidr_blocks = ["10.0.0.0/16"] + cidr_blocks = [var.vpc_cidr] + description = "PostgreSQL access from within VPC" } egress { @@ -33,14 +34,40 @@ resource "aws_security_group" "rds" { } } +resource "aws_iam_role" "rds_monitoring" { + name = 
"${var.environment}-brain-storm-rds-monitoring-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "monitoring.rds.amazonaws.com" + } + }] + }) + + tags = { + Name = "${var.environment}-brain-storm-rds-monitoring-role" + Environment = var.environment + } +} + +resource "aws_iam_role_policy_attachment" "rds_monitoring" { + role = aws_iam_role.rds_monitoring.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" +} + resource "aws_db_instance" "main" { identifier = "${var.environment}-brain-storm-db" engine = "postgres" - engine_version = "16.1" + engine_version = "16.3" instance_class = var.db_instance_class allocated_storage = 20 max_allocated_storage = 100 + storage_type = "gp3" storage_encrypted = true db_name = var.db_name @@ -50,10 +77,25 @@ resource "aws_db_instance" "main" { db_subnet_group_name = aws_db_subnet_group.main.name vpc_security_group_ids = [aws_security_group.rds.id] - backup_retention_period = 7 + multi_az = var.multi_az + publicly_accessible = false + deletion_protection = var.environment == "prod" + + backup_retention_period = var.environment == "prod" ? 14 : 7 backup_window = "03:00-04:00" maintenance_window = "mon:04:00-mon:05:00" + monitoring_interval = var.monitoring_interval + monitoring_role_arn = var.monitoring_interval > 0 ? aws_iam_role.rds_monitoring.arn : null + + performance_insights_enabled = var.environment == "prod" + performance_insights_retention_period = var.environment == "prod" ? 
7 : 0 + + enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] + + auto_minor_version_upgrade = true + copy_tags_to_snapshot = true + skip_final_snapshot = var.environment != "prod" final_snapshot_identifier = "${var.environment}-brain-storm-db-final-snapshot" diff --git a/infra/terraform/modules/rds/variables.tf b/infra/terraform/modules/rds/variables.tf index 7c5e3f0..4538b9b 100644 --- a/infra/terraform/modules/rds/variables.tf +++ b/infra/terraform/modules/rds/variables.tf @@ -8,6 +8,11 @@ variable "vpc_id" { type = string } +variable "vpc_cidr" { + description = "VPC CIDR block for security group ingress" + type = string +} + variable "private_subnet_ids" { description = "Private subnet IDs" type = list(string) @@ -34,3 +39,15 @@ variable "db_instance_class" { description = "RDS instance class" type = string } + +variable "multi_az" { + description = "Enable Multi-AZ deployment for high availability" + type = bool + default = false +} + +variable "monitoring_interval" { + description = "Enhanced monitoring interval in seconds (0 to disable)" + type = number + default = 60 +} diff --git a/infra/terraform/modules/vpc/main.tf b/infra/terraform/modules/vpc/main.tf index 7800aa9..02ed820 100644 --- a/infra/terraform/modules/vpc/main.tf +++ b/infra/terraform/modules/vpc/main.tf @@ -108,3 +108,70 @@ resource "aws_route_table_association" "private" { data "aws_availability_zones" "available" { state = "available" } + +# VPC Flow Logs +resource "aws_flow_log" "main" { + count = var.enable_flow_logs ? 1 : 0 + iam_role_arn = aws_iam_role.flow_log[0].arn + log_destination = aws_cloudwatch_log_group.flow_log[0].arn + traffic_type = "ALL" + vpc_id = aws_vpc.main.id + + tags = { + Name = "${var.environment}-brain-storm-flow-log" + Environment = var.environment + } +} + +resource "aws_cloudwatch_log_group" "flow_log" { + count = var.enable_flow_logs ? 
1 : 0 + name = "/vpc/${var.environment}-brain-storm/flow-logs" + retention_in_days = 30 + + tags = { + Name = "${var.environment}-brain-storm-flow-logs" + Environment = var.environment + } +} + +resource "aws_iam_role" "flow_log" { + count = var.enable_flow_logs ? 1 : 0 + name = "${var.environment}-brain-storm-flow-log-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "vpc-flow-logs.amazonaws.com" + } + }] + }) + + tags = { + Name = "${var.environment}-brain-storm-flow-log-role" + Environment = var.environment + } +} + +resource "aws_iam_role_policy" "flow_log" { + count = var.enable_flow_logs ? 1 : 0 + name = "${var.environment}-brain-storm-flow-log-policy" + role = aws_iam_role.flow_log[0].id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams" + ] + Resource = "*" + }] + }) +} diff --git a/infra/terraform/modules/vpc/outputs.tf b/infra/terraform/modules/vpc/outputs.tf index 3089bed..44829f9 100644 --- a/infra/terraform/modules/vpc/outputs.tf +++ b/infra/terraform/modules/vpc/outputs.tf @@ -12,3 +12,8 @@ output "private_subnet_ids" { description = "Private subnet IDs" value = aws_subnet.private[*].id } + +output "vpc_cidr" { + description = "VPC CIDR block" + value = aws_vpc.main.cidr_block +} diff --git a/infra/terraform/modules/vpc/variables.tf b/infra/terraform/modules/vpc/variables.tf index 42e9dbb..f7742c8 100644 --- a/infra/terraform/modules/vpc/variables.tf +++ b/infra/terraform/modules/vpc/variables.tf @@ -7,3 +7,9 @@ variable "vpc_cidr" { description = "CIDR block for VPC" type = string } + +variable "enable_flow_logs" { + description = "Enable VPC flow logs to CloudWatch" + type = bool + default = true +} diff --git a/infra/terraform/outputs.tf 
b/infra/terraform/outputs.tf index 642904a..a56e3ed 100644 --- a/infra/terraform/outputs.tf +++ b/infra/terraform/outputs.tf @@ -3,11 +3,21 @@ output "vpc_id" { value = module.vpc.vpc_id } +output "vpc_cidr" { + description = "VPC CIDR block" + value = module.vpc.vpc_cidr +} + output "alb_dns_name" { description = "Application Load Balancer DNS name" value = module.alb.alb_dns_name } +output "alb_zone_id" { + description = "ALB hosted zone ID (use for Route 53 alias records)" + value = module.alb.alb_zone_id +} + output "db_endpoint" { description = "RDS database endpoint" value = module.rds.db_endpoint @@ -20,6 +30,21 @@ output "redis_endpoint" { sensitive = true } +output "ecs_cluster_name" { + description = "ECS cluster name" + value = module.ecs.cluster_name +} + +output "backend_repository_url" { + description = "ECR repository URL for backend — use as backend_image in CI/CD" + value = module.ecr.backend_repository_url +} + +output "frontend_repository_url" { + description = "ECR repository URL for frontend — use as frontend_image in CI/CD" + value = module.ecr.frontend_repository_url +} + output "github_actions_role_arn" { description = "IAM role ARN for GitHub Actions OIDC — set as AWS_ROLE_ARN secret" value = module.oidc.role_arn @@ -29,3 +54,15 @@ output "api_gateway_endpoint" { description = "API Gateway invoke URL" value = module.api_gateway.api_gateway_endpoint } + +output "db_password_secret_arn" { + description = "Secrets Manager ARN for the database password" + value = module.secrets.db_password_secret_arn + sensitive = true +} + +output "jwt_secret_arn" { + description = "Secrets Manager ARN for the JWT signing secret" + value = module.secrets.jwt_secret_arn + sensitive = true +} diff --git a/infra/terraform/terraform.tfvars.example b/infra/terraform/terraform.tfvars.example index bd709d9..92d1662 100644 --- a/infra/terraform/terraform.tfvars.example +++ b/infra/terraform/terraform.tfvars.example @@ -1,23 +1,61 @@ -# AWS Configuration +# ─── Core 
───────────────────────────────────────────────────────────────────── aws_region = "us-east-1" environment = "prod" +account_id = "123456789012" -# Network Configuration -vpc_cidr = "10.0.0.0/16" +# ─── Networking ─────────────────────────────────────────────────────────────── +vpc_cidr = "10.0.0.0/16" +enable_flow_logs = true -# Database Configuration +# ─── Database ───────────────────────────────────────────────────────────────── db_name = "brainstorm" -db_username = "admin" -db_password = "change-me-in-production" -db_instance_class = "db.t3.micro" +db_username = "brainstorm_admin" +db_password = "CHANGE_ME_use_secrets_manager_in_prod" +db_instance_class = "db.t3.micro" # db.t3.medium or larger for prod +rds_multi_az = false # set true for prod -# Redis Configuration -redis_node_type = "cache.t3.micro" +# ─── Redis ──────────────────────────────────────────────────────────────────── +redis_node_type = "cache.t3.micro" # cache.t3.medium or larger for prod -# Application Images -backend_image = "your-registry/brain-storm-backend:latest" -frontend_image = "your-registry/brain-storm-frontend:latest" +# ─── ECR ────────────────────────────────────────────────────────────────────── +ecr_image_retention_count = 10 -# GitHub OIDC +# ─── ECS / Application ──────────────────────────────────────────────────────── +# Leave empty to default to the ECR repositories created by this config. +# Override with an explicit URI (e.g. from a previous build) if needed. +backend_image = "" +frontend_image = "" + +# Public URL the browser-side frontend JavaScript will use for API calls. +# Set to your custom domain (e.g. "https://api.brainstorm.example.com") +# or to the api_gateway_endpoint output after first apply. 
+api_base_url = "https://api.brainstorm.example.com"
+
+ecs_backend_desired_count  = 2
+ecs_frontend_desired_count = 2
+
+# ─── Auto Scaling ─────────────────────────────────────────────────────────────
+backend_min_capacity  = 2
+backend_max_capacity  = 10
+frontend_min_capacity = 2
+frontend_max_capacity = 6
+
+# ─── HTTPS ────────────────────────────────────────────────────────────────────
+# Optional: provide an ACM certificate ARN to enable HTTPS on the ALB.
+# When set, HTTP traffic is automatically redirected to HTTPS.
+# Leave empty for HTTP-only (not recommended in production).
+https_certificate_arn = ""
+# https_certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/..."
+
+# ─── GitHub OIDC ──────────────────────────────────────────────────────────────
 github_org  = "your-github-org"
 github_repo = "Brain-Storm"
+
+# ─── Secrets (use environment variables or a .tfvars file NOT committed to git)
+# TF_VAR_db_password, TF_VAR_jwt_secret, TF_VAR_stellar_secret_key
+jwt_secret         = "CHANGE_ME_generate_with_openssl_rand_hex_64"
+stellar_secret_key = "CHANGE_ME_your_stellar_secret_key"
+
+# ─── Observability ────────────────────────────────────────────────────────────
+alert_sns_arns           = [] # ["arn:aws:sns:us-east-1:123456789012:brain-storm-alerts"]
+api_gateway_cors_origins = ["https://brainstorm.example.com"]
diff --git a/infra/terraform/variables.tf b/infra/terraform/variables.tf
index a3d5bd6..bcd2c6e 100644
--- a/infra/terraform/variables.tf
+++ b/infra/terraform/variables.tf
@@ -7,14 +7,34 @@ variable "aws_region" {
 variable "environment" {
   description = "Environment name (dev, staging, prod)"
   type        = string
+
+  validation {
+    condition     = contains(["dev", "staging", "prod"], var.environment)
+    error_message = "environment must be one of: dev, staging, prod."
+  }
+}
+
+variable "account_id" {
+  description = "AWS account ID"
+  type        = string
 }
 
+# ─── Networking ───────────────────────────────────────────────────────────────
+
 variable "vpc_cidr" {
   description = "CIDR block for VPC"
   type        = string
   default     = "10.0.0.0/16"
 }
 
+variable "enable_flow_logs" {
+  description = "Enable VPC flow logs to CloudWatch"
+  type        = bool
+  default     = true
+}
+
+# ─── Database ─────────────────────────────────────────────────────────────────
+
 variable "db_name" {
   description = "Database name"
   type        = string
@@ -39,22 +59,102 @@ variable "db_instance_class" {
   default     = "db.t3.micro"
 }
 
+variable "rds_multi_az" {
+  description = "Enable Multi-AZ RDS for high availability (recommended for prod)"
+  type        = bool
+  default     = false
+}
+
+variable "rds_monitoring_interval" {
+  description = "Enhanced RDS monitoring interval in seconds (0 to disable; valid values: 0, 1, 5, 10, 15, 30, 60)"
+  type        = number
+  default     = 60
+}
+
+# ─── Redis ────────────────────────────────────────────────────────────────────
+
 variable "redis_node_type" {
   description = "ElastiCache node type"
   type        = string
   default     = "cache.t3.micro"
 }
 
+# ─── ECR ──────────────────────────────────────────────────────────────────────
+
+variable "ecr_image_retention_count" {
+  description = "Number of tagged images to retain per ECR repository"
+  type        = number
+  default     = 10
+}
+
+# ─── ECS / Application ────────────────────────────────────────────────────────
+
 variable "backend_image" {
-  description = "Docker image for backend service"
+  description = "Docker image URI for backend service. Defaults to ECR repo when empty."
   type        = string
+  default     = ""
 }
 
 variable "frontend_image" {
-  description = "Docker image for frontend service"
+  description = "Docker image URI for frontend service. Defaults to ECR repo when empty."
   type        = string
+  default     = ""
 }
 
+variable "api_base_url" {
+  description = "Public API base URL injected as NEXT_PUBLIC_API_URL into the frontend container"
+  type        = string
+  default     = ""
+}
+
+variable "ecs_backend_desired_count" {
+  description = "Initial desired count for the backend ECS service"
+  type        = number
+  default     = 2
+}
+
+variable "ecs_frontend_desired_count" {
+  description = "Initial desired count for the frontend ECS service"
+  type        = number
+  default     = 2
+}
+
+# ─── Auto Scaling ─────────────────────────────────────────────────────────────
+
+variable "backend_min_capacity" {
+  description = "Minimum number of backend ECS tasks"
+  type        = number
+  default     = 2
+}
+
+variable "backend_max_capacity" {
+  description = "Maximum number of backend ECS tasks"
+  type        = number
+  default     = 10
+}
+
+variable "frontend_min_capacity" {
+  description = "Minimum number of frontend ECS tasks"
+  type        = number
+  default     = 2
+}
+
+variable "frontend_max_capacity" {
+  description = "Maximum number of frontend ECS tasks"
+  type        = number
+  default     = 6
+}
+
+# ─── ALB / HTTPS ──────────────────────────────────────────────────────────────
+
+variable "https_certificate_arn" {
+  description = "ACM certificate ARN for HTTPS. HTTP redirects to HTTPS when set."
+  type        = string
+  default     = ""
+}
+
+# ─── GitHub OIDC ──────────────────────────────────────────────────────────────
+
 variable "github_org" {
   description = "GitHub organization or username owning this repo"
   type        = string
@@ -66,10 +166,7 @@ variable "github_repo" {
   default     = "Brain-Storm"
 }
 
-variable "account_id" {
-  description = "AWS account ID"
-  type        = string
-}
+# ─── Secrets ──────────────────────────────────────────────────────────────────
 
 variable "jwt_secret" {
   description = "JWT signing secret"
@@ -83,8 +180,10 @@ variable "stellar_secret_key" {
   sensitive   = true
 }
 
+# ─── Observability / Alerting ─────────────────────────────────────────────────
+
 variable "alert_sns_arns" {
-  description = "SNS topic ARNs for security alerts"
+  description = "SNS topic ARNs for CloudWatch alarms and security alerts"
   type        = list(string)
   default     = []
 }