benchmark.sh (executable, 1789 lines, 1480 loc, 51.1 KB)
#!/bin/bash
set -e
# Disable AWS CLI pager (prevents less/more from opening)
export AWS_PAGER=""
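# Example invocation (illustrative values; AWS_PROFILE has no default and must be set):
#   AWS_PROFILE=my-profile FRAMEWORK=next NODE_COUNT=4 ./benchmark.sh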
# Load common functions
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR" && pwd)"
source "$PROJECT_ROOT/lib/common.sh"
source "$PROJECT_ROOT/lib/state.sh"
CLUSTER_NAME="${CLUSTER_NAME:-watt-benchmark-$(date +%s)}"
AWS_PROFILE="${AWS_PROFILE:?AWS_PROFILE must be set (AWS CLI profile used for all calls)}"
NODE_TYPE="${NODE_TYPE:-m5.2xlarge}"
NODE_COUNT="${NODE_COUNT:-4}"
FRAMEWORK="${FRAMEWORK:-next}" # next, react-router, or tanstack
FRAMEWORK_SOURCE_DIR="$PROJECT_ROOT/$FRAMEWORK"
KUBE_MANIFEST="${FRAMEWORK_SOURCE_DIR}/kube.yaml"
AMI_ID="${AMI_ID:-ami-07b2b18045edffe90}" # Amazon Linux 2023 arm64
LOADTESTING_INSTANCE_TYPE="${LOADTESTING_INSTANCE_TYPE:-c7gn.2xlarge}"
ECR_REPO_NAME="${ECR_REPO_NAME:-watt-benchmark}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
S3_BUCKET_NAME="" # Will be set dynamically with cluster name
# Infrastructure resource names (set by creation functions)
CLUSTER_ROLE_NAME=""
NODE_ROLE_NAME=""
LOADTEST_ROLE_NAME=""
LOADTEST_INSTANCE_PROFILE_NAME=""
VPC_ID=""
SUBNET_IDS=""
IGW_ID=""
RTB_ID=""
CLUSTER_ROLE_ARN=""
NODE_ROLE_ARN=""
KUBE_CONTEXT=""
LOAD_TEST_INSTANCE_ID=""
SECURITY_GROUP_ID=""
AWS_ACCOUNT_ID=""
AWS_REGION=""
ECR_IMAGE_URI=""
ECR_REPO_CREATED=""
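# Tear down all benchmark resources in dependency order: load-test instance,
# node group, cluster, load balancers, security groups, ENIs, internet
# gateways, subnets, route tables, VPC, IAM roles, ECR repository, S3 bucket.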
cleanup_instances() {
# Terminate load test EC2 instance and wait for it
if [[ -n "$LOAD_TEST_INSTANCE_ID" ]]; then
log "Terminating load_test instance: $LOAD_TEST_INSTANCE_ID"
aws ec2 terminate-instances \
--instance-ids "$LOAD_TEST_INSTANCE_ID" \
--profile "$AWS_PROFILE" >/dev/null 2>&1 || true
log "Waiting for instance termination..."
aws ec2 wait instance-terminated \
--instance-ids "$LOAD_TEST_INSTANCE_ID" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "ec2" "load_test_instance_id"
fi
# Delete EKS node group
if [[ -n "$CLUSTER_NAME" ]]; then
local nodegroup_name="$CLUSTER_NAME-nodegroup"
log "Checking for node group: $nodegroup_name"
if aws eks describe-nodegroup \
--cluster-name "$CLUSTER_NAME" \
--nodegroup-name "$nodegroup_name" \
--profile "$AWS_PROFILE" >/dev/null 2>&1; then
log "Deleting node group: $nodegroup_name"
aws eks delete-nodegroup \
--cluster-name "$CLUSTER_NAME" \
--nodegroup-name "$nodegroup_name" \
--profile "$AWS_PROFILE" >/dev/null 2>&1 || true
log "Waiting for node group deletion..."
aws eks wait nodegroup-deleted \
--cluster-name "$CLUSTER_NAME" \
--nodegroup-name "$nodegroup_name" \
--profile "$AWS_PROFILE" 2>&1 | grep -v "waiting" || true
mark_resource_cleaned "eks" "nodegroup_name"
fi
fi
# Delete EKS cluster
if [[ -n "$CLUSTER_NAME" ]]; then
log "Checking if cluster exists: $CLUSTER_NAME"
if aws eks describe-cluster \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" >/dev/null 2>&1; then
log "Deleting EKS cluster: $CLUSTER_NAME"
aws eks delete-cluster \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" >/dev/null 2>&1 || true
log "Waiting for cluster deletion..."
aws eks wait cluster-deleted \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" 2>&1 | grep -v "waiting" || true
fi
fi
# Delete Load Balancers in VPC (created by K8s LoadBalancer services)
if [[ -n "$VPC_ID" ]]; then
log "Deleting Load Balancers in VPC..."
local lb_arns
lb_arns=$(aws elbv2 describe-load-balancers \
--profile "$AWS_PROFILE" \
--output json 2>/dev/null | \
jq -r ".LoadBalancers[] | select(.VpcId == \"$VPC_ID\") | .LoadBalancerArn" 2>/dev/null || true)
if [[ -n "$lb_arns" ]]; then
for arn in $lb_arns; do
log "Deleting Load Balancer: $arn"
aws elbv2 delete-load-balancer \
--load-balancer-arn "$arn" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
log "Waiting for Load Balancer ENIs to be released (60s)..."
sleep 60
fi
fi
# Delete security group (with retry)
if [[ -n "$SECURITY_GROUP_ID" ]]; then
log "Deleting security group: $SECURITY_GROUP_ID"
local sg_retry=0
while [[ $sg_retry -lt 5 ]]; do
if aws ec2 delete-security-group \
--group-id "$SECURITY_GROUP_ID" \
--profile "$AWS_PROFILE" 2>/dev/null; then
mark_resource_cleaned "ec2" "security_group_id"
break
fi
sg_retry=$((sg_retry + 1))
sleep 10
done
fi
if [[ -n "$VPC_ID" ]]; then
log "Deleting VPC resources..."
# Delete all non-default security groups in VPC
log "Deleting security groups..."
local sgs
sgs=$(aws ec2 describe-security-groups \
--filters "Name=vpc-id,Values=$VPC_ID" \
--profile "$AWS_PROFILE" \
--output json 2>/dev/null | \
jq -r '.SecurityGroups[] | select(.GroupName != "default") | .GroupId' 2>/dev/null || true)
for sg in $sgs; do
aws ec2 delete-security-group \
--group-id "$sg" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
# Delete Network Interfaces (orphaned ENIs from LBs/EKS)
log "Deleting network interfaces..."
local enis
enis=$(aws ec2 describe-network-interfaces \
--filters "Name=vpc-id,Values=$VPC_ID" \
--query 'NetworkInterfaces[*].NetworkInterfaceId' \
--output text \
--profile "$AWS_PROFILE" 2>/dev/null || true)
for eni in $enis; do
aws ec2 delete-network-interface \
--network-interface-id "$eni" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
# Detach and delete internet gateway
if [[ -n "$IGW_ID" ]]; then
aws ec2 detach-internet-gateway \
--internet-gateway-id "$IGW_ID" \
--vpc-id "$VPC_ID" \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws ec2 delete-internet-gateway \
--internet-gateway-id "$IGW_ID" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "vpc" "igw_id"
fi
# Also check for any IGWs attached to VPC (in case IGW_ID wasn't set)
local igws
igws=$(aws ec2 describe-internet-gateways \
--filters "Name=attachment.vpc-id,Values=$VPC_ID" \
--query 'InternetGateways[*].InternetGatewayId' \
--output text \
--profile "$AWS_PROFILE" 2>/dev/null || true)
for igw in $igws; do
aws ec2 detach-internet-gateway \
--internet-gateway-id "$igw" \
--vpc-id "$VPC_ID" \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws ec2 delete-internet-gateway \
--internet-gateway-id "$igw" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
# Delete subnets
log "Deleting subnets..."
local subnets
subnets=$(aws ec2 describe-subnets \
--filters "Name=vpc-id,Values=$VPC_ID" \
--query 'Subnets[*].SubnetId' \
--output text \
--profile "$AWS_PROFILE" 2>/dev/null || true)
for subnet in $subnets; do
aws ec2 delete-subnet \
--subnet-id "$subnet" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
mark_resource_cleaned "vpc" "subnet_ids"
# Delete all non-main route tables
log "Deleting route tables..."
local rts
rts=$(aws ec2 describe-route-tables \
--filters "Name=vpc-id,Values=$VPC_ID" \
--profile "$AWS_PROFILE" \
--output json 2>/dev/null | \
jq -r '.RouteTables[] | select(.Associations[0].Main != true) | .RouteTableId' 2>/dev/null || true)
for rt in $rts; do
aws ec2 delete-route-table \
--route-table-id "$rt" \
--profile "$AWS_PROFILE" 2>/dev/null || true
done
mark_resource_cleaned "vpc" "rtb_id"
# Delete VPC with retry
log "Deleting VPC: $VPC_ID"
local vpc_retry=0
while [[ $vpc_retry -lt 3 ]]; do
if aws ec2 delete-vpc \
--vpc-id "$VPC_ID" \
--profile "$AWS_PROFILE" 2>/dev/null; then
log "VPC deleted successfully"
mark_resource_cleaned "vpc" "vpc_id"
break
fi
vpc_retry=$((vpc_retry + 1))
log "VPC deletion failed, retrying in 10s... (attempt $vpc_retry/3)"
sleep 10
done
fi
# Delete IAM roles
if [[ -n "$NODE_ROLE_NAME" ]]; then
log "Deleting node IAM role: $NODE_ROLE_NAME"
aws iam detach-role-policy \
--role-name "$NODE_ROLE_NAME" \
--policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam detach-role-policy \
--role-name "$NODE_ROLE_NAME" \
--policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam detach-role-policy \
--role-name "$NODE_ROLE_NAME" \
--policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam delete-role \
--role-name "$NODE_ROLE_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "iam" "node_role_name"
fi
if [[ -n "$CLUSTER_ROLE_NAME" ]]; then
log "Deleting cluster IAM role: $CLUSTER_ROLE_NAME"
aws iam detach-role-policy \
--role-name "$CLUSTER_ROLE_NAME" \
--policy-arn arn:aws:iam::aws:policy/AmazonEKSClusterPolicy \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam delete-role \
--role-name "$CLUSTER_ROLE_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "iam" "cluster_role_name"
fi
# Delete ECR repository
if [[ -n "$ECR_REPO_CREATED" && "$ECR_REPO_CREATED" == "true" ]]; then
log "Deleting ECR repository: $ECR_REPO_NAME"
aws ecr delete-repository \
--repository-name "$ECR_REPO_NAME" \
--force \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "ecr" "repo_name"
mark_resource_cleaned "ecr" "repo_created"
fi
# Delete load test instance profile and role
if [[ -n "$LOADTEST_INSTANCE_PROFILE_NAME" ]]; then
log "Deleting load test instance profile: $LOADTEST_INSTANCE_PROFILE_NAME"
aws iam remove-role-from-instance-profile \
--instance-profile-name "$LOADTEST_INSTANCE_PROFILE_NAME" \
--role-name "$LOADTEST_ROLE_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam delete-instance-profile \
--instance-profile-name "$LOADTEST_INSTANCE_PROFILE_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "iam" "loadtest_instance_profile_name"
fi
if [[ -n "$LOADTEST_ROLE_NAME" ]]; then
log "Deleting load test IAM role: $LOADTEST_ROLE_NAME"
aws iam delete-role-policy \
--role-name "$LOADTEST_ROLE_NAME" \
--policy-name "S3BenchmarkAccess" \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws iam delete-role \
--role-name "$LOADTEST_ROLE_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "iam" "loadtest_role_name"
fi
# Delete S3 bucket (must empty first)
if [[ -n "$S3_BUCKET_NAME" ]]; then
log "Deleting S3 bucket: $S3_BUCKET_NAME"
aws s3 rm "s3://$S3_BUCKET_NAME" --recursive \
--profile "$AWS_PROFILE" 2>/dev/null || true
aws s3 rb "s3://$S3_BUCKET_NAME" \
--profile "$AWS_PROFILE" 2>/dev/null || true
mark_resource_cleaned "s3" "bucket_name"
fi
}
# Trap handler that updates state and cleans up resources
cleanup_with_state() {
update_state_status "cleaning"
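# generic_cleanup is expected to be provided by lib/common.sh (sourced above)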
generic_cleanup
# Delete state file if all resources were cleaned successfully
local state_file=$(get_current_state_file)
if [[ -n "$state_file" && -f "$state_file" ]]; then
if ! state_has_resources "$state_file"; then
delete_state_file "$state_file"
else
update_state_status "failed"
warning "Some resources may not have been cleaned. State file retained: $state_file"
warning "Run './cleanup.sh --file $state_file' to retry cleanup"
fi
fi
}
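# Note: on INT/TERM the handler runs and then the EXIT trap fires again; cleanup
# is written to be idempotent (existence checks plus "|| true" on every delete).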
trap cleanup_with_state EXIT INT TERM
# OS-specific base64 encoding without line wraps
base64_encode() {
local input="$1"
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS doesn't support -w flag
printf '%s' "$input" | base64 | tr -d '\n'
else
# Linux (GNU coreutils)
printf '%s' "$input" | base64 -w0
fi
}
# Gzip compress then base64 encode (for large user data)
# AWS EC2 automatically decompresses gzip user data
gzip_base64_encode() {
local input="$1"
if [[ "$OSTYPE" == "darwin"* ]]; then
printf '%s' "$input" | gzip -9 | base64 | tr -d '\n'
else
# Linux (GNU coreutils)
printf '%s' "$input" | gzip -9 | base64 -w 0
fi
}
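# Illustrative use (hypothetical variable): USER_DATA_B64="$(gzip_base64_encode "$user_data")"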
validate_eks_tools() {
log "Validating EKS tools..."
if ! check_tool "kubectl" "Please install kubectl: https://kubernetes.io/docs/tasks/tools/"; then
return 1
fi
success "EKS tools validated"
return 0
}
validate_framework_manifests() {
log "Validating framework manifests for: $FRAMEWORK"
if [[ ! -d "$FRAMEWORK_SOURCE_DIR" ]]; then
error "Framework directory not found: $FRAMEWORK_SOURCE_DIR"
error "Available frameworks: next, react-router, tanstack"
return 1
fi
if [[ ! -f "$KUBE_MANIFEST" ]]; then
error "Kubernetes manifest not found: $KUBE_MANIFEST"
error "Expected kube.yaml in $FRAMEWORK directory"
return 1
fi
success "Framework manifests validated for: $FRAMEWORK"
return 0
}
validate_docker() {
log "Validating Docker..."
if ! command -v docker &>/dev/null; then
error "Docker is not installed. Please install Docker: https://docs.docker.com/get-docker/"
return 1
fi
if ! docker info >/dev/null 2>&1; then
error "Docker daemon is not running. Please start Docker."
return 1
fi
success "Docker validated"
return 0
}
setup_aws_info() {
log "Getting AWS account info..."
AWS_ACCOUNT_ID=$(aws sts get-caller-identity \
--profile "$AWS_PROFILE" \
--query 'Account' \
--output text)
AWS_REGION=$(aws configure get region --profile "$AWS_PROFILE")
if [[ -z "$AWS_ACCOUNT_ID" ]]; then
error "Could not get AWS account ID"
return 1
fi
if [[ -z "$AWS_REGION" ]]; then
error "Could not get AWS region. Please set a default region with: aws configure"
return 1
fi
ECR_IMAGE_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPO_NAME}:${IMAGE_TAG}"
log "AWS Account: $AWS_ACCOUNT_ID"
log "AWS Region: $AWS_REGION"
log "ECR Image: $ECR_IMAGE_URI"
success "AWS info retrieved"
}
ecr_login() {
log "Logging in to ECR..."
if ! aws ecr get-login-password \
--profile "$AWS_PROFILE" \
--region "$AWS_REGION" | \
docker login \
--username AWS \
--password-stdin \
"${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"; then
error "ECR login failed"
return 1
fi
success "ECR login successful"
}
create_ecr_repository() {
log "Creating ECR repository: $ECR_REPO_NAME"
if aws ecr describe-repositories \
--repository-names "$ECR_REPO_NAME" \
--profile "$AWS_PROFILE" >/dev/null 2>&1; then
log "Repository already exists"
ECR_REPO_CREATED="true"
return 0
fi
if ! aws ecr create-repository \
--repository-name "$ECR_REPO_NAME" \
--profile "$AWS_PROFILE" \
--image-scanning-configuration scanOnPush=false \
>/dev/null; then
error "Failed to create ECR repository"
return 1
fi
ECR_REPO_CREATED="true"
success "ECR repository created"
}
build_and_push_image() {
log "Building Docker image for linux/amd64..."
log "Building framework: $FRAMEWORK"
log "This may take a few minutes..."
if ! docker build \
--platform linux/amd64 \
--build-arg COMMIT_HASH="$(git rev-parse HEAD 2>/dev/null || echo 'unknown')" \
--build-arg BUILD_TIME="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-t "$ECR_IMAGE_URI" \
"$FRAMEWORK_SOURCE_DIR"; then
error "Docker build failed"
return 1
fi
log "Pushing image to ECR..."
if ! docker push "$ECR_IMAGE_URI"; then
error "Docker push failed"
return 1
fi
# Verify image exists in ECR
log "Verifying image in ECR..."
if ! aws ecr describe-images \
--repository-name "$ECR_REPO_NAME" \
--image-ids imageTag="$IMAGE_TAG" \
--profile "$AWS_PROFILE" >/dev/null 2>&1; then
error "Image verification failed - image not found in ECR"
return 1
fi
success "Image pushed and verified: $ECR_IMAGE_URI"
}
create_security_group_for_load_test() {
log "Creating security group for load_test instance..."
local vpc_id=$(aws eks describe-cluster \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" \
--query 'cluster.resourcesVpcConfig.vpcId' \
--output text)
if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then
error "Could not get VPC ID from EKS cluster"
return 1
fi
log "Using VPC from EKS cluster: $vpc_id"
local timestamp=$(date +%s)
local sg_name="load_test-sg-$timestamp"
SECURITY_GROUP_ID=$(aws ec2 create-security-group \
--group-name "$sg_name" \
--description "Temporary security group for load_test instance" \
--vpc-id "$vpc_id" \
--query 'GroupId' \
--output text \
--profile "$AWS_PROFILE")
log "Created security group: $SECURITY_GROUP_ID"
success "Security group configured"
}
configure_node_security_for_nodeports() {
local node_ports=$1
log "Configuring node security groups for NodePort access..."
local node_sg=$(aws eks describe-cluster \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" \
--query 'cluster.resourcesVpcConfig.clusterSecurityGroupId' \
--output text)
if [[ -z "$node_sg" || "$node_sg" == "None" ]]; then
error "Could not get cluster security group"
return 1
fi
log "Cluster security group: $node_sg"
# Add ingress rules for each NodePort
IFS=',' read -ra PORTS <<< "$node_ports"
for port in "${PORTS[@]}"; do
log "Adding ingress rule for NodePort $port..."
AWS_PAGER="" aws ec2 authorize-security-group-ingress \
--group-id "$node_sg" \
--protocol tcp \
--port "$port" \
--source-group "$SECURITY_GROUP_ID" \
--profile "$AWS_PROFILE" 2>/dev/null || {
log " (rule may already exist, continuing...)"
}
done
success "Node security configured for ports: $node_ports"
}
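# Illustrative call (hypothetical ports): configure_node_security_for_nodeports "30080,30081"

# Provision a minimal public-only VPC for the cluster: a 10.0.0.0/16 VPC, two /24
# public subnets in separate AZs, an internet gateway, and one shared public route table.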
create_vpc_stack() {
log "Creating VPC infrastructure..."
VPC_ID=$(aws ec2 create-vpc \
--cidr-block 10.0.0.0/16 \
--profile "$AWS_PROFILE" \
--tag-specifications "ResourceType=vpc,Tags=[{Key=Name,Value=eks-vpc-$CLUSTER_NAME}]" \
--query 'Vpc.VpcId' \
--output text)
log "Created VPC: $VPC_ID"
aws ec2 modify-vpc-attribute \
--vpc-id "$VPC_ID" \
--enable-dns-hostnames \
--profile "$AWS_PROFILE"
local igw_id=$(aws ec2 create-internet-gateway \
--profile "$AWS_PROFILE" \
--tag-specifications "ResourceType=internet-gateway,Tags=[{Key=Name,Value=eks-igw-$CLUSTER_NAME}]" \
--query 'InternetGateway.InternetGatewayId' \
--output text)
log "Created Internet Gateway: $igw_id"
aws ec2 attach-internet-gateway \
--vpc-id "$VPC_ID" \
--internet-gateway-id "$igw_id" \
--profile "$AWS_PROFILE"
local azs=($(aws ec2 describe-availability-zones \
--profile "$AWS_PROFILE" \
--query 'AvailabilityZones[0:2].ZoneName' \
--output text))
local subnet1=$(aws ec2 create-subnet \
--vpc-id "$VPC_ID" \
--cidr-block 10.0.1.0/24 \
--availability-zone "${azs[0]}" \
--profile "$AWS_PROFILE" \
--tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=eks-public-subnet-1}]" \
--query 'Subnet.SubnetId' \
--output text)
local subnet2=$(aws ec2 create-subnet \
--vpc-id "$VPC_ID" \
--cidr-block 10.0.2.0/24 \
--availability-zone "${azs[1]}" \
--profile "$AWS_PROFILE" \
--tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=eks-public-subnet-2}]" \
--query 'Subnet.SubnetId' \
--output text)
log "Created subnets: $subnet1, $subnet2"
aws ec2 modify-subnet-attribute \
--subnet-id "$subnet1" \
--map-public-ip-on-launch \
--profile "$AWS_PROFILE"
aws ec2 modify-subnet-attribute \
--subnet-id "$subnet2" \
--map-public-ip-on-launch \
--profile "$AWS_PROFILE"
local rtb_id=$(aws ec2 create-route-table \
--vpc-id "$VPC_ID" \
--profile "$AWS_PROFILE" \
--tag-specifications "ResourceType=route-table,Tags=[{Key=Name,Value=eks-public-rtb}]" \
--query 'RouteTable.RouteTableId' \
--output text)
log "Created route table: $rtb_id"
aws ec2 create-route \
--route-table-id "$rtb_id" \
--destination-cidr-block 0.0.0.0/0 \
--gateway-id "$igw_id" \
--profile "$AWS_PROFILE" >/dev/null
aws ec2 associate-route-table \
--route-table-id "$rtb_id" \
--subnet-id "$subnet1" \
--profile "$AWS_PROFILE" >/dev/null
aws ec2 associate-route-table \
--route-table-id "$rtb_id" \
--subnet-id "$subnet2" \
--profile "$AWS_PROFILE" >/dev/null
SUBNET_IDS="$subnet1,$subnet2"
IGW_ID="$igw_id"
RTB_ID="$rtb_id"
log "VPC ID: $VPC_ID"
log "Subnet IDs: $SUBNET_IDS"
success "VPC infrastructure created"
}
create_cluster_iam_role() {
local role_name="eks-cluster-role-$CLUSTER_NAME"
CLUSTER_ROLE_NAME="$role_name"
log "Creating EKS cluster IAM role: $role_name"
cat >/tmp/cluster-trust-policy.json <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "eks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
aws iam create-role \
--role-name "$role_name" \
--assume-role-policy-document file:///tmp/cluster-trust-policy.json \
--profile "$AWS_PROFILE" \
>/dev/null
aws iam attach-role-policy \
--policy-arn arn:aws:iam::aws:policy/AmazonEKSClusterPolicy \
--role-name "$role_name" \
--profile "$AWS_PROFILE"
CLUSTER_ROLE_ARN=$(aws iam get-role \
--role-name "$role_name" \
--profile "$AWS_PROFILE" \
--query 'Role.Arn' \
--output text)
log "Cluster role ARN: $CLUSTER_ROLE_ARN"
success "Cluster IAM role created"
}
create_node_iam_role() {
local role_name="eks-node-role-$CLUSTER_NAME"
NODE_ROLE_NAME="$role_name"
log "Creating EKS node IAM role: $role_name"
cat >/tmp/node-trust-policy.json <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ec2.amazonaws.com"
}
}
]
}
EOF
aws iam create-role \
--role-name "$role_name" \
--assume-role-policy-document file:///tmp/node-trust-policy.json \
--profile "$AWS_PROFILE" \
>/dev/null
aws iam attach-role-policy \
--policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy \
--role-name "$role_name" \
--profile "$AWS_PROFILE"
aws iam attach-role-policy \
--policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly \
--role-name "$role_name" \
--profile "$AWS_PROFILE"
aws iam attach-role-policy \
--policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy \
--role-name "$role_name" \
--profile "$AWS_PROFILE"
NODE_ROLE_ARN=$(aws iam get-role \
--role-name "$role_name" \
--profile "$AWS_PROFILE" \
--query 'Role.Arn' \
--output text)
log "Node role ARN: $NODE_ROLE_ARN"
success "Node IAM role created"
}
create_s3_bucket() {
S3_BUCKET_NAME="benchmark-results-${CLUSTER_NAME}"
log "Creating S3 bucket: $S3_BUCKET_NAME"
# S3 bucket creation syntax differs by region
if [[ "$AWS_REGION" == "us-east-1" ]]; then
aws s3api create-bucket \
--bucket "$S3_BUCKET_NAME" \
--profile "$AWS_PROFILE" \
>/dev/null
else
aws s3api create-bucket \
--bucket "$S3_BUCKET_NAME" \
--create-bucket-configuration LocationConstraint="$AWS_REGION" \
--profile "$AWS_PROFILE" \
>/dev/null
fi
success "S3 bucket created: $S3_BUCKET_NAME"
}
create_loadtest_iam_role() {
LOADTEST_ROLE_NAME="${CLUSTER_NAME}-loadtest-role"
LOADTEST_INSTANCE_PROFILE_NAME="${CLUSTER_NAME}-loadtest-profile"
log "Creating load test IAM role: $LOADTEST_ROLE_NAME"
# Create trust policy for EC2
local trust_policy='{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}'
aws iam create-role \
--role-name "$LOADTEST_ROLE_NAME" \
--assume-role-policy-document "$trust_policy" \
--profile "$AWS_PROFILE" \
>/dev/null
# Create S3 access policy
local s3_policy=$(cat <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:ListBucket"
],
"Resource": [
"arn:aws:s3:::$S3_BUCKET_NAME",
"arn:aws:s3:::$S3_BUCKET_NAME/*"
]
}
]
}
EOF
)
aws iam put-role-policy \
--role-name "$LOADTEST_ROLE_NAME" \
--policy-name "S3BenchmarkAccess" \
--policy-document "$s3_policy" \
--profile "$AWS_PROFILE"
# Create instance profile
aws iam create-instance-profile \
--instance-profile-name "$LOADTEST_INSTANCE_PROFILE_NAME" \
--profile "$AWS_PROFILE" \
>/dev/null
# Add role to instance profile
aws iam add-role-to-instance-profile \
--instance-profile-name "$LOADTEST_INSTANCE_PROFILE_NAME" \
--role-name "$LOADTEST_ROLE_NAME" \
--profile "$AWS_PROFILE"
# Wait for instance profile to be available
log "Waiting for instance profile to propagate..."
sleep 10
success "Load test IAM role and instance profile created"
}
create_eks_cluster() {
log "Creating EKS cluster: $CLUSTER_NAME"
log "This may take 15-20 minutes..."
aws eks create-cluster \
--name "$CLUSTER_NAME" \
--role-arn "$CLUSTER_ROLE_ARN" \
--resources-vpc-config subnetIds="$SUBNET_IDS" \
--profile "$AWS_PROFILE" \
>/dev/null
log "Waiting for cluster to be ACTIVE..."
local max_attempts=60
local retry_delay=15
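# Total wait budget: 60 attempts x 15s = 15 minutes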
for ((i = 1; i <= max_attempts; i++)); do
local status=$(aws eks describe-cluster \
--name "$CLUSTER_NAME" \
--profile "$AWS_PROFILE" \
--query 'cluster.status' \
--output text)
if [[ "$status" == "ACTIVE" ]]; then
success "EKS cluster is ACTIVE"
return 0
fi
if ((i % 4 == 0)); then
log "Cluster status: $status (attempt $i/$max_attempts)"
fi
sleep "$retry_delay"
done
error "Cluster not ACTIVE after $((max_attempts * retry_delay)) seconds"
return 1
}
create_nodegroup() {
local nodegroup_name="$CLUSTER_NAME-nodegroup"
log "Creating managed node group: $nodegroup_name"
aws eks create-nodegroup \
--cluster-name "$CLUSTER_NAME" \
--nodegroup-name "$nodegroup_name" \
--node-role "$NODE_ROLE_ARN" \
--subnets $(echo "$SUBNET_IDS" | tr ',' ' ') \
--instance-types "$NODE_TYPE" \
--scaling-config minSize="$NODE_COUNT",maxSize="$NODE_COUNT",desiredSize="$NODE_COUNT" \
--profile "$AWS_PROFILE" \
>/dev/null
log "Waiting for node group to be ACTIVE..."
local max_attempts=60
local retry_delay=10
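# Total wait budget: 60 attempts x 10s = 10 minutes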
for ((i = 1; i <= max_attempts; i++)); do
local status=$(aws eks describe-nodegroup \
--cluster-name "$CLUSTER_NAME" \
--nodegroup-name "$nodegroup_name" \
--profile "$AWS_PROFILE" \
--query 'nodegroup.status' \
--output text 2>/dev/null || echo "CREATING")
if [[ "$status" == "ACTIVE" ]]; then
success "Node group is ACTIVE"
return 0
fi
if ((i % 6 == 0)); then
log "Node group status: $status (attempt $i/$max_attempts)"
fi
sleep "$retry_delay"
done
error "Node group not ACTIVE after $((max_attempts * retry_delay)) seconds"
return 1
}
wait_for_nodes() {
log "Waiting for nodes to be ready..."
local max_attempts=60
local retry_delay=5
for ((i = 1; i <= max_attempts; i++)); do
# grep -c always prints a count (including 0), even when it exits non-zero on no match
local ready_nodes=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | grep -c " Ready" || true)
if [[ "$ready_nodes" -ge "$NODE_COUNT" ]]; then
success "All $NODE_COUNT nodes are ready"
return 0
fi
if ((i % 10 == 0)); then
log "Still waiting for nodes... $ready_nodes/$NODE_COUNT ready (attempt $i/$max_attempts)"
fi
sleep "$retry_delay"
done
error "Nodes not ready after $((max_attempts * retry_delay)) seconds"
return 1
}
apply_framework_manifests() {
log "Applying $FRAMEWORK manifests from $KUBE_MANIFEST..."
# Template the manifest with ECR image URI
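# kube.yaml is expected to reference the image via a literal IMAGE_PLACEHOLDER token (e.g. "image: IMAGE_PLACEHOLDER")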
sed "s|IMAGE_PLACEHOLDER|${ECR_IMAGE_URI}|g" "$KUBE_MANIFEST" | \
kubectl --context "$KUBE_CONTEXT" apply -f -
success "$FRAMEWORK manifests applied"
}
wait_for_pods() {
log "Waiting for pods to be ready..."
local max_attempts=120
local retry_delay=5
for ((i = 1; i <= max_attempts; i++)); do
local pods=$(kubectl --context "$KUBE_CONTEXT" get pods --all-namespaces --no-headers 2>/dev/null | grep -v "kube-system" || echo "")
if [[ -z "$pods" ]]; then
if ((i % 10 == 0)); then