Skip to content

Commit 76f39fe

Browse files
committed
feat(infra,helm): add zero-downtime spot instance support for AWS and Azure
1 parent c9caa69 commit 76f39fe

8 files changed

Lines changed: 107 additions & 7 deletions

File tree

backend/helm-chart/templates/deployment.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ spec:
1111
selector:
1212
matchLabels:
1313
{{- include "hankers-backend.selectorLabels" . | nindent 6 }}
14+
strategy:
15+
type: RollingUpdate
16+
rollingUpdate:
17+
maxSurge: 1
18+
maxUnavailable: 0
1419
template:
1520
metadata:
1621
labels:
@@ -20,6 +25,16 @@ spec:
2025
# Force pod restart on every upgrade by changing a random annotation value
2126
rollme: {{ randAlphaNum 5 | quote }}
2227
spec:
28+
terminationGracePeriodSeconds: 30
29+
affinity:
30+
podAntiAffinity:
31+
preferredDuringSchedulingIgnoredDuringExecution:
32+
- weight: 100
33+
podAffinityTerm:
34+
labelSelector:
35+
matchLabels:
36+
{{- include "hankers-backend.selectorLabels" . | nindent 20 }}
37+
topologyKey: kubernetes.io/hostname
2338
{{- if .Values.migration.enabled }}
2439
initContainers:
2540
- name: migrate
@@ -36,6 +51,10 @@ spec:
3651
- name: backend
3752
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
3853
imagePullPolicy: {{ .Values.image.pullPolicy }}
54+
lifecycle:
55+
preStop:
56+
exec:
57+
command: ["/bin/sh", "-c", "sleep 5"]
3958
ports:
4059
- name: http
4160
containerPort: {{ .Values.service.targetPort }}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: policy/v1
2+
kind: PodDisruptionBudget
3+
metadata:
4+
name: {{ include "hankers-backend.fullname" . }}-pdb
5+
namespace: {{ .Release.Namespace }}
6+
labels:
7+
{{- include "hankers-backend.labels" . | nindent 4 }}
8+
spec:
9+
minAvailable: 1
10+
selector:
11+
matchLabels:
12+
{{- include "hankers-backend.selectorLabels" . | nindent 6 }}

backend/helm-chart/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
replicaCount: 1
1+
replicaCount: 2
22

33
image:
44
repository: karimzakzouk/backend

frontend/helm-chart/templates/deployment.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ spec:
1111
selector:
1212
matchLabels:
1313
{{- include "hankers-frontend.selectorLabels" . | nindent 6 }}
14+
strategy:
15+
type: RollingUpdate
16+
rollingUpdate:
17+
maxSurge: 1
18+
maxUnavailable: 0
1419
template:
1520
metadata:
1621
labels:
@@ -20,10 +25,24 @@ spec:
2025
# Force pod restart on every upgrade by changing a random annotation value
2126
rollme: {{ randAlphaNum 5 | quote }}
2227
spec:
28+
terminationGracePeriodSeconds: 30
29+
affinity:
30+
podAntiAffinity:
31+
preferredDuringSchedulingIgnoredDuringExecution:
32+
- weight: 100
33+
podAffinityTerm:
34+
labelSelector:
35+
matchLabels:
36+
{{- include "hankers-frontend.selectorLabels" . | nindent 20 }}
37+
topologyKey: kubernetes.io/hostname
2338
containers:
2439
- name: frontend
2540
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
2641
imagePullPolicy: {{ .Values.image.pullPolicy }}
42+
lifecycle:
43+
preStop:
44+
exec:
45+
command: ["/bin/sh", "-c", "sleep 5"]
2746
ports:
2847
- name: http
2948
containerPort: {{ .Values.service.targetPort }}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: policy/v1
2+
kind: PodDisruptionBudget
3+
metadata:
4+
name: {{ include "hankers-frontend.fullname" . }}-pdb
5+
namespace: {{ .Release.Namespace }}
6+
labels:
7+
{{- include "hankers-frontend.labels" . | nindent 4 }}
8+
spec:
9+
minAvailable: 1
10+
selector:
11+
matchLabels:
12+
{{- include "hankers-frontend.selectorLabels" . | nindent 6 }}

frontend/helm-chart/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
replicaCount: 1
1+
replicaCount: 2
22

33
image:
44
repository: karimzakzouk/frontend

infra/development/terraform/main.tf

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,52 @@ resource "azurerm_kubernetes_cluster" "main" {
3838

3939
default_node_pool {
4040
name = "default"
41-
node_count = 1
41+
node_count = 1 # Keep 1 on-demand for stateful services
4242
vm_size = "Standard_B2s"
4343
}
4444

4545
identity {
4646
type = "SystemAssigned"
4747
}
48+
49+
# Automatically upgrade the cluster along the "stable" channel
50+
automatic_channel_upgrade = "stable"
51+
52+
tags = {
53+
Environment = "Development"
54+
}
55+
}
4856

57+
# Spot node pool for cost savings (spot VMs are discounted but can be evicted at any time)
58+
resource "azurerm_kubernetes_cluster_node_pool" "spot" {
59+
name = "spot"
60+
kubernetes_cluster_id = azurerm_kubernetes_cluster.main.id
61+
vm_size = "Standard_B2s"
62+
63+
# Spot configuration
64+
priority = "Spot"
65+
eviction_policy = "Delete"
66+
spot_max_price = -1 # Pay up to regular price (usually 70-90% off)
67+
68+
# Auto-scaling configuration
69+
enable_auto_scaling = true
70+
min_count = 0 # Scale to zero when not needed
71+
max_count = 2
72+
73+
# Taint spot nodes so only workloads with a matching toleration are scheduled on them
74+
node_taints = [
75+
"kubernetes.azure.com/scalesetpriority=spot:NoSchedule"
76+
]
77+
78+
# Labels to identify spot nodes
79+
node_labels = {
80+
"kubernetes.azure.com/scalesetpriority" = "spot"
81+
"node-type" = "spot"
82+
}
83+
4984
tags = {
5085
Environment = "Development"
86+
NodeType = "Spot"
5187
}
5288
}
5389

infra/production/terraform/modules/rds/main.tf

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,9 @@ resource "aws_db_parameter_group" "postgres" {
7070

7171
# Enable required PostgreSQL extensions (matching dev environment)
7272
parameter {
73-
name = "shared_preload_libraries"
74-
value = "pg_stat_statements"
73+
name = "shared_preload_libraries"
74+
value = "pg_stat_statements"
75+
apply_method = "pending-reboot" # Static parameter requires DB restart
7576
}
7677

7778
tags = merge(
@@ -110,8 +111,9 @@ resource "aws_db_instance" "main" {
110111
backup_window = var.backup_window
111112
maintenance_window = var.maintenance_window
112113

113-
skip_final_snapshot = var.skip_final_snapshot
114-
deletion_protection = var.deletion_protection
114+
skip_final_snapshot = var.skip_final_snapshot
115+
final_snapshot_identifier = var.skip_final_snapshot ? null : "${var.project_name}-db-final-snapshot-${formatdate("YYYY-MM-DD-hhmm", timestamp())}"
116+
deletion_protection = var.deletion_protection
115117

116118
performance_insights_enabled = var.performance_insights_enabled
117119

0 commit comments

Comments
 (0)