# ai-stack/values-prod.yaml at main · rmednitzer/ai-stack · GitHub
# =============================================================================
# values-prod.yaml — Production overlay for EU-regulated deployment
# Usage: helm install ai-stack . -f values.yaml -f values-prod.yaml
# =============================================================================

# Chart-wide settings for the production profile.
global:
  profile: "prod"
  imagePullPolicy: "Always"

  # -- Storage class to pin; set it to a class that exists in your cluster
  storageClass: "zfs-production"

  # Annotations under the assurance.platform/ prefix.
  podAnnotations:
    assurance.platform/chart: "ai-stack"
    assurance.platform/control-refs: "CTL-002"
    assurance.platform/profile: "prod"

  # Toggles switched on by this overlay.
  networkPolicy:
    enabled: true
  serviceMonitor:
    enabled: true

  # OTel settings: enabled, pointing at the collector service in the
  # `observability` namespace.
  otel:
    endpoint: "http://otel-collector.observability.svc.cluster.local:4317"
    enabled: true


# External inference API providers: see HOWTO.md §6 for configuration examples.
# Use existingSecret references (ESO / Vault) in production — never inline apiKey.

# openwebui — production image pin, scaling bounds, sizing and ingress.
openwebui:
  image:
    tag: "v0.8.10"

  replicaCount: 2
  autoscaling:
    enabled: true
    maxReplicas: 5
    minReplicas: 2
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

  resources:
    limits:
      cpu: "4"
      memory: "4Gi"
    requests:
      cpu: "500m"
      memory: "1Gi"

  persistence:
    size: "50Gi"

  ingress:
    enabled: true
    className: "envoy"
    annotations:
      # Certificate issuer used for the TLS secret
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      # Envoy Gateway: TLS termination and routing
      gateway.envoyproxy.io/tls-terminate: "true"
      # Long timeout for websocket-based streaming responses
      gateway.envoyproxy.io/timeout: "300s"
      gateway.envoyproxy.io/request-body-max-size: "50m"
      # Rate limit: 60 requests per minute per client IP, burst value 20
      gateway.envoyproxy.io/rate-limit-local: "60"
      gateway.envoyproxy.io/rate-limit-burst: "20"
    # -- REQUIRED: the placeholder domain below appears under both `hosts`
    #    and `tls`; replace both with your actual domain before deploying
    #    to production
    hosts:
      - host: "ai.example.com"
        paths:
          - path: "/"
            pathType: "Prefix"
    tls:
      - secretName: "ai-tls"
        hosts:
          - "ai.example.com"

# ollama — single replica with one GPU, pinned image tag, 200Gi volume.
ollama:
  image:
    tag: "0.18.2"
  replicaCount: 1

  gpu:
    count: 1
    enabled: true

  resources:
    limits:
      cpu: "8"
      memory: "32Gi"
    requests:
      cpu: "2"
      memory: "8Gi"

  persistence:
    size: "200Gi"

# qdrant — single replica, 100Gi volume.
qdrant:
  replicaCount: 1
  persistence:
    size: "100Gi"
  resources:
    limits:
      cpu: "4"
      memory: "8Gi"
    requests:
      cpu: "500m"
      memory: "2Gi"

# tika — autoscaled between 2 and 4 replicas on a 70% CPU target.
tika:
  resources:
    limits:
      cpu: "2"
      memory: "4Gi"
    requests:
      cpu: "250m"
      memory: "1Gi"
  autoscaling:
    enabled: true
    maxReplicas: 4
    minReplicas: 2
    targetCPUUtilizationPercentage: 70

# searxng — server settings: limiter on, image proxy off.
searxng:
  settings:
    server:
      limiter: true
      image_proxy: false
      # Left empty here: auto-generated from Secret; override with an
      # external secret manager.
      secret_key: ""

# workbench — disabled in prod by default; enable per-request with GPU
# scheduling.
workbench:
  enabled: false

# valkey — resource requests and limits for production.
valkey:
  resources:
    limits:
      cpu: "1"
      memory: "1Gi"
    requests:
      cpu: "200m"
      memory: "256Mi"

# openTerminal — disabled in prod by default; enable per-request.
openTerminal:
  enabled: false

# ingestionWorker — opt-in; enable when async document ingestion is needed.
# The scaling and sizing values below are set even though the component is
# disabled in this overlay.
ingestionWorker:
  enabled: false
  resources:
    limits:
      cpu: "2"
      memory: "2Gi"
    requests:
      cpu: "250m"
      memory: "512Mi"
  autoscaling:
    enabled: true
    maxReplicas: 6
    minReplicas: 2
    targetCPUUtilizationPercentage: 70

# langgraph — opt-in; enable when agentic workloads are needed.
# The scaling and sizing values below are set even though the component is
# disabled in this overlay.
langgraph:
  enabled: false
  resources:
    limits:
      cpu: "4"
      memory: "8Gi"
    requests:
      cpu: "500m"
      memory: "1Gi"
  autoscaling:
    enabled: true
    maxReplicas: 4
    minReplicas: 2
    targetCPUUtilizationPercentage: 70

# postgres — disabled here; enable together with langgraph.
postgres:
  enabled: false

  # -- Production: use CloudNativePG for HA, automated failover, and backups.
  #    Requires: CloudNativePG operator v1.25+ installed in the cluster.
  #    Install: helm install cnpg cloudnative-pg/cloudnative-pg
  mode: "cnpg"

  tls:
    mode: "require"

  cnpg:
    instances: 3

    storage:
      size: "50Gi"

    resources:
      limits:
        cpu: "4"
        memory: "4Gi"
      requests:
        cpu: "500m"
        memory: "1Gi"

    # PostgreSQL server parameters; every value is passed as a string.
    postgresql:
      parameters:
        max_connections: "200"
        shared_buffers: "1GB"
        effective_cache_size: "3GB"
        wal_level: "replica"
        max_wal_senders: "10"
        max_replication_slots: "10"

    # Connection pooler: two pgbouncer instances of type rw, transaction
    # pool mode.
    pooler:
      enabled: true
      type: "rw"
      instances: 2
      pgbouncer:
        poolMode: "transaction"
        maxClientConn: "200"
        defaultPoolSize: "25"

    monitoring:
      enabled: true

    # Backups stay off until S3-compatible storage is configured.
    backup:
      enabled: false
      retentionPolicy: "30d"
      # Six-field cron expression. NOTE(review): presumably the leading
      # field is seconds, i.e. daily at 02:00 — confirm against the
      # CloudNativePG scheduled-backup format.
      schedule: "0 0 2 * * *"