kosminus · kosminus · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.env.prod.example b/.env.prod.example
@@ -0,0 +1,104 @@
+# =============================================================================
+# QueryWise — PRODUCTION environment (docker-compose.prod.yml)
+# =============================================================================
+#   cp .env.prod.example .env.prod   # then fill in every value marked CHANGE ME
+#   docker compose -f docker-compose.prod.yml --env-file .env.prod up -d --build
+#
+# This file holds secrets — keep it out of version control (see .gitignore).
+# =============================================================================
+
+# -- Application --
+ENVIRONMENT=production
+DEBUG=false
+
+# -- App Database (pgvector) --
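+# docker-compose.prod.yml builds DATABASE_URL from these; the app-db service
+# uses them too. NOTE: since the password ends up inside a URL, prefer URL-safe
+# characters (or percent-encode ones such as @ : / ? #) — verify against the
+# compose file. For an external/managed Postgres, set DATABASE_URL directly
+# and drop the app-db service.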
+POSTGRES_DB=querywise
+POSTGRES_USER=querywise
+POSTGRES_PASSWORD=CHANGE_ME_strong_db_password
+
+# -- Cache / Jobs --
+# REDIS_URL and JOB_BACKEND=arq are set by the compose file. The arq worker
+# runs as a dedicated service.
+
+# -- Security --
+# Fernet key for connection-string encryption at rest. REQUIRED — rotating it
+# makes existing stored connections undecryptable.
+#   python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"
+ENCRYPTION_KEY=CHANGE_ME_generate_a_fernet_key
+
+# HS256 signing secret for session + magic-link JWTs. REQUIRED.
+#   python -c "import secrets; print(secrets.token_urlsafe(48))"
+JWT_SECRET=CHANGE_ME_generate_a_long_random_secret
+
+# Secrets backend: env (Fernet, default) | aws | gcp | azure | vault
+SECRETS_BACKEND=env
+
+# -- Auth --
+# NEVER true in production — this disables login entirely.
+DISABLE_AUTH=false
+AUTH_PROVIDER=local
+# Session cookie hardening (the edge terminates TLS).
+AUTH_COOKIE_SECURE=true
+AUTH_COOKIE_SAMESITE=lax
+# Bootstrap admin (created on first boot). Set a password to enable local login.
+DEFAULT_ADMIN_EMAIL=admin@yourcompany.com
+DEFAULT_ADMIN_PASSWORD=CHANGE_ME_admin_password
+
+# Allowed CORS origins (JSON list) — your public frontend origin(s).
+# Same-origin (SPA + API behind one host) needs no cross-origin entry.
+CORS_ORIGINS=["https://querywise.yourcompany.com"]
+
+# -- Observability --
+LOG_LEVEL=INFO
+LOG_FORMAT=json
+ENABLE_METRICS=true
+SERVICE_NAME=querywise-backend
+OTEL_ENABLED=false
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318/v1/traces
+
+# -- Scaling --
+# uvicorn worker processes per backend replica.
+UVICORN_WORKERS=4
+# Public HTTP port for the edge (map 443 + mount certs for direct TLS).
+HTTP_PORT=80
+
+# -- Sample DB auto-setup: OFF in prod (point at real warehouses instead) --
+AUTO_SETUP_SAMPLE_DB=false
+
+# -- LLM Configuration --
+# Vector dimension: 1536 for OpenAI/Anthropic, 768 for Ollama nomic-embed-text
+EMBEDDING_DIMENSION=1536
+
+# ---- Anthropic ----
+# DEFAULT_LLM_PROVIDER=anthropic
+# DEFAULT_LLM_MODEL=claude-sonnet-4-5-20250929
+# ANTHROPIC_API_KEY=
+# OPENAI_API_KEY=                          # Required for embeddings
+# EMBEDDING_MODEL=text-embedding-3-small
+
+# ---- OpenAI ----
+DEFAULT_LLM_PROVIDER=openai
+DEFAULT_LLM_MODEL=gpt-5.2
+OPENAI_API_KEY=CHANGE_ME
+EMBEDDING_MODEL=text-embedding-3-small
+
+# ---- Azure OpenAI (in-VPC) ----
+# DEFAULT_LLM_PROVIDER=azure_openai
+# DEFAULT_LLM_MODEL=<chat-deployment>
+# AZURE_OPENAI_ENDPOINT=https://<resource>.openai.azure.com
+# AZURE_OPENAI_API_KEY=
+# AZURE_OPENAI_API_VERSION=2024-10-21
+# AZURE_OPENAI_DEPLOYMENT=<embedding-deployment>
+# EMBEDDING_MODEL=<embedding-deployment>
+
+# -- Query Defaults --
+DEFAULT_QUERY_TIMEOUT_SECONDS=30
+DEFAULT_MAX_ROWS=1000
+MAX_RETRY_ATTEMPTS=3
+
+# -- Rate Limiting --
+MAX_QUERIES_PER_MINUTE=30
+RATE_LIMIT_ENABLED=true
diff --git a/.github/actions/helm-deploy/action.yml b/.github/actions/helm-deploy/action.yml
@@ -0,0 +1,72 @@
+# Composite action metadata: deploys the QueryWise chart with Helm.
+name: 'Helm deploy'
+description: >-
+  Deploy QueryWise to a cluster with Helm,
+  pinned to a specific image tag.
+
+# Caller-supplied parameters. The first three are mandatory; namespace and
+# release both fall back to "querywise".
+inputs:
+  environment:
+    required: true
+    description: >-
+      Target environment (staging | production).
+      Selects the optional values-<env>.yaml overlay.
+  image_tag:
+    required: true
+    description: 'Image tag to deploy (both backend and frontend share it).'
+  kube_config:
+    required: true
+    description: 'Base64-encoded kubeconfig for the target cluster.'
+  namespace:
+    required: false
+    default: 'querywise'
+    description: 'Kubernetes namespace.'
+  release:
+    required: false
+    default: 'querywise'
+    description: 'Helm release name.'
+
+# Composite action: the steps below run inside the calling job.
+runs:
+  using: composite
+  steps:
+    # Install the Helm CLI at a pinned version.
+    - uses: azure/setup-helm@v4
+      with:
+        version: v3.16.0
+
+    # Install kubectl; no version is requested here, so the action's default applies.
+    - uses: azure/setup-kubectl@v4
+
+    # Decode the base64 kubeconfig input into an owner-only file under
+    # RUNNER_TEMP and export KUBECONFIG for the steps that follow.
+    - name: Write kubeconfig
+      shell: bash
+      env:
+        # Handed over via the environment rather than templated into the script,
+        # so the value is never parsed as shell source.
+        KUBE_CONFIG_B64: ${{ inputs.kube_config }}
+      run: |
+        # Create the file with owner-only permissions from the start instead of
+        # relying solely on the chmod that follows.
+        umask 077
+        printf '%s' "${KUBE_CONFIG_B64}" | base64 -d > "${RUNNER_TEMP}/kubeconfig"
+        chmod 600 "${RUNNER_TEMP}/kubeconfig"
+        echo "KUBECONFIG=${RUNNER_TEMP}/kubeconfig" >> "$GITHUB_ENV"
+
+    # Look for deploy/helm/querywise/values-<environment>.yaml and publish the
+    # step output `arg`: "--values <path>" when the file exists, empty otherwise.
+    - name: Resolve per-environment values overlay
+      id: vals
+      shell: bash
+      env:
+        # Read the input from the environment instead of templating it into the
+        # script body, so it cannot inject shell syntax.
+        TARGET_ENV: ${{ inputs.environment }}
+      run: |
+        f="deploy/helm/querywise/values-${TARGET_ENV}.yaml"
+        if [ -f "$f" ]; then
+          echo "arg=--values $f" >> "$GITHUB_OUTPUT"
+          echo "Using overlay $f"
+        else
+          echo "arg=" >> "$GITHUB_OUTPUT"
+          echo "No overlay at $f — using chart defaults + --set."
+        fi
+
+    # Install or upgrade the release with both images pinned to the requested
+    # tag. --atomic rolls the release back if it does not become ready in time.
+    - name: Helm upgrade
+      shell: bash
+      env:
+        # All caller-controlled values arrive via the environment so none of them
+        # is expanded into the script text.
+        OWNER: ${{ github.repository_owner }}
+        TAG: ${{ inputs.image_tag }}
+        RELEASE: ${{ inputs.release }}
+        NAMESPACE: ${{ inputs.namespace }}
+        VALUES_ARG: ${{ steps.vals.outputs.arg }}
+      run: |
+        # Registry repository names must be lowercase; normalise the owner so the
+        # image reference stays valid for mixed-case owner names.
+        owner_lc="$(printf '%s' "${OWNER}" | tr '[:upper:]' '[:lower:]')"
+        # VALUES_ARG is either empty or "--values <path>"; it is intentionally
+        # unquoted so the shell splits it into two arguments (or none).
+        # shellcheck disable=SC2086
+        helm upgrade --install "${RELEASE}" deploy/helm/querywise \
+          --namespace "${NAMESPACE}" --create-namespace \
+          ${VALUES_ARG} \
+          --set image.backend.repository="ghcr.io/${owner_lc}/querywise-backend" \
+          --set image.backend.tag="${TAG}" \
+          --set image.frontend.repository="ghcr.io/${owner_lc}/querywise-frontend" \
+          --set image.frontend.tag="${TAG}" \
+          --wait --atomic --timeout 10m
+
+    # Best-effort diagnostics: print the release status and its pods. Both
+    # commands are allowed to fail (`|| true`) so they never fail the step.
+    - name: Rollout summary
+      shell: bash
+      env:
+        # Inputs are read from the environment instead of being templated into
+        # the script body.
+        RELEASE: ${{ inputs.release }}
+        NAMESPACE: ${{ inputs.namespace }}
+      run: |
+        helm status "${RELEASE}" --namespace "${NAMESPACE}" || true
+        kubectl get pods -n "${NAMESPACE}" \
+          -l app.kubernetes.io/instance="${RELEASE}" || true
diff --git a/.github/workflows/deploy-validate.yml b/.github/workflows/deploy-validate.yml
@@ -0,0 +1,69 @@
+name: Deploy artifacts
+
+# Validates the deployment artifacts so a broken chart or module never merges.
+# Runs only when something under deploy/ (or this workflow file) changes.
+
+on:
+  pull_request:
+    paths:
+      - "deploy/**"
+      - ".github/workflows/deploy-validate.yml"
+  push:
+    branches: [main]
+    paths:
+      - "deploy/**"
+      # Keep the push filter in step with the pull_request filter so an edit to
+      # this workflow is also exercised on main.
+      - ".github/workflows/deploy-validate.yml"
+jobs:
+  # Lints the chart, then renders it with placeholder secrets and validates the
+  # rendered manifests against the Kubernetes 1.29.0 schemas.
+  helm:
+    name: Helm lint + kubeconform
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: azure/setup-helm@v4
+        with:
+          version: v3.16.0
+
+      - name: Helm lint
+        run: helm lint deploy/helm/querywise
+
+      - name: Install kubeconform
+        shell: bash
+        run: |
+          # -f makes curl fail on an HTTP error instead of saving the error page
+          # as the archive.
+          curl -fsSL -o /tmp/kubeconform.tar.gz \
+            https://github.com/yannh/kubeconform/releases/download/v0.6.7/kubeconform-linux-amd64.tar.gz
+          tar -xzf /tmp/kubeconform.tar.gz -C /tmp
+          sudo mv /tmp/kubeconform /usr/local/bin/
+
+      - name: Render + schema-validate
+        shell: bash
+        run: |
+          # Without pipefail the step's status is kubeconform's alone, so a failed
+          # `helm template` could feed it empty input and still pass.
+          set -euo pipefail
+          helm template querywise deploy/helm/querywise \
+            --set secrets.data.DATABASE_URL=postgresql+asyncpg://u:p@db:5432/querywise \
+            --set secrets.data.REDIS_URL=redis://redis:6379/0 \
+            --set secrets.data.ENCRYPTION_KEY=x --set secrets.data.JWT_SECRET=y \
+          | kubeconform -strict -summary -kubernetes-version 1.29.0
+
+  # One job per cloud module: formatting check, backend-less init, validate.
+  terraform:
+    name: Terraform fmt + validate (${{ matrix.cloud }})
+    runs-on: ubuntu-latest
+    # Every `run` step executes inside the module directory for its matrix leg.
+    defaults:
+      run:
+        working-directory: deploy/terraform/${{ matrix.cloud }}
+    strategy:
+      # Let the remaining clouds finish even when one of them fails.
+      fail-fast: false
+      matrix:
+        cloud:
+          - aws
+          - gcp
+          - azure
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: '1.9.5'
+
+      - name: Format check
+        run: terraform fmt -check -recursive
+
+      # -backend=false: no remote state is configured for this check.
+      - name: Init (no backend)
+        run: terraform init -backend=false -input=false
+
+      - name: Validate
+        run: terraform validate
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,104 @@
+name: Release
+
+# Build + push the two production images, then deploy with Helm:
+#   push to main  -> build -> deploy to STAGING
+#   push tag v*   -> build -> deploy to PRODUCTION (gated by the environment's
+#                             required reviewers)
+#   manual        -> build only (workflow_dispatch)
+#
+# Required GitHub Environment secrets:
+#   staging / production:  KUBE_CONFIG  (base64-encoded kubeconfig for the cluster)
+# Images push to GHCR using the built-in GITHUB_TOKEN (packages: write).
+
+# Triggers: pushes to main, pushes of v* tags, and manual runs.
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'v*'
+  workflow_dispatch:
+
+# One run at a time per ref; a newer run queues instead of cancelling the
+# one already in progress.
+concurrency:
+  group: release-${{ github.ref }}
+  cancel-in-progress: false
+
+# Registry host shared by the login and image-metadata steps.
+env:
+  REGISTRY: 'ghcr.io'
+
+jobs:
+  # Builds and pushes the backend and frontend production images to GHCR, one
+  # matrix leg per component. Each image is tagged with the commit SHA, the
+  # branch name, the semver version (on tags) and `latest` on the default branch.
+  images:
+    name: Build & push (${{ matrix.component }})
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      matrix:
+        include:
+          - component: backend
+            context: backend
+            dockerfile: backend/Dockerfile.prod
+          - component: frontend
+            context: frontend
+            dockerfile: frontend/Dockerfile.prod
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Image metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/querywise-${{ matrix.component }}
+          tags: |
+            type=raw,value=${{ github.sha }}
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build & push
+        uses: docker/build-push-action@v6
+        with:
+          context: ${{ matrix.context }}
+          file: ${{ matrix.dockerfile }}
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          # Frontend is built same-origin; nginx proxies /api to the backend.
+          build-args: ${{ matrix.component == 'frontend' && 'VITE_API_URL=' || '' }}
+          # Give each component its own cache scope; with the shared default
+          # scope the two matrix legs would overwrite each other's cache.
+          cache-from: type=gha,scope=${{ matrix.component }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.component }}
+
+  # Deploys the image tagged with this commit's SHA to the staging environment.
+  deploy-staging:
+    name: Deploy to staging
+    needs: images
+    # Push events only: a manual workflow_dispatch run on main must stay
+    # build-only, so the ref check alone is not enough.
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+    environment: staging
+    # Least privilege: this job only needs to check out the repository.
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/helm-deploy
+        with:
+          environment: staging
+          image_tag: ${{ github.sha }}
+          kube_config: ${{ secrets.KUBE_CONFIG }}
+
+  # Deploys the image tagged with this commit's SHA to the production environment.
+  deploy-prod:
+    name: Deploy to production
+    needs: images
+    # Push events only: a manual workflow_dispatch run started from a v* tag
+    # must stay build-only, so the ref check alone is not enough.
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    environment: production
+    # Least privilege: this job only needs to check out the repository.
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/helm-deploy
+        with:
+          environment: production
+          image_tag: ${{ github.sha }}
+          kube_config: ${{ secrets.KUBE_CONFIG }}
diff --git a/.gitignore b/.gitignore
@@ -48,6 +48,8 @@ Thumbs.db
 
 # Environment / secrets
 .env
+.env.prod
+.env.*.local
 *.pem
 *.key
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -98,6 +98,36 @@ product surface; all optional dependencies degrade gracefully).
   so the lineage tests run (they `importorskip` past `sqlglot` when the extra is absent).
 - **Deferred to a later milestone:** column profiling (null rate / distinct counts / sample values).
 
+### Added (Packaging & deployability)
+- **Hardened production images** — multi-stage, non-root `backend/Dockerfile.prod`
+  (builder venv → slim runtime, `curl` healthcheck, prod extras only) and
+  `frontend/Dockerfile.prod` (Vite build → unprivileged nginx serving the SPA and
+  reverse-proxying `/api` + `/mcp`). The dev `Dockerfile`s are untouched.
+- **Production compose** (`docker-compose.prod.yml`) — pgvector app-db, Redis,
+  one-shot `migrate` service (gated so backend replicas never race on Alembic),
+  backend (uvicorn), arq `worker`, and the nginx edge. Configured by `.env.prod`
+  (`.env.prod.example` template).
+- **Helm chart** (`deploy/helm/querywise/`, EKS/GKE/AKS) — backend Deployment +
+  HPA + PDB, arq `worker`, frontend + PDB, path-based ingress (`/api`+`/mcp` →
+  backend, `/` → SPA), ServiceAccount, and a `pre-install`/`pre-upgrade`
+  migration hook Job. Secrets via a chart-created Secret or `existingSecret`
+  (external-secrets seam). Validated with `helm lint` + `kubeconform`.
+- **Terraform modules** (`deploy/terraform/{aws,gcp,azure}/`) — each provisions
+  the data plane + secrets in the customer's own account/VPC: managed Postgres 16
+  (pgvector) + managed Redis + a secret store with the assembled DSNs/keys +
+  object storage + optional networking + an identity/policy for external-secrets.
+  Compute (cluster) is intentionally separate state. `terraform validate`-clean.
+- **CI/CD** (`.github/workflows/`) — `deploy-validate.yml` validates the chart
+  (`helm lint` + `kubeconform`) and Terraform (`fmt`/`validate`) on PRs;
+  `release.yml` builds + pushes both images to GHCR and deploys with Helm
+  (`main` → staging, tag `v*` → production, `--wait --atomic`) via a reusable
+  composite action.
+- **Ops** (`deploy/ops/`) — `backup.sh`/`restore.sh` (encrypted `pg_dump`/
+  `pg_restore`), an in-cluster backup CronJob example, a DR runbook (backup/
+  restore, region rebuild, upgrade path, quarterly credential rotation), and a
+  production config reference.
+- **Deferred:** the managed-SaaS control plane (provisioning/billing/fleet
+  upgrades) — additive, since each tenant is already an isolated instance.
+
 ## [1.0.0] - 2026-06-04
 
 First stable release: natural-language-to-SQL with a semantic metadata layer.
-Original file line number
+Diff line change
@@ Expand Up / @@ -48,6 +48,8 @@ Thumbs.db @@
     # Environment / secrets
     .env
+    .env.prod
+    .env.*.local
     *.pem
     *.key
@@ Expand Down @@