diff --git a/.env.prod.example b/.env.prod.example new file mode 100644 index 0000000..383e2cf --- /dev/null +++ b/.env.prod.example @@ -0,0 +1,104 @@ +# ============================================================================= +# QueryWise — PRODUCTION environment (docker-compose.prod.yml) +# ============================================================================= +# cp .env.prod.example .env.prod # then fill in every value marked CHANGE ME +# docker compose -f docker-compose.prod.yml --env-file .env.prod up -d --build +# +# This file holds secrets — keep it out of version control (see .gitignore). +# ============================================================================= + +# -- Application -- +ENVIRONMENT=production +DEBUG=false + +# -- App Database (pgvector) -- +# docker-compose.prod.yml builds DATABASE_URL from these; the app-db service +# uses them too. For an external/managed Postgres, set DATABASE_URL directly +# and drop the app-db service. +POSTGRES_DB=querywise +POSTGRES_USER=querywise +POSTGRES_PASSWORD=CHANGE_ME_strong_db_password + +# -- Cache / Jobs -- +# REDIS_URL + JOB_BACKEND=arq are set by the compose file. arq worker runs as a +# dedicated service. + +# -- Security -- +# Fernet key for connection-string encryption at rest. REQUIRED — rotating it +# makes existing stored connections undecryptable. +# python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" +ENCRYPTION_KEY=CHANGE_ME_generate_a_fernet_key + +# HS256 signing secret for session + magic-link JWTs. REQUIRED. +# python -c "import secrets; print(secrets.token_urlsafe(48))" +JWT_SECRET=CHANGE_ME_generate_a_long_random_secret + +# Secrets backend: env (Fernet, default) | aws | gcp | azure | vault +SECRETS_BACKEND=env + +# -- Auth -- +# NEVER true in production — this disables login entirely. +DISABLE_AUTH=false +AUTH_PROVIDER=local +# Session cookie hardening (the edge terminates TLS). 
+AUTH_COOKIE_SECURE=true +AUTH_COOKIE_SAMESITE=lax +# Bootstrap admin (created on first boot). Set a password to enable local login. +DEFAULT_ADMIN_EMAIL=admin@yourcompany.com +DEFAULT_ADMIN_PASSWORD=CHANGE_ME_admin_password + +# Allowed CORS origins (JSON list) — your public frontend origin(s). +# Same-origin (SPA + API behind one host) needs no cross-origin entry. +CORS_ORIGINS=["https://querywise.yourcompany.com"] + +# -- Observability -- +LOG_LEVEL=INFO +LOG_FORMAT=json +ENABLE_METRICS=true +SERVICE_NAME=querywise-backend +OTEL_ENABLED=false +# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318/v1/traces + +# -- Scaling -- +# uvicorn worker processes per backend replica. +UVICORN_WORKERS=4 +# Public HTTP port for the edge (map 443 + mount certs for direct TLS). +HTTP_PORT=80 + +# -- Sample DB auto-setup: OFF in prod (point at real warehouses instead) -- +AUTO_SETUP_SAMPLE_DB=false + +# -- LLM Configuration -- +# Vector dimension: 1536 for OpenAI/Anthropic, 768 for Ollama nomic-embed-text +EMBEDDING_DIMENSION=1536 + +# ---- Anthropic ---- +# DEFAULT_LLM_PROVIDER=anthropic +# DEFAULT_LLM_MODEL=claude-sonnet-4-5-20250929 +# ANTHROPIC_API_KEY= +# OPENAI_API_KEY= # Required for embeddings +# EMBEDDING_MODEL=text-embedding-3-small + +# ---- OpenAI ---- +DEFAULT_LLM_PROVIDER=openai +DEFAULT_LLM_MODEL=gpt-5.2 +OPENAI_API_KEY=CHANGE_ME +EMBEDDING_MODEL=text-embedding-3-small + +# ---- Azure OpenAI (in-VPC) ---- +# DEFAULT_LLM_PROVIDER=azure_openai +# DEFAULT_LLM_MODEL= +# AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com +# AZURE_OPENAI_API_KEY= +# AZURE_OPENAI_API_VERSION=2024-10-21 +# AZURE_OPENAI_DEPLOYMENT= +# EMBEDDING_MODEL= + +# -- Query Defaults -- +DEFAULT_QUERY_TIMEOUT_SECONDS=30 +DEFAULT_MAX_ROWS=1000 +MAX_RETRY_ATTEMPTS=3 + +# -- Rate Limiting -- +MAX_QUERIES_PER_MINUTE=30 +RATE_LIMIT_ENABLED=true diff --git a/.github/actions/helm-deploy/action.yml b/.github/actions/helm-deploy/action.yml new file mode 100644 index 0000000..1859509 --- /dev/null +++ 
b/.github/actions/helm-deploy/action.yml @@ -0,0 +1,72 @@ +name: Helm deploy +description: Deploy QueryWise to a cluster with Helm, pinned to a specific image tag. + +inputs: + environment: + description: Target environment (staging | production). Selects the optional values-<environment>.yaml overlay. + required: true + image_tag: + description: Image tag to deploy (both backend and frontend share it). + required: true + kube_config: + description: Base64-encoded kubeconfig for the target cluster. + required: true + namespace: + description: Kubernetes namespace. + required: false + default: querywise + release: + description: Helm release name. + required: false + default: querywise + +runs: + using: composite + steps: + - uses: azure/setup-helm@v4 + with: + version: v3.16.0 + + - uses: azure/setup-kubectl@v4 + + - name: Write kubeconfig + shell: bash + run: | + echo "${{ inputs.kube_config }}" | base64 -d > "${RUNNER_TEMP}/kubeconfig" + chmod 600 "${RUNNER_TEMP}/kubeconfig" + echo "KUBECONFIG=${RUNNER_TEMP}/kubeconfig" >> "$GITHUB_ENV" + + - name: Resolve per-environment values overlay + id: vals + shell: bash + run: | + f="deploy/helm/querywise/values-${{ inputs.environment }}.yaml" + if [ -f "$f" ]; then + echo "arg=--values $f" >> "$GITHUB_OUTPUT" + echo "Using overlay $f" + else + echo "arg=" >> "$GITHUB_OUTPUT" + echo "No overlay at $f — using chart defaults + --set." 
+ fi + + - name: Helm upgrade + shell: bash + env: + OWNER: ${{ github.repository_owner }} + TAG: ${{ inputs.image_tag }} + run: | + helm upgrade --install "${{ inputs.release }}" deploy/helm/querywise \ + --namespace "${{ inputs.namespace }}" --create-namespace \ + ${{ steps.vals.outputs.arg }} \ + --set image.backend.repository="ghcr.io/${OWNER}/querywise-backend" \ + --set image.backend.tag="${TAG}" \ + --set image.frontend.repository="ghcr.io/${OWNER}/querywise-frontend" \ + --set image.frontend.tag="${TAG}" \ + --wait --atomic --timeout 10m + + - name: Rollout summary + shell: bash + run: | + helm status "${{ inputs.release }}" --namespace "${{ inputs.namespace }}" || true + kubectl get pods -n "${{ inputs.namespace }}" \ + -l app.kubernetes.io/instance="${{ inputs.release }}" || true diff --git a/.github/workflows/deploy-validate.yml b/.github/workflows/deploy-validate.yml new file mode 100644 index 0000000..9e302b3 --- /dev/null +++ b/.github/workflows/deploy-validate.yml @@ -0,0 +1,69 @@ +name: Deploy artifacts + +# Validates the deployment artifacts so a broken chart or module never merges. +# Runs only when something under deploy/ (or these workflows) changes. 
+ +on: + pull_request: + paths: + - "deploy/**" + - ".github/workflows/deploy-validate.yml" + push: + branches: [main] + paths: + - "deploy/**" + +jobs: + helm: + name: Helm lint + kubeconform + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: azure/setup-helm@v4 + with: + version: v3.16.0 + + - name: Helm lint + run: helm lint deploy/helm/querywise + + - name: Install kubeconform + run: | + curl -sSL -o /tmp/kubeconform.tar.gz \ + https://github.com/yannh/kubeconform/releases/download/v0.6.7/kubeconform-linux-amd64.tar.gz + tar -xzf /tmp/kubeconform.tar.gz -C /tmp + sudo mv /tmp/kubeconform /usr/local/bin/ + + - name: Render + schema-validate + run: | + helm template querywise deploy/helm/querywise \ + --set secrets.data.DATABASE_URL=postgresql+asyncpg://u:p@db:5432/querywise \ + --set secrets.data.REDIS_URL=redis://redis:6379/0 \ + --set secrets.data.ENCRYPTION_KEY=x --set secrets.data.JWT_SECRET=y \ + | kubeconform -strict -summary -kubernetes-version 1.29.0 + + terraform: + name: Terraform fmt + validate (${{ matrix.cloud }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + cloud: [aws, gcp, azure] + defaults: + run: + working-directory: deploy/terraform/${{ matrix.cloud }} + steps: + - uses: actions/checkout@v4 + + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.9.5" + + - name: Format check + run: terraform fmt -check -recursive + + - name: Init (no backend) + run: terraform init -backend=false -input=false + + - name: Validate + run: terraform validate diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..81f93b1 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,104 @@ +name: Release + +# Build + push the two production images, then deploy with Helm: +# push to main -> build -> deploy to STAGING +# push tag v* -> build -> deploy to PRODUCTION (gated by the environment's +# required reviewers) +# manual -> build only 
(workflow_dispatch) +# +# Required GitHub Environment secrets: +# staging / production: KUBE_CONFIG (base64-encoded kubeconfig for the cluster) +# Images push to GHCR using the built-in GITHUB_TOKEN (packages: write). + +on: + push: + branches: [main] + tags: ["v*"] + workflow_dispatch: + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +env: + REGISTRY: ghcr.io + +jobs: + images: + name: Build & push (${{ matrix.component }}) + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + matrix: + include: + - component: backend + context: backend + dockerfile: backend/Dockerfile.prod + - component: frontend + context: frontend + dockerfile: frontend/Dockerfile.prod + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Image metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/querywise-${{ matrix.component }} + tags: | + type=raw,value=${{ github.sha }} + type=ref,event=branch + type=semver,pattern={{version}} + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build & push + uses: docker/build-push-action@v6 + with: + context: ${{ matrix.context }} + file: ${{ matrix.dockerfile }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + # Frontend is built same-origin; nginx proxies /api to the backend. 
+ build-args: ${{ matrix.component == 'frontend' && 'VITE_API_URL=' || '' }} + cache-from: type=gha + cache-to: type=gha,mode=max + + deploy-staging: + name: Deploy to staging + needs: images + if: github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + environment: staging + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/helm-deploy + with: + environment: staging + image_tag: ${{ github.sha }} + kube_config: ${{ secrets.KUBE_CONFIG }} + + deploy-prod: + name: Deploy to production + needs: images + if: startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + environment: production + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/helm-deploy + with: + environment: production + image_tag: ${{ github.sha }} + kube_config: ${{ secrets.KUBE_CONFIG }} diff --git a/.gitignore b/.gitignore index e8a769b..fc77c21 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,8 @@ Thumbs.db # Environment / secrets .env +.env.prod +.env.*.local *.pem *.key diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f6a69f..8753a21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,36 @@ product surface; all optional dependencies degrade gracefully). so the lineage tests run (they `importorskip` past `sqlglot` when the extra is absent). - **Deferred to a later milestone:** column profiling (null rate / distinct counts / sample values). +### Added (Packaging & deployability) +- **Hardened production images** — multi-stage, non-root `backend/Dockerfile.prod` + (builder venv → slim runtime, `curl` healthcheck, prod extras only) and + `frontend/Dockerfile.prod` (Vite build → unprivileged nginx serving the SPA and + reverse-proxying `/api` + `/mcp`). The dev `Dockerfile`s are untouched. +- **Production compose** (`docker-compose.prod.yml`) — pgvector app-db, Redis, + one-shot `migrate` service (gated so backend replicas never race on Alembic), + backend (uvicorn), arq `worker`, and the nginx edge. 
Configured by `.env.prod` + (`.env.prod.example` template). +- **Helm chart** (`deploy/helm/querywise/`, EKS/GKE/AKS) — backend Deployment + + HPA + PDB, arq `worker`, frontend + PDB, path-based ingress (`/api`+`/mcp` → + backend, `/` → SPA), ServiceAccount, and a `pre-install`/`pre-upgrade` + migration hook Job. Secrets via a chart-created Secret or `existingSecret` + (external-secrets seam). Validated with `helm lint` + `kubeconform`. +- **Terraform modules** (`deploy/terraform/{aws,gcp,azure}/`) — each provisions + the data plane + secrets in the customer's own account/VPC: managed Postgres 16 + (pgvector) + managed Redis + a secret store with the assembled DSNs/keys + + object storage + optional networking + an identity/policy for external-secrets. + Compute (cluster) is intentionally separate state. `terraform validate`-clean. +- **CI/CD** (`.github/workflows/`) — `deploy-validate.yml` lints the chart + (`kubeconform`) and Terraform (`fmt`/`validate`) on PRs; `release.yml` builds + + pushes both images to GHCR and deploys with Helm (`main` → staging, tag `v*` → + production, `--wait --atomic`) via a reusable composite action. +- **Ops** (`deploy/ops/`) — `backup.sh`/`restore.sh` (encrypted `pg_dump`/ + `pg_restore`), an in-cluster backup CronJob example, a DR runbook (backup/ + restore, region rebuild, upgrade path, quarterly credential rotation), and a + production config reference. +- **Deferred:** the managed-SaaS control plane (provisioning/billing/fleet + upgrades) — additive, since each tenant is already an isolated instance. + ## [1.0.0] - 2026-06-04 First stable release: natural-language-to-SQL with a semantic metadata layer. diff --git a/CLAUDE.md b/CLAUDE.md index ee60f1f..3f4e0ed 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -269,3 +269,21 @@ Makes the semantic layer discoverable and trustworthy. 
Two milestones; migration - **Lineage** (`app/services/lineage_service.py`, `ArtifactDependency` model): saved-query `pinned_sql` and metric `sql_expression` are parsed with **sqlglot** (optional `[lineage]` extra; lazy import, degrades to a no-op if absent) into table/column edges, recomputed on create/update (best-effort, never blocks the write). Per-artifact "what this touches" at `GET .../{saved-queries|metrics}/{id}/lineage`; impact view "what depends on this table" at `GET .../catalog/lineage?table=&column=`. Connector type → sqlglot dialect via `dialect_for`. - **Endpoints:** `/connections/{id}/catalog/{search,facets,lineage}`, plus `/status`, `/versions`, `/versions/{v}`, and `/lineage` sub-resources on the metric/glossary/sample-query/saved-query routers. - **Heads-up:** existing rows migrate to `status='draft'`, `version=1`. The saved-query PUT routes any `status` change through the governed lifecycle (no raw status writes). sqlglot is a new optional dep — install the `[lineage]` extra (or rebuild the backend image) for lineage to populate. + +## Packaging & deployability (parallel track) + +Production deployment artifacts under `deploy/` (+ root prod compose), separate from the dev `docker-compose.yml` / `Dockerfile`s (which stay untouched for local work). The whole **Packaging & deployability** parallel track from `planfull.md` is complete: hardened images, prod compose, Helm chart, Terraform for AWS + GCP + Azure, CI/CD (build/push/deploy), and ops (backup/restore, DR runbook, config reference). The only deferred item is the **SaaS control plane** (provisioning/billing/fleet upgrades), which is additive and build-on-demand. Overview: `deploy/README.md`. 
+ +- **Hardened images:** `backend/Dockerfile.prod` (multi-stage: builder venv → slim runtime, non-root uid 1001, `curl` healthcheck on `/api/v1/health/live`, `uvicorn --workers ${UVICORN_WORKERS:-4}`, prod extras only — no `[dev]`) and `frontend/Dockerfile.prod` (Vite build → `nginxinc/nginx-unprivileged:1.27-alpine`, non-root uid 101, listens 8080). `.dockerignore` in both dirs. +- **Edge:** `frontend/nginx.conf` serves the SPA bundle (with client-route fallback) and reverse-proxies `/api`, `/mcp` (buffering off for SSE), and health to the backend **same-origin**. Uses Docker's embedded resolver (`127.0.0.11`) + a `set $backend` variable `proxy_pass` so the edge boots even while the backend is starting (a static `upstream` would make nginx refuse to start). Internal `/healthz` for the container healthcheck. TLS terminates here (mount certs + add a 443 block) or upstream at a LB. +- **Same-origin build:** `frontend/src/api/client.ts` uses `?? 'http://localhost:8000'` (not `||`) so the prod build with `VITE_API_URL=""` calls the API at relative `/api/v1`; unset (dev) still falls back to the local backend. +- **Prod stack:** `docker-compose.prod.yml` — `app-db` (pgvector, no host port), `redis` (cache + arq), one-shot `migrate` (`alembic upgrade head`, gated by `service_completed_successfully` so backend replicas never race), `backend` (uvicorn, `JOB_BACKEND=arq`), `worker` (`arq app.jobs.worker.WorkerSettings`), `frontend` (edge, the only published port). Run: `cp .env.prod.example .env.prod` → edit → `docker compose -f docker-compose.prod.yml --env-file .env.prod up -d --build`. +- **Config:** `.env.prod.example` is the prod-tuned template (CHANGE_ME secrets, `DISABLE_AUTH=false`, `AUTH_COOKIE_SECURE=true`, `LOG_FORMAT=json`, `AUTO_SETUP_SAMPLE_DB=false`). `.env.prod` is gitignored. 
+- **Helm chart** (`deploy/helm/querywise/`, EKS/GKE/AKS): backend Deployment (uvicorn) + HPA + PDB, dedicated arq `worker` Deployment, frontend Deployment + PDB, two Services, ingress (path-based: `/api`+`/mcp`→backend, `/`→frontend SPA — same-origin), ServiceAccount (IRSA/Workload-Identity annotations). Managed Postgres+pgvector and Redis are **expected out-of-cluster** (supply DSNs via the release Secret). Config split: non-secret env → ConfigMap, secrets → chart-created Secret **or** `secrets.existingSecret` (external-secrets/sealed-secrets seam). **Migration:** `alembic upgrade head` runs as a `pre-install`/`pre-upgrade` hook Job (weight `-5`); the ConfigMap+Secret are also hooks (weight `-10`) so they exist first, and the Job gates new backend pods so replicas never race. Validate: `helm lint` + `helm template ... | kubeconform -strict` (both pass). `values-production.example.yaml` is a realistic override; `deploy/README.md` has the install flow. +- **Terraform** (`deploy/terraform/{aws,gcp,azure}/`): each provisions the **data plane + secrets** the chart consumes, in the customer's own account/VPC, with the **same shape** — managed Postgres 16 (pgvector via app migrations) + managed Redis (cache + arq) + a secret store holding the assembled DSNs+keys (keys map 1:1 to backend env → external-secrets `dataFrom` into the `querywise-secrets` k8s Secret) + object storage (exports/backups) + optional network + an identity/policy for external-secrets to read the secret. DB password + JWT secret auto-generate if unset; `ENCRYPTION_KEY` is required (Fernet). **Compute (EKS/GKE/AKS) is deliberately out of scope** — BYO or the upstream cluster module, kept in a separate state so cluster rebuilds never risk the DB. + - **AWS:** RDS (Multi-AZ, gp3, `rds.force_ssl`) + ElastiCache + Secrets Manager + S3; IAM policy for the external-secrets IRSA role. 
+ - **GCP:** Cloud SQL (private IP via PSA peering) + Memorystore + Secret Manager + GCS; a service account with `secretAccessor` for Workload Identity. + - **Azure:** Postgres flexible server (VNet-integrated, `azure.extensions=VECTOR` allow-list) + Cache for Redis + Key Vault + Blob; a user-assigned managed identity with Key Vault read for Workload Identity. + - All three pass `tofu/terraform validate` + `fmt`. `*.tfvars` gitignored; lockfiles committed. +- **CI/CD** (`.github/workflows/`): `ci.yml` (existing — backend tests gating + advisory lint/type, frontend lint/build) is unchanged. **`deploy-validate.yml`** runs on PRs touching `deploy/**` — `helm lint` + `helm template | kubeconform -strict`, and `terraform fmt -check`/`validate` across aws/gcp/azure (matrix). **`release.yml`** builds+pushes both images to GHCR (`querywise-{backend,frontend}`, tagged SHA/branch/semver/latest, gha cache) then deploys via the `.github/actions/helm-deploy` composite action: push to `main` → **staging**, tag `v*` → **production** (gate with environment required-reviewers). Deploys pin the release to the commit SHA with `--wait --atomic` (auto-rollback) and inject only image coords; per-env overlay `values-<environment>.yaml` (committed, non-secret) is applied if present. Each environment needs a `KUBE_CONFIG` secret (base64 kubeconfig); clusters run external-secrets to sync `querywise-secrets`. Lint with `actionlint`. +- **Ops** (`deploy/ops/`): `backup.sh` (`pg_dump` custom format → AES-256/openssl PBKDF2 → `querywise-<timestamp>.dump.enc`, optional S3/GCS upload, local retention prune) + `restore.sh` (decrypt → `pg_restore --clean --if-exists`, guarded by `RESTORE_CONFIRM=yes`); both strip the `+asyncpg` suffix from `DATABASE_URL`, shellcheck-clean. `backup-cronjob.example.yaml` schedules backups in-cluster (postgres:16 image, script via ConfigMap, `BACKUP_PASSPHRASE`+`DATABASE_URL` from `querywise-secrets`). 
`RUNBOOK.md` covers backup/restore, managed-DB PITR, full-region DR rebuild, the Alembic upgrade path (migrations only run via the Helm/compose migrate hook), and quarterly credential rotation — **`ENCRYPTION_KEY` must not be blind-rotated** (it Fernet-encrypts stored connection strings; re-encrypt each connection before swapping). `config-reference.md` is the production-focused settings catalogue (the full list is in the env-vars table above / `.env.example`). diff --git a/README.md b/README.md index 332dc38..c33519c 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ A full-stack application that translates natural language questions into SQL que - **Data catalog** — hybrid search (embeddings + keyword) across tables, columns, metrics, glossary, and knowledge, with facets and certified-first ranking - **Lineage** — sqlglot parses saved-query/metric SQL to show what each touches and what depends on a given table (impact view) - **Production hardening** — rate limiting, async job queue, OpenTelemetry tracing, structured logging, health probes +- **Deploy anywhere** — hardened non-root images, a production Docker Compose stack, a Helm chart (HPA/PDB/ingress/migration hook), Terraform for AWS/GCP/Azure (managed Postgres+pgvector, Redis, secrets, in your own VPC), GitHub Actions CI/CD (build → staging → prod), and ops tooling (encrypted backup/restore + DR runbook) — see [`deploy/`](deploy/) --- @@ -391,6 +392,28 @@ For development, `docker compose up app-db sample-db` starts both databases with --- +## Production Deployment + +The `docker compose up` flow above is for **local development**. 
For production, +QueryWise ships a full set of deployment artifacts under [`deploy/`](deploy/) — +the same build-once images configured entirely by environment: + +| Target | Where | Best for | +|--------|-------|----------| +| **Docker Compose (prod)** | [`docker-compose.prod.yml`](docker-compose.prod.yml) | Small / on-prem, single host | +| **Helm chart** | [`deploy/helm/querywise/`](deploy/helm/querywise) | EKS / GKE / AKS | +| **Terraform** | [`deploy/terraform/{aws,gcp,azure}/`](deploy/terraform) | Managed Postgres+pgvector, Redis, secrets — in your own VPC | +| **CI/CD** | [`.github/workflows/release.yml`](.github/workflows/release.yml) | Build → push images → Helm deploy (staging → prod) | +| **Ops** | [`deploy/ops/`](deploy/ops) | Encrypted backup/restore, DR runbook, config reference | + +Highlights: hardened multi-stage **non-root** images, a one-shot Alembic +migration that runs before new pods roll (replicas never race), backend +autoscaling + PodDisruptionBudgets, secrets via the **external-secrets** seam, +and a same-origin SPA behind an nginx edge. Start at [`deploy/README.md`](deploy/README.md); +the production env template is [`.env.prod.example`](.env.prod.example). 
+ +--- + ## Environment Variables | Variable | Default | Description | @@ -425,10 +448,18 @@ For development, `docker compose up app-db sample-db` starts both databases with ``` querywise/ -├── docker-compose.yml # 4 services: app-db, sample-db, backend, frontend -├── .env.example # Environment variable template +├── docker-compose.yml # Dev: app-db, sample-db, backend, frontend +├── docker-compose.prod.yml # Prod: + redis, migrate, arq worker, nginx edge +├── .env.example # Environment variable template (dev) +├── .env.prod.example # Environment variable template (prod) ├── CLAUDE.md # Claude Code project conventions +├── CHANGELOG.md # Release notes ├── README.md # This file +├── deploy/ # Production deployment artifacts +│ ├── helm/querywise/ # Helm chart (HPA, PDB, ingress, migration hook) +│ ├── terraform/{aws,gcp,azure}/ # Managed Postgres+pgvector, Redis, secrets +│ └── ops/ # backup/restore, DR runbook, config reference +├── .github/workflows/ # CI (tests/lint) + release (build → deploy) │ ├── backend/ │ ├── Dockerfile diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 0000000..979fd6b --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,16 @@ +# Keep the build context lean + avoid baking local state into the image. +__pycache__/ +*.py[cod] +*.egg-info/ +.mypy_cache/ +.pytest_cache/ +.ruff_cache/ +.venv/ +venv/ +tests/ +.env +.env.* +*.sqlite +*.log +.git/ +.DS_Store diff --git a/backend/Dockerfile.prod b/backend/Dockerfile.prod new file mode 100644 index 0000000..17b87c5 --- /dev/null +++ b/backend/Dockerfile.prod @@ -0,0 +1,71 @@ +# syntax=docker/dockerfile:1 +# +# Hardened, multi-stage production image for the QueryWise backend. +# * builder stage compiles deps into an isolated venv (needs gcc/libpq-dev) +# * runtime stage carries only the venv + app + libpq runtime, runs non-root +# +# Build: docker build -f Dockerfile.prod -t querywise-backend:prod . 
+# Used by docker-compose.prod.yml for the backend, worker, and migrate services. + +# ---- builder --------------------------------------------------------------- +FROM python:3.12-slim AS builder + +ENV PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PYTHONDONTWRITEBYTECODE=1 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libpq-dev && \ + rm -rf /var/lib/apt/lists/* + +# Isolated venv we can copy wholesale into the runtime image. +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +WORKDIR /build +# Copy only what the package + build needs (keeps the layer cache warm). +COPY pyproject.toml ./ +COPY app ./app +COPY alembic ./alembic +COPY alembic.ini ./ +COPY scripts ./scripts + +# Production extras only — no [dev]. Non-editable install so the venv is +# self-contained and the source tree isn't needed at runtime. +RUN pip install ".[llm,bigquery,databricks,export,lineage,observability,jobs,scheduling]" + +# ---- runtime --------------------------------------------------------------- +FROM python:3.12-slim AS runtime + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PATH="/opt/venv/bin:$PATH" + +# libpq for asyncpg/psycopg at runtime; curl for the container healthcheck. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq5 curl && \ + rm -rf /var/lib/apt/lists/* && \ + # Non-root runtime user. + groupadd --system --gid 1001 querywise && \ + useradd --system --uid 1001 --gid querywise --no-create-home querywise + +COPY --from=builder /opt/venv /opt/venv + +WORKDIR /app +# alembic.ini + migrations are needed by the `migrate` service; app code is the +# package itself (installed into the venv) but we keep the tree for alembic env. 
+COPY --chown=querywise:querywise alembic ./alembic +COPY --chown=querywise:querywise alembic.ini ./ +COPY --chown=querywise:querywise app ./app +COPY --chown=querywise:querywise scripts ./scripts + +USER querywise + +EXPOSE 8000 + +# Liveness probe hits the app's own /health/live (process-only, no DB/LLM). +HEALTHCHECK --interval=15s --timeout=5s --start-period=20s --retries=3 \ + CMD curl -fsS http://localhost:8000/api/v1/health/live || exit 1 + +# Workers come from env so the same image scales without a rebuild. +CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers ${UVICORN_WORKERS:-4}"] diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000..079454d --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,128 @@ +# Deploying QueryWise + +Production deployment artifacts. Single-tenant per deployment; isolation is by +workspace within the auto-created default organization. The app is a +**build-once image configured entirely by env** — the same backend/frontend +images run under Docker Compose, Helm, or any of the cloud targets. + +| Target | Where | Best for | +|--------|-------|----------| +| **Docker Compose (prod)** | [`../docker-compose.prod.yml`](../docker-compose.prod.yml) | Small / on-prem, single host | +| **Helm chart** | [`helm/querywise/`](helm/querywise) | EKS / GKE / AKS | +| **Terraform — AWS** | [`terraform/aws/`](terraform/aws) | RDS pgvector + ElastiCache + Secrets Manager + S3, in your VPC | +| **Terraform — GCP** | [`terraform/gcp/`](terraform/gcp) | Cloud SQL pgvector + Memorystore + Secret Manager + GCS | +| **Terraform — Azure** | [`terraform/azure/`](terraform/azure) | Postgres flexible server + Cache for Redis + Key Vault + Blob | +| **Ops** | [`ops/`](ops) | Backup/restore, DR runbook, config reference | + +External dependencies (not bundled in the Helm chart): **managed PostgreSQL 16 +with the `pgvector` extension** and **Redis** (cache + the arq job queue). 
The +Terraform modules provision these; for the chart you supply their DSNs via the +release Secret. + +## Images + +Built from `backend/Dockerfile.prod` and `frontend/Dockerfile.prod`: + +```bash +docker build -f backend/Dockerfile.prod -t ghcr.io/your-org/querywise-backend:1.0.0 backend +docker build -f frontend/Dockerfile.prod -t ghcr.io/your-org/querywise-frontend:1.0.0 frontend +docker push ghcr.io/your-org/querywise-backend:1.0.0 +docker push ghcr.io/your-org/querywise-frontend:1.0.0 +``` + +Both run **non-root**; the frontend serves the SPA and the backend is uvicorn +with an arq worker alongside. The SPA is built same-origin (`VITE_API_URL=""`), +so the edge / ingress routes `/api` + `/mcp` to the backend and everything else +to the frontend. + +## Helm + +```bash +# 1. Provide secrets — ideally via external-secrets / sealed-secrets: +kubectl create namespace querywise +kubectl -n querywise create secret generic querywise-secrets \ + --from-literal=DATABASE_URL='postgresql+asyncpg://user:pass@host:5432/querywise' \ + --from-literal=REDIS_URL='redis://host:6379/0' \ + --from-literal=ENCRYPTION_KEY='...' \ + --from-literal=JWT_SECRET='...' \ + --from-literal=OPENAI_API_KEY='...' + +# 2. Install (a pre-upgrade hook runs `alembic upgrade head` before pods roll): +helm upgrade --install querywise deploy/helm/querywise \ + -n querywise \ + -f deploy/helm/querywise/values-production.example.yaml \ + --set secrets.existingSecret=querywise-secrets +``` + +Key chart features: + +- **Migration hook** — `alembic upgrade head` runs as a `pre-install`/`pre-upgrade` + Job (ordered after the config/secret hooks) so schema changes land before new + backend code serves and the N replicas never race. +- **Scaling** — backend HPA (CPU), PodDisruptionBudgets on backend + frontend, + dedicated arq `worker` Deployment. 
+- **Secrets seam** — `secrets.existingSecret` to bring your own (external-secrets + operator, sealed-secrets, cloud sync) instead of putting values in the release. +- **Service account annotations** — for IRSA (EKS) / Workload Identity (GKE) / + Azure Workload Identity. + +See [`helm/querywise/values.yaml`](helm/querywise/values.yaml) for the full set +of knobs and [`values-production.example.yaml`](helm/querywise/values-production.example.yaml) +for a realistic production override. + +### Validate locally + +```bash +helm lint deploy/helm/querywise +helm template querywise deploy/helm/querywise | kubeconform -strict -summary +``` + +## CI/CD + +Two GitHub Actions workflows under [`../.github/workflows`](../.github/workflows): + +- **`deploy-validate.yml`** (PRs touching `deploy/**`) — `helm lint` + `helm + template | kubeconform -strict`, and `terraform fmt -check` + `validate` for + each of aws/gcp/azure. Keeps a broken chart or module from merging. +- **`release.yml`** — builds + pushes both images to GHCR + (`ghcr.io/<owner>/querywise-{backend,frontend}`, tagged with the commit SHA, + branch, semver, and `latest`), then deploys with Helm via the + [`helm-deploy`](../.github/actions/helm-deploy) composite action: + - **push to `main`** → deploy to the **staging** environment + - **push tag `v*`** → deploy to the **production** environment (gate it with + required reviewers in the environment's protection rules for manual approval) + - **manual run** → build only + +Both deploys pin the release to the exact commit SHA (`--wait --atomic`, so a +failed rollout auto-reverts) and inject only the image coordinates; everything +else comes from the chart defaults plus an optional committed overlay +`deploy/helm/querywise/values-<environment>.yaml` (see the `*-staging` / +`*-production` examples). 
+ +### Required GitHub config + +| What | Where | Value | +|------|-------|-------| +| `KUBE_CONFIG` | Environment secret on **staging** and **production** | base64-encoded kubeconfig for that cluster | +| Required reviewers | **production** environment protection rules | who approves prod deploys | +| Packages: write | repo default `GITHUB_TOKEN` | already granted in the workflow | + +The clusters are expected to run the **external-secrets operator** syncing the +cloud secret store (provisioned by Terraform) into the `querywise-secrets` +Kubernetes Secret the chart references. + +## Operations + +Day-2 procedures live in [`ops/`](ops): + +- **Backups** — [`ops/backup.sh`](ops/backup.sh): `pg_dump` (custom format) → + AES-256 (openssl) → `querywise-<timestamp>.dump.enc`, with optional S3/GCS upload and + local retention. Schedule it in-cluster with + [`ops/backup-cronjob.example.yaml`](ops/backup-cronjob.example.yaml). +- **Restore** — [`ops/restore.sh`](ops/restore.sh): decrypt → `pg_restore + --clean --if-exists` (guarded by `RESTORE_CONFIRM=yes`). +- **Runbook** — [`ops/RUNBOOK.md`](ops/RUNBOOK.md): backup/restore, full-region + DR rebuild, the Alembic upgrade path, and quarterly credential rotation + (including the `ENCRYPTION_KEY` caveat). +- **Config reference** — [`ops/config-reference.md`](ops/config-reference.md): + every production-critical setting, where it's set, and what's a secret. diff --git a/deploy/helm/querywise/.helmignore b/deploy/helm/querywise/.helmignore new file mode 100644 index 0000000..a313914 --- /dev/null +++ b/deploy/helm/querywise/.helmignore @@ -0,0 +1,8 @@ +.DS_Store +.git/ +.gitignore +*.tmproj +*.bak +*.swp +*~ +ci/ diff --git a/deploy/helm/querywise/Chart.yaml b/deploy/helm/querywise/Chart.yaml new file mode 100644 index 0000000..85c3bf1 --- /dev/null +++ b/deploy/helm/querywise/Chart.yaml @@ -0,0 +1,21 @@ +apiVersion: v2 +name: querywise +description: >- + QueryWise — text-to-SQL with a semantic metadata layer. 
Single-tenant per + deployment; brings up the backend (uvicorn), arq worker, and frontend edge, + with a one-shot migration hook. Managed Postgres (pgvector) and Redis are + expected to be provided out-of-cluster (see the Terraform modules). +type: application +# Chart version — bump on chart changes. +version: 0.1.0 +# Tracks the QueryWise app release the chart defaults target. +appVersion: "0.1.0" +keywords: + - text-to-sql + - semantic-layer + - analytics +home: https://github.com/kosminus/querywise +sources: + - https://github.com/kosminus/querywise +maintainers: + - name: QueryWise diff --git a/deploy/helm/querywise/templates/NOTES.txt b/deploy/helm/querywise/templates/NOTES.txt new file mode 100644 index 0000000..57a5194 --- /dev/null +++ b/deploy/helm/querywise/templates/NOTES.txt @@ -0,0 +1,30 @@ +QueryWise {{ .Chart.AppVersion }} deployed as release "{{ .Release.Name }}". + +Components: + backend {{ include "querywise.backendImage" . }} (uvicorn, {{ .Values.backend.uvicornWorkers }} workers/pod) + worker arq background jobs + frontend {{ include "querywise.frontendImage" . }} (nginx edge / SPA) + +A pre-install/pre-upgrade hook ran `alembic upgrade head`{{ if not .Values.migrate.enabled }} (DISABLED — migrate.enabled=false){{ end }}. + +{{- if .Values.ingress.enabled }} + +Reach it at: http{{ if .Values.ingress.tls.enabled }}s{{ end }}://{{ .Values.ingress.host }}/ +{{- else }} + +Ingress is disabled. Port-forward the frontend to try it locally: + kubectl port-forward svc/{{ include "querywise.fullname" . }}-frontend 8080:{{ .Values.frontend.service.port }} + open http://localhost:8080 +{{- end }} + +{{- if not .Values.secrets.existingSecret }} +{{- if not .Values.secrets.data.DATABASE_URL }} + +WARNING: secrets.data.DATABASE_URL is empty. Set DATABASE_URL and REDIS_URL +(managed Postgres+pgvector / Redis), ENCRYPTION_KEY, and JWT_SECRET — or point +secrets.existingSecret at an external-secrets-managed Secret. 
+{{- end }} +{{- end }} + +Check rollout: + kubectl get pods -l app.kubernetes.io/instance={{ .Release.Name }} diff --git a/deploy/helm/querywise/templates/_helpers.tpl b/deploy/helm/querywise/templates/_helpers.tpl new file mode 100644 index 0000000..6b0ca60 --- /dev/null +++ b/deploy/helm/querywise/templates/_helpers.tpl @@ -0,0 +1,86 @@ +{{/* Chart name (overridable). */}} +{{- define "querywise.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* Fully-qualified release name. */}} +{{- define "querywise.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "querywise.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* Common labels. */}} +{{- define "querywise.labels" -}} +helm.sh/chart: {{ include "querywise.chart" . }} +{{ include "querywise.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* Selector labels (app-wide). */}} +{{- define "querywise.selectorLabels" -}} +app.kubernetes.io/name: {{ include "querywise.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* Per-component selector labels. Pass a dict: (dict "ctx" . "component" "backend"). */}} +{{- define "querywise.componentSelectorLabels" -}} +{{ include "querywise.selectorLabels" .ctx }} +app.kubernetes.io/component: {{ .component }} +{{- end -}} + +{{/* Service account name. 
*/}} +{{- define "querywise.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} +{{- default (include "querywise.fullname" .) .Values.serviceAccount.name -}} +{{- else -}} +{{- default "default" .Values.serviceAccount.name -}} +{{- end -}} +{{- end -}} + +{{/* Name of the Secret to read env from (existing or chart-created). */}} +{{- define "querywise.secretName" -}} +{{- if .Values.secrets.existingSecret -}} +{{- .Values.secrets.existingSecret -}} +{{- else -}} +{{- printf "%s-secrets" (include "querywise.fullname" .) -}} +{{- end -}} +{{- end -}} + +{{/* Name of the ConfigMap to read env from. */}} +{{- define "querywise.configMapName" -}} +{{- printf "%s-config" (include "querywise.fullname" .) -}} +{{- end -}} + +{{/* Image refs (tag defaults to appVersion). */}} +{{- define "querywise.backendImage" -}} +{{- $tag := default .Chart.AppVersion .Values.image.backend.tag -}} +{{- printf "%s:%s" .Values.image.backend.repository $tag -}} +{{- end -}} + +{{- define "querywise.frontendImage" -}} +{{- $tag := default .Chart.AppVersion .Values.image.frontend.tag -}} +{{- printf "%s:%s" .Values.image.frontend.repository $tag -}} +{{- end -}} + +{{/* envFrom block shared by backend, worker, and migrate. */}} +{{- define "querywise.envFrom" -}} +- configMapRef: + name: {{ include "querywise.configMapName" . }} +- secretRef: + name: {{ include "querywise.secretName" . }} +{{- end -}} diff --git a/deploy/helm/querywise/templates/backend-deployment.yaml b/deploy/helm/querywise/templates/backend-deployment.yaml new file mode 100644 index 0000000..c29fac1 --- /dev/null +++ b/deploy/helm/querywise/templates/backend-deployment.yaml @@ -0,0 +1,70 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "querywise.fullname" . }}-backend + labels: + {{- include "querywise.labels" . 
| nindent 4 }} + app.kubernetes.io/component: backend +spec: + {{- if not .Values.backend.autoscaling.enabled }} + replicas: {{ .Values.backend.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "backend") | nindent 6 }} + template: + metadata: + labels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "backend") | nindent 8 }} + annotations: + # Roll pods when the ConfigMap content changes (Secret changes are not hashed here). + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + spec: + serviceAccountName: {{ include "querywise.serviceAccountName" . }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: backend + image: {{ include "querywise.backendImage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + {{- toYaml .Values.backend.securityContext | nindent 12 }} + command: + - sh + - -c + - uvicorn app.main:app --host 0.0.0.0 --port {{ .Values.backend.containerPort }} --workers {{ .Values.backend.uvicornWorkers }} + ports: + - name: http + containerPort: {{ .Values.backend.containerPort }} + envFrom: + {{- include "querywise.envFrom" . | nindent 12 }} + livenessProbe: + httpGet: + path: /api/v1/health/live + port: http + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /api/v1/health/ready + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + {{- toYaml .Values.backend.resources | nindent 12 }} + {{- with .Values.backend.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.backend.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.backend.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/deploy/helm/querywise/templates/backend-hpa.yaml b/deploy/helm/querywise/templates/backend-hpa.yaml new file mode 100644 index 0000000..110e0a6 --- /dev/null +++ b/deploy/helm/querywise/templates/backend-hpa.yaml @@ -0,0 +1,23 @@ +{{- if .Values.backend.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "querywise.fullname" . }}-backend + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: backend +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "querywise.fullname" . }}-backend + minReplicas: {{ .Values.backend.autoscaling.minReplicas }} + maxReplicas: {{ .Values.backend.autoscaling.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.backend.autoscaling.targetCPUUtilizationPercentage }} +{{- end }} diff --git a/deploy/helm/querywise/templates/backend-pdb.yaml b/deploy/helm/querywise/templates/backend-pdb.yaml new file mode 100644 index 0000000..4d2f5e6 --- /dev/null +++ b/deploy/helm/querywise/templates/backend-pdb.yaml @@ -0,0 +1,14 @@ +{{- if .Values.backend.podDisruptionBudget.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "querywise.fullname" . }}-backend + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: backend +spec: + minAvailable: {{ .Values.backend.podDisruptionBudget.minAvailable }} + selector: + matchLabels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . 
"component" "backend") | nindent 6 }} +{{- end }} diff --git a/deploy/helm/querywise/templates/backend-service.yaml b/deploy/helm/querywise/templates/backend-service.yaml new file mode 100644 index 0000000..06b0ab5 --- /dev/null +++ b/deploy/helm/querywise/templates/backend-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "querywise.fullname" . }}-backend + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: backend +spec: + type: {{ .Values.backend.service.type }} + ports: + - name: http + port: {{ .Values.backend.service.port }} + targetPort: http + protocol: TCP + selector: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "backend") | nindent 4 }} diff --git a/deploy/helm/querywise/templates/configmap.yaml b/deploy/helm/querywise/templates/configmap.yaml new file mode 100644 index 0000000..848a82f --- /dev/null +++ b/deploy/helm/querywise/templates/configmap.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "querywise.configMapName" . }} + labels: + {{- include "querywise.labels" . | nindent 4 }} + annotations: + # Also a pre-install/pre-upgrade hook so it exists before the migrate Job + # (lower weight = runs earlier). Persists after success for the Deployments. + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation +data: + {{- range $key, $val := .Values.config }} + {{ $key }}: {{ $val | quote }} + {{- end }} diff --git a/deploy/helm/querywise/templates/frontend-deployment.yaml b/deploy/helm/querywise/templates/frontend-deployment.yaml new file mode 100644 index 0000000..0815a1e --- /dev/null +++ b/deploy/helm/querywise/templates/frontend-deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "querywise.fullname" . }}-frontend + labels: + {{- include "querywise.labels" . 
| nindent 4 }} + app.kubernetes.io/component: frontend +spec: + replicas: {{ .Values.frontend.replicaCount }} + selector: + matchLabels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "frontend") | nindent 6 }} + template: + metadata: + labels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "frontend") | nindent 8 }} + spec: + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: frontend + image: {{ include "querywise.frontendImage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + {{- toYaml .Values.frontend.securityContext | nindent 12 }} + ports: + - name: http + containerPort: {{ .Values.frontend.containerPort }} + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 5 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + {{- toYaml .Values.frontend.resources | nindent 12 }} + {{- with .Values.frontend.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.frontend.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.frontend.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/deploy/helm/querywise/templates/frontend-pdb.yaml b/deploy/helm/querywise/templates/frontend-pdb.yaml new file mode 100644 index 0000000..31204ef --- /dev/null +++ b/deploy/helm/querywise/templates/frontend-pdb.yaml @@ -0,0 +1,14 @@ +{{- if .Values.frontend.podDisruptionBudget.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "querywise.fullname" . }}-frontend + labels: + {{- include "querywise.labels" . 
| nindent 4 }} + app.kubernetes.io/component: frontend +spec: + minAvailable: {{ .Values.frontend.podDisruptionBudget.minAvailable }} + selector: + matchLabels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "frontend") | nindent 6 }} +{{- end }} diff --git a/deploy/helm/querywise/templates/frontend-service.yaml b/deploy/helm/querywise/templates/frontend-service.yaml new file mode 100644 index 0000000..8950199 --- /dev/null +++ b/deploy/helm/querywise/templates/frontend-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "querywise.fullname" . }}-frontend + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: frontend +spec: + type: {{ .Values.frontend.service.type }} + ports: + - name: http + port: {{ .Values.frontend.service.port }} + targetPort: http + protocol: TCP + selector: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "frontend") | nindent 4 }} diff --git a/deploy/helm/querywise/templates/ingress.yaml b/deploy/helm/querywise/templates/ingress.yaml new file mode 100644 index 0000000..9b551f3 --- /dev/null +++ b/deploy/helm/querywise/templates/ingress.yaml @@ -0,0 +1,52 @@ +{{- if .Values.ingress.enabled }} +{{- $fullName := include "querywise.fullname" . -}} +{{- $backendSvc := printf "%s-backend" $fullName -}} +{{- $frontendSvc := printf "%s-frontend" $fullName -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "querywise.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- with .Values.ingress.className }} + ingressClassName: {{ . 
}} + {{- end }} + {{- if .Values.ingress.tls.enabled }} + tls: + - hosts: + - {{ .Values.ingress.host | quote }} + secretName: {{ .Values.ingress.tls.secretName }} + {{- end }} + rules: + - host: {{ .Values.ingress.host | quote }} + http: + paths: + # API + MCP go straight to the backend; the SPA calls these same-origin. + - path: /api + pathType: Prefix + backend: + service: + name: {{ $backendSvc }} + port: + number: {{ .Values.backend.service.port }} + - path: /mcp + pathType: Prefix + backend: + service: + name: {{ $backendSvc }} + port: + number: {{ .Values.backend.service.port }} + # Everything else is the SPA bundle. + - path: / + pathType: Prefix + backend: + service: + name: {{ $frontendSvc }} + port: + number: {{ .Values.frontend.service.port }} +{{- end }} diff --git a/deploy/helm/querywise/templates/migrate-job.yaml b/deploy/helm/querywise/templates/migrate-job.yaml new file mode 100644 index 0000000..e2e6a03 --- /dev/null +++ b/deploy/helm/querywise/templates/migrate-job.yaml @@ -0,0 +1,42 @@ +{{- if .Values.migrate.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "querywise.fullname" . }}-migrate + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: migrate + annotations: + # Runs after the ConfigMap/Secret hooks (weight -10) and before the main + # phase rolls new backend pods — so migrations land before new code serves + # and the N backend replicas never race on `alembic upgrade`. + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: {{ .Values.migrate.backoffLimit }} + template: + metadata: + labels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "migrate") | nindent 8 }} + spec: + restartPolicy: Never + serviceAccountName: {{ include "querywise.serviceAccountName" . }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: migrate + image: {{ include "querywise.backendImage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + {{- toYaml .Values.backend.securityContext | nindent 12 }} + command: ["alembic", "upgrade", "head"] + envFrom: + {{- include "querywise.envFrom" . | nindent 12 }} + resources: + {{- toYaml .Values.migrate.resources | nindent 12 }} +{{- end }} diff --git a/deploy/helm/querywise/templates/secret.yaml b/deploy/helm/querywise/templates/secret.yaml new file mode 100644 index 0000000..d1a3167 --- /dev/null +++ b/deploy/helm/querywise/templates/secret.yaml @@ -0,0 +1,22 @@ +{{- if not .Values.secrets.existingSecret }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "querywise.secretName" . }} + labels: + {{- include "querywise.labels" . | nindent 4 }} + annotations: + # Hook-ordered before the migrate Job, like the ConfigMap. For real + # deployments prefer `secrets.existingSecret` (external-secrets / sealed- + # secrets) so credentials never live in the Helm release. + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation +type: Opaque +stringData: + {{- range $key, $val := .Values.secrets.data }} + {{- if $val }} + {{ $key }}: {{ $val | quote }} + {{- end }} + {{- end }} +{{- end }} diff --git a/deploy/helm/querywise/templates/serviceaccount.yaml b/deploy/helm/querywise/templates/serviceaccount.yaml new file mode 100644 index 0000000..246bad3 --- /dev/null +++ b/deploy/helm/querywise/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "querywise.serviceAccountName" . }} + labels: + {{- include "querywise.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/querywise/templates/worker-deployment.yaml b/deploy/helm/querywise/templates/worker-deployment.yaml new file mode 100644 index 0000000..7acd9ed --- /dev/null +++ b/deploy/helm/querywise/templates/worker-deployment.yaml @@ -0,0 +1,49 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "querywise.fullname" . }}-worker + labels: + {{- include "querywise.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + replicas: {{ .Values.worker.replicaCount }} + selector: + matchLabels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "worker") | nindent 6 }} + template: + metadata: + labels: + {{- include "querywise.componentSelectorLabels" (dict "ctx" . "component" "worker") | nindent 8 }} + annotations: + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + spec: + serviceAccountName: {{ include "querywise.serviceAccountName" . }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: worker + image: {{ include "querywise.backendImage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + {{- toYaml .Values.worker.securityContext | nindent 12 }} + command: ["arq", "app.jobs.worker.WorkerSettings"] + envFrom: + {{- include "querywise.envFrom" . | nindent 12 }} + resources: + {{- toYaml .Values.worker.resources | nindent 12 }} + {{- with .Values.worker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/deploy/helm/querywise/values-production.example.yaml b/deploy/helm/querywise/values-production.example.yaml new file mode 100644 index 0000000..7e23a9e --- /dev/null +++ b/deploy/helm/querywise/values-production.example.yaml @@ -0,0 +1,61 @@ +# Example production overrides for the QueryWise chart. +# helm upgrade --install querywise deploy/helm/querywise \ +# -n querywise --create-namespace \ +# -f deploy/helm/querywise/values-production.example.yaml +# +# Prefer secrets.existingSecret (external-secrets / sealed-secrets) over inline +# secret values for anything real. + +image: + backend: + repository: ghcr.io/your-org/querywise-backend + tag: "1.0.0" + frontend: + repository: ghcr.io/your-org/querywise-frontend + tag: "1.0.0" + +backend: + uvicornWorkers: 4 + autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 12 + targetCPUUtilizationPercentage: 70 + +worker: + replicaCount: 2 + +frontend: + replicaCount: 3 + +# IRSA / Workload Identity so pods can reach a managed secrets backend. +serviceAccount: + annotations: + # eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/querywise + # iam.gke.io/gcp-service-account: querywise@PROJECT.iam.gserviceaccount.com + +ingress: + enabled: true + className: nginx + host: querywise.yourcompany.com + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # Streamable-HTTP MCP needs buffering off + long read timeout at the edge: + nginx.ingress.kubernetes.io/proxy-buffering: "off" + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + tls: + enabled: true + secretName: querywise-tls + +config: + CORS_ORIGINS: '["https://querywise.yourcompany.com"]' + DEFAULT_ADMIN_EMAIL: admin@yourcompany.com + DEFAULT_LLM_PROVIDER: openai + DEFAULT_LLM_MODEL: gpt-5.2 + EMBEDDING_DIMENSION: "1536" + +# Real deployments: sync this Secret with external-secrets and reference it. 
+# It must carry DATABASE_URL, REDIS_URL, ENCRYPTION_KEY, JWT_SECRET, and your +# LLM provider key(s). +secrets: + existingSecret: querywise-secrets diff --git a/deploy/helm/querywise/values-staging.example.yaml b/deploy/helm/querywise/values-staging.example.yaml new file mode 100644 index 0000000..379b4cb --- /dev/null +++ b/deploy/helm/querywise/values-staging.example.yaml @@ -0,0 +1,36 @@ +# Example STAGING overlay. Copy to values-staging.yaml (committed, no secrets — +# those come via external-secrets) and the Release workflow's staging deploy +# picks it up automatically (.github/actions/helm-deploy resolves +# values-<env>.yaml). Image repo/tag are injected by CI via --set. + +backend: + replicaCount: 1 + uvicornWorkers: 2 + autoscaling: + enabled: false + +worker: + replicaCount: 1 + +frontend: + replicaCount: 1 + +ingress: + enabled: true + className: nginx + host: querywise.staging.yourcompany.com + annotations: + cert-manager.io/cluster-issuer: letsencrypt-staging + nginx.ingress.kubernetes.io/proxy-buffering: "off" + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + tls: + enabled: true + secretName: querywise-staging-tls + +config: + ENVIRONMENT: staging + CORS_ORIGINS: '["https://querywise.staging.yourcompany.com"]' + +# Synced from the cloud secret store by external-secrets. +secrets: + existingSecret: querywise-secrets diff --git a/deploy/helm/querywise/values.yaml b/deploy/helm/querywise/values.yaml new file mode 100644 index 0000000..35282ca --- /dev/null +++ b/deploy/helm/querywise/values.yaml @@ -0,0 +1,204 @@ +# Default values for the QueryWise chart. +# Override per-environment with `-f my-values.yaml` or `--set`. + +# -- Naming ------------------------------------------------------------------- +nameOverride: "" +fullnameOverride: "" + +# -- Images ------------------------------------------------------------------- +# Built from backend/Dockerfile.prod and frontend/Dockerfile.prod, pushed to +# your registry by CI. 
Override repository/tag per environment. +image: + pullPolicy: IfNotPresent + pullSecrets: [] + backend: + repository: ghcr.io/kosminus/querywise-backend + tag: "" # defaults to .Chart.AppVersion + frontend: + repository: ghcr.io/kosminus/querywise-frontend + tag: "" # defaults to .Chart.AppVersion + +# -- Backend (uvicorn API) ---------------------------------------------------- +backend: + replicaCount: 2 + uvicornWorkers: 4 + containerPort: 8000 + service: + type: ClusterIP + port: 8000 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + memory: 1Gi + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 8 + targetCPUUtilizationPercentage: 70 + podDisruptionBudget: + enabled: true + minAvailable: 1 + # runAsNonRoot — image runs as uid 1001. + securityContext: + runAsNonRoot: true + runAsUser: 1001 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + nodeSelector: {} + tolerations: [] + affinity: {} + +# -- Worker (arq background jobs) --------------------------------------------- +worker: + replicaCount: 1 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + memory: 1Gi + securityContext: + runAsNonRoot: true + runAsUser: 1001 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + nodeSelector: {} + tolerations: [] + affinity: {} + +# -- Frontend (nginx edge — serves the SPA) ----------------------------------- +frontend: + replicaCount: 2 + containerPort: 8080 + service: + type: ClusterIP + port: 80 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + memory: 128Mi + podDisruptionBudget: + enabled: true + minAvailable: 1 + # nginx-unprivileged runs as uid 101. 
+ securityContext: + runAsNonRoot: true + runAsUser: 101 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + nodeSelector: {} + tolerations: [] + affinity: {} + +# -- Schema migration --------------------------------------------------------- +# Runs `alembic upgrade head` as a pre-install/pre-upgrade Helm hook Job, so +# migrations complete before new backend pods roll and replicas never race. +migrate: + enabled: true + backoffLimit: 2 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + +# -- Service account ---------------------------------------------------------- +# Annotate for IRSA (EKS) / Workload Identity (GKE) / Azure Workload Identity so +# pods can reach a managed secrets backend or the external-secrets operator. +serviceAccount: + create: true + name: "" + annotations: {} + +# -- Ingress ------------------------------------------------------------------ +# Path-based routing: /api + /mcp -> backend, everything else -> frontend SPA. +# The SPA is built same-origin so this is all it needs. +ingress: + enabled: true + className: "" + annotations: {} + host: querywise.example.com + tls: + enabled: false + secretName: querywise-tls + +# -- Non-secret configuration (rendered into a ConfigMap, injected as env) ----- +config: + ENVIRONMENT: production + DEBUG: "false" + LOG_LEVEL: INFO + LOG_FORMAT: json + ENABLE_METRICS: "true" + SERVICE_NAME: querywise-backend + OTEL_ENABLED: "false" + # OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318/v1/traces + + # Jobs: arq (Redis) is required for the multi-pod worker topology. + JOB_BACKEND: arq + + # Auth (Phase 1) — never disable in prod. + DISABLE_AUTH: "false" + AUTH_PROVIDER: local + AUTH_COOKIE_SECURE: "true" + AUTH_COOKIE_SAMESITE: lax + DEFAULT_ORG_SLUG: default + DEFAULT_ADMIN_EMAIL: admin@querywise.local + + # No sample-DB seeding in prod. 
+ AUTO_SETUP_SAMPLE_DB: "false" + + # CORS — same-origin needs no entries; add cross-origin frontends here. + CORS_ORIGINS: '["https://querywise.example.com"]' + + # LLM (OpenAI default — switch provider here; keys go in `secrets`). + DEFAULT_LLM_PROVIDER: openai + DEFAULT_LLM_MODEL: gpt-5.2 + EMBEDDING_MODEL: text-embedding-3-small + EMBEDDING_DIMENSION: "1536" + + # Query defaults + rate limiting. + DEFAULT_QUERY_TIMEOUT_SECONDS: "30" + DEFAULT_MAX_ROWS: "1000" + MAX_RETRY_ATTEMPTS: "3" + MAX_QUERIES_PER_MINUTE: "30" + RATE_LIMIT_ENABLED: "true" + +# -- Secrets ------------------------------------------------------------------ +# Either: +# (a) point at a Secret you manage out-of-band (external-secrets operator, +# sealed-secrets, cloud sync) via `existingSecret`, OR +# (b) let the chart create one from `data` below (fine for bootstrap, but the +# values land in your release — prefer (a) for real deployments). +secrets: + existingSecret: "" + data: + # REQUIRED — managed Postgres (pgvector) and Redis DSNs. + DATABASE_URL: "" + REDIS_URL: "" + # REQUIRED — generate before install (see .env.prod.example). + ENCRYPTION_KEY: "" + JWT_SECRET: "" + # Optional bootstrap admin password. + DEFAULT_ADMIN_PASSWORD: "" + # LLM provider keys (set the ones your provider needs). + OPENAI_API_KEY: "" + # ANTHROPIC_API_KEY: "" + # AZURE_OPENAI_API_KEY: "" + +# -- Pod-level security context (shared) -------------------------------------- +podSecurityContext: + fsGroup: 1001 diff --git a/deploy/ops/RUNBOOK.md b/deploy/ops/RUNBOOK.md new file mode 100644 index 0000000..f5e8806 --- /dev/null +++ b/deploy/ops/RUNBOOK.md @@ -0,0 +1,162 @@ +# QueryWise — Operations & DR Runbook + +Operational procedures for a production QueryWise deployment: backups, restore / +disaster recovery, schema upgrades, and credential rotation. Pairs with the +deploy artifacts in [`../`](../) (Helm chart, Terraform, CI/CD). 
+ +Two stateful systems hold everything that matters: + +| System | Holds | Recovery source | +|--------|-------|-----------------| +| **App Postgres (pgvector)** | metadata, semantic layer, saved queries, dashboards, users, history, embeddings | logical backups (below) + managed PITR | +| **Cloud secret store** | `ENCRYPTION_KEY`, `JWT_SECRET`, DSNs, LLM keys | your IaC / secret manager | + +Redis is a **cache + transient job queue** — it is not a source of truth and +needs no backup (embeddings regenerate; the cache repopulates). + +--- + +## 1. Backups + +The managed databases provisioned by the Terraform modules already have +**automated snapshots + PITR** (RDS backup retention, Cloud SQL PITR, Azure +flexible-server backups). Logical backups via `backup.sh` are the second layer — +portable, offsite-able, and restorable to any Postgres. + +**What's covered:** the entire app database (schema + data, including pgvector +columns) in `pg_dump` custom format, AES-256 encrypted. + +### Run a one-off backup + +```bash +export DATABASE_URL='postgresql://querywise:…@db-host:5432/querywise' +export BACKUP_PASSPHRASE='…' # from your secret store +./backup.sh # -> ./backups/querywise-<timestamp>.dump.enc +# Offsite: also set BACKUP_S3_URI=s3://… or BACKUP_GCS_URI=gs://… +``` + +From a cluster without DB network exposure, exec through a pod: + +```bash +kubectl -n querywise exec deploy/querywise-backend -- \ + sh -c 'DATABASE_URL="$DATABASE_URL" BACKUP_PASSPHRASE="$BACKUP_PASSPHRASE" ...' +# or apply the scheduled CronJob — see backup-cronjob.example.yaml +``` + +### Scheduled backups + +Apply [`backup-cronjob.example.yaml`](backup-cronjob.example.yaml) for nightly +encrypted dumps to a PVC (or offsite). **Verify restores quarterly** — an +untested backup is not a backup (see §2.2–§2.3). + +--- + +## 2. Restore / Disaster Recovery + +**Targets:** RPO ≈ last backup / PITR window (minutes with managed PITR); RTO ≈ +time to provision a DB + restore (tens of minutes).
+ +### 2.1 Data loss / corruption (DB intact) + +Prefer the managed DB's **point-in-time recovery** — restore to a timestamp just +before the bad change (RDS/Cloud SQL/Azure console or Terraform). This avoids +losing everything since the last logical dump. + +### 2.2 Restore from a logical backup + +```bash +export DATABASE_URL='postgresql://querywise:…@new-db-host:5432/querywise' +export BACKUP_PASSPHRASE='…' +RESTORE_CONFIRM=yes ./restore.sh ./backups/querywise-<timestamp>.dump.enc +``` + +Then make the schema current (the dump may predate a migration): + +```bash +# From a running backend pod: +kubectl -n querywise exec deploy/querywise-backend -- alembic upgrade head +# Simplest: re-run `helm upgrade` — the pre-upgrade hook runs the migration. +``` + +### 2.3 Full region/cluster loss (clean-room rebuild) + +1. **Infra:** `terraform apply` the relevant `deploy/terraform/<cloud>/` module in + the recovery region → new Postgres, Redis, secret store, networking. +2. **Secrets:** restore the cloud secret values (or re-generate — but **keep the + original `ENCRYPTION_KEY`**, see §4, or stored connection strings become + undecryptable). +3. **Data:** `restore.sh` the latest backup into the new Postgres. +4. **App:** point kubeconfig at the recovery cluster, install external-secrets, + `helm upgrade --install` the chart. The migrate hook reconciles the schema. +5. **DNS/TLS:** repoint the hostname to the new ingress; re-issue certs. +6. **Verify:** `GET /api/v1/health/ready` is 200; run a known query; confirm a + saved query + dashboard render. + +--- + +## 3. Schema upgrades (Alembic) + +Migrations live in `backend/alembic/versions`. The normal path is automatic: + +- **Helm:** every `helm upgrade` runs `alembic upgrade head` as a + `pre-install`/`pre-upgrade` hook Job **before** new backend pods roll, so code + and schema move together and replicas never race (the migrate hook is the only + place migrations run).
+- **Compose:** the `migrate` service runs once before backend/worker start. + +**Manual** (rarely needed): + +```bash +kubectl -n querywise exec deploy/querywise-backend -- alembic current +kubectl -n querywise exec deploy/querywise-backend -- alembic upgrade head +``` + +**Rollback:** Alembic `downgrade` exists but data-dropping migrations are not +safely reversible — prefer rolling **forward** with a fix migration, or restore +from backup (§2). Always take a backup before a major upgrade. + +--- + +## 4. Credential rotation (quarterly) + +Rotate on a quarterly cadence (and immediately on suspected compromise). All +secrets live in the cloud secret store; external-secrets syncs them into the +`querywise-secrets` Kubernetes Secret, then restart pods to pick up changes: + +```bash +kubectl -n querywise rollout restart deploy/querywise-backend deploy/querywise-worker +``` + +| Secret | Procedure | Blast radius | +|--------|-----------|--------------| +| **DB password** | Change the master password on the managed DB (cloud/Terraform), update `DATABASE_URL` in the secret store, restart pods. | Brief; pods reconnect. | +| **`JWT_SECRET`** | New random value in the secret store, restart pods. | All sessions invalidated + pending magic links — users re-login. | +| **LLM API keys** | Rotate at the provider, update the secret, restart pods. | None if overlapping validity. | +| **User API keys** | Per-user via `/api-keys` (only the SHA-256 hash is stored; plaintext shown once). | Per key. | +| **`ENCRYPTION_KEY`** | ⚠️ **Do not blind-rotate.** This Fernet key encrypts stored DB-connection strings; a new key cannot decrypt existing ones. To rotate: decrypt each connection with the old key and re-save with the new one (or re-enter connection credentials in the UI), *then* swap the key. Keep the old key available until every connection is re-encrypted. | Connections become unusable until re-encrypted. | + +> Prefer cloud-managed rotation where available (e.g. 
Secrets Manager rotation +> for the DB password) so rotation is automatic and audited. + +--- + +## 5. Quick reference + +```bash +# Health +kubectl -n querywise get pods +curl -fsS https://<host>/api/v1/health/ready + +# Logs (JSON in prod — pipe to jq) +kubectl -n querywise logs deploy/querywise-backend --tail=200 + +# Roll back a bad release (Helm keeps history) +helm -n querywise history querywise +helm -n querywise rollback querywise <revision> + +# Scale +kubectl -n querywise scale deploy/querywise-backend --replicas=4 # if HPA disabled +``` + +See [`config-reference.md`](config-reference.md) for every tunable and which +ones must change for production. diff --git a/deploy/ops/backup-cronjob.example.yaml b/deploy/ops/backup-cronjob.example.yaml new file mode 100644 index 0000000..528db51 --- /dev/null +++ b/deploy/ops/backup-cronjob.example.yaml @@ -0,0 +1,78 @@ +# Optional: scheduled encrypted backups in-cluster. Runs deploy/ops/backup.sh +# nightly against the same Secret the Helm release uses. +# +# Prereqs: +# 1. The script as a ConfigMap (keeps it in sync with deploy/ops/backup.sh): +# kubectl -n querywise create configmap querywise-backup-script \ +# --from-file=backup.sh=deploy/ops/backup.sh +# 2. A BACKUP_PASSPHRASE key in the querywise-secrets Secret (add it to your +# cloud secret store so external-secrets syncs it alongside DATABASE_URL). +# 3. A StorageClass for the PVC (or swap the PVC for an emptyDir + offsite +# upload by setting BACKUP_S3_URI/BACKUP_GCS_URI and using an image that +# bundles the matching cloud CLI). +# +# The postgres:16 image provides pg_dump + openssl.
Apply with: +# kubectl -n querywise apply -f backup-cronjob.example.yaml +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: querywise-backups + namespace: querywise +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: querywise-backup + namespace: querywise +spec: + schedule: "0 3 * * *" # 03:00 UTC daily + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 2 + template: + spec: + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 999 + fsGroup: 999 + containers: + - name: backup + image: postgres:16 + command: ["bash", "/scripts/backup.sh"] + env: + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: querywise-secrets + key: DATABASE_URL + - name: BACKUP_PASSPHRASE + valueFrom: + secretKeyRef: + name: querywise-secrets + key: BACKUP_PASSPHRASE + - name: BACKUP_DIR + value: /backups + - name: BACKUP_RETENTION_DAYS + value: "14" + volumeMounts: + - name: script + mountPath: /scripts + - name: backups + mountPath: /backups + volumes: + - name: script + configMap: + name: querywise-backup-script + - name: backups + persistentVolumeClaim: + claimName: querywise-backups diff --git a/deploy/ops/backup.sh b/deploy/ops/backup.sh new file mode 100755 index 0000000..545614b --- /dev/null +++ b/deploy/ops/backup.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# +# Encrypted logical backup of the QueryWise app database. +# +# pg_dump (custom format) -> AES-256 (openssl, PBKDF2) -> /querywise-.dump.enc +# +# Runs anywhere with a postgres client + openssl and reachability to the DB: +# a cron host, a CI job, or `kubectl exec` into a pod. Decrypt/restore with +# restore.sh using the same passphrase. 
+#
+# Required env:
+#   DATABASE_URL          postgresql+asyncpg://… or postgresql://… (driver suffix stripped)
+#   BACKUP_PASSPHRASE     symmetric key for encryption — store in your secret manager
+# Optional env:
+#   BACKUP_DIR            output directory (default ./backups)
+#   BACKUP_RETENTION_DAYS prune local dumps older than N days (default 14; 0 = keep all)
+#   BACKUP_S3_URI         s3://bucket/prefix -> uploaded with `aws`
+#   BACKUP_GCS_URI        gs://bucket/prefix -> uploaded with `gcloud storage`
+#
+# The dump is streamed to a temporary "*.partial" file and renamed into place
+# only after pg_dump and openssl both succeed, so a failed run never leaves a
+# truncated file that looks like a valid backup.
+set -euo pipefail
+
+: "${DATABASE_URL:?set DATABASE_URL}"
+: "${BACKUP_PASSPHRASE:?set BACKUP_PASSPHRASE}"
+BACKUP_DIR="${BACKUP_DIR:-./backups}"
+RETENTION="${BACKUP_RETENTION_DAYS:-14}"
+
+command -v pg_dump >/dev/null || { echo "ERROR: pg_dump not found (install the postgresql client)" >&2; exit 1; }
+command -v openssl >/dev/null || { echo "ERROR: openssl not found" >&2; exit 1; }
+
+# pg_dump speaks plain postgresql:// — drop the SQLAlchemy +asyncpg driver suffix.
+PG_URL="${DATABASE_URL/+asyncpg/}"
+
+mkdir -p "$BACKUP_DIR"
+TS="$(date -u +%Y%m%dT%H%M%SZ)"
+OUT="$BACKUP_DIR/querywise-${TS}.dump.enc"
+# The temp name does not match the 'querywise-*.dump.enc' prune/restore glob,
+# so an interrupted dump can never be mistaken for a finished one.
+TMP="${OUT}.partial"
+# Remove the partial file on any exit; after a successful rename this is a no-op.
+trap 'rm -f "$TMP"' EXIT
+
+echo "Backing up database -> ${OUT}"
+# pipefail (set above) makes a pg_dump failure abort here, before the rename.
+pg_dump --format=custom --no-owner --no-privileges "$PG_URL" \
+  | openssl enc -aes-256-cbc -pbkdf2 -salt -pass env:BACKUP_PASSPHRASE \
+  > "$TMP"
+mv "$TMP" "$OUT"
+
+echo "Wrote $(du -h "$OUT" | cut -f1) encrypted backup."
+
+# Optional offsite upload.
+if [[ -n "${BACKUP_S3_URI:-}" ]]; then
+  echo "Uploading to ${BACKUP_S3_URI%/}/$(basename "$OUT")"
+  aws s3 cp "$OUT" "${BACKUP_S3_URI%/}/$(basename "$OUT")"
+fi
+if [[ -n "${BACKUP_GCS_URI:-}" ]]; then
+  echo "Uploading to ${BACKUP_GCS_URI%/}/$(basename "$OUT")"
+  gcloud storage cp "$OUT" "${BACKUP_GCS_URI%/}/$(basename "$OUT")"
+fi
+
+# Prune old local backups. A non-numeric retention is reported and pruning is
+# skipped (the backup itself already succeeded) instead of being silently
+# treated as 0 by the arithmetic comparison.
+if [[ ! "$RETENTION" =~ ^[0-9]+$ ]]; then
+  echo "WARNING: BACKUP_RETENTION_DAYS='${RETENTION}' is not a non-negative integer; skipping prune" >&2
+elif [[ "$RETENTION" -gt 0 ]]; then
+  find "$BACKUP_DIR" -name 'querywise-*.dump.enc' -type f -mtime +"$RETENTION" -print -delete
+fi
+
+echo "Done."
diff --git a/deploy/ops/config-reference.md b/deploy/ops/config-reference.md new file mode 100644 index 0000000..cbe2aff --- /dev/null +++ b/deploy/ops/config-reference.md @@ -0,0 +1,66 @@ +# QueryWise — Production Config Reference + +Every backend setting is an environment variable. The **full catalogue with +defaults** lives in [`../../.env.example`](../../.env.example) and the project +`CLAUDE.md`; this page is the **production-focused** view — what to change, where +to set it, and what's a secret. + +## Where settings come from + +| Layer | Carries | Source of truth | +|-------|---------|-----------------| +| **Compose (prod)** | everything | `.env.prod` (from `.env.prod.example`) | +| **Helm — non-secret** | tunables, feature flags | `config:` map → ConfigMap (`values.yaml`) | +| **Helm — secret** | keys, DSNs | `querywise-secrets` Secret via `secrets.existingSecret` | +| **Terraform** | DSNs + keys assembled into the cloud secret store | `*.tfvars` | + +Secrets must **never** sit in `values.yaml` or a committed overlay — they flow +cloud secret store → external-secrets → `querywise-secrets`. 
+ +## Must-set for production + +| Setting | Why | Notes | +|---------|-----|-------| +| `DATABASE_URL` | app database | secret; managed pgvector Postgres | +| `REDIS_URL` | cache + arq queue | secret; `JOB_BACKEND=arq` in prod | +| `ENCRYPTION_KEY` | encrypts stored connection strings | **secret; never rotate blind** (see RUNBOOK §4) | +| `JWT_SECRET` | session/magic-link signing | secret; rotating logs everyone out | +| `DISABLE_AUTH=false` | enforce login | **never `true` in prod** | +| `AUTH_COOKIE_SECURE=true` | HTTPS-only session cookie | TLS terminates at the edge/ingress | +| `CORS_ORIGINS` | allowed browser origins | JSON list; same-origin needs none | +| `AUTO_SETUP_SAMPLE_DB=false` | no IFRS-9 seed in prod | point at real warehouses | +| LLM provider + key | SQL generation + embeddings | `DEFAULT_LLM_PROVIDER` + the matching `*_API_KEY` (secret) | +| `EMBEDDING_DIMENSION` | vector column size | 1536 (OpenAI/Anthropic) / 768 (Ollama nomic) — must match the model | + +## Operational tunables (non-secret) + +| Setting | Default | Effect | +|---------|---------|--------| +| `UVICORN_WORKERS` | 4 | uvicorn processes per backend pod/container | +| `LOG_FORMAT` | `json` (prod) | structured logs for aggregation | +| `LOG_LEVEL` | `INFO` | verbosity | +| `ENABLE_METRICS` | `true` | Prometheus at `GET /metrics` | +| `OTEL_ENABLED` / `OTEL_EXPORTER_OTLP_ENDPOINT` | `false` / — | tracing to Jaeger/Tempo/Collector | +| `RATE_LIMIT_ENABLED` / `MAX_QUERIES_PER_MINUTE` | `true` / 30 | `/query` throttle | +| `DEFAULT_QUERY_TIMEOUT_SECONDS` / `DEFAULT_MAX_ROWS` | 30 / 1000 | query guardrails | +| `SECRETS_BACKEND` | `env` | `aws`/`gcp`/`azure`/`vault` for managed connection-string encryption | + +## Scaling knobs (Helm `values.yaml`) + +| Value | Purpose | +|-------|---------| +| `backend.autoscaling.{enabled,min,max,targetCPU}` | HPA on the API | +| `backend.replicaCount` | fixed replicas when HPA off | +| `worker.replicaCount` | arq worker concurrency (separate pods) | 
+| `frontend.replicaCount` | edge replicas | +| `*.podDisruptionBudget` | availability during node drains | +| `ingress.{host,className,annotations,tls}` | routing + TLS | +| `image.{backend,frontend}.{repository,tag}` | which images (CI injects `tag`) | + +## Cross-checks + +- `EMBEDDING_DIMENSION` must match the embedding model, or startup resizes the + vector columns and nulls embeddings (they regenerate in the background). +- `JOB_BACKEND=arq` ⇒ a running `worker` and a reachable `REDIS_URL`. +- `AUTH_COOKIE_SECURE=true` ⇒ the app is served over HTTPS (else the cookie is + dropped and login silently fails). diff --git a/deploy/ops/restore.sh b/deploy/ops/restore.sh new file mode 100755 index 0000000..87acda4 --- /dev/null +++ b/deploy/ops/restore.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# Restore an encrypted backup produced by backup.sh. +# +# .dump.enc -> AES-256 decrypt -> pg_restore --clean --if-exists +# +# DESTRUCTIVE: drops and recreates the objects in the target database. Guarded +# behind RESTORE_CONFIRM=yes. After restoring, run the app's migrations to make +# sure the schema matches the running code (`alembic upgrade head`, or just +# redeploy — the Helm migrate hook does it). +# +# Required env: +# DATABASE_URL target database (postgresql+asyncpg://… or postgresql://…) +# BACKUP_PASSPHRASE the passphrase the backup was encrypted with +# RESTORE_CONFIRM=yes acknowledge that this overwrites the target +# Usage: +# RESTORE_CONFIRM=yes ./restore.sh ./backups/querywise-20260608T030000Z.dump.enc +set -euo pipefail + +FILE="${1:-}" +: "${DATABASE_URL:?set DATABASE_URL}" +: "${BACKUP_PASSPHRASE:?set BACKUP_PASSPHRASE}" +[[ -n "$FILE" ]] || { echo "usage: restore.sh " >&2; exit 2; } +[[ -f "$FILE" ]] || { echo "ERROR: no such file: $FILE" >&2; exit 2; } + +if [[ "${RESTORE_CONFIRM:-}" != "yes" ]]; then + echo "This will OVERWRITE the database at the configured DATABASE_URL." >&2 + echo "Re-run with RESTORE_CONFIRM=yes to proceed." 
>&2 + exit 1 +fi + +command -v pg_restore >/dev/null || { echo "ERROR: pg_restore not found (install the postgresql client)" >&2; exit 1; } +command -v openssl >/dev/null || { echo "ERROR: openssl not found" >&2; exit 1; } + +PG_URL="${DATABASE_URL/+asyncpg/}" + +echo "Restoring ${FILE} -> database ..." +openssl enc -d -aes-256-cbc -pbkdf2 -pass env:BACKUP_PASSPHRASE -in "$FILE" \ + | pg_restore --clean --if-exists --no-owner --no-privileges --dbname "$PG_URL" + +echo "Restore complete." +echo "Next: ensure the schema is current — 'alembic upgrade head' or redeploy." diff --git a/deploy/terraform/.gitignore b/deploy/terraform/.gitignore new file mode 100644 index 0000000..52479d2 --- /dev/null +++ b/deploy/terraform/.gitignore @@ -0,0 +1,17 @@ +# Local provider plugins / cache +**/.terraform/* + +# State (use a remote, encrypted backend — never commit state) +*.tfstate +*.tfstate.* +crash.log + +# Variable files with secrets +*.tfvars +!*.tfvars.example + +# Plan outputs +*.tfplan + +# Keep the provider lockfile committed for reproducible versions: +!**/.terraform.lock.hcl diff --git a/deploy/terraform/aws/.terraform.lock.hcl b/deploy/terraform/aws/.terraform.lock.hcl new file mode 100644 index 0000000..9e6c643 --- /dev/null +++ b/deploy/terraform/aws/.terraform.lock.hcl @@ -0,0 +1,66 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/aws" { + version = "5.100.0" + constraints = "~> 5.40" + hashes = [ + "h1:7/GgVlN+KplSVCuc8qb4ct2R7gotYooPNRd0cnj9GxE=", + "h1:BrNG7eFOdRrRRbHdvrTjMJ8X8Oh/tiegURiKf7J2db8=", + "h1:C6eM6fGJVktK2M5vH3Yhv5NnqmegcBDY0EuDHhiXoVY=", + "h1:C7yD4Be2zhVdjnilsKPfucYAYMG5UCJYuUSoY6FCtGQ=", + "h1:H8CH2vfXXP/WQgJw+Qrn72umKs9UlGYQvn+QdnwO8Nc=", + "h1:J7L5bgyYNRAbtwAFJl2Lj+IMI2DJTrbbL33PTK4OWVY=", + "h1:JJ+EJQ+sIN3XRmNmrSUnUQtR8i3P22z+AbtAf8O/cRE=", + "h1:Wm5Ofhc15lX1OMMCt7iDV0NY5FDIouQDjX7I1iab55s=", + "h1:crKvBCgX6RlMcE6Ewm8o8YVuIg6mkXqKNgt/kSFYTvQ=", + "h1:zef23ac/YWw9O2FepFWRs+my9iWWUkniL4dT4LnCKjU=", + "zh:1a41f3ee26720fee7a9a0a361890632a1701b5dc1cf5355dc651ddbe115682ff", + "zh:30457f36690c19307921885cc5e72b9dbeba369445815903acd5c39ac0e41e7a", + "zh:42c22674d5f23f6309eaf3ac3a4f1f8b66b566c1efe1dcb0dd2fb30c17ce1f78", + "zh:4cc271c795ff8ce6479ec2d11a8ba65a0a9ed6331def6693f4b9dccb6e662838", + "zh:60932aa376bb8c87cd1971240063d9d38ba6a55502c867fdbb9f5361dc93d003", + "zh:864e42784bde77b18393ebfcc0104cea9123da5f4392e8a059789e296952eefa", + "zh:9750423138bb01ecaa5cec1a6691664f7783d301fb1628d3b64a231b6b564e0e", + "zh:e5d30c4dec271ef9d6fe09f48237ec6cfea1036848f835b4e47f274b48bda5a7", + "zh:e62bd314ae97b43d782e0841b13e68a3f8ec85cc762004f973ce5ce7b6cdbfd0", + "zh:ea851a3c072528a4445ac6236ba2ce58ffc99ec466019b0bd0e4adde63a248e4", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.9.0" + constraints = "~> 3.5" + hashes = [ + "h1:8EQU5KSxezcjo/phRSe69rDOI0lk4pSaggj7FsskYp8=", + "h1:Lw9im2VBBJQ3RyAbHPQ0rcvcmmcZWm3x+kIOpN+Tv9s=", + "h1:U8KXqGCoNI9/guYbTvzgdtVk3fRthoG0UXwm1JoEpIs=", + "h1:YXaVd4p6qXPPVaxIBaIDNXmBwT02ZqDn0qD+tYpw8sA=", + "h1:cOpc03fphEt/G9Rfc4jLL/fW0D7tgvlXqiDKPF4vuww=", + "h1:g09RR7T1xWkeGrZwWvWMT9ncJrFGr1k3CBD585UmO7w=", + "h1:gGDdPPibmw2EWROx+sh1RGLjR5+nPwZyrf6/N9jXfeM=", + "h1:haE7/nXCOhXKP4oXeEnER3t5CaVQWqujz4nBnpeTUv4=", + "h1:ieSVpfZS2lKuMr05ph0QsOVpCzg7uk3cgKBaXR+Ikug=", + 
"h1:ig2s1IS9IzehorRjvVAnKIsUUj8fkgyxct1L/kswcc4=", + "h1:j3lS+ZEERFnoab8t1ppDrScGVP/cgWbzlCrEYKTCXYw=", + "h1:lxezrKmOiQIySHAM+os8qLVq7hqufDr8h3Hpzvsk+78=", + "h1:lzRqBJAG+NETxHbEZUJ/YP3RMEjZBinTX7VmgH3lw60=", + "h1:tdSNWK5ApqUsgbdYieyeYLTu6nIZUV3hR1oFqUfAuGo=", + "h1:xedet8yH/zI2CfdxsGlK0nlFWc/Bp61yrWsEa3fHB8g=", + "zh:03f1114cc20b8913523735ab76e0f0a2b16ce13c92923a53304bf85f07fc0dbc", + "zh:105b678ee72322a3067f105d7e05e940f6143238f377f6e87ff4ec909246ac2a", + "zh:55f3bbf13ea18cbace61a706566a80f25f33fe2b1780b6f3d7b582af2a05b6d2", + "zh:63adf996db48f082f7a6351eb485e219cd88795fc71e6ec60a837263ab0d2cb1", + "zh:7e99550738a4e3cc68b8a467714b0d69371025fe95e3326d5323d026d55653e9", + "zh:8342b54af3a18a37e075eeae61be57f4de2ba71b35d95c5075d402dd2c1f289d", + "zh:83ee18e32ac9dd5fc91298554b7c4cfa4c3a1db50f4c797945637cc93c0844ae", + "zh:993ecc0adbf6bd535a59fbc9b735d8c33950e6f6eb5e621d750da9b71d65d80a", + "zh:ad722bc59d4edbf1415e827fc007c0efe6e0e9462d5568bae20b34be1058a261", + "zh:ae9448e1f87b2f9a6c5197a0e9862162ec6b137cb3a3835e11522995d8939e7c", + "zh:bc9cdd3aac784f759125c6627f6f6416e8726a1c184eb9cf3e55b9edbc94c627", + "zh:c8e35b89572ba1c40a9b20022e033a3395fb8d42e7604d50c900f193ba10382e", + "zh:e2deaa8a9975ef81d9f62baed12c41286918b0a10908e0e031f13f69a3b730a1", + "zh:ee39707557210a0ab1098aa357d2cdfe502e5a312d0dbdffb09d08facc4d3fc5", + "zh:f81afe4eb63e8aa9e0ea71be6c990f0dc69cb360e7191c0742a991f4a5081b64", + ] +} diff --git a/deploy/terraform/aws/README.md b/deploy/terraform/aws/README.md new file mode 100644 index 0000000..2724c36 --- /dev/null +++ b/deploy/terraform/aws/README.md @@ -0,0 +1,63 @@ +# QueryWise on AWS — Terraform (data plane + secrets) + +Provisions the **managed dependencies** the QueryWise Helm chart needs, in your +own VPC — your data never leaves your account: + +- **RDS PostgreSQL 16** (pgvector-ready, encrypted, Multi-AZ, gp3, TLS enforced) +- **ElastiCache Redis** (result cache + the arq job queue) +- **Secrets Manager** secret with the assembled DSNs + keys 
+- **S3** bucket for exports / `pg_dump` backups (optional) +- **VPC + private subnets** (optional — or drop into an existing VPC) +- **IAM policy** to read the app secret (for the external-secrets IRSA role) + +**Compute is intentionally out of scope.** Provision EKS (or ECS) separately — +BYO, or the upstream [`terraform-aws-modules/eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) +module — then deploy the app with the Helm chart in [`../../helm/querywise`](../../helm/querywise). +Keeping the data plane and the cluster in separate states means a `helm` +rollback or cluster rebuild never risks the database. + +## Usage + +```bash +cp terraform.tfvars.example terraform.tfvars # then edit +terraform init +terraform apply +``` + +Wire the outputs into the cluster. The recommended path is the +**external-secrets operator** reading the Secrets Manager secret: + +1. `terraform output secret_access_policy_arn` → attach to an IAM role whose + trust policy references your EKS OIDC provider, bound to the external-secrets + ServiceAccount (IRSA). +2. Create an `ExternalSecret` that pulls `terraform output app_secret_name` + with a `dataFrom` extract into a Kubernetes Secret named `querywise-secrets` + (its keys — `DATABASE_URL`, `REDIS_URL`, `ENCRYPTION_KEY`, `JWT_SECRET`, + `OPENAI_API_KEY`, … — already match the backend's env). +3. Install the chart pointing at it: + + ```bash + helm upgrade --install querywise ../../helm/querywise -n querywise \ + --set secrets.existingSecret=querywise-secrets \ + --set config.AUTO_SETUP_SAMPLE_DB=false + ``` + +Make sure `allowed_security_group_ids` includes the EKS node/pod security group +so pods can reach Postgres + Redis. + +> **Quick-start without external-secrets:** feed the DSNs straight into the +> chart's own Secret — but `database_url` / `redis_url` are sensitive outputs, so +> avoid this for anything but a sandbox. 
+ +## pgvector + +The `vector` extension ships with RDS PostgreSQL 16 and is created by the app's +Alembic migrations (`CREATE EXTENSION IF NOT EXISTS vector`) on first +`helm upgrade` (the migration hook). No parameter-group change required. + +## Notes + +- `db_deletion_protection = true` (default) blocks `terraform destroy` of the DB + and forces a final snapshot. Set to `false` for throwaway environments. +- The master DB password and JWT secret are generated if not supplied and stored + only in Secrets Manager / Terraform state — keep your state backend encrypted. diff --git a/deploy/terraform/aws/iam.tf b/deploy/terraform/aws/iam.tf new file mode 100644 index 0000000..4b819c6 --- /dev/null +++ b/deploy/terraform/aws/iam.tf @@ -0,0 +1,22 @@ +# Read-only access to the app secret, for the external-secrets operator's IRSA +# role. Attach `secret_access_policy_arn` to the IAM role you bind to the +# external-secrets ServiceAccount (the role's trust policy references the EKS +# OIDC provider — created with the cluster, hence kept out of this data module). 
+ +data "aws_iam_policy_document" "secret_read" { + statement { + sid = "ReadAppSecret" + effect = "Allow" + actions = [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret", + ] + resources = [aws_secretsmanager_secret.app.arn] + } +} + +resource "aws_iam_policy" "secret_read" { + name = "${var.name_prefix}-secret-read" + description = "Read the QueryWise app secret (for external-secrets IRSA)" + policy = data.aws_iam_policy_document.secret_read.json +} diff --git a/deploy/terraform/aws/main.tf b/deploy/terraform/aws/main.tf new file mode 100644 index 0000000..8b5f8e7 --- /dev/null +++ b/deploy/terraform/aws/main.tf @@ -0,0 +1,57 @@ +data "aws_caller_identity" "current" {} + +data "aws_availability_zones" "available" { + state = "available" +} + +locals { + tags = merge({ + "app.kubernetes.io/name" = "querywise" + "ManagedBy" = "terraform" + }, var.tags) + + # Default to the first two available AZs when none are supplied. + azs = length(var.availability_zones) > 0 ? var.availability_zones : slice(data.aws_availability_zones.available.names, 0, 2) + + # Resolve network: created vs. supplied. + vpc_id = var.create_vpc ? aws_vpc.this[0].id : var.vpc_id + private_subnet_ids = var.create_vpc ? aws_subnet.private[*].id : var.private_subnet_ids + + # Master password: supplied or generated. + db_password = var.db_password != "" ? var.db_password : random_password.db[0].result + jwt_secret = var.jwt_secret != "" ? var.jwt_secret : random_password.jwt[0].result + + # DSNs the app/Helm chart consume. The generated DB password uses a URL-safe + # alphabet (see random_password.db) so no escaping is needed here. + database_url = "postgresql+asyncpg://${var.db_username}:${local.db_password}@${aws_db_instance.this.address}:5432/${var.db_name}" + redis_url = "redis://${aws_elasticache_replication_group.this.primary_endpoint_address}:6379/0" + + bucket_name = var.s3_bucket_name != "" ? 
var.s3_bucket_name : "${var.name_prefix}-${data.aws_caller_identity.current.account_id}" + + # Keys mirror what the backend reads from env / the Helm Secret. Empty values + # are dropped so optional provider keys don't create blank entries. + secret_payload = { for k, v in { + DATABASE_URL = local.database_url + REDIS_URL = local.redis_url + ENCRYPTION_KEY = var.encryption_key + JWT_SECRET = local.jwt_secret + DEFAULT_ADMIN_PASSWORD = var.default_admin_password + OPENAI_API_KEY = var.openai_api_key + ANTHROPIC_API_KEY = var.anthropic_api_key + AZURE_OPENAI_API_KEY = var.azure_openai_api_key + } : k => v if v != null && v != "" } +} + +resource "random_password" "db" { + count = var.db_password == "" ? 1 : 0 + # URL-safe alphabet so the password drops cleanly into the DSN. + length = 32 + special = true + override_special = "-_" +} + +resource "random_password" "jwt" { + count = var.jwt_secret == "" ? 1 : 0 + length = 48 + special = false +} diff --git a/deploy/terraform/aws/network.tf b/deploy/terraform/aws/network.tf new file mode 100644 index 0000000..29cbdfa --- /dev/null +++ b/deploy/terraform/aws/network.tf @@ -0,0 +1,21 @@ +# Minimal private-subnet VPC for the data plane. RDS + ElastiCache live here; +# compute (EKS/ECS) reaches them via security-group references. Set +# create_vpc = false to drop these into an existing VPC instead. + +resource "aws_vpc" "this" { + count = var.create_vpc ? 1 : 0 + cidr_block = var.vpc_cidr + enable_dns_support = true + enable_dns_hostnames = true + + tags = { Name = "${var.name_prefix}-vpc" } +} + +resource "aws_subnet" "private" { + count = var.create_vpc ? 
length(local.azs) : 0 + vpc_id = aws_vpc.this[0].id + availability_zone = local.azs[count.index] + cidr_block = cidrsubnet(var.vpc_cidr, 4, count.index) + + tags = { Name = "${var.name_prefix}-private-${local.azs[count.index]}" } +} diff --git a/deploy/terraform/aws/outputs.tf b/deploy/terraform/aws/outputs.tf new file mode 100644 index 0000000..4e4e382 --- /dev/null +++ b/deploy/terraform/aws/outputs.tf @@ -0,0 +1,66 @@ +# -- Network ----------------------------------------------------------------- +output "vpc_id" { + description = "VPC the data plane runs in." + value = local.vpc_id +} + +output "private_subnet_ids" { + description = "Private subnets used by RDS/ElastiCache." + value = local.private_subnet_ids +} + +output "rds_security_group_id" { + description = "Attach app compute here is not needed; reference for rules/debugging." + value = aws_security_group.rds.id +} + +output "redis_security_group_id" { + value = aws_security_group.redis.id + description = "Redis security group id." +} + +# -- Endpoints --------------------------------------------------------------- +output "db_endpoint" { + description = "RDS Postgres endpoint (host)." + value = aws_db_instance.this.address +} + +output "redis_endpoint" { + description = "ElastiCache primary endpoint (host)." + value = aws_elasticache_replication_group.this.primary_endpoint_address +} + +# -- Secrets ----------------------------------------------------------------- +output "app_secret_arn" { + description = "Secrets Manager ARN holding the assembled app secret (DSNs + keys). Point external-secrets at this." + value = aws_secretsmanager_secret.app.arn +} + +output "app_secret_name" { + description = "Secrets Manager name of the app secret." + value = aws_secretsmanager_secret.app.name +} + +output "secret_access_policy_arn" { + description = "IAM policy granting read of the app secret — attach to the external-secrets IRSA role." 
+ value = aws_iam_policy.secret_read.arn +} + +# -- Storage ----------------------------------------------------------------- +output "s3_bucket_name" { + description = "Exports/backups bucket (empty if disabled)." + value = var.create_s3_bucket ? aws_s3_bucket.data[0].bucket : "" +} + +# -- Convenience: DSNs (sensitive) ------------------------------------------- +output "database_url" { + description = "asyncpg DSN for the backend (also stored in the app secret)." + value = local.database_url + sensitive = true +} + +output "redis_url" { + description = "Redis DSN for cache + arq (also stored in the app secret)." + value = local.redis_url + sensitive = true +} diff --git a/deploy/terraform/aws/rds.tf b/deploy/terraform/aws/rds.tf new file mode 100644 index 0000000..30e4d4f --- /dev/null +++ b/deploy/terraform/aws/rds.tf @@ -0,0 +1,86 @@ +# PostgreSQL 16 with pgvector. The extension ships with RDS Postgres 16 and is +# created by the app's Alembic migrations (`CREATE EXTENSION IF NOT EXISTS +# vector`), so no parameter-group change is required. + +resource "aws_db_subnet_group" "this" { + name = "${var.name_prefix}-db" + subnet_ids = local.private_subnet_ids +} + +resource "aws_security_group" "rds" { + name = "${var.name_prefix}-rds" + description = "Postgres access for QueryWise" + vpc_id = local.vpc_id + + egress { + description = "All egress" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "rds_from_sg" { + count = length(var.allowed_security_group_ids) + type = "ingress" + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_group_id = aws_security_group.rds.id + source_security_group_id = var.allowed_security_group_ids[count.index] + description = "Postgres from app security group" +} + +resource "aws_security_group_rule" "rds_from_cidr" { + count = length(var.allowed_cidr_blocks) > 0 ? 
1 : 0 + type = "ingress" + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_group_id = aws_security_group.rds.id + cidr_blocks = var.allowed_cidr_blocks + description = "Postgres from allowed CIDRs" +} + +resource "aws_db_parameter_group" "this" { + name = "${var.name_prefix}-pg16" + family = "postgres16" + + # Enforce TLS in transit. + parameter { + name = "rds.force_ssl" + value = "1" + } +} + +resource "aws_db_instance" "this" { + identifier = "${var.name_prefix}-pg" + engine = "postgres" + engine_version = var.db_engine_version + instance_class = var.db_instance_class + + db_name = var.db_name + username = var.db_username + password = local.db_password + + allocated_storage = var.db_allocated_storage + max_allocated_storage = var.db_max_allocated_storage + storage_type = "gp3" + storage_encrypted = true + + multi_az = var.db_multi_az + db_subnet_group_name = aws_db_subnet_group.this.name + vpc_security_group_ids = [aws_security_group.rds.id] + parameter_group_name = aws_db_parameter_group.this.name + + backup_retention_period = var.db_backup_retention_days + deletion_protection = var.db_deletion_protection + auto_minor_version_upgrade = true + + # Take a final snapshot on destroy unless deletion protection is off. + skip_final_snapshot = !var.db_deletion_protection + final_snapshot_identifier = var.db_deletion_protection ? "${var.name_prefix}-pg-final" : null + + tags = { Name = "${var.name_prefix}-pg" } +} diff --git a/deploy/terraform/aws/redis.tf b/deploy/terraform/aws/redis.tf new file mode 100644 index 0000000..ca583ef --- /dev/null +++ b/deploy/terraform/aws/redis.tf @@ -0,0 +1,64 @@ +# ElastiCache Redis — backs the result cache + the arq job queue. 
+ +resource "aws_elasticache_subnet_group" "this" { + name = "${var.name_prefix}-redis" + subnet_ids = local.private_subnet_ids +} + +resource "aws_security_group" "redis" { + name = "${var.name_prefix}-redis" + description = "Redis access for QueryWise" + vpc_id = local.vpc_id + + egress { + description = "All egress" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "redis_from_sg" { + count = length(var.allowed_security_group_ids) + type = "ingress" + from_port = 6379 + to_port = 6379 + protocol = "tcp" + security_group_id = aws_security_group.redis.id + source_security_group_id = var.allowed_security_group_ids[count.index] + description = "Redis from app security group" +} + +resource "aws_security_group_rule" "redis_from_cidr" { + count = length(var.allowed_cidr_blocks) > 0 ? 1 : 0 + type = "ingress" + from_port = 6379 + to_port = 6379 + protocol = "tcp" + security_group_id = aws_security_group.redis.id + cidr_blocks = var.allowed_cidr_blocks + description = "Redis from allowed CIDRs" +} + +resource "aws_elasticache_replication_group" "this" { + replication_group_id = "${var.name_prefix}-redis" + description = "QueryWise cache + job queue" + + engine = "redis" + engine_version = var.redis_engine_version + node_type = var.redis_node_type + port = 6379 + + # primary + N replicas; automatic failover needs at least one replica. 
+ num_cache_clusters = var.redis_replicas + 1 + automatic_failover_enabled = var.redis_replicas > 0 + multi_az_enabled = var.redis_replicas > 0 + + subnet_group_name = aws_elasticache_subnet_group.this.name + security_group_ids = [aws_security_group.redis.id] + + at_rest_encryption_enabled = true + + tags = { Name = "${var.name_prefix}-redis" } +} diff --git a/deploy/terraform/aws/s3.tf b/deploy/terraform/aws/s3.tf new file mode 100644 index 0000000..df394ca --- /dev/null +++ b/deploy/terraform/aws/s3.tf @@ -0,0 +1,37 @@ +# Optional bucket for exports / pg_dump backups. Private + encrypted + versioned. + +resource "aws_s3_bucket" "data" { + count = var.create_s3_bucket ? 1 : 0 + bucket = local.bucket_name + + tags = { Name = local.bucket_name } +} + +resource "aws_s3_bucket_public_access_block" "data" { + count = var.create_s3_bucket ? 1 : 0 + bucket = aws_s3_bucket.data[0].id + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "data" { + count = var.create_s3_bucket ? 1 : 0 + bucket = aws_s3_bucket.data[0].id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_versioning" "data" { + count = var.create_s3_bucket ? 1 : 0 + bucket = aws_s3_bucket.data[0].id + + versioning_configuration { + status = "Enabled" + } +} diff --git a/deploy/terraform/aws/secrets.tf b/deploy/terraform/aws/secrets.tf new file mode 100644 index 0000000..c232965 --- /dev/null +++ b/deploy/terraform/aws/secrets.tf @@ -0,0 +1,14 @@ +# Secrets Manager holds the assembled app secret (DSNs + keys). The +# external-secrets operator in-cluster syncs this into the Kubernetes Secret the +# Helm chart references (secrets.existingSecret). Keys map 1:1 to the backend's +# env vars, so a SecretStore + ExternalSecret with a "dataFrom" extract is enough. 
+ +resource "aws_secretsmanager_secret" "app" { + name = "${var.name_prefix}/app" + description = "QueryWise application secrets (DSNs + keys)" +} + +resource "aws_secretsmanager_secret_version" "app" { + secret_id = aws_secretsmanager_secret.app.id + secret_string = jsonencode(local.secret_payload) +} diff --git a/deploy/terraform/aws/terraform.tfvars.example b/deploy/terraform/aws/terraform.tfvars.example new file mode 100644 index 0000000..9e62df1 --- /dev/null +++ b/deploy/terraform/aws/terraform.tfvars.example @@ -0,0 +1,32 @@ +# Copy to terraform.tfvars and fill in. Keep secrets out of version control. + +region = "us-east-1" +name_prefix = "querywise-prod" + +# Network — let the module create a VPC, or set create_vpc = false and supply +# vpc_id + private_subnet_ids to drop into your existing cluster VPC. +create_vpc = true +# vpc_id = "vpc-0123456789abcdef0" +# private_subnet_ids = ["subnet-aaa", "subnet-bbb"] + +# Allow your EKS node/pod security group to reach Postgres + Redis. +allowed_security_group_ids = ["sg-0123456789abcdef0"] + +# Database +db_instance_class = "db.t4g.medium" +db_multi_az = true + +# Redis +redis_node_type = "cache.t4g.small" +redis_replicas = 1 + +# REQUIRED — Fernet key for connection-string encryption. Generate with: +# python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" +encryption_key = "CHANGE_ME" + +# Optional — generated if left empty. +# jwt_secret = "" + +# LLM provider key(s). +openai_api_key = "CHANGE_ME" +# anthropic_api_key = "" diff --git a/deploy/terraform/aws/variables.tf b/deploy/terraform/aws/variables.tf new file mode 100644 index 0000000..72a3268 --- /dev/null +++ b/deploy/terraform/aws/variables.tf @@ -0,0 +1,197 @@ +# -- General ----------------------------------------------------------------- +variable "region" { + description = "AWS region to deploy into." + type = string +} + +variable "name_prefix" { + description = "Prefix for all resource names (e.g. 
\"querywise-prod\")." + type = string + default = "querywise" +} + +variable "tags" { + description = "Extra tags applied to every resource." + type = map(string) + default = {} +} + +# -- Network ----------------------------------------------------------------- +# Either let the module create a VPC, or supply an existing one. +variable "create_vpc" { + description = "Create a VPC + private subnets. If false, supply vpc_id and private_subnet_ids." + type = bool + default = true +} + +variable "vpc_cidr" { + description = "CIDR for the created VPC (when create_vpc = true)." + type = string + default = "10.42.0.0/16" +} + +variable "availability_zones" { + description = "AZs to spread data subnets across (>= 2 for Multi-AZ / ElastiCache)." + type = list(string) + default = [] +} + +variable "vpc_id" { + description = "Existing VPC id (when create_vpc = false)." + type = string + default = "" +} + +variable "private_subnet_ids" { + description = "Existing private subnet ids for RDS/ElastiCache (when create_vpc = false)." + type = list(string) + default = [] +} + +variable "allowed_security_group_ids" { + description = "Security groups (e.g. the EKS node/pod SG) allowed to reach Postgres/Redis." + type = list(string) + default = [] +} + +variable "allowed_cidr_blocks" { + description = "CIDRs allowed to reach Postgres/Redis (use sparingly; prefer SG references)." + type = list(string) + default = [] +} + +# -- PostgreSQL (pgvector) --------------------------------------------------- +variable "db_name" { + description = "Application database name." + type = string + default = "querywise" +} + +variable "db_username" { + description = "Master username for the app database." + type = string + default = "querywise" +} + +variable "db_password" { + description = "Master password. Leave empty to generate one (stored in Secrets Manager)." 
+ type = string + default = "" + sensitive = true +} + +variable "db_engine_version" { + description = "PostgreSQL engine version (16.x supports the pgvector extension)." + type = string + default = "16.4" +} + +variable "db_instance_class" { + description = "RDS instance class." + type = string + default = "db.t4g.medium" +} + +variable "db_allocated_storage" { + description = "Initial storage (GiB)." + type = number + default = 50 +} + +variable "db_max_allocated_storage" { + description = "Storage autoscaling ceiling (GiB). Set equal to allocated to disable." + type = number + default = 200 +} + +variable "db_multi_az" { + description = "Run the database Multi-AZ for HA." + type = bool + default = true +} + +variable "db_backup_retention_days" { + description = "Automated backup retention (days)." + type = number + default = 7 +} + +variable "db_deletion_protection" { + description = "Block accidental `terraform destroy` of the database." + type = bool + default = true +} + +# -- ElastiCache (Redis) ----------------------------------------------------- +variable "redis_node_type" { + description = "ElastiCache node type." + type = string + default = "cache.t4g.small" +} + +variable "redis_engine_version" { + description = "Redis engine version." + type = string + default = "7.1" +} + +variable "redis_replicas" { + description = "Number of replica nodes (0 = single primary, no HA)." + type = number + default = 1 +} + +# -- S3 (exports / backups) -------------------------------------------------- +variable "create_s3_bucket" { + description = "Create an S3 bucket for exports/backups." + type = bool + default = true +} + +variable "s3_bucket_name" { + description = "Bucket name. Empty = \"-\"." + type = string + default = "" +} + +# -- Application secrets (assembled into the Secrets Manager secret) ---------- +variable "encryption_key" { + description = "Fernet key for connection-string encryption (REQUIRED — generate with the python one-liner in the README)." 
+ type = string + sensitive = true +} + +variable "jwt_secret" { + description = "HS256 signing secret for session/magic-link JWTs. Empty = generate one." + type = string + default = "" + sensitive = true +} + +variable "default_admin_password" { + description = "Optional bootstrap admin password." + type = string + default = "" + sensitive = true +} + +variable "openai_api_key" { + description = "OpenAI API key (completions + embeddings)." + type = string + default = "" + sensitive = true +} + +variable "anthropic_api_key" { + description = "Anthropic API key (optional)." + type = string + default = "" + sensitive = true +} + +variable "azure_openai_api_key" { + description = "Azure OpenAI key (optional)." + type = string + default = "" + sensitive = true +} diff --git a/deploy/terraform/aws/versions.tf b/deploy/terraform/aws/versions.tf new file mode 100644 index 0000000..e6d3a2d --- /dev/null +++ b/deploy/terraform/aws/versions.tf @@ -0,0 +1,22 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.40" + } + random = { + source = "hashicorp/random" + version = "~> 3.5" + } + } +} + +provider "aws" { + region = var.region + + default_tags { + tags = local.tags + } +} diff --git a/deploy/terraform/azure/.terraform.lock.hcl b/deploy/terraform/azure/.terraform.lock.hcl new file mode 100644 index 0000000..9653214 --- /dev/null +++ b/deploy/terraform/azure/.terraform.lock.hcl @@ -0,0 +1,66 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/azurerm" { + version = "3.117.1" + constraints = "~> 3.110" + hashes = [ + "h1:/LLqOTgyUBDgagr0Bi2C6ZMbe1ytEZHjV8vZxxIKApo=", + "h1:BlM2+pRV7LWN3+enHRHjztlEVQGdfQfibT3Jt6vYw4o=", + "h1:CACtG75Ab4OnxWIvUVTVp/8KzXzVVA1vfr6eSiGDh48=", + "h1:HhwEwklIk4ccwh/55Yuk2bmoqRY/zwIhYEGpZPHAV5U=", + "h1:II0m9urxKMoQ+spbeVYO2aC2e6fpDMIfvnwG5lV2PtU=", + "h1:LW4VuLN3Jxho7ieW508bPNHPDq1aCWWYVszmiw3eQKg=", + "h1:OXBPoQpiwe519GeBfkmbfsDXO020v706RmWTYSuuUCE=", + "h1:PWUq7EPQ9uDVA8We/AbeUnTKlk4/6ELYQ5uWkthUxz0=", + "h1:YBwLP1Vu2+/XAORcOLN8Z3s/aH4d5uGC98A2qDBSo+8=", + "h1:wBga8SpJzBXnt03/RQZLVUFu3AVT1EsvFa6QyL1hofg=", + "zh:1fedd2521c8ced1fbebd5d70fda376d42393cac5cc25c043c390b44d630d9e37", + "zh:634c16442fd8aaed6c3bccd0069f4a01399b141d2a993d85997e6a03f9f867cf", + "zh:637ae3787f87506e5b673f44a1b0f33cf75d7fa9c5353df6a2584488fc3d4328", + "zh:7c7741f66ff5b05051db4b6c3d9bad68c829f9e920a7f1debdca0ab8e50836a3", + "zh:9b454fa0b6c821db2c6a71e591a467a5b4802129509710b56f01ae7106058d86", + "zh:bb820ff92b4a77e9d70999ae30758d408728c6e782b4e1c8c4b6d53b8c3c8ff9", + "zh:d38cd7d5f99398fb96672cb27943b96ea2b7008f26d379a69e1c6c2f25051869", + "zh:d56f5a132181ab14e6be332996753cc11c0d3b1cfdd1a1b44ef484c67e38cc91", + "zh:d8a1e7cf218f46e6d0bd878ff70f92db7e800a15f01e96189a24864d10cde33b", + "zh:f67cf6d14d859a1d2a1dc615941a1740a14cb3f4ee2a34da672ff6729d81fa81", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.9.0" + constraints = "~> 3.5" + hashes = [ + "h1:8EQU5KSxezcjo/phRSe69rDOI0lk4pSaggj7FsskYp8=", + "h1:Lw9im2VBBJQ3RyAbHPQ0rcvcmmcZWm3x+kIOpN+Tv9s=", + "h1:U8KXqGCoNI9/guYbTvzgdtVk3fRthoG0UXwm1JoEpIs=", + "h1:YXaVd4p6qXPPVaxIBaIDNXmBwT02ZqDn0qD+tYpw8sA=", + "h1:cOpc03fphEt/G9Rfc4jLL/fW0D7tgvlXqiDKPF4vuww=", + "h1:g09RR7T1xWkeGrZwWvWMT9ncJrFGr1k3CBD585UmO7w=", + "h1:gGDdPPibmw2EWROx+sh1RGLjR5+nPwZyrf6/N9jXfeM=", + "h1:haE7/nXCOhXKP4oXeEnER3t5CaVQWqujz4nBnpeTUv4=", + "h1:ieSVpfZS2lKuMr05ph0QsOVpCzg7uk3cgKBaXR+Ikug=", + 
"h1:ig2s1IS9IzehorRjvVAnKIsUUj8fkgyxct1L/kswcc4=", + "h1:j3lS+ZEERFnoab8t1ppDrScGVP/cgWbzlCrEYKTCXYw=", + "h1:lxezrKmOiQIySHAM+os8qLVq7hqufDr8h3Hpzvsk+78=", + "h1:lzRqBJAG+NETxHbEZUJ/YP3RMEjZBinTX7VmgH3lw60=", + "h1:tdSNWK5ApqUsgbdYieyeYLTu6nIZUV3hR1oFqUfAuGo=", + "h1:xedet8yH/zI2CfdxsGlK0nlFWc/Bp61yrWsEa3fHB8g=", + "zh:03f1114cc20b8913523735ab76e0f0a2b16ce13c92923a53304bf85f07fc0dbc", + "zh:105b678ee72322a3067f105d7e05e940f6143238f377f6e87ff4ec909246ac2a", + "zh:55f3bbf13ea18cbace61a706566a80f25f33fe2b1780b6f3d7b582af2a05b6d2", + "zh:63adf996db48f082f7a6351eb485e219cd88795fc71e6ec60a837263ab0d2cb1", + "zh:7e99550738a4e3cc68b8a467714b0d69371025fe95e3326d5323d026d55653e9", + "zh:8342b54af3a18a37e075eeae61be57f4de2ba71b35d95c5075d402dd2c1f289d", + "zh:83ee18e32ac9dd5fc91298554b7c4cfa4c3a1db50f4c797945637cc93c0844ae", + "zh:993ecc0adbf6bd535a59fbc9b735d8c33950e6f6eb5e621d750da9b71d65d80a", + "zh:ad722bc59d4edbf1415e827fc007c0efe6e0e9462d5568bae20b34be1058a261", + "zh:ae9448e1f87b2f9a6c5197a0e9862162ec6b137cb3a3835e11522995d8939e7c", + "zh:bc9cdd3aac784f759125c6627f6f6416e8726a1c184eb9cf3e55b9edbc94c627", + "zh:c8e35b89572ba1c40a9b20022e033a3395fb8d42e7604d50c900f193ba10382e", + "zh:e2deaa8a9975ef81d9f62baed12c41286918b0a10908e0e031f13f69a3b730a1", + "zh:ee39707557210a0ab1098aa357d2cdfe502e5a312d0dbdffb09d08facc4d3fc5", + "zh:f81afe4eb63e8aa9e0ea71be6c990f0dc69cb360e7191c0742a991f4a5081b64", + ] +} diff --git a/deploy/terraform/azure/README.md b/deploy/terraform/azure/README.md new file mode 100644 index 0000000..56fdb5f --- /dev/null +++ b/deploy/terraform/azure/README.md @@ -0,0 +1,64 @@ +# QueryWise on Azure — Terraform (data plane + secrets) + +Provisions the managed dependencies the Helm chart needs, in your subscription: + +- **Azure Database for PostgreSQL flexible server 16** (pgvector allow-listed, + VNet-integrated/private, zone-redundant HA, TLS-only) +- **Azure Cache for Redis** (result cache + the arq job queue, TLS-only) +- **Key Vault** secret with the 
assembled DSNs + keys +- **Storage account + container** for exports / `pg_dump` backups (optional) +- **VNet + delegated subnet + private DNS zone** (optional — or BYO) +- **User-assigned managed identity** with Key Vault read, for external-secrets + +**Compute (AKS) is out of scope** — BYO or use the upstream +[`Azure/aks`](https://registry.terraform.io/modules/Azure/aks/azurerm/latest) +module — then deploy with the Helm chart in [`../../helm/querywise`](../../helm/querywise), +keeping the cluster in a separate state from the database. + +## Usage + +```bash +az login +cp terraform.tfvars.example terraform.tfvars # then edit +terraform init +terraform apply +``` + +Wire it up with the external-secrets operator on AKS (Workload Identity): + +1. Federate the managed identity to the external-secrets KSA: + ```bash + az identity federated-credential create \ + --name querywise-external-secrets --identity-name querywise-prod-ext-secrets \ + --resource-group "$(terraform output -raw resource_group_name)" \ + --issuer "$(az aks show -g <aks-resource-group> -n <aks-cluster-name> --query oidcIssuerProfile.issuerUrl -o tsv)" \ + --subject system:serviceaccount:external-secrets:external-secrets \ + --audiences api://AzureADTokenExchange + ``` +2. Create an `ExternalSecret` (provider `azurekv`) that pulls the + `querywise-app` secret with a `dataFrom` extract into a Kubernetes Secret + named `querywise-secrets` (its keys already match the backend's env). +3. Install the chart: + ```bash + helm upgrade --install querywise ../../helm/querywise -n querywise \ + --set secrets.existingSecret=querywise-secrets + ``` + +AKS must reach the Postgres private endpoint and the Redis host — peer its VNet +with the one created here (or set `create_vnet = false` and deploy into the +cluster's VNet). + +## pgvector + +`azure.extensions = VECTOR` is set here so the server permits the extension; the +app's Alembic migrations then run `CREATE EXTENSION IF NOT EXISTS vector` on +first `helm upgrade` (the migration hook).
+ +## Notes + +- The Terraform principal needs rights to assign roles on the Key Vault (it + grants itself **Key Vault Secrets Officer** to write the secret). +- Key Vault has purge protection on — a destroyed vault is recoverable for 7 + days and the name stays reserved. +- Generated DB password / JWT secret live only in Key Vault + Terraform state — + keep your state backend encrypted. diff --git a/deploy/terraform/azure/identity.tf b/deploy/terraform/azure/identity.tf new file mode 100644 index 0000000..f6c0dc1 --- /dev/null +++ b/deploy/terraform/azure/identity.tf @@ -0,0 +1,23 @@ +# User-assigned managed identity for the external-secrets operator. Grant it +# read on the vault, then federate it to the in-cluster external-secrets KSA +# (the federated credential references the AKS OIDC issuer, created with the +# cluster — hence kept out of this data module): +# +# az identity federated-credential create \ +# --name <credential-name> --identity-name <identity-name> --resource-group <resource-group> \ +# --issuer <aks-oidc-issuer-url> \ +# --subject system:serviceaccount:external-secrets:external-secrets \ +# --audiences api://AzureADTokenExchange + +resource "azurerm_user_assigned_identity" "external_secrets" { + name = "${var.name_prefix}-ext-secrets" + location = var.location + resource_group_name = local.rg_name + tags = local.tags +} + +resource "azurerm_role_assignment" "es_secrets_user" { + scope = azurerm_key_vault.this.id + role_definition_name = "Key Vault Secrets User" + principal_id = azurerm_user_assigned_identity.external_secrets.principal_id +} diff --git a/deploy/terraform/azure/keyvault.tf b/deploy/terraform/azure/keyvault.tf new file mode 100644 index 0000000..0866bde --- /dev/null +++ b/deploy/terraform/azure/keyvault.tf @@ -0,0 +1,35 @@ +# Key Vault holds the assembled app secret (DSNs + keys) as a JSON blob. The +# external-secrets operator on AKS reads it (via the managed identity in +# identity.tf, federated to its KSA) and syncs it into the Kubernetes Secret the +# Helm chart references.
Keys map 1:1 to the backend's env vars. + +locals { + key_vault_name = substr("${var.name_prefix}-kv-${random_string.suffix.result}", 0, 24) +} + +resource "azurerm_key_vault" "this" { + name = local.key_vault_name + location = var.location + resource_group_name = local.rg_name + tenant_id = data.azurerm_client_config.current.tenant_id + sku_name = "standard" + enable_rbac_authorization = true + purge_protection_enabled = true + soft_delete_retention_days = 7 + tags = local.tags +} + +# Let the principal running Terraform write secrets (RBAC mode). +resource "azurerm_role_assignment" "tf_secrets_officer" { + scope = azurerm_key_vault.this.id + role_definition_name = "Key Vault Secrets Officer" + principal_id = data.azurerm_client_config.current.object_id +} + +resource "azurerm_key_vault_secret" "app" { + name = "querywise-app" + value = jsonencode(local.secret_payload) + key_vault_id = azurerm_key_vault.this.id + + depends_on = [azurerm_role_assignment.tf_secrets_officer] +} diff --git a/deploy/terraform/azure/main.tf b/deploy/terraform/azure/main.tf new file mode 100644 index 0000000..503836f --- /dev/null +++ b/deploy/terraform/azure/main.tf @@ -0,0 +1,61 @@ +data "azurerm_client_config" "current" {} + +locals { + tags = merge({ + "app" = "querywise" + "managedBy" = "terraform" + }, var.tags) + + rg_name = var.create_resource_group ? azurerm_resource_group.this[0].name : var.resource_group_name + + db_subnet_id = var.create_vnet ? azurerm_subnet.db[0].id : var.db_subnet_id + private_dns_zone_id = var.create_vnet ? azurerm_private_dns_zone.pg[0].id : var.private_dns_zone_id + + db_password = var.db_password != "" ? var.db_password : random_password.db[0].result + jwt_secret = var.jwt_secret != "" ? var.jwt_secret : random_password.jwt[0].result + + # Postgres flexible server FQDN; Azure Cache for Redis is TLS-only on 6380 and + # authenticates with the access key (rediss:// DSN). 
+ database_url = "postgresql+asyncpg://${var.db_username}:${replace(urlencode(local.db_password), "+", "%20")}@${azurerm_postgresql_flexible_server.this.fqdn}:5432/${var.db_name}" # password is percent-encoded so a user-supplied db_password containing reserved characters (@ : / ?) cannot break the DSN; the generated password only uses unreserved characters and is unchanged + redis_url = "rediss://:${azurerm_redis_cache.this.primary_access_key}@${azurerm_redis_cache.this.hostname}:6380/0" + + # Storage account name: 3-24 lowercase alphanumeric, globally unique. + storage_account_name = substr("${replace(lower(var.name_prefix), "/[^a-z0-9]/", "")}${random_string.suffix.result}", 0, 24) + + secret_payload = { for k, v in { + DATABASE_URL = local.database_url + REDIS_URL = local.redis_url + ENCRYPTION_KEY = var.encryption_key + JWT_SECRET = local.jwt_secret + DEFAULT_ADMIN_PASSWORD = var.default_admin_password + OPENAI_API_KEY = var.openai_api_key + ANTHROPIC_API_KEY = var.anthropic_api_key + AZURE_OPENAI_API_KEY = var.azure_openai_api_key + } : k => v if v != null && v != "" } +} + +resource "random_string" "suffix" { + length = 6 + upper = false + special = false +} + +resource "random_password" "db" { + count = var.db_password == "" ? 1 : 0 + length = 32 + special = true + override_special = "-_" +} + +resource "random_password" "jwt" { + count = var.jwt_secret == "" ? 1 : 0 + length = 48 + special = false +} + +resource "azurerm_resource_group" "this" { + count = var.create_resource_group ? 1 : 0 + name = var.resource_group_name != "" ? var.resource_group_name : "${var.name_prefix}-rg" + location = var.location + tags = local.tags +} diff --git a/deploy/terraform/azure/network.tf b/deploy/terraform/azure/network.tf new file mode 100644 index 0000000..cbb0aeb --- /dev/null +++ b/deploy/terraform/azure/network.tf @@ -0,0 +1,43 @@ +# VNet + a subnet delegated to the Postgres flexible server, plus the private +# DNS zone it needs for VNet integration. Set create_vnet = false to supply your +# own delegated subnet + DNS zone. + +resource "azurerm_virtual_network" "this" { + count = var.create_vnet ?
1 : 0 + name = "${var.name_prefix}-vnet" + location = var.location + resource_group_name = local.rg_name + address_space = [var.vnet_cidr] + tags = local.tags +} + +resource "azurerm_subnet" "db" { + count = var.create_vnet ? 1 : 0 + name = "${var.name_prefix}-pg" + resource_group_name = local.rg_name + virtual_network_name = azurerm_virtual_network.this[0].name + address_prefixes = [var.db_subnet_cidr] + + delegation { + name = "fs" + service_delegation { + name = "Microsoft.DBforPostgreSQL/flexibleServers" + actions = ["Microsoft.Network/virtualNetworks/subnets/join/action"] + } + } +} + +resource "azurerm_private_dns_zone" "pg" { + count = var.create_vnet ? 1 : 0 + name = "${var.name_prefix}.private.postgres.database.azure.com" + resource_group_name = local.rg_name + tags = local.tags +} + +resource "azurerm_private_dns_zone_virtual_network_link" "pg" { + count = var.create_vnet ? 1 : 0 + name = "${var.name_prefix}-pg-link" + resource_group_name = local.rg_name + private_dns_zone_name = azurerm_private_dns_zone.pg[0].name + virtual_network_id = azurerm_virtual_network.this[0].id +} diff --git a/deploy/terraform/azure/outputs.tf b/deploy/terraform/azure/outputs.tf new file mode 100644 index 0000000..5572ce7 --- /dev/null +++ b/deploy/terraform/azure/outputs.tf @@ -0,0 +1,46 @@ +output "resource_group_name" { + description = "Resource group the data plane runs in." + value = local.rg_name +} + +output "db_fqdn" { + description = "Postgres flexible server FQDN." + value = azurerm_postgresql_flexible_server.this.fqdn +} + +output "redis_hostname" { + description = "Azure Cache for Redis hostname." + value = azurerm_redis_cache.this.hostname +} + +output "key_vault_name" { + description = "Key Vault holding the app secret. Point external-secrets at this." + value = azurerm_key_vault.this.name +} + +output "app_secret_name" { + description = "Key Vault secret name with the assembled app config (JSON)." 
+ value = azurerm_key_vault_secret.app.name +} + +output "external_secrets_identity_client_id" { + description = "Client id of the managed identity to federate to the external-secrets KSA." + value = azurerm_user_assigned_identity.external_secrets.client_id +} + +output "storage_account_name" { + description = "Exports/backups storage account (empty if disabled)." + value = var.create_storage ? azurerm_storage_account.this[0].name : "" +} + +output "database_url" { + description = "asyncpg DSN (also stored in Key Vault)." + value = local.database_url + sensitive = true +} + +output "redis_url" { + description = "Redis DSN (also stored in Key Vault)." + value = local.redis_url + sensitive = true +} diff --git a/deploy/terraform/azure/postgres.tf b/deploy/terraform/azure/postgres.tf new file mode 100644 index 0000000..77d87cb --- /dev/null +++ b/deploy/terraform/azure/postgres.tf @@ -0,0 +1,49 @@ +# Azure Database for PostgreSQL flexible server, v16. pgvector must be +# allow-listed via the azure.extensions server parameter; the extension itself +# is then created by the app's Alembic migrations (`CREATE EXTENSION ... vector`). + +resource "azurerm_postgresql_flexible_server" "this" { + name = "${var.name_prefix}-pg" + resource_group_name = local.rg_name + location = var.location + version = "16" + + administrator_login = var.db_username + administrator_password = local.db_password + + sku_name = var.db_sku + storage_mb = var.db_storage_mb + + # VNet-integrated (private) access. + delegated_subnet_id = local.db_subnet_id + private_dns_zone_id = local.private_dns_zone_id + public_network_access_enabled = false # provider default is true; azurerm expects false when delegated_subnet_id and private_dns_zone_id are set + + backup_retention_days = var.db_backup_retention_days + + dynamic "high_availability" { + for_each = var.db_ha ? [1] : [] + content { + mode = "ZoneRedundant" + } + } + + tags = local.tags + + # The private DNS zone link must exist before the server is created.
+ depends_on = [azurerm_private_dns_zone_virtual_network_link.pg] +} + +resource "azurerm_postgresql_flexible_server_database" "app" { + name = var.db_name + server_id = azurerm_postgresql_flexible_server.this.id + collation = "en_US.utf8" + charset = "UTF8" +} + +# Allow-list pgvector so the app can `CREATE EXTENSION vector`. +resource "azurerm_postgresql_flexible_server_configuration" "extensions" { + name = "azure.extensions" + server_id = azurerm_postgresql_flexible_server.this.id + value = "VECTOR" +} diff --git a/deploy/terraform/azure/redis.tf b/deploy/terraform/azure/redis.tf new file mode 100644 index 0000000..97b46df --- /dev/null +++ b/deploy/terraform/azure/redis.tf @@ -0,0 +1,17 @@ +# Azure Cache for Redis — result cache + the arq job queue. TLS-only (6380); +# the backend connects with rediss:// using the primary access key. + +resource "azurerm_redis_cache" "this" { + name = "${var.name_prefix}-redis" + location = var.location + resource_group_name = local.rg_name + + capacity = var.redis_capacity + family = var.redis_sku == "Premium" ? "P" : "C" + sku_name = var.redis_sku + + non_ssl_port_enabled = false + minimum_tls_version = "1.2" + + tags = local.tags +} diff --git a/deploy/terraform/azure/storage.tf b/deploy/terraform/azure/storage.tf new file mode 100644 index 0000000..7f969a1 --- /dev/null +++ b/deploy/terraform/azure/storage.tf @@ -0,0 +1,21 @@ +# Optional storage account + container for exports / pg_dump backups. + +resource "azurerm_storage_account" "this" { + count = var.create_storage ? 1 : 0 + name = local.storage_account_name + resource_group_name = local.rg_name + location = var.location + account_tier = "Standard" + account_replication_type = "LRS" + account_kind = "StorageV2" + min_tls_version = "TLS1_2" + + tags = local.tags +} + +resource "azurerm_storage_container" "data" { + count = var.create_storage ? 
1 : 0 + name = "exports" + storage_account_name = azurerm_storage_account.this[0].name + container_access_type = "private" +} diff --git a/deploy/terraform/azure/terraform.tfvars.example b/deploy/terraform/azure/terraform.tfvars.example new file mode 100644 index 0000000..0e57e63 --- /dev/null +++ b/deploy/terraform/azure/terraform.tfvars.example @@ -0,0 +1,25 @@ +# Copy to terraform.tfvars and fill in. Keep secrets out of version control. +# Authenticate first: `az login` (+ set ARM_SUBSCRIPTION_ID or subscription_id). + +location = "eastus" +name_prefix = "querywise-prod" +# subscription_id = "00000000-0000-0000-0000-000000000000" + +# Network — create a VNet with a delegated subnet + private DNS zone, or set +# create_vnet = false and supply db_subnet_id + private_dns_zone_id. +create_vnet = true + +# Postgres flexible server +db_sku = "GP_Standard_D2ds_v5" +db_ha = true + +# Redis +redis_sku = "Standard" +redis_capacity = 1 + +# REQUIRED — Fernet key. Generate with: +# python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" +encryption_key = "CHANGE_ME" + +# LLM provider key(s). +openai_api_key = "CHANGE_ME" diff --git a/deploy/terraform/azure/variables.tf b/deploy/terraform/azure/variables.tf new file mode 100644 index 0000000..137b7b5 --- /dev/null +++ b/deploy/terraform/azure/variables.tf @@ -0,0 +1,176 @@ +# -- General ----------------------------------------------------------------- +variable "subscription_id" { + description = "Azure subscription id. Empty = use the provider's ambient context (ARM_SUBSCRIPTION_ID)." + type = string + default = "" +} + +variable "location" { + description = "Azure region." + type = string + default = "eastus" +} + +variable "name_prefix" { + description = "Prefix for resource names." + type = string + default = "querywise" +} + +variable "tags" { + description = "Extra tags applied to every resource." 
+ type = map(string) + default = {} +} + +# -- Resource group ---------------------------------------------------------- +variable "create_resource_group" { + description = "Create the resource group. If false, it must already exist." + type = bool + default = true +} + +variable "resource_group_name" { + description = "Resource group name. Empty = \"<name_prefix>-rg\" when create_resource_group = true; required when it is false." + type = string + default = "" +} + +# -- Network ----------------------------------------------------------------- +# The Postgres flexible server uses VNet integration (delegated subnet + private +# DNS zone). Set create_vnet = false to supply your own delegated subnet. +variable "create_vnet" { + description = "Create a VNet + delegated subnet + private DNS zone for Postgres." + type = bool + default = true +} + +variable "vnet_cidr" { + description = "VNet CIDR (when create_vnet = true)." + type = string + default = "10.44.0.0/16" +} + +variable "db_subnet_cidr" { + description = "Delegated subnet CIDR for the flexible server." + type = string + default = "10.44.1.0/24" +} + +variable "db_subnet_id" { + description = "Existing delegated subnet id (when create_vnet = false)." + type = string + default = "" +} + +variable "private_dns_zone_id" { + description = "Existing private DNS zone id for Postgres (when create_vnet = false)." + type = string + default = "" +} + +# -- PostgreSQL flexible server (pgvector) ----------------------------------- +variable "db_name" { + description = "Application database name." + type = string + default = "querywise" +} + +variable "db_username" { + description = "Administrator login." + type = string + default = "querywise" +} + +variable "db_password" { + description = "Admin password. Empty = generate one (stored in Key Vault)." + type = string + default = "" + sensitive = true +} + +variable "db_sku" { + description = "Flexible server SKU." + type = string + default = "GP_Standard_D2ds_v5" +} + +variable "db_storage_mb" { + description = "Storage (MB). Minimum 32768."
+ type = number + default = 65536 +} + +variable "db_ha" { + description = "Zone-redundant high availability." + type = bool + default = true +} + +variable "db_backup_retention_days" { + description = "Backup retention (days)." + type = number + default = 7 +} + +# -- Redis ------------------------------------------------------------------- +variable "redis_capacity" { + description = "Redis cache capacity (Standard family C: 0=250MB,1=1GB,...)." + type = number + default = 1 +} + +variable "redis_sku" { + description = "Redis SKU (Basic | Standard | Premium)." + type = string + default = "Standard" +} + +# -- Storage ----------------------------------------------------------------- +variable "create_storage" { + description = "Create a storage account + container for exports/backups." + type = bool + default = true +} + +# -- Application secrets ------------------------------------------------------ +variable "encryption_key" { + description = "Fernet key for connection-string encryption (REQUIRED — see README)." + type = string + sensitive = true +} + +variable "jwt_secret" { + description = "HS256 JWT signing secret. Empty = generate one." + type = string + default = "" + sensitive = true +} + +variable "default_admin_password" { + description = "Optional bootstrap admin password." + type = string + default = "" + sensitive = true +} + +variable "openai_api_key" { + description = "OpenAI API key." + type = string + default = "" + sensitive = true +} + +variable "anthropic_api_key" { + description = "Anthropic API key (optional)." + type = string + default = "" + sensitive = true +} + +variable "azure_openai_api_key" { + description = "Azure OpenAI key (optional)." 
+ type = string + default = "" + sensitive = true +} diff --git a/deploy/terraform/azure/versions.tf b/deploy/terraform/azure/versions.tf new file mode 100644 index 0000000..cea02ae --- /dev/null +++ b/deploy/terraform/azure/versions.tf @@ -0,0 +1,19 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = "~> 3.110" + } + random = { + source = "hashicorp/random" + version = "~> 3.5" + } + } +} + +provider "azurerm" { + features {} + subscription_id = var.subscription_id != "" ? var.subscription_id : null +} diff --git a/deploy/terraform/gcp/.terraform.lock.hcl b/deploy/terraform/gcp/.terraform.lock.hcl new file mode 100644 index 0000000..e625516 --- /dev/null +++ b/deploy/terraform/gcp/.terraform.lock.hcl @@ -0,0 +1,66 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/google" { + version = "5.45.2" + constraints = "~> 5.40" + hashes = [ + "h1:0RjrEaRJMIWbgQ4sBfjjLOy9tZiaKaq4r5J1iVz085E=", + "h1:0ehpLaWItePTA1Ne3WXjtRjI4uzPkdPiNwe+M2NI5Kc=", + "h1:9HblaFutcY1nCoKewYVq8aRKC5wyzLIYyLeMMVuzO9M=", + "h1:9jeOJWcgA9fNRuBzPStcI3/N3NZ6BNoSxqS6O9dEmIE=", + "h1:Dm34S6/Q+8uAtrmX+tWkQZCkrvVveU6lHbk4NBkgGBc=", + "h1:RYRrPC1vvSyNu4aYq5MFkRpTyCx84YMan5cNP01XUUk=", + "h1:YEQOp7Ou1+GtpcKyCX6Cr/mAGqKIogpi85MX51GuG4s=", + "h1:fwPyxJ8zBHeuEyv87dn8YkRHAqXGbJ9AqLN1I8loPr8=", + "h1:lg2ogfA9WQfN1nKFQzWpZEqX+0+/J5yz2hvw6Mea/qk=", + "h1:qYSz8K/mE6U1q05/GBky/xOPaUJ4BZn4f4kyDvxcugo=", + "zh:0931f08e81f220ae3132169cfa4ed8e9d8d2045f29ca914afd8ee9e3e9cf56e0", + "zh:31afa45a4c8a0fd4abff564ecff8b69a97ac1813ead61c12f5f0bf5d33cec7f1", + "zh:536979e437aad59ba41465c9398d8e3d7d3702bfe2a51d80571862d48c817959", + "zh:748e14614be32350ece4e9249e09bc1d20e54421983734ded3a0df6d6674ea71", + "zh:7c8fe641666603aad6693207c8eaac679b9be15246d77090c73a1a84326d6084", + "zh:8095a513a0662323d99c25466b5a291c80b2b0c1857c7c7a7b1159f25dbe4439", + 
"zh:9453db86d14611cab26dba30daf56d1cfef929918207e9e3e78b58299fc8c4fe", + "zh:adaa5df5d40060409b6b66136c0ac37b99fb35ac2cf554c584649c236a18d95b", + "zh:af2f659b4bd1f44e578f203830bdab829b5e635fcf2a59ffa7e997c16e6611ad", + "zh:b75184fe5c162821b0524fa941d6a934c452e815d82e62675bb21bbdc9046dfc", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.9.0" + constraints = "~> 3.5" + hashes = [ + "h1:8EQU5KSxezcjo/phRSe69rDOI0lk4pSaggj7FsskYp8=", + "h1:Lw9im2VBBJQ3RyAbHPQ0rcvcmmcZWm3x+kIOpN+Tv9s=", + "h1:U8KXqGCoNI9/guYbTvzgdtVk3fRthoG0UXwm1JoEpIs=", + "h1:YXaVd4p6qXPPVaxIBaIDNXmBwT02ZqDn0qD+tYpw8sA=", + "h1:cOpc03fphEt/G9Rfc4jLL/fW0D7tgvlXqiDKPF4vuww=", + "h1:g09RR7T1xWkeGrZwWvWMT9ncJrFGr1k3CBD585UmO7w=", + "h1:gGDdPPibmw2EWROx+sh1RGLjR5+nPwZyrf6/N9jXfeM=", + "h1:haE7/nXCOhXKP4oXeEnER3t5CaVQWqujz4nBnpeTUv4=", + "h1:ieSVpfZS2lKuMr05ph0QsOVpCzg7uk3cgKBaXR+Ikug=", + "h1:ig2s1IS9IzehorRjvVAnKIsUUj8fkgyxct1L/kswcc4=", + "h1:j3lS+ZEERFnoab8t1ppDrScGVP/cgWbzlCrEYKTCXYw=", + "h1:lxezrKmOiQIySHAM+os8qLVq7hqufDr8h3Hpzvsk+78=", + "h1:lzRqBJAG+NETxHbEZUJ/YP3RMEjZBinTX7VmgH3lw60=", + "h1:tdSNWK5ApqUsgbdYieyeYLTu6nIZUV3hR1oFqUfAuGo=", + "h1:xedet8yH/zI2CfdxsGlK0nlFWc/Bp61yrWsEa3fHB8g=", + "zh:03f1114cc20b8913523735ab76e0f0a2b16ce13c92923a53304bf85f07fc0dbc", + "zh:105b678ee72322a3067f105d7e05e940f6143238f377f6e87ff4ec909246ac2a", + "zh:55f3bbf13ea18cbace61a706566a80f25f33fe2b1780b6f3d7b582af2a05b6d2", + "zh:63adf996db48f082f7a6351eb485e219cd88795fc71e6ec60a837263ab0d2cb1", + "zh:7e99550738a4e3cc68b8a467714b0d69371025fe95e3326d5323d026d55653e9", + "zh:8342b54af3a18a37e075eeae61be57f4de2ba71b35d95c5075d402dd2c1f289d", + "zh:83ee18e32ac9dd5fc91298554b7c4cfa4c3a1db50f4c797945637cc93c0844ae", + "zh:993ecc0adbf6bd535a59fbc9b735d8c33950e6f6eb5e621d750da9b71d65d80a", + "zh:ad722bc59d4edbf1415e827fc007c0efe6e0e9462d5568bae20b34be1058a261", + "zh:ae9448e1f87b2f9a6c5197a0e9862162ec6b137cb3a3835e11522995d8939e7c", + 
"zh:bc9cdd3aac784f759125c6627f6f6416e8726a1c184eb9cf3e55b9edbc94c627", + "zh:c8e35b89572ba1c40a9b20022e033a3395fb8d42e7604d50c900f193ba10382e", + "zh:e2deaa8a9975ef81d9f62baed12c41286918b0a10908e0e031f13f69a3b730a1", + "zh:ee39707557210a0ab1098aa357d2cdfe502e5a312d0dbdffb09d08facc4d3fc5", + "zh:f81afe4eb63e8aa9e0ea71be6c990f0dc69cb360e7191c0742a991f4a5081b64", + ] +} diff --git a/deploy/terraform/gcp/README.md b/deploy/terraform/gcp/README.md new file mode 100644 index 0000000..605c5b5 --- /dev/null +++ b/deploy/terraform/gcp/README.md @@ -0,0 +1,55 @@ +# QueryWise on GCP — Terraform (data plane + secrets) + +Provisions the managed dependencies the Helm chart needs, in your project: + +- **Cloud SQL PostgreSQL 16** (pgvector-ready, private IP, regional HA, PITR, TLS-only) +- **Memorystore for Redis** (result cache + the arq job queue) +- **Secret Manager** secret with the assembled DSNs + keys +- **GCS** bucket for exports / `pg_dump` backups (optional) +- **VPC + private-services-access** peering (optional — or BYO VPC with PSA) +- **Service account** with `secretAccessor` for the external-secrets operator + +**Compute (GKE / Cloud Run) is out of scope** — BYO or the upstream +[`terraform-google-modules/kubernetes-engine`](https://github.com/terraform-google-modules/terraform-google-kubernetes-engine) +module — then deploy with the Helm chart in [`../../helm/querywise`](../../helm/querywise), +keeping the cluster in a separate state from the database. + +## Usage + +```bash +cp terraform.tfvars.example terraform.tfvars # then edit +terraform init +terraform apply +``` + +Wire it up with the external-secrets operator on GKE: + +1. Bind the service account to the external-secrets KSA with Workload Identity: + ```bash + gcloud iam service-accounts add-iam-policy-binding \ + "$(terraform output -raw external_secrets_sa_email)" \ + --role roles/iam.workloadIdentityUser \ + --member "serviceAccount:PROJECT.svc.id.goog[external-secrets/external-secrets]" + ``` +2. 
Create an `ExternalSecret` that pulls `terraform output app_secret_id` with a + `dataFrom` extract into a Kubernetes Secret named `querywise-secrets` (its + keys already match the backend's env). +3. Install the chart: + ```bash + helm upgrade --install querywise ../../helm/querywise -n querywise \ + --set secrets.existingSecret=querywise-secrets + ``` + +GKE must sit on the same VPC (or a peered one) so pods reach the Cloud SQL +private IP and Memorystore host. + +## pgvector + +The `vector` extension is created by the app's Alembic migrations on first +`helm upgrade` (the migration hook). No instance flag required. + +## Notes + +- `db_deletion_protection = true` (default) blocks destroying the instance. +- Generated DB password / JWT secret live only in Secret Manager + Terraform + state — keep your state backend (a GCS bucket) encrypted and access-controlled. diff --git a/deploy/terraform/gcp/cloudsql.tf b/deploy/terraform/gcp/cloudsql.tf new file mode 100644 index 0000000..bf6b159 --- /dev/null +++ b/deploy/terraform/gcp/cloudsql.tf @@ -0,0 +1,45 @@ +# Cloud SQL for PostgreSQL 16. pgvector is available as an extension and is +# created by the app's Alembic migrations (`CREATE EXTENSION IF NOT EXISTS +# vector`) — no instance flag required. + +resource "google_sql_database_instance" "this" { + name = "${var.name_prefix}-pg" + database_version = "POSTGRES_16" + region = var.region + deletion_protection = var.db_deletion_protection + + # Private IP depends on the PSA peering being established first. + depends_on = [google_service_networking_connection.psa] + + settings { + tier = var.db_tier + availability_type = var.db_ha ? 
"REGIONAL" : "ZONAL" + disk_size = var.db_disk_size + disk_autoresize = true + disk_type = "PD_SSD" + + ip_configuration { + ipv4_enabled = false + private_network = local.network_id + ssl_mode = "ENCRYPTED_ONLY" + } + + backup_configuration { + enabled = true + point_in_time_recovery_enabled = true + } + + user_labels = local.labels + } +} + +resource "google_sql_database" "app" { + name = var.db_name + instance = google_sql_database_instance.this.name +} + +resource "google_sql_user" "app" { + name = var.db_username + instance = google_sql_database_instance.this.name + password = local.db_password +} diff --git a/deploy/terraform/gcp/gcs.tf b/deploy/terraform/gcp/gcs.tf new file mode 100644 index 0000000..09d2a0e --- /dev/null +++ b/deploy/terraform/gcp/gcs.tf @@ -0,0 +1,14 @@ +# Optional bucket for exports / pg_dump backups. Uniform access + versioned. + +resource "google_storage_bucket" "data" { + count = var.create_bucket ? 1 : 0 + name = local.bucket_name + location = var.region + uniform_bucket_level_access = true + force_destroy = false + labels = local.labels + + versioning { + enabled = true + } +} diff --git a/deploy/terraform/gcp/iam.tf b/deploy/terraform/gcp/iam.tf new file mode 100644 index 0000000..669050c --- /dev/null +++ b/deploy/terraform/gcp/iam.tf @@ -0,0 +1,19 @@ +# Service account for the external-secrets operator. 
Grant it accessor on the +# app secret, then bind it to the in-cluster external-secrets KSA with Workload +# Identity (the iam.workloadIdentityUser binding references the GKE workload +# identity pool, created with the cluster — hence kept out of this data module): +# +# gcloud iam service-accounts add-iam-policy-binding <sa-email> \ +# --role roles/iam.workloadIdentityUser \ +# --member "serviceAccount:<project-id>.svc.id.goog[external-secrets/external-secrets]" + +resource "google_service_account" "external_secrets" { + account_id = "${var.name_prefix}-ext-secrets" + display_name = "QueryWise external-secrets accessor" +} + +resource "google_secret_manager_secret_iam_member" "accessor" { + secret_id = google_secret_manager_secret.app.id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.external_secrets.email}" +} diff --git a/deploy/terraform/gcp/main.tf b/deploy/terraform/gcp/main.tf new file mode 100644 index 0000000..c2b4164 --- /dev/null +++ b/deploy/terraform/gcp/main.tf @@ -0,0 +1,41 @@ +locals { + labels = merge({ + "app" = "querywise" + }, var.labels) + + network_id = var.create_network ? google_compute_network.this[0].id : var.network_id + + db_password = var.db_password != "" ? var.db_password : random_password.db[0].result + jwt_secret = var.jwt_secret != "" ? var.jwt_secret : random_password.jwt[0].result + + # Cloud SQL private IP + Memorystore host. The password is percent-encoded so a + # user-supplied db_password with reserved characters cannot break the DSN. + database_url = "postgresql+asyncpg://${var.db_username}:${replace(urlencode(local.db_password), "+", "%20")}@${google_sql_database_instance.this.private_ip_address}:5432/${var.db_name}" + redis_url = "redis://${google_redis_instance.this.host}:${google_redis_instance.this.port}/0" + + bucket_name = var.bucket_name != "" ? 
var.bucket_name : "${var.name_prefix}-${var.project_id}" + + secret_payload = { for k, v in { + DATABASE_URL = local.database_url + REDIS_URL = local.redis_url + ENCRYPTION_KEY = var.encryption_key + JWT_SECRET = local.jwt_secret + DEFAULT_ADMIN_PASSWORD = var.default_admin_password + OPENAI_API_KEY = var.openai_api_key + ANTHROPIC_API_KEY = var.anthropic_api_key + AZURE_OPENAI_API_KEY = var.azure_openai_api_key + } : k => v if v != null && v != "" } +} + +resource "random_password" "db" { + count = var.db_password == "" ? 1 : 0 + length = 32 + special = true + override_special = "-_" +} + +resource "random_password" "jwt" { + count = var.jwt_secret == "" ? 1 : 0 + length = 48 + special = false +} diff --git a/deploy/terraform/gcp/network.tf b/deploy/terraform/gcp/network.tf new file mode 100644 index 0000000..285157e --- /dev/null +++ b/deploy/terraform/gcp/network.tf @@ -0,0 +1,34 @@ +# VPC + private-services-access peering so Cloud SQL gets a private IP. Set +# create_network = false to use an existing VPC that already has PSA configured. + +resource "google_compute_network" "this" { + count = var.create_network ? 1 : 0 + name = "${var.name_prefix}-vpc" + auto_create_subnetworks = false +} + +resource "google_compute_subnetwork" "this" { + count = var.create_network ? 1 : 0 + name = "${var.name_prefix}-subnet" + ip_cidr_range = var.subnet_cidr + region = var.region + network = google_compute_network.this[0].id + private_ip_google_access = true +} + +# Reserved range + connection for private services access (Cloud SQL, etc.). +resource "google_compute_global_address" "psa" { + count = var.create_network ? 1 : 0 + name = "${var.name_prefix}-psa" + purpose = "VPC_PEERING" + address_type = "INTERNAL" + prefix_length = 16 + network = google_compute_network.this[0].id +} + +resource "google_service_networking_connection" "psa" { + count = var.create_network ? 
1 : 0 + network = google_compute_network.this[0].id + service = "servicenetworking.googleapis.com" + reserved_peering_ranges = [google_compute_global_address.psa[0].name] +} diff --git a/deploy/terraform/gcp/outputs.tf b/deploy/terraform/gcp/outputs.tf new file mode 100644 index 0000000..4e160ff --- /dev/null +++ b/deploy/terraform/gcp/outputs.tf @@ -0,0 +1,46 @@ +output "network_id" { + description = "VPC the data plane runs in." + value = local.network_id +} + +output "db_private_ip" { + description = "Cloud SQL private IP." + value = google_sql_database_instance.this.private_ip_address +} + +output "db_instance_connection_name" { + description = "Cloud SQL connection name (for the auth proxy, if used)." + value = google_sql_database_instance.this.connection_name +} + +output "redis_host" { + description = "Memorystore host." + value = google_redis_instance.this.host +} + +output "app_secret_id" { + description = "Secret Manager secret id holding the assembled app secret. Point external-secrets at this." + value = google_secret_manager_secret.app.secret_id +} + +output "external_secrets_sa_email" { + description = "Service account email to bind to the external-secrets KSA via Workload Identity." + value = google_service_account.external_secrets.email +} + +output "bucket_name" { + description = "Exports/backups bucket (empty if disabled)." + value = var.create_bucket ? google_storage_bucket.data[0].name : "" +} + +output "database_url" { + description = "asyncpg DSN (also stored in the app secret)." + value = local.database_url + sensitive = true +} + +output "redis_url" { + description = "Redis DSN (also stored in the app secret)." + value = local.redis_url + sensitive = true +} diff --git a/deploy/terraform/gcp/redis.tf b/deploy/terraform/gcp/redis.tf new file mode 100644 index 0000000..3137dcf --- /dev/null +++ b/deploy/terraform/gcp/redis.tf @@ -0,0 +1,14 @@ +# Memorystore for Redis — result cache + the arq job queue. 
Reachable on the +# authorized VPC's private IP. + +resource "google_redis_instance" "this" { + name = "${var.name_prefix}-redis" + tier = var.redis_ha ? "STANDARD_HA" : "BASIC" + memory_size_gb = var.redis_memory_gb + region = var.region + redis_version = "REDIS_7_0" + + authorized_network = local.network_id + + labels = local.labels +} diff --git a/deploy/terraform/gcp/secrets.tf b/deploy/terraform/gcp/secrets.tf new file mode 100644 index 0000000..8485988 --- /dev/null +++ b/deploy/terraform/gcp/secrets.tf @@ -0,0 +1,18 @@ +# Secret Manager holds the assembled app secret (DSNs + keys) as a JSON blob. +# The external-secrets operator on GKE reads it (via the service account below, +# bound with Workload Identity) and syncs it into the Kubernetes Secret the Helm +# chart references. Keys map 1:1 to the backend's env vars. + +resource "google_secret_manager_secret" "app" { + secret_id = "${var.name_prefix}-app" + labels = local.labels + + replication { + auto {} + } +} + +resource "google_secret_manager_secret_version" "app" { + secret = google_secret_manager_secret.app.id + secret_data = jsonencode(local.secret_payload) +} diff --git a/deploy/terraform/gcp/terraform.tfvars.example b/deploy/terraform/gcp/terraform.tfvars.example new file mode 100644 index 0000000..13e73b5 --- /dev/null +++ b/deploy/terraform/gcp/terraform.tfvars.example @@ -0,0 +1,25 @@ +# Copy to terraform.tfvars and fill in. Keep secrets out of version control. + +project_id = "my-gcp-project" +region = "us-central1" +name_prefix = "querywise-prod" + +# Network — create a VPC with private-services-access, or set create_network = +# false and supply a network_id that already has PSA configured. +create_network = true +# network_id = "projects/my-gcp-project/global/networks/my-vpc" + +# Cloud SQL +db_tier = "db-custom-2-7680" +db_ha = true + +# Memorystore +redis_memory_gb = 1 +redis_ha = true + +# REQUIRED — Fernet key. 
Generate with: +# python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" +encryption_key = "CHANGE_ME" + +# LLM provider key(s). +openai_api_key = "CHANGE_ME" diff --git a/deploy/terraform/gcp/variables.tf b/deploy/terraform/gcp/variables.tf new file mode 100644 index 0000000..dcf249b --- /dev/null +++ b/deploy/terraform/gcp/variables.tf @@ -0,0 +1,155 @@ +# -- General ----------------------------------------------------------------- +variable "project_id" { + description = "GCP project id." + type = string +} + +variable "region" { + description = "GCP region." + type = string + default = "us-central1" +} + +variable "name_prefix" { + description = "Prefix for resource names." + type = string + default = "querywise" +} + +variable "labels" { + description = "Extra labels applied to resources that support them." + type = map(string) + default = {} +} + +# -- Network ----------------------------------------------------------------- +# Cloud SQL private IP needs a VPC with a private-services-access peering range. +variable "create_network" { + description = "Create a VPC + subnet + private-services-access peering. If false, supply network_id (must already have PSA configured)." + type = bool + default = true +} + +variable "subnet_cidr" { + description = "Primary subnet CIDR (when create_network = true)." + type = string + default = "10.43.0.0/20" +} + +variable "network_id" { + description = "Existing VPC self_link/id (when create_network = false)." + type = string + default = "" +} + +# -- Cloud SQL (PostgreSQL + pgvector) --------------------------------------- +variable "db_name" { + description = "Application database name." + type = string + default = "querywise" +} + +variable "db_username" { + description = "Application database user." + type = string + default = "querywise" +} + +variable "db_password" { + description = "DB password. Empty = generate one (stored in Secret Manager)." 
+ type = string + default = "" + sensitive = true +} + +variable "db_tier" { + description = "Cloud SQL machine tier." + type = string + default = "db-custom-2-7680" +} + +variable "db_disk_size" { + description = "Cloud SQL disk size (GiB)." + type = number + default = 50 +} + +variable "db_ha" { + description = "Regional (HA) availability instead of zonal." + type = bool + default = true +} + +variable "db_deletion_protection" { + description = "Block accidental destroy of the instance." + type = bool + default = true +} + +# -- Memorystore (Redis) ----------------------------------------------------- +variable "redis_memory_gb" { + description = "Memorystore capacity (GiB)." + type = number + default = 1 +} + +variable "redis_ha" { + description = "STANDARD_HA tier instead of BASIC." + type = bool + default = true +} + +# -- GCS --------------------------------------------------------------------- +variable "create_bucket" { + description = "Create a GCS bucket for exports/backups." + type = bool + default = true +} + +variable "bucket_name" { + description = "Bucket name. Empty = \"<name_prefix>-<project_id>\"." + type = string + default = "" +} + +# -- Application secrets ------------------------------------------------------ +variable "encryption_key" { + description = "Fernet key for connection-string encryption (REQUIRED — see README)." + type = string + sensitive = true +} + +variable "jwt_secret" { + description = "HS256 JWT signing secret. Empty = generate one." + type = string + default = "" + sensitive = true +} + +variable "default_admin_password" { + description = "Optional bootstrap admin password." + type = string + default = "" + sensitive = true +} + +variable "openai_api_key" { + description = "OpenAI API key." + type = string + default = "" + sensitive = true +} + +variable "anthropic_api_key" { + description = "Anthropic API key (optional)." 
+ type = string + default = "" + sensitive = true +} + +variable "azure_openai_api_key" { + description = "Azure OpenAI key (optional)." + type = string + default = "" + sensitive = true +} diff --git a/deploy/terraform/gcp/versions.tf b/deploy/terraform/gcp/versions.tf new file mode 100644 index 0000000..0ab2d32 --- /dev/null +++ b/deploy/terraform/gcp/versions.tf @@ -0,0 +1,19 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.40" + } + random = { + source = "hashicorp/random" + version = "~> 3.5" + } + } +} + +provider "google" { + project = var.project_id + region = var.region +} diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..5c38628 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,121 @@ +# Production stack for QueryWise (small / on-prem single-tenant). +# +# cp .env.prod.example .env.prod # then edit secrets +# docker compose -f docker-compose.prod.yml --env-file .env.prod up -d --build +# +# Topology: frontend(nginx edge) ──> backend(uvicorn, N workers) +# ├─> app-db (pgvector) +# └─> redis (cache + arq jobs) +# worker(arq) ─> redis/app-db migrate(one-shot) ─> app-db +# +# TLS: terminate at the `frontend` edge by mounting certs + adding a 443 server +# block, or front the stack with a cloud LB / external nginx. See the Helm chart +# / Terraform modules (deploy/) for managed-ingress deployments. + +services: + app-db: + image: pgvector/pgvector:pg16 + restart: unless-stopped + environment: + POSTGRES_DB: ${POSTGRES_DB:-querywise} + POSTGRES_USER: ${POSTGRES_USER:-querywise} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD in .env.prod} + volumes: + - app_db_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-querywise} -d ${POSTGRES_DB:-querywise}"] + interval: 10s + timeout: 5s + retries: 5 + # No host port by default — only the backend/worker/migrate reach it. 
+ + redis: + image: redis:7-alpine + restart: unless-stopped + command: ["redis-server", "--appendonly", "yes"] + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + + # One-shot schema migration. Backend/worker wait for this to finish cleanly + # so multiple backend replicas never race on `alembic upgrade`. + migrate: + build: + context: ./backend + dockerfile: Dockerfile.prod + image: querywise-backend:prod + restart: "no" + env_file: + - .env.prod + environment: + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-querywise}:${POSTGRES_PASSWORD}@app-db:5432/${POSTGRES_DB:-querywise} + command: ["alembic", "upgrade", "head"] + depends_on: + app-db: + condition: service_healthy + + backend: + build: + context: ./backend + dockerfile: Dockerfile.prod + image: querywise-backend:prod + restart: unless-stopped + env_file: + - .env.prod + environment: + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-querywise}:${POSTGRES_PASSWORD}@app-db:5432/${POSTGRES_DB:-querywise} + REDIS_URL: redis://redis:6379/0 + JOB_BACKEND: arq + UVICORN_WORKERS: ${UVICORN_WORKERS:-4} + # Auto-setup seeds the sample DB — off in prod (point at a real warehouse). + AUTO_SETUP_SAMPLE_DB: ${AUTO_SETUP_SAMPLE_DB:-false} + depends_on: + app-db: + condition: service_healthy + redis: + condition: service_healthy + migrate: + condition: service_completed_successfully + + # arq worker: runs background jobs (embeddings, schedules) off the request path. 
+ worker: + build: + context: ./backend + dockerfile: Dockerfile.prod + image: querywise-backend:prod + restart: unless-stopped + env_file: + - .env.prod + environment: + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-querywise}:${POSTGRES_PASSWORD}@app-db:5432/${POSTGRES_DB:-querywise} + REDIS_URL: redis://redis:6379/0 + JOB_BACKEND: arq + command: ["arq", "app.jobs.worker.WorkerSettings"] + depends_on: + redis: + condition: service_healthy + migrate: + condition: service_completed_successfully + + # Edge: serves the SPA bundle and reverse-proxies /api, /mcp, /health. + frontend: + build: + context: ./frontend + dockerfile: Dockerfile.prod + args: + # Empty => SPA calls the API same-origin; nginx proxies to backend. + VITE_API_URL: "" + image: querywise-frontend:prod + restart: unless-stopped + ports: + - "${HTTP_PORT:-80}:8080" + depends_on: + - backend + +volumes: + app_db_data: + redis_data: diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..bc801d5 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,7 @@ +node_modules/ +dist/ +.env +.env.* +*.log +.git/ +.DS_Store diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod new file mode 100644 index 0000000..853e0e9 --- /dev/null +++ b/frontend/Dockerfile.prod @@ -0,0 +1,38 @@ +# syntax=docker/dockerfile:1 +# +# Hardened, multi-stage production image for the QueryWise frontend. +# * builder stage produces the static Vite bundle +# * runtime stage serves it from the unprivileged nginx image (non-root, +# listens on 8080) and reverse-proxies the API to the backend +# +# Build: docker build -f Dockerfile.prod -t querywise-frontend:prod . +# +# VITE_API_URL is baked in at build time. Leave it empty ("") so the SPA calls +# the API same-origin (/api/v1) and nginx proxies it to the backend — the +# build-once, configure-by-deployment shape. Override only for a split-origin +# deployment where the API lives on a different host. 
+ +# ---- builder --------------------------------------------------------------- +FROM node:20-slim AS builder + +WORKDIR /app +COPY package*.json ./ +RUN npm ci + +COPY . . + +ARG VITE_API_URL="" +ENV VITE_API_URL=${VITE_API_URL} +RUN npm run build + +# ---- runtime --------------------------------------------------------------- +# Unprivileged nginx: master + workers run as uid 101, listens on 8080. +FROM nginxinc/nginx-unprivileged:1.27-alpine AS runtime + +COPY nginx.conf /etc/nginx/conf.d/default.conf +COPY --from=builder /app/dist /usr/share/nginx/html + +EXPOSE 8080 + +HEALTHCHECK --interval=15s --timeout=5s --start-period=10s --retries=3 \ + CMD wget -qO- http://127.0.0.1:8080/healthz >/dev/null 2>&1 || exit 1 diff --git a/frontend/nginx.conf b/frontend/nginx.conf new file mode 100644 index 0000000..53cbd74 --- /dev/null +++ b/frontend/nginx.conf @@ -0,0 +1,68 @@ +# QueryWise frontend edge: serves the SPA bundle and reverse-proxies the API, +# MCP, and health endpoints to the backend. Runs in the unprivileged nginx +# image (worker uid 101, listens on 8080); TLS is terminated upstream (the +# compose `edge`/ingress, or a cloud LB). See docker-compose.prod.yml. + +server { + listen 8080; + server_name _; + + # Resolve the backend at request time via Docker's embedded DNS (127.0.0.11) + # so the edge boots even when the backend is still starting / restarting. + # A static `upstream` would make nginx refuse to start if it can't resolve. + resolver 127.0.0.11 valid=10s ipv6=off; + set $backend http://backend:8000; + + # SPA assets. + root /usr/share/nginx/html; + index index.html; + + # Don't leak the nginx version. + server_tokens off; + + # Cap request bodies (knowledge imports / uploads are modest). + client_max_body_size 25m; + + # Container-internal healthcheck target (see Dockerfile.prod HEALTHCHECK). 
+ location = /healthz { + access_log off; + default_type text/plain; + return 200 "ok\n"; + } + + # API + MCP + health proxied to the backend, same-origin. + location /api/ { + proxy_pass $backend; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 120s; + } + + # MCP is streamable HTTP (SSE) — disable buffering so events flush live. + location /mcp { + proxy_pass $backend; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 3600s; + } + + # Hashed Vite assets — cache hard. + location /assets/ { + expires 1y; + add_header Cache-Control "public, immutable"; + try_files $uri =404; + } + + # SPA fallback — every other path serves index.html for client routing. + location / { + try_files $uri $uri/ /index.html; + } +} diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 1842c31..9c44ec9 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -1,6 +1,9 @@ import axios from 'axios'; -const API_BASE = import.meta.env.VITE_API_URL || 'http://localhost:8000'; +// `??` (not `||`) so a deliberately-empty VITE_API_URL is honored: the prod +// build sets it to "" so the SPA calls the API same-origin (/api/v1) behind +// nginx. Unset (dev) still falls back to the local backend. +const API_BASE = import.meta.env.VITE_API_URL ?? 'http://localhost:8000'; export const api = axios.create({ baseURL: `${API_BASE}/api/v1`,