diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..9018dd8
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+# Build-context filter for the api/ and worker/ images (context = repo root:
+# api/ imports worker/ modules — see api/Dockerfile). web/ builds from web/ with
+# its own .dockerignore. Env files are excluded at ANY depth (no baked secrets).
+.git
+.github
+.claude
+docs
+infra
+web
+**/.venv
+**/__pycache__
+**/.pytest_cache
+**/.ruff_cache
+**/.mypy_cache
+**/node_modules
+**/tests/samples
+**/.env
+**/.env.*
diff --git a/api/Dockerfile b/api/Dockerfile
new file mode 100644
index 0000000..d5fdb82
--- /dev/null
+++ b/api/Dockerfile
@@ -0,0 +1,51 @@
+# api/ — FastAPI HTTP layer. BUILD FROM THE REPO ROOT:
+#
+#   docker build -f api/Dockerfile .
+#
+# The context must be the repo root because api/ imports `db`, `embedding`,
+# `retrieval`, and `scripts.bootstrap_milvus` from ../worker at RUNTIME (the
+# one allowed cross-package import — root CLAUDE.md). A `COPY api/`-only
+# build fails at boot with ModuleNotFoundError.
+#
+# Deps install from api/uv.lock with `uv sync --frozen` so the
+# [tool.uv.sources] CPU-only torch index is honored — re-resolving (or pip)
+# would drag in ~2GB of unused CUDA wheels.
+#
+# The three inference models (BGE-Large ~1.3GB, ms-marco cross-encoder
+# ~90MB, BGE-M3 ~2.3GB) are NOT baked into this image. They live in the
+# shared `sermon-hf-cache` volume (HF_HOME), downloaded once by the
+# `prewarm` one-shot in infra/docker-compose.prod.yml; the runtime then
+# loads them lazily with HF_HUB_OFFLINE=1 so no request ever blocks on
+# (or flakes over) a HuggingFace network round-trip.
+
+FROM python:3.12-slim AS deps
+COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv
+ENV UV_PYTHON_DOWNLOADS=never \
+    UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1
+WORKDIR /app/api
+COPY api/pyproject.toml api/uv.lock ./
+# package=false in pyproject: this installs dependencies only (no project build)
+# into /app/api/.venv, which the final stage copies. --no-dev skips pytest/ruff/pyright.
+RUN uv sync --frozen --no-dev
+
+FROM python:3.12-slim
+# libgomp1: OpenMP runtime needed by the torch-CPU / sentence-transformers
+# native wheels on slim images. ca-certificates: HTTPS to the LLM endpoint
+# (generativelanguage.googleapis.com / api.ppq.ai).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends libgomp1 ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=deps /app/api/.venv /app/api/.venv
+COPY worker/ /app/worker/
+COPY api/ /app/api/
+ENV PATH="/app/api/.venv/bin:$PATH" \
+    PYTHONPATH=/app/worker \
+    PYTHONUNBUFFERED=1
+WORKDIR /app/api
+EXPOSE 8000
+# Exactly ONE uvicorn worker: each process lazily loads ~3.7GB of models, so
+# --workers N would cost N × 3.7GB on the shared box. Handlers offload
+# blocking CPU/Milvus work via asyncio.to_thread, so a single worker serves
+# concurrent requests fine; the bottleneck is model wall-time, not async I/O.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/docs/DEPLOY_AWS.md b/docs/DEPLOY_AWS.md
new file mode 100644
index 0000000..696a2ef
--- /dev/null
+++ b/docs/DEPLOY_AWS.md
@@ -0,0 +1,132 @@
+# Deploying sermon.guide to AWS (v0, single box)
+
+Operator runbook for the v0 deployment: the whole stack — Postgres, Redis,
+Milvus (+etcd +MinIO), FastAPI api, Celery worker, Next.js web, Caddy TLS
+edge — on **one EC2 instance** via `infra/docker-compose.prod.yml`. This is
+the dollar-store rendering of ARCHITECTURE.md §1's "~$50/mo" target; the
+KEDA/k8s shape stays Phase 30.
+
+## TL;DR
+
+```bash
+aws configure                       # once: credentials + region
+cd infra/aws
+./provision.sh                      # EC2 t3a.xlarge + EIP + SG (~2min)
+./deploy.sh                         # clone, build, migrate, bootstrap, prewarm, smoke (~20min first run)
+                                    #   deploys your CURRENT branch — it must be pushed to origin
+                                    #   (deploy.sh preflights this and tells you if not)
+./stop.sh                           # done for the day → compute billing off
+./start.sh                          # back up on the same IP in ~3min
+./status.sh                         # state + URL + billing posture
+./destroy.sh                        # everything gone, $0
+```
+
+The site serves at `https://<elastic-ip>` with a self-signed cert (Caddy's
+internal CA) — the browser warns once per browser; proceed. No domain is
+required; see [Adding a domain](#adding-a-domain-later) for the day one exists.
+
+## Cost
+
+| State | What bills | ≈/mo |
+| --- | --- | --- |
+| Running 24/7 | t3a.xlarge ($0.1504/hr us-east-1) + 100GB gp3 + EIP | ~$122 |
+| Running 8h/day | compute ~⅓ + disk + EIP | ~$48 |
+| **Stopped** | 100GB gp3 (~$8) + EIP (~$3.65) | **~$12** |
+
+t3a is burstable (unlimited mode by default): sustained heavy CPU — e.g.
+hours of ingest — can accrue small credit-overage charges; fine for bursty
+beta use, watch CloudWatch `CPUSurplusCreditCharged` if you batch-ingest a
+whole library. **Set a billing alarm** (Billing → Budgets) — nothing in this
+stack does it for you.
+
+LLM spend is separate (per-query, via `GOOGLE_API_KEY`/`PPQ_API_KEY`); a
+warm `/search-summary` was ~6¢ in the Phase 14b live verify. Set a spend cap
+at the provider.
+
+## What deploy.sh actually does
+
+1. **Code** → `git clone`/`reset --hard` of the chosen branch into `/opt/sermon/app`.
+2. **Secrets** → first run generates `/opt/sermon/.env.prod` *on the box*
+   (`openssl rand`; JWT secret, Postgres/Redis/MinIO passwords). Secrets never
+   exist on a dev machine or in git. `docker-compose.prod.yml` uses `${VAR:?}`
+   so a missing secret refuses to boot rather than falling back to the
+   dev defaults baked into the code (the Phase 18 startup-guard gap, mitigated
+   at the compose layer).
+3. **LLM keys** → if `GOOGLE_API_KEY`/`PPQ_API_KEY` are set in your local
+   shell when you run `./deploy.sh`, they're forwarded into the box's env
+   file (PPQ also flips the provider). Without one, everything works except
+   `/search-summary`, which 503s naming the missing var.
+4. **Build** → all four images build on the instance (first run ~10–20min).
+5. **Bootstrap** → `alembic upgrade head`, `bootstrap_milvus.py` (both
+   idempotent), then the `prewarm` one-shot downloads the three models
+   (~3.7GB) into the shared `sermon-hf-cache` volume — the only moment the
+   stack talks to HuggingFace. Runtime containers run `HF_HUB_OFFLINE=1`.
+6. **Up + smoke** → `up -d --wait`, then an outside-in signup→login→/library
+   pass through Caddy with a cookie jar.
+
+## Security posture (read before sharing the URL)
+
+Mitigated at deploy time, no app-code changes:
+
+- **Only Caddy publishes ports** (80/443). Postgres/Redis/Milvus (which has
+  *no auth*)/MinIO/etcd/api/web are compose-network-only; the security group
+  (443/80 world, 22 admin-IP) is the backstop.
+- **Strong generated secrets**; compose hard-fails if any is missing.
+- **Per-IP rate limits at Caddy**: 10/min on `/api/auth/*`, 6/min on
+  `/api/search-summary` + `/api/upload`, 600/min general (Phase 19 gap).
+- **Body caps at the edge**: 1MB on JSON routes, 210MB global (the api's own
+  200MB streamed cap is the real gate; this protects the Node proxy).
+- **Celery `--time-limit=3600`** so one poisoned upload can't pin the CPU.
+- **Secure session cookie**: web runs `NODE_ENV=production` (bakes the
+  `Secure` attribute), everything redirects to HTTPS.
+- The api is **not publicly reachable at all** — browsers only ever hit the
+  Next route handlers, which attach the JWT server-side.
+
+Accepted v0 risks (documented in the Phase 17–30 plan, revisit before real
+launch): open signup (rate-limited but no email verification/CAPTCHA),
+`task_id`-as-capability on `/tasks/{id}`, no Pydantic `extra='forbid'`,
+no graceful degradation (a Milvus blip = 500), CPU latency (warm
+`/search-summary` ≈ 2min — "user is reading, not chatting").
+
+## Day-2 operations
+
+```bash
+# logs (whole stack / one service)
+ssh -i ~/.ssh/sermon-guide.pem ubuntu@<ip>
+cd /opt/sermon/app
+docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod logs -f [api|worker|web|caddy|milvus]
+
+# redeploy after pushing changes to the branch
+./deploy.sh                      # re-runs build/migrate/bootstrap idempotently
+
+# add or rotate the LLM key later
+ssh … 'sed -i "s|^GOOGLE_API_KEY=.*|GOOGLE_API_KEY=<key>|" /opt/sermon/.env.prod'
+ssh … 'cd /opt/sermon/app && docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod up -d api'
+
+# manual backup before risky changes (Phase 28 will do this properly)
+aws ec2 create-snapshot --volume-id $(aws ec2 describe-instances \
+  --filters Name=tag:Name,Values=sermon-guide Name=instance-state-name,Values=running,stopped \
+  --query 'Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId' --output text) \
+  --description "sermon-guide manual backup"
+```
+
+## Adding a domain later
+
+1. Point an A record at the Elastic IP.
+2. On the box, edit `/opt/sermon/.env.prod`:
+   `SITE_HOST=sermon.guide` and
+   `SERMON_API_CORS_ORIGINS=["https://sermon.guide"]`.
+3. Remove the `tls internal` line from `infra/caddy/Caddyfile` (commit that
+   change) so automatic Let's Encrypt takes over.
+4. `docker compose … up -d caddy api`. Add an HSTS header in the Caddyfile
+   once the real cert is confirmed working (deliberately absent now — HSTS on
+   a self-signed IP would lock browsers out).
+
+## Known deltas vs the phase plan
+
+- This is operator tooling on branch `deploy/aws-v0`, not Phase 29/30:
+  images build on the box (no registry/CI), models live in a volume rather
+  than baked into images. When Phase 29 lands proper image-build CI, these
+  Dockerfiles are its starting point and `prewarm` becomes a build step.
+- `web/next.config.ts` gained `output: "standalone"` (required for the slim
+  web image; dev/CI behavior unchanged).
diff --git a/infra/AGENTS.md b/infra/AGENTS.md
index df24e73..2d1b84c 100644
--- a/infra/AGENTS.md
+++ b/infra/AGENTS.md
@@ -1,14 +1,31 @@
 # infra/ — agent instructions
 
-Local-development infrastructure for sermon.guide v0. Production lives in
-k8s manifests later (see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).
+Local-development AND v0 single-box production infrastructure for
+sermon.guide. The k8s/KEDA shape stays post-v0
+(see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).
 
 ## What lives here
 
-- `docker-compose.yml` — Postgres 16, Redis 7, Milvus standalone v2.6 with its
-  required etcd + MinIO dependencies. Brought up via `make up` from repo root.
+- `docker-compose.yml` — local dev data plane: Postgres 16, Redis 7, Milvus
+  standalone v2.6 with its required etcd + MinIO dependencies. Brought up via
+  `make up` from repo root.
 - `.env.example` — template for `infra/.env` (gitignored). `make up` copies
   the example to `.env` on first run.
+- `docker-compose.prod.yml` — the v0 single-box AWS stack (data plane + api/
+  worker/web + Caddy edge). DELIBERATELY self-contained, not an overlay:
+  compose merges `ports:` additively, and "only Caddy publishes a port" is
+  the security property the file guarantees. Keep its data-plane blocks in
+  sync with `docker-compose.yml` when bumping versions. Runbook:
+  [docs/DEPLOY_AWS.md](../docs/DEPLOY_AWS.md).
+- `caddy/` — TLS edge (Dockerfile + Caddyfile: rate limits, body caps,
+  default_sni for bare-IP deploys).
+- `scripts/` — deploy-time one-shots (model prewarm into the shared HF cache).
+- `aws/` — provision/deploy/start/stop/status/destroy lifecycle scripts.
+  Tag-based and re-runnable; secrets are generated ON the instance, never
+  committed.
+- `env.prod.template` — documents `/opt/sermon/.env.prod` (generated on-box
+  by `aws/deploy.sh`). Deliberately NOT dot-env-named so repo tooling can
+  read it.
 - Future: `k8s/` Helm values + KEDA scaler config (post-v0).
 
 ## Conventions
diff --git a/infra/aws/common.sh b/infra/aws/common.sh
new file mode 100755
index 0000000..b2e3653
--- /dev/null
+++ b/infra/aws/common.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# Shared helpers for the sermon.guide AWS scripts. Source, don't execute.
+#
+# Conventions:
+#   - Everything is found by tag, not by stored state: the instance, SG, EIP
+#     and key pair all carry Name/Project=sermon-guide tags, so the scripts
+#     are re-runnable from any checkout.
+#   - Region/profile come from the ambient AWS CLI config (aws configure /
+#     AWS_PROFILE / AWS_REGION); override per-invocation with env vars.
+
+set -euo pipefail
+
+TAG_NAME="${SERMON_AWS_NAME:-sermon-guide}"
+KEY_NAME="${SERMON_AWS_KEY_NAME:-${TAG_NAME}}"
+KEY_FILE="${SERMON_AWS_KEY_FILE:-${HOME}/.ssh/${TAG_NAME}.pem}"
+SSH_USER="ubuntu"
+
+aws() {
+  command aws "$@"  # `command` skips shell-function lookup, so this runs the real aws binary
+}
+
+region() {  # Print the effective region: env overrides first (the CLI calls use them), then the config file.
+  echo "${AWS_REGION:-${AWS_DEFAULT_REGION:-$(aws configure get region 2>/dev/null || true)}}"
+}
+
+die() {  # Report "$*" on stderr with an ERROR: prefix, then abort the script with status 1.
+  printf 'ERROR: %s\n' "$*" >&2
+  exit 1
+}
+
+require_aws() {
+  # type -P searches PATH only; `command -v` also matches the aws() shell function in this file, so it could never fail.
+  type -P aws >/dev/null 2>&1 || die "aws CLI not found (expected on PATH, e.g. ~/.local/bin/aws)"
+  aws sts get-caller-identity >/dev/null 2>&1 || die "AWS credentials not configured — run: aws configure"
+  [ -n "$(region)" ] || die "no default region — set one via aws configure or AWS_REGION"
+}
+
+# Newest non-terminated instance tagged Name=$TAG_NAME; empty if none.
+find_instance() {
+  aws ec2 describe-instances \
+    --filters "Name=tag:Name,Values=${TAG_NAME}" \
+              "Name=instance-state-name,Values=pending,running,stopping,stopped" \
+    --query 'sort_by(Reservations[].Instances[], &LaunchTime)[-1].InstanceId' \
+    --output text 2>/dev/null | grep -v '^None$' || true
+}
+
+instance_state() {
+  aws ec2 describe-instances --instance-ids "$1" \
+    --query 'Reservations[0].Instances[0].State.Name' --output text
+}
+
+# Elastic IP tagged Name=$TAG_NAME; prints "ALLOC_ID IP" or nothing.
+find_eip() {
+  aws ec2 describe-addresses \
+    --filters "Name=tag:Name,Values=${TAG_NAME}" \
+    --query 'Addresses[0].[AllocationId,PublicIp]' --output text 2>/dev/null \
+    | grep -v '^None' || true
+}
+
+find_security_group() {
+  aws ec2 describe-security-groups \
+    --filters "Name=group-name,Values=${TAG_NAME}-sg" \
+    --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null \
+    | grep -v '^None$' || true
+}
+
+ssh_cmd() {
+  # accept-new = trust-on-first-use: conventional for a box we just created
+  # ourselves, but the first connect is unauthenticated — paranoid operators
+  # can pre-pin the host key from the EC2 console's system log.
+  ssh -i "${KEY_FILE}" \
+    -o StrictHostKeyChecking=accept-new \
+    -o ConnectTimeout=10 \
+    "${SSH_USER}@$1" "${@:2}"
+}
+
+wait_for_ssh() {  # Poll SSH on $1: up to 40 attempts, 5s pause between them, die when exhausted.
+  local ip="$1" attempt
+  echo "waiting for SSH on ${ip} …"
+  for attempt in {1..40}; do
+    if ssh_cmd "${ip}" true 2>/dev/null; then return 0; fi
+    [ "${attempt}" -eq 40 ] || sleep 5
+  done
+  die "SSH to ${ip} not reachable after ~3min"
+}
diff --git a/infra/aws/deploy.sh b/infra/aws/deploy.sh
new file mode 100755
index 0000000..8cfba26
--- /dev/null
+++ b/infra/aws/deploy.sh
@@ -0,0 +1,207 @@
+#!/usr/bin/env bash
+# Deploy (or re-deploy) sermon.guide onto the provisioned EC2 box.
+#
+#   ./deploy.sh             # deploys the branch you're currently on
+#   BRANCH=main ./deploy.sh # deploy a specific branch
+#
+# What it does, in order (every step idempotent, safe to re-run):
+#   1. clone/pull the repo on the instance (/opt/sermon/app)
+#   2. first run only: generate /opt/sermon/.env.prod — strong secrets are
+#      created ON the box with openssl and never leave it
+#   3. forward GOOGLE_API_KEY / PPQ_API_KEY from the local env if set
+#      (so the LLM key never lands in git or chat either). Keys are STICKY:
+#      once set on the box they persist across deploys until you edit
+#      /opt/sermon/.env.prod by hand — running deploy.sh without the var
+#      exported leaves the old key (and provider) in place.
+#   4. docker compose build (first build ~10-20min: torch wheels, Next build,
+#      xcaddy compile)
+#   5. up the data plane, run one-shots: migrate → bootstrap-milvus → prewarm
+#      (prewarm downloads ~3.7GB of models into the hf-cache volume once)
+#   6. up everything, then smoke-test from the OUTSIDE (signup→login→library
+#      through Caddy with a cookie jar)
+
+. "$(dirname "$0")/common.sh"
+
+REPO_URL="${REPO_URL:-https://github.com/sovITxyz/sermon.guide.git}"
+BRANCH="${BRANCH:-$(git -C "$(dirname "$0")/../.." rev-parse --abbrev-ref HEAD 2>/dev/null || echo main)}"
+
+# Preflight: the box clones BRANCH from origin over anonymous HTTPS. If the
+# branch isn't pushed, the remote `git clone --branch` fails with a cryptic
+# exit-128 mid-SSH — catch it here with an actionable message instead.
+GIT_TERMINAL_PROMPT=0 git ls-remote --exit-code --heads "${REPO_URL}" "refs/heads/${BRANCH}" >/dev/null 2>&1 \
+  || die "branch '${BRANCH}' is not on origin (${REPO_URL}) — push it first: git push -u origin ${BRANCH}"
+
+require_aws
+
+instance_id="$(find_instance)"
+[ -n "${instance_id}" ] || die "no instance found — run provision.sh first"
+state="$(instance_state "${instance_id}")"
+[ "${state}" = "running" ] || die "instance ${instance_id} is ${state} — run start.sh first"
+
+eip_info="$(find_eip)"
+[ -n "${eip_info}" ] || die "no Elastic IP tagged ${TAG_NAME}"
+ip="$(echo "${eip_info}" | awk '{print $2}')"
+
+echo "deploying branch '${BRANCH}' to ${instance_id} @ ${ip}"
+wait_for_ssh "${ip}"
+
+# Wait for cloud-init (Docker install) on a fresh box.
+ssh_cmd "${ip}" "cloud-init status --wait >/dev/null 2>&1 || true"
+ssh_cmd "${ip}" "command -v docker >/dev/null" || die "Docker missing on instance — cloud-init failed? check /var/log/cloud-init-output.log"
+
+# Optional LLM keys forwarded from the local env (never stored in git); %q-quoted because they are spliced into the remote script as shell source.
+llm_env=""
+[ -n "${GOOGLE_API_KEY:-}" ] && llm_env="GOOGLE_API_KEY=$(printf '%q' "${GOOGLE_API_KEY}")"
+[ -n "${PPQ_API_KEY:-}" ] && llm_env="${llm_env} PPQ_API_KEY=$(printf '%q' "${PPQ_API_KEY}")"
+
+ssh_cmd "${ip}" "bash -s" <<REMOTE
+set -euo pipefail
+export ${llm_env:-_NOOP=1}
+
+# --- 1. code ---
+if [ ! -d /opt/sermon/app/.git ]; then
+  git clone --branch "${BRANCH}" "${REPO_URL}" /opt/sermon/app
+else
+  git -C /opt/sermon/app fetch origin
+  git -C /opt/sermon/app checkout "${BRANCH}"
+  git -C /opt/sermon/app reset --hard "origin/${BRANCH}"
+fi
+
+# --- 2. env file (first run only; secrets generated on-box) ---
+if [ ! -f /opt/sermon/.env.prod ]; then
+  umask 177
+  cat > /opt/sermon/.env.prod <<EOF
+SERMON_API_JWT_SECRET=\$(openssl rand -hex 48)
+SERMON_POSTGRES_PASSWORD=\$(openssl rand -hex 24)
+SERMON_REDIS_PASSWORD=\$(openssl rand -hex 24)
+SERMON_MINIO_ROOT_PASSWORD=\$(openssl rand -hex 24)
+SERMON_POSTGRES_USER=sermon
+SERMON_POSTGRES_DB=sermon
+SERMON_MINIO_ROOT_USER=sermon-minio
+SITE_HOST=${ip}
+SERMON_API_CORS_ORIGINS=["https://${ip}"]
+SERMON_API_LLM_PROVIDER=google
+SERMON_API_LLM_MODEL=
+GOOGLE_API_KEY=
+PPQ_API_KEY=
+EOF
+  umask 022
+  echo "generated /opt/sermon/.env.prod"
+fi
+
+# --- 3. LLM keys forwarded from the operator's shell, if any ---
+# Values are written via shell vars + redirects only — never as argv of
+# sed/etc., so a secret can't appear in the box's process list mid-write.
+set_kv() {
+  ( umask 177; { grep -v "^\$1=" /opt/sermon/.env.prod; printf '%s=%s\n' "\$1" "\$2"; } > /opt/sermon/.env.prod.new )
+  mv /opt/sermon/.env.prod.new /opt/sermon/.env.prod
+}
+if [ -n "\${GOOGLE_API_KEY:-}" ]; then
+  set_kv GOOGLE_API_KEY "\${GOOGLE_API_KEY}"
+  echo "GOOGLE_API_KEY updated"
+fi
+if [ -n "\${PPQ_API_KEY:-}" ]; then
+  set_kv PPQ_API_KEY "\${PPQ_API_KEY}"
+  set_kv SERMON_API_LLM_PROVIDER ppq
+  echo "PPQ_API_KEY updated (provider → ppq)"
+fi
+
+cd /opt/sermon/app
+compose() {
+  docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod "\$@"
+}
+
+# --- 4. build ---
+compose build
+
+# --- 5. data plane up → one-shots ---
+# NOTE: every 'compose run' below MUST have stdin redirected (</dev/null).
+# This whole script arrives on the box via 'bash -s' reading ssh stdin, and
+# 'docker compose run' attaches the container's stdin by default — without
+# the redirect the one-shot container EATS THE REST OF THIS SCRIPT off the
+# stream and bash silently exits 0 after that line (observed on the first
+# real deploy: everything after 'migrate' vanished).
+# (And no backticks anywhere in this heredoc: the delimiter is unquoted, so
+# backticks COMMAND-SUBSTITUTE ON THE OPERATOR MACHINE during expansion —
+# a backticked 'bash -s' in an earlier version of this very comment ran
+# locally and hung the deploy reading stdin.)
+compose up -d --wait postgres redis etcd minio milvus
+compose run --rm migrate </dev/null
+# Milvus's :9091 healthz can report healthy moments before the :19530 gRPC
+# path accepts queries on a cold standalone — retry instead of aborting the
+# whole deploy on that race.
+for attempt in 1 2 3; do
+  if compose run --rm bootstrap-milvus </dev/null; then
+    break
+  fi
+  if [ "\${attempt}" = 3 ]; then
+    echo "bootstrap-milvus failed after 3 attempts" >&2
+    exit 1
+  fi
+  echo "bootstrap-milvus attempt \${attempt} failed (cold Milvus gRPC?) — retrying in 10s"
+  sleep 10
+done
+
+# Prewarm only when the heaviest model isn't cached yet (~3.7GB once).
+if ! docker run --rm -v sermon_sermon-hf-cache:/hf-cache alpine:3.20 \
+    test -d /hf-cache/hub/models--BAAI--bge-m3 2>/dev/null; then
+  compose run --rm prewarm </dev/null
+else
+  echo "hf-cache already warm — skipping prewarm"
+fi
+
+# --- 6. full stack ---
+compose up -d --wait
+compose ps
+REMOTE
+
+# --- smoke test from the outside, through Caddy ---
+echo
+echo "smoke-testing https://${ip} …"
+jar="$(mktemp)"
+trap 'rm -f "${jar}"' EXIT
+
+# / 307s to /library, which 307s to /login without a cookie (app/page.tsx +
+# middleware.ts) — follow the chain and assert the final page serves.
+code="$(curl -skL -o /dev/null -w '%{http_code}' "https://${ip}/")"
+echo "  GET / (-L)       → ${code}"
+[ "${code}" = "200" ] || die "landing page not serving"
+
+# Fresh throwaway user per deploy so the FULL authed path (signup → login →
+# cookie → authed page) is asserted on every run, not just the first; any
+# unexpected code (500/502/503/429) is a hard failure, not a silent skip.
+# Domain must be @example.com — the api's email validation 422s reserved
+# TLDs like .test (verified against the live stack).
+smoke_user="smoke-$(openssl rand -hex 4)@example.com"
+smoke_pw="smoke-$(openssl rand -hex 8)"
+code="$(curl -sk -o /dev/null -w '%{http_code}' -X POST "https://${ip}/api/auth/signup" \
+  -H 'Content-Type: application/json' \
+  -d "{\"email\":\"${smoke_user}\",\"password\":\"${smoke_pw}\"}")"
+echo "  POST signup      → ${code}"
+[ "${code}" = "201" ] || die "signup returned ${code} — api/postgres path not healthy"
+
+code="$(curl -sk -c "${jar}" -o /dev/null -w '%{http_code}' -X POST "https://${ip}/api/auth/login" \
+  -H 'Content-Type: application/json' \
+  -d "{\"email\":\"${smoke_user}\",\"password\":\"${smoke_pw}\"}")"
+echo "  POST login       → ${code}"
+[ "${code}" = "200" ] || die "login failed"
+grep -q sg_session "${jar}" || die "session cookie not set"
+
+code="$(curl -sk -b "${jar}" -o /dev/null -w '%{http_code}' "https://${ip}/library")"
+echo "  GET /library     → ${code} (authed)"
+[ "${code}" = "200" ] || die "authed library page failed"
+
+code="$(curl -sk -o /dev/null -w '%{http_code}' "https://${ip}/library")"
+echo "  GET /library     → ${code} (no cookie; 307 → /login expected)"
+
+code="$(curl -sk -o /dev/null -w '%{http_code}' -X POST "https://${ip}/api/auth/login" \
+  -H 'Content-Type: application/json' \
+  -d "{\"email\":\"${smoke_user}\",\"password\":\"wrong-password\"}")"
+echo "  bad login        → ${code} (401 expected)"
+[ "${code}" = "401" ] || die "bad credentials did not 401"
+
+echo
+echo "deployed ✓  https://${ip}"
+echo "  (self-signed cert — browser will warn once; Accept/Proceed is expected)"
+echo "  logs:    ssh -i ${KEY_FILE} ${SSH_USER}@${ip} 'cd /opt/sermon/app && docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod logs -f'"
+echo "  stop:    ./stop.sh   (compute billing stops; disk+IP ≈ \$12/mo)"
diff --git a/infra/aws/destroy.sh b/infra/aws/destroy.sh
new file mode 100755
index 0000000..d03c630
--- /dev/null
+++ b/infra/aws/destroy.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# PERMANENTLY tear down the sermon.guide AWS footprint: instance (and its
+# EBS volume — Postgres, Milvus, uploads, everything), Elastic IP, security
+# group. The key pair is kept unless --delete-key is passed.
+#
+# There are no backups unless you made a snapshot first:
+#   aws ec2 create-snapshot --volume-id <vol> --description sermon-pre-destroy
+
+. "$(dirname "$0")/common.sh"
+
+require_aws
+instance_id="$(find_instance)"
+eip_info="$(find_eip)"
+sg_id="$(find_security_group)"
+
+echo "will destroy:"
+echo "  instance : ${instance_id:-none} (+ its EBS volume — ALL DATA)"
+echo "  eip      : ${eip_info:-none}"
+echo "  sg       : ${sg_id:-none}"
+printf 'type "destroy sermon-guide" to confirm: '
+read -r confirm
+[ "${confirm}" = "destroy sermon-guide" ] || die "aborted"
+
+if [ -n "${instance_id}" ]; then
+  aws ec2 terminate-instances --instance-ids "${instance_id}" >/dev/null
+  echo "terminating ${instance_id} …"
+  aws ec2 wait instance-terminated --instance-ids "${instance_id}"
+fi
+
+if [ -n "${eip_info}" ]; then
+  alloc_id="$(echo "${eip_info}" | awk '{print $1}')"
+  aws ec2 release-address --allocation-id "${alloc_id}"
+  echo "released EIP"
+fi
+
+if [ -n "${sg_id}" ]; then
+  aws ec2 delete-security-group --group-id "${sg_id}"
+  echo "deleted security group"
+fi
+
+if [ "${1:-}" = "--delete-key" ]; then
+  aws ec2 delete-key-pair --key-name "${KEY_NAME}"
+  rm -f "${KEY_FILE}"
+  echo "deleted key pair + ${KEY_FILE}"
+fi
+
+echo "destroyed ✓ (billing for this stack is now \$0)"
diff --git a/infra/aws/provision.sh b/infra/aws/provision.sh
new file mode 100755
index 0000000..f7f8f18
--- /dev/null
+++ b/infra/aws/provision.sh
@@ -0,0 +1,146 @@
+#!/usr/bin/env bash
+# Provision the sermon.guide EC2 box: security group (443/80 world, 22 admin
+# IP only), key pair, Ubuntu 24.04 instance with Docker via user-data, and an
+# Elastic IP (so stop/start keeps the same address — stopped instances bill
+# only EBS ~$8/mo + EIP ~$3.65/mo, not compute).
+#
+# Idempotent-ish: re-running finds the tagged instance and exits with info.
+#
+#   ./provision.sh                          # t3a.xlarge, 100GB gp3, current region
+#   INSTANCE_TYPE=t3a.large ./provision.sh  # override sizing
+#   SSH_CIDR=1.2.3.4/32 ./provision.sh      # override admin-SSH source
+
+. "$(dirname "$0")/common.sh"
+
+INSTANCE_TYPE="${INSTANCE_TYPE:-t3a.xlarge}"
+VOLUME_GB="${VOLUME_GB:-100}"
+
+require_aws
+REGION="$(region)"
+echo "region: ${REGION}  type: ${INSTANCE_TYPE}  disk: ${VOLUME_GB}GB gp3"
+
+existing="$(find_instance)"
+if [ -n "${existing}" ]; then
+  state="$(instance_state "${existing}")"
+  eip_info="$(find_eip)"
+  echo "already provisioned: ${existing} (${state})  eip: ${eip_info:-none}"
+  echo "use deploy.sh / start.sh / stop.sh / destroy.sh"
+  exit 0
+fi
+
+# --- default VPC ---
+vpc_id="$(aws ec2 describe-vpcs --filters Name=isDefault,Values=true \
+  --query 'Vpcs[0].VpcId' --output text)"
+if [ "${vpc_id}" = "None" ] || [ -z "${vpc_id}" ]; then
+  echo "no default VPC — creating one"
+  vpc_id="$(aws ec2 create-default-vpc --query 'Vpc.VpcId' --output text)"
+fi
+echo "vpc: ${vpc_id}"
+
+# --- security group: 80/443 world, 22 admin only ---
+# Admin IP is resolved BEFORE the SG exists, and rules are (re)applied even
+# when the SG already exists — so a half-created SG from an aborted earlier
+# run self-heals instead of silently shipping with no ingress (SSH lockout).
+if [ -z "${SSH_CIDR:-}" ]; then
+  my_ip="$(curl -fsS https://checkip.amazonaws.com || true)"
+  [ -n "${my_ip}" ] || die "could not determine admin IP — set SSH_CIDR=x.x.x.x/32 explicitly"
+  SSH_CIDR="${my_ip}/32"
+fi
+
+sg_id="$(find_security_group)"
+if [ -z "${sg_id}" ]; then
+  sg_id="$(aws ec2 create-security-group \
+    --group-name "${TAG_NAME}-sg" \
+    --description "sermon.guide single-box: 443/80 public, 22 admin" \
+    --vpc-id "${vpc_id}" \
+    --tag-specifications "ResourceType=security-group,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \
+    --query 'GroupId' --output text)"
+fi
+echo "sg: ${sg_id}  ssh from: ${SSH_CIDR}"
+for perm in \
+  "IpProtocol=tcp,FromPort=80,ToPort=80,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \
+  "IpProtocol=tcp,FromPort=443,ToPort=443,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \
+  "IpProtocol=udp,FromPort=443,ToPort=443,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \
+  "IpProtocol=tcp,FromPort=22,ToPort=22,IpRanges=[{CidrIp=${SSH_CIDR}}]"; do
+  out="$(aws ec2 authorize-security-group-ingress --group-id "${sg_id}" \
+    --ip-permissions "${perm}" 2>&1)" \
+    || { echo "${out}" | grep -q InvalidPermission.Duplicate || die "SG ingress failed: ${out}"; }
+done
+
+# --- key pair ---
+if ! aws ec2 describe-key-pairs --key-names "${KEY_NAME}" >/dev/null 2>&1; then
+  echo "creating key pair ${KEY_NAME} → ${KEY_FILE}"
+  mkdir -p "$(dirname "${KEY_FILE}")"
+  aws ec2 create-key-pair --key-name "${KEY_NAME}" \
+    --key-type ed25519 \
+    --tag-specifications "ResourceType=key-pair,Tags=[{Key=Project,Value=${TAG_NAME}}]" \
+    --query 'KeyMaterial' --output text > "${KEY_FILE}"
+  chmod 600 "${KEY_FILE}"
+elif [ ! -f "${KEY_FILE}" ]; then
+  die "key pair ${KEY_NAME} exists in AWS but ${KEY_FILE} is missing locally — delete the AWS key pair or set SERMON_AWS_KEY_FILE"
+fi
+
+# --- AMI: latest Ubuntu 24.04 LTS amd64 ---
+ami_id="$(aws ssm get-parameters \
+  --names /aws/service/canonical/ubuntu/server/24.04/stable/current/amd64/hvm/ebs-gp3/ami-id \
+  --query 'Parameters[0].Value' --output text)"
+echo "ami: ${ami_id}"
+
+# --- user-data: Docker engine + compose v2 (official repo; distro packages
+# are too old for the compose features the prod file uses) ---
+user_data="$(mktemp)"
+trap 'rm -f "${user_data}"' EXIT
+cat > "${user_data}" <<'CLOUDINIT'
+#!/bin/bash
+set -euxo pipefail
+apt-get update
+apt-get install -y ca-certificates curl
+install -m 0755 -d /etc/apt/keyrings
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+chmod a+r /etc/apt/keyrings/docker.asc
+echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \
+https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" \
+  > /etc/apt/sources.list.d/docker.list
+apt-get update
+apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+usermod -aG docker ubuntu
+mkdir -p /opt/sermon
+chown ubuntu:ubuntu /opt/sermon
+CLOUDINIT
+
+# --- launch ---
+instance_id="$(aws ec2 run-instances \
+  --image-id "${ami_id}" \
+  --instance-type "${INSTANCE_TYPE}" \
+  --key-name "${KEY_NAME}" \
+  --security-group-ids "${sg_id}" \
+  --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_GB},VolumeType=gp3,DeleteOnTermination=true}" \
+  --metadata-options "HttpEndpoint=enabled,HttpTokens=required" \
+  --instance-initiated-shutdown-behavior stop \
+  --user-data "file://${user_data}" \
+  --tag-specifications \
+    "ResourceType=instance,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \
+    "ResourceType=volume,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \
+  --query 'Instances[0].InstanceId' --output text)"
+echo "instance: ${instance_id} — waiting for running state"
+aws ec2 wait instance-running --instance-ids "${instance_id}"
+
+# --- elastic IP (survives stop/start) ---
+eip_info="$(find_eip)"
+if [ -z "${eip_info}" ]; then
+  alloc_id="$(aws ec2 allocate-address --domain vpc \
+    --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \
+    --query 'AllocationId' --output text)"
+else
+  alloc_id="$(echo "${eip_info}" | awk '{print $1}')"
+fi
+aws ec2 associate-address --instance-id "${instance_id}" --allocation-id "${alloc_id}" >/dev/null
+eip="$(aws ec2 describe-addresses --allocation-ids "${alloc_id}" --query 'Addresses[0].PublicIp' --output text)"
+
+echo
+echo "provisioned ✓"
+echo "  instance : ${instance_id} (${INSTANCE_TYPE}, ${REGION})"
+echo "  ip       : ${eip}"
+echo "  ssh      : ssh -i ${KEY_FILE} ${SSH_USER}@${eip}"
+echo
+echo "cloud-init is installing Docker (~2min). next: ./deploy.sh"
diff --git a/infra/aws/start.sh b/infra/aws/start.sh
new file mode 100755
index 0000000..ef8d7c0
--- /dev/null
+++ b/infra/aws/start.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Start the (stopped) sermon.guide instance. The Elastic IP and all docker
+# volumes persist across stop/start; every service has restart:
+# unless-stopped, so the whole stack comes back on its own (~2-3min until
+# Milvus is healthy).
+
+. "$(dirname "$0")/common.sh"
+
+require_aws
+instance_id="$(find_instance)"
+[ -n "${instance_id}" ] || die "no instance found — run provision.sh"
+
+state="$(instance_state "${instance_id}")"
+if [ "${state}" = "running" ]; then
+  echo "already running"
+else
+  echo "starting ${instance_id} (was: ${state}) …"
+  # EC2 rejects start-instances while a stop is still in flight
+  # (IncorrectInstanceState), so let a "stopping" instance settle first.
+  if [ "${state}" = "stopping" ]; then
+    aws ec2 wait instance-stopped --instance-ids "${instance_id}"
+  fi
+  aws ec2 start-instances --instance-ids "${instance_id}" >/dev/null
+  aws ec2 wait instance-running --instance-ids "${instance_id}"
+fi
+
+eip_info="$(find_eip)"
+ip="$(echo "${eip_info}" | awk '{print $2}')"
+# No Elastic IP means no stable address: say so, never a hostless URL.
+if [ -z "${ip}" ]; then
+  echo "running ✓  (no Elastic IP found, so no stable URL; compute billing is on)"
+else
+  echo "running ✓  https://${ip}  (give the stack ~2-3min; compute billing is on)"
+fi
diff --git a/infra/aws/status.sh b/infra/aws/status.sh
new file mode 100755
index 0000000..7d863a1
--- /dev/null
+++ b/infra/aws/status.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Show the sermon.guide instance state, address, and rough cost posture.
+
+. "$(dirname "$0")/common.sh"
+
+require_aws
+instance_id="$(find_instance)"
+# Nothing provisioned is a normal answer for a status query, not an error.
+[ -n "${instance_id}" ] || { echo "no instance provisioned (run provision.sh)"; exit 0; }
+
+aws ec2 describe-instances --instance-ids "${instance_id}" \
+  --output table \
+  --query 'Reservations[0].Instances[0].{id:InstanceId,state:State.Name,type:InstanceType,az:Placement.AvailabilityZone,launched:LaunchTime}'
+
+eip_info="$(find_eip)"
+if [ -n "${eip_info}" ]; then
+  ip="$(awk '{print $2}' <<<"${eip_info}")"
+  echo "elastic ip : ${ip}"
+  echo "url        : https://${ip}"
+  echo "ssh        : ssh -i ${KEY_FILE} ${SSH_USER}@${ip}"
+fi
+
+state="$(instance_state "${instance_id}")"
+if [ "${state}" = "running" ]; then
+  echo "billing    : compute ON (~\$0.15/hr for t3a.xlarge) + EBS + EIP"
+elif [ "${state}" = "stopped" ]; then
+  echo "billing    : compute OFF — EBS (~\$8/mo) + EIP (~\$3.65/mo) only"
+else
+  echo "billing    : transitional (${state})"
+fi
diff --git a/infra/aws/stop.sh b/infra/aws/stop.sh
new file mode 100755
index 0000000..74150e3
--- /dev/null
+++ b/infra/aws/stop.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Stop the sermon.guide instance. Compute billing stops; you keep paying only
+# EBS storage (~$8/mo for 100GB gp3) + the Elastic IP (~$3.65/mo). All data
+# (Postgres, Milvus, uploads, model cache) lives on the EBS volume and
+# survives. start.sh brings everything back on the same IP.
+
+. "$(dirname "$0")/common.sh"
+
+require_aws
+instance_id="$(find_instance)"
+[ -n "${instance_id}" ] || die "no instance found"
+
+state="$(instance_state "${instance_id}")"
+# Idempotent: an instance that is already down needs no API call.
+if [ "${state}" != "stopped" ]; then
+  echo "stopping ${instance_id} …"
+  aws ec2 stop-instances --instance-ids "${instance_id}" >/dev/null
+  aws ec2 wait instance-stopped --instance-ids "${instance_id}"
+  echo "stopped ✓  (resting cost ≈ \$12/mo: EBS + Elastic IP. ./start.sh to resume)"
+else
+  echo "already stopped"
+fi
diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile
new file mode 100644
index 0000000..f4ecc2f
--- /dev/null
+++ b/infra/caddy/Caddyfile
@@ -0,0 +1,104 @@
+# Caddy edge config for the single-box AWS deploy.
+#
+# SITE_HOST comes from the environment (set in /opt/sermon/.env.prod by
+# infra/aws/deploy.sh) and is a BARE host — an IP for now, a domain later:
+#   - IP-only deploy:  SITE_HOST=<elastic-ip>  → Caddy's internal CA
+#     self-signs for the IP (browser shows a one-time warning).
+#   - Domain later:    SITE_HOST=sermon.guide  → also REMOVE the
+#     `tls internal` line below so automatic Let's Encrypt takes over,
+#     and consider adding HSTS then (deliberately NOT set now: HSTS + a
+#     self-signed IP cert would lock browsers out with no bypass).
+#
+# default_sni is LOAD-BEARING for the IP deploy: browsers do not send SNI
+# when the URL host is an IP address (RFC 6066), and without it Caddy can't
+# pick a certificate and aborts the handshake ("tlsv1 alert internal
+# error") — verified empirically against this exact image. The internal-CA
+# IP cert is correct on this box because it has a single interface/EIP; do
+# not add on_demand_tls here (known wrong-SAN failure modes on multi-IP
+# hosts — caddy#5479).
+#
+# Only web:3000 is proxied. The api is NOT publicly exposed at all — every
+# browser call goes through the Next.js same-origin route handlers, which
+# attach the JWT server-side (web/AGENTS.md auth model). Postgres/Redis/
+# Milvus/MinIO/etcd publish no host ports (see docker-compose.prod.yml).
+
+{
+	admin off
+	# rate_limit is a non-standard directive; give it an explicit slot.
+	order rate_limit before basic_auth
+	# default_sni: cert selection when a client sends NO SNI (browsers on
+	# bare-IP URLs). fallback_sni: when a client sends an SNI that matches
+	# no cert (e.g. busybox ssl_client sends a literal-IP SNI — observed
+	# breaking the container healthcheck on the first AWS deploy).
+	default_sni {$SITE_HOST}
+	fallback_sni {$SITE_HOST}
+}
+
+https://{$SITE_HOST} {
+
+	tls internal
+
+	encode zstd gzip
+
+	# Body caps at the edge. Two request_body handlers NEST MaxBytesReaders, so
+	# the smallest matching cap wins: JSON routes are capped at 1MB even though
+	# the catch-all also matches. The catch-all is 200MB (decimal), deliberately
+	# BELOW the api's 200MiB (209,715,200-byte) streamed cap, so oversized
+	# uploads are rejected here before the Next proxy buffers them via
+	# req.formData() — the api stays authoritative for everything smaller.
+	@json_routes path /api/auth/* /api/search-summary /api/tasks/*
+	request_body @json_routes {
+		max_size 1MB
+	}
+	request_body {
+		max_size 200MB
+	}
+
+	# Per-IP rate limits (HTTP 429 + Retry-After when exceeded). Keyed on
+	# {remote_host} = the TCP peer address — Caddy ignores client-supplied
+	# X-Forwarded-For unless trusted_proxies is configured, so the buckets
+	# can't be spoofed sideways.
+	#   auth:  blunt credential stuffing / mass signup (Phase 19 gap).
+	#   heavy: /search-summary is ~2min of CPU + a paid LLM call; /upload
+	#          triggers tens of minutes of CPU ingest. Keep them scarce.
+	#   general: per-IP flood backstop for everything else.
+	rate_limit {
+		zone auth {
+			match {
+				path /api/auth/*
+			}
+			key {remote_host}
+			events 10
+			window 1m
+		}
+		zone heavy {
+			match {
+				path /api/search-summary /api/upload
+			}
+			key {remote_host}
+			events 6
+			window 1m
+		}
+		zone general {
+			key {remote_host}
+			events 600
+			window 1m
+		}
+	}
+
+	# No response timeout overrides: /search-summary legitimately holds the
+	# connection ~130-300s (web's own upstream AbortSignal is 300s); Caddy's
+	# defaults don't time out upstream reads, so leave them alone.
+	reverse_proxy web:3000
+}
+
+# Container-internal health listener (loopback-only, NOT host-published).
+# The compose healthcheck probes this instead of the TLS site: it proves
+# caddy is up AND the proxy→web chain works, without busybox wget's TLS/SNI
+# quirks; deploy.sh's external smoke test covers the real TLS path. NOTE: a
+# second site block needs the braced multi-site Caddyfile form — a bare
+# single site swallows later blocks as unknown directives (broke a deploy).
+http://127.0.0.1:8081 {
+	bind 127.0.0.1 # the address host only matches Host; bind pins the socket
+	reverse_proxy web:3000
+}
diff --git a/infra/caddy/Dockerfile b/infra/caddy/Dockerfile
new file mode 100644
index 0000000..2f17940
--- /dev/null
+++ b/infra/caddy/Dockerfile
@@ -0,0 +1,12 @@
+# Caddy with the rate-limit module — the single public-facing service.
+# Stock caddy images don't ship rate limiting; mholt/caddy-ratelimit is
+# compiled in via xcaddy as the no-app-code edge mitigation for the Phase 19
+# gap (open signup + expensive /search-summary and /upload paths).
+# NOTE(review): the module is unpinned, so rebuilds can pull different code;
+# consider `--with github.com/mholt/caddy-ratelimit@<version-or-commit>`.
+FROM caddy:2.11.4-builder AS builder
+RUN xcaddy build \
+    --with github.com/mholt/caddy-ratelimit
+
+FROM caddy:2.11.4
+COPY --from=builder /usr/bin/caddy /usr/bin/caddy
diff --git a/infra/docker-compose.prod.yml b/infra/docker-compose.prod.yml
new file mode 100644
index 0000000..0c215bf
--- /dev/null
+++ b/infra/docker-compose.prod.yml
@@ -0,0 +1,348 @@
+# sermon.guide v0 — single-box PRODUCTION stack (AWS EC2 + docker compose).
+#
+# Brought up by infra/aws/deploy.sh on the instance:
+#
+#   docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod …
+#
+# DELIBERATELY SELF-CONTAINED, not an overlay on docker-compose.yml: Compose
+# merges `ports:` lists additively, so an overlay cannot *remove* the dev
+# file's host-published data-plane ports — and "nothing but Caddy publishes
+# a port" is the security property this file exists to guarantee. Keep the
+# data-plane service definitions (images, healthchecks) in sync with
+# docker-compose.yml when bumping versions there.
+#
+# Public surface: caddy 80/443 — NOTHING ELSE. Postgres/Redis/etcd/MinIO/
+# Milvus/api/web are reachable only on the internal compose network (Milvus
+# has no auth at all; the AWS security group is the backstop, this file is
+# the guarantee).
+#
+# Secrets come from /opt/sermon/.env.prod (generated once by deploy.sh,
+# never committed). `${VAR:?…}` interpolation makes compose REFUSE to start
+# with a missing secret — the no-code-change stand-in for the Phase 18
+# startup guard, so the dev-default JWT secret can never silently reach prod.
+#
+# One-shots live behind the "ops" profile so `up` never starts them:
+#   docker compose … run --rm migrate            # alembic upgrade head
+#   docker compose … run --rm bootstrap-milvus   # create library_vectors
+#   docker compose … run --rm prewarm            # models → hf-cache volume
+#
+# NOTE: do not run this file on a machine that also runs the dev compose —
+# same project name + container names (one box, one stack).
+
+name: sermon
+
+x-logging: &logging
+  logging:
+    driver: json-file
+    options:
+      max-size: "10m"
+      max-file: "3"
+
+# Connection env shared by api / worker / one-shots: service names + the real
+# container-internal ports. The code defaults (localhost, 54322/63792) are
+# deliberately wrong so that prod config is always explicit — set everything.
+x-backend-env: &backend-env
+  SERMON_POSTGRES_HOST: postgres
+  SERMON_POSTGRES_PORT: "5432"
+  SERMON_POSTGRES_USER: ${SERMON_POSTGRES_USER:?}
+  SERMON_POSTGRES_PASSWORD: ${SERMON_POSTGRES_PASSWORD:?}
+  SERMON_POSTGRES_DB: ${SERMON_POSTGRES_DB:?}
+  SERMON_REDIS_HOST: redis
+  SERMON_REDIS_PORT: "6379"
+  SERMON_REDIS_PASSWORD: ${SERMON_REDIS_PASSWORD:?}
+  SERMON_MILVUS_HOST: milvus
+  SERMON_MILVUS_PORT: "19530"
+  # The worker never reads SERMON_API_UPLOAD_DIR today (ingest tasks receive
+  # absolute paths from the api), but keeping api and worker agreed on the
+  # value makes the shared-volume contract explicit and refactor-safe.
+  SERMON_API_UPLOAD_DIR: /data/uploads
+  HF_HOME: /hf-cache
+  HF_HUB_OFFLINE: "1"
+  TRANSFORMERS_OFFLINE: "1"
+
+services:
+
+  # --- data plane (mirrors docker-compose.yml, minus every host port) ---
+
+  postgres:
+    image: postgres:16-alpine
+    container_name: sermon-postgres
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      POSTGRES_USER: ${SERMON_POSTGRES_USER:?}
+      POSTGRES_PASSWORD: ${SERMON_POSTGRES_PASSWORD:?}
+      POSTGRES_DB: ${SERMON_POSTGRES_DB:?}
+    volumes:
+      - sermon-postgres:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
+      interval: 5s
+      timeout: 5s
+      retries: 12
+
+  redis:
+    image: redis:7-alpine
+    container_name: sermon-redis
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      REDIS_PASSWORD: ${SERMON_REDIS_PASSWORD:?}
+    command: >
+      redis-server
+      --appendonly yes
+      --requirepass ${SERMON_REDIS_PASSWORD:?}
+    volumes:
+      - sermon-redis:/data
+    healthcheck:
+      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a $$REDIS_PASSWORD ping | grep -q PONG"]
+      interval: 5s
+      timeout: 5s
+      retries: 12
+
+  etcd:
+    image: quay.io/coreos/etcd:v3.5.25
+    container_name: sermon-etcd
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      ETCD_AUTO_COMPACTION_MODE: revision
+      ETCD_AUTO_COMPACTION_RETENTION: "1000"
+      ETCD_QUOTA_BACKEND_BYTES: "4294967296"
+      ETCD_SNAPSHOT_COUNT: "50000"
+    command: >
+      etcd
+      -advertise-client-urls=http://etcd:2379
+      -listen-client-urls=http://0.0.0.0:2379
+      --data-dir=/etcd
+    volumes:
+      - sermon-etcd:/etcd
+    healthcheck:
+      test: ["CMD", "etcdctl", "endpoint", "health"]
+      interval: 10s
+      timeout: 10s
+      retries: 6
+
+  minio:
+    image: minio/minio:RELEASE.2024-05-28T17-19-04Z
+    container_name: sermon-minio
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      MINIO_ROOT_USER: ${SERMON_MINIO_ROOT_USER:?}
+      MINIO_ROOT_PASSWORD: ${SERMON_MINIO_ROOT_PASSWORD:?}
+    command: server /data --console-address ":9001"
+    volumes:
+      - sermon-minio:/data
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 10s
+      timeout: 10s
+      retries: 6
+
+  milvus:
+    image: milvusdb/milvus:v2.6.15
+    container_name: sermon-milvus
+    restart: unless-stopped
+    <<: *logging
+    command: ["milvus", "run", "standalone"]
+    security_opt:
+      - seccomp:unconfined
+    environment:
+      ETCD_ENDPOINTS: etcd:2379
+      MINIO_ADDRESS: minio:9000
+      MINIO_REGION: us-east-1
+      MINIO_ACCESSKEYID: ${SERMON_MINIO_ROOT_USER:?}
+      MINIO_SECRETACCESSKEY: ${SERMON_MINIO_ROOT_PASSWORD:?}
+    volumes:
+      - sermon-milvus:/var/lib/milvus
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+      interval: 10s
+      timeout: 10s
+      retries: 12
+      start_period: 60s
+    depends_on:
+      etcd:
+        condition: service_healthy
+      minio:
+        condition: service_healthy
+
+  # --- app tier ---
+
+  api:
+    build:
+      context: ..
+      dockerfile: api/Dockerfile
+    image: sermon-api:v0
+    container_name: sermon-api
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      <<: *backend-env
+      SERMON_API_JWT_SECRET: ${SERMON_API_JWT_SECRET:?generate with `openssl rand -hex 48`}
+      SERMON_API_CORS_ORIGINS: ${SERMON_API_CORS_ORIGINS:?JSON list, e.g. ["https://1.2.3.4"]}
+      SERMON_API_LLM_PROVIDER: ${SERMON_API_LLM_PROVIDER:-google}
+      SERMON_API_LLM_MODEL: ${SERMON_API_LLM_MODEL:-}
+      GOOGLE_API_KEY: ${GOOGLE_API_KEY:-}
+      PPQ_API_KEY: ${PPQ_API_KEY:-}
+    volumes:
+      - sermon-uploads:/data/uploads
+      - sermon-hf-cache:/hf-cache
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/healthz', timeout=3)"]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 30s
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      milvus:
+        condition: service_healthy
+
+  worker:
+    build:
+      context: ..
+      dockerfile: worker/Dockerfile
+    image: sermon-worker:v0
+    container_name: sermon-worker
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      <<: *backend-env
+    volumes:
+      # Same mount points as the api: uploads are handed off by absolute path on
+      # the shared volume, and both tiers lazy-load models from one HF cache.
+      - sermon-uploads:/data/uploads
+      - sermon-hf-cache:/hf-cache
+    # Warm-shutdown grace before SIGKILL; 120s is comfort, not correctness: a
+    # killed ingest is requeued after celery_app.py's 300s visibility timeout
+    # (acks_late + reject_on_worker_lost) and MinHash dedup converges the re-run.
+    # NOTE(review): 300s is far below the image's 3600s --time-limit; confirm a
+    # long, still-running ingest is not redelivered by the broker meanwhile.
+    stop_grace_period: 120s
+    healthcheck:
+      test: ["CMD-SHELL", "celery -A celery_app inspect ping -d celery@$$HOSTNAME --timeout 10 | grep -q pong"]
+      interval: 60s
+      timeout: 15s
+      retries: 3
+      start_period: 30s
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      milvus:
+        condition: service_healthy
+
+  web:
+    build:
+      context: ../web
+    image: sermon-web:v0
+    container_name: sermon-web
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      # Server-only (lib/config.ts); internal compose address. NEVER expose
+      # the api publicly — browsers only ever talk to the Next route handlers.
+      API_BASE_URL: http://api:8000
+      NODE_ENV: production
+    healthcheck:
+      test: ["CMD-SHELL", "node -e \"fetch('http://127.0.0.1:3000/').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))\""]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 20s
+    depends_on:
+      api:
+        condition: service_healthy
+
+  caddy:
+    build:
+      context: ./caddy
+    image: sermon-caddy:v0
+    container_name: sermon-caddy
+    restart: unless-stopped
+    <<: *logging
+    # THE ONLY HOST-PUBLISHED PORTS IN THIS FILE. 443/udp is HTTP/3.
+    ports:
+      - "80:80"
+      - "443:443"
+      - "443:443/udp"
+    environment:
+      # Bare host (no scheme): the Elastic IP for now, the domain later.
+      # Feeds both the site address and default_sni in the Caddyfile.
+      SITE_HOST: ${SITE_HOST:?bare host, e.g. 1.2.3.4 (or sermon.guide once DNS exists)}
+    volumes:
+      - ./caddy/Caddyfile:/etc/caddy/Caddyfile:ro
+      - sermon-caddy-data:/data
+      - sermon-caddy-config:/config
+    healthcheck:
+      # Probes the loopback-only :8081 listener in the Caddyfile: proves
+      # caddy is serving AND the reverse_proxy→web chain works. Deliberately
+      # NOT the TLS site — busybox wget sends a literal-IP SNI that can't
+      # match the site cert (observed unhealthy-looping on the first AWS
+      # deploy); the TLS path is verified by deploy.sh's external smoke test.
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:8081/login || exit 1"]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 10s
+    depends_on:
+      web:
+        condition: service_healthy
+
+  # --- one-shots (profile "ops"; never started by `up`) ---
+
+  migrate:
+    image: sermon-worker:v0
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      <<: *backend-env
+    command: ["alembic", "upgrade", "head"]
+    depends_on:
+      postgres:
+        condition: service_healthy
+
+  bootstrap-milvus:
+    image: sermon-worker:v0
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      <<: *backend-env
+    command: ["python", "scripts/bootstrap_milvus.py"]
+    depends_on:
+      milvus:
+        condition: service_healthy
+
+  prewarm:
+    image: sermon-api:v0
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      # Only place the stack is allowed to talk to HuggingFace: populate the
+      # shared cache volume, then everything else runs OFFLINE=1 against it.
+      HF_HOME: /hf-cache
+      HF_HUB_OFFLINE: "0"
+      TRANSFORMERS_OFFLINE: "0"
+    volumes:
+      - sermon-hf-cache:/hf-cache
+      # Bind source is compose-file-relative (infra/), unlike the ..-relative
+      # build contexts above.
+      - ./scripts/prewarm_models.py:/prewarm_models.py:ro
+    command: ["python", "/prewarm_models.py"]
+
+volumes:
+  sermon-postgres:
+  sermon-redis:
+  sermon-etcd:
+  sermon-minio:
+  sermon-milvus:
+  sermon-uploads:
+  sermon-hf-cache:
+  sermon-caddy-data:
+  sermon-caddy-config:
diff --git a/infra/env.prod.template b/infra/env.prod.template
new file mode 100644
index 0000000..65aa776
--- /dev/null
+++ b/infra/env.prod.template
@@ -0,0 +1,42 @@
+# sermon.guide — PRODUCTION environment template.
+#
+# The real file lives at /opt/sermon/.env.prod ON THE INSTANCE, generated by
+# infra/aws/deploy.sh on first deploy (strong secrets via openssl rand, never
+# committed, never on a dev machine). This template documents every variable;
+# it is deliberately NOT named .env.* so repo tooling can read it (.claude
+# settings deny Read on .env*) and .gitignore can't accidentally track a
+# secret-bearing sibling.
+#
+# docker-compose.prod.yml uses ${VAR:?} interpolation for everything marked
+# REQUIRED — compose refuses to start if one is missing, so the dev-default
+# placeholders in code (jwt "change-me-…", passwords "sermon_local_dev")
+# can never silently reach production.
+
+# --- secrets (REQUIRED; deploy.sh generates these on first run) ---
+SERMON_API_JWT_SECRET=          # openssl rand -hex 48
+SERMON_POSTGRES_PASSWORD=       # openssl rand -hex 24
+SERMON_REDIS_PASSWORD=          # openssl rand -hex 24
+SERMON_MINIO_ROOT_PASSWORD=     # openssl rand -hex 24
+
+# --- identities (REQUIRED) ---
+SERMON_POSTGRES_USER=sermon
+SERMON_POSTGRES_DB=sermon
+SERMON_MINIO_ROOT_USER=sermon-minio
+
+# --- public surface (REQUIRED; deploy.sh fills from the Elastic IP) ---
+# SITE_HOST is a BARE host (no scheme): the Elastic IP now; the domain later
+# (then also remove `tls internal` from infra/caddy/Caddyfile for automatic
+# Let's Encrypt, update CORS to match, `docker compose up -d caddy api`).
+SITE_HOST=
+# CORS is currently vestigial — browsers only ever talk same-origin to the
+# Next route handlers and the api is not publicly exposed; kept correct for
+# a future where the api gets its own public origin.
+SERMON_API_CORS_ORIGINS=        # JSON list, e.g. ["https://1.2.3.4"]
+
+# --- LLM for /search-summary (optional; the route 503s cleanly until set) ---
+# Provider 'google' needs GOOGLE_API_KEY; 'ppq' needs PPQ_API_KEY (both
+# intentionally UNPREFIXED — api/settings.py reads them via validation_alias).
+SERMON_API_LLM_PROVIDER=google
+SERMON_API_LLM_MODEL=
+GOOGLE_API_KEY=
+PPQ_API_KEY=
diff --git a/infra/scripts/prewarm_models.py b/infra/scripts/prewarm_models.py
new file mode 100644
index 0000000..1050629
--- /dev/null
+++ b/infra/scripts/prewarm_models.py
@@ -0,0 +1,40 @@
+"""One-shot model prewarm for the shared HuggingFace cache volume.
+
+Runs as the `prewarm` service in infra/docker-compose.prod.yml (api image,
+HF_HUB_OFFLINE=0) BEFORE the api/worker start. Downloads the three
+inference models into HF_HOME (the `sermon-hf-cache` volume) so the runtime
+containers — which run with HF_HUB_OFFLINE=1 — load deterministically with
+zero network and never pay a multi-GB download on a user's first request.
+
+Model ids MUST stay in sync with the in-process loaders:
+  - BAAI/bge-large-en-v1.5            worker/embedding.py (+ chunking.py via
+                                      llama-index; same on-disk hub snapshot)
+  - cross-encoder/ms-marco-MiniLM-L-6-v2   api/rerank.py
+  - BAAI/bge-m3                       api/highlight.py
+
+Idempotent: warm cache entries are revalidated, not re-downloaded. Exits
+non-zero on any failure so deploy.sh aborts before flipping traffic.
+"""
+
+import time
+
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+MODELS: list[tuple[str, type[CrossEncoder] | type[SentenceTransformer]]] = [
+    ("BAAI/bge-large-en-v1.5", SentenceTransformer),
+    ("cross-encoder/ms-marco-MiniLM-L-6-v2", CrossEncoder),
+    ("BAAI/bge-m3", SentenceTransformer),
+]
+
+
+def main() -> None:
+    for name, loader in MODELS:
+        start = time.monotonic()
+        print(f"prewarm: loading {name} …", flush=True)
+        loader(name, device="cpu")
+        print(f"prewarm: {name} ready in {time.monotonic() - start:.1f}s", flush=True)
+    print("prewarm: all models cached", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/web/.dockerignore b/web/.dockerignore
new file mode 100644
index 0000000..937ee7a
--- /dev/null
+++ b/web/.dockerignore
@@ -0,0 +1,5 @@
+# Build-context filter for the web image (context = web/).
+node_modules
+.next
+.env
+.env.*
diff --git a/web/Dockerfile b/web/Dockerfile
new file mode 100644
index 0000000..5cb94fe
--- /dev/null
+++ b/web/Dockerfile
@@ -0,0 +1,38 @@
+# web/ — Next.js 15 frontend. BUILD FROM web/:
+#
+#   docker build web/
+#
+# web/ is fully independent (no Python imports; talks to api/ over HTTP
+# only), so its context is just this directory.
+#
+# Uses the `output: "standalone"` bundle from next.config.ts: the runner
+# stage ships only .next/standalone + .next/static and starts with
+# `node server.js` (NOT `next start`), keeping the image small.
+#
+# API_BASE_URL is a RUNTIME env var (read via server-only lib/config.ts,
+# never inlined at build), so one image works against any api origin —
+# do not pass it as a build ARG and do not invent a NEXT_PUBLIC_ variant
+# (that would leak the API origin into client JS and break the auth model).
+
+FROM node:20-slim AS builder
+# pnpm 9 pinned explicitly: package.json deliberately has NO packageManager
+# field (it conflicts with CI's pnpm/action-setup@v6 version pin — web/AGENTS.md).
+RUN npm install -g pnpm@9
+WORKDIR /app
+COPY package.json pnpm-lock.yaml ./
+RUN pnpm install --frozen-lockfile
+COPY . .
+RUN pnpm build
+
+FROM node:20-slim
+# NODE_ENV=production is load-bearing (lib/session.ts gates the session
+# cookie's Secure attribute on it); HOSTNAME=0.0.0.0 makes the standalone
+# server bind beyond the container loopback.
+ENV NODE_ENV=production PORT=3000 HOSTNAME=0.0.0.0
+WORKDIR /app
+# Never run the server as root: hand the bundle to the image's built-in
+# unprivileged `node` user (so it can still write under .next/ if needed).
+COPY --from=builder --chown=node:node /app/.next/standalone ./
+COPY --from=builder --chown=node:node /app/.next/static ./.next/static
+USER node
+EXPOSE 3000
+CMD ["node", "server.js"]
diff --git a/web/next.config.ts b/web/next.config.ts
index 2f6491c..f1122cc 100644
--- a/web/next.config.ts
+++ b/web/next.config.ts
@@ -2,6 +2,10 @@ import type { NextConfig } from "next";
 
 const nextConfig: NextConfig = {
   reactStrictMode: true,
+  // Self-contained server bundle at .next/standalone for the Docker image
+  // (web/Dockerfile runs `node server.js`, not `next start`). Dev (`next dev`)
+  // and CI (`tsc`/`biome`/`vitest`, no build) are unaffected.
+  output: "standalone",
 };
 
 export default nextConfig;
diff --git a/worker/Dockerfile b/worker/Dockerfile
new file mode 100644
index 0000000..0de1416
--- /dev/null
+++ b/worker/Dockerfile
@@ -0,0 +1,57 @@
+# worker/ — Celery ingestion worker. BUILD FROM THE REPO ROOT (kept
+# symmetric with api/Dockerfile; the worker itself has no upward deps):
+#
+#   docker build -f worker/Dockerfile .
+#
+# The same image serves three roles in infra/docker-compose.prod.yml:
+#   1. the long-running Celery worker (default CMD below),
+#   2. the one-shot Alembic migration  (`command: alembic upgrade head`),
+#   3. the one-shot Milvus bootstrap   (`command: python scripts/bootstrap_milvus.py`).
+# All three need cwd == /app/worker because the project is intentionally
+# non-packaged ([tool.uv] package = false): `from extractors import …`,
+# `import dedup`, `celery -A celery_app` all resolve off the working dir.
+#
+# BGE-Large (~1.3GB) is NOT baked — it lives in the shared `sermon-hf-cache`
+# volume (see api/Dockerfile rationale). NLTK WordNet (~40MB) IS baked, so
+# the first dedup signature() never does a runtime download.
+
+FROM python:3.12-slim AS deps
+COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv
+ENV UV_PYTHON_DOWNLOADS=never \
+    UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1
+WORKDIR /app/worker
+COPY worker/pyproject.toml worker/uv.lock ./
+RUN uv sync --frozen --no-dev
+
+FROM python:3.12-slim
+# pandoc:    EPUB extraction shells out via pypandoc (extractors/epub.py).
+# libmagic1: MIME sniffing via python-magic (extractors/extract.py) — NOT
+#            preinstalled on -slim images.
+# libgomp1:  OpenMP runtime for torch-CPU / sentence-transformers wheels.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends pandoc libmagic1 libgomp1 ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=deps /app/worker/.venv /app/worker/.venv
+COPY worker/ /app/worker/
+ENV PATH="/app/worker/.venv/bin:$PATH" \
+    PYTHONUNBUFFERED=1 \
+    NLTK_DATA=/usr/share/nltk_data
+# Bake WordNet (+ omw-1.4, which newer NLTK wants for some WordNet ops) so
+# dedup.signature() never downloads at runtime. The downloader leaves zips
+# that nltk.data.find('corpora/wordnet') can NOT resolve (verified against
+# this image) — extract and drop them, or the first signature() call tries
+# a network download anyway.
+RUN python -m nltk.downloader -d /usr/share/nltk_data wordnet omw-1.4 \
+    && python -c "import zipfile, pathlib; \
+[(zipfile.ZipFile(z).extractall(z.parent), z.unlink()) \
+ for z in pathlib.Path('/usr/share/nltk_data/corpora').glob('*.zip')]"
+WORKDIR /app/worker
+# --concurrency=1: each prefork child that ingests a new book loads BGE-Large
+# up to TWICE (chunking boundary-embedder + chunk embedder) ≈ 3-4GB resident;
+# the shared 16GB box can't afford parallel ingests alongside the api's models.
+# --time-limit: hard kill any single ingest at 1h (soft at 55min) so one
+# poisoned/oversized upload can't pin the CPU forever (celery_app.py already
+# sets acks_late + reject_on_worker_lost, so a killed task is not silently lost).
+CMD ["celery", "-A", "celery_app", "worker", "--loglevel=info", \
+     "--concurrency=1", "--time-limit=3600", "--soft-time-limit=3300"]