sovITxyz · sovITxyz · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,18 @@
+# Build-context filter for the api/ and worker/ images (context = repo root,
+# because api/ imports worker.db — see api/Dockerfile). web/ builds from its
+# own context (web/) with its own .dockerignore.
+.git
+.github
+.claude
+docs
+infra
+web
+**/.venv
+**/__pycache__
+**/.pytest_cache
+**/.ruff_cache
+**/.mypy_cache
+**/node_modules
+**/tests/samples
+# Patterns are anchored at the context root, so a bare `.env` would miss
+# api/.env and worker/.env; `**/` matches at any depth, including the root.
+**/.env
+**/.env.*
diff --git a/api/Dockerfile b/api/Dockerfile
@@ -0,0 +1,51 @@
+# api/ — FastAPI HTTP layer. BUILD FROM THE REPO ROOT:
+#
+#   docker build -f api/Dockerfile .
+#
+# The context must be the repo root because api/ imports `db`, `embedding`,
+# `retrieval`, and `scripts.bootstrap_milvus` from ../worker at RUNTIME (the
+# one allowed cross-package import — root CLAUDE.md). A `COPY api/`-only
+# build fails at boot with ModuleNotFoundError.
+#
+# Deps install from api/uv.lock with `uv sync --frozen` so the
+# [tool.uv.sources] CPU-only torch index is honored — re-resolving (or pip)
+# would drag in ~2GB of unused CUDA wheels.
+#
+# The three inference models (BGE-Large ~1.3GB, ms-marco cross-encoder
+# ~90MB, BGE-M3 ~2.3GB) are NOT baked into this image. They live in the
+# shared `sermon-hf-cache` volume (HF_HOME), downloaded once by the
+# `prewarm` one-shot in infra/docker-compose.prod.yml; the runtime then
+# loads them lazily with HF_HUB_OFFLINE=1 so no request ever blocks on
+# (or flakes over) a HuggingFace network round-trip.
+
+# --- Stage 1 (deps): build the virtualenv at /app/api/.venv with uv. Only that
+# directory is carried into the runtime stage below.
+FROM python:3.12-slim AS deps
+COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv
+ENV UV_PYTHON_DOWNLOADS=never \
+    UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1
+WORKDIR /app/api
+# Only the manifest and lockfile are copied before the install, so editing
+# application source does not invalidate the dependency layer.
+COPY api/pyproject.toml api/uv.lock ./
+# package=false in pyproject: this installs dependencies only (no project
+# build), which is exactly what we want. --no-dev skips pytest/ruff/pyright.
+RUN uv sync --frozen --no-dev
+
+# --- Stage 2 (runtime): same base image as the deps stage, plus the venv and
+# the api/ and worker/ sources.
+# NOTE(review): there is no USER directive, so the container runs as the base
+# image's default user — confirm that is intended.
+FROM python:3.12-slim
+# libgomp1: OpenMP runtime needed by the torch-CPU / sentence-transformers
+# native wheels on slim images. ca-certificates: HTTPS to the LLM endpoint
+# (generativelanguage.googleapis.com / api.ppq.ai).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends libgomp1 ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=deps /app/api/.venv /app/api/.venv
+COPY worker/ /app/worker/
+COPY api/ /app/api/
+# PATH: the venv's bin directory comes first, so `uvicorn` in CMD resolves to
+# the venv install. PYTHONPATH: makes the modules under /app/worker importable
+# as top-level names (`db`, `embedding`, `retrieval`, `scripts.…`).
+ENV PATH="/app/api/.venv/bin:$PATH" \
+    PYTHONPATH=/app/worker \
+    PYTHONUNBUFFERED=1
+WORKDIR /app/api
+EXPOSE 8000
+# Exactly ONE uvicorn worker: each process lazily loads ~3.7GB of models, so
+# --workers N would cost N × 3.7GB on the shared box. Handlers offload
+# blocking CPU/Milvus work via asyncio.to_thread, so a single worker serves
+# concurrent requests fine; the bottleneck is model wall-time, not async I/O.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/docs/DEPLOY_AWS.md b/docs/DEPLOY_AWS.md
@@ -0,0 +1,132 @@
+# Deploying sermon.guide to AWS (v0, single box)
+
+Operator runbook for the v0 deployment: the whole stack — Postgres, Redis,
+Milvus (+etcd +MinIO), FastAPI api, Celery worker, Next.js web, Caddy TLS
+edge — on **one EC2 instance** via `infra/docker-compose.prod.yml`. This is
+the dollar-store rendering of ARCHITECTURE.md §1's "~$50/mo" target; the
+KEDA/k8s shape stays Phase 30.
+
+## TL;DR
+
+```bash
+aws configure                       # once: credentials + region
+cd infra/aws
+./provision.sh                      # EC2 t3a.xlarge + EIP + SG (~2min)
+./deploy.sh                         # clone, build, migrate, bootstrap, prewarm, smoke (~20min first run)
+                                    #   deploys your CURRENT branch — it must be pushed to origin
+                                    #   (deploy.sh preflights this and tells you if not)
+./stop.sh                           # done for the day → compute billing off
+./start.sh                          # back up on the same IP in ~3min
+./status.sh                         # state + URL + billing posture
+./destroy.sh                        # everything gone, $0
+```
+
+The site serves at `https://<elastic-ip>` with a self-signed cert (Caddy's
+internal CA) — each browser shows a certificate warning the first time;
+accept it and proceed. No domain is required; see
+[Adding a domain](#adding-a-domain-later) for when one exists.
+
+## Cost
+
+| State | What bills | ≈/mo |
+| --- | --- | --- |
+| Running 24/7 | t3a.xlarge ($0.1504/hr us-east-1) + 100GB gp3 + EIP | ~$122 |
+| Running 8h/day | compute ~⅓ + disk + EIP | ~$48 |
+| **Stopped** | 100GB gp3 (~$8) + EIP (~$3.65) | **~$12** |
+
+t3a is burstable (unlimited mode by default): sustained heavy CPU — e.g.
+hours of ingest — can accrue small credit-overage charges. That is fine for
+bursty beta use, but watch CloudWatch `CPUSurplusCreditsCharged` if you
+batch-ingest a whole library. **Set a billing alarm** (Billing → Budgets) —
+nothing in this stack does it for you.
+
+LLM spend is separate (per-query, via `GOOGLE_API_KEY`/`PPQ_API_KEY`); a
+warm `/search-summary` was ~6¢ in the Phase 14b live verify. Set a spend cap
+at the provider.
+
+## What deploy.sh actually does
+
+1. **Code** → `git clone`/`reset --hard` of the chosen branch into `/opt/sermon/app`.
+2. **Secrets** → first run generates `/opt/sermon/.env.prod` *on the box*
+   (`openssl rand`; JWT secret, Postgres/Redis/MinIO passwords). Secrets never
+   exist on a dev machine or in git. `docker-compose.prod.yml` uses `${VAR:?}`
+   so a missing secret refuses to boot rather than falling back to the
+   dev defaults baked into the code (the Phase 18 startup-guard gap, mitigated
+   at the compose layer).
+3. **LLM keys** → if `GOOGLE_API_KEY`/`PPQ_API_KEY` are set in your local
+   shell when you run `./deploy.sh`, they're forwarded into the box's env
+   file (PPQ also flips the provider). Without one, everything works except
+   `/search-summary`, which 503s naming the missing var.
+4. **Build** → all four images build on the instance (first run ~10–20min).
+5. **Bootstrap** → `alembic upgrade head`, `bootstrap_milvus.py` (both
+   idempotent), then the `prewarm` one-shot downloads the three models
+   (~3.7GB) into the shared `sermon-hf-cache` volume — the only moment the
+   stack talks to HuggingFace. Runtime containers run `HF_HUB_OFFLINE=1`.
+6. **Up + smoke** → `up -d --wait`, then an outside-in signup→login→/library
+   pass through Caddy with a cookie jar.
+
+## Security posture (read before sharing the URL)
+
+Mitigated at deploy time, no app-code changes:
+
+- **Only Caddy publishes ports** (80/443). Postgres, Redis, Milvus (which has
+  *no auth*), MinIO, etcd, api and web are compose-network-only; the security
+  group (443/80 world, 22 admin-IP) is the backstop.
+- **Strong generated secrets**; compose hard-fails if any is missing.
+- **Per-IP rate limits at Caddy**: 10/min on `/api/auth/*`, 6/min on
+  `/api/search-summary` + `/api/upload`, 600/min general (Phase 19 gap).
+- **Body caps at the edge**: 1MB on JSON routes, 210MB global (the api's own
+  200MB streamed cap is the real gate; this protects the Node proxy).
+- **Celery `--time-limit=3600`** so one poisoned upload can't pin the CPU.
+- **Secure session cookie**: web runs `NODE_ENV=production` (bakes the
+  `Secure` attribute), everything redirects to HTTPS.
+- The api is **not publicly reachable at all** — browsers only ever hit the
+  Next route handlers, which attach the JWT server-side.
+
+Accepted v0 risks (documented in the Phase 17–30 plan, revisit before real
+launch): open signup (rate-limited but no email verification/CAPTCHA),
+`task_id`-as-capability on `/tasks/{id}`, no Pydantic `extra='forbid'`,
+no graceful degradation (a Milvus blip = 500), CPU latency (warm
+`/search-summary` ≈ 2min — "user is reading, not chatting").
+
+## Day-2 operations
+
+```bash
+# logs (whole stack / one service)
+ssh -i ~/.ssh/sermon-guide.pem ubuntu@<ip>
+cd /opt/sermon/app
+docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod logs -f [api|worker|web|caddy|milvus]
+
+# redeploy after pushing changes to the branch
+./deploy.sh                      # re-runs build/migrate/bootstrap idempotently
+
+# add or rotate the LLM key later
+ssh … 'sed -i "s|^GOOGLE_API_KEY=.*|GOOGLE_API_KEY=<key>|" /opt/sermon/.env.prod'
+ssh … 'cd /opt/sermon/app && docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod up -d api'
+
+# manual backup before risky changes (Phase 28 will do this properly)
+aws ec2 create-snapshot --volume-id $(aws ec2 describe-instances \
+  --filters Name=tag:Name,Values=sermon-guide Name=instance-state-name,Values=running,stopped \
+  --query 'Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId' --output text) \
+  --description "sermon-guide manual backup"
+```
+
+## Adding a domain later
+
+1. Point an A record at the Elastic IP.
+2. On the box, edit `/opt/sermon/.env.prod`:
+   `SITE_HOST=sermon.guide` and
+   `SERMON_API_CORS_ORIGINS=["https://sermon.guide"]`.
+3. Remove the `tls internal` line from `infra/caddy/Caddyfile` (commit that
+   change) so automatic Let's Encrypt takes over.
+4. `docker compose … up -d caddy api`. Add an HSTS header in the Caddyfile
+   once the real cert is confirmed working (deliberately absent now — HSTS on
+   a self-signed IP would lock browsers out).
+
+## Known deltas vs the phase plan
+
+- This is operator tooling on branch `deploy/aws-v0`, not Phase 29/30:
+  images build on the box (no registry/CI), models live in a volume rather
+  than baked into images. When Phase 29 lands proper image-build CI, these
+  Dockerfiles are its starting point and `prewarm` becomes a build step.
+- `web/next.config.ts` gained `output: "standalone"` (required for the slim
+  web image; dev/CI behavior unchanged).
diff --git a/infra/AGENTS.md b/infra/AGENTS.md
@@ -1,14 +1,31 @@
 # infra/ — agent instructions
 
-Local-development infrastructure for sermon.guide v0. Production lives in
-k8s manifests later (see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).
+Local-development AND v0 single-box production infrastructure for
+sermon.guide. The k8s/KEDA shape stays post-v0
+(see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).
 
 ## What lives here
 
-- `docker-compose.yml` — Postgres 16, Redis 7, Milvus standalone v2.6 with its
-  required etcd + MinIO dependencies. Brought up via `make up` from repo root.
+- `docker-compose.yml` — local dev data plane: Postgres 16, Redis 7, Milvus
+  standalone v2.6 with its required etcd + MinIO dependencies. Brought up via
+  `make up` from repo root.
 - `.env.example` — template for `infra/.env` (gitignored). `make up` copies
   the example to `.env` on first run.
+- `docker-compose.prod.yml` — the v0 single-box AWS stack (data plane + api/
+  worker/web + Caddy edge). DELIBERATELY self-contained, not an overlay:
+  compose merges `ports:` additively, and "only Caddy publishes a port" is
+  the security property the file guarantees. Keep its data-plane blocks in
+  sync with `docker-compose.yml` when bumping versions. Runbook:
+  [docs/DEPLOY_AWS.md](../docs/DEPLOY_AWS.md).
+- `caddy/` — TLS edge (Dockerfile + Caddyfile: rate limits, body caps,
+  default_sni for bare-IP deploys).
+- `scripts/` — deploy-time one-shots (model prewarm into the shared HF cache).
+- `aws/` — provision/deploy/start/stop/status/destroy lifecycle scripts.
+  Tag-based and re-runnable; secrets are generated ON the instance, never
+  committed.
+- `env.prod.template` — documents `/opt/sermon/.env.prod` (generated on-box
+  by `aws/deploy.sh`). Deliberately NOT dot-env-named so repo tooling can
+  read it.
 - Future: `k8s/` Helm values + KEDA scaler config (post-v0).
 
 ## Conventions

diff --git a/infra/aws/common.sh b/infra/aws/common.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# Shared helpers for the sermon.guide AWS scripts. Source, don't execute.
+#
+# Conventions:
+#   - Everything is found by tag (or by a name derived from the tag), not by
+#     stored state: the instance, SG, EIP and key pair all carry
+#     Name/Project=sermon-guide tags, so the scripts are re-runnable from any
+#     checkout. The SG is looked up by its "${TAG_NAME}-sg" group name.
+#   - Region/profile come from the ambient AWS CLI config (aws configure /
+#     AWS_PROFILE / AWS_REGION); override per-invocation with env vars.
+
+# Strict mode. This file is meant to be sourced, so these options also take
+# effect in the script that sources it.
+set -euo pipefail
+
+# Resource naming. Each SERMON_AWS_* environment variable overrides its
+# default. TAG_NAME is assigned first because the KEY_NAME and KEY_FILE
+# defaults are derived from it.
+TAG_NAME="${SERMON_AWS_NAME:-sermon-guide}"
+KEY_NAME="${SERMON_AWS_KEY_NAME:-${TAG_NAME}}"
+KEY_FILE="${SERMON_AWS_KEY_FILE:-${HOME}/.ssh/${TAG_NAME}.pem}"
+# Remote login user used by ssh_cmd.
+SSH_USER="ubuntu"
+
+# Pass-through to the real `aws` executable. `command` skips shell-function
+# lookup, so this wrapper does not call itself recursively.
+# NOTE(review): presumably a single seam for adding global CLI flags later —
+# confirm against the calling scripts.
+aws() {
+  command aws "$@"
+}
+
+# Print the AWS region the CLI calls in these scripts will use, or an empty
+# line when none is configured. Always returns 0, so `$(region)` is safe under
+# `set -e`.
+#
+# Resolution order follows the AWS CLI's own precedence: AWS_REGION, then
+# AWS_DEFAULT_REGION, then the `region` key of the config file.
+# `aws configure get` reads the config file only, so asking it first would
+# report the file's region even when an env-var override is in effect.
+region() {
+  local from_env="${AWS_REGION:-${AWS_DEFAULT_REGION:-}}"
+  if [ -n "${from_env}" ]; then
+    echo "${from_env}"
+    return 0
+  fi
+  # `aws configure get` exits non-zero when the key is absent; fall back to an
+  # empty line instead of propagating the failure.
+  aws configure get region 2>/dev/null || echo ""
+}
+
+# Abort the calling script: write "ERROR: <args joined by spaces>" to stderr
+# and exit with status 1.
+die() {
+  printf 'ERROR: %s\n' "$*" >&2
+  exit 1
+}
+
+# Preflight for every script that talks to AWS. Dies with a specific message
+# when (1) the aws executable is not on PATH, (2) the credentials cannot
+# authenticate (sts get-caller-identity fails), or (3) no region is configured.
+require_aws() {
+  # `type -P` forces a PATH search. `command -v aws` would always succeed
+  # here because this file defines an `aws` shell function of the same name,
+  # and a missing CLI would then be misreported as a credentials problem.
+  type -P aws >/dev/null 2>&1 \
+    || die "aws CLI not found (expected on PATH, e.g. ~/.local/bin/aws)"
+  aws sts get-caller-identity >/dev/null 2>&1 \
+    || die "AWS credentials not configured — run: aws configure"
+  [ -n "$(region)" ] || die "no default region — set one via aws configure or AWS_REGION"
+}
+
+# Print the InstanceId of the most recently launched instance tagged
+# Name=$TAG_NAME that is pending, running, stopping or stopped. Prints nothing
+# when there is no match. aws errors are silenced too, so the function always
+# returns 0.
+find_instance() {
+  local -a filters=(
+    "Name=tag:Name,Values=${TAG_NAME}"
+    "Name=instance-state-name,Values=pending,running,stopping,stopped"
+  )
+  local -r jmes='sort_by(Reservations[].Instances[], &LaunchTime)[-1].InstanceId'
+  {
+    # With no match the text output is the literal "None"; drop it.
+    aws ec2 describe-instances \
+      --filters "${filters[@]}" \
+      --query "${jmes}" \
+      --output text 2>/dev/null \
+      | grep -v '^None$'
+  } || true
+}
+
+# Print the state name reported by EC2 for the instance id given as $1.
+# Unlike the find_* helpers, aws failures are not suppressed here.
+instance_state() {
+  local instance_id="$1"
+  aws ec2 describe-instances \
+    --instance-ids "${instance_id}" \
+    --query 'Reservations[0].Instances[0].State.Name' \
+    --output text
+}
+
+# Look up the Elastic IP tagged Name=$TAG_NAME. Prints one line holding the
+# allocation id and the public IP (as aws text output), or nothing when no
+# address matches. aws errors are silenced, so the function always returns 0.
+find_eip() {
+  local -r jmes='Addresses[0].[AllocationId,PublicIp]'
+  {
+    # A missing address is rendered as "None" in text output; filter it out.
+    aws ec2 describe-addresses \
+      --filters "Name=tag:Name,Values=${TAG_NAME}" \
+      --query "${jmes}" \
+      --output text 2>/dev/null \
+      | grep -v '^None'
+  } || true
+}
+
+# Print the GroupId of the security group whose group name is
+# "${TAG_NAME}-sg" (matched by name, not by tag), or nothing when none
+# exists. aws errors are silenced, so the function always returns 0.
+find_security_group() {
+  local -r sg_name="${TAG_NAME}-sg"
+  {
+    aws ec2 describe-security-groups \
+      --filters "Name=group-name,Values=${sg_name}" \
+      --query 'SecurityGroups[0].GroupId' \
+      --output text 2>/dev/null \
+      | grep -v '^None$'
+  } || true
+}
+
+# Run ssh against host $1 as $SSH_USER with the project key ($KEY_FILE).
+# Any further arguments are passed through as the remote command.
+ssh_cmd() {
+  local host="$1"
+  # StrictHostKeyChecking=accept-new is trust-on-first-use: normal for a box
+  # we created moments ago, but the very first connection is unauthenticated.
+  # Paranoid operators can pre-pin the host key from the EC2 console's system
+  # log.
+  local -a opts=(
+    -i "${KEY_FILE}"
+    -o StrictHostKeyChecking=accept-new
+    -o ConnectTimeout=10
+  )
+  ssh "${opts[@]}" "${SSH_USER}@${host}" "${@:2}"
+}
+
+# Block until `ssh_cmd <ip> true` succeeds, polling up to 40 times with a 5s
+# pause between attempts; dies once the 40th attempt has failed.
+#
+# Arguments: $1 - IP address (or hostname) to probe.
+#
+# The failure message reports the measured wait. A refused connection fails
+# at once, but a timed-out one costs ssh_cmd's ConnectTimeout (10s) on top of
+# the pause, so the total ranges from roughly 3 to 10 minutes.
+wait_for_ssh() {
+  local ip="$1"
+  local -r max_tries=40 pause=5
+  local tries=0 started="${SECONDS}"
+  echo "waiting for SSH on ${ip} …"
+  # stderr is discarded: connection errors are expected while the box boots.
+  until ssh_cmd "${ip}" true 2>/dev/null; do
+    tries=$((tries + 1))
+    [ "${tries}" -lt "${max_tries}" ] \
+      || die "SSH to ${ip} not reachable after ${tries} attempts ($((SECONDS - started))s)"
+    sleep "${pause}"
+  done
+}