Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Build-context filter for the api/ and worker/ images (context = repo root,
# because api/ imports worker.db — see api/Dockerfile). web/ builds from its
# own context (web/) with its own .dockerignore.

# VCS, CI and tooling metadata, plus trees the api/worker images never read.
.git
.github
.claude
docs
infra
web

# Local virtualenvs, caches and test fixtures at any depth.
**/.venv
**/__pycache__
**/.pytest_cache
**/.ruff_cache
**/.mypy_cache
**/node_modules
**/tests/samples

# Env files at ANY depth. A bare `.env` pattern only matches the context
# root, so api/.env or worker/.env would otherwise be copied into the image
# by `COPY api/` / `COPY worker/`.
**/.env
**/.env.*
51 changes: 51 additions & 0 deletions api/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# api/ — FastAPI HTTP layer. BUILD FROM THE REPO ROOT:
#
# docker build -f api/Dockerfile .
#
# The context must be the repo root because api/ imports `db`, `embedding`,
# `retrieval`, and `scripts.bootstrap_milvus` from ../worker at RUNTIME (the
# one allowed cross-package import — root CLAUDE.md). A `COPY api/`-only
# build fails at boot with ModuleNotFoundError.
#
# Deps install from api/uv.lock with `uv sync --frozen` so the
# [tool.uv.sources] CPU-only torch index is honored — re-resolving (or pip)
# would drag in ~2GB of unused CUDA wheels.
#
# The three inference models (BGE-Large ~1.3GB, ms-marco cross-encoder
# ~90MB, BGE-M3 ~2.3GB) are NOT baked into this image. They live in the
# shared `sermon-hf-cache` volume (HF_HOME), downloaded once by the
# `prewarm` one-shot in infra/docker-compose.prod.yml; the runtime then
# loads them lazily with HF_HUB_OFFLINE=1 so no request ever blocks on
# (or flakes over) a HuggingFace network round-trip.

# ---- Stage 1: install the locked dependencies into /app/api/.venv ----
FROM python:3.12-slim AS deps
COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv
ENV UV_PYTHON_DOWNLOADS=never \
    UV_LINK_MODE=copy \
    UV_COMPILE_BYTECODE=1
WORKDIR /app/api
# Only the manifest + lockfile are copied here, so this layer is rebuilt only
# when dependencies change, not on every source edit.
COPY api/pyproject.toml api/uv.lock ./
# package=false in pyproject: this installs dependencies only (no project
# build), which is exactly what we want. --no-dev skips pytest/ruff/pyright.
RUN uv sync --frozen --no-dev

# ---- Stage 2: runtime image (venv from stage 1 + api/ and worker/ source) ----
FROM python:3.12-slim
# libgomp1: OpenMP runtime needed by the torch-CPU / sentence-transformers
# native wheels on slim images. ca-certificates: HTTPS to the LLM endpoint
# (generativelanguage.googleapis.com / api.ppq.ai).
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgomp1 ca-certificates \
    && rm -rf /var/lib/apt/lists/*
COPY --from=deps /app/api/.venv /app/api/.venv
COPY worker/ /app/worker/
COPY api/ /app/api/
# PYTHONPATH exposes ../worker so the api's runtime imports from worker/
# resolve (see the header comment at the top of this Dockerfile).
ENV PATH="/app/api/.venv/bin:$PATH" \
    PYTHONPATH=/app/worker \
    PYTHONUNBUFFERED=1
WORKDIR /app/api
EXPOSE 8000
# Exactly ONE uvicorn worker: each process lazily loads ~3.7GB of models, so
# --workers N would cost N × 3.7GB on the shared box. Handlers offload
# blocking CPU/Milvus work via asyncio.to_thread, so a single worker serves
# concurrent requests fine; the bottleneck is model wall-time, not async I/O.
# The count is pinned explicitly because uvicorn's default is taken from
# $WEB_CONCURRENCY when that variable is set, which would silently break the
# one-worker guarantee.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
132 changes: 132 additions & 0 deletions docs/DEPLOY_AWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Deploying sermon.guide to AWS (v0, single box)

Operator runbook for the v0 deployment: the whole stack — Postgres, Redis,
Milvus (+etcd +MinIO), FastAPI api, Celery worker, Next.js web, Caddy TLS
edge — on **one EC2 instance** via `infra/docker-compose.prod.yml`. This is
the dollar-store rendering of ARCHITECTURE.md §1's "~$50/mo" target; the
KEDA/k8s shape stays Phase 30.

## TL;DR

```bash
aws configure # once: credentials + region
cd infra/aws
./provision.sh # EC2 t3a.xlarge + EIP + SG (~2min)
./deploy.sh # clone, build, migrate, bootstrap, prewarm, smoke (~20min first run)
# deploys your CURRENT branch — it must be pushed to origin
# (deploy.sh preflights this and tells you if not)
./stop.sh # done for the day → compute billing off
./start.sh # back up on the same IP in ~3min
./status.sh # state + URL + billing posture
./destroy.sh # everything gone, $0
```

The site is served at `https://<elastic-ip>` with a self-signed cert (Caddy's
internal CA), so each browser shows a certificate warning on its first visit;
click through it. No domain is required; see
[Adding a domain](#adding-a-domain-later) for when one exists.

## Cost

| State | What bills | ≈/mo |
| --- | --- | --- |
| Running 24/7 | t3a.xlarge ($0.1504/hr us-east-1) + 100GB gp3 + EIP | ~$122 |
| Running 8h/day | compute ~⅓ + disk + EIP | ~$48 |
| **Stopped** | 100GB gp3 (~$8) + EIP (~$3.65) | **~$12** |

t3a is burstable (unlimited mode by default): sustained heavy CPU — e.g.
hours of ingest — can accrue small credit-overage charges; fine for bursty
beta use, watch CloudWatch `CPUSurplusCreditCharged` if you batch-ingest a
whole library. **Set a billing alarm** (Billing → Budgets) — nothing in this
stack does it for you.

LLM spend is separate (per-query, via `GOOGLE_API_KEY`/`PPQ_API_KEY`); a
warm `/search-summary` was ~6¢ in the Phase 14b live verify. Set a spend cap
at the provider.

## What deploy.sh actually does

1. **Code** → `git clone`/`reset --hard` of the chosen branch into `/opt/sermon/app`.
2. **Secrets** → first run generates `/opt/sermon/.env.prod` *on the box*
(`openssl rand`; JWT secret, Postgres/Redis/MinIO passwords). Secrets never
exist on a dev machine or in git. `docker-compose.prod.yml` uses `${VAR:?}`
so a missing secret refuses to boot rather than falling back to the
dev defaults baked into the code (the Phase 18 startup-guard gap, mitigated
at the compose layer).
3. **LLM keys** → if `GOOGLE_API_KEY`/`PPQ_API_KEY` are set in your local
shell when you run `./deploy.sh`, they're forwarded into the box's env
file (PPQ also flips the provider). Without one, everything works except
`/search-summary`, which 503s naming the missing var.
4. **Build** → all four images build on the instance (first run ~10–20min).
5. **Bootstrap** → `alembic upgrade head`, `bootstrap_milvus.py` (both
idempotent), then the `prewarm` one-shot downloads the three models
(~3.7GB) into the shared `sermon-hf-cache` volume — the only moment the
stack talks to HuggingFace. Runtime containers run `HF_HUB_OFFLINE=1`.
6. **Up + smoke** → `up -d --wait`, then an outside-in signup→login→/library
pass through Caddy with a cookie jar.

## Security posture (read before sharing the URL)

Mitigated at deploy time, no app-code changes:

- **Only Caddy publishes ports** (80/443). Postgres/Redis/Milvus (which has
*no auth*)/MinIO/etcd/api/web are compose-network-only; the security group
(443/80 world, 22 admin-IP) is the backstop.
- **Strong generated secrets**; compose hard-fails if any is missing.
- **Per-IP rate limits at Caddy**: 10/min on `/api/auth/*`, 6/min on
`/api/search-summary` + `/api/upload`, 600/min general (Phase 19 gap).
- **Body caps at the edge**: 1MB on JSON routes, 210MB global (the api's own
200MB streamed cap is the real gate; this protects the Node proxy).
- **Celery `--time-limit=3600`** so one poisoned upload can't pin the CPU.
- **Secure session cookie**: web runs `NODE_ENV=production` (bakes the
`Secure` attribute), everything redirects to HTTPS.
- The api is **not publicly reachable at all** — browsers only ever hit the
Next route handlers, which attach the JWT server-side.

Accepted v0 risks (documented in the Phase 17–30 plan, revisit before real
launch): open signup (rate-limited but no email verification/CAPTCHA),
`task_id`-as-capability on `/tasks/{id}`, no Pydantic `extra='forbid'`,
no graceful degradation (a Milvus blip = 500), CPU latency (warm
`/search-summary` ≈ 2min — "user is reading, not chatting").

## Day-2 operations

```bash
# logs (whole stack / one service)
ssh -i ~/.ssh/sermon-guide.pem ubuntu@<ip>
cd /opt/sermon/app
docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod logs -f [api|worker|web|caddy|milvus]

# redeploy after pushing changes to the branch
./deploy.sh # re-runs build/migrate/bootstrap idempotently

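# add or rotate the LLM key later
# (the sed below only rewrites an existing GOOGLE_API_KEY= line; if
# /opt/sermon/.env.prod has none yet, append the line instead)
ssh … 'sed -i "s|^GOOGLE_API_KEY=.*|GOOGLE_API_KEY=<key>|" /opt/sermon/.env.prod'
ssh … 'cd /opt/sermon/app && docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod up -d api'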

# manual backup before risky changes (Phase 28 will do this properly)
aws ec2 create-snapshot --volume-id $(aws ec2 describe-instances \
--filters Name=tag:Name,Values=sermon-guide Name=instance-state-name,Values=running,stopped \
--query 'Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId' --output text) \
--description "sermon-guide manual backup"
```

## Adding a domain later

1. Point an A record at the Elastic IP.
2. On the box, edit `/opt/sermon/.env.prod`:
`SITE_HOST=sermon.guide` and
`SERMON_API_CORS_ORIGINS=["https://sermon.guide"]`.
3. Remove the `tls internal` line from `infra/caddy/Caddyfile` (commit that
change) so automatic Let's Encrypt takes over.
4. `docker compose … up -d caddy api`. Add an HSTS header in the Caddyfile
once the real cert is confirmed working (deliberately absent now — HSTS on
a self-signed IP would lock browsers out).

## Known deltas vs the phase plan

- This is operator tooling on branch `deploy/aws-v0`, not Phase 29/30:
images build on the box (no registry/CI), models live in a volume rather
than baked into images. When Phase 29 lands proper image-build CI, these
Dockerfiles are its starting point and `prewarm` becomes a build step.
- `web/next.config.ts` gained `output: "standalone"` (required for the slim
web image; dev/CI behavior unchanged).
25 changes: 21 additions & 4 deletions infra/AGENTS.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,31 @@
# infra/ — agent instructions

Local-development infrastructure for sermon.guide v0. Production lives in
k8s manifests later (see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).
Local-development AND v0 single-box production infrastructure for
sermon.guide. The k8s/KEDA shape stays post-v0
(see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16).

## What lives here

- `docker-compose.yml` — Postgres 16, Redis 7, Milvus standalone v2.6 with its
required etcd + MinIO dependencies. Brought up via `make up` from repo root.
- `docker-compose.yml` — local dev data plane: Postgres 16, Redis 7, Milvus
standalone v2.6 with its required etcd + MinIO dependencies. Brought up via
`make up` from repo root.
- `.env.example` — template for `infra/.env` (gitignored). `make up` copies
the example to `.env` on first run.
- `docker-compose.prod.yml` — the v0 single-box AWS stack (data plane + api/
worker/web + Caddy edge). DELIBERATELY self-contained, not an overlay:
compose merges `ports:` additively, and "only Caddy publishes a port" is
the security property the file guarantees. Keep its data-plane blocks in
sync with `docker-compose.yml` when bumping versions. Runbook:
[docs/DEPLOY_AWS.md](../docs/DEPLOY_AWS.md).
- `caddy/` — TLS edge (Dockerfile + Caddyfile: rate limits, body caps,
default_sni for bare-IP deploys).
- `scripts/` — deploy-time one-shots (model prewarm into the shared HF cache).
- `aws/` — provision/deploy/start/stop/status/destroy lifecycle scripts.
Tag-based and re-runnable; secrets are generated ON the instance, never
committed.
- `env.prod.template` — documents `/opt/sermon/.env.prod` (generated on-box
by `aws/deploy.sh`). Deliberately NOT dot-env-named so repo tooling can
read it.
- Future: `k8s/` Helm values + KEDA scaler config (post-v0).

## Conventions
Expand Down
85 changes: 85 additions & 0 deletions infra/aws/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Shared helpers for the sermon.guide AWS scripts. Source, don't execute.
#
# Conventions:
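# - Resources are found by tag (or a tag-derived name), not by stored state:
# the instance, SG, EIP and key pair all carry Name/Project=sermon-guide
# tags; the instance and EIP are looked up by their Name tag and the SG by
# its group name "${TAG_NAME}-sg", so the scripts are re-runnable from any
# checkout.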
# - Region/profile come from the ambient AWS CLI config (aws configure /
# AWS_PROFILE / AWS_REGION); override per-invocation with env vars.

# Strict mode: exit on unhandled errors, on unset variables, and on a failure
# in any pipeline stage. This file is meant to be sourced, so these options
# also apply to the script that sources it.
set -euo pipefail

# Name used in the tag/group-name lookups below; override with SERMON_AWS_NAME.
TAG_NAME="${SERMON_AWS_NAME:-sermon-guide}"
# Defaults to TAG_NAME; override with SERMON_AWS_KEY_NAME. Not used in this
# file — presumably the EC2 key pair name consumed by the sourcing scripts.
KEY_NAME="${SERMON_AWS_KEY_NAME:-${TAG_NAME}}"
# Private key path handed to `ssh -i` by ssh_cmd; override with
# SERMON_AWS_KEY_FILE.
KEY_FILE="${SERMON_AWS_KEY_FILE:-${HOME}/.ssh/${TAG_NAME}.pem}"
# Login user that ssh_cmd connects as.
SSH_USER="ubuntu"

# Single choke point for AWS CLI calls: forwards all arguments unchanged to
# the real `aws` executable. `command` skips shell-function lookup, so this
# wrapper does not call itself recursively.
aws() {
command aws "$@"
}

# Print the AWS region to use, or nothing when none can be determined.
#
# Environment overrides (AWS_REGION, then AWS_DEFAULT_REGION) are consulted
# first. `aws configure get region` only reads the config file, so asking it
# first would report the file's region even when the operator has exported a
# per-invocation override.
#
# Always returns 0; callers test for empty output.
region() {
  local env_region="${AWS_REGION:-${AWS_DEFAULT_REGION:-}}"
  if [ -n "${env_region}" ]; then
    echo "${env_region}"
    return 0
  fi
  # A non-zero exit here just means "no region in the config file"; report
  # that as empty output rather than tripping `set -e` in the caller.
  aws configure get region 2>/dev/null || true
}

# Abort the current shell with exit status 1 after writing
# "ERROR: <message>" to stderr. The message is all arguments joined by the
# first IFS character (a space by default).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Preflight check: die with an actionable message unless the AWS CLI is
# installed, credentials are usable, and a region can be resolved.
require_aws() {
  # `type -P` searches PATH only. `command -v aws` would always succeed here
  # because this file defines an `aws` shell function, so a missing CLI used
  # to be misreported as "credentials not configured".
  type -P aws >/dev/null 2>&1 \
    || die "aws CLI not found (expected on PATH, e.g. ~/.local/bin/aws)"
  # Cheapest authenticated call: fails if credentials are absent or invalid.
  aws sts get-caller-identity >/dev/null 2>&1 \
    || die "AWS credentials not configured — run: aws configure"
  [ -n "$(region)" ] \
    || die "no default region — set one via aws configure or AWS_REGION"
}

# Print the InstanceId of the most recently launched instance tagged
# Name=$TAG_NAME that is pending, running, stopping or stopped. Prints
# nothing when there is no such instance. Always returns 0.
find_instance() {
  local -a lookup=(
    ec2 describe-instances
    --filters
    "Name=tag:Name,Values=${TAG_NAME}"
    "Name=instance-state-name,Values=pending,running,stopping,stopped"
    --query 'sort_by(Reservations[].Instances[], &LaunchTime)[-1].InstanceId'
    --output text
  )
  # An output line that is exactly "None" is dropped so callers can test for
  # an empty string. CLI errors are deliberately silenced and therefore also
  # read as "no instance".
  aws "${lookup[@]}" 2>/dev/null | grep -v '^None$' || true
}

# Print the State.Name that describe-instances reports for one instance.
# Arguments: $1 - EC2 instance ID.
# Returns:   the exit status of the underlying aws call.
instance_state() {
  local instance_id="$1"
  aws ec2 describe-instances \
    --instance-ids "${instance_id}" \
    --query 'Reservations[0].Instances[0].State.Name' \
    --output text
}

# Look up the Elastic IP tagged Name=$TAG_NAME and print its AllocationId and
# PublicIp on one line (whitespace-separated CLI text output). Prints nothing
# when no such address exists. Always returns 0.
find_eip() {
  local -a lookup=(
    ec2 describe-addresses
    --filters "Name=tag:Name,Values=${TAG_NAME}"
    --query 'Addresses[0].[AllocationId,PublicIp]'
    --output text
  )
  # Lines starting with "None" are filtered out so "no address" reads as
  # empty output; CLI errors are silenced and look the same to callers.
  aws "${lookup[@]}" 2>/dev/null | grep -v '^None' || true
}

# Print the GroupId of the security group whose group name is
# "${TAG_NAME}-sg". Prints nothing when it does not exist. Always returns 0.
find_security_group() {
  local -a lookup=(
    ec2 describe-security-groups
    --filters "Name=group-name,Values=${TAG_NAME}-sg"
    --query 'SecurityGroups[0].GroupId'
    --output text
  )
  # A line that is exactly "None" is dropped so "no group" reads as empty
  # output; CLI errors are silenced and look the same to callers.
  aws "${lookup[@]}" 2>/dev/null | grep -v '^None$' || true
}

# Run ssh against the instance as ${SSH_USER}, authenticating with
# ${KEY_FILE}.
# Arguments: $1 - host/IP; remaining arguments are passed to ssh after the
#            destination (i.e. the remote command, if any).
# Returns:   ssh's exit status.
ssh_cmd() {
  local destination="${SSH_USER}@$1"
  # accept-new = trust-on-first-use: conventional for a box we just created
  # ourselves, but the first connect is unauthenticated — paranoid operators
  # can pre-pin the host key from the EC2 console's system log.
  local -a ssh_opts=(
    -i "${KEY_FILE}"
    -o StrictHostKeyChecking=accept-new
    -o ConnectTimeout=10
  )
  ssh "${ssh_opts[@]}" "${destination}" "${@:2}"
}

# Block until `ssh_cmd <ip> true` succeeds, retrying every 5 seconds.
# Gives up via die after the 40th failed attempt.
# Arguments: $1 - host/IP to probe.
wait_for_ssh() {
  local ip="$1"
  local attempt=0
  echo "waiting for SSH on ${ip} …"
  # ssh's own stderr is discarded: connection failures are expected while
  # the host is still coming up.
  while ! ssh_cmd "${ip}" true 2>/dev/null; do
    attempt=$((attempt + 1))
    if [ "${attempt}" -ge 40 ]; then
      die "SSH to ${ip} not reachable after ~3min"
    fi
    sleep 5
  done
}
Loading