diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..9018dd8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +# Build-context filter for the api/ and worker/ images (context = repo root, +# because api/ imports worker.db — see api/Dockerfile). web/ builds from its +# own context (web/) with its own .dockerignore. +.git +.github +.claude +docs +infra +web +**/.venv +**/__pycache__ +**/.pytest_cache +**/.ruff_cache +**/.mypy_cache +**/node_modules +**/tests/samples +.env +.env.* diff --git a/api/Dockerfile b/api/Dockerfile new file mode 100644 index 0000000..d5fdb82 --- /dev/null +++ b/api/Dockerfile @@ -0,0 +1,57 @@ +# api/ — FastAPI HTTP layer. BUILD FROM THE REPO ROOT: +# +# docker build -f api/Dockerfile . +# +# The context must be the repo root because api/ imports `db`, `embedding`, +# `retrieval`, and `scripts.bootstrap_milvus` from ../worker at RUNTIME (the +# one allowed cross-package import — root CLAUDE.md). A `COPY api/`-only +# build fails at boot with ModuleNotFoundError. +# +# Deps install from api/uv.lock with `uv sync --frozen` so the +# [tool.uv.sources] CPU-only torch index is honored — re-resolving (or pip) +# would drag in ~2GB of unused CUDA wheels. +# +# The three inference models (BGE-Large ~1.3GB, ms-marco cross-encoder +# ~90MB, BGE-M3 ~2.3GB) are NOT baked into this image. They live in the +# shared `sermon-hf-cache` volume (HF_HOME), downloaded once by the +# `prewarm` one-shot in infra/docker-compose.prod.yml; the runtime then +# loads them lazily with HF_HUB_OFFLINE=1 so no request ever blocks on +# (or flakes over) a HuggingFace network round-trip. + +FROM python:3.12-slim AS deps +COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv +ENV UV_PYTHON_DOWNLOADS=never \ + UV_LINK_MODE=copy \ + UV_COMPILE_BYTECODE=1 +WORKDIR /app/api +COPY api/pyproject.toml api/uv.lock ./ +# package=false in pyproject: this installs dependencies only (no project +# build), which is exactly what we want. 
--no-dev skips pytest/ruff/pyright. +# The cache mount keeps uv's download cache on the build host between builds +# (BuildKit-only; it is never written into an image layer), so rebuilding +# after a uv.lock bump does not re-download the torch wheels. +RUN --mount=type=cache,target=/root/.cache/uv uv sync --frozen --no-dev + +FROM python:3.12-slim +# libgomp1: OpenMP runtime needed by the torch-CPU / sentence-transformers +# native wheels on slim images. ca-certificates: HTTPS to the LLM endpoint +# (generativelanguage.googleapis.com / api.ppq.ai). +RUN apt-get update \ + && apt-get install -y --no-install-recommends libgomp1 ca-certificates \ + && rm -rf /var/lib/apt/lists/* +COPY --from=deps /app/api/.venv /app/api/.venv +COPY worker/ /app/worker/ +COPY api/ /app/api/ +ENV PATH="/app/api/.venv/bin:$PATH" \ + PYTHONPATH=/app/worker \ + PYTHONUNBUFFERED=1 +WORKDIR /app/api +EXPOSE 8000 +# NOTE(review): no USER is set, so uvicorn runs as root. Before adding a +# non-root USER, confirm it can read the shared sermon-hf-cache volume and +# write wherever uploads land — TODO confirm against the compose file. +# Exactly ONE uvicorn worker: each process lazily loads ~3.7GB of models, so +# --workers N would cost N × 3.7GB on the shared box. Handlers offload +# blocking CPU/Milvus work via asyncio.to_thread, so a single worker serves +# concurrent requests fine; the bottleneck is model wall-time, not async I/O. +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docs/DEPLOY_AWS.md b/docs/DEPLOY_AWS.md new file mode 100644 index 0000000..696a2ef --- /dev/null +++ b/docs/DEPLOY_AWS.md @@ -0,0 +1,132 @@ +# Deploying sermon.guide to AWS (v0, single box) + +Operator runbook for the v0 deployment: the whole stack — Postgres, Redis, +Milvus (+etcd +MinIO), FastAPI api, Celery worker, Next.js web, Caddy TLS +edge — on **one EC2 instance** via `infra/docker-compose.prod.yml`. This is +the dollar-store rendering of ARCHITECTURE.md §1's "~$50/mo" target; the +KEDA/k8s shape stays Phase 30. 
+ +## TL;DR + +```bash +aws configure # once: credentials + region +cd infra/aws +./provision.sh # EC2 t3a.xlarge + EIP + SG (~2min) +./deploy.sh # clone, build, migrate, bootstrap, prewarm, smoke (~20min first run) + # deploys your CURRENT branch — it must be pushed to origin + # (deploy.sh preflights this and tells you if not) +./stop.sh # done for the day → compute billing off +./start.sh # back up on the same IP in ~3min +./status.sh # state + URL + billing posture +./destroy.sh # everything gone, $0 +``` + +The site serves at `https://<elastic-ip>` with a self-signed cert (Caddy's +internal CA) — each browser warns once; proceed. No domain is +required; see [Adding a domain](#adding-a-domain-later) for the day one exists. + +## Cost + +| State | What bills | ≈/mo | +| --- | --- | --- | +| Running 24/7 | t3a.xlarge ($0.1504/hr us-east-1) + 100GB gp3 + EIP | ~$122 | +| Running 8h/day | compute ~⅓ + disk + EIP | ~$48 | +| **Stopped** | 100GB gp3 (~$8) + EIP (~$3.65) | **~$12** | + +t3a is burstable (unlimited mode by default): sustained heavy CPU — e.g. +hours of ingest — can accrue small credit-overage charges; fine for bursty +beta use, watch CloudWatch `CPUSurplusCreditCharged` if you batch-ingest a +whole library. **Set a billing alarm** (Billing → Budgets) — nothing in this +stack does it for you. + +LLM spend is separate (per-query, via `GOOGLE_API_KEY`/`PPQ_API_KEY`); a +warm `/search-summary` was ~6¢ in the Phase 14b live verify. Set a spend cap +at the provider. + +## What deploy.sh actually does + +1. **Code** → `git clone`/`reset --hard` of the chosen branch into `/opt/sermon/app`. +2. **Secrets** → first run generates `/opt/sermon/.env.prod` *on the box* + (`openssl rand`; JWT secret, Postgres/Redis/MinIO passwords). Secrets never + exist on a dev machine or in git. 
`docker-compose.prod.yml` uses `${VAR:?}` + so a missing secret refuses to boot rather than falling back to the + dev defaults baked into the code (the Phase 18 startup-guard gap, mitigated + at the compose layer). +3. **LLM keys** → if `GOOGLE_API_KEY`/`PPQ_API_KEY` are set in your local + shell when you run `./deploy.sh`, they're forwarded into the box's env + file (PPQ also flips the provider). Without one, everything works except + `/search-summary`, which 503s naming the missing var. +4. **Build** → all four images build on the instance (first run ~10–20min). +5. **Bootstrap** → `alembic upgrade head`, `bootstrap_milvus.py` (both + idempotent), then the `prewarm` one-shot downloads the three models + (~3.7GB) into the shared `sermon-hf-cache` volume — the only moment the + stack talks to HuggingFace. Runtime containers run `HF_HUB_OFFLINE=1`. +6. **Up + smoke** → `up -d --wait`, then an outside-in signup→login→/library + pass through Caddy with a cookie jar. + +## Security posture (read before sharing the URL) + +Mitigated at deploy time, no app-code changes: + +- **Only Caddy publishes ports** (80/443). Postgres/Redis/Milvus (which has + *no auth*)/MinIO/etcd/api/web are compose-network-only; the security group + (443/80 world, 22 admin-IP) is the backstop. +- **Strong generated secrets**; compose hard-fails if any is missing. +- **Per-IP rate limits at Caddy**: 10/min on `/api/auth/*`, 6/min on + `/api/search-summary` + `/api/upload`, 600/min general (Phase 19 gap). +- **Body caps at the edge**: 1MB on JSON routes, 200MB global (just under the + api's 200MiB streamed cap, so the Node proxy never buffers oversized uploads). +- **Celery `--time-limit=3600`** so one poisoned upload can't pin the CPU. +- **Secure session cookie**: web runs `NODE_ENV=production` (bakes the + `Secure` attribute), everything redirects to HTTPS. +- The api is **not publicly reachable at all** — browsers only ever hit the + Next route handlers, which attach the JWT server-side. 
+ +Accepted v0 risks (documented in the Phase 17–30 plan, revisit before real +launch): open signup (rate-limited but no email verification/CAPTCHA), +`task_id`-as-capability on `/tasks/{id}`, no Pydantic `extra='forbid'`, +no graceful degradation (a Milvus blip = 500), CPU latency (warm +`/search-summary` ≈ 2min — "user is reading, not chatting"). + +## Day-2 operations + +```bash +# logs (whole stack / one service) +ssh -i ~/.ssh/sermon-guide.pem ubuntu@<elastic-ip> +cd /opt/sermon/app +docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod logs -f [api|worker|web|caddy|milvus] + +# redeploy after pushing changes to the branch +./deploy.sh # re-runs build/migrate/bootstrap idempotently + +# add or rotate the LLM key later +ssh … 'sed -i "s|^GOOGLE_API_KEY=.*|GOOGLE_API_KEY=<new-key>|" /opt/sermon/.env.prod' +ssh … 'cd /opt/sermon/app && docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod up -d api' + +# manual backup before risky changes (Phase 28 will do this properly) +aws ec2 create-snapshot --volume-id $(aws ec2 describe-instances \ + --filters Name=tag:Name,Values=sermon-guide Name=instance-state-name,Values=running,stopped \ + --query 'Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId' --output text) \ + --description "sermon-guide manual backup" +``` + +## Adding a domain later + +1. Point an A record at the Elastic IP. +2. On the box, edit `/opt/sermon/.env.prod`: + `SITE_HOST=sermon.guide` and + `SERMON_API_CORS_ORIGINS=["https://sermon.guide"]`. +3. Remove the `tls internal` line from `infra/caddy/Caddyfile` (commit that + change) so automatic Let's Encrypt takes over. +4. `docker compose … up -d caddy api`. Add an HSTS header in the Caddyfile + once the real cert is confirmed working (deliberately absent now — HSTS on + a self-signed IP would lock browsers out). 
+ +## Known deltas vs the phase plan + +- This is operator tooling on branch `deploy/aws-v0`, not Phase 29/30: + images build on the box (no registry/CI), models live in a volume rather + than baked into images. When Phase 29 lands proper image-build CI, these + Dockerfiles are its starting point and `prewarm` becomes a build step. +- `web/next.config.ts` gained `output: "standalone"` (required for the slim + web image; dev/CI behavior unchanged). diff --git a/infra/AGENTS.md b/infra/AGENTS.md index df24e73..2d1b84c 100644 --- a/infra/AGENTS.md +++ b/infra/AGENTS.md @@ -1,14 +1,31 @@ # infra/ — agent instructions -Local-development infrastructure for sermon.guide v0. Production lives in -k8s manifests later (see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16). +Local-development AND v0 single-box production infrastructure for +sermon.guide. The k8s/KEDA shape stays post-v0 +(see [docs/PHASES.md](../docs/PHASES.md), Beyond Phase 16). ## What lives here -- `docker-compose.yml` — Postgres 16, Redis 7, Milvus standalone v2.6 with its - required etcd + MinIO dependencies. Brought up via `make up` from repo root. +- `docker-compose.yml` — local dev data plane: Postgres 16, Redis 7, Milvus + standalone v2.6 with its required etcd + MinIO dependencies. Brought up via + `make up` from repo root. - `.env.example` — template for `infra/.env` (gitignored). `make up` copies the example to `.env` on first run. +- `docker-compose.prod.yml` — the v0 single-box AWS stack (data plane + api/ + worker/web + Caddy edge). DELIBERATELY self-contained, not an overlay: + compose merges `ports:` additively, and "only Caddy publishes a port" is + the security property the file guarantees. Keep its data-plane blocks in + sync with `docker-compose.yml` when bumping versions. Runbook: + [docs/DEPLOY_AWS.md](../docs/DEPLOY_AWS.md). +- `caddy/` — TLS edge (Dockerfile + Caddyfile: rate limits, body caps, + default_sni for bare-IP deploys). 
+- `scripts/` — deploy-time one-shots (model prewarm into the shared HF cache). +- `aws/` — provision/deploy/start/stop/status/destroy lifecycle scripts. + Tag-based and re-runnable; secrets are generated ON the instance, never + committed. +- `env.prod.template` — documents `/opt/sermon/.env.prod` (generated on-box + by `aws/deploy.sh`). Deliberately NOT dot-env-named so repo tooling can + read it. - Future: `k8s/` Helm values + KEDA scaler config (post-v0). ## Conventions diff --git a/infra/aws/common.sh b/infra/aws/common.sh new file mode 100755 index 0000000..b2e3653 --- /dev/null +++ b/infra/aws/common.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# Shared helpers for the sermon.guide AWS scripts. Source, don't execute. +# +# Conventions: +# - Everything is found by tag, not by stored state: the instance, SG, EIP +# and key pair all carry Name/Project=sermon-guide tags, so the scripts +# are re-runnable from any checkout. +# - Region/profile come from the ambient AWS CLI config (aws configure / +# AWS_PROFILE / AWS_REGION); override per-invocation with env vars. + +set -euo pipefail + +TAG_NAME="${SERMON_AWS_NAME:-sermon-guide}" +KEY_NAME="${SERMON_AWS_KEY_NAME:-${TAG_NAME}}" +KEY_FILE="${SERMON_AWS_KEY_FILE:-${HOME}/.ssh/${TAG_NAME}.pem}" +SSH_USER="ubuntu" + +aws() { + command aws "$@" +} + +region() { + aws configure get region 2>/dev/null || echo "${AWS_REGION:-${AWS_DEFAULT_REGION:-}}" +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_aws() { + command -v aws >/dev/null 2>&1 || die "aws CLI not found (expected on PATH, e.g. ~/.local/bin/aws)" + aws sts get-caller-identity >/dev/null 2>&1 \ + || die "AWS credentials not configured — run: aws configure" + [ -n "$(region)" ] || die "no default region — set one via aws configure or AWS_REGION" +} + +# Newest non-terminated instance tagged Name=$TAG_NAME; empty if none. 
+find_instance() { + aws ec2 describe-instances \ + --filters "Name=tag:Name,Values=${TAG_NAME}" \ + "Name=instance-state-name,Values=pending,running,stopping,stopped" \ + --query 'sort_by(Reservations[].Instances[], &LaunchTime)[-1].InstanceId' \ + --output text 2>/dev/null | grep -v '^None$' || true +} + +instance_state() { + aws ec2 describe-instances --instance-ids "$1" \ + --query 'Reservations[0].Instances[0].State.Name' --output text +} + +# Elastic IP tagged Name=$TAG_NAME; prints "ALLOC_ID IP" or nothing. +find_eip() { + aws ec2 describe-addresses \ + --filters "Name=tag:Name,Values=${TAG_NAME}" \ + --query 'Addresses[0].[AllocationId,PublicIp]' --output text 2>/dev/null \ + | grep -v '^None' || true +} + +find_security_group() { + aws ec2 describe-security-groups \ + --filters "Name=group-name,Values=${TAG_NAME}-sg" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null \ + | grep -v '^None$' || true +} + +ssh_cmd() { + # accept-new = trust-on-first-use: conventional for a box we just created + # ourselves, but the first connect is unauthenticated — paranoid operators + # can pre-pin the host key from the EC2 console's system log. + ssh -i "${KEY_FILE}" \ + -o StrictHostKeyChecking=accept-new \ + -o ConnectTimeout=10 \ + "${SSH_USER}@$1" "${@:2}" +} + +wait_for_ssh() { + local ip="$1" tries=0 + echo "waiting for SSH on ${ip} …" + until ssh_cmd "${ip}" true 2>/dev/null; do + tries=$((tries + 1)) + [ "${tries}" -lt 40 ] || die "SSH to ${ip} not reachable after ~3min" + sleep 5 + done +} diff --git a/infra/aws/deploy.sh b/infra/aws/deploy.sh new file mode 100755 index 0000000..8cfba26 --- /dev/null +++ b/infra/aws/deploy.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +# Deploy (or re-deploy) sermon.guide onto the provisioned EC2 box. +# +# ./deploy.sh # deploys the branch you're currently on +# BRANCH=main ./deploy.sh # deploy a specific branch +# +# What it does, in order (every step idempotent, safe to re-run): +# 1. 
clone/pull the repo on the instance (/opt/sermon/app) +# 2. first run only: generate /opt/sermon/.env.prod — strong secrets are +# created ON the box with openssl and never leave it +# 3. forward GOOGLE_API_KEY / PPQ_API_KEY from the local env if set +# (so the LLM key never lands in git or chat either). Keys are STICKY: +# once set on the box they persist across deploys until you edit +# /opt/sermon/.env.prod by hand — running deploy.sh without the var +# exported leaves the old key (and provider) in place. +# 4. docker compose build (first build ~10-20min: torch wheels, Next build, +# xcaddy compile) +# 5. up the data plane, run one-shots: migrate → bootstrap-milvus → prewarm +# (prewarm downloads ~3.7GB of models into the hf-cache volume once) +# 6. up everything, then smoke-test from the OUTSIDE (signup→login→library +# through Caddy with a cookie jar) + +. "$(dirname "$0")/common.sh" + +REPO_URL="${REPO_URL:-https://github.com/sovITxyz/sermon.guide.git}" +BRANCH="${BRANCH:-$(git -C "$(dirname "$0")/../.." rev-parse --abbrev-ref HEAD 2>/dev/null || echo main)}" + +# Preflight: the box clones BRANCH from origin over anonymous HTTPS. If the +# branch isn't pushed, the remote `git clone --branch` fails with a cryptic +# exit-128 mid-SSH — catch it here with an actionable message instead. 
+GIT_TERMINAL_PROMPT=0 git ls-remote --exit-code --heads "${REPO_URL}" "refs/heads/${BRANCH}" >/dev/null 2>&1 \ + || die "branch '${BRANCH}' is not on origin (${REPO_URL}) — push it first: git push -u origin ${BRANCH}" + +require_aws + +instance_id="$(find_instance)" +[ -n "${instance_id}" ] || die "no instance found — run provision.sh first" +state="$(instance_state "${instance_id}")" +[ "${state}" = "running" ] || die "instance ${instance_id} is ${state} — run start.sh first" + +eip_info="$(find_eip)" +[ -n "${eip_info}" ] || die "no Elastic IP tagged ${TAG_NAME}" +ip="$(echo "${eip_info}" | awk '{print $2}')" + +echo "deploying branch '${BRANCH}' to ${instance_id} @ ${ip}" +wait_for_ssh "${ip}" + +# Wait for cloud-init (Docker install) on a fresh box. +ssh_cmd "${ip}" "cloud-init status --wait >/dev/null 2>&1 || true" +ssh_cmd "${ip}" "command -v docker >/dev/null" || die "Docker missing on instance — cloud-init failed? check /var/log/cloud-init-output.log" + +# Optional LLM keys forwarded from the local environment (never stored in git). +llm_env="" +[ -n "${GOOGLE_API_KEY:-}" ] && llm_env="GOOGLE_API_KEY=${GOOGLE_API_KEY}" +[ -n "${PPQ_API_KEY:-}" ] && llm_env="${llm_env} PPQ_API_KEY=${PPQ_API_KEY}" + +ssh_cmd "${ip}" "bash -s" < /opt/sermon/.env.prod < /opt/sermon/.env.prod.new ) + mv /opt/sermon/.env.prod.new /opt/sermon/.env.prod +} +if [ -n "\${GOOGLE_API_KEY:-}" ]; then + set_kv GOOGLE_API_KEY "\${GOOGLE_API_KEY}" + echo "GOOGLE_API_KEY updated" +fi +if [ -n "\${PPQ_API_KEY:-}" ]; then + set_kv PPQ_API_KEY "\${PPQ_API_KEY}" + set_kv SERMON_API_LLM_PROVIDER ppq + echo "PPQ_API_KEY updated (provider → ppq)" +fi + +cd /opt/sermon/app +compose() { + docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod "\$@" +} + +# --- 4. build --- +compose build + +# --- 5. 
data plane up → one-shots --- +# NOTE: every 'compose run' below MUST have stdin redirected (&2 + exit 1 + fi + echo "bootstrap-milvus attempt \${attempt} failed (cold Milvus gRPC?) — retrying in 10s" + sleep 10 +done + +# Prewarm only when the heaviest model isn't cached yet (~3.7GB once). +if ! docker run --rm -v sermon_sermon-hf-cache:/hf-cache alpine:3.20 \ + test -d /hf-cache/hub/models--BAAI--bge-m3 2>/dev/null; then + compose run --rm prewarm --description sermon-pre-destroy + +. "$(dirname "$0")/common.sh" + +require_aws +instance_id="$(find_instance)" +eip_info="$(find_eip)" +sg_id="$(find_security_group)" + +echo "will destroy:" +echo " instance : ${instance_id:-none} (+ its EBS volume — ALL DATA)" +echo " eip : ${eip_info:-none}" +echo " sg : ${sg_id:-none}" +printf 'type "destroy sermon-guide" to confirm: ' +read -r confirm +[ "${confirm}" = "destroy sermon-guide" ] || die "aborted" + +if [ -n "${instance_id}" ]; then + aws ec2 terminate-instances --instance-ids "${instance_id}" >/dev/null + echo "terminating ${instance_id} …" + aws ec2 wait instance-terminated --instance-ids "${instance_id}" +fi + +if [ -n "${eip_info}" ]; then + alloc_id="$(echo "${eip_info}" | awk '{print $1}')" + aws ec2 release-address --allocation-id "${alloc_id}" + echo "released EIP" +fi + +if [ -n "${sg_id}" ]; then + aws ec2 delete-security-group --group-id "${sg_id}" + echo "deleted security group" +fi + +if [ "${1:-}" = "--delete-key" ]; then + aws ec2 delete-key-pair --key-name "${KEY_NAME}" + rm -f "${KEY_FILE}" + echo "deleted key pair + ${KEY_FILE}" +fi + +echo "destroyed ✓ (billing for this stack is now \$0)" diff --git a/infra/aws/provision.sh b/infra/aws/provision.sh new file mode 100755 index 0000000..f7f8f18 --- /dev/null +++ b/infra/aws/provision.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Provision the sermon.guide EC2 box: security group (443/80 world, 22 admin +# IP only), key pair, Ubuntu 24.04 instance with Docker via user-data, and an +# Elastic IP (so 
stop/start keeps the same address — stopped instances bill +# only EBS ~$8/mo + EIP ~$3.65/mo, not compute). +# +# Idempotent-ish: re-running finds the tagged instance and exits with info. +# +# ./provision.sh # t3a.xlarge, 100GB gp3, current region +# INSTANCE_TYPE=t3a.large ./provision.sh # override sizing +# SSH_CIDR=1.2.3.4/32 ./provision.sh # override admin-SSH source + +. "$(dirname "$0")/common.sh" + +INSTANCE_TYPE="${INSTANCE_TYPE:-t3a.xlarge}" +VOLUME_GB="${VOLUME_GB:-100}" + +require_aws +REGION="$(region)" +echo "region: ${REGION} type: ${INSTANCE_TYPE} disk: ${VOLUME_GB}GB gp3" + +existing="$(find_instance)" +if [ -n "${existing}" ]; then + state="$(instance_state "${existing}")" + eip_info="$(find_eip)" + echo "already provisioned: ${existing} (${state}) eip: ${eip_info:-none}" + echo "use deploy.sh / start.sh / stop.sh / destroy.sh" + exit 0 +fi + +# --- default VPC --- +vpc_id="$(aws ec2 describe-vpcs --filters Name=isDefault,Values=true \ + --query 'Vpcs[0].VpcId' --output text)" +if [ "${vpc_id}" = "None" ] || [ -z "${vpc_id}" ]; then + echo "no default VPC — creating one" + vpc_id="$(aws ec2 create-default-vpc --query 'Vpc.VpcId' --output text)" +fi +echo "vpc: ${vpc_id}" + +# --- security group: 80/443 world, 22 admin only --- +# Admin IP is resolved BEFORE the SG exists, and rules are (re)applied even +# when the SG already exists — so a half-created SG from an aborted earlier +# run self-heals instead of silently shipping with no ingress (SSH lockout). 
+if [ -z "${SSH_CIDR:-}" ]; then + my_ip="$(curl -fsS https://checkip.amazonaws.com || true)" + [ -n "${my_ip}" ] || die "could not determine admin IP — set SSH_CIDR=x.x.x.x/32 explicitly" + SSH_CIDR="${my_ip}/32" +fi + +sg_id="$(find_security_group)" +if [ -z "${sg_id}" ]; then + sg_id="$(aws ec2 create-security-group \ + --group-name "${TAG_NAME}-sg" \ + --description "sermon.guide single-box: 443/80 public, 22 admin" \ + --vpc-id "${vpc_id}" \ + --tag-specifications "ResourceType=security-group,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \ + --query 'GroupId' --output text)" +fi +echo "sg: ${sg_id} ssh from: ${SSH_CIDR}" +for perm in \ + "IpProtocol=tcp,FromPort=80,ToPort=80,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \ + "IpProtocol=tcp,FromPort=443,ToPort=443,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \ + "IpProtocol=udp,FromPort=443,ToPort=443,IpRanges=[{CidrIp=0.0.0.0/0}],Ipv6Ranges=[{CidrIpv6=::/0}]" \ + "IpProtocol=tcp,FromPort=22,ToPort=22,IpRanges=[{CidrIp=${SSH_CIDR}}]"; do + out="$(aws ec2 authorize-security-group-ingress --group-id "${sg_id}" \ + --ip-permissions "${perm}" 2>&1)" \ + || { echo "${out}" | grep -q InvalidPermission.Duplicate || die "SG ingress failed: ${out}"; } +done + +# --- key pair --- +if ! aws ec2 describe-key-pairs --key-names "${KEY_NAME}" >/dev/null 2>&1; then + echo "creating key pair ${KEY_NAME} → ${KEY_FILE}" + mkdir -p "$(dirname "${KEY_FILE}")" + aws ec2 create-key-pair --key-name "${KEY_NAME}" \ + --key-type ed25519 \ + --tag-specifications "ResourceType=key-pair,Tags=[{Key=Project,Value=${TAG_NAME}}]" \ + --query 'KeyMaterial' --output text > "${KEY_FILE}" + chmod 600 "${KEY_FILE}" +elif [ ! 
-f "${KEY_FILE}" ]; then + die "key pair ${KEY_NAME} exists in AWS but ${KEY_FILE} is missing locally — delete the AWS key pair or set SERMON_AWS_KEY_FILE" +fi + +# --- AMI: latest Ubuntu 24.04 LTS amd64 --- +ami_id="$(aws ssm get-parameters \ + --names /aws/service/canonical/ubuntu/server/24.04/stable/current/amd64/hvm/ebs-gp3/ami-id \ + --query 'Parameters[0].Value' --output text)" +echo "ami: ${ami_id}" + +# --- user-data: Docker engine + compose v2 (official repo; distro packages +# are too old for the compose features the prod file uses) --- +user_data="$(mktemp)" +trap 'rm -f "${user_data}"' EXIT +cat > "${user_data}" <<'CLOUDINIT' +#!/bin/bash +set -euxo pipefail +apt-get update +apt-get install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \ +https://download.docker.com/linux/ubuntu $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" \ + > /etc/apt/sources.list.d/docker.list +apt-get update +apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +usermod -aG docker ubuntu +mkdir -p /opt/sermon +chown ubuntu:ubuntu /opt/sermon +CLOUDINIT + +# --- launch --- +instance_id="$(aws ec2 run-instances \ + --image-id "${ami_id}" \ + --instance-type "${INSTANCE_TYPE}" \ + --key-name "${KEY_NAME}" \ + --security-group-ids "${sg_id}" \ + --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_GB},VolumeType=gp3,DeleteOnTermination=true}" \ + --metadata-options "HttpEndpoint=enabled,HttpTokens=required" \ + --instance-initiated-shutdown-behavior stop \ + --user-data "file://${user_data}" \ + --tag-specifications \ + "ResourceType=instance,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \ + "ResourceType=volume,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \ + --query 'Instances[0].InstanceId' --output text)" +echo "instance: ${instance_id} — waiting for running state" +aws ec2 wait instance-running --instance-ids "${instance_id}" + +# --- elastic IP (survives stop/start) --- +eip_info="$(find_eip)" +if [ -z "${eip_info}" ]; then + alloc_id="$(aws ec2 allocate-address --domain vpc \ + --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=${TAG_NAME}},{Key=Project,Value=${TAG_NAME}}]" \ + --query 'AllocationId' --output text)" +else + alloc_id="$(echo "${eip_info}" | awk '{print $1}')" +fi +aws ec2 associate-address --instance-id "${instance_id}" --allocation-id "${alloc_id}" >/dev/null +eip="$(aws ec2 describe-addresses --allocation-ids "${alloc_id}" --query 'Addresses[0].PublicIp' --output text)" + +echo +echo "provisioned ✓" +echo " instance : ${instance_id} (${INSTANCE_TYPE}, ${REGION})" +echo " ip : ${eip}" +echo " ssh : ssh -i ${KEY_FILE} ${SSH_USER}@${eip}" +echo +echo "cloud-init is installing Docker (~2min). 
next: ./deploy.sh" diff --git a/infra/aws/start.sh b/infra/aws/start.sh new file mode 100755 index 0000000..ef8d7c0 --- /dev/null +++ b/infra/aws/start.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Start the (stopped) sermon.guide instance. The Elastic IP and all docker +# volumes persist across stop/start; every service has restart: +# unless-stopped, so the whole stack comes back on its own (~2-3min until +# Milvus is healthy). + +. "$(dirname "$0")/common.sh" + +require_aws +instance_id="$(find_instance)" +[ -n "${instance_id}" ] || die "no instance found — run provision.sh" + +state="$(instance_state "${instance_id}")" +if [ "${state}" = "running" ]; then + echo "already running" +else + echo "starting ${instance_id} (was: ${state}) …" + aws ec2 start-instances --instance-ids "${instance_id}" >/dev/null + aws ec2 wait instance-running --instance-ids "${instance_id}" +fi + +# The URL is only printed when the tagged Elastic IP exists; otherwise say so +# instead of emitting a bare "https://". +eip_info="$(find_eip)" +if [ -n "${eip_info}" ]; then + ip="$(echo "${eip_info}" | awk '{print $2}')" + echo "running ✓ https://${ip} (give the stack ~2-3min; compute billing is on)" +else + echo "running ✓ (no Elastic IP tagged ${TAG_NAME} found — see ./status.sh; compute billing is on)" +fi diff --git a/infra/aws/status.sh b/infra/aws/status.sh new file mode 100755 index 0000000..7d863a1 --- /dev/null +++ b/infra/aws/status.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Show the sermon.guide instance state, address, and rough cost posture. + +. 
"$(dirname "$0")/common.sh" + +require_aws +instance_id="$(find_instance)" +if [ -z "${instance_id}" ]; then + echo "no instance provisioned (run provision.sh)" + exit 0 +fi + +aws ec2 describe-instances --instance-ids "${instance_id}" \ + --query 'Reservations[0].Instances[0].{id:InstanceId,state:State.Name,type:InstanceType,az:Placement.AvailabilityZone,launched:LaunchTime}' \ + --output table + +eip_info="$(find_eip)" +if [ -n "${eip_info}" ]; then + ip="$(echo "${eip_info}" | awk '{print $2}')" + echo "elastic ip : ${ip}" + echo "url : https://${ip}" + echo "ssh : ssh -i ${KEY_FILE} ${SSH_USER}@${ip}" +fi + +state="$(instance_state "${instance_id}")" +case "${state}" in + running) echo "billing : compute ON (~\$0.15/hr for t3a.xlarge) + EBS + EIP" ;; + stopped) echo "billing : compute OFF — EBS (~\$8/mo) + EIP (~\$3.65/mo) only" ;; + *) echo "billing : transitional (${state})" ;; +esac diff --git a/infra/aws/stop.sh b/infra/aws/stop.sh new file mode 100755 index 0000000..74150e3 --- /dev/null +++ b/infra/aws/stop.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Stop the sermon.guide instance. Compute billing stops; you keep paying only +# EBS storage (~$8/mo for 100GB gp3) + the Elastic IP (~$3.65/mo). All data +# (Postgres, Milvus, uploads, model cache) lives on the EBS volume and +# survives. start.sh brings everything back on the same IP. + +. "$(dirname "$0")/common.sh" + +require_aws +instance_id="$(find_instance)" +[ -n "${instance_id}" ] || die "no instance found" + +state="$(instance_state "${instance_id}")" +if [ "${state}" = "stopped" ]; then + echo "already stopped" + exit 0 +fi + +echo "stopping ${instance_id} …" +aws ec2 stop-instances --instance-ids "${instance_id}" >/dev/null +aws ec2 wait instance-stopped --instance-ids "${instance_id}" +echo "stopped ✓ (resting cost ≈ \$12/mo: EBS + Elastic IP. 
./start.sh to resume)"
diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile
new file mode 100644
index 0000000..f4ecc2f
--- /dev/null
+++ b/infra/caddy/Caddyfile
@@ -0,0 +1,109 @@
+# Caddy edge config for the single-box AWS deploy.
+#
+# SITE_HOST comes from the environment (set in /opt/sermon/.env.prod by
+# infra/aws/deploy.sh) and is a BARE host — an IP for now, a domain later:
+#   - IP-only deploy:  SITE_HOST=<elastic-ip>  → Caddy's internal CA
+#     self-signs for the IP (browser shows a one-time warning).
+#   - Domain later:    SITE_HOST=sermon.guide  → also REMOVE the
+#     `tls internal` line below so automatic Let's Encrypt takes over,
+#     and consider adding HSTS then (deliberately NOT set now: HSTS + a
+#     self-signed IP cert would lock browsers out with no bypass).
+#
+# default_sni is LOAD-BEARING for the IP deploy: browsers do not send SNI
+# when the URL host is an IP address (RFC 6066), and without it Caddy can't
+# pick a certificate and aborts the handshake ("tlsv1 alert internal
+# error") — verified empirically against this exact image. The internal-CA
+# IP cert is correct on this box because it has a single interface/EIP; do
+# not add on_demand_tls here (known wrong-SAN failure modes on multi-IP
+# hosts — caddy#5479).
+#
+# Only web:3000 is proxied. The api is NOT publicly exposed at all — every
+# browser call goes through the Next.js same-origin route handlers, which
+# attach the JWT server-side (web/AGENTS.md auth model). Postgres/Redis/
+# Milvus/MinIO/etcd publish no host ports (see docker-compose.prod.yml).
+
+{
+    admin off
+    # rate_limit is a non-standard directive; give it an explicit slot.
+    order rate_limit before basic_auth
+    # default_sni: cert selection when a client sends NO SNI (browsers on
+    # bare-IP URLs). fallback_sni: when a client sends an SNI that matches
+    # no cert (e.g. busybox ssl_client sends a literal-IP SNI — observed
+    # breaking the container healthcheck on the first AWS deploy).
+    default_sni {$SITE_HOST}
+    fallback_sni {$SITE_HOST}
+}
+
+https://{$SITE_HOST} {
+
+    tls internal
+
+    encode zstd gzip
+
+    # Body caps at the edge. Two request_body handlers NEST MaxBytesReaders, so
+    # the smallest matching cap wins: JSON routes are capped at 1MB even though
+    # the catch-all also matches. The catch-all is 200MB (decimal), deliberately
+    # BELOW the api's 200MiB (209,715,200-byte) streamed cap, so oversized
+    # uploads are rejected here before the Next proxy buffers them via
+    # req.formData() — the api stays authoritative for everything smaller.
+    @json_routes path /api/auth/* /api/search-summary /api/tasks/*
+    request_body @json_routes {
+        max_size 1MB
+    }
+    request_body {
+        max_size 200MB
+    }
+
+    # Per-IP rate limits (HTTP 429 + Retry-After when exceeded). Keyed on
+    # {remote_host} = the TCP peer address — Caddy ignores client-supplied
+    # X-Forwarded-For unless trusted_proxies is configured, so the buckets
+    # can't be spoofed sideways.
+    #   auth:    blunt credential stuffing / mass signup (Phase 19 gap).
+    #   heavy:   /search-summary is ~2min of CPU + a paid LLM call; /upload
+    #            triggers tens of minutes of CPU ingest. Keep them scarce.
+    #   general: per-IP flood backstop for everything else.
+    rate_limit {
+        zone auth {
+            match {
+                path /api/auth/*
+            }
+            key {remote_host}
+            events 10
+            window 1m
+        }
+        zone heavy {
+            match {
+                path /api/search-summary /api/upload
+            }
+            key {remote_host}
+            events 6
+            window 1m
+        }
+        zone general {
+            key {remote_host}
+            events 600
+            window 1m
+        }
+    }
+
+    # No response timeout overrides: /search-summary legitimately holds the
+    # connection ~130-300s (web's own upstream AbortSignal is 300s); Caddy's
+    # defaults don't time out upstream reads, so leave them alone.
+    reverse_proxy web:3000
+}
+
+# Container-internal health listener (loopback-only, NOT host-published).
+# The compose healthcheck probes this instead of the TLS site: it proves
+# caddy is up AND the proxy→web chain works, without depending on busybox
+# wget's TLS/SNI quirks. The real TLS path is exercised by deploy.sh's
+# external smoke test. NOTE: a second site block requires the braced
+# multi-site Caddyfile form — a bare-address single site swallows later
+# blocks as unknown directives (broke the previous deploy).
+http://127.0.0.1:8081 {
+    # The host in a site address only selects which requests match; the
+    # socket itself binds the wildcard interface unless `bind` says
+    # otherwise. Pin it so "loopback-only" is actually true and :8081 is
+    # not reachable from other containers on the compose network.
+    bind 127.0.0.1
+    reverse_proxy web:3000
+}
diff --git a/infra/caddy/Dockerfile b/infra/caddy/Dockerfile
new file mode 100644
index 0000000..2f17940
--- /dev/null
+++ b/infra/caddy/Dockerfile
@@ -0,0 +1,12 @@
+# Caddy with the rate-limit module — the single public-facing service.
+#
+# Stock caddy images don't ship rate limiting; mholt/caddy-ratelimit is the
+# canonical module, compiled in via xcaddy. Rate limiting at the edge is the
+# no-app-code mitigation for the Phase 19 gap (open signup + expensive
+# /search-summary and /upload paths) until that phase lands properly.
+FROM caddy:2.11.4-builder AS builder
+RUN xcaddy build \
+    --with github.com/mholt/caddy-ratelimit
+
+FROM caddy:2.11.4
+COPY --from=builder /usr/bin/caddy /usr/bin/caddy
diff --git a/infra/docker-compose.prod.yml b/infra/docker-compose.prod.yml
new file mode 100644
index 0000000..0c215bf
--- /dev/null
+++ b/infra/docker-compose.prod.yml
@@ -0,0 +1,348 @@
+# sermon.guide v0 — single-box PRODUCTION stack (AWS EC2 + docker compose).
+#
+# Brought up by infra/aws/deploy.sh on the instance:
+#
+#   docker compose -f infra/docker-compose.prod.yml --env-file /opt/sermon/.env.prod …
+#
+# DELIBERATELY SELF-CONTAINED, not an overlay on docker-compose.yml: Compose
+# merges `ports:` lists additively, so an overlay cannot *remove* the dev
+# file's host-published data-plane ports — and "nothing but Caddy publishes
+# a port" is the security property this file exists to guarantee. Keep the
+# data-plane service definitions (images, healthchecks) in sync with
+# docker-compose.yml when bumping versions there.
+#
+# Public surface: caddy 80/443 — NOTHING ELSE. Postgres/Redis/etcd/MinIO/
+# Milvus/api/web are reachable only on the internal compose network (Milvus
+# has no auth at all; the AWS security group is the backstop, this file is
+# the guarantee).
+#
+# Secrets come from /opt/sermon/.env.prod (generated once by deploy.sh,
+# never committed). `${VAR:?…}` interpolation makes compose REFUSE to start
+# with a missing secret — the no-code-change stand-in for the Phase 18
+# startup guard, so the dev-default JWT secret can never silently reach prod.
+#
+# One-shots live behind the "ops" profile so `up` never starts them:
+#   docker compose … run --rm migrate            # alembic upgrade head
+#   docker compose … run --rm bootstrap-milvus   # create library_vectors
+#   docker compose … run --rm prewarm            # models → hf-cache volume
+#
+# NOTE: do not run this file on a machine that also runs the dev compose —
+# same project name + container names (one box, one stack).
+
+name: sermon
+
+x-logging: &logging
+  logging:
+    driver: json-file
+    options:
+      max-size: "10m"
+      max-file: "3"  # rotation: at most 3 files of 10 MB kept per container
+
+# Connection env shared by api / worker / one-shots: service names + the real
+# container-internal ports. The code defaults (localhost, 54322/63792) are
+# deliberately wrong so that prod config is always explicit — set everything.
+x-backend-env: &backend-env  # merged into api, worker, migrate and bootstrap-milvus below
+  SERMON_POSTGRES_HOST: postgres
+  SERMON_POSTGRES_PORT: "5432"
+  SERMON_POSTGRES_USER: ${SERMON_POSTGRES_USER:?}
+  SERMON_POSTGRES_PASSWORD: ${SERMON_POSTGRES_PASSWORD:?}
+  SERMON_POSTGRES_DB: ${SERMON_POSTGRES_DB:?}
+  SERMON_REDIS_HOST: redis
+  SERMON_REDIS_PORT: "6379"
+  SERMON_REDIS_PASSWORD: ${SERMON_REDIS_PASSWORD:?}
+  SERMON_MILVUS_HOST: milvus
+  SERMON_MILVUS_PORT: "19530"
+  # The worker never reads SERMON_API_UPLOAD_DIR today (ingest tasks receive
+  # absolute paths from the api), but keeping api and worker agreed on the
+  # value makes the shared-volume contract explicit and refactor-safe.
+  SERMON_API_UPLOAD_DIR: /data/uploads
+  HF_HOME: /hf-cache
+  HF_HUB_OFFLINE: "1"  # only the prewarm one-shot overrides this (to "0")
+  TRANSFORMERS_OFFLINE: "1"
+
+services:
+
+  # --- data plane (mirrors docker-compose.yml, minus every host port) ---
+
+  postgres:
+    image: postgres:16-alpine
+    container_name: sermon-postgres
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      POSTGRES_USER: ${SERMON_POSTGRES_USER:?}
+      POSTGRES_PASSWORD: ${SERMON_POSTGRES_PASSWORD:?}
+      POSTGRES_DB: ${SERMON_POSTGRES_DB:?}
+    volumes:
+      - sermon-postgres:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
+      interval: 5s
+      timeout: 5s
+      retries: 12  # 12 probes at 5s intervals ≈ 60s before "unhealthy"
+
+  redis:
+    image: redis:7-alpine
+    container_name: sermon-redis
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      REDIS_PASSWORD: ${SERMON_REDIS_PASSWORD:?}  # read by the healthcheck below
+    command: >
+      redis-server
+      --appendonly yes
+      --requirepass ${SERMON_REDIS_PASSWORD:?}
+    volumes:
+      - sermon-redis:/data
+    healthcheck:
+      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a $$REDIS_PASSWORD ping | grep -q PONG"]
+      interval: 5s
+      timeout: 5s
+      retries: 12
+
+  etcd:
+    image: quay.io/coreos/etcd:v3.5.25
+    container_name: sermon-etcd
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      ETCD_AUTO_COMPACTION_MODE: revision
+      ETCD_AUTO_COMPACTION_RETENTION: "1000"
+      ETCD_QUOTA_BACKEND_BYTES: "4294967296"  # 4 GiB
+      ETCD_SNAPSHOT_COUNT: "50000"
+    command: >
+      etcd
+      -advertise-client-urls=http://etcd:2379
+      -listen-client-urls=http://0.0.0.0:2379
+      --data-dir=/etcd
+    volumes:
+      - sermon-etcd:/etcd
+    healthcheck:
+      test: ["CMD", "etcdctl", "endpoint", "health"]
+      interval: 10s
+      timeout: 10s
+      retries: 6
+
+  minio:
+    image: minio/minio:RELEASE.2024-05-28T17-19-04Z
+    container_name: sermon-minio
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      MINIO_ROOT_USER: ${SERMON_MINIO_ROOT_USER:?}
+      MINIO_ROOT_PASSWORD: ${SERMON_MINIO_ROOT_PASSWORD:?}
+    command: server /data --console-address ":9001"  # console stays internal: no ports: on this service
+    volumes:
+      - sermon-minio:/data
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 10s
+      timeout: 10s
+      retries: 6
+
+  milvus:
+    image: milvusdb/milvus:v2.6.15
+    container_name: sermon-milvus
+    restart: unless-stopped
+    <<: *logging
+    command: ["milvus", "run", "standalone"]
+    security_opt:  # NOTE(review): presumably carried over from docker-compose.yml — confirm still required
+      - seccomp:unconfined
+    environment:
+      ETCD_ENDPOINTS: etcd:2379
+      MINIO_ADDRESS: minio:9000
+      MINIO_REGION: us-east-1
+      MINIO_ACCESSKEYID: ${SERMON_MINIO_ROOT_USER:?}  # same credentials the minio service is started with
+      MINIO_SECRETACCESSKEY: ${SERMON_MINIO_ROOT_PASSWORD:?}
+    volumes:
+      - sermon-milvus:/var/lib/milvus
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+      interval: 10s
+      timeout: 10s
+      retries: 12
+      start_period: 60s
+    depends_on:
+      etcd:
+        condition: service_healthy
+      minio:
+        condition: service_healthy
+
+  # --- app tier ---
+
+  api:
+    build:
+      context: ..  # repo root, relative to this file's directory (infra/)
+      dockerfile: api/Dockerfile
+    image: sermon-api:v0
+    container_name: sermon-api
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      <<: *backend-env
+      SERMON_API_JWT_SECRET: ${SERMON_API_JWT_SECRET:?generate with `openssl rand -hex 48`}
+      SERMON_API_CORS_ORIGINS: ${SERMON_API_CORS_ORIGINS:?JSON list, e.g. ["https://1.2.3.4"]}
+      SERMON_API_LLM_PROVIDER: ${SERMON_API_LLM_PROVIDER:-google}
+      SERMON_API_LLM_MODEL: ${SERMON_API_LLM_MODEL:-}
+      GOOGLE_API_KEY: ${GOOGLE_API_KEY:-}  # optional (:-): empty is allowed, unlike the :? secrets
+      PPQ_API_KEY: ${PPQ_API_KEY:-}
+    volumes:
+      - sermon-uploads:/data/uploads
+      - sermon-hf-cache:/hf-cache
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/healthz', timeout=3)"]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 30s
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      milvus:
+        condition: service_healthy
+
+  worker:
+    build:
+      context: ..
+      dockerfile: worker/Dockerfile
+    image: sermon-worker:v0
+    container_name: sermon-worker
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      <<: *backend-env
+    volumes:
+      # Same mount points as the api: the upload handoff is filesystem-based
+      # (api writes under SERMON_API_UPLOAD_DIR, the Celery task gets that
+      # absolute path), and both tiers lazy-load models from one HF cache.
+      - sermon-uploads:/data/uploads
+      - sermon-hf-cache:/hf-cache
+    # Warm shutdown grace before SIGKILL. A killed mid-ingest task is
+    # requeued by the broker after celery_app.py's 300s visibility timeout
+    # (acks_late + reject_on_worker_lost), and MinHash dedup makes the
+    # re-run converge — so 120s is comfort, not correctness.
+    stop_grace_period: 120s
+    healthcheck:
+      test: ["CMD-SHELL", "celery -A celery_app inspect ping -d celery@$$HOSTNAME --timeout 10 | grep -q pong"]
+      interval: 60s
+      timeout: 15s
+      retries: 3
+      start_period: 30s
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      milvus:
+        condition: service_healthy
+
+  web:
+    build:
+      context: ../web
+    image: sermon-web:v0
+    container_name: sermon-web
+    restart: unless-stopped
+    <<: *logging
+    environment:
+      # Server-only (lib/config.ts); internal compose address. NEVER expose
+      # the api publicly — browsers only ever talk to the Next route handlers.
+      API_BASE_URL: http://api:8000
+      NODE_ENV: production
+    healthcheck:
+      test: ["CMD-SHELL", "node -e \"fetch('http://127.0.0.1:3000/').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))\""]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 20s
+    depends_on:
+      api:
+        condition: service_healthy
+
+  caddy:
+    build:
+      context: ./caddy
+    image: sermon-caddy:v0
+    container_name: sermon-caddy
+    restart: unless-stopped
+    <<: *logging
+    # THE ONLY HOST-PUBLISHED PORTS IN THIS FILE. 443/udp is HTTP/3.
+    ports:
+      - "80:80"
+      - "443:443"
+      - "443:443/udp"
+    environment:
+      # Bare host (no scheme): the Elastic IP for now, the domain later.
+      # Feeds both the site address and default_sni in the Caddyfile.
+      SITE_HOST: ${SITE_HOST:?bare host, e.g. 1.2.3.4 (or sermon.guide once DNS exists)}
+    volumes:
+      - ./caddy/Caddyfile:/etc/caddy/Caddyfile:ro
+      - sermon-caddy-data:/data
+      - sermon-caddy-config:/config
+    healthcheck:
+      # Probes the loopback-only :8081 listener in the Caddyfile: proves
+      # caddy is serving AND the reverse_proxy→web chain works. Deliberately
+      # NOT the TLS site — busybox wget sends a literal-IP SNI that can't
+      # match the site cert (observed unhealthy-looping on the first AWS
+      # deploy); the TLS path is verified by deploy.sh's external smoke test.
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:8081/login || exit 1"]
+      interval: 15s
+      timeout: 5s
+      retries: 6
+      start_period: 10s
+    depends_on:
+      web:
+        condition: service_healthy
+
+  # --- one-shots (profile "ops"; never started by `up`) ---
+
+  migrate:
+    image: sermon-worker:v0  # no build: here — reuses the image the worker service builds
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      <<: *backend-env
+    command: ["alembic", "upgrade", "head"]
+    depends_on:
+      postgres:
+        condition: service_healthy
+
+  bootstrap-milvus:
+    image: sermon-worker:v0
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      <<: *backend-env
+    command: ["python", "scripts/bootstrap_milvus.py"]
+    depends_on:
+      milvus:
+        condition: service_healthy
+
+  prewarm:
+    image: sermon-api:v0  # no build: here — reuses the image the api service builds
+    profiles: ["ops"]
+    <<: *logging
+    environment:
+      # Only place the stack is allowed to talk to HuggingFace: populate the
+      # shared cache volume, then everything else runs OFFLINE=1 against it.
+      HF_HOME: /hf-cache
+      HF_HUB_OFFLINE: "0"
+      TRANSFORMERS_OFFLINE: "0"
+    volumes:
+      - sermon-hf-cache:/hf-cache
+      # Bind source is compose-file-relative (infra/), unlike the ..-relative
+      # build contexts above.
+      - ./scripts/prewarm_models.py:/prewarm_models.py:ro
+    command: ["python", "/prewarm_models.py"]
+
+volumes:
+  sermon-postgres:
+  sermon-redis:
+  sermon-etcd:
+  sermon-minio:
+  sermon-milvus:
+  sermon-uploads:
+  sermon-hf-cache:
+  sermon-caddy-data:
+  sermon-caddy-config:
diff --git a/infra/env.prod.template b/infra/env.prod.template
new file mode 100644
index 0000000..65aa776
--- /dev/null
+++ b/infra/env.prod.template
@@ -0,0 +1,42 @@
+# sermon.guide — PRODUCTION environment template.
+#
+# The real file lives at /opt/sermon/.env.prod ON THE INSTANCE, generated by
+# infra/aws/deploy.sh on first deploy (strong secrets via openssl rand, never
+# committed, never on a dev machine). This template documents every variable;
+# it is deliberately NOT named .env.* so repo tooling can read it (.claude
+# settings deny Read on .env*) and .gitignore can't accidentally track a
+# secret-bearing sibling.
+#
+# docker-compose.prod.yml uses ${VAR:?} interpolation for everything marked
+# REQUIRED — compose refuses to start if one is missing, so the dev-default
+# placeholders in code (jwt "change-me-…", passwords "sermon_local_dev")
+# can never silently reach production.
+
+# --- secrets (REQUIRED; deploy.sh generates these on first run) ---
+SERMON_API_JWT_SECRET= # openssl rand -hex 48
+SERMON_POSTGRES_PASSWORD= # openssl rand -hex 24
+SERMON_REDIS_PASSWORD= # openssl rand -hex 24
+SERMON_MINIO_ROOT_PASSWORD= # openssl rand -hex 24
+
+# --- identities (REQUIRED) ---
+SERMON_POSTGRES_USER=sermon
+SERMON_POSTGRES_DB=sermon
+SERMON_MINIO_ROOT_USER=sermon-minio
+
+# --- public surface (REQUIRED; deploy.sh fills from the Elastic IP) ---
+# SITE_HOST is a BARE host (no scheme): the Elastic IP now; the domain later
+# (then also remove `tls internal` from infra/caddy/Caddyfile for automatic
+# Let's Encrypt, update CORS to match, `docker compose up -d caddy api`).
+SITE_HOST=
+# CORS is currently vestigial — browsers only ever talk same-origin to the
+# Next route handlers and the api is not publicly exposed; kept correct for
+# a future where the api gets its own public origin.
+SERMON_API_CORS_ORIGINS= # JSON list, e.g. ["https://1.2.3.4"]
+
+# --- LLM for /search-summary (optional; the route 503s cleanly until set) ---
+# Provider 'google' needs GOOGLE_API_KEY; 'ppq' needs PPQ_API_KEY (both
+# intentionally UNPREFIXED — api/settings.py reads them via validation_alias).
+SERMON_API_LLM_PROVIDER=google
+SERMON_API_LLM_MODEL=
+GOOGLE_API_KEY=
+PPQ_API_KEY=
diff --git a/infra/scripts/prewarm_models.py b/infra/scripts/prewarm_models.py
new file mode 100644
index 0000000..1050629
--- /dev/null
+++ b/infra/scripts/prewarm_models.py
@@ -0,0 +1,46 @@
+"""One-shot model prewarm for the shared HuggingFace cache volume.
+
+Runs as the `prewarm` service in infra/docker-compose.prod.yml (api image,
+HF_HUB_OFFLINE=0) BEFORE the api/worker start. Downloads the three
+inference models into HF_HOME (the `sermon-hf-cache` volume) so the runtime
+containers — which run with HF_HUB_OFFLINE=1 — load deterministically with
+zero network and never pay a multi-GB download on a user's first request.
+
+Model ids MUST stay in sync with the in-process loaders:
+  - BAAI/bge-large-en-v1.5                 worker/embedding.py (+ chunking.py via
+                                           llama-index; same on-disk hub snapshot)
+  - cross-encoder/ms-marco-MiniLM-L-6-v2   api/rerank.py
+  - BAAI/bge-m3                            api/highlight.py
+
+Idempotent: warm cache entries are revalidated, not re-downloaded. Exits
+non-zero on any failure so deploy.sh aborts before flipping traffic.
+"""
+
+import time
+
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+MODELS: list[tuple[str, type[CrossEncoder] | type[SentenceTransformer]]] = [
+    ("BAAI/bge-large-en-v1.5", SentenceTransformer),
+    ("cross-encoder/ms-marco-MiniLM-L-6-v2", CrossEncoder),
+    ("BAAI/bge-m3", SentenceTransformer),
+]
+
+
+def main() -> None:
+    """Load each MODELS entry once on CPU, logging how long it took.
+
+    The constructed objects are discarded; an exception from any loader is
+    left to propagate.
+    """
+    for model_id, model_cls in MODELS:
+        began = time.monotonic()
+        print(f"prewarm: loading {model_id} …", flush=True)
+        model_cls(model_id, device="cpu")
+        elapsed = time.monotonic() - began
+        print(f"prewarm: {model_id} ready in {elapsed:.1f}s", flush=True)
+    print("prewarm: all models cached", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/web/.dockerignore b/web/.dockerignore
new file mode 100644
index 0000000..937ee7a
--- /dev/null
+++ b/web/.dockerignore
@@ -0,0 +1,5 @@
+# Build-context filter for the web image (context = web/).
+node_modules
+.next
+.env
+.env.*
diff --git a/web/Dockerfile b/web/Dockerfile
new file mode 100644
index 0000000..5cb94fe
--- /dev/null
+++ b/web/Dockerfile
@@ -0,0 +1,45 @@
+# web/ — Next.js 15 frontend. BUILD FROM web/:
+#
+#   docker build web/
+#
+# web/ is fully independent (no Python imports; talks to api/ over HTTP
+# only), so its context is just this directory.
+#
+# Uses the `output: "standalone"` bundle from next.config.ts: the runner
+# stage ships only .next/standalone + .next/static and starts with
+# `node server.js` (NOT `next start`), keeping the image small.
+#
+# API_BASE_URL is a RUNTIME env var (read via server-only lib/config.ts,
+# never inlined at build), so one image works against any api origin —
+# do not pass it as a build ARG and do not invent a NEXT_PUBLIC_ variant
+# (that would leak the API origin into client JS and break the auth model).
+
+FROM node:20-slim AS builder
+# pnpm 9 pinned explicitly: package.json deliberately has NO packageManager
+# field (it conflicts with CI's pnpm/action-setup@v6 version pin — web/AGENTS.md).
+RUN npm install -g pnpm@9
+WORKDIR /app
+COPY package.json pnpm-lock.yaml ./
+RUN pnpm install --frozen-lockfile
+COPY . .
+RUN pnpm build
+
+FROM node:20-slim
+# NODE_ENV=production is load-bearing: lib/session.ts gates the session
+# cookie's Secure attribute on it. HOSTNAME=0.0.0.0 so the standalone
+# server binds beyond the container loopback.
+ENV NODE_ENV=production \
+    PORT=3000 \
+    HOSTNAME=0.0.0.0
+WORKDIR /app
+# Drop root: the official node images ship an unprivileged `node` user, and
+# the server only reads its bundle and binds :3000 (an unprivileged port).
+# --chown hands the copied tree to that user so runtime writes under .next/
+# (if any) keep working.
+# NOTE(review): standalone output does not bundle public/ — if web/ has one,
+# it needs its own COPY here. Not visible from this diff; confirm.
+COPY --from=builder --chown=node:node /app/.next/standalone ./
+COPY --from=builder --chown=node:node /app/.next/static ./.next/static
+USER node
+EXPOSE 3000
+CMD ["node", "server.js"]
diff --git a/web/next.config.ts b/web/next.config.ts
index 2f6491c..f1122cc 100644
--- a/web/next.config.ts
+++ b/web/next.config.ts
@@ -2,6 +2,10 @@ import type { NextConfig } from "next";
 
 const nextConfig: NextConfig = {
   reactStrictMode: true,
+  // Self-contained server bundle at .next/standalone for the Docker image
+  // (web/Dockerfile runs `node server.js`, not `next start`). Dev (`next dev`)
+  // and CI (`tsc`/`biome`/`vitest`, no build) are unaffected.
+  output: "standalone",
 };
 
 export default nextConfig;
diff --git a/worker/Dockerfile b/worker/Dockerfile
new file mode 100644
index 0000000..0de1416
--- /dev/null
+++ b/worker/Dockerfile
@@ -0,0 +1,57 @@
+# worker/ — Celery ingestion worker. BUILD FROM THE REPO ROOT (kept
+# symmetric with api/Dockerfile; the worker itself has no upward deps):
+#
+#   docker build -f worker/Dockerfile .
+#
+# The same image serves three roles in infra/docker-compose.prod.yml:
+#   1. the long-running Celery worker (default CMD below),
+#   2. the one-shot Alembic migration (`command: alembic upgrade head`),
+#   3. the one-shot Milvus bootstrap (`command: python scripts/bootstrap_milvus.py`).
+# All three need cwd == /app/worker because the project is intentionally
+# non-packaged ([tool.uv] package = false): `from extractors import …`,
+# `import dedup`, `celery -A celery_app` all resolve off the working dir.
+#
+# BGE-Large (~1.3GB) is NOT baked — it lives in the shared `sermon-hf-cache`
+# volume (see api/Dockerfile rationale). NLTK WordNet (~40MB) IS baked, so
+# the first dedup signature() never does a runtime download.
+
+FROM python:3.12-slim AS deps
+COPY --from=ghcr.io/astral-sh/uv:0.11.19 /uv /usr/local/bin/uv
+ENV UV_PYTHON_DOWNLOADS=never \
+    UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1
+WORKDIR /app/worker
+COPY worker/pyproject.toml worker/uv.lock ./
+RUN uv sync --frozen --no-dev
+
+FROM python:3.12-slim
+# pandoc: EPUB extraction shells out via pypandoc (extractors/epub.py).
+# libmagic1: MIME sniffing via python-magic (extractors/extract.py) — NOT
+#   preinstalled on -slim images.
+# libgomp1: OpenMP runtime for torch-CPU / sentence-transformers wheels.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends pandoc libmagic1 libgomp1 ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=deps /app/worker/.venv /app/worker/.venv
+COPY worker/ /app/worker/
+ENV PATH="/app/worker/.venv/bin:$PATH" \
+    PYTHONUNBUFFERED=1 \
+    NLTK_DATA=/usr/share/nltk_data
+# Bake WordNet (+ omw-1.4, which newer NLTK wants for some WordNet ops) so
+# dedup.signature() never downloads at runtime. The downloader leaves zips
+# that nltk.data.find('corpora/wordnet') can NOT resolve (verified against
+# this image) — extract and drop them, or the first signature() call tries a
+# network download anyway. Final step: fail the BUILD if a corpus is missing.
+RUN python -m nltk.downloader -d /usr/share/nltk_data wordnet omw-1.4 \
+    && python -c "import zipfile, pathlib; \
+[(zipfile.ZipFile(z).extractall(z.parent), z.unlink()) for z in pathlib.Path('/usr/share/nltk_data/corpora').glob('*.zip')]" \
+    && python -c "import nltk; [nltk.data.find('corpora/' + name) for name in ('wordnet', 'omw-1.4')]"
+WORKDIR /app/worker
+# --concurrency=1: each prefork child that ingests a new book loads BGE-Large
+# up to TWICE (chunking boundary-embedder + chunk embedder) ≈ 3-4GB resident;
+# the shared 16GB box can't afford parallel ingests alongside the api's models.
+# --time-limit: hard kill any single ingest at 1h (soft at 55min) so one
+# poisoned/oversized upload can't pin the CPU forever (celery_app.py already
+# sets acks_late + reject_on_worker_lost, so a killed task is not silently lost).
+CMD ["celery", "-A", "celery_app", "worker", "--loglevel=info", \
+     "--concurrency=1", "--time-limit=3600", "--soft-time-limit=3300"]